diff --git "a/checkpoint-20904/trainer_state.json" "b/checkpoint-20904/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-20904/trainer_state.json" @@ -0,0 +1,146421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9999043291078689, + "eval_steps": 2613, + "global_step": 20904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.240769189720181, + "learning_rate": 1.9138755980861247e-07, + "loss": 1.9865, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 2.280837297439575, + "eval_runtime": 4185.9784, + "eval_samples_per_second": 19.976, + "eval_steps_per_second": 2.497, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 3.223090418187222, + "learning_rate": 3.8277511961722493e-07, + "loss": 1.9703, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 3.6145046678806407, + "learning_rate": 5.741626794258373e-07, + "loss": 2.1381, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.6296311605868885, + "learning_rate": 7.655502392344499e-07, + "loss": 2.456, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 3.8401650197500428, + "learning_rate": 9.569377990430622e-07, + "loss": 2.1077, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 3.683890691252437, + "learning_rate": 1.1483253588516746e-06, + "loss": 2.1413, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 3.6432534368591782, + "learning_rate": 1.339712918660287e-06, + "loss": 2.1568, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 4.663365277527622, + "learning_rate": 1.5311004784688997e-06, + "loss": 2.3463, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 3.6372978147069697, + "learning_rate": 1.722488038277512e-06, + "loss": 2.1712, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 3.852872878211269, + "learning_rate": 1.9138755980861244e-06, + "loss": 2.2722, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 3.6549109607459043, + "learning_rate": 2.105263157894737e-06, + "loss": 1.9729, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 3.2318002906734535, + "learning_rate": 2.2966507177033493e-06, + "loss": 2.0595, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.6991822490019413, + "learning_rate": 2.488038277511962e-06, + "loss": 2.0928, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 4.159355481440864, + "learning_rate": 2.679425837320574e-06, + "loss": 2.1411, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 4.061437005322893, + "learning_rate": 2.870813397129187e-06, + "loss": 2.3334, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 3.7145200953671766, + "learning_rate": 3.0622009569377995e-06, + "loss": 1.9779, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 4.3951391847417565, + "learning_rate": 3.2535885167464113e-06, + "loss": 2.24, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 4.225335307764433, + "learning_rate": 3.444976076555024e-06, + "loss": 2.2625, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 4.08014522806263, + "learning_rate": 3.636363636363636e-06, + "loss": 2.1072, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 4.750695114221436, + "learning_rate": 3.827751196172249e-06, + "loss": 2.2998, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 4.576295112434383, + "learning_rate": 4.019138755980861e-06, + "loss": 2.1414, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.5573739807898805, + "learning_rate": 4.210526315789474e-06, + "loss": 2.3373, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 4.369397245889477, + "learning_rate": 4.401913875598086e-06, + "loss": 2.1658, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 5.28624090773153, + "learning_rate": 4.5933014354066986e-06, + "loss": 2.2938, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 4.9534751396409655, + "learning_rate": 4.784688995215311e-06, + "loss": 2.2733, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 5.381089984082533, + "learning_rate": 4.976076555023924e-06, + "loss": 2.2244, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 6.226111694547378, + "learning_rate": 5.167464114832536e-06, + "loss": 2.2223, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.5666909272147245, + "learning_rate": 5.358851674641148e-06, + "loss": 1.7762, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 5.398756537107184, + "learning_rate": 5.550239234449761e-06, + "loss": 2.4073, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 4.315805192926013, + "learning_rate": 5.741626794258374e-06, + "loss": 2.1847, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.532277066830244, + "learning_rate": 5.933014354066986e-06, + "loss": 2.2245, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 3.413171467566141, + "learning_rate": 6.124401913875599e-06, + "loss": 2.0755, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.2310871639043746, + "learning_rate": 6.315789473684211e-06, + "loss": 2.2345, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 3.3848849049261207, + "learning_rate": 6.5071770334928226e-06, + "loss": 2.0588, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 3.0082260641673706, + "learning_rate": 6.698564593301436e-06, + "loss": 2.1575, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.45419363377176, + "learning_rate": 6.889952153110048e-06, + "loss": 2.2532, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.316572782291838, + "learning_rate": 7.081339712918661e-06, + "loss": 1.9725, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.2302956707592998, + "learning_rate": 7.272727272727272e-06, + "loss": 2.2166, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.2417025568909201, + "learning_rate": 7.464114832535886e-06, + "loss": 2.2785, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.0623452341947435, + "learning_rate": 7.655502392344498e-06, + "loss": 2.1188, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.9459508227302174, + "learning_rate": 7.846889952153112e-06, + "loss": 2.1225, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 0.9821381236389229, + "learning_rate": 8.038277511961722e-06, + "loss": 1.9644, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.8227720071849114, + "learning_rate": 8.229665071770336e-06, + "loss": 1.8818, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.9772589694714745, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1738, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.8288391194584817, + "learning_rate": 8.61244019138756e-06, + "loss": 2.0598, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.7365091983688079, + "learning_rate": 8.803827751196173e-06, + "loss": 2.0051, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 0.8202389292532828, + "learning_rate": 8.995215311004785e-06, + "loss": 2.0005, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 0.7517874679840845, + "learning_rate": 9.186602870813397e-06, + "loss": 1.9756, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.8041110653388241, + "learning_rate": 9.377990430622011e-06, + "loss": 2.1019, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 0.7364867668939584, + "learning_rate": 9.569377990430622e-06, + "loss": 1.8784, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.5933286532299948, + "learning_rate": 9.760765550239234e-06, + "loss": 1.743, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 0.792868413879173, + "learning_rate": 9.952153110047848e-06, + "loss": 1.8373, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.8312221240251918, + "learning_rate": 1.014354066985646e-05, + "loss": 1.9098, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 0.8539046550616419, + "learning_rate": 1.0334928229665072e-05, + "loss": 2.0752, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.7680076367144099, + "learning_rate": 1.0526315789473684e-05, + "loss": 2.0076, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.6386760911858004, + "learning_rate": 1.0717703349282297e-05, + "loss": 1.8095, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.7287884197298926, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.9806, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.7722026786294885, + "learning_rate": 1.1100478468899523e-05, + "loss": 1.9635, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.7231409481435274, + "learning_rate": 1.1291866028708133e-05, + "loss": 1.8613, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.7824727930965788, + "learning_rate": 1.1483253588516747e-05, + "loss": 2.1725, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.6677563917530093, + "learning_rate": 1.167464114832536e-05, + "loss": 1.9352, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.7009328416411565, + "learning_rate": 1.1866028708133972e-05, + "loss": 1.8884, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.9109348341862854, + "learning_rate": 1.2057416267942584e-05, + "loss": 2.3183, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.8411347289624691, + "learning_rate": 1.2248803827751198e-05, + "loss": 1.8075, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.7674394625955453, + "learning_rate": 1.2440191387559808e-05, + "loss": 1.9963, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.7848289715014719, + "learning_rate": 1.2631578947368422e-05, + "loss": 1.9276, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.7238409990110771, + "learning_rate": 1.2822966507177035e-05, + "loss": 1.9353, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.8235100929493389, + "learning_rate": 1.3014354066985645e-05, + "loss": 1.8938, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.6283052176051154, + "learning_rate": 1.320574162679426e-05, + "loss": 1.8028, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.654778519712797, + "learning_rate": 1.3397129186602871e-05, + "loss": 1.6469, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.7728600382414659, + "learning_rate": 1.3588516746411483e-05, + "loss": 1.7839, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.9160985506203067, + "learning_rate": 1.3779904306220096e-05, + "loss": 1.8154, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.8592513182940328, + "learning_rate": 1.397129186602871e-05, + "loss": 1.8178, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.0716389160468884, + "learning_rate": 1.4162679425837322e-05, + "loss": 1.9752, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.0526260733114412, + "learning_rate": 1.4354066985645934e-05, + "loss": 1.8998, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.9676822741503439, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.7514, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.2545561036432802, + "learning_rate": 1.4736842105263157e-05, + "loss": 1.8165, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.834675739891747, + "learning_rate": 1.4928229665071772e-05, + "loss": 1.8987, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.25568745378183, + "learning_rate": 1.5119617224880383e-05, + "loss": 1.8836, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.8547248072580093, + "learning_rate": 1.5311004784688995e-05, + "loss": 1.625, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.9130624521617219, + "learning_rate": 1.5502392344497607e-05, + "loss": 1.65, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.4571618225829908, + "learning_rate": 1.5693779904306223e-05, + "loss": 1.7592, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.5681812827811266, + "learning_rate": 1.5885167464114832e-05, + "loss": 1.7274, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.7707101692645586, + "learning_rate": 1.6076555023923444e-05, + "loss": 1.7496, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.4920166991759014, + "learning_rate": 1.6267942583732056e-05, + "loss": 1.815, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.765151570274179, + "learning_rate": 1.6459330143540672e-05, + "loss": 1.6131, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.7780322419019302, + "learning_rate": 1.6650717703349284e-05, + "loss": 1.6553, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.6971849026842456, + "learning_rate": 1.6842105263157896e-05, + "loss": 1.6212, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.6737627863204966, + "learning_rate": 1.7033492822966505e-05, + "loss": 1.6319, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.7581487338796276, + "learning_rate": 1.722488038277512e-05, + "loss": 1.5457, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.6884712501693219, + "learning_rate": 1.7416267942583733e-05, + "loss": 1.682, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.6472187205118166, + "learning_rate": 1.7607655502392345e-05, + "loss": 1.7893, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.5392317136449472, + "learning_rate": 1.7799043062200958e-05, + "loss": 1.5724, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.613083864173068, + "learning_rate": 1.799043062200957e-05, + "loss": 1.5933, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.554854239010525, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.4444, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.5895819501317506, + "learning_rate": 1.8373205741626794e-05, + "loss": 1.463, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.6834369586439761, + "learning_rate": 1.8564593301435407e-05, + "loss": 1.6334, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.5941595870489239, + "learning_rate": 1.8755980861244022e-05, + "loss": 1.6881, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.6765509171345243, + "learning_rate": 1.8947368421052634e-05, + "loss": 1.5896, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.5523045136213458, + "learning_rate": 1.9138755980861243e-05, + "loss": 1.4551, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.6537194645181346, + "learning_rate": 1.9330143540669855e-05, + "loss": 1.5513, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.5087498837023968, + "learning_rate": 1.9521531100478468e-05, + "loss": 1.4715, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.6335719742406299, + "learning_rate": 1.9712918660287083e-05, + "loss": 1.5313, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.647032541641365, + "learning_rate": 1.9904306220095696e-05, + "loss": 1.4794, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.5641054209541718, + "learning_rate": 2.0095693779904308e-05, + "loss": 1.5301, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.623411201252464, + "learning_rate": 2.028708133971292e-05, + "loss": 1.4618, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.6469566283028377, + "learning_rate": 2.0478468899521532e-05, + "loss": 1.5438, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.5328026134772941, + "learning_rate": 2.0669856459330144e-05, + "loss": 1.4587, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.5404945621124031, + "learning_rate": 2.0861244019138757e-05, + "loss": 1.3749, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.6198671572468233, + "learning_rate": 2.105263157894737e-05, + "loss": 1.5662, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.5344071404450242, + "learning_rate": 2.1244019138755985e-05, + "loss": 1.6258, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.5077736180550365, + "learning_rate": 2.1435406698564593e-05, + "loss": 1.4085, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.5916690974950262, + "learning_rate": 2.1626794258373206e-05, + "loss": 1.4315, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.4799015981708095, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.4183, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.4571232168458159, + "learning_rate": 2.2009569377990433e-05, + "loss": 1.3658, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.4917591648756044, + "learning_rate": 2.2200956937799046e-05, + "loss": 1.3724, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.4170683829952185, + "learning_rate": 2.2392344497607658e-05, + "loss": 1.4197, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.49893891430948417, + "learning_rate": 2.2583732057416267e-05, + "loss": 1.4005, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.4056789281529712, + "learning_rate": 2.2775119617224882e-05, + "loss": 1.4419, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.47274513283719816, + "learning_rate": 2.2966507177033495e-05, + "loss": 1.3163, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.4495315355981231, + "learning_rate": 2.3157894736842107e-05, + "loss": 1.3599, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.4142057365520808, + "learning_rate": 2.334928229665072e-05, + "loss": 1.4484, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.4421864791497608, + "learning_rate": 2.354066985645933e-05, + "loss": 1.3629, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.44850267009277406, + "learning_rate": 2.3732057416267943e-05, + "loss": 1.4036, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.49021107734394687, + "learning_rate": 2.3923444976076556e-05, + "loss": 1.3724, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.47738353199872885, + "learning_rate": 2.4114832535885168e-05, + "loss": 1.3304, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.4704616937001858, + "learning_rate": 2.4306220095693784e-05, + "loss": 1.376, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.4351806446488345, + "learning_rate": 2.4497607655502396e-05, + "loss": 1.3438, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.5275547600152736, + "learning_rate": 2.4688995215311005e-05, + "loss": 1.2731, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.3787982803600526, + "learning_rate": 2.4880382775119617e-05, + "loss": 1.3392, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.3850735556535156, + "learning_rate": 2.507177033492823e-05, + "loss": 1.2673, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.4339128757111585, + "learning_rate": 2.5263157894736845e-05, + "loss": 1.3284, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.39005187564392557, + "learning_rate": 2.5454545454545454e-05, + "loss": 1.3347, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.35602922226403455, + "learning_rate": 2.564593301435407e-05, + "loss": 1.2362, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.40601160369590406, + "learning_rate": 2.583732057416268e-05, + "loss": 1.3591, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.4408392546545607, + "learning_rate": 2.602870813397129e-05, + "loss": 1.2837, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.46362552165830334, + "learning_rate": 2.6220095693779906e-05, + "loss": 1.316, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.3504892333833388, + "learning_rate": 2.641148325358852e-05, + "loss": 1.1857, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.4362884443543931, + "learning_rate": 2.660287081339713e-05, + "loss": 1.2435, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.50850093231113, + "learning_rate": 2.6794258373205743e-05, + "loss": 1.3235, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.4187742784108479, + "learning_rate": 2.698564593301435e-05, + "loss": 1.3693, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 0.44107080428520234, + "learning_rate": 2.7177033492822967e-05, + "loss": 1.202, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.42167608964495673, + "learning_rate": 2.7368421052631583e-05, + "loss": 1.2693, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.4269486411389392, + "learning_rate": 2.755980861244019e-05, + "loss": 1.1825, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.4220464029961541, + "learning_rate": 2.7751196172248807e-05, + "loss": 1.3205, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.4620642882432513, + "learning_rate": 2.794258373205742e-05, + "loss": 1.2891, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.43743294942437483, + "learning_rate": 2.8133971291866028e-05, + "loss": 1.1969, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.4414019243397515, + "learning_rate": 2.8325358851674644e-05, + "loss": 1.1672, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.1295244665123245, + "learning_rate": 2.8516746411483253e-05, + "loss": 1.3175, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.5055922001428373, + "learning_rate": 2.8708133971291868e-05, + "loss": 1.2696, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.48319661000499564, + "learning_rate": 2.889952153110048e-05, + "loss": 1.3168, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.5720466946520338, + "learning_rate": 2.909090909090909e-05, + "loss": 1.2982, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.68753321360197, + "learning_rate": 2.9282296650717705e-05, + "loss": 1.2791, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 0.6221880802372888, + "learning_rate": 2.9473684210526314e-05, + "loss": 1.2647, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.5504261765445813, + "learning_rate": 2.966507177033493e-05, + "loss": 1.3102, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.6618113787746298, + "learning_rate": 2.9856459330143545e-05, + "loss": 1.292, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.6002633450221915, + "learning_rate": 3.0047846889952154e-05, + "loss": 1.33, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 0.7118508153621136, + "learning_rate": 3.0239234449760766e-05, + "loss": 1.2062, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 0.6591754857765905, + "learning_rate": 3.043062200956938e-05, + "loss": 1.2539, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 0.4714338006090391, + "learning_rate": 3.062200956937799e-05, + "loss": 1.2384, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 1.0300987484894157, + "learning_rate": 3.08133971291866e-05, + "loss": 1.2171, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 0.6469298394859133, + "learning_rate": 3.1004784688995215e-05, + "loss": 1.0836, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 0.5601908591127592, + "learning_rate": 3.119617224880383e-05, + "loss": 1.1867, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 0.7363177930575833, + "learning_rate": 3.1387559808612446e-05, + "loss": 1.3128, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 0.7107903409156414, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2501, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.464636577316954, + "learning_rate": 3.1770334928229664e-05, + "loss": 1.1671, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 0.8259444575108333, + "learning_rate": 3.196172248803828e-05, + "loss": 1.2928, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 0.7945572957938246, + "learning_rate": 3.215311004784689e-05, + "loss": 1.2516, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 0.5895739273419747, + "learning_rate": 3.234449760765551e-05, + "loss": 1.162, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 0.6927568627921579, + "learning_rate": 3.253588516746411e-05, + "loss": 1.2802, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.639241808296525, + "learning_rate": 3.272727272727273e-05, + "loss": 1.1999, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 0.8979192020036151, + "learning_rate": 3.2918660287081344e-05, + "loss": 1.1854, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.5314730657868201, + "learning_rate": 3.311004784688995e-05, + "loss": 1.2365, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 0.3835011610660225, + "learning_rate": 3.330143540669857e-05, + "loss": 1.3014, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 0.46600481942740146, + "learning_rate": 3.349282296650718e-05, + "loss": 1.2222, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.568331869349057, + "learning_rate": 3.368421052631579e-05, + "loss": 1.2219, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 0.5161782745938108, + "learning_rate": 3.3875598086124405e-05, + "loss": 1.0828, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 0.5678907934059946, + "learning_rate": 3.406698564593301e-05, + "loss": 1.241, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 0.40729089487435055, + "learning_rate": 3.425837320574163e-05, + "loss": 1.2201, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.5273474366443621, + "learning_rate": 3.444976076555024e-05, + "loss": 1.1262, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.4807276904108668, + "learning_rate": 3.4641148325358854e-05, + "loss": 1.1957, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 0.5372452514446321, + "learning_rate": 3.4832535885167466e-05, + "loss": 1.1621, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.6605408366116067, + "learning_rate": 3.502392344497608e-05, + "loss": 1.2327, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 0.5555779108688886, + "learning_rate": 3.521531100478469e-05, + "loss": 1.1938, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 0.5517608446497938, + "learning_rate": 3.54066985645933e-05, + "loss": 1.2651, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.4524359827250868, + "learning_rate": 3.5598086124401915e-05, + "loss": 1.2182, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.4495056117139968, + "learning_rate": 3.578947368421053e-05, + "loss": 1.2133, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 0.43420633627744476, + "learning_rate": 3.598086124401914e-05, + "loss": 1.078, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 0.4991578407930919, + "learning_rate": 3.617224880382775e-05, + "loss": 1.0683, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.46736895171034254, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.1533, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.48830760264155376, + "learning_rate": 3.6555023923444976e-05, + "loss": 1.2539, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 0.5148588474362115, + "learning_rate": 3.674641148325359e-05, + "loss": 1.0486, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 0.4669961130515612, + "learning_rate": 3.693779904306221e-05, + "loss": 1.1949, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 0.32879504352969363, + "learning_rate": 3.712918660287081e-05, + "loss": 1.189, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.35633398240141656, + "learning_rate": 3.7320574162679425e-05, + "loss": 1.2112, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.42378382756263683, + "learning_rate": 3.7511961722488044e-05, + "loss": 1.1999, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 0.41483450868577093, + "learning_rate": 3.770334928229665e-05, + "loss": 1.1283, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 0.4603179290827383, + "learning_rate": 3.789473684210527e-05, + "loss": 1.2353, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.5574543274654291, + "learning_rate": 3.8086124401913874e-05, + "loss": 1.1948, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 0.6846239185739854, + "learning_rate": 3.8277511961722486e-05, + "loss": 1.3231, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.520683870817622, + "learning_rate": 3.8468899521531105e-05, + "loss": 1.2201, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.42683935885681074, + "learning_rate": 3.866028708133971e-05, + "loss": 1.165, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 0.39470003294526096, + "learning_rate": 3.885167464114833e-05, + "loss": 1.1994, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 0.4391753396869876, + "learning_rate": 3.9043062200956935e-05, + "loss": 1.1813, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 0.4449276816002856, + "learning_rate": 3.9234449760765554e-05, + "loss": 1.1764, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.45472581470602813, + "learning_rate": 3.9425837320574167e-05, + "loss": 1.1766, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 0.37367376899076454, + "learning_rate": 3.961722488038277e-05, + "loss": 1.0581, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 0.5183447282360463, + "learning_rate": 3.980861244019139e-05, + "loss": 1.1351, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 0.5895054254055697, + "learning_rate": 4e-05, + "loss": 1.2017, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 0.39968348371397444, + "learning_rate": 4.0191387559808616e-05, + "loss": 1.2353, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.4089518582688394, + "learning_rate": 4.038277511961723e-05, + "loss": 1.2072, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.5490805672572786, + "learning_rate": 4.057416267942584e-05, + "loss": 1.2593, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.42397521041394676, + "learning_rate": 4.076555023923445e-05, + "loss": 1.155, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 0.4981026360180907, + "learning_rate": 4.0956937799043064e-05, + "loss": 1.2279, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 0.503200310794026, + "learning_rate": 4.114832535885168e-05, + "loss": 1.1134, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.5127306365983669, + "learning_rate": 4.133971291866029e-05, + "loss": 1.1993, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 0.4935452094233486, + "learning_rate": 4.15311004784689e-05, + "loss": 1.1962, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 0.38834813884933866, + "learning_rate": 4.172248803827751e-05, + "loss": 1.2529, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 0.4742710182667002, + "learning_rate": 4.1913875598086126e-05, + "loss": 1.1861, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 0.4762816225973413, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2221, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.5405607329270613, + "learning_rate": 4.229665071770335e-05, + "loss": 1.3099, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.38474621695351074, + "learning_rate": 4.248803827751197e-05, + "loss": 1.1669, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 0.47522706488556143, + "learning_rate": 4.2679425837320574e-05, + "loss": 1.1813, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 0.43816325603257117, + "learning_rate": 4.287081339712919e-05, + "loss": 1.1912, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 0.47579537819240086, + "learning_rate": 4.3062200956937806e-05, + "loss": 1.165, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.5148567080870619, + "learning_rate": 4.325358851674641e-05, + "loss": 1.1997, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 0.4019413785534228, + "learning_rate": 4.344497607655503e-05, + "loss": 1.2448, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.4766231281692962, + "learning_rate": 4.3636363636363636e-05, + "loss": 1.1374, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 0.4375975135050194, + "learning_rate": 4.382775119617225e-05, + "loss": 1.2484, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 0.4312959866778293, + "learning_rate": 4.401913875598087e-05, + "loss": 1.1143, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.4329834158025715, + "learning_rate": 4.421052631578947e-05, + "loss": 1.2335, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.37836691609878936, + "learning_rate": 4.440191387559809e-05, + "loss": 1.118, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 0.37526619539689987, + "learning_rate": 4.45933014354067e-05, + "loss": 1.2968, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 0.4175158591003783, + "learning_rate": 4.4784688995215316e-05, + "loss": 1.1954, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 0.37094222885704037, + "learning_rate": 4.497607655502393e-05, + "loss": 1.0976, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.4586856505320375, + "learning_rate": 4.5167464114832533e-05, + "loss": 1.1628, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.44587593738328374, + "learning_rate": 4.535885167464115e-05, + "loss": 1.1917, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 0.3594860823761438, + "learning_rate": 4.5550239234449765e-05, + "loss": 1.192, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 0.38283214874386773, + "learning_rate": 4.574162679425838e-05, + "loss": 1.2205, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 0.33492904991363276, + "learning_rate": 4.593301435406699e-05, + "loss": 1.2155, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.4113511012996176, + "learning_rate": 4.6124401913875595e-05, + "loss": 1.2222, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 0.4337572841707065, + "learning_rate": 4.6315789473684214e-05, + "loss": 1.0193, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.34650969754183997, + "learning_rate": 4.6507177033492826e-05, + "loss": 1.2571, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 0.4758935405313615, + "learning_rate": 4.669856459330144e-05, + "loss": 1.1967, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.39684085109310996, + "learning_rate": 4.688995215311005e-05, + "loss": 1.1169, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.4823470965603195, + "learning_rate": 4.708133971291866e-05, + "loss": 1.2347, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 0.37717028125791024, + "learning_rate": 4.7272727272727275e-05, + "loss": 1.094, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 0.3667379872819918, + "learning_rate": 4.746411483253589e-05, + "loss": 1.2391, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 0.3407975972338634, + "learning_rate": 4.76555023923445e-05, + "loss": 1.0632, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 0.49238987132507245, + "learning_rate": 4.784688995215311e-05, + "loss": 1.2121, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.46706882037804415, + "learning_rate": 4.8038277511961724e-05, + "loss": 1.2437, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 0.4088473587226235, + "learning_rate": 4.8229665071770336e-05, + "loss": 1.1572, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 0.3821462444529073, + "learning_rate": 4.842105263157895e-05, + "loss": 1.1603, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 0.4782990514183433, + "learning_rate": 4.861244019138757e-05, + "loss": 1.2409, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 0.45087555655452305, + "learning_rate": 4.880382775119617e-05, + "loss": 1.1556, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 0.3865676678179632, + "learning_rate": 4.899521531100479e-05, + "loss": 1.1945, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.4099982787948705, + "learning_rate": 4.91866028708134e-05, + "loss": 1.17, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 0.46016977600480446, + "learning_rate": 4.937799043062201e-05, + "loss": 1.2024, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 0.4712335677622079, + "learning_rate": 4.956937799043063e-05, + "loss": 1.1252, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 0.3955435220531026, + "learning_rate": 4.9760765550239234e-05, + "loss": 1.201, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.48382588647342273, + "learning_rate": 4.995215311004785e-05, + "loss": 1.3135, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 0.39028060459262043, + "learning_rate": 5.014354066985646e-05, + "loss": 1.2573, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 0.4094204023390644, + "learning_rate": 5.033492822966508e-05, + "loss": 1.1411, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 0.37405702073279246, + "learning_rate": 5.052631578947369e-05, + "loss": 1.1924, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 0.38656220760519894, + "learning_rate": 5.0717703349282295e-05, + "loss": 1.2062, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.4152109433386491, + "learning_rate": 5.090909090909091e-05, + "loss": 1.2412, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 0.3485732917182752, + "learning_rate": 5.1100478468899526e-05, + "loss": 1.2917, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 0.3551920352066169, + "learning_rate": 5.129186602870814e-05, + "loss": 1.1494, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 0.3817527119708618, + "learning_rate": 5.1483253588516744e-05, + "loss": 1.2689, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 0.35043446712727827, + "learning_rate": 5.167464114832536e-05, + "loss": 1.1263, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.4560107489672836, + "learning_rate": 5.1866028708133975e-05, + "loss": 1.2355, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 0.411519693827521, + "learning_rate": 5.205741626794258e-05, + "loss": 1.2213, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 0.4017892181431639, + "learning_rate": 5.22488038277512e-05, + "loss": 1.2103, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 0.46457066156222404, + "learning_rate": 5.244019138755981e-05, + "loss": 1.1638, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 0.41073989648253845, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2773, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 0.3730027098705776, + "learning_rate": 5.282296650717704e-05, + "loss": 1.1376, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 0.3860157617102252, + "learning_rate": 5.301435406698565e-05, + "loss": 1.2109, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 0.4349386298334119, + "learning_rate": 5.320574162679426e-05, + "loss": 1.1575, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 0.4640773330272392, + "learning_rate": 5.339712918660288e-05, + "loss": 1.1985, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 0.45865729558497004, + "learning_rate": 5.3588516746411485e-05, + "loss": 1.2145, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 0.37111837421497373, + "learning_rate": 5.37799043062201e-05, + "loss": 1.1538, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 0.40099943789483994, + "learning_rate": 5.39712918660287e-05, + "loss": 1.0804, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 0.42817755702563415, + "learning_rate": 5.416267942583733e-05, + "loss": 1.202, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 0.34712198940947375, + "learning_rate": 5.4354066985645934e-05, + "loss": 1.2341, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 0.39457404932238177, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1571, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 0.36930568124767427, + "learning_rate": 5.4736842105263165e-05, + "loss": 1.1275, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 0.4662030178725213, + "learning_rate": 5.492822966507177e-05, + "loss": 1.1811, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 0.40587674329213624, + "learning_rate": 5.511961722488038e-05, + "loss": 1.102, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 0.35779077893034156, + "learning_rate": 5.5311004784689e-05, + "loss": 1.2788, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 0.31159530938785, + "learning_rate": 5.5502392344497614e-05, + "loss": 1.1778, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 0.4160898781250681, + "learning_rate": 5.569377990430622e-05, + "loss": 1.2064, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 0.43660026046564104, + "learning_rate": 5.588516746411484e-05, + "loss": 1.1494, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 0.40119987919562433, + "learning_rate": 5.607655502392345e-05, + "loss": 1.1623, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 0.37717288362355916, + "learning_rate": 5.6267942583732056e-05, + "loss": 1.1194, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 0.4057229096001704, + "learning_rate": 5.645933014354067e-05, + "loss": 1.0581, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 0.4670818825975868, + "learning_rate": 5.665071770334929e-05, + "loss": 1.171, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 0.5482039795115164, + "learning_rate": 5.68421052631579e-05, + "loss": 1.1082, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 0.4468277071138632, + "learning_rate": 5.7033492822966505e-05, + "loss": 1.2785, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 0.3500497863626851, + "learning_rate": 5.7224880382775124e-05, + "loss": 1.1713, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 0.5457539009822924, + "learning_rate": 5.7416267942583736e-05, + "loss": 1.1259, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 0.3521279155346515, + "learning_rate": 5.760765550239234e-05, + "loss": 1.2143, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 0.38014559184795155, + "learning_rate": 5.779904306220096e-05, + "loss": 1.2169, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 0.4368587205355613, + "learning_rate": 5.799043062200957e-05, + "loss": 1.1354, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 0.3773750084543655, + "learning_rate": 5.818181818181818e-05, + "loss": 1.204, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 0.4661908984222087, + "learning_rate": 5.8373205741626804e-05, + "loss": 1.2339, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 0.4233656313317972, + "learning_rate": 5.856459330143541e-05, + "loss": 1.3069, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 0.4452233614591402, + "learning_rate": 5.875598086124402e-05, + "loss": 1.1966, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 0.379869871739064, + "learning_rate": 5.894736842105263e-05, + "loss": 1.1233, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 0.3634900550395493, + "learning_rate": 5.9138755980861246e-05, + "loss": 1.2106, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 0.38506818193045056, + "learning_rate": 5.933014354066986e-05, + "loss": 1.2411, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 0.41159846103302217, + "learning_rate": 5.9521531100478464e-05, + "loss": 1.3279, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 0.28921210577002654, + "learning_rate": 5.971291866028709e-05, + "loss": 1.0785, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 0.38616302251379914, + "learning_rate": 5.9904306220095695e-05, + "loss": 1.2536, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 0.4101883225822147, + "learning_rate": 6.009569377990431e-05, + "loss": 1.1666, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 0.3423514370886581, + "learning_rate": 6.028708133971293e-05, + "loss": 1.2331, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 0.41207456262790537, + "learning_rate": 6.047846889952153e-05, + "loss": 1.1403, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 0.40588734564935647, + "learning_rate": 6.0669856459330144e-05, + "loss": 1.1986, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 0.37104840888562607, + "learning_rate": 6.086124401913876e-05, + "loss": 1.169, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 0.39655898223668296, + "learning_rate": 6.105263157894737e-05, + "loss": 1.1296, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 0.5221245550161207, + "learning_rate": 6.124401913875598e-05, + "loss": 1.0809, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 0.4021445879644258, + "learning_rate": 6.143540669856461e-05, + "loss": 1.1897, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 0.38180254930918234, + "learning_rate": 6.16267942583732e-05, + "loss": 1.1859, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 0.3828401683172079, + "learning_rate": 6.181818181818182e-05, + "loss": 1.0985, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 0.43356536766643855, + "learning_rate": 6.200956937799043e-05, + "loss": 1.1524, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 0.39197554253988764, + "learning_rate": 6.220095693779904e-05, + "loss": 1.1607, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.3768192836960789, + "learning_rate": 6.239234449760765e-05, + "loss": 1.2115, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 0.4123016217273888, + "learning_rate": 6.258373205741627e-05, + "loss": 1.2046, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 0.38690332283125584, + "learning_rate": 6.277511961722489e-05, + "loss": 1.2207, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 0.3958359496615601, + "learning_rate": 6.296650717703349e-05, + "loss": 1.1334, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 0.36561214007046366, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1196, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 0.36953124111301633, + "learning_rate": 6.334928229665073e-05, + "loss": 1.1203, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 0.391180586728009, + "learning_rate": 6.354066985645933e-05, + "loss": 1.2479, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 0.483739471817649, + "learning_rate": 6.373205741626794e-05, + "loss": 1.1225, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 0.4233252985553476, + "learning_rate": 6.392344497607657e-05, + "loss": 1.1772, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 0.4261190391654762, + "learning_rate": 6.411483253588518e-05, + "loss": 1.1752, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 0.4217550233368759, + "learning_rate": 6.430622009569378e-05, + "loss": 1.2335, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 0.4126104400962645, + "learning_rate": 6.449760765550239e-05, + "loss": 1.1095, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 0.48994576863601885, + "learning_rate": 6.468899521531101e-05, + "loss": 1.2933, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 0.3928982582790676, + "learning_rate": 6.488038277511961e-05, + "loss": 1.1598, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 0.38476512783934874, + "learning_rate": 6.507177033492823e-05, + "loss": 1.3208, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 0.3810331248367921, + "learning_rate": 6.526315789473685e-05, + "loss": 1.1183, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 0.38985962649469763, + "learning_rate": 6.545454545454546e-05, + "loss": 1.2319, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 0.44280071804731985, + "learning_rate": 6.564593301435406e-05, + "loss": 1.2374, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 0.3221949054184306, + "learning_rate": 6.583732057416269e-05, + "loss": 1.182, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 0.3765883193637935, + "learning_rate": 6.60287081339713e-05, + "loss": 1.085, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 0.36077725709386577, + "learning_rate": 6.62200956937799e-05, + "loss": 1.1741, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 0.3844614909912839, + "learning_rate": 6.641148325358852e-05, + "loss": 1.1308, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 0.356160190962461, + "learning_rate": 6.660287081339714e-05, + "loss": 1.2303, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 0.40199466397728084, + "learning_rate": 6.679425837320575e-05, + "loss": 1.1404, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 0.40760457805411343, + "learning_rate": 6.698564593301436e-05, + "loss": 1.1441, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 0.36664650507067836, + "learning_rate": 6.717703349282297e-05, + "loss": 1.0822, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 0.3774008230280487, + "learning_rate": 6.736842105263159e-05, + "loss": 1.1882, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 0.3743284992535933, + "learning_rate": 6.755980861244018e-05, + "loss": 1.1174, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 0.3803668006858151, + "learning_rate": 6.775119617224881e-05, + "loss": 1.2074, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 0.3777040131335034, + "learning_rate": 6.794258373205742e-05, + "loss": 1.0946, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 0.3794465077922223, + "learning_rate": 6.813397129186602e-05, + "loss": 1.0683, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.3794400803152343, + "learning_rate": 6.832535885167465e-05, + "loss": 1.1377, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 0.3246160370634747, + "learning_rate": 6.851674641148326e-05, + "loss": 1.2539, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 0.3663994418682161, + "learning_rate": 6.870813397129187e-05, + "loss": 1.179, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 0.4165644814006045, + "learning_rate": 6.889952153110048e-05, + "loss": 1.2068, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 0.40002794816857074, + "learning_rate": 6.90909090909091e-05, + "loss": 1.236, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 0.36752801689828113, + "learning_rate": 6.928229665071771e-05, + "loss": 1.1806, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 0.4118641376720516, + "learning_rate": 6.947368421052632e-05, + "loss": 1.2099, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 0.342905855911731, + "learning_rate": 6.966507177033493e-05, + "loss": 1.1753, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 0.37601728725049927, + "learning_rate": 6.985645933014354e-05, + "loss": 1.2123, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 0.3996876187572624, + "learning_rate": 7.004784688995216e-05, + "loss": 1.2063, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 0.4995513430917594, + "learning_rate": 7.023923444976077e-05, + "loss": 1.1249, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 0.38011883982869116, + "learning_rate": 7.043062200956938e-05, + "loss": 1.091, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 0.40185997614954744, + "learning_rate": 7.0622009569378e-05, + "loss": 1.2879, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 0.37530412149005404, + "learning_rate": 7.08133971291866e-05, + "loss": 1.2832, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 0.4067556360842582, + "learning_rate": 7.100478468899522e-05, + "loss": 1.0513, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 0.4450195271251879, + "learning_rate": 7.119617224880383e-05, + "loss": 1.1781, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 0.3878038365086055, + "learning_rate": 7.138755980861244e-05, + "loss": 1.2654, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 0.34902250016212555, + "learning_rate": 7.157894736842105e-05, + "loss": 1.0807, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 0.4789371923266489, + "learning_rate": 7.177033492822967e-05, + "loss": 1.0782, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 0.3443621112474482, + "learning_rate": 7.196172248803828e-05, + "loss": 1.0874, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 0.4354953964559702, + "learning_rate": 7.215311004784689e-05, + "loss": 1.1893, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 0.3806981757319183, + "learning_rate": 7.23444976076555e-05, + "loss": 1.1033, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 0.3640708591532273, + "learning_rate": 7.253588516746413e-05, + "loss": 1.1008, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 0.40487824027906655, + "learning_rate": 7.272727272727273e-05, + "loss": 1.2676, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 0.3974099352341619, + "learning_rate": 7.291866028708134e-05, + "loss": 1.1297, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 0.4412922873426402, + "learning_rate": 7.311004784688995e-05, + "loss": 1.2022, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 0.4177424000999219, + "learning_rate": 7.330143540669856e-05, + "loss": 1.0769, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 0.37843676467423115, + "learning_rate": 7.349282296650718e-05, + "loss": 1.1895, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 0.37904784029757954, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0346, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 0.3678229762318383, + "learning_rate": 7.387559808612442e-05, + "loss": 1.1415, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 0.44631509974989175, + "learning_rate": 7.406698564593301e-05, + "loss": 1.1593, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 0.4904633395013365, + "learning_rate": 7.425837320574163e-05, + "loss": 1.1944, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 0.37152475181241584, + "learning_rate": 7.444976076555025e-05, + "loss": 1.1698, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 0.4011204212131284, + "learning_rate": 7.464114832535885e-05, + "loss": 1.205, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 0.36761682195572815, + "learning_rate": 7.483253588516746e-05, + "loss": 1.2973, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 0.40374546939981903, + "learning_rate": 7.502392344497609e-05, + "loss": 1.1507, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 0.4255006771988514, + "learning_rate": 7.52153110047847e-05, + "loss": 1.1891, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 0.3865518214834786, + "learning_rate": 7.54066985645933e-05, + "loss": 1.185, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 0.41827717268535636, + "learning_rate": 7.559808612440191e-05, + "loss": 1.0622, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 0.37110879057589674, + "learning_rate": 7.578947368421054e-05, + "loss": 1.1381, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 0.46618966480421226, + "learning_rate": 7.598086124401914e-05, + "loss": 1.2105, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 0.4266096767917638, + "learning_rate": 7.617224880382775e-05, + "loss": 1.2304, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 0.3951434711007925, + "learning_rate": 7.636363636363637e-05, + "loss": 1.1194, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 0.5249468851294161, + "learning_rate": 7.655502392344497e-05, + "loss": 1.1557, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 0.35849242393679126, + "learning_rate": 7.674641148325359e-05, + "loss": 1.1586, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 0.4163074752217252, + "learning_rate": 7.693779904306221e-05, + "loss": 1.1884, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 0.39359654231287206, + "learning_rate": 7.712918660287082e-05, + "loss": 1.2162, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 0.40661594470681456, + "learning_rate": 7.732057416267942e-05, + "loss": 1.1578, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 0.4060870336417891, + "learning_rate": 7.751196172248805e-05, + "loss": 1.2462, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 0.34776732343372285, + "learning_rate": 7.770334928229666e-05, + "loss": 1.1907, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 0.37012929061225097, + "learning_rate": 7.789473684210526e-05, + "loss": 1.084, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 0.3191687282557576, + "learning_rate": 7.808612440191387e-05, + "loss": 1.1337, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 0.428711696888013, + "learning_rate": 7.82775119617225e-05, + "loss": 1.2142, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 0.5197761387762115, + "learning_rate": 7.846889952153111e-05, + "loss": 1.2915, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 0.41273170978967216, + "learning_rate": 7.866028708133971e-05, + "loss": 1.1028, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 0.3568549043871766, + "learning_rate": 7.885167464114833e-05, + "loss": 1.1575, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 0.43257689825178397, + "learning_rate": 7.904306220095695e-05, + "loss": 1.1115, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 0.4355392516104201, + "learning_rate": 7.923444976076554e-05, + "loss": 1.208, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 0.48359013518775035, + "learning_rate": 7.942583732057417e-05, + "loss": 1.1864, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 0.3730295702560985, + "learning_rate": 7.961722488038278e-05, + "loss": 1.1261, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 0.3966685187279031, + "learning_rate": 7.98086124401914e-05, + "loss": 1.2786, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 0.3786523559458125, + "learning_rate": 8e-05, + "loss": 1.1127, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 0.4039420784784596, + "learning_rate": 8.019138755980862e-05, + "loss": 1.1653, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 0.3836563304661635, + "learning_rate": 8.038277511961723e-05, + "loss": 1.1504, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 0.3915882211031514, + "learning_rate": 8.057416267942584e-05, + "loss": 1.1881, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 0.3585833381861096, + "learning_rate": 8.076555023923446e-05, + "loss": 1.1215, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 0.34093033582588217, + "learning_rate": 8.095693779904307e-05, + "loss": 1.1502, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 0.35015224800454925, + "learning_rate": 8.114832535885168e-05, + "loss": 1.1377, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 0.36225302300603957, + "learning_rate": 8.133971291866029e-05, + "loss": 1.146, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 0.401290151391233, + "learning_rate": 8.15311004784689e-05, + "loss": 1.1257, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 0.42354295033157774, + "learning_rate": 8.172248803827752e-05, + "loss": 1.1861, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 0.47531433461725525, + "learning_rate": 8.191387559808613e-05, + "loss": 1.2387, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 0.3956478442168383, + "learning_rate": 8.210526315789474e-05, + "loss": 1.2046, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 0.40300702532566746, + "learning_rate": 8.229665071770335e-05, + "loss": 1.2882, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 0.4247965626832039, + "learning_rate": 8.248803827751197e-05, + "loss": 1.2151, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 0.3667653514802216, + "learning_rate": 8.267942583732058e-05, + "loss": 1.2397, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 0.368289021497025, + "learning_rate": 8.287081339712919e-05, + "loss": 1.1336, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 0.35548517154282255, + "learning_rate": 8.30622009569378e-05, + "loss": 1.0854, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 0.43281064565162874, + "learning_rate": 8.325358851674641e-05, + "loss": 1.1418, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 0.36804478901968624, + "learning_rate": 8.344497607655503e-05, + "loss": 1.1754, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 0.3680855445618144, + "learning_rate": 8.363636363636364e-05, + "loss": 1.1909, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 0.38659927520583764, + "learning_rate": 8.382775119617225e-05, + "loss": 1.2076, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 0.341767677446713, + "learning_rate": 8.401913875598086e-05, + "loss": 1.0057, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 0.4256342728148254, + "learning_rate": 8.421052631578948e-05, + "loss": 1.2104, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 0.37216536384041526, + "learning_rate": 8.440191387559809e-05, + "loss": 1.1144, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 0.3791754076001631, + "learning_rate": 8.45933014354067e-05, + "loss": 1.2319, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 0.42421700255160355, + "learning_rate": 8.478468899521531e-05, + "loss": 1.1934, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 0.3678445305043515, + "learning_rate": 8.497607655502394e-05, + "loss": 1.1886, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 0.4219847133147515, + "learning_rate": 8.516746411483254e-05, + "loss": 1.1748, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 0.33966219534041503, + "learning_rate": 8.535885167464115e-05, + "loss": 1.1416, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 0.3426950270667347, + "learning_rate": 8.555023923444977e-05, + "loss": 1.2117, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 0.31268235060757094, + "learning_rate": 8.574162679425837e-05, + "loss": 1.1099, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 0.4283403832969395, + "learning_rate": 8.593301435406699e-05, + "loss": 1.1204, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 0.360981174023238, + "learning_rate": 8.612440191387561e-05, + "loss": 1.1203, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 0.31441146897954253, + "learning_rate": 8.631578947368421e-05, + "loss": 1.1703, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 0.6665116249955306, + "learning_rate": 8.650717703349282e-05, + "loss": 1.1965, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 0.34282354864583103, + "learning_rate": 8.669856459330143e-05, + "loss": 1.1108, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 0.3353048313959663, + "learning_rate": 8.688995215311006e-05, + "loss": 1.0893, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 0.3851891541705526, + "learning_rate": 8.708133971291866e-05, + "loss": 1.065, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 0.40582359225585646, + "learning_rate": 8.727272727272727e-05, + "loss": 1.1551, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 0.3310055383140587, + "learning_rate": 8.74641148325359e-05, + "loss": 1.1374, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 0.3697503368052755, + "learning_rate": 8.76555023923445e-05, + "loss": 1.0852, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 0.3494933513359935, + "learning_rate": 8.784688995215311e-05, + "loss": 1.1092, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 0.359799849220072, + "learning_rate": 8.803827751196173e-05, + "loss": 1.0291, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 0.3692901471865079, + "learning_rate": 8.822966507177035e-05, + "loss": 1.1516, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 0.31771285202360866, + "learning_rate": 8.842105263157894e-05, + "loss": 1.0414, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 0.39382351019262535, + "learning_rate": 8.861244019138757e-05, + "loss": 1.1987, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 0.37948154502600623, + "learning_rate": 8.880382775119618e-05, + "loss": 1.0162, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 0.4454086610740419, + "learning_rate": 8.899521531100478e-05, + "loss": 1.1686, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 0.3992597027171503, + "learning_rate": 8.91866028708134e-05, + "loss": 1.1067, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 0.402787251335542, + "learning_rate": 8.937799043062202e-05, + "loss": 1.1733, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 0.3772514735030477, + "learning_rate": 8.956937799043063e-05, + "loss": 1.1926, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 0.38379331799775773, + "learning_rate": 8.976076555023923e-05, + "loss": 1.2207, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.37849416076396786, + "learning_rate": 8.995215311004786e-05, + "loss": 1.158, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 0.40932072449271345, + "learning_rate": 9.014354066985647e-05, + "loss": 1.124, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 0.34313554427354404, + "learning_rate": 9.033492822966507e-05, + "loss": 1.2141, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 0.2960457574671995, + "learning_rate": 9.052631578947369e-05, + "loss": 1.1269, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 0.3364724543703671, + "learning_rate": 9.07177033492823e-05, + "loss": 1.0963, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 0.4102740455894671, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9836, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 0.3576390479541009, + "learning_rate": 9.110047846889953e-05, + "loss": 1.088, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 0.4063740081724684, + "learning_rate": 9.129186602870814e-05, + "loss": 1.0916, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 0.3866688811135483, + "learning_rate": 9.148325358851675e-05, + "loss": 1.1582, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 0.35233155736688976, + "learning_rate": 9.167464114832537e-05, + "loss": 1.2166, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 0.3523742613417453, + "learning_rate": 9.186602870813398e-05, + "loss": 1.1492, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 0.4091175614367992, + "learning_rate": 9.205741626794259e-05, + "loss": 1.2015, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 0.40461705008021104, + "learning_rate": 9.224880382775119e-05, + "loss": 1.1021, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 0.3555229450892527, + "learning_rate": 9.244019138755981e-05, + "loss": 1.1677, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 0.39219146946985217, + "learning_rate": 9.263157894736843e-05, + "loss": 1.1448, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 0.3959881230046531, + "learning_rate": 9.282296650717704e-05, + "loss": 1.1031, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 0.37437224905160343, + "learning_rate": 9.301435406698565e-05, + "loss": 1.2172, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 0.34627056702655357, + "learning_rate": 9.320574162679426e-05, + "loss": 1.0524, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 0.34958866976468206, + "learning_rate": 9.339712918660288e-05, + "loss": 1.1962, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 0.3592293846050586, + "learning_rate": 9.358851674641149e-05, + "loss": 1.1292, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 0.4512710733697596, + "learning_rate": 9.37799043062201e-05, + "loss": 1.1657, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 0.3630522232972779, + "learning_rate": 9.397129186602871e-05, + "loss": 1.1852, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 0.37647580929450847, + "learning_rate": 9.416267942583733e-05, + "loss": 1.1968, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 0.4947723206359184, + "learning_rate": 9.435406698564594e-05, + "loss": 1.1535, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 0.384481476371926, + "learning_rate": 9.454545454545455e-05, + "loss": 1.0762, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 0.30759408641071373, + "learning_rate": 9.473684210526316e-05, + "loss": 1.2025, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 0.39262382714536653, + "learning_rate": 9.492822966507177e-05, + "loss": 1.1072, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 0.3707577579487458, + "learning_rate": 9.511961722488039e-05, + "loss": 1.201, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 0.37259701318014815, + "learning_rate": 9.5311004784689e-05, + "loss": 1.1808, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 0.3899497483331902, + "learning_rate": 9.550239234449761e-05, + "loss": 1.155, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 0.3056736695030619, + "learning_rate": 9.569377990430622e-05, + "loss": 1.1485, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 0.3771828447592661, + "learning_rate": 9.588516746411484e-05, + "loss": 1.1475, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 0.3512506645855738, + "learning_rate": 9.607655502392345e-05, + "loss": 1.137, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 0.44787972025939427, + "learning_rate": 9.626794258373206e-05, + "loss": 1.1137, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 0.43899468578920847, + "learning_rate": 9.645933014354067e-05, + "loss": 1.198, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 0.37520028378057024, + "learning_rate": 9.66507177033493e-05, + "loss": 1.12, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 0.44437839309325783, + "learning_rate": 9.68421052631579e-05, + "loss": 1.1266, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 0.3713684887370264, + "learning_rate": 9.703349282296651e-05, + "loss": 1.0703, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 0.3785759236620265, + "learning_rate": 9.722488038277513e-05, + "loss": 1.1825, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 0.3949986518868616, + "learning_rate": 9.741626794258373e-05, + "loss": 1.205, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 0.32820866590924014, + "learning_rate": 9.760765550239235e-05, + "loss": 1.0473, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 0.3438055422000543, + "learning_rate": 9.779904306220096e-05, + "loss": 1.2331, + "step": 511 + }, + { + "epoch": 0.05, + "grad_norm": 0.4782013192332416, + "learning_rate": 9.799043062200958e-05, + "loss": 1.0641, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 0.37127997208806374, + "learning_rate": 9.818181818181818e-05, + "loss": 1.1557, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 0.37810255782547136, + "learning_rate": 9.83732057416268e-05, + "loss": 1.1545, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 0.4105466030008024, + "learning_rate": 9.856459330143542e-05, + "loss": 1.2272, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 0.4517743884986355, + "learning_rate": 9.875598086124402e-05, + "loss": 1.1229, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 0.399218104135688, + "learning_rate": 9.894736842105263e-05, + "loss": 1.1467, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 0.3693460754043359, + "learning_rate": 9.913875598086126e-05, + "loss": 1.169, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 0.3446333300820591, + "learning_rate": 9.933014354066987e-05, + "loss": 1.0783, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 0.41719266096581403, + "learning_rate": 9.952153110047847e-05, + "loss": 1.211, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 0.4139824733210239, + "learning_rate": 9.97129186602871e-05, + "loss": 1.2271, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 0.3284583647624778, + "learning_rate": 9.99043062200957e-05, + "loss": 1.1363, + "step": 522 + }, + { + "epoch": 0.05, + "grad_norm": 0.3743628377274405, + "learning_rate": 0.0001000956937799043, + "loss": 1.1254, + "step": 523 + }, + { + "epoch": 0.05, + "grad_norm": 0.4187480747840408, + "learning_rate": 0.00010028708133971292, + "loss": 1.1542, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 0.35701214246846424, + "learning_rate": 0.00010047846889952153, + "loss": 1.1318, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 0.39966360139311247, + "learning_rate": 0.00010066985645933015, + "loss": 1.0679, + "step": 526 + }, + { + "epoch": 0.05, + "grad_norm": 0.42408986083219885, + "learning_rate": 0.00010086124401913877, + "loss": 1.086, + "step": 527 + }, + { + "epoch": 0.05, + "grad_norm": 0.33931985700987544, + "learning_rate": 0.00010105263157894738, + "loss": 1.1684, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 0.33040989829144524, + "learning_rate": 0.00010124401913875599, + "loss": 1.1301, + "step": 529 + }, + { + "epoch": 0.05, + "grad_norm": 0.3404311052002135, + "learning_rate": 0.00010143540669856459, + "loss": 1.0848, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 0.34611637641008364, + "learning_rate": 0.0001016267942583732, + "loss": 1.1635, + "step": 531 + }, + { + "epoch": 0.05, + "grad_norm": 0.3672822694004909, + "learning_rate": 0.00010181818181818181, + "loss": 1.0811, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 0.34761252015363225, + "learning_rate": 0.00010200956937799044, + "loss": 1.1502, + "step": 533 + }, + { + "epoch": 0.05, + "grad_norm": 0.27503858553543464, + "learning_rate": 0.00010220095693779905, + "loss": 1.1257, + "step": 534 + }, + { + "epoch": 0.05, + "grad_norm": 0.3368407495501332, + "learning_rate": 0.00010239234449760766, + "loss": 1.1195, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 0.3448997570516004, + "learning_rate": 0.00010258373205741628, + "loss": 1.1955, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 0.4366845539188124, + "learning_rate": 0.00010277511961722488, + "loss": 1.1175, + "step": 537 + }, + { + "epoch": 0.05, + "grad_norm": 0.35681716286224935, + "learning_rate": 0.00010296650717703349, + "loss": 1.1577, + "step": 538 + }, + { + "epoch": 0.05, + "grad_norm": 0.3359937686441125, + "learning_rate": 0.00010315789473684211, + "loss": 1.1318, + "step": 539 + }, + { + "epoch": 0.05, + "grad_norm": 0.3398927803425864, + "learning_rate": 0.00010334928229665073, + "loss": 1.1278, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 0.38532827109393014, + "learning_rate": 0.00010354066985645934, + "loss": 1.1273, + "step": 541 + }, + { + "epoch": 0.05, + "grad_norm": 0.3740363062511281, + "learning_rate": 0.00010373205741626795, + "loss": 1.0253, + "step": 542 + }, + { + "epoch": 0.05, + "grad_norm": 0.34342518179762227, + "learning_rate": 0.00010392344497607656, + "loss": 1.1462, + "step": 543 + }, + { + "epoch": 0.05, + "grad_norm": 0.3590449087862375, + "learning_rate": 0.00010411483253588516, + "loss": 1.1392, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 0.37655451333728496, + "learning_rate": 0.00010430622009569377, + "loss": 1.1218, + "step": 545 + }, + { + "epoch": 0.05, + "grad_norm": 0.3336387102623628, + "learning_rate": 0.0001044976076555024, + "loss": 1.2268, + "step": 546 + }, + { + "epoch": 0.05, + "grad_norm": 0.38550698369605846, + "learning_rate": 0.00010468899521531101, + "loss": 1.1752, + "step": 547 + }, + { + "epoch": 0.05, + "grad_norm": 0.39228288833022, + "learning_rate": 0.00010488038277511962, + "loss": 1.2562, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 0.34746795623865406, + "learning_rate": 0.00010507177033492824, + "loss": 1.2464, + "step": 549 + }, + { + "epoch": 0.05, + "grad_norm": 0.350081344824654, + "learning_rate": 0.00010526315789473685, + "loss": 1.0909, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 0.35258812822080665, + "learning_rate": 0.00010545454545454545, + "loss": 1.1647, + "step": 551 + }, + { + "epoch": 0.05, + "grad_norm": 0.3894512796451852, + "learning_rate": 0.00010564593301435409, + "loss": 1.0903, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 0.3560681821041678, + "learning_rate": 0.00010583732057416268, + "loss": 1.0612, + "step": 553 + }, + { + "epoch": 0.05, + "grad_norm": 0.37166266505359585, + "learning_rate": 0.0001060287081339713, + "loss": 1.2101, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 0.36530860942069354, + "learning_rate": 0.00010622009569377991, + "loss": 1.044, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 0.33955376662887854, + "learning_rate": 0.00010641148325358852, + "loss": 1.093, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 0.31114780875878933, + "learning_rate": 0.00010660287081339712, + "loss": 1.146, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 0.3676757652032558, + "learning_rate": 0.00010679425837320576, + "loss": 1.0386, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 0.32774291097340136, + "learning_rate": 0.00010698564593301437, + "loss": 1.0935, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 0.33490784632156506, + "learning_rate": 0.00010717703349282297, + "loss": 1.1692, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 0.3502773499002359, + "learning_rate": 0.00010736842105263158, + "loss": 1.1153, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 0.36351558175076165, + "learning_rate": 0.0001075598086124402, + "loss": 1.0359, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 0.32317295912626365, + "learning_rate": 0.00010775119617224881, + "loss": 1.0597, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 0.37457465032706805, + "learning_rate": 0.0001079425837320574, + "loss": 1.2346, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 0.3230784156429463, + "learning_rate": 0.00010813397129186604, + "loss": 1.0292, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 0.29938140614694264, + "learning_rate": 0.00010832535885167466, + "loss": 1.068, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 0.37830783204331137, + "learning_rate": 0.00010851674641148326, + "loss": 1.1449, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 0.36419764707025626, + "learning_rate": 0.00010870813397129187, + "loss": 1.0981, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 0.3748154030309419, + "learning_rate": 0.00010889952153110048, + "loss": 1.2252, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 0.3368617151990764, + "learning_rate": 0.00010909090909090909, + "loss": 1.2124, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 0.37988770907164504, + "learning_rate": 0.00010928229665071772, + "loss": 1.2043, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 0.31449121687746323, + "learning_rate": 0.00010947368421052633, + "loss": 1.1775, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 0.3568124415520435, + "learning_rate": 0.00010966507177033494, + "loss": 1.1014, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 0.3673211031508777, + "learning_rate": 0.00010985645933014354, + "loss": 1.2182, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 0.34202187757429514, + "learning_rate": 0.00011004784688995215, + "loss": 1.0369, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 0.3905415136176411, + "learning_rate": 0.00011023923444976077, + "loss": 1.1507, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 0.3468401555240628, + "learning_rate": 0.00011043062200956938, + "loss": 1.1642, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 0.4192072995004005, + "learning_rate": 0.000110622009569378, + "loss": 1.164, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 0.379758035778582, + "learning_rate": 0.00011081339712918662, + "loss": 1.1536, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 0.41419578240684846, + "learning_rate": 0.00011100478468899523, + "loss": 1.0945, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 0.3250523681546178, + "learning_rate": 0.00011119617224880383, + "loss": 1.099, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 0.38315025894656307, + "learning_rate": 0.00011138755980861244, + "loss": 1.1642, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 0.30382004794249146, + "learning_rate": 0.00011157894736842105, + "loss": 1.252, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 0.36873995920664776, + "learning_rate": 0.00011177033492822968, + "loss": 1.1613, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 0.40209606862367175, + "learning_rate": 0.00011196172248803829, + "loss": 1.2187, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 0.39972092223451644, + "learning_rate": 0.0001121531100478469, + "loss": 1.1339, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 0.3581792841100828, + "learning_rate": 0.00011234449760765551, + "loss": 1.1467, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 0.33476559121109767, + "learning_rate": 0.00011253588516746411, + "loss": 1.1809, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 0.3450568464002908, + "learning_rate": 0.00011272727272727272, + "loss": 1.1955, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 0.30713056981495374, + "learning_rate": 0.00011291866028708134, + "loss": 1.153, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 0.40056590609091713, + "learning_rate": 0.00011311004784688996, + "loss": 1.0824, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 0.32202083458180564, + "learning_rate": 0.00011330143540669858, + "loss": 1.1039, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 0.28281909786135145, + "learning_rate": 0.00011349282296650719, + "loss": 1.1722, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 0.2930441788185507, + "learning_rate": 0.0001136842105263158, + "loss": 1.1902, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 0.3491373061731604, + "learning_rate": 0.0001138755980861244, + "loss": 1.1247, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 0.3110622779886572, + "learning_rate": 0.00011406698564593301, + "loss": 1.2289, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 0.34519065720613423, + "learning_rate": 0.00011425837320574164, + "loss": 1.1169, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 0.3066625621843041, + "learning_rate": 0.00011444976076555025, + "loss": 1.1645, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 0.32116731229953854, + "learning_rate": 0.00011464114832535886, + "loss": 1.0933, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 0.3511568531959789, + "learning_rate": 0.00011483253588516747, + "loss": 1.1087, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 0.32112239871920967, + "learning_rate": 0.00011502392344497607, + "loss": 1.1406, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 0.39367325401303266, + "learning_rate": 0.00011521531100478468, + "loss": 1.1545, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 0.3392107735520774, + "learning_rate": 0.0001154066985645933, + "loss": 1.1566, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 0.35936783606471423, + "learning_rate": 0.00011559808612440192, + "loss": 1.1135, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 0.3453223570806925, + "learning_rate": 0.00011578947368421053, + "loss": 1.1679, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 0.3988207022091826, + "learning_rate": 0.00011598086124401915, + "loss": 1.1266, + "step": 606 + }, + { + "epoch": 0.06, + "grad_norm": 0.35616581701014133, + "learning_rate": 0.00011617224880382776, + "loss": 1.0747, + "step": 607 + }, + { + "epoch": 0.06, + "grad_norm": 0.34856430848542924, + "learning_rate": 0.00011636363636363636, + "loss": 1.1737, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 0.39749502570874873, + "learning_rate": 0.00011655502392344497, + "loss": 1.1367, + "step": 609 + }, + { + "epoch": 0.06, + "grad_norm": 0.3817892480214725, + "learning_rate": 0.00011674641148325361, + "loss": 1.1423, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 0.37169774084550616, + "learning_rate": 0.00011693779904306221, + "loss": 1.2363, + "step": 611 + }, + { + "epoch": 0.06, + "grad_norm": 0.36680842275104286, + "learning_rate": 0.00011712918660287082, + "loss": 1.1137, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 0.30862259202802894, + "learning_rate": 0.00011732057416267943, + "loss": 1.1156, + "step": 613 + }, + { + "epoch": 0.06, + "grad_norm": 0.3381253043590406, + "learning_rate": 0.00011751196172248804, + "loss": 1.1913, + "step": 614 + }, + { + "epoch": 0.06, + "grad_norm": 0.35640646013161875, + "learning_rate": 0.00011770334928229664, + "loss": 1.2953, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 0.3180351478003401, + "learning_rate": 0.00011789473684210525, + "loss": 1.1151, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 0.2838039245590443, + "learning_rate": 0.0001180861244019139, + "loss": 1.1178, + "step": 617 + }, + { + "epoch": 0.06, + "grad_norm": 0.32734113884885613, + "learning_rate": 0.00011827751196172249, + "loss": 1.1095, + "step": 618 + }, + { + "epoch": 0.06, + "grad_norm": 0.4976044214884747, + "learning_rate": 0.0001184688995215311, + "loss": 1.1425, + "step": 619 + }, + { + "epoch": 0.06, + "grad_norm": 0.32042584160221055, + "learning_rate": 0.00011866028708133972, + "loss": 1.0799, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 0.3258958466495425, + "learning_rate": 0.00011885167464114833, + "loss": 1.221, + "step": 621 + }, + { + "epoch": 0.06, + "grad_norm": 0.3112060174372619, + "learning_rate": 0.00011904306220095693, + "loss": 1.2115, + "step": 622 + }, + { + "epoch": 0.06, + "grad_norm": 0.30118176636144206, + "learning_rate": 0.00011923444976076557, + "loss": 1.1361, + "step": 623 + }, + { + "epoch": 0.06, + "grad_norm": 0.3361051300196263, + "learning_rate": 0.00011942583732057418, + "loss": 1.191, + "step": 624 + }, + { + "epoch": 0.06, + "grad_norm": 0.2931267938868614, + "learning_rate": 0.00011961722488038278, + "loss": 1.0851, + "step": 625 + }, + { + "epoch": 0.06, + "grad_norm": 0.32113885088697364, + "learning_rate": 0.00011980861244019139, + "loss": 1.1867, + "step": 626 + }, + { + "epoch": 0.06, + "grad_norm": 0.34101527118946584, + "learning_rate": 0.00012, + "loss": 1.1836, + "step": 627 + }, + { + "epoch": 0.06, + "grad_norm": 0.27834089475655605, + "learning_rate": 0.00012019138755980862, + "loss": 1.2618, + "step": 628 + }, + { + "epoch": 0.06, + "grad_norm": 0.3689617429853991, + "learning_rate": 0.00012038277511961724, + "loss": 1.1289, + "step": 629 + }, + { + "epoch": 0.06, + "grad_norm": 0.32141982153995574, + "learning_rate": 0.00012057416267942585, + "loss": 1.245, + "step": 630 + }, + { + "epoch": 0.06, + "grad_norm": 0.3577846092529067, + "learning_rate": 0.00012076555023923447, + "loss": 1.1214, + "step": 631 + }, + { + "epoch": 0.06, + "grad_norm": 0.3547093306453314, + "learning_rate": 0.00012095693779904306, + "loss": 1.1151, + "step": 632 + }, + { + "epoch": 0.06, + "grad_norm": 0.3235863986240357, + "learning_rate": 0.00012114832535885168, + "loss": 1.1431, + "step": 633 + }, + { + "epoch": 0.06, + "grad_norm": 0.3315145079366769, + "learning_rate": 0.00012133971291866029, + "loss": 1.1589, + "step": 634 + }, + { + "epoch": 0.06, + "grad_norm": 0.44616643435072917, + "learning_rate": 0.0001215311004784689, + "loss": 1.1899, + "step": 635 + }, + { + "epoch": 0.06, + "grad_norm": 0.34695172958733533, + "learning_rate": 0.00012172248803827753, + "loss": 1.2423, + "step": 636 + }, + { + "epoch": 0.06, + "grad_norm": 0.32832699890038897, + "learning_rate": 0.00012191387559808614, + "loss": 1.0669, + "step": 637 + }, + { + "epoch": 0.06, + "grad_norm": 0.34250522043183074, + "learning_rate": 0.00012210526315789474, + "loss": 1.1031, + "step": 638 + }, + { + "epoch": 0.06, + "grad_norm": 0.3536036436534466, + "learning_rate": 0.00012229665071770336, + "loss": 1.1833, + "step": 639 + }, + { + "epoch": 0.06, + "grad_norm": 0.38952860754869895, + "learning_rate": 0.00012248803827751196, + "loss": 1.2269, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 0.38372067281414196, + "learning_rate": 0.00012267942583732056, + "loss": 1.1696, + "step": 641 + }, + { + "epoch": 0.06, + "grad_norm": 0.36604610363956575, + "learning_rate": 0.00012287081339712921, + "loss": 1.143, + "step": 642 + }, + { + "epoch": 0.06, + "grad_norm": 0.29457638902628325, + "learning_rate": 0.0001230622009569378, + "loss": 1.1118, + "step": 643 + }, + { + "epoch": 0.06, + "grad_norm": 0.31971053516113995, + "learning_rate": 0.0001232535885167464, + "loss": 1.2257, + "step": 644 + }, + { + "epoch": 0.06, + "grad_norm": 0.3479548829091419, + "learning_rate": 0.00012344497607655504, + "loss": 1.2003, + "step": 645 + }, + { + "epoch": 0.06, + "grad_norm": 0.39895729607686864, + "learning_rate": 0.00012363636363636364, + "loss": 1.2362, + "step": 646 + }, + { + "epoch": 0.06, + "grad_norm": 0.32542242184693576, + "learning_rate": 0.00012382775119617226, + "loss": 1.1924, + "step": 647 + }, + { + "epoch": 0.06, + "grad_norm": 0.37946269414290873, + "learning_rate": 0.00012401913875598086, + "loss": 1.2259, + "step": 648 + }, + { + "epoch": 0.06, + "grad_norm": 0.37785595963877666, + "learning_rate": 0.00012421052631578949, + "loss": 1.252, + "step": 649 + }, + { + "epoch": 0.06, + "grad_norm": 0.39908004119966145, + "learning_rate": 0.00012440191387559808, + "loss": 1.1444, + "step": 650 + }, + { + "epoch": 0.06, + "grad_norm": 0.38865614676807153, + "learning_rate": 0.0001245933014354067, + "loss": 1.2314, + "step": 651 + }, + { + "epoch": 0.06, + "grad_norm": 0.3204842742106689, + "learning_rate": 0.0001247846889952153, + "loss": 1.0824, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 0.35369352498295387, + "learning_rate": 0.00012497607655502393, + "loss": 1.0264, + "step": 653 + }, + { + "epoch": 0.06, + "grad_norm": 0.3305618992525529, + "learning_rate": 0.00012516746411483253, + "loss": 1.1012, + "step": 654 + }, + { + "epoch": 0.06, + "grad_norm": 0.3757616845139893, + "learning_rate": 0.00012535885167464116, + "loss": 1.2916, + "step": 655 + }, + { + "epoch": 0.06, + "grad_norm": 0.32567276622705355, + "learning_rate": 0.00012555023923444978, + "loss": 1.218, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 0.30320222866051544, + "learning_rate": 0.00012574162679425838, + "loss": 1.0551, + "step": 657 + }, + { + "epoch": 0.06, + "grad_norm": 0.47041450898052456, + "learning_rate": 0.00012593301435406698, + "loss": 1.2101, + "step": 658 + }, + { + "epoch": 0.06, + "grad_norm": 0.3265512828583142, + "learning_rate": 0.0001261244019138756, + "loss": 1.0902, + "step": 659 + }, + { + "epoch": 0.06, + "grad_norm": 0.3095200661644063, + "learning_rate": 0.0001263157894736842, + "loss": 1.2483, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 0.39865694866961127, + "learning_rate": 0.0001265071770334928, + "loss": 1.0507, + "step": 661 + }, + { + "epoch": 0.06, + "grad_norm": 0.35606148522081404, + "learning_rate": 0.00012669856459330146, + "loss": 1.0753, + "step": 662 + }, + { + "epoch": 0.06, + "grad_norm": 0.3445720816931114, + "learning_rate": 0.00012688995215311006, + "loss": 1.143, + "step": 663 + }, + { + "epoch": 0.06, + "grad_norm": 0.3677160012348687, + "learning_rate": 0.00012708133971291866, + "loss": 1.0608, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 0.2980120877326159, + "learning_rate": 0.00012727272727272728, + "loss": 1.0872, + "step": 665 + }, + { + "epoch": 0.06, + "grad_norm": 0.2896118505469009, + "learning_rate": 0.00012746411483253588, + "loss": 1.1485, + "step": 666 + }, + { + "epoch": 0.06, + "grad_norm": 0.3350125319603418, + "learning_rate": 0.0001276555023923445, + "loss": 1.1236, + "step": 667 + }, + { + "epoch": 0.06, + "grad_norm": 0.3517268797460554, + "learning_rate": 0.00012784688995215313, + "loss": 1.1378, + "step": 668 + }, + { + "epoch": 0.06, + "grad_norm": 0.4220707921759215, + "learning_rate": 0.00012803827751196173, + "loss": 1.1656, + "step": 669 + }, + { + "epoch": 0.06, + "grad_norm": 0.3098050517214006, + "learning_rate": 0.00012822966507177036, + "loss": 1.0732, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 0.38036416406983276, + "learning_rate": 0.00012842105263157895, + "loss": 1.1597, + "step": 671 + }, + { + "epoch": 0.06, + "grad_norm": 0.32201151129472433, + "learning_rate": 0.00012861244019138755, + "loss": 1.1557, + "step": 672 + }, + { + "epoch": 0.06, + "grad_norm": 0.3477368553208273, + "learning_rate": 0.00012880382775119618, + "loss": 1.093, + "step": 673 + }, + { + "epoch": 0.06, + "grad_norm": 0.33206153473346633, + "learning_rate": 0.00012899521531100478, + "loss": 1.0872, + "step": 674 + }, + { + "epoch": 0.06, + "grad_norm": 0.3797973671348287, + "learning_rate": 0.0001291866028708134, + "loss": 1.1932, + "step": 675 + }, + { + "epoch": 0.06, + "grad_norm": 0.38021465107794655, + "learning_rate": 0.00012937799043062203, + "loss": 1.2037, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 0.40680529142131094, + "learning_rate": 0.00012956937799043063, + "loss": 1.13, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 0.3662653154346482, + "learning_rate": 0.00012976076555023923, + "loss": 1.137, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 0.389523749301837, + "learning_rate": 0.00012995215311004785, + "loss": 1.1714, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 0.33672031522727297, + "learning_rate": 0.00013014354066985645, + "loss": 1.08, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 0.3259489924855725, + "learning_rate": 0.00013033492822966508, + "loss": 1.1592, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 0.4310205006695421, + "learning_rate": 0.0001305263157894737, + "loss": 1.1125, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 0.3354208256542673, + "learning_rate": 0.0001307177033492823, + "loss": 1.1612, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 0.33633246177327786, + "learning_rate": 0.00013090909090909093, + "loss": 1.1075, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 0.31028161222188255, + "learning_rate": 0.00013110047846889953, + "loss": 1.1152, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 0.4524180110599837, + "learning_rate": 0.00013129186602870812, + "loss": 1.1259, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 0.3742481866033862, + "learning_rate": 0.00013148325358851675, + "loss": 1.2494, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 0.3619012526518613, + "learning_rate": 0.00013167464114832538, + "loss": 1.101, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 0.32394020898287806, + "learning_rate": 0.00013186602870813397, + "loss": 1.1392, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 0.33391715304609637, + "learning_rate": 0.0001320574162679426, + "loss": 1.1594, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 0.28785180948649514, + "learning_rate": 0.0001322488038277512, + "loss": 1.0496, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 0.38088138786466363, + "learning_rate": 0.0001324401913875598, + "loss": 1.065, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 0.3208927040153503, + "learning_rate": 0.00013263157894736842, + "loss": 1.1636, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 0.3033641718971694, + "learning_rate": 0.00013282296650717705, + "loss": 1.1452, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 0.2948469058966827, + "learning_rate": 0.00013301435406698565, + "loss": 1.1589, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 0.30477569159510964, + "learning_rate": 0.00013320574162679427, + "loss": 1.181, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 0.3410300113129024, + "learning_rate": 0.00013339712918660287, + "loss": 1.1628, + "step": 697 + }, + { + "epoch": 0.07, + "grad_norm": 0.3058561315902832, + "learning_rate": 0.0001335885167464115, + "loss": 1.084, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 0.34051994364928995, + "learning_rate": 0.0001337799043062201, + "loss": 1.0576, + "step": 699 + }, + { + "epoch": 0.07, + "grad_norm": 0.3096222099295184, + "learning_rate": 0.00013397129186602872, + "loss": 1.0741, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 0.28113955153682396, + "learning_rate": 0.00013416267942583732, + "loss": 1.1019, + "step": 701 + }, + { + "epoch": 0.07, + "grad_norm": 0.2956254577619277, + "learning_rate": 0.00013435406698564595, + "loss": 1.1044, + "step": 702 + }, + { + "epoch": 0.07, + "grad_norm": 0.31157316700478505, + "learning_rate": 0.00013454545454545455, + "loss": 1.1694, + "step": 703 + }, + { + "epoch": 0.07, + "grad_norm": 0.3072975606896904, + "learning_rate": 0.00013473684210526317, + "loss": 1.1256, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 0.35422315692846823, + "learning_rate": 0.00013492822966507177, + "loss": 1.202, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 0.3039926886966394, + "learning_rate": 0.00013511961722488037, + "loss": 1.1784, + "step": 706 + }, + { + "epoch": 0.07, + "grad_norm": 0.272337523028655, + "learning_rate": 0.00013531100478468902, + "loss": 1.126, + "step": 707 + }, + { + "epoch": 0.07, + "grad_norm": 0.3396634306821353, + "learning_rate": 0.00013550239234449762, + "loss": 1.1249, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 0.32221861659032364, + "learning_rate": 0.00013569377990430622, + "loss": 1.1312, + "step": 709 + }, + { + "epoch": 0.07, + "grad_norm": 0.30678629631733856, + "learning_rate": 0.00013588516746411485, + "loss": 1.1462, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 0.331787225449244, + "learning_rate": 0.00013607655502392344, + "loss": 1.1713, + "step": 711 + }, + { + "epoch": 0.07, + "grad_norm": 0.2617883116352453, + "learning_rate": 0.00013626794258373204, + "loss": 1.1893, + "step": 712 + }, + { + "epoch": 0.07, + "grad_norm": 0.34346153319515627, + "learning_rate": 0.0001364593301435407, + "loss": 1.1891, + "step": 713 + }, + { + "epoch": 0.07, + "grad_norm": 0.3606280112508664, + "learning_rate": 0.0001366507177033493, + "loss": 1.3581, + "step": 714 + }, + { + "epoch": 0.07, + "grad_norm": 0.30602332471532506, + "learning_rate": 0.0001368421052631579, + "loss": 1.2075, + "step": 715 + }, + { + "epoch": 0.07, + "grad_norm": 0.2966643308304905, + "learning_rate": 0.00013703349282296652, + "loss": 1.1248, + "step": 716 + }, + { + "epoch": 0.07, + "grad_norm": 0.3692677324708085, + "learning_rate": 0.00013722488038277512, + "loss": 1.1325, + "step": 717 + }, + { + "epoch": 0.07, + "grad_norm": 0.3153436398786279, + "learning_rate": 0.00013741626794258374, + "loss": 1.141, + "step": 718 + }, + { + "epoch": 0.07, + "grad_norm": 0.2993620796785782, + "learning_rate": 0.00013760765550239234, + "loss": 1.151, + "step": 719 + }, + { + "epoch": 0.07, + "grad_norm": 0.360199053324579, + "learning_rate": 0.00013779904306220097, + "loss": 1.1671, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 0.34616040756962774, + "learning_rate": 0.0001379904306220096, + "loss": 1.1314, + "step": 721 + }, + { + "epoch": 0.07, + "grad_norm": 0.32093543405134595, + "learning_rate": 0.0001381818181818182, + "loss": 1.1017, + "step": 722 + }, + { + "epoch": 0.07, + "grad_norm": 0.3072115942434, + "learning_rate": 0.0001383732057416268, + "loss": 1.2032, + "step": 723 + }, + { + "epoch": 0.07, + "grad_norm": 0.33680085828062, + "learning_rate": 0.00013856459330143542, + "loss": 1.191, + "step": 724 + }, + { + "epoch": 0.07, + "grad_norm": 0.27852291513017413, + "learning_rate": 0.00013875598086124402, + "loss": 1.2035, + "step": 725 + }, + { + "epoch": 0.07, + "grad_norm": 0.3269080941652961, + "learning_rate": 0.00013894736842105264, + "loss": 1.1417, + "step": 726 + }, + { + "epoch": 0.07, + "grad_norm": 0.2911551586198448, + "learning_rate": 0.00013913875598086127, + "loss": 1.2055, + "step": 727 + }, + { + "epoch": 0.07, + "grad_norm": 0.3478754660709439, + "learning_rate": 0.00013933014354066987, + "loss": 1.1967, + "step": 728 + }, + { + "epoch": 0.07, + "grad_norm": 0.31136552748186935, + "learning_rate": 0.00013952153110047846, + "loss": 1.1666, + "step": 729 + }, + { + "epoch": 0.07, + "grad_norm": 0.29853571203421375, + "learning_rate": 0.0001397129186602871, + "loss": 1.1213, + "step": 730 + }, + { + "epoch": 0.07, + "grad_norm": 0.29830710212889877, + "learning_rate": 0.0001399043062200957, + "loss": 1.2283, + "step": 731 + }, + { + "epoch": 0.07, + "grad_norm": 0.2988658722078936, + "learning_rate": 0.00014009569377990431, + "loss": 1.2391, + "step": 732 + }, + { + "epoch": 0.07, + "grad_norm": 0.3453871024753651, + "learning_rate": 0.00014028708133971294, + "loss": 1.2111, + "step": 733 + }, + { + "epoch": 0.07, + "grad_norm": 0.3495768486847103, + "learning_rate": 0.00014047846889952154, + "loss": 1.1027, + "step": 734 + }, + { + "epoch": 0.07, + "grad_norm": 0.3164024002188871, + "learning_rate": 0.00014066985645933016, + "loss": 1.1015, + "step": 735 + }, + { + "epoch": 0.07, + "grad_norm": 0.3044139633248399, + "learning_rate": 0.00014086124401913876, + "loss": 1.0509, + "step": 736 + }, + { + "epoch": 0.07, + "grad_norm": 0.281079329494108, + "learning_rate": 0.00014105263157894736, + "loss": 1.0826, + "step": 737 + }, + { + "epoch": 0.07, + "grad_norm": 0.30636375000054217, + "learning_rate": 0.000141244019138756, + "loss": 1.2202, + "step": 738 + }, + { + "epoch": 0.07, + "grad_norm": 0.29149180784544115, + "learning_rate": 0.0001414354066985646, + "loss": 1.1551, + "step": 739 + }, + { + "epoch": 0.07, + "grad_norm": 0.3073819307679817, + "learning_rate": 0.0001416267942583732, + "loss": 1.2248, + "step": 740 + }, + { + "epoch": 0.07, + "grad_norm": 0.3217985704338287, + "learning_rate": 0.00014181818181818184, + "loss": 1.2045, + "step": 741 + }, + { + "epoch": 0.07, + "grad_norm": 0.3369269229369114, + "learning_rate": 0.00014200956937799044, + "loss": 1.1671, + "step": 742 + }, + { + "epoch": 0.07, + "grad_norm": 0.2981155510935532, + "learning_rate": 0.00014220095693779904, + "loss": 1.1354, + "step": 743 + }, + { + "epoch": 0.07, + "grad_norm": 0.3002935893022973, + "learning_rate": 0.00014239234449760766, + "loss": 1.0369, + "step": 744 + }, + { + "epoch": 0.07, + "grad_norm": 0.3061234355072447, + "learning_rate": 0.00014258373205741626, + "loss": 1.0122, + "step": 745 + }, + { + "epoch": 0.07, + "grad_norm": 0.28759317860073835, + "learning_rate": 0.00014277511961722489, + "loss": 1.0997, + "step": 746 + }, + { + "epoch": 0.07, + "grad_norm": 0.3064828735905134, + "learning_rate": 0.0001429665071770335, + "loss": 1.1225, + "step": 747 + }, + { + "epoch": 0.07, + "grad_norm": 0.3604086247045263, + "learning_rate": 0.0001431578947368421, + "loss": 1.1635, + "step": 748 + }, + { + "epoch": 0.07, + "grad_norm": 0.2914789050629064, + "learning_rate": 0.0001433492822966507, + "loss": 1.1704, + "step": 749 + }, + { + "epoch": 0.07, + "grad_norm": 0.3105462532363453, + "learning_rate": 0.00014354066985645933, + "loss": 1.144, + "step": 750 + }, + { + "epoch": 0.07, + "grad_norm": 0.29242484393022483, + "learning_rate": 0.00014373205741626793, + "loss": 1.0991, + "step": 751 + }, + { + "epoch": 0.07, + "grad_norm": 0.3009843941043775, + "learning_rate": 0.00014392344497607656, + "loss": 1.1409, + "step": 752 + }, + { + "epoch": 0.07, + "grad_norm": 0.35368948528183997, + "learning_rate": 0.00014411483253588518, + "loss": 1.0839, + "step": 753 + }, + { + "epoch": 0.07, + "grad_norm": 0.2908104621427735, + "learning_rate": 0.00014430622009569378, + "loss": 1.0997, + "step": 754 + }, + { + "epoch": 0.07, + "grad_norm": 0.2761031765983028, + "learning_rate": 0.0001444976076555024, + "loss": 1.0389, + "step": 755 + }, + { + "epoch": 0.07, + "grad_norm": 0.34458640320872364, + "learning_rate": 0.000144688995215311, + "loss": 1.0666, + "step": 756 + }, + { + "epoch": 0.07, + "grad_norm": 0.3426791854461418, + "learning_rate": 0.0001448803827751196, + "loss": 1.0227, + "step": 757 + }, + { + "epoch": 0.07, + "grad_norm": 0.33484757936373594, + "learning_rate": 0.00014507177033492826, + "loss": 1.1276, + "step": 758 + }, + { + "epoch": 0.07, + "grad_norm": 0.31113235116750904, + "learning_rate": 0.00014526315789473686, + "loss": 1.1435, + "step": 759 + }, + { + "epoch": 0.07, + "grad_norm": 0.3187397115174627, + "learning_rate": 0.00014545454545454546, + "loss": 1.2091, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 0.322859636752832, + "learning_rate": 0.00014564593301435408, + "loss": 1.1463, + "step": 761 + }, + { + "epoch": 0.07, + "grad_norm": 0.3054079698873811, + "learning_rate": 0.00014583732057416268, + "loss": 1.1532, + "step": 762 + }, + { + "epoch": 0.07, + "grad_norm": 0.37827200804472255, + "learning_rate": 0.00014602870813397128, + "loss": 1.1536, + "step": 763 + }, + { + "epoch": 0.07, + "grad_norm": 0.33688004627148077, + "learning_rate": 0.0001462200956937799, + "loss": 1.1633, + "step": 764 + }, + { + "epoch": 0.07, + "grad_norm": 0.3057781806456222, + "learning_rate": 0.00014641148325358853, + "loss": 1.1336, + "step": 765 + }, + { + "epoch": 0.07, + "grad_norm": 0.3214472678202446, + "learning_rate": 0.00014660287081339713, + "loss": 1.178, + "step": 766 + }, + { + "epoch": 0.07, + "grad_norm": 0.3615283182183831, + "learning_rate": 0.00014679425837320576, + "loss": 1.1158, + "step": 767 + }, + { + "epoch": 0.07, + "grad_norm": 0.3147571028922824, + "learning_rate": 0.00014698564593301435, + "loss": 1.131, + "step": 768 + }, + { + "epoch": 0.07, + "grad_norm": 0.269304950091198, + "learning_rate": 0.00014717703349282298, + "loss": 1.1885, + "step": 769 + }, + { + "epoch": 0.07, + "grad_norm": 0.2817004499058875, + "learning_rate": 0.00014736842105263158, + "loss": 1.0634, + "step": 770 + }, + { + "epoch": 0.07, + "grad_norm": 0.34677018154047495, + "learning_rate": 0.0001475598086124402, + "loss": 1.2329, + "step": 771 + }, + { + "epoch": 0.07, + "grad_norm": 0.33187657643162116, + "learning_rate": 0.00014775119617224883, + "loss": 1.0673, + "step": 772 + }, + { + "epoch": 0.07, + "grad_norm": 0.33397275501257906, + "learning_rate": 0.00014794258373205743, + "loss": 1.0975, + "step": 773 + }, + { + "epoch": 0.07, + "grad_norm": 0.2977218730080119, + "learning_rate": 0.00014813397129186603, + "loss": 1.0581, + "step": 774 + }, + { + "epoch": 0.07, + "grad_norm": 0.3480266756123412, + "learning_rate": 0.00014832535885167465, + "loss": 1.2395, + "step": 775 + }, + { + "epoch": 0.07, + "grad_norm": 0.33681513429762355, + "learning_rate": 0.00014851674641148325, + "loss": 1.1306, + "step": 776 + }, + { + "epoch": 0.07, + "grad_norm": 0.31749818370425387, + "learning_rate": 0.00014870813397129185, + "loss": 1.0901, + "step": 777 + }, + { + "epoch": 0.07, + "grad_norm": 0.33455138564966774, + "learning_rate": 0.0001488995215311005, + "loss": 1.2032, + "step": 778 + }, + { + "epoch": 0.07, + "grad_norm": 0.3504419380990198, + "learning_rate": 0.0001490909090909091, + "loss": 1.0602, + "step": 779 + }, + { + "epoch": 0.07, + "grad_norm": 0.3023880222584541, + "learning_rate": 0.0001492822966507177, + "loss": 1.1374, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 0.5469036927255182, + "learning_rate": 0.00014947368421052633, + "loss": 1.1802, + "step": 781 + }, + { + "epoch": 0.07, + "grad_norm": 0.308842167335779, + "learning_rate": 0.00014966507177033493, + "loss": 1.0936, + "step": 782 + }, + { + "epoch": 0.07, + "grad_norm": 0.2986359721179498, + "learning_rate": 0.00014985645933014355, + "loss": 1.1307, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 0.31664348432490785, + "learning_rate": 0.00015004784688995218, + "loss": 1.2007, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 0.265541663614485, + "learning_rate": 0.00015023923444976078, + "loss": 1.1276, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 0.3204030653032886, + "learning_rate": 0.0001504306220095694, + "loss": 1.1439, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 0.2783784466928858, + "learning_rate": 0.000150622009569378, + "loss": 1.2329, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 0.30216230746794037, + "learning_rate": 0.0001508133971291866, + "loss": 1.1853, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 0.3085281413718923, + "learning_rate": 0.00015100478468899522, + "loss": 1.1631, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 0.3221875710439296, + "learning_rate": 0.00015119617224880382, + "loss": 1.0776, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 0.2846073440656778, + "learning_rate": 0.00015138755980861245, + "loss": 1.1563, + "step": 791 + }, + { + "epoch": 0.08, + "grad_norm": 0.26550401235533877, + "learning_rate": 0.00015157894736842108, + "loss": 1.0467, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 0.37881120245858113, + "learning_rate": 0.00015177033492822967, + "loss": 1.1699, + "step": 793 + }, + { + "epoch": 0.08, + "grad_norm": 0.33594596707038277, + "learning_rate": 0.00015196172248803827, + "loss": 1.1653, + "step": 794 + }, + { + "epoch": 0.08, + "grad_norm": 0.3233311908095246, + "learning_rate": 0.0001521531100478469, + "loss": 1.1507, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 0.267677768320064, + "learning_rate": 0.0001523444976076555, + "loss": 1.1356, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 0.29484155965355746, + "learning_rate": 0.00015253588516746412, + "loss": 1.0681, + "step": 797 + }, + { + "epoch": 0.08, + "grad_norm": 0.3284519128368135, + "learning_rate": 0.00015272727272727275, + "loss": 1.0024, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 0.28701234783478413, + "learning_rate": 0.00015291866028708135, + "loss": 1.0892, + "step": 799 + }, + { + "epoch": 0.08, + "grad_norm": 0.31864235047065265, + "learning_rate": 0.00015311004784688995, + "loss": 1.1727, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 0.2745618500591329, + "learning_rate": 0.00015330143540669857, + "loss": 1.0224, + "step": 801 + }, + { + "epoch": 0.08, + "grad_norm": 0.30047657670046785, + "learning_rate": 0.00015349282296650717, + "loss": 1.1017, + "step": 802 + }, + { + "epoch": 0.08, + "grad_norm": 0.2914367942025512, + "learning_rate": 0.0001536842105263158, + "loss": 1.0163, + "step": 803 + }, + { + "epoch": 0.08, + "grad_norm": 0.2816221100141218, + "learning_rate": 0.00015387559808612442, + "loss": 1.1372, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 0.34419073166680986, + "learning_rate": 0.00015406698564593302, + "loss": 1.1991, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 0.2888432316245811, + "learning_rate": 0.00015425837320574165, + "loss": 1.1627, + "step": 806 + }, + { + "epoch": 0.08, + "grad_norm": 0.3191302154072048, + "learning_rate": 0.00015444976076555024, + "loss": 1.2458, + "step": 807 + }, + { + "epoch": 0.08, + "grad_norm": 0.2727293598902053, + "learning_rate": 0.00015464114832535884, + "loss": 1.1085, + "step": 808 + }, + { + "epoch": 0.08, + "grad_norm": 0.3029996217533104, + "learning_rate": 0.00015483253588516747, + "loss": 1.1319, + "step": 809 + }, + { + "epoch": 0.08, + "grad_norm": 0.2874429714766323, + "learning_rate": 0.0001550239234449761, + "loss": 1.1123, + "step": 810 + }, + { + "epoch": 0.08, + "grad_norm": 0.2665739546686572, + "learning_rate": 0.0001552153110047847, + "loss": 1.143, + "step": 811 + }, + { + "epoch": 0.08, + "grad_norm": 0.2958054625397739, + "learning_rate": 0.00015540669856459332, + "loss": 1.135, + "step": 812 + }, + { + "epoch": 0.08, + "grad_norm": 0.255412895550101, + "learning_rate": 0.00015559808612440192, + "loss": 1.1859, + "step": 813 + }, + { + "epoch": 0.08, + "grad_norm": 0.31547097343732156, + "learning_rate": 0.00015578947368421052, + "loss": 1.1657, + "step": 814 + }, + { + "epoch": 0.08, + "grad_norm": 0.34867394487181774, + "learning_rate": 0.00015598086124401914, + "loss": 1.0777, + "step": 815 + }, + { + "epoch": 0.08, + "grad_norm": 0.2921678038171233, + "learning_rate": 0.00015617224880382774, + "loss": 1.0522, + "step": 816 + }, + { + "epoch": 0.08, + "grad_norm": 0.28919534963089716, + "learning_rate": 0.00015636363636363637, + "loss": 1.0779, + "step": 817 + }, + { + "epoch": 0.08, + "grad_norm": 0.3114505303709412, + "learning_rate": 0.000156555023923445, + "loss": 1.1119, + "step": 818 + }, + { + "epoch": 0.08, + "grad_norm": 0.3065682691442617, + "learning_rate": 0.0001567464114832536, + "loss": 1.1352, + "step": 819 + }, + { + "epoch": 0.08, + "grad_norm": 0.3614199195441891, + "learning_rate": 0.00015693779904306222, + "loss": 1.1612, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 0.3101608207788147, + "learning_rate": 0.00015712918660287082, + "loss": 1.252, + "step": 821 + }, + { + "epoch": 0.08, + "grad_norm": 0.2975075722366304, + "learning_rate": 0.00015732057416267941, + "loss": 1.0687, + "step": 822 + }, + { + "epoch": 0.08, + "grad_norm": 0.2664048730695144, + "learning_rate": 0.00015751196172248807, + "loss": 1.0832, + "step": 823 + }, + { + "epoch": 0.08, + "grad_norm": 0.2952527620974602, + "learning_rate": 0.00015770334928229667, + "loss": 1.1082, + "step": 824 + }, + { + "epoch": 0.08, + "grad_norm": 0.3295245234429144, + "learning_rate": 0.00015789473684210527, + "loss": 1.162, + "step": 825 + }, + { + "epoch": 0.08, + "grad_norm": 0.3102397113238992, + "learning_rate": 0.0001580861244019139, + "loss": 1.0738, + "step": 826 + }, + { + "epoch": 0.08, + "grad_norm": 0.2693269386909286, + "learning_rate": 0.0001582775119617225, + "loss": 1.028, + "step": 827 + }, + { + "epoch": 0.08, + "grad_norm": 0.3669225930993825, + "learning_rate": 0.0001584688995215311, + "loss": 1.15, + "step": 828 + }, + { + "epoch": 0.08, + "grad_norm": 0.29318593683220057, + "learning_rate": 0.00015866028708133974, + "loss": 1.2408, + "step": 829 + }, + { + "epoch": 0.08, + "grad_norm": 0.2894248048442511, + "learning_rate": 0.00015885167464114834, + "loss": 1.1752, + "step": 830 + }, + { + "epoch": 0.08, + "grad_norm": 0.29125115224083087, + "learning_rate": 0.00015904306220095694, + "loss": 0.9574, + "step": 831 + }, + { + "epoch": 0.08, + "grad_norm": 0.40878890760263803, + "learning_rate": 0.00015923444976076556, + "loss": 1.2104, + "step": 832 + }, + { + "epoch": 0.08, + "grad_norm": 0.27861032872082103, + "learning_rate": 0.00015942583732057416, + "loss": 1.1413, + "step": 833 + }, + { + "epoch": 0.08, + "grad_norm": 0.2982611167661862, + "learning_rate": 0.0001596172248803828, + "loss": 1.2593, + "step": 834 + }, + { + "epoch": 0.08, + "grad_norm": 0.29326214489772795, + "learning_rate": 0.0001598086124401914, + "loss": 1.18, + "step": 835 + }, + { + "epoch": 0.08, + "grad_norm": 0.27131394008530785, + "learning_rate": 0.00016, + "loss": 1.1674, + "step": 836 + }, + { + "epoch": 0.08, + "grad_norm": 0.26712034034407034, + "learning_rate": 0.00016019138755980864, + "loss": 1.0161, + "step": 837 + }, + { + "epoch": 0.08, + "grad_norm": 0.36369725154573823, + "learning_rate": 0.00016038277511961724, + "loss": 1.175, + "step": 838 + }, + { + "epoch": 0.08, + "grad_norm": 0.3085671724318983, + "learning_rate": 0.00016057416267942584, + "loss": 1.1461, + "step": 839 + }, + { + "epoch": 0.08, + "grad_norm": 0.28077141855727894, + "learning_rate": 0.00016076555023923446, + "loss": 1.0922, + "step": 840 + }, + { + "epoch": 0.08, + "grad_norm": 0.3270351461507469, + "learning_rate": 0.00016095693779904306, + "loss": 1.0463, + "step": 841 + }, + { + "epoch": 0.08, + "grad_norm": 0.23981764247780088, + "learning_rate": 0.0001611483253588517, + "loss": 0.9635, + "step": 842 + }, + { + "epoch": 0.08, + "grad_norm": 0.28201419160149344, + "learning_rate": 0.0001613397129186603, + "loss": 1.1173, + "step": 843 + }, + { + "epoch": 0.08, + "grad_norm": 0.26889491956006867, + "learning_rate": 0.0001615311004784689, + "loss": 1.1132, + "step": 844 + }, + { + "epoch": 0.08, + "grad_norm": 0.27688897066555573, + "learning_rate": 0.0001617224880382775, + "loss": 1.0963, + "step": 845 + }, + { + "epoch": 0.08, + "grad_norm": 0.24565660227717426, + "learning_rate": 0.00016191387559808614, + "loss": 1.0694, + "step": 846 + }, + { + "epoch": 0.08, + "grad_norm": 0.28311675225629357, + "learning_rate": 0.00016210526315789473, + "loss": 1.0727, + "step": 847 + }, + { + "epoch": 0.08, + "grad_norm": 0.25275425247450756, + "learning_rate": 0.00016229665071770336, + "loss": 1.1726, + "step": 848 + }, + { + "epoch": 0.08, + "grad_norm": 0.3073349091629191, + "learning_rate": 0.00016248803827751199, + "loss": 1.237, + "step": 849 + }, + { + "epoch": 0.08, + "grad_norm": 0.3698105893782691, + "learning_rate": 0.00016267942583732058, + "loss": 1.1529, + "step": 850 + }, + { + "epoch": 0.08, + "grad_norm": 0.3066504421764291, + "learning_rate": 0.00016287081339712918, + "loss": 1.1875, + "step": 851 + }, + { + "epoch": 0.08, + "grad_norm": 0.2853734077261547, + "learning_rate": 0.0001630622009569378, + "loss": 1.1232, + "step": 852 + }, + { + "epoch": 0.08, + "grad_norm": 0.27498683022213083, + "learning_rate": 0.0001632535885167464, + "loss": 1.2345, + "step": 853 + }, + { + "epoch": 0.08, + "grad_norm": 0.26436373680139863, + "learning_rate": 0.00016344497607655503, + "loss": 1.1443, + "step": 854 + }, + { + "epoch": 0.08, + "grad_norm": 0.29039546604591765, + "learning_rate": 0.00016363636363636366, + "loss": 1.1451, + "step": 855 + }, + { + "epoch": 0.08, + "grad_norm": 0.2845332734411919, + "learning_rate": 0.00016382775119617226, + "loss": 1.1658, + "step": 856 + }, + { + "epoch": 0.08, + "grad_norm": 0.3118984941168386, + "learning_rate": 0.00016401913875598088, + "loss": 1.133, + "step": 857 + }, + { + "epoch": 0.08, + "grad_norm": 0.2910324342007811, + "learning_rate": 0.00016421052631578948, + "loss": 1.138, + "step": 858 + }, + { + "epoch": 0.08, + "grad_norm": 0.3067211385198509, + "learning_rate": 0.00016440191387559808, + "loss": 1.1517, + "step": 859 + }, + { + "epoch": 0.08, + "grad_norm": 0.290740982507053, + "learning_rate": 0.0001645933014354067, + "loss": 1.0561, + "step": 860 + }, + { + "epoch": 0.08, + "grad_norm": 0.3144516777697552, + "learning_rate": 0.0001647846889952153, + "loss": 1.0661, + "step": 861 + }, + { + "epoch": 0.08, + "grad_norm": 0.2970636821654555, + "learning_rate": 0.00016497607655502393, + "loss": 1.1634, + "step": 862 + }, + { + "epoch": 0.08, + "grad_norm": 0.3146333025319219, + "learning_rate": 0.00016516746411483256, + "loss": 1.0652, + "step": 863 + }, + { + "epoch": 0.08, + "grad_norm": 0.2644767264588937, + "learning_rate": 0.00016535885167464116, + "loss": 1.1516, + "step": 864 + }, + { + "epoch": 0.08, + "grad_norm": 0.3006840203451009, + "learning_rate": 0.00016555023923444975, + "loss": 1.1175, + "step": 865 + }, + { + "epoch": 0.08, + "grad_norm": 0.2809420339644184, + "learning_rate": 0.00016574162679425838, + "loss": 1.1057, + "step": 866 + }, + { + "epoch": 0.08, + "grad_norm": 0.3769059520574524, + "learning_rate": 0.00016593301435406698, + "loss": 1.1713, + "step": 867 + }, + { + "epoch": 0.08, + "grad_norm": 0.33622542833176833, + "learning_rate": 0.0001661244019138756, + "loss": 1.2223, + "step": 868 + }, + { + "epoch": 0.08, + "grad_norm": 0.2715119578007493, + "learning_rate": 0.00016631578947368423, + "loss": 1.0926, + "step": 869 + }, + { + "epoch": 0.08, + "grad_norm": 0.2788006611781337, + "learning_rate": 0.00016650717703349283, + "loss": 1.0598, + "step": 870 + }, + { + "epoch": 0.08, + "grad_norm": 0.29918887354582546, + "learning_rate": 0.00016669856459330145, + "loss": 1.1363, + "step": 871 + }, + { + "epoch": 0.08, + "grad_norm": 0.27116956033088324, + "learning_rate": 0.00016688995215311005, + "loss": 1.1357, + "step": 872 + }, + { + "epoch": 0.08, + "grad_norm": 0.3651972053932287, + "learning_rate": 0.00016708133971291865, + "loss": 1.1862, + "step": 873 + }, + { + "epoch": 0.08, + "grad_norm": 0.2941314020229377, + "learning_rate": 0.00016727272727272728, + "loss": 1.229, + "step": 874 + }, + { + "epoch": 0.08, + "grad_norm": 0.29667386462622886, + "learning_rate": 0.0001674641148325359, + "loss": 1.105, + "step": 875 + }, + { + "epoch": 0.08, + "grad_norm": 0.2888327808151174, + "learning_rate": 0.0001676555023923445, + "loss": 1.1328, + "step": 876 + }, + { + "epoch": 0.08, + "grad_norm": 0.31564538771648376, + "learning_rate": 0.00016784688995215313, + "loss": 1.2407, + "step": 877 + }, + { + "epoch": 0.08, + "grad_norm": 0.31419100780147885, + "learning_rate": 0.00016803827751196173, + "loss": 1.0472, + "step": 878 + }, + { + "epoch": 0.08, + "grad_norm": 0.26956520480679047, + "learning_rate": 0.00016822966507177033, + "loss": 1.1524, + "step": 879 + }, + { + "epoch": 0.08, + "grad_norm": 0.2726391902939466, + "learning_rate": 0.00016842105263157895, + "loss": 1.11, + "step": 880 + }, + { + "epoch": 0.08, + "grad_norm": 0.2993169367221595, + "learning_rate": 0.00016861244019138758, + "loss": 1.2059, + "step": 881 + }, + { + "epoch": 0.08, + "grad_norm": 0.2936290798225595, + "learning_rate": 0.00016880382775119618, + "loss": 1.1026, + "step": 882 + }, + { + "epoch": 0.08, + "grad_norm": 0.25728140759420537, + "learning_rate": 0.0001689952153110048, + "loss": 1.136, + "step": 883 + }, + { + "epoch": 0.08, + "grad_norm": 0.2659884049250215, + "learning_rate": 0.0001691866028708134, + "loss": 1.0311, + "step": 884 + }, + { + "epoch": 0.08, + "grad_norm": 0.29849696827544475, + "learning_rate": 0.00016937799043062203, + "loss": 1.0995, + "step": 885 + }, + { + "epoch": 0.08, + "grad_norm": 0.28395796526200556, + "learning_rate": 0.00016956937799043062, + "loss": 1.0948, + "step": 886 + }, + { + "epoch": 0.08, + "grad_norm": 0.3445355283030851, + "learning_rate": 0.00016976076555023925, + "loss": 1.1103, + "step": 887 + }, + { + "epoch": 0.08, + "grad_norm": 0.2538735838657434, + "learning_rate": 0.00016995215311004788, + "loss": 1.0752, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 0.3026322031952384, + "learning_rate": 0.00017014354066985647, + "loss": 1.1359, + "step": 889 + }, + { + "epoch": 0.09, + "grad_norm": 0.3200296529545863, + "learning_rate": 0.00017033492822966507, + "loss": 1.1851, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 0.29333134950174405, + "learning_rate": 0.0001705263157894737, + "loss": 1.0547, + "step": 891 + }, + { + "epoch": 0.09, + "grad_norm": 0.3189167751421413, + "learning_rate": 0.0001707177033492823, + "loss": 1.0605, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 0.28038685627245685, + "learning_rate": 0.0001709090909090909, + "loss": 1.1087, + "step": 893 + }, + { + "epoch": 0.09, + "grad_norm": 0.27600976689482803, + "learning_rate": 0.00017110047846889955, + "loss": 1.1681, + "step": 894 + }, + { + "epoch": 0.09, + "grad_norm": 0.30930194872855393, + "learning_rate": 0.00017129186602870815, + "loss": 1.1325, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 0.31300532276472537, + "learning_rate": 0.00017148325358851675, + "loss": 1.1002, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 0.2991016495421357, + "learning_rate": 0.00017167464114832537, + "loss": 1.076, + "step": 897 + }, + { + "epoch": 0.09, + "grad_norm": 0.31594108017658423, + "learning_rate": 0.00017186602870813397, + "loss": 1.136, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 0.27497095491638146, + "learning_rate": 0.0001720574162679426, + "loss": 1.2323, + "step": 899 + }, + { + "epoch": 0.09, + "grad_norm": 0.310962024257193, + "learning_rate": 0.00017224880382775122, + "loss": 1.1398, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 0.29697512814616595, + "learning_rate": 0.00017244019138755982, + "loss": 1.1342, + "step": 901 + }, + { + "epoch": 0.09, + "grad_norm": 0.239946301235031, + "learning_rate": 0.00017263157894736842, + "loss": 1.2081, + "step": 902 + }, + { + "epoch": 0.09, + "grad_norm": 0.2874130680609666, + "learning_rate": 0.00017282296650717705, + "loss": 1.125, + "step": 903 + }, + { + "epoch": 0.09, + "grad_norm": 0.3210023421862061, + "learning_rate": 0.00017301435406698564, + "loss": 1.2268, + "step": 904 + }, + { + "epoch": 0.09, + "grad_norm": 0.25985019789372976, + "learning_rate": 0.00017320574162679427, + "loss": 1.0704, + "step": 905 + }, + { + "epoch": 0.09, + "grad_norm": 0.28932579761775323, + "learning_rate": 0.00017339712918660287, + "loss": 1.1939, + "step": 906 + }, + { + "epoch": 0.09, + "grad_norm": 0.2984436701321717, + "learning_rate": 0.0001735885167464115, + "loss": 1.1441, + "step": 907 + }, + { + "epoch": 0.09, + "grad_norm": 0.33279429925895665, + "learning_rate": 0.00017377990430622012, + "loss": 1.2299, + "step": 908 + }, + { + "epoch": 0.09, + "grad_norm": 0.28785481315035893, + "learning_rate": 0.00017397129186602872, + "loss": 1.118, + "step": 909 + }, + { + "epoch": 0.09, + "grad_norm": 0.31655385538112546, + "learning_rate": 0.00017416267942583732, + "loss": 1.189, + "step": 910 + }, + { + "epoch": 0.09, + "grad_norm": 0.3038855880357351, + "learning_rate": 0.00017435406698564594, + "loss": 1.0654, + "step": 911 + }, + { + "epoch": 0.09, + "grad_norm": 0.3042729440177485, + "learning_rate": 0.00017454545454545454, + "loss": 1.1553, + "step": 912 + }, + { + "epoch": 0.09, + "grad_norm": 0.266680706575244, + "learning_rate": 0.00017473684210526317, + "loss": 1.1006, + "step": 913 + }, + { + "epoch": 0.09, + "grad_norm": 0.2741425104907168, + "learning_rate": 0.0001749282296650718, + "loss": 1.2319, + "step": 914 + }, + { + "epoch": 0.09, + "grad_norm": 0.2520129822636353, + "learning_rate": 0.0001751196172248804, + "loss": 1.1394, + "step": 915 + }, + { + "epoch": 0.09, + "grad_norm": 0.28607836069753895, + "learning_rate": 0.000175311004784689, + "loss": 1.2108, + "step": 916 + }, + { + "epoch": 0.09, + "grad_norm": 0.28836296745411716, + "learning_rate": 0.00017550239234449762, + "loss": 1.0772, + "step": 917 + }, + { + "epoch": 0.09, + "grad_norm": 0.26291170956841414, + "learning_rate": 0.00017569377990430622, + "loss": 1.1532, + "step": 918 + }, + { + "epoch": 0.09, + "grad_norm": 0.27624746568071396, + "learning_rate": 0.00017588516746411484, + "loss": 1.1178, + "step": 919 + }, + { + "epoch": 0.09, + "grad_norm": 0.28200386776822395, + "learning_rate": 0.00017607655502392347, + "loss": 1.1105, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 0.25462518315632554, + "learning_rate": 0.00017626794258373207, + "loss": 1.0717, + "step": 921 + }, + { + "epoch": 0.09, + "grad_norm": 0.27932944411599797, + "learning_rate": 0.0001764593301435407, + "loss": 1.2486, + "step": 922 + }, + { + "epoch": 0.09, + "grad_norm": 0.29462379215808215, + "learning_rate": 0.0001766507177033493, + "loss": 1.226, + "step": 923 + }, + { + "epoch": 0.09, + "grad_norm": 0.2741976731865599, + "learning_rate": 0.0001768421052631579, + "loss": 1.1797, + "step": 924 + }, + { + "epoch": 0.09, + "grad_norm": 0.2532434659032646, + "learning_rate": 0.00017703349282296652, + "loss": 1.0828, + "step": 925 + }, + { + "epoch": 0.09, + "grad_norm": 0.329346060797211, + "learning_rate": 0.00017722488038277514, + "loss": 1.1125, + "step": 926 + }, + { + "epoch": 0.09, + "grad_norm": 0.2644644824352827, + "learning_rate": 0.00017741626794258374, + "loss": 1.1048, + "step": 927 + }, + { + "epoch": 0.09, + "grad_norm": 0.2617940651450908, + "learning_rate": 0.00017760765550239237, + "loss": 1.2178, + "step": 928 + }, + { + "epoch": 0.09, + "grad_norm": 0.29432756373678265, + "learning_rate": 0.00017779904306220096, + "loss": 1.1336, + "step": 929 + }, + { + "epoch": 0.09, + "grad_norm": 0.28911304731696175, + "learning_rate": 0.00017799043062200956, + "loss": 1.1578, + "step": 930 + }, + { + "epoch": 0.09, + "grad_norm": 0.3006870934673598, + "learning_rate": 0.0001781818181818182, + "loss": 1.0588, + "step": 931 + }, + { + "epoch": 0.09, + "grad_norm": 0.31210608325092193, + "learning_rate": 0.0001783732057416268, + "loss": 1.2426, + "step": 932 + }, + { + "epoch": 0.09, + "grad_norm": 0.27626145357478726, + "learning_rate": 0.0001785645933014354, + "loss": 1.1609, + "step": 933 + }, + { + "epoch": 0.09, + "grad_norm": 0.2683905399507039, + "learning_rate": 0.00017875598086124404, + "loss": 1.1457, + "step": 934 + }, + { + "epoch": 0.09, + "grad_norm": 0.2661353870666551, + "learning_rate": 0.00017894736842105264, + "loss": 1.2095, + "step": 935 + }, + { + "epoch": 0.09, + "grad_norm": 0.33062559297582395, + "learning_rate": 0.00017913875598086126, + "loss": 1.2396, + "step": 936 + }, + { + "epoch": 0.09, + "grad_norm": 0.26950737804952357, + "learning_rate": 0.00017933014354066986, + "loss": 1.1983, + "step": 937 + }, + { + "epoch": 0.09, + "grad_norm": 0.29499843784362234, + "learning_rate": 0.00017952153110047846, + "loss": 1.2442, + "step": 938 + }, + { + "epoch": 0.09, + "grad_norm": 0.31186904072609634, + "learning_rate": 0.00017971291866028709, + "loss": 1.1494, + "step": 939 + }, + { + "epoch": 0.09, + "grad_norm": 0.2514154775647367, + "learning_rate": 0.0001799043062200957, + "loss": 1.1061, + "step": 940 + }, + { + "epoch": 0.09, + "grad_norm": 0.28595401946483395, + "learning_rate": 0.0001800956937799043, + "loss": 1.0322, + "step": 941 + }, + { + "epoch": 0.09, + "grad_norm": 0.32459069445525873, + "learning_rate": 0.00018028708133971294, + "loss": 1.2007, + "step": 942 + }, + { + "epoch": 0.09, + "grad_norm": 0.31789984803696647, + "learning_rate": 0.00018047846889952154, + "loss": 1.1057, + "step": 943 + }, + { + "epoch": 0.09, + "grad_norm": 0.2893543986536651, + "learning_rate": 0.00018066985645933013, + "loss": 1.1108, + "step": 944 + }, + { + "epoch": 0.09, + "grad_norm": 0.225754938363265, + "learning_rate": 0.00018086124401913876, + "loss": 1.0842, + "step": 945 + }, + { + "epoch": 0.09, + "grad_norm": 0.3473860341063463, + "learning_rate": 0.00018105263157894739, + "loss": 1.0824, + "step": 946 + }, + { + "epoch": 0.09, + "grad_norm": 0.2922950981615233, + "learning_rate": 0.00018124401913875598, + "loss": 1.1143, + "step": 947 + }, + { + "epoch": 0.09, + "grad_norm": 0.29161352434420446, + "learning_rate": 0.0001814354066985646, + "loss": 1.2325, + "step": 948 + }, + { + "epoch": 0.09, + "grad_norm": 0.29481985803408, + "learning_rate": 0.0001816267942583732, + "loss": 1.1527, + "step": 949 + }, + { + "epoch": 0.09, + "grad_norm": 0.2874729386092549, + "learning_rate": 0.00018181818181818183, + "loss": 1.1476, + "step": 950 + }, + { + "epoch": 0.09, + "grad_norm": 0.24921692256090058, + "learning_rate": 0.00018200956937799043, + "loss": 1.1567, + "step": 951 + }, + { + "epoch": 0.09, + "grad_norm": 0.31742487463024494, + "learning_rate": 0.00018220095693779906, + "loss": 1.1013, + "step": 952 + }, + { + "epoch": 0.09, + "grad_norm": 0.27456172247325683, + "learning_rate": 0.00018239234449760766, + "loss": 1.2069, + "step": 953 + }, + { + "epoch": 0.09, + "grad_norm": 0.28859064993450634, + "learning_rate": 0.00018258373205741628, + "loss": 1.2123, + "step": 954 + }, + { + "epoch": 0.09, + "grad_norm": 0.2750397123362856, + "learning_rate": 0.00018277511961722488, + "loss": 1.1231, + "step": 955 + }, + { + "epoch": 0.09, + "grad_norm": 0.23915748062608722, + "learning_rate": 0.0001829665071770335, + "loss": 1.1612, + "step": 956 + }, + { + "epoch": 0.09, + "grad_norm": 0.2722479832588104, + "learning_rate": 0.0001831578947368421, + "loss": 1.1897, + "step": 957 + }, + { + "epoch": 0.09, + "grad_norm": 0.2863352675014705, + "learning_rate": 0.00018334928229665073, + "loss": 1.1062, + "step": 958 + }, + { + "epoch": 0.09, + "grad_norm": 0.25258807482282203, + "learning_rate": 0.00018354066985645936, + "loss": 1.1574, + "step": 959 + }, + { + "epoch": 0.09, + "grad_norm": 0.27455220282180104, + "learning_rate": 0.00018373205741626796, + "loss": 1.0886, + "step": 960 + }, + { + "epoch": 0.09, + "grad_norm": 0.2693559937731345, + "learning_rate": 0.00018392344497607656, + "loss": 1.1873, + "step": 961 + }, + { + "epoch": 0.09, + "grad_norm": 0.2487980816971801, + "learning_rate": 0.00018411483253588518, + "loss": 1.2245, + "step": 962 + }, + { + "epoch": 0.09, + "grad_norm": 0.6732577941816555, + "learning_rate": 0.00018430622009569378, + "loss": 1.1813, + "step": 963 + }, + { + "epoch": 0.09, + "grad_norm": 0.2912759304052633, + "learning_rate": 0.00018449760765550238, + "loss": 1.168, + "step": 964 + }, + { + "epoch": 0.09, + "grad_norm": 0.26989856763778836, + "learning_rate": 0.00018468899521531103, + "loss": 1.0137, + "step": 965 + }, + { + "epoch": 0.09, + "grad_norm": 0.25602835842131616, + "learning_rate": 0.00018488038277511963, + "loss": 1.1879, + "step": 966 + }, + { + "epoch": 0.09, + "grad_norm": 0.25725078226468107, + "learning_rate": 0.00018507177033492823, + "loss": 1.174, + "step": 967 + }, + { + "epoch": 0.09, + "grad_norm": 0.27889203556658276, + "learning_rate": 0.00018526315789473685, + "loss": 1.2013, + "step": 968 + }, + { + "epoch": 0.09, + "grad_norm": 0.28462011286220296, + "learning_rate": 0.00018545454545454545, + "loss": 1.0878, + "step": 969 + }, + { + "epoch": 0.09, + "grad_norm": 0.27682759364760257, + "learning_rate": 0.00018564593301435408, + "loss": 1.085, + "step": 970 + }, + { + "epoch": 0.09, + "grad_norm": 0.28136179421463786, + "learning_rate": 0.0001858373205741627, + "loss": 1.1807, + "step": 971 + }, + { + "epoch": 0.09, + "grad_norm": 0.2506017584700625, + "learning_rate": 0.0001860287081339713, + "loss": 1.1538, + "step": 972 + }, + { + "epoch": 0.09, + "grad_norm": 0.2681898339952538, + "learning_rate": 0.00018622009569377993, + "loss": 1.0615, + "step": 973 + }, + { + "epoch": 0.09, + "grad_norm": 0.2838246571307257, + "learning_rate": 0.00018641148325358853, + "loss": 1.1778, + "step": 974 + }, + { + "epoch": 0.09, + "grad_norm": 0.2758038504041395, + "learning_rate": 0.00018660287081339713, + "loss": 1.1038, + "step": 975 + }, + { + "epoch": 0.09, + "grad_norm": 0.28266068816982276, + "learning_rate": 0.00018679425837320575, + "loss": 1.1487, + "step": 976 + }, + { + "epoch": 0.09, + "grad_norm": 0.2655825547541941, + "learning_rate": 0.00018698564593301435, + "loss": 1.0846, + "step": 977 + }, + { + "epoch": 0.09, + "grad_norm": 0.2750864417199089, + "learning_rate": 0.00018717703349282298, + "loss": 1.0925, + "step": 978 + }, + { + "epoch": 0.09, + "grad_norm": 0.28328763891237363, + "learning_rate": 0.0001873684210526316, + "loss": 1.1602, + "step": 979 + }, + { + "epoch": 0.09, + "grad_norm": 0.274427495879147, + "learning_rate": 0.0001875598086124402, + "loss": 1.0184, + "step": 980 + }, + { + "epoch": 0.09, + "grad_norm": 0.29677822769592865, + "learning_rate": 0.0001877511961722488, + "loss": 1.1316, + "step": 981 + }, + { + "epoch": 0.09, + "grad_norm": 0.35675044865453487, + "learning_rate": 0.00018794258373205743, + "loss": 1.1299, + "step": 982 + }, + { + "epoch": 0.09, + "grad_norm": 0.27471990871455726, + "learning_rate": 0.00018813397129186602, + "loss": 1.0977, + "step": 983 + }, + { + "epoch": 0.09, + "grad_norm": 0.32677576558264015, + "learning_rate": 0.00018832535885167465, + "loss": 1.0416, + "step": 984 + }, + { + "epoch": 0.09, + "grad_norm": 0.3449420887466517, + "learning_rate": 0.00018851674641148328, + "loss": 1.218, + "step": 985 + }, + { + "epoch": 0.09, + "grad_norm": 0.3187055721961639, + "learning_rate": 0.00018870813397129187, + "loss": 1.1091, + "step": 986 + }, + { + "epoch": 0.09, + "grad_norm": 0.3143792697319127, + "learning_rate": 0.0001888995215311005, + "loss": 1.067, + "step": 987 + }, + { + "epoch": 0.09, + "grad_norm": 0.2742909947428014, + "learning_rate": 0.0001890909090909091, + "loss": 1.225, + "step": 988 + }, + { + "epoch": 0.09, + "grad_norm": 0.27319677319302543, + "learning_rate": 0.0001892822966507177, + "loss": 1.1487, + "step": 989 + }, + { + "epoch": 0.09, + "grad_norm": 0.2758157497549949, + "learning_rate": 0.00018947368421052632, + "loss": 1.1109, + "step": 990 + }, + { + "epoch": 0.09, + "grad_norm": 0.2574079506381213, + "learning_rate": 0.00018966507177033495, + "loss": 1.1476, + "step": 991 + }, + { + "epoch": 0.09, + "grad_norm": 0.332702187603211, + "learning_rate": 0.00018985645933014355, + "loss": 1.0896, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 0.2861721583962, + "learning_rate": 0.00019004784688995217, + "loss": 1.2234, + "step": 993 + }, + { + "epoch": 0.1, + "grad_norm": 0.2467936267051518, + "learning_rate": 0.00019023923444976077, + "loss": 1.1656, + "step": 994 + }, + { + "epoch": 0.1, + "grad_norm": 0.28781558903434595, + "learning_rate": 0.00019043062200956937, + "loss": 1.1853, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 0.2916455357271407, + "learning_rate": 0.000190622009569378, + "loss": 1.0269, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 0.3201870144576391, + "learning_rate": 0.00019081339712918662, + "loss": 1.1852, + "step": 997 + }, + { + "epoch": 0.1, + "grad_norm": 0.27545254213477577, + "learning_rate": 0.00019100478468899522, + "loss": 1.0957, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 0.2826496819385951, + "learning_rate": 0.00019119617224880385, + "loss": 1.2255, + "step": 999 + }, + { + "epoch": 0.1, + "grad_norm": 0.2967102485192698, + "learning_rate": 0.00019138755980861245, + "loss": 1.1536, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 0.29117608778714893, + "learning_rate": 0.00019157894736842104, + "loss": 1.0878, + "step": 1001 + }, + { + "epoch": 0.1, + "grad_norm": 0.28851304804169287, + "learning_rate": 0.00019177033492822967, + "loss": 1.0898, + "step": 1002 + }, + { + "epoch": 0.1, + "grad_norm": 0.27111717804566754, + "learning_rate": 0.00019196172248803827, + "loss": 1.2214, + "step": 1003 + }, + { + "epoch": 0.1, + "grad_norm": 0.29632228590140464, + "learning_rate": 0.0001921531100478469, + "loss": 1.1534, + "step": 1004 + }, + { + "epoch": 0.1, + "grad_norm": 0.30166486227944156, + "learning_rate": 0.00019234449760765552, + "loss": 1.1784, + "step": 1005 + }, + { + "epoch": 0.1, + "grad_norm": 0.261168294050402, + "learning_rate": 0.00019253588516746412, + "loss": 1.2274, + "step": 1006 + }, + { + "epoch": 0.1, + "grad_norm": 0.2696524388115216, + "learning_rate": 0.00019272727272727274, + "loss": 1.1256, + "step": 1007 + }, + { + "epoch": 0.1, + "grad_norm": 0.30883168940001077, + "learning_rate": 0.00019291866028708134, + "loss": 1.1804, + "step": 1008 + }, + { + "epoch": 0.1, + "grad_norm": 0.2901725454324794, + "learning_rate": 0.00019311004784688994, + "loss": 1.0656, + "step": 1009 + }, + { + "epoch": 0.1, + "grad_norm": 0.30050679633218647, + "learning_rate": 0.0001933014354066986, + "loss": 1.1217, + "step": 1010 + }, + { + "epoch": 0.1, + "grad_norm": 0.2763711001518656, + "learning_rate": 0.0001934928229665072, + "loss": 1.2114, + "step": 1011 + }, + { + "epoch": 0.1, + "grad_norm": 0.2676109407157463, + "learning_rate": 0.0001936842105263158, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.1, + "grad_norm": 0.2747480845011328, + "learning_rate": 0.00019387559808612442, + "loss": 1.038, + "step": 1013 + }, + { + "epoch": 0.1, + "grad_norm": 0.24960295337688276, + "learning_rate": 0.00019406698564593302, + "loss": 1.0625, + "step": 1014 + }, + { + "epoch": 0.1, + "grad_norm": 0.2721591800223072, + "learning_rate": 0.00019425837320574162, + "loss": 1.1327, + "step": 1015 + }, + { + "epoch": 0.1, + "grad_norm": 0.2877329511310855, + "learning_rate": 0.00019444976076555027, + "loss": 1.228, + "step": 1016 + }, + { + "epoch": 0.1, + "grad_norm": 0.2568028077694964, + "learning_rate": 0.00019464114832535887, + "loss": 1.0683, + "step": 1017 + }, + { + "epoch": 0.1, + "grad_norm": 0.2678405294971607, + "learning_rate": 0.00019483253588516747, + "loss": 1.1125, + "step": 1018 + }, + { + "epoch": 0.1, + "grad_norm": 0.2963652522200652, + "learning_rate": 0.0001950239234449761, + "loss": 1.0905, + "step": 1019 + }, + { + "epoch": 0.1, + "grad_norm": 0.26009393679319537, + "learning_rate": 0.0001952153110047847, + "loss": 1.1036, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 0.3049720818580699, + "learning_rate": 0.00019540669856459332, + "loss": 1.1964, + "step": 1021 + }, + { + "epoch": 0.1, + "grad_norm": 0.3050130613963167, + "learning_rate": 0.00019559808612440191, + "loss": 1.1293, + "step": 1022 + }, + { + "epoch": 0.1, + "grad_norm": 0.24297369971258104, + "learning_rate": 0.00019578947368421054, + "loss": 1.1143, + "step": 1023 + }, + { + "epoch": 0.1, + "grad_norm": 0.24077286684290172, + "learning_rate": 0.00019598086124401917, + "loss": 1.0764, + "step": 1024 + }, + { + "epoch": 0.1, + "grad_norm": 0.3113100418888948, + "learning_rate": 0.00019617224880382777, + "loss": 1.1246, + "step": 1025 + }, + { + "epoch": 0.1, + "grad_norm": 0.2784731985247703, + "learning_rate": 0.00019636363636363636, + "loss": 1.0998, + "step": 1026 + }, + { + "epoch": 0.1, + "grad_norm": 0.2542533680624268, + "learning_rate": 0.000196555023923445, + "loss": 1.114, + "step": 1027 + }, + { + "epoch": 0.1, + "grad_norm": 0.28332309977048276, + "learning_rate": 0.0001967464114832536, + "loss": 1.1719, + "step": 1028 + }, + { + "epoch": 0.1, + "grad_norm": 0.25261282572279636, + "learning_rate": 0.00019693779904306221, + "loss": 1.1069, + "step": 1029 + }, + { + "epoch": 0.1, + "grad_norm": 0.28908512950153364, + "learning_rate": 0.00019712918660287084, + "loss": 1.0939, + "step": 1030 + }, + { + "epoch": 0.1, + "grad_norm": 0.2624681443069945, + "learning_rate": 0.00019732057416267944, + "loss": 1.1426, + "step": 1031 + }, + { + "epoch": 0.1, + "grad_norm": 0.26954809036931093, + "learning_rate": 0.00019751196172248804, + "loss": 1.0777, + "step": 1032 + }, + { + "epoch": 0.1, + "grad_norm": 0.2921435580998635, + "learning_rate": 0.00019770334928229666, + "loss": 1.0915, + "step": 1033 + }, + { + "epoch": 0.1, + "grad_norm": 0.21257005701595452, + "learning_rate": 0.00019789473684210526, + "loss": 1.1055, + "step": 1034 + }, + { + "epoch": 0.1, + "grad_norm": 0.27523674720420943, + "learning_rate": 0.0001980861244019139, + "loss": 1.0515, + "step": 1035 + }, + { + "epoch": 0.1, + "grad_norm": 0.26415508019617007, + "learning_rate": 0.0001982775119617225, + "loss": 1.0515, + "step": 1036 + }, + { + "epoch": 0.1, + "grad_norm": 0.25592610307218705, + "learning_rate": 0.0001984688995215311, + "loss": 1.1751, + "step": 1037 + }, + { + "epoch": 0.1, + "grad_norm": 0.25430310175648296, + "learning_rate": 0.00019866028708133974, + "loss": 1.123, + "step": 1038 + }, + { + "epoch": 0.1, + "grad_norm": 0.2861528947212422, + "learning_rate": 0.00019885167464114834, + "loss": 1.0859, + "step": 1039 + }, + { + "epoch": 0.1, + "grad_norm": 0.2738046774076065, + "learning_rate": 0.00019904306220095693, + "loss": 1.1405, + "step": 1040 + }, + { + "epoch": 0.1, + "grad_norm": 0.2726143048105954, + "learning_rate": 0.00019923444976076556, + "loss": 1.264, + "step": 1041 + }, + { + "epoch": 0.1, + "grad_norm": 0.27872152562297303, + "learning_rate": 0.0001994258373205742, + "loss": 1.3155, + "step": 1042 + }, + { + "epoch": 0.1, + "grad_norm": 0.2852218650666301, + "learning_rate": 0.00019961722488038279, + "loss": 1.1655, + "step": 1043 + }, + { + "epoch": 0.1, + "grad_norm": 0.23588395214808744, + "learning_rate": 0.0001998086124401914, + "loss": 1.0397, + "step": 1044 + }, + { + "epoch": 0.1, + "grad_norm": 0.2666368869674148, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 1045 + }, + { + "epoch": 0.1, + "grad_norm": 0.2930022628207633, + "learning_rate": 0.00019999999874871857, + "loss": 1.0405, + "step": 1046 + }, + { + "epoch": 0.1, + "grad_norm": 0.2710774549293637, + "learning_rate": 0.00019999999499487433, + "loss": 1.1506, + "step": 1047 + }, + { + "epoch": 0.1, + "grad_norm": 0.25385995300238745, + "learning_rate": 0.00019999998873846737, + "loss": 1.267, + "step": 1048 + }, + { + "epoch": 0.1, + "grad_norm": 0.27774997441775784, + "learning_rate": 0.00019999997997949785, + "loss": 1.1939, + "step": 1049 + }, + { + "epoch": 0.1, + "grad_norm": 0.26076832391011084, + "learning_rate": 0.00019999996871796597, + "loss": 1.1117, + "step": 1050 + }, + { + "epoch": 0.1, + "grad_norm": 0.235868498103916, + "learning_rate": 0.00019999995495387202, + "loss": 1.1701, + "step": 1051 + }, + { + "epoch": 0.1, + "grad_norm": 0.24077986955464514, + "learning_rate": 0.00019999993868721638, + "loss": 1.137, + "step": 1052 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512166776788439, + "learning_rate": 0.0001999999199179994, + "loss": 1.1414, + "step": 1053 + }, + { + "epoch": 0.1, + "grad_norm": 0.2654407191870313, + "learning_rate": 0.00019999989864622159, + "loss": 1.0333, + "step": 1054 + }, + { + "epoch": 0.1, + "grad_norm": 0.2642901952276395, + "learning_rate": 0.00019999987487188348, + "loss": 1.1385, + "step": 1055 + }, + { + "epoch": 0.1, + "grad_norm": 0.23723222653673273, + "learning_rate": 0.00019999984859498562, + "loss": 1.1103, + "step": 1056 + }, + { + "epoch": 0.1, + "grad_norm": 0.2576508658140534, + "learning_rate": 0.00019999981981552872, + "loss": 1.0641, + "step": 1057 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512702002371694, + "learning_rate": 0.00019999978853351346, + "loss": 1.1742, + "step": 1058 + }, + { + "epoch": 0.1, + "grad_norm": 0.2544113883838849, + "learning_rate": 0.0001999997547489407, + "loss": 1.249, + "step": 1059 + }, + { + "epoch": 0.1, + "grad_norm": 0.29453181610522905, + "learning_rate": 0.00019999971846181117, + "loss": 1.0817, + "step": 1060 + }, + { + "epoch": 0.1, + "grad_norm": 0.3046660451518799, + "learning_rate": 0.00019999967967212587, + "loss": 1.1202, + "step": 1061 + }, + { + "epoch": 0.1, + "grad_norm": 0.2711240525076058, + "learning_rate": 0.0001999996383798857, + "loss": 1.1488, + "step": 1062 + }, + { + "epoch": 0.1, + "grad_norm": 0.26069781523913904, + "learning_rate": 0.0001999995945850918, + "loss": 1.1204, + "step": 1063 + }, + { + "epoch": 0.1, + "grad_norm": 0.2759936162347398, + "learning_rate": 0.00019999954828774514, + "loss": 1.2418, + "step": 1064 + }, + { + "epoch": 0.1, + "grad_norm": 0.32559445650786323, + "learning_rate": 0.00019999949948784696, + "loss": 1.0396, + "step": 1065 + }, + { + "epoch": 0.1, + "grad_norm": 0.29446236701732303, + "learning_rate": 0.00019999944818539843, + "loss": 0.9828, + "step": 1066 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512414839667447, + "learning_rate": 0.00019999939438040092, + "loss": 1.0964, + "step": 1067 + }, + { + "epoch": 0.1, + "grad_norm": 0.2522166834487399, + "learning_rate": 0.00019999933807285567, + "loss": 1.1991, + "step": 1068 + }, + { + "epoch": 0.1, + "grad_norm": 0.2776488352071124, + "learning_rate": 0.00019999927926276417, + "loss": 1.2357, + "step": 1069 + }, + { + "epoch": 0.1, + "grad_norm": 0.3107756422517976, + "learning_rate": 0.00019999921795012783, + "loss": 1.1875, + "step": 1070 + }, + { + "epoch": 0.1, + "grad_norm": 0.23140904026513692, + "learning_rate": 0.00019999915413494823, + "loss": 1.0612, + "step": 1071 + }, + { + "epoch": 0.1, + "grad_norm": 0.28830876945790945, + "learning_rate": 0.00019999908781722693, + "loss": 1.0622, + "step": 1072 + }, + { + "epoch": 0.1, + "grad_norm": 0.24641652710238304, + "learning_rate": 0.00019999901899696564, + "loss": 1.1553, + "step": 1073 + }, + { + "epoch": 0.1, + "grad_norm": 0.3285726477920543, + "learning_rate": 0.00019999894767416603, + "loss": 1.1287, + "step": 1074 + }, + { + "epoch": 0.1, + "grad_norm": 0.2868313621923491, + "learning_rate": 0.00019999887384882992, + "loss": 1.1679, + "step": 1075 + }, + { + "epoch": 0.1, + "grad_norm": 0.2888935086026084, + "learning_rate": 0.00019999879752095914, + "loss": 1.078, + "step": 1076 + }, + { + "epoch": 0.1, + "grad_norm": 0.25581751198117825, + "learning_rate": 0.0001999987186905556, + "loss": 1.1637, + "step": 1077 + }, + { + "epoch": 0.1, + "grad_norm": 0.28668728467855203, + "learning_rate": 0.0001999986373576213, + "loss": 1.1557, + "step": 1078 + }, + { + "epoch": 0.1, + "grad_norm": 0.2858855815822476, + "learning_rate": 0.00019999855352215824, + "loss": 1.1543, + "step": 1079 + }, + { + "epoch": 0.1, + "grad_norm": 0.24866679351848656, + "learning_rate": 0.0001999984671841685, + "loss": 1.088, + "step": 1080 + }, + { + "epoch": 0.1, + "grad_norm": 0.2802988000108613, + "learning_rate": 0.00019999837834365432, + "loss": 1.1045, + "step": 1081 + }, + { + "epoch": 0.1, + "grad_norm": 0.28290564790646, + "learning_rate": 0.00019999828700061786, + "loss": 1.1013, + "step": 1082 + }, + { + "epoch": 0.1, + "grad_norm": 0.2670174309559056, + "learning_rate": 0.0001999981931550614, + "loss": 1.0202, + "step": 1083 + }, + { + "epoch": 0.1, + "grad_norm": 0.29742334914408336, + "learning_rate": 0.00019999809680698734, + "loss": 1.1634, + "step": 1084 + }, + { + "epoch": 0.1, + "grad_norm": 0.2613603430268145, + "learning_rate": 0.00019999799795639804, + "loss": 1.1906, + "step": 1085 + }, + { + "epoch": 0.1, + "grad_norm": 0.2368844788947555, + "learning_rate": 0.000199997896603296, + "loss": 1.1789, + "step": 1086 + }, + { + "epoch": 0.1, + "grad_norm": 0.28495588713062425, + "learning_rate": 0.00019999779274768376, + "loss": 1.1759, + "step": 1087 + }, + { + "epoch": 0.1, + "grad_norm": 0.2822715528001476, + "learning_rate": 0.0001999976863895639, + "loss": 1.0508, + "step": 1088 + }, + { + "epoch": 0.1, + "grad_norm": 0.28574997489486803, + "learning_rate": 0.0001999975775289391, + "loss": 1.1224, + "step": 1089 + }, + { + "epoch": 0.1, + "grad_norm": 0.261898017632014, + "learning_rate": 0.00019999746616581208, + "loss": 1.1035, + "step": 1090 + }, + { + "epoch": 0.1, + "grad_norm": 0.28859851706983464, + "learning_rate": 0.00019999735230018562, + "loss": 1.1726, + "step": 1091 + }, + { + "epoch": 0.1, + "grad_norm": 0.26017489671317706, + "learning_rate": 0.00019999723593206256, + "loss": 1.0777, + "step": 1092 + }, + { + "epoch": 0.1, + "grad_norm": 0.2808346316808804, + "learning_rate": 0.00019999711706144584, + "loss": 1.1169, + "step": 1093 + }, + { + "epoch": 0.1, + "grad_norm": 0.26961618081539596, + "learning_rate": 0.0001999969956883384, + "loss": 1.1525, + "step": 1094 + }, + { + "epoch": 0.1, + "grad_norm": 0.23998423085029977, + "learning_rate": 0.0001999968718127433, + "loss": 1.1088, + "step": 1095 + }, + { + "epoch": 0.1, + "grad_norm": 0.27133623294565795, + "learning_rate": 0.00019999674543466368, + "loss": 1.0187, + "step": 1096 + }, + { + "epoch": 0.1, + "grad_norm": 0.28888071684005906, + "learning_rate": 0.00019999661655410261, + "loss": 1.0476, + "step": 1097 + }, + { + "epoch": 0.11, + "grad_norm": 0.26874358976360063, + "learning_rate": 0.0001999964851710634, + "loss": 1.1429, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 0.2803133560421145, + "learning_rate": 0.0001999963512855493, + "loss": 1.2504, + "step": 1099 + }, + { + "epoch": 0.11, + "grad_norm": 0.2913063287806824, + "learning_rate": 0.00019999621489756364, + "loss": 1.1604, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 0.27440490495841235, + "learning_rate": 0.00019999607600710984, + "loss": 1.1517, + "step": 1101 + }, + { + "epoch": 0.11, + "grad_norm": 0.277572309820751, + "learning_rate": 0.00019999593461419144, + "loss": 1.0957, + "step": 1102 + }, + { + "epoch": 0.11, + "grad_norm": 0.3075924298382781, + "learning_rate": 0.0001999957907188119, + "loss": 1.142, + "step": 1103 + }, + { + "epoch": 0.11, + "grad_norm": 0.23985413219751897, + "learning_rate": 0.00019999564432097487, + "loss": 1.1932, + "step": 1104 + }, + { + "epoch": 0.11, + "grad_norm": 0.2408338302884486, + "learning_rate": 0.00019999549542068395, + "loss": 1.0735, + "step": 1105 + }, + { + "epoch": 0.11, + "grad_norm": 0.26874856387294116, + "learning_rate": 0.00019999534401794297, + "loss": 1.1553, + "step": 1106 + }, + { + "epoch": 0.11, + "grad_norm": 0.3014584853984502, + "learning_rate": 0.00019999519011275566, + "loss": 1.1655, + "step": 1107 + }, + { + "epoch": 0.11, + "grad_norm": 0.2843833242046219, + "learning_rate": 0.00019999503370512583, + "loss": 1.1877, + "step": 1108 + }, + { + "epoch": 0.11, + "grad_norm": 0.2512315616335756, + "learning_rate": 0.00019999487479505746, + "loss": 1.2, + "step": 1109 + }, + { + "epoch": 0.11, + "grad_norm": 0.3854687733857706, + "learning_rate": 0.00019999471338255452, + "loss": 1.1755, + "step": 1110 + }, + { + "epoch": 0.11, + "grad_norm": 0.2957638364283729, + "learning_rate": 0.00019999454946762103, + "loss": 1.1496, + "step": 1111 + }, + { + "epoch": 0.11, + "grad_norm": 0.2866505879252708, + "learning_rate": 0.00019999438305026108, + "loss": 0.9671, + "step": 1112 + }, + { + "epoch": 0.11, + "grad_norm": 0.31100005319009444, + "learning_rate": 0.00019999421413047886, + "loss": 1.1924, + "step": 1113 + }, + { + "epoch": 0.11, + "grad_norm": 0.2736868129625665, + "learning_rate": 0.00019999404270827856, + "loss": 1.0565, + "step": 1114 + }, + { + "epoch": 0.11, + "grad_norm": 0.3082559508155182, + "learning_rate": 0.00019999386878366454, + "loss": 1.1636, + "step": 1115 + }, + { + "epoch": 0.11, + "grad_norm": 0.2709734888315765, + "learning_rate": 0.0001999936923566411, + "loss": 1.1289, + "step": 1116 + }, + { + "epoch": 0.11, + "grad_norm": 0.32185710854614685, + "learning_rate": 0.00019999351342721262, + "loss": 1.1404, + "step": 1117 + }, + { + "epoch": 0.11, + "grad_norm": 0.31162451372291133, + "learning_rate": 0.0001999933319953837, + "loss": 1.112, + "step": 1118 + }, + { + "epoch": 0.11, + "grad_norm": 0.2752825720487004, + "learning_rate": 0.00019999314806115872, + "loss": 1.143, + "step": 1119 + }, + { + "epoch": 0.11, + "grad_norm": 0.2917340741765025, + "learning_rate": 0.0001999929616245424, + "loss": 1.0736, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 0.302518080441679, + "learning_rate": 0.0001999927726855394, + "loss": 1.0372, + "step": 1121 + }, + { + "epoch": 0.11, + "grad_norm": 0.25312327730893897, + "learning_rate": 0.00019999258124415442, + "loss": 1.1355, + "step": 1122 + }, + { + "epoch": 0.11, + "grad_norm": 0.2656439197184839, + "learning_rate": 0.00019999238730039222, + "loss": 1.0496, + "step": 1123 + }, + { + "epoch": 0.11, + "grad_norm": 0.24862847164472834, + "learning_rate": 0.00019999219085425768, + "loss": 1.0786, + "step": 1124 + }, + { + "epoch": 0.11, + "grad_norm": 0.28410932219305585, + "learning_rate": 0.00019999199190575575, + "loss": 1.0904, + "step": 1125 + }, + { + "epoch": 0.11, + "grad_norm": 0.2720824714159536, + "learning_rate": 0.00019999179045489135, + "loss": 1.0153, + "step": 1126 + }, + { + "epoch": 0.11, + "grad_norm": 0.2759096778009793, + "learning_rate": 0.00019999158650166958, + "loss": 1.1001, + "step": 1127 + }, + { + "epoch": 0.11, + "grad_norm": 0.6106436469666682, + "learning_rate": 0.0001999913800460955, + "loss": 1.1342, + "step": 1128 + }, + { + "epoch": 0.11, + "grad_norm": 0.23702252854532238, + "learning_rate": 0.00019999117108817428, + "loss": 1.0916, + "step": 1129 + }, + { + "epoch": 0.11, + "grad_norm": 0.25849383000967896, + "learning_rate": 0.0001999909596279112, + "loss": 1.1749, + "step": 1130 + }, + { + "epoch": 0.11, + "grad_norm": 0.28015440839970107, + "learning_rate": 0.0001999907456653115, + "loss": 1.1269, + "step": 1131 + }, + { + "epoch": 0.11, + "grad_norm": 0.26460543807236786, + "learning_rate": 0.00019999052920038053, + "loss": 1.1749, + "step": 1132 + }, + { + "epoch": 0.11, + "grad_norm": 0.27199116543714963, + "learning_rate": 0.0001999903102331237, + "loss": 1.0885, + "step": 1133 + }, + { + "epoch": 0.11, + "grad_norm": 0.22326728149419828, + "learning_rate": 0.00019999008876354658, + "loss": 1.106, + "step": 1134 + }, + { + "epoch": 0.11, + "grad_norm": 0.2957916257263048, + "learning_rate": 0.0001999898647916546, + "loss": 1.0432, + "step": 1135 + }, + { + "epoch": 0.11, + "grad_norm": 0.2926664019325837, + "learning_rate": 0.00019998963831745344, + "loss": 1.0905, + "step": 1136 + }, + { + "epoch": 0.11, + "grad_norm": 0.2560158671609372, + "learning_rate": 0.00019998940934094872, + "loss": 1.0585, + "step": 1137 + }, + { + "epoch": 0.11, + "grad_norm": 0.26739898061958195, + "learning_rate": 0.00019998917786214618, + "loss": 1.0375, + "step": 1138 + }, + { + "epoch": 0.11, + "grad_norm": 0.28363806649148315, + "learning_rate": 0.00019998894388105164, + "loss": 1.1372, + "step": 1139 + }, + { + "epoch": 0.11, + "grad_norm": 0.24811695289905492, + "learning_rate": 0.00019998870739767094, + "loss": 0.963, + "step": 1140 + }, + { + "epoch": 0.11, + "grad_norm": 0.28924944955268556, + "learning_rate": 0.00019998846841201, + "loss": 1.084, + "step": 1141 + }, + { + "epoch": 0.11, + "grad_norm": 0.2636078259266071, + "learning_rate": 0.00019998822692407478, + "loss": 1.0698, + "step": 1142 + }, + { + "epoch": 0.11, + "grad_norm": 0.2960715052303091, + "learning_rate": 0.0001999879829338714, + "loss": 1.2418, + "step": 1143 + }, + { + "epoch": 0.11, + "grad_norm": 0.23055523423338184, + "learning_rate": 0.00019998773644140584, + "loss": 1.169, + "step": 1144 + }, + { + "epoch": 0.11, + "grad_norm": 0.3043989132237114, + "learning_rate": 0.00019998748744668436, + "loss": 1.1707, + "step": 1145 + }, + { + "epoch": 0.11, + "grad_norm": 0.2652220574427321, + "learning_rate": 0.00019998723594971316, + "loss": 1.0908, + "step": 1146 + }, + { + "epoch": 0.11, + "grad_norm": 0.3564678490888735, + "learning_rate": 0.00019998698195049857, + "loss": 1.2161, + "step": 1147 + }, + { + "epoch": 0.11, + "grad_norm": 0.31147303496629464, + "learning_rate": 0.0001999867254490469, + "loss": 1.0795, + "step": 1148 + }, + { + "epoch": 0.11, + "grad_norm": 0.33889241778400275, + "learning_rate": 0.00019998646644536457, + "loss": 1.2739, + "step": 1149 + }, + { + "epoch": 0.11, + "grad_norm": 0.27545717862181845, + "learning_rate": 0.00019998620493945807, + "loss": 1.074, + "step": 1150 + }, + { + "epoch": 0.11, + "grad_norm": 0.2576593733645889, + "learning_rate": 0.00019998594093133395, + "loss": 1.1171, + "step": 1151 + }, + { + "epoch": 0.11, + "grad_norm": 0.25688278200109543, + "learning_rate": 0.00019998567442099888, + "loss": 1.081, + "step": 1152 + }, + { + "epoch": 0.11, + "grad_norm": 0.3149168655482506, + "learning_rate": 0.0001999854054084594, + "loss": 1.1463, + "step": 1153 + }, + { + "epoch": 0.11, + "grad_norm": 0.28337563224538714, + "learning_rate": 0.00019998513389372233, + "loss": 1.1503, + "step": 1154 + }, + { + "epoch": 0.11, + "grad_norm": 0.28770905143410885, + "learning_rate": 0.00019998485987679447, + "loss": 1.0847, + "step": 1155 + }, + { + "epoch": 0.11, + "grad_norm": 0.2606963858756736, + "learning_rate": 0.00019998458335768264, + "loss": 1.2108, + "step": 1156 + }, + { + "epoch": 0.11, + "grad_norm": 0.2764798228490211, + "learning_rate": 0.00019998430433639376, + "loss": 1.1206, + "step": 1157 + }, + { + "epoch": 0.11, + "grad_norm": 0.3028071620221027, + "learning_rate": 0.00019998402281293484, + "loss": 1.1628, + "step": 1158 + }, + { + "epoch": 0.11, + "grad_norm": 0.23132033284887418, + "learning_rate": 0.00019998373878731291, + "loss": 1.0603, + "step": 1159 + }, + { + "epoch": 0.11, + "grad_norm": 0.3197463940127305, + "learning_rate": 0.0001999834522595351, + "loss": 1.1337, + "step": 1160 + }, + { + "epoch": 0.11, + "grad_norm": 0.258332321698546, + "learning_rate": 0.00019998316322960853, + "loss": 1.1347, + "step": 1161 + }, + { + "epoch": 0.11, + "grad_norm": 0.37002001593224093, + "learning_rate": 0.00019998287169754045, + "loss": 1.0973, + "step": 1162 + }, + { + "epoch": 0.11, + "grad_norm": 0.35455352761567094, + "learning_rate": 0.00019998257766333822, + "loss": 1.0645, + "step": 1163 + }, + { + "epoch": 0.11, + "grad_norm": 0.25846010518779355, + "learning_rate": 0.00019998228112700912, + "loss": 1.099, + "step": 1164 + }, + { + "epoch": 0.11, + "grad_norm": 0.45574094165617823, + "learning_rate": 0.00019998198208856058, + "loss": 1.2218, + "step": 1165 + }, + { + "epoch": 0.11, + "grad_norm": 0.2806569349689396, + "learning_rate": 0.0001999816805480001, + "loss": 1.163, + "step": 1166 + }, + { + "epoch": 0.11, + "grad_norm": 0.3230556556910955, + "learning_rate": 0.00019998137650533527, + "loss": 1.0275, + "step": 1167 + }, + { + "epoch": 0.11, + "grad_norm": 0.295834882980768, + "learning_rate": 0.0001999810699605736, + "loss": 1.0928, + "step": 1168 + }, + { + "epoch": 0.11, + "grad_norm": 0.2838870309959414, + "learning_rate": 0.0001999807609137229, + "loss": 1.1008, + "step": 1169 + }, + { + "epoch": 0.11, + "grad_norm": 0.3164419453755688, + "learning_rate": 0.00019998044936479076, + "loss": 1.1307, + "step": 1170 + }, + { + "epoch": 0.11, + "grad_norm": 0.22581223994903243, + "learning_rate": 0.00019998013531378504, + "loss": 1.1228, + "step": 1171 + }, + { + "epoch": 0.11, + "grad_norm": 0.2611545463660394, + "learning_rate": 0.00019997981876071364, + "loss": 1.1299, + "step": 1172 + }, + { + "epoch": 0.11, + "grad_norm": 0.2531091040846973, + "learning_rate": 0.00019997949970558437, + "loss": 1.2127, + "step": 1173 + }, + { + "epoch": 0.11, + "grad_norm": 0.3026109003824534, + "learning_rate": 0.00019997917814840537, + "loss": 1.1661, + "step": 1174 + }, + { + "epoch": 0.11, + "grad_norm": 0.3246636144590807, + "learning_rate": 0.00019997885408918454, + "loss": 1.0933, + "step": 1175 + }, + { + "epoch": 0.11, + "grad_norm": 0.2574562133107501, + "learning_rate": 0.0001999785275279301, + "loss": 1.1417, + "step": 1176 + }, + { + "epoch": 0.11, + "grad_norm": 0.2774322857021015, + "learning_rate": 0.00019997819846465014, + "loss": 1.2012, + "step": 1177 + }, + { + "epoch": 0.11, + "grad_norm": 0.2616318983640859, + "learning_rate": 0.00019997786689935292, + "loss": 1.064, + "step": 1178 + }, + { + "epoch": 0.11, + "grad_norm": 0.28254029261985597, + "learning_rate": 0.00019997753283204677, + "loss": 1.0777, + "step": 1179 + }, + { + "epoch": 0.11, + "grad_norm": 0.27452271119130867, + "learning_rate": 0.00019997719626274, + "loss": 1.2698, + "step": 1180 + }, + { + "epoch": 0.11, + "grad_norm": 0.28281251810785174, + "learning_rate": 0.0001999768571914411, + "loss": 1.0866, + "step": 1181 + }, + { + "epoch": 0.11, + "grad_norm": 0.28877654959328175, + "learning_rate": 0.00019997651561815848, + "loss": 1.0607, + "step": 1182 + }, + { + "epoch": 0.11, + "grad_norm": 0.31037965805002504, + "learning_rate": 0.00019997617154290077, + "loss": 1.1267, + "step": 1183 + }, + { + "epoch": 0.11, + "grad_norm": 0.28875914132128616, + "learning_rate": 0.0001999758249656765, + "loss": 1.159, + "step": 1184 + }, + { + "epoch": 0.11, + "grad_norm": 0.26263865798211755, + "learning_rate": 0.00019997547588649438, + "loss": 1.1947, + "step": 1185 + }, + { + "epoch": 0.11, + "grad_norm": 0.25456732695723555, + "learning_rate": 0.00019997512430536314, + "loss": 1.1032, + "step": 1186 + }, + { + "epoch": 0.11, + "grad_norm": 0.32332294350656676, + "learning_rate": 0.00019997477022229158, + "loss": 1.1283, + "step": 1187 + }, + { + "epoch": 0.11, + "grad_norm": 0.3066913219600098, + "learning_rate": 0.00019997441363728857, + "loss": 1.2178, + "step": 1188 + }, + { + "epoch": 0.11, + "grad_norm": 0.2982903122596879, + "learning_rate": 0.00019997405455036304, + "loss": 1.1613, + "step": 1189 + }, + { + "epoch": 0.11, + "grad_norm": 0.3072555573162715, + "learning_rate": 0.00019997369296152396, + "loss": 1.1927, + "step": 1190 + }, + { + "epoch": 0.11, + "grad_norm": 0.27576655968710867, + "learning_rate": 0.00019997332887078034, + "loss": 1.148, + "step": 1191 + }, + { + "epoch": 0.11, + "grad_norm": 0.2651145511693654, + "learning_rate": 0.0001999729622781414, + "loss": 1.061, + "step": 1192 + }, + { + "epoch": 0.11, + "grad_norm": 0.26492079668590324, + "learning_rate": 0.00019997259318361622, + "loss": 1.0942, + "step": 1193 + }, + { + "epoch": 0.11, + "grad_norm": 0.268256812103858, + "learning_rate": 0.00019997222158721405, + "loss": 1.0478, + "step": 1194 + }, + { + "epoch": 0.11, + "grad_norm": 0.29544097416774406, + "learning_rate": 0.00019997184748894422, + "loss": 1.084, + "step": 1195 + }, + { + "epoch": 0.11, + "grad_norm": 0.24315111594316274, + "learning_rate": 0.00019997147088881607, + "loss": 1.1187, + "step": 1196 + }, + { + "epoch": 0.11, + "grad_norm": 0.2887864253869539, + "learning_rate": 0.00019997109178683905, + "loss": 1.1425, + "step": 1197 + }, + { + "epoch": 0.11, + "grad_norm": 0.243613837120699, + "learning_rate": 0.0001999707101830226, + "loss": 1.2192, + "step": 1198 + }, + { + "epoch": 0.11, + "grad_norm": 0.2670339437152679, + "learning_rate": 0.00019997032607737633, + "loss": 0.9346, + "step": 1199 + }, + { + "epoch": 0.11, + "grad_norm": 0.286415306705152, + "learning_rate": 0.0001999699394699098, + "loss": 1.2044, + "step": 1200 + }, + { + "epoch": 0.11, + "grad_norm": 0.2649888516882499, + "learning_rate": 0.0001999695503606327, + "loss": 1.1028, + "step": 1201 + }, + { + "epoch": 0.11, + "grad_norm": 0.2784005327190465, + "learning_rate": 0.00019996915874955477, + "loss": 1.1883, + "step": 1202 + }, + { + "epoch": 0.12, + "grad_norm": 0.2827618352465213, + "learning_rate": 0.00019996876463668586, + "loss": 1.1373, + "step": 1203 + }, + { + "epoch": 0.12, + "grad_norm": 0.27252281665016315, + "learning_rate": 0.00019996836802203575, + "loss": 1.1434, + "step": 1204 + }, + { + "epoch": 0.12, + "grad_norm": 0.2829042974144935, + "learning_rate": 0.00019996796890561438, + "loss": 1.1242, + "step": 1205 + }, + { + "epoch": 0.12, + "grad_norm": 0.25919288560265524, + "learning_rate": 0.0001999675672874318, + "loss": 1.0836, + "step": 1206 + }, + { + "epoch": 0.12, + "grad_norm": 0.2462264710662166, + "learning_rate": 0.00019996716316749802, + "loss": 1.0824, + "step": 1207 + }, + { + "epoch": 0.12, + "grad_norm": 0.24248848464047051, + "learning_rate": 0.00019996675654582313, + "loss": 1.1398, + "step": 1208 + }, + { + "epoch": 0.12, + "grad_norm": 0.2542004323802939, + "learning_rate": 0.00019996634742241732, + "loss": 1.0721, + "step": 1209 + }, + { + "epoch": 0.12, + "grad_norm": 0.2665303881003603, + "learning_rate": 0.0001999659357972909, + "loss": 1.1183, + "step": 1210 + }, + { + "epoch": 0.12, + "grad_norm": 0.2776270813403137, + "learning_rate": 0.00019996552167045407, + "loss": 1.2601, + "step": 1211 + }, + { + "epoch": 0.12, + "grad_norm": 0.3169789236787061, + "learning_rate": 0.00019996510504191722, + "loss": 1.2331, + "step": 1212 + }, + { + "epoch": 0.12, + "grad_norm": 0.247880062307769, + "learning_rate": 0.00019996468591169082, + "loss": 1.1088, + "step": 1213 + }, + { + "epoch": 0.12, + "grad_norm": 0.2743889662351891, + "learning_rate": 0.00019996426427978532, + "loss": 1.0541, + "step": 1214 + }, + { + "epoch": 0.12, + "grad_norm": 0.2911803551557875, + "learning_rate": 0.00019996384014621128, + "loss": 1.0826, + "step": 1215 + }, + { + "epoch": 0.12, + "grad_norm": 0.26095098186965116, + "learning_rate": 0.0001999634135109793, + "loss": 1.3256, + "step": 1216 + }, + { + "epoch": 0.12, + "grad_norm": 0.2856385392323691, + "learning_rate": 0.0001999629843741001, + "loss": 1.1093, + "step": 1217 + }, + { + "epoch": 0.12, + "grad_norm": 0.2530173370522601, + "learning_rate": 0.00019996255273558436, + "loss": 1.0579, + "step": 1218 + }, + { + "epoch": 0.12, + "grad_norm": 0.2549728915097247, + "learning_rate": 0.00019996211859544296, + "loss": 0.9691, + "step": 1219 + }, + { + "epoch": 0.12, + "grad_norm": 0.2799442188519433, + "learning_rate": 0.00019996168195368668, + "loss": 1.0273, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 0.30580035630418173, + "learning_rate": 0.0001999612428103265, + "loss": 1.2193, + "step": 1221 + }, + { + "epoch": 0.12, + "grad_norm": 0.2773508640042361, + "learning_rate": 0.00019996080116537339, + "loss": 1.059, + "step": 1222 + }, + { + "epoch": 0.12, + "grad_norm": 0.2917592835447524, + "learning_rate": 0.0001999603570188384, + "loss": 1.1454, + "step": 1223 + }, + { + "epoch": 0.12, + "grad_norm": 0.27511773778172, + "learning_rate": 0.00019995991037073267, + "loss": 1.0708, + "step": 1224 + }, + { + "epoch": 0.12, + "grad_norm": 0.2299812326773386, + "learning_rate": 0.00019995946122106735, + "loss": 0.9796, + "step": 1225 + }, + { + "epoch": 0.12, + "grad_norm": 0.28024776222182823, + "learning_rate": 0.00019995900956985369, + "loss": 1.1255, + "step": 1226 + }, + { + "epoch": 0.12, + "grad_norm": 0.26814475354624795, + "learning_rate": 0.000199958555417103, + "loss": 1.0957, + "step": 1227 + }, + { + "epoch": 0.12, + "grad_norm": 0.2872677212088015, + "learning_rate": 0.00019995809876282664, + "loss": 1.035, + "step": 1228 + }, + { + "epoch": 0.12, + "grad_norm": 0.2521840908110662, + "learning_rate": 0.00019995763960703605, + "loss": 1.0637, + "step": 1229 + }, + { + "epoch": 0.12, + "grad_norm": 0.40673410489243833, + "learning_rate": 0.00019995717794974268, + "loss": 1.0153, + "step": 1230 + }, + { + "epoch": 0.12, + "grad_norm": 0.31023442713481986, + "learning_rate": 0.0001999567137909581, + "loss": 1.1233, + "step": 1231 + }, + { + "epoch": 0.12, + "grad_norm": 0.2712959430519531, + "learning_rate": 0.000199956247130694, + "loss": 1.1379, + "step": 1232 + }, + { + "epoch": 0.12, + "grad_norm": 0.28625127223535557, + "learning_rate": 0.0001999557779689619, + "loss": 1.2184, + "step": 1233 + }, + { + "epoch": 0.12, + "grad_norm": 0.30076306094072325, + "learning_rate": 0.0001999553063057737, + "loss": 1.1837, + "step": 1234 + }, + { + "epoch": 0.12, + "grad_norm": 0.2653252660175731, + "learning_rate": 0.00019995483214114114, + "loss": 1.0155, + "step": 1235 + }, + { + "epoch": 0.12, + "grad_norm": 0.2752439328777632, + "learning_rate": 0.0001999543554750761, + "loss": 1.1711, + "step": 1236 + }, + { + "epoch": 0.12, + "grad_norm": 0.27211117879122465, + "learning_rate": 0.00019995387630759046, + "loss": 1.1123, + "step": 1237 + }, + { + "epoch": 0.12, + "grad_norm": 0.26487361645343366, + "learning_rate": 0.00019995339463869626, + "loss": 1.074, + "step": 1238 + }, + { + "epoch": 0.12, + "grad_norm": 0.3021321790418319, + "learning_rate": 0.00019995291046840554, + "loss": 1.119, + "step": 1239 + }, + { + "epoch": 0.12, + "grad_norm": 0.2361755130550678, + "learning_rate": 0.00019995242379673041, + "loss": 1.1614, + "step": 1240 + }, + { + "epoch": 0.12, + "grad_norm": 0.254336538394881, + "learning_rate": 0.00019995193462368308, + "loss": 1.0516, + "step": 1241 + }, + { + "epoch": 0.12, + "grad_norm": 0.26752053595877906, + "learning_rate": 0.00019995144294927575, + "loss": 1.0767, + "step": 1242 + }, + { + "epoch": 0.12, + "grad_norm": 0.2597082485713151, + "learning_rate": 0.00019995094877352075, + "loss": 1.0225, + "step": 1243 + }, + { + "epoch": 0.12, + "grad_norm": 0.2455539656845191, + "learning_rate": 0.00019995045209643042, + "loss": 1.1359, + "step": 1244 + }, + { + "epoch": 0.12, + "grad_norm": 0.27400725474036985, + "learning_rate": 0.00019994995291801725, + "loss": 1.1361, + "step": 1245 + }, + { + "epoch": 0.12, + "grad_norm": 0.2924655478762353, + "learning_rate": 0.00019994945123829366, + "loss": 1.1821, + "step": 1246 + }, + { + "epoch": 0.12, + "grad_norm": 0.29710840841514063, + "learning_rate": 0.00019994894705727224, + "loss": 1.2383, + "step": 1247 + }, + { + "epoch": 0.12, + "grad_norm": 0.2813768369686393, + "learning_rate": 0.0001999484403749656, + "loss": 1.0407, + "step": 1248 + }, + { + "epoch": 0.12, + "grad_norm": 0.28144059564025686, + "learning_rate": 0.00019994793119138644, + "loss": 1.2073, + "step": 1249 + }, + { + "epoch": 0.12, + "grad_norm": 0.24328115129403632, + "learning_rate": 0.00019994741950654746, + "loss": 1.1233, + "step": 1250 + }, + { + "epoch": 0.12, + "grad_norm": 0.2779013414605166, + "learning_rate": 0.00019994690532046155, + "loss": 1.1908, + "step": 1251 + }, + { + "epoch": 0.12, + "grad_norm": 0.2939081176965555, + "learning_rate": 0.00019994638863314146, + "loss": 1.0965, + "step": 1252 + }, + { + "epoch": 0.12, + "grad_norm": 0.267694708686987, + "learning_rate": 0.0001999458694446002, + "loss": 1.0733, + "step": 1253 + }, + { + "epoch": 0.12, + "grad_norm": 0.31511116622951807, + "learning_rate": 0.00019994534775485075, + "loss": 1.0266, + "step": 1254 + }, + { + "epoch": 0.12, + "grad_norm": 0.2718983181420745, + "learning_rate": 0.00019994482356390617, + "loss": 1.1895, + "step": 1255 + }, + { + "epoch": 0.12, + "grad_norm": 0.2558317058922546, + "learning_rate": 0.00019994429687177957, + "loss": 1.1239, + "step": 1256 + }, + { + "epoch": 0.12, + "grad_norm": 0.2744763799941594, + "learning_rate": 0.00019994376767848407, + "loss": 1.2122, + "step": 1257 + }, + { + "epoch": 0.12, + "grad_norm": 0.2643964783968129, + "learning_rate": 0.00019994323598403302, + "loss": 1.0028, + "step": 1258 + }, + { + "epoch": 0.12, + "grad_norm": 0.25268007251056396, + "learning_rate": 0.0001999427017884397, + "loss": 1.1376, + "step": 1259 + }, + { + "epoch": 0.12, + "grad_norm": 0.230312980862343, + "learning_rate": 0.0001999421650917174, + "loss": 1.0732, + "step": 1260 + }, + { + "epoch": 0.12, + "grad_norm": 0.2740089140496981, + "learning_rate": 0.00019994162589387964, + "loss": 0.9982, + "step": 1261 + }, + { + "epoch": 0.12, + "grad_norm": 0.2726058611551938, + "learning_rate": 0.0001999410841949399, + "loss": 1.039, + "step": 1262 + }, + { + "epoch": 0.12, + "grad_norm": 0.3233351751878856, + "learning_rate": 0.00019994053999491167, + "loss": 1.2084, + "step": 1263 + }, + { + "epoch": 0.12, + "grad_norm": 0.24718142863026807, + "learning_rate": 0.00019993999329380864, + "loss": 0.9947, + "step": 1264 + }, + { + "epoch": 0.12, + "grad_norm": 0.2877635724046095, + "learning_rate": 0.00019993944409164448, + "loss": 1.1189, + "step": 1265 + }, + { + "epoch": 0.12, + "grad_norm": 0.3002448997033507, + "learning_rate": 0.00019993889238843288, + "loss": 1.0936, + "step": 1266 + }, + { + "epoch": 0.12, + "grad_norm": 0.24948366812390463, + "learning_rate": 0.00019993833818418772, + "loss": 1.1574, + "step": 1267 + }, + { + "epoch": 0.12, + "grad_norm": 0.24037766838141317, + "learning_rate": 0.00019993778147892285, + "loss": 1.1475, + "step": 1268 + }, + { + "epoch": 0.12, + "grad_norm": 0.24578207537112048, + "learning_rate": 0.00019993722227265218, + "loss": 1.1365, + "step": 1269 + }, + { + "epoch": 0.12, + "grad_norm": 0.24088318104194462, + "learning_rate": 0.00019993666056538972, + "loss": 1.0947, + "step": 1270 + }, + { + "epoch": 0.12, + "grad_norm": 0.2921571119742658, + "learning_rate": 0.0001999360963571495, + "loss": 1.0772, + "step": 1271 + }, + { + "epoch": 0.12, + "grad_norm": 0.30049773628170273, + "learning_rate": 0.00019993552964794566, + "loss": 1.2072, + "step": 1272 + }, + { + "epoch": 0.12, + "grad_norm": 0.3160778257013834, + "learning_rate": 0.0001999349604377924, + "loss": 1.0676, + "step": 1273 + }, + { + "epoch": 0.12, + "grad_norm": 0.23884600224412095, + "learning_rate": 0.00019993438872670396, + "loss": 1.0855, + "step": 1274 + }, + { + "epoch": 0.12, + "grad_norm": 0.2615500721708398, + "learning_rate": 0.0001999338145146946, + "loss": 1.1958, + "step": 1275 + }, + { + "epoch": 0.12, + "grad_norm": 0.2591847182045251, + "learning_rate": 0.00019993323780177874, + "loss": 1.0991, + "step": 1276 + }, + { + "epoch": 0.12, + "grad_norm": 0.2800525884700228, + "learning_rate": 0.00019993265858797083, + "loss": 1.0018, + "step": 1277 + }, + { + "epoch": 0.12, + "grad_norm": 0.25703108671920066, + "learning_rate": 0.0001999320768732853, + "loss": 1.0842, + "step": 1278 + }, + { + "epoch": 0.12, + "grad_norm": 0.2814109826464174, + "learning_rate": 0.00019993149265773674, + "loss": 1.1056, + "step": 1279 + }, + { + "epoch": 0.12, + "grad_norm": 0.26560101203311826, + "learning_rate": 0.0001999309059413398, + "loss": 1.1028, + "step": 1280 + }, + { + "epoch": 0.12, + "grad_norm": 0.2592301570333206, + "learning_rate": 0.00019993031672410912, + "loss": 1.2395, + "step": 1281 + }, + { + "epoch": 0.12, + "grad_norm": 0.2903887529235589, + "learning_rate": 0.00019992972500605945, + "loss": 1.2269, + "step": 1282 + }, + { + "epoch": 0.12, + "grad_norm": 0.30985749070799845, + "learning_rate": 0.00019992913078720559, + "loss": 1.0394, + "step": 1283 + }, + { + "epoch": 0.12, + "grad_norm": 0.2427582461596586, + "learning_rate": 0.00019992853406756246, + "loss": 1.0323, + "step": 1284 + }, + { + "epoch": 0.12, + "grad_norm": 0.2674764639506977, + "learning_rate": 0.00019992793484714495, + "loss": 1.0569, + "step": 1285 + }, + { + "epoch": 0.12, + "grad_norm": 0.27159902681019893, + "learning_rate": 0.00019992733312596808, + "loss": 1.0051, + "step": 1286 + }, + { + "epoch": 0.12, + "grad_norm": 0.27222923746834743, + "learning_rate": 0.00019992672890404689, + "loss": 1.1311, + "step": 1287 + }, + { + "epoch": 0.12, + "grad_norm": 0.2557430982813261, + "learning_rate": 0.0001999261221813965, + "loss": 1.121, + "step": 1288 + }, + { + "epoch": 0.12, + "grad_norm": 0.2927850888283984, + "learning_rate": 0.0001999255129580321, + "loss": 1.1497, + "step": 1289 + }, + { + "epoch": 0.12, + "grad_norm": 0.2859310412286254, + "learning_rate": 0.00019992490123396897, + "loss": 1.1786, + "step": 1290 + }, + { + "epoch": 0.12, + "grad_norm": 0.25665923491793874, + "learning_rate": 0.00019992428700922236, + "loss": 1.0947, + "step": 1291 + }, + { + "epoch": 0.12, + "grad_norm": 0.24916615055802435, + "learning_rate": 0.00019992367028380764, + "loss": 1.1687, + "step": 1292 + }, + { + "epoch": 0.12, + "grad_norm": 0.2731992285743491, + "learning_rate": 0.00019992305105774033, + "loss": 1.1253, + "step": 1293 + }, + { + "epoch": 0.12, + "grad_norm": 0.26082758101857634, + "learning_rate": 0.0001999224293310358, + "loss": 1.1551, + "step": 1294 + }, + { + "epoch": 0.12, + "grad_norm": 0.2677306810392585, + "learning_rate": 0.00019992180510370976, + "loss": 1.2005, + "step": 1295 + }, + { + "epoch": 0.12, + "grad_norm": 0.23143531134792353, + "learning_rate": 0.00019992117837577768, + "loss": 1.0862, + "step": 1296 + }, + { + "epoch": 0.12, + "grad_norm": 0.2655840524523936, + "learning_rate": 0.00019992054914725533, + "loss": 1.1536, + "step": 1297 + }, + { + "epoch": 0.12, + "grad_norm": 0.2640787870858716, + "learning_rate": 0.00019991991741815849, + "loss": 1.1011, + "step": 1298 + }, + { + "epoch": 0.12, + "grad_norm": 0.26001791207694314, + "learning_rate": 0.00019991928318850285, + "loss": 1.191, + "step": 1299 + }, + { + "epoch": 0.12, + "grad_norm": 0.34274104078637135, + "learning_rate": 0.0001999186464583044, + "loss": 1.1189, + "step": 1300 + }, + { + "epoch": 0.12, + "grad_norm": 0.25164999546725786, + "learning_rate": 0.000199918007227579, + "loss": 1.1755, + "step": 1301 + }, + { + "epoch": 0.12, + "grad_norm": 0.2619736635468605, + "learning_rate": 0.00019991736549634267, + "loss": 1.0093, + "step": 1302 + }, + { + "epoch": 0.12, + "grad_norm": 0.2610328681252208, + "learning_rate": 0.00019991672126461147, + "loss": 1.2091, + "step": 1303 + }, + { + "epoch": 0.12, + "grad_norm": 0.27609879841972346, + "learning_rate": 0.00019991607453240153, + "loss": 1.1545, + "step": 1304 + }, + { + "epoch": 0.12, + "grad_norm": 0.2438919416348471, + "learning_rate": 0.00019991542529972905, + "loss": 1.0793, + "step": 1305 + }, + { + "epoch": 0.12, + "grad_norm": 0.34769105624706653, + "learning_rate": 0.00019991477356661022, + "loss": 1.0689, + "step": 1306 + }, + { + "epoch": 0.13, + "grad_norm": 0.24988099416617487, + "learning_rate": 0.0001999141193330614, + "loss": 0.9971, + "step": 1307 + }, + { + "epoch": 0.13, + "grad_norm": 0.26587548843318687, + "learning_rate": 0.00019991346259909897, + "loss": 1.0847, + "step": 1308 + }, + { + "epoch": 0.13, + "grad_norm": 0.27946757540245054, + "learning_rate": 0.00019991280336473935, + "loss": 1.137, + "step": 1309 + }, + { + "epoch": 0.13, + "grad_norm": 0.2928663604215012, + "learning_rate": 0.000199912141629999, + "loss": 1.157, + "step": 1310 + }, + { + "epoch": 0.13, + "grad_norm": 0.24871198294790245, + "learning_rate": 0.00019991147739489455, + "loss": 1.0734, + "step": 1311 + }, + { + "epoch": 0.13, + "grad_norm": 0.2384639920280004, + "learning_rate": 0.00019991081065944254, + "loss": 1.0737, + "step": 1312 + }, + { + "epoch": 0.13, + "grad_norm": 0.2656285356842293, + "learning_rate": 0.00019991014142365976, + "loss": 1.0513, + "step": 1313 + }, + { + "epoch": 0.13, + "grad_norm": 0.28562521186701684, + "learning_rate": 0.00019990946968756286, + "loss": 1.1639, + "step": 1314 + }, + { + "epoch": 0.13, + "grad_norm": 0.2648382753477716, + "learning_rate": 0.0001999087954511687, + "loss": 1.1691, + "step": 1315 + }, + { + "epoch": 0.13, + "grad_norm": 0.3183214028731789, + "learning_rate": 0.00019990811871449412, + "loss": 1.1523, + "step": 1316 + }, + { + "epoch": 0.13, + "grad_norm": 0.27420941387779174, + "learning_rate": 0.0001999074394775561, + "loss": 1.0882, + "step": 1317 + }, + { + "epoch": 0.13, + "grad_norm": 0.24879969547328998, + "learning_rate": 0.00019990675774037164, + "loss": 1.0498, + "step": 1318 + }, + { + "epoch": 0.13, + "grad_norm": 0.2738940434550004, + "learning_rate": 0.00019990607350295776, + "loss": 1.145, + "step": 1319 + }, + { + "epoch": 0.13, + "grad_norm": 0.25098756631874913, + "learning_rate": 0.0001999053867653316, + "loss": 1.0748, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 0.285510083056974, + "learning_rate": 0.00019990469752751032, + "loss": 1.0964, + "step": 1321 + }, + { + "epoch": 0.13, + "grad_norm": 0.2528776643597391, + "learning_rate": 0.00019990400578951125, + "loss": 1.1219, + "step": 1322 + }, + { + "epoch": 0.13, + "grad_norm": 0.2737172954753307, + "learning_rate": 0.0001999033115513516, + "loss": 1.144, + "step": 1323 + }, + { + "epoch": 0.13, + "grad_norm": 0.2788543402132724, + "learning_rate": 0.00019990261481304882, + "loss": 1.2348, + "step": 1324 + }, + { + "epoch": 0.13, + "grad_norm": 0.26612476420962583, + "learning_rate": 0.00019990191557462032, + "loss": 1.1158, + "step": 1325 + }, + { + "epoch": 0.13, + "grad_norm": 0.27339420148068405, + "learning_rate": 0.00019990121383608357, + "loss": 1.193, + "step": 1326 + }, + { + "epoch": 0.13, + "grad_norm": 0.2660334700185283, + "learning_rate": 0.0001999005095974562, + "loss": 1.0692, + "step": 1327 + }, + { + "epoch": 0.13, + "grad_norm": 0.2779459146879295, + "learning_rate": 0.00019989980285875576, + "loss": 1.1296, + "step": 1328 + }, + { + "epoch": 0.13, + "grad_norm": 0.30588845858874797, + "learning_rate": 0.00019989909361999998, + "loss": 1.102, + "step": 1329 + }, + { + "epoch": 0.13, + "grad_norm": 0.27026409826467807, + "learning_rate": 0.0001998983818812066, + "loss": 1.0786, + "step": 1330 + }, + { + "epoch": 0.13, + "grad_norm": 0.23833256583341253, + "learning_rate": 0.00019989766764239342, + "loss": 1.1167, + "step": 1331 + }, + { + "epoch": 0.13, + "grad_norm": 0.21010010951243135, + "learning_rate": 0.00019989695090357832, + "loss": 0.9995, + "step": 1332 + }, + { + "epoch": 0.13, + "grad_norm": 0.28853987325226016, + "learning_rate": 0.00019989623166477926, + "loss": 0.9722, + "step": 1333 + }, + { + "epoch": 0.13, + "grad_norm": 0.27581828527816954, + "learning_rate": 0.0001998955099260142, + "loss": 1.1367, + "step": 1334 + }, + { + "epoch": 0.13, + "grad_norm": 0.3266981204077468, + "learning_rate": 0.00019989478568730124, + "loss": 1.0954, + "step": 1335 + }, + { + "epoch": 0.13, + "grad_norm": 0.26634585407279676, + "learning_rate": 0.00019989405894865848, + "loss": 1.0318, + "step": 1336 + }, + { + "epoch": 0.13, + "grad_norm": 0.2862324766372512, + "learning_rate": 0.0001998933297101041, + "loss": 1.2006, + "step": 1337 + }, + { + "epoch": 0.13, + "grad_norm": 0.3219899860952693, + "learning_rate": 0.0001998925979716564, + "loss": 1.2428, + "step": 1338 + }, + { + "epoch": 0.13, + "grad_norm": 0.2593333848699128, + "learning_rate": 0.0001998918637333336, + "loss": 1.0661, + "step": 1339 + }, + { + "epoch": 0.13, + "grad_norm": 0.25460421291647545, + "learning_rate": 0.00019989112699515417, + "loss": 1.1549, + "step": 1340 + }, + { + "epoch": 0.13, + "grad_norm": 0.2806093932847469, + "learning_rate": 0.0001998903877571365, + "loss": 1.1396, + "step": 1341 + }, + { + "epoch": 0.13, + "grad_norm": 0.28375827740366566, + "learning_rate": 0.00019988964601929911, + "loss": 1.1933, + "step": 1342 + }, + { + "epoch": 0.13, + "grad_norm": 0.25635075031051086, + "learning_rate": 0.00019988890178166053, + "loss": 1.0908, + "step": 1343 + }, + { + "epoch": 0.13, + "grad_norm": 0.2888250115582476, + "learning_rate": 0.00019988815504423942, + "loss": 1.1556, + "step": 1344 + }, + { + "epoch": 0.13, + "grad_norm": 0.23088975772081866, + "learning_rate": 0.00019988740580705443, + "loss": 1.1304, + "step": 1345 + }, + { + "epoch": 0.13, + "grad_norm": 0.27596330498232263, + "learning_rate": 0.00019988665407012435, + "loss": 1.11, + "step": 1346 + }, + { + "epoch": 0.13, + "grad_norm": 0.24512542720942831, + "learning_rate": 0.00019988589983346798, + "loss": 1.1509, + "step": 1347 + }, + { + "epoch": 0.13, + "grad_norm": 0.25605979489959907, + "learning_rate": 0.00019988514309710417, + "loss": 1.0923, + "step": 1348 + }, + { + "epoch": 0.13, + "grad_norm": 0.2532094527125873, + "learning_rate": 0.0001998843838610519, + "loss": 1.0768, + "step": 1349 + }, + { + "epoch": 0.13, + "grad_norm": 0.2951636432930729, + "learning_rate": 0.00019988362212533013, + "loss": 1.1431, + "step": 1350 + }, + { + "epoch": 0.13, + "grad_norm": 0.25484076775146086, + "learning_rate": 0.000199882857889958, + "loss": 1.1345, + "step": 1351 + }, + { + "epoch": 0.13, + "grad_norm": 0.23726743985931845, + "learning_rate": 0.0001998820911549545, + "loss": 1.106, + "step": 1352 + }, + { + "epoch": 0.13, + "grad_norm": 0.2587569140280319, + "learning_rate": 0.00019988132192033892, + "loss": 1.1131, + "step": 1353 + }, + { + "epoch": 0.13, + "grad_norm": 0.26259253430908974, + "learning_rate": 0.0001998805501861305, + "loss": 1.1066, + "step": 1354 + }, + { + "epoch": 0.13, + "grad_norm": 0.24656763039460808, + "learning_rate": 0.00019987977595234852, + "loss": 1.1207, + "step": 1355 + }, + { + "epoch": 0.13, + "grad_norm": 0.27728609474508775, + "learning_rate": 0.0001998789992190124, + "loss": 1.0683, + "step": 1356 + }, + { + "epoch": 0.13, + "grad_norm": 0.266917547376331, + "learning_rate": 0.00019987821998614154, + "loss": 1.1693, + "step": 1357 + }, + { + "epoch": 0.13, + "grad_norm": 0.2915661042761893, + "learning_rate": 0.00019987743825375544, + "loss": 1.1064, + "step": 1358 + }, + { + "epoch": 0.13, + "grad_norm": 0.24772664636633338, + "learning_rate": 0.00019987665402187367, + "loss": 0.9948, + "step": 1359 + }, + { + "epoch": 0.13, + "grad_norm": 0.2855087767458927, + "learning_rate": 0.0001998758672905159, + "loss": 1.1449, + "step": 1360 + }, + { + "epoch": 0.13, + "grad_norm": 0.2772392094730354, + "learning_rate": 0.00019987507805970176, + "loss": 1.1349, + "step": 1361 + }, + { + "epoch": 0.13, + "grad_norm": 0.2612704349828971, + "learning_rate": 0.000199874286329451, + "loss": 1.0995, + "step": 1362 + }, + { + "epoch": 0.13, + "grad_norm": 0.24900155104384822, + "learning_rate": 0.00019987349209978352, + "loss": 1.1157, + "step": 1363 + }, + { + "epoch": 0.13, + "grad_norm": 0.26136521449937644, + "learning_rate": 0.0001998726953707191, + "loss": 1.1324, + "step": 1364 + }, + { + "epoch": 0.13, + "grad_norm": 0.27010231238155247, + "learning_rate": 0.0001998718961422777, + "loss": 1.1295, + "step": 1365 + }, + { + "epoch": 0.13, + "grad_norm": 0.28056706441584167, + "learning_rate": 0.00019987109441447934, + "loss": 1.1236, + "step": 1366 + }, + { + "epoch": 0.13, + "grad_norm": 0.24673474376997923, + "learning_rate": 0.00019987029018734407, + "loss": 1.1493, + "step": 1367 + }, + { + "epoch": 0.13, + "grad_norm": 0.25553237785153865, + "learning_rate": 0.00019986948346089201, + "loss": 1.1698, + "step": 1368 + }, + { + "epoch": 0.13, + "grad_norm": 0.2370749606006542, + "learning_rate": 0.0001998686742351434, + "loss": 1.03, + "step": 1369 + }, + { + "epoch": 0.13, + "grad_norm": 0.2786623699017042, + "learning_rate": 0.00019986786251011842, + "loss": 1.0002, + "step": 1370 + }, + { + "epoch": 0.13, + "grad_norm": 0.2582710457594854, + "learning_rate": 0.0001998670482858374, + "loss": 1.1957, + "step": 1371 + }, + { + "epoch": 0.13, + "grad_norm": 0.2414613658655144, + "learning_rate": 0.00019986623156232076, + "loss": 1.1471, + "step": 1372 + }, + { + "epoch": 0.13, + "grad_norm": 0.28959428619565936, + "learning_rate": 0.0001998654123395889, + "loss": 1.1716, + "step": 1373 + }, + { + "epoch": 0.13, + "grad_norm": 0.22250465518264687, + "learning_rate": 0.00019986459061766234, + "loss": 1.146, + "step": 1374 + }, + { + "epoch": 0.13, + "grad_norm": 0.26717391829997156, + "learning_rate": 0.00019986376639656163, + "loss": 1.1858, + "step": 1375 + }, + { + "epoch": 0.13, + "grad_norm": 0.2580258297035148, + "learning_rate": 0.00019986293967630742, + "loss": 1.1115, + "step": 1376 + }, + { + "epoch": 0.13, + "grad_norm": 0.2678948286097016, + "learning_rate": 0.0001998621104569204, + "loss": 1.1202, + "step": 1377 + }, + { + "epoch": 0.13, + "grad_norm": 0.291208905746566, + "learning_rate": 0.00019986127873842128, + "loss": 1.1424, + "step": 1378 + }, + { + "epoch": 0.13, + "grad_norm": 0.21902207523941578, + "learning_rate": 0.00019986044452083087, + "loss": 1.1687, + "step": 1379 + }, + { + "epoch": 0.13, + "grad_norm": 0.2907782173595465, + "learning_rate": 0.00019985960780417012, + "loss": 1.068, + "step": 1380 + }, + { + "epoch": 0.13, + "grad_norm": 0.2629226322026941, + "learning_rate": 0.0001998587685884599, + "loss": 1.0816, + "step": 1381 + }, + { + "epoch": 0.13, + "grad_norm": 0.2828450761974692, + "learning_rate": 0.00019985792687372126, + "loss": 1.0958, + "step": 1382 + }, + { + "epoch": 0.13, + "grad_norm": 0.26457153514550896, + "learning_rate": 0.00019985708265997523, + "loss": 1.1454, + "step": 1383 + }, + { + "epoch": 0.13, + "grad_norm": 0.25857136554982324, + "learning_rate": 0.00019985623594724294, + "loss": 1.074, + "step": 1384 + }, + { + "epoch": 0.13, + "grad_norm": 0.24297032009355968, + "learning_rate": 0.00019985538673554558, + "loss": 1.2311, + "step": 1385 + }, + { + "epoch": 0.13, + "grad_norm": 0.29342141362537133, + "learning_rate": 0.00019985453502490447, + "loss": 1.0765, + "step": 1386 + }, + { + "epoch": 0.13, + "grad_norm": 0.27688151236884495, + "learning_rate": 0.0001998536808153408, + "loss": 1.1203, + "step": 1387 + }, + { + "epoch": 0.13, + "grad_norm": 0.288243343301449, + "learning_rate": 0.000199852824106876, + "loss": 1.0883, + "step": 1388 + }, + { + "epoch": 0.13, + "grad_norm": 0.252259697983258, + "learning_rate": 0.00019985196489953158, + "loss": 1.2147, + "step": 1389 + }, + { + "epoch": 0.13, + "grad_norm": 0.26515095142272627, + "learning_rate": 0.00019985110319332896, + "loss": 1.0793, + "step": 1390 + }, + { + "epoch": 0.13, + "grad_norm": 0.28200900869959167, + "learning_rate": 0.00019985023898828972, + "loss": 1.0851, + "step": 1391 + }, + { + "epoch": 0.13, + "grad_norm": 0.22101241462701582, + "learning_rate": 0.0001998493722844355, + "loss": 1.0172, + "step": 1392 + }, + { + "epoch": 0.13, + "grad_norm": 0.29717507654200653, + "learning_rate": 0.000199848503081788, + "loss": 1.1634, + "step": 1393 + }, + { + "epoch": 0.13, + "grad_norm": 0.2734078285310837, + "learning_rate": 0.00019984763138036893, + "loss": 1.206, + "step": 1394 + }, + { + "epoch": 0.13, + "grad_norm": 0.31430473861156777, + "learning_rate": 0.00019984675718020016, + "loss": 1.1355, + "step": 1395 + }, + { + "epoch": 0.13, + "grad_norm": 0.25927920918090913, + "learning_rate": 0.00019984588048130352, + "loss": 1.0166, + "step": 1396 + }, + { + "epoch": 0.13, + "grad_norm": 0.2943475145072853, + "learning_rate": 0.000199845001283701, + "loss": 1.0848, + "step": 1397 + }, + { + "epoch": 0.13, + "grad_norm": 0.26595828195937876, + "learning_rate": 0.0001998441195874145, + "loss": 1.0692, + "step": 1398 + }, + { + "epoch": 0.13, + "grad_norm": 0.27821823753565383, + "learning_rate": 0.00019984323539246624, + "loss": 1.2192, + "step": 1399 + }, + { + "epoch": 0.13, + "grad_norm": 0.2678637821368645, + "learning_rate": 0.00019984234869887825, + "loss": 1.121, + "step": 1400 + }, + { + "epoch": 0.13, + "grad_norm": 0.25399068189204, + "learning_rate": 0.0001998414595066727, + "loss": 1.0925, + "step": 1401 + }, + { + "epoch": 0.13, + "grad_norm": 0.23859094414183193, + "learning_rate": 0.00019984056781587191, + "loss": 1.0955, + "step": 1402 + }, + { + "epoch": 0.13, + "grad_norm": 0.23969916810545017, + "learning_rate": 0.00019983967362649814, + "loss": 1.0125, + "step": 1403 + }, + { + "epoch": 0.13, + "grad_norm": 0.2542524563052129, + "learning_rate": 0.0001998387769385738, + "loss": 1.0373, + "step": 1404 + }, + { + "epoch": 0.13, + "grad_norm": 0.27487621829351494, + "learning_rate": 0.0001998378777521213, + "loss": 1.0952, + "step": 1405 + }, + { + "epoch": 0.13, + "grad_norm": 0.2654518507826389, + "learning_rate": 0.0001998369760671632, + "loss": 1.1369, + "step": 1406 + }, + { + "epoch": 0.13, + "grad_norm": 0.273289916929377, + "learning_rate": 0.000199836071883722, + "loss": 1.1703, + "step": 1407 + }, + { + "epoch": 0.13, + "grad_norm": 0.26544162174771085, + "learning_rate": 0.0001998351652018204, + "loss": 1.1784, + "step": 1408 + }, + { + "epoch": 0.13, + "grad_norm": 0.2746593440350384, + "learning_rate": 0.000199834256021481, + "loss": 1.0443, + "step": 1409 + }, + { + "epoch": 0.13, + "grad_norm": 0.2501215693772626, + "learning_rate": 0.00019983334434272662, + "loss": 1.089, + "step": 1410 + }, + { + "epoch": 0.13, + "grad_norm": 0.268614062421362, + "learning_rate": 0.00019983243016558007, + "loss": 0.9597, + "step": 1411 + }, + { + "epoch": 0.14, + "grad_norm": 0.25030986160663105, + "learning_rate": 0.00019983151349006417, + "loss": 1.2056, + "step": 1412 + }, + { + "epoch": 0.14, + "grad_norm": 0.2603381688146406, + "learning_rate": 0.00019983059431620195, + "loss": 1.1547, + "step": 1413 + }, + { + "epoch": 0.14, + "grad_norm": 0.2421120755485211, + "learning_rate": 0.0001998296726440163, + "loss": 1.1084, + "step": 1414 + }, + { + "epoch": 0.14, + "grad_norm": 0.2521346105354822, + "learning_rate": 0.00019982874847353043, + "loss": 1.0624, + "step": 1415 + }, + { + "epoch": 0.14, + "grad_norm": 0.27862535658172966, + "learning_rate": 0.00019982782180476733, + "loss": 1.1186, + "step": 1416 + }, + { + "epoch": 0.14, + "grad_norm": 0.2947914398618809, + "learning_rate": 0.00019982689263775026, + "loss": 1.0843, + "step": 1417 + }, + { + "epoch": 0.14, + "grad_norm": 0.28536726878487906, + "learning_rate": 0.0001998259609725025, + "loss": 1.0653, + "step": 1418 + }, + { + "epoch": 0.14, + "grad_norm": 0.2854261291390814, + "learning_rate": 0.00019982502680904732, + "loss": 1.1266, + "step": 1419 + }, + { + "epoch": 0.14, + "grad_norm": 0.2788285385425596, + "learning_rate": 0.0001998240901474081, + "loss": 1.1592, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 0.2919295150525652, + "learning_rate": 0.00019982315098760825, + "loss": 1.23, + "step": 1421 + }, + { + "epoch": 0.14, + "grad_norm": 0.2667438613342299, + "learning_rate": 0.00019982220932967135, + "loss": 1.1653, + "step": 1422 + }, + { + "epoch": 0.14, + "grad_norm": 0.2937856653774384, + "learning_rate": 0.00019982126517362092, + "loss": 1.054, + "step": 1423 + }, + { + "epoch": 0.14, + "grad_norm": 0.2787585191603891, + "learning_rate": 0.0001998203185194806, + "loss": 1.1195, + "step": 1424 + }, + { + "epoch": 0.14, + "grad_norm": 0.2808724466906523, + "learning_rate": 0.00019981936936727402, + "loss": 1.1324, + "step": 1425 + }, + { + "epoch": 0.14, + "grad_norm": 0.24401430651001996, + "learning_rate": 0.00019981841771702505, + "loss": 1.138, + "step": 1426 + }, + { + "epoch": 0.14, + "grad_norm": 0.25174511987554116, + "learning_rate": 0.00019981746356875744, + "loss": 1.1906, + "step": 1427 + }, + { + "epoch": 0.14, + "grad_norm": 0.2538503054933492, + "learning_rate": 0.00019981650692249504, + "loss": 1.1029, + "step": 1428 + }, + { + "epoch": 0.14, + "grad_norm": 0.2764211181550846, + "learning_rate": 0.00019981554777826185, + "loss": 1.058, + "step": 1429 + }, + { + "epoch": 0.14, + "grad_norm": 0.2952219050552689, + "learning_rate": 0.00019981458613608182, + "loss": 1.0941, + "step": 1430 + }, + { + "epoch": 0.14, + "grad_norm": 0.2787064009838231, + "learning_rate": 0.00019981362199597907, + "loss": 1.1565, + "step": 1431 + }, + { + "epoch": 0.14, + "grad_norm": 0.2737728015545912, + "learning_rate": 0.00019981265535797766, + "loss": 1.181, + "step": 1432 + }, + { + "epoch": 0.14, + "grad_norm": 0.24759285916434062, + "learning_rate": 0.00019981168622210184, + "loss": 1.1008, + "step": 1433 + }, + { + "epoch": 0.14, + "grad_norm": 0.25095588755500636, + "learning_rate": 0.00019981071458837586, + "loss": 1.1312, + "step": 1434 + }, + { + "epoch": 0.14, + "grad_norm": 0.24482769279016886, + "learning_rate": 0.00019980974045682399, + "loss": 1.0652, + "step": 1435 + }, + { + "epoch": 0.14, + "grad_norm": 0.3060859989141741, + "learning_rate": 0.00019980876382747064, + "loss": 1.157, + "step": 1436 + }, + { + "epoch": 0.14, + "grad_norm": 0.2880153757669946, + "learning_rate": 0.00019980778470034025, + "loss": 1.2203, + "step": 1437 + }, + { + "epoch": 0.14, + "grad_norm": 0.27454359894274216, + "learning_rate": 0.00019980680307545733, + "loss": 1.0434, + "step": 1438 + }, + { + "epoch": 0.14, + "grad_norm": 0.2883237368727595, + "learning_rate": 0.00019980581895284646, + "loss": 1.1272, + "step": 1439 + }, + { + "epoch": 0.14, + "grad_norm": 0.2703448143010634, + "learning_rate": 0.0001998048323325322, + "loss": 1.0698, + "step": 1440 + }, + { + "epoch": 0.14, + "grad_norm": 0.2583337496657767, + "learning_rate": 0.00019980384321453931, + "loss": 1.0424, + "step": 1441 + }, + { + "epoch": 0.14, + "grad_norm": 0.2698951572905667, + "learning_rate": 0.00019980285159889251, + "loss": 1.1694, + "step": 1442 + }, + { + "epoch": 0.14, + "grad_norm": 0.3289970000039555, + "learning_rate": 0.00019980185748561663, + "loss": 1.1242, + "step": 1443 + }, + { + "epoch": 0.14, + "grad_norm": 0.33181434848256175, + "learning_rate": 0.00019980086087473655, + "loss": 1.2397, + "step": 1444 + }, + { + "epoch": 0.14, + "grad_norm": 0.3011135638992476, + "learning_rate": 0.0001997998617662772, + "loss": 1.1714, + "step": 1445 + }, + { + "epoch": 0.14, + "grad_norm": 0.26023630385034113, + "learning_rate": 0.0001997988601602636, + "loss": 1.1072, + "step": 1446 + }, + { + "epoch": 0.14, + "grad_norm": 0.27251940674998737, + "learning_rate": 0.00019979785605672078, + "loss": 1.0825, + "step": 1447 + }, + { + "epoch": 0.14, + "grad_norm": 0.2554656785010738, + "learning_rate": 0.0001997968494556739, + "loss": 1.0752, + "step": 1448 + }, + { + "epoch": 0.14, + "grad_norm": 0.23684498161731513, + "learning_rate": 0.00019979584035714813, + "loss": 1.06, + "step": 1449 + }, + { + "epoch": 0.14, + "grad_norm": 0.31872148847175163, + "learning_rate": 0.00019979482876116876, + "loss": 1.1667, + "step": 1450 + }, + { + "epoch": 0.14, + "grad_norm": 0.25697407458502153, + "learning_rate": 0.00019979381466776107, + "loss": 1.0822, + "step": 1451 + }, + { + "epoch": 0.14, + "grad_norm": 0.2680749561732018, + "learning_rate": 0.00019979279807695046, + "loss": 1.1473, + "step": 1452 + }, + { + "epoch": 0.14, + "grad_norm": 0.23738996733574236, + "learning_rate": 0.00019979177898876233, + "loss": 1.1218, + "step": 1453 + }, + { + "epoch": 0.14, + "grad_norm": 0.2518660152482727, + "learning_rate": 0.00019979075740322224, + "loss": 1.1364, + "step": 1454 + }, + { + "epoch": 0.14, + "grad_norm": 0.2843340075762548, + "learning_rate": 0.00019978973332035574, + "loss": 1.1052, + "step": 1455 + }, + { + "epoch": 0.14, + "grad_norm": 0.23668848969056627, + "learning_rate": 0.0001997887067401884, + "loss": 1.1692, + "step": 1456 + }, + { + "epoch": 0.14, + "grad_norm": 0.28502539435405605, + "learning_rate": 0.000199787677662746, + "loss": 1.2374, + "step": 1457 + }, + { + "epoch": 0.14, + "grad_norm": 0.25655559740700473, + "learning_rate": 0.00019978664608805423, + "loss": 1.1088, + "step": 1458 + }, + { + "epoch": 0.14, + "grad_norm": 0.2585840843019671, + "learning_rate": 0.00019978561201613895, + "loss": 1.1601, + "step": 1459 + }, + { + "epoch": 0.14, + "grad_norm": 0.2550422041426665, + "learning_rate": 0.00019978457544702602, + "loss": 1.1033, + "step": 1460 + }, + { + "epoch": 0.14, + "grad_norm": 0.2899278429933367, + "learning_rate": 0.00019978353638074137, + "loss": 1.1611, + "step": 1461 + }, + { + "epoch": 0.14, + "grad_norm": 0.234358263645623, + "learning_rate": 0.000199782494817311, + "loss": 1.0151, + "step": 1462 + }, + { + "epoch": 0.14, + "grad_norm": 0.24241424067059672, + "learning_rate": 0.00019978145075676097, + "loss": 1.1854, + "step": 1463 + }, + { + "epoch": 0.14, + "grad_norm": 0.2685159364940143, + "learning_rate": 0.00019978040419911744, + "loss": 1.0691, + "step": 1464 + }, + { + "epoch": 0.14, + "grad_norm": 0.31415333163030407, + "learning_rate": 0.0001997793551444066, + "loss": 1.0344, + "step": 1465 + }, + { + "epoch": 0.14, + "grad_norm": 0.2366061685991993, + "learning_rate": 0.00019977830359265472, + "loss": 1.1004, + "step": 1466 + }, + { + "epoch": 0.14, + "grad_norm": 0.2485341188969311, + "learning_rate": 0.00019977724954388802, + "loss": 1.1078, + "step": 1467 + }, + { + "epoch": 0.14, + "grad_norm": 0.29387809184755664, + "learning_rate": 0.00019977619299813297, + "loss": 1.1114, + "step": 1468 + }, + { + "epoch": 0.14, + "grad_norm": 0.2652962832837201, + "learning_rate": 0.00019977513395541598, + "loss": 1.1429, + "step": 1469 + }, + { + "epoch": 0.14, + "grad_norm": 0.296485600586474, + "learning_rate": 0.00019977407241576355, + "loss": 1.1725, + "step": 1470 + }, + { + "epoch": 0.14, + "grad_norm": 0.25206336596375595, + "learning_rate": 0.00019977300837920227, + "loss": 1.1611, + "step": 1471 + }, + { + "epoch": 0.14, + "grad_norm": 0.2550564024652486, + "learning_rate": 0.00019977194184575873, + "loss": 1.1339, + "step": 1472 + }, + { + "epoch": 0.14, + "grad_norm": 0.27675386051052386, + "learning_rate": 0.00019977087281545966, + "loss": 1.1179, + "step": 1473 + }, + { + "epoch": 0.14, + "grad_norm": 0.2615226908183446, + "learning_rate": 0.00019976980128833178, + "loss": 1.0797, + "step": 1474 + }, + { + "epoch": 0.14, + "grad_norm": 0.268113224944456, + "learning_rate": 0.00019976872726440193, + "loss": 1.1163, + "step": 1475 + }, + { + "epoch": 0.14, + "grad_norm": 0.23912506420391394, + "learning_rate": 0.00019976765074369697, + "loss": 1.099, + "step": 1476 + }, + { + "epoch": 0.14, + "grad_norm": 0.28513459887618126, + "learning_rate": 0.00019976657172624383, + "loss": 1.0811, + "step": 1477 + }, + { + "epoch": 0.14, + "grad_norm": 0.28114711060770275, + "learning_rate": 0.00019976549021206958, + "loss": 1.2168, + "step": 1478 + }, + { + "epoch": 0.14, + "grad_norm": 0.2572724456046117, + "learning_rate": 0.0001997644062012012, + "loss": 1.0752, + "step": 1479 + }, + { + "epoch": 0.14, + "grad_norm": 0.3019482336468289, + "learning_rate": 0.00019976331969366587, + "loss": 1.1129, + "step": 1480 + }, + { + "epoch": 0.14, + "grad_norm": 0.2566195486989078, + "learning_rate": 0.00019976223068949076, + "loss": 1.1825, + "step": 1481 + }, + { + "epoch": 0.14, + "grad_norm": 0.27980667259100733, + "learning_rate": 0.00019976113918870314, + "loss": 1.1828, + "step": 1482 + }, + { + "epoch": 0.14, + "grad_norm": 0.30150809201942436, + "learning_rate": 0.0001997600451913303, + "loss": 1.1247, + "step": 1483 + }, + { + "epoch": 0.14, + "grad_norm": 0.2716498140574145, + "learning_rate": 0.0001997589486973996, + "loss": 1.1254, + "step": 1484 + }, + { + "epoch": 0.14, + "grad_norm": 0.2625386839698671, + "learning_rate": 0.00019975784970693855, + "loss": 1.1081, + "step": 1485 + }, + { + "epoch": 0.14, + "grad_norm": 0.29792002537853113, + "learning_rate": 0.00019975674821997463, + "loss": 1.1597, + "step": 1486 + }, + { + "epoch": 0.14, + "grad_norm": 0.2301276867707596, + "learning_rate": 0.00019975564423653538, + "loss": 1.1342, + "step": 1487 + }, + { + "epoch": 0.14, + "grad_norm": 0.27119741051691004, + "learning_rate": 0.00019975453775664844, + "loss": 1.0453, + "step": 1488 + }, + { + "epoch": 0.14, + "grad_norm": 0.25640967867486597, + "learning_rate": 0.0001997534287803415, + "loss": 1.0959, + "step": 1489 + }, + { + "epoch": 0.14, + "grad_norm": 0.2817881208330018, + "learning_rate": 0.00019975231730764227, + "loss": 1.1004, + "step": 1490 + }, + { + "epoch": 0.14, + "grad_norm": 0.2632972390210799, + "learning_rate": 0.00019975120333857866, + "loss": 1.0682, + "step": 1491 + }, + { + "epoch": 0.14, + "grad_norm": 0.256251122254461, + "learning_rate": 0.0001997500868731785, + "loss": 1.1663, + "step": 1492 + }, + { + "epoch": 0.14, + "grad_norm": 0.26242859708220295, + "learning_rate": 0.0001997489679114697, + "loss": 1.0843, + "step": 1493 + }, + { + "epoch": 0.14, + "grad_norm": 0.28009382446859793, + "learning_rate": 0.0001997478464534803, + "loss": 1.1744, + "step": 1494 + }, + { + "epoch": 0.14, + "grad_norm": 0.2448192879010323, + "learning_rate": 0.0001997467224992383, + "loss": 1.1268, + "step": 1495 + }, + { + "epoch": 0.14, + "grad_norm": 0.23428824139945728, + "learning_rate": 0.00019974559604877195, + "loss": 1.0997, + "step": 1496 + }, + { + "epoch": 0.14, + "grad_norm": 0.2833166934983789, + "learning_rate": 0.00019974446710210934, + "loss": 1.0867, + "step": 1497 + }, + { + "epoch": 0.14, + "grad_norm": 0.25022212744952455, + "learning_rate": 0.00019974333565927878, + "loss": 1.0903, + "step": 1498 + }, + { + "epoch": 0.14, + "grad_norm": 0.26855972657496696, + "learning_rate": 0.00019974220172030852, + "loss": 1.0304, + "step": 1499 + }, + { + "epoch": 0.14, + "grad_norm": 0.26473122846509034, + "learning_rate": 0.000199741065285227, + "loss": 1.0981, + "step": 1500 + }, + { + "epoch": 0.14, + "grad_norm": 0.2915333904654244, + "learning_rate": 0.00019973992635406265, + "loss": 1.157, + "step": 1501 + }, + { + "epoch": 0.14, + "grad_norm": 0.37217781009539613, + "learning_rate": 0.00019973878492684393, + "loss": 1.1401, + "step": 1502 + }, + { + "epoch": 0.14, + "grad_norm": 0.2745430144825149, + "learning_rate": 0.00019973764100359942, + "loss": 1.1318, + "step": 1503 + }, + { + "epoch": 0.14, + "grad_norm": 0.2755977808247509, + "learning_rate": 0.0001997364945843578, + "loss": 1.1609, + "step": 1504 + }, + { + "epoch": 0.14, + "grad_norm": 0.2915733276431069, + "learning_rate": 0.00019973534566914772, + "loss": 1.086, + "step": 1505 + }, + { + "epoch": 0.14, + "grad_norm": 0.2599783190286603, + "learning_rate": 0.00019973419425799792, + "loss": 1.1108, + "step": 1506 + }, + { + "epoch": 0.14, + "grad_norm": 0.280753933923634, + "learning_rate": 0.0001997330403509372, + "loss": 1.147, + "step": 1507 + }, + { + "epoch": 0.14, + "grad_norm": 0.27055165527737557, + "learning_rate": 0.0001997318839479945, + "loss": 1.0952, + "step": 1508 + }, + { + "epoch": 0.14, + "grad_norm": 0.2491475388699138, + "learning_rate": 0.00019973072504919875, + "loss": 1.0893, + "step": 1509 + }, + { + "epoch": 0.14, + "grad_norm": 0.22529007248240643, + "learning_rate": 0.00019972956365457887, + "loss": 0.9453, + "step": 1510 + }, + { + "epoch": 0.14, + "grad_norm": 0.29669549846388793, + "learning_rate": 0.000199728399764164, + "loss": 1.2225, + "step": 1511 + }, + { + "epoch": 0.14, + "grad_norm": 0.27716026577726227, + "learning_rate": 0.00019972723337798327, + "loss": 1.2158, + "step": 1512 + }, + { + "epoch": 0.14, + "grad_norm": 0.2615727080933855, + "learning_rate": 0.00019972606449606583, + "loss": 1.1745, + "step": 1513 + }, + { + "epoch": 0.14, + "grad_norm": 0.2613301650841691, + "learning_rate": 0.00019972489311844097, + "loss": 1.0105, + "step": 1514 + }, + { + "epoch": 0.14, + "grad_norm": 0.2637350666912871, + "learning_rate": 0.00019972371924513796, + "loss": 1.2445, + "step": 1515 + }, + { + "epoch": 0.15, + "grad_norm": 0.23442111345358183, + "learning_rate": 0.0001997225428761862, + "loss": 1.0897, + "step": 1516 + }, + { + "epoch": 0.15, + "grad_norm": 0.2790398743404188, + "learning_rate": 0.00019972136401161516, + "loss": 1.0976, + "step": 1517 + }, + { + "epoch": 0.15, + "grad_norm": 0.24161573374816125, + "learning_rate": 0.00019972018265145428, + "loss": 1.0778, + "step": 1518 + }, + { + "epoch": 0.15, + "grad_norm": 0.2599798006860897, + "learning_rate": 0.00019971899879573317, + "loss": 1.0797, + "step": 1519 + }, + { + "epoch": 0.15, + "grad_norm": 0.24106641987255334, + "learning_rate": 0.00019971781244448145, + "loss": 1.0863, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 0.28462310372801436, + "learning_rate": 0.0001997166235977288, + "loss": 1.2261, + "step": 1521 + }, + { + "epoch": 0.15, + "grad_norm": 0.2715980154291507, + "learning_rate": 0.00019971543225550498, + "loss": 1.074, + "step": 1522 + }, + { + "epoch": 0.15, + "grad_norm": 0.24961681655649778, + "learning_rate": 0.0001997142384178398, + "loss": 1.102, + "step": 1523 + }, + { + "epoch": 0.15, + "grad_norm": 0.27808524258122086, + "learning_rate": 0.00019971304208476313, + "loss": 1.0456, + "step": 1524 + }, + { + "epoch": 0.15, + "grad_norm": 0.272393710564596, + "learning_rate": 0.0001997118432563049, + "loss": 0.9979, + "step": 1525 + }, + { + "epoch": 0.15, + "grad_norm": 0.2985225834683353, + "learning_rate": 0.00019971064193249517, + "loss": 1.1062, + "step": 1526 + }, + { + "epoch": 0.15, + "grad_norm": 0.31751577230140715, + "learning_rate": 0.0001997094381133639, + "loss": 1.1574, + "step": 1527 + }, + { + "epoch": 0.15, + "grad_norm": 0.2311208784177862, + "learning_rate": 0.00019970823179894134, + "loss": 1.0605, + "step": 1528 + }, + { + "epoch": 0.15, + "grad_norm": 0.27395265358450693, + "learning_rate": 0.00019970702298925756, + "loss": 1.0394, + "step": 1529 + }, + { + "epoch": 0.15, + "grad_norm": 0.31050969831593717, + "learning_rate": 0.0001997058116843429, + "loss": 1.21, + "step": 1530 + }, + { + "epoch": 0.15, + "grad_norm": 0.29740261342037966, + "learning_rate": 0.00019970459788422762, + "loss": 1.0973, + "step": 1531 + }, + { + "epoch": 0.15, + "grad_norm": 0.30710726934897853, + "learning_rate": 0.00019970338158894213, + "loss": 1.0899, + "step": 1532 + }, + { + "epoch": 0.15, + "grad_norm": 0.38054688041895546, + "learning_rate": 0.00019970216279851686, + "loss": 1.0608, + "step": 1533 + }, + { + "epoch": 0.15, + "grad_norm": 0.26727890820952926, + "learning_rate": 0.0001997009415129823, + "loss": 1.1018, + "step": 1534 + }, + { + "epoch": 0.15, + "grad_norm": 0.29384471950665425, + "learning_rate": 0.000199699717732369, + "loss": 1.088, + "step": 1535 + }, + { + "epoch": 0.15, + "grad_norm": 0.245541461264911, + "learning_rate": 0.00019969849145670763, + "loss": 1.0829, + "step": 1536 + }, + { + "epoch": 0.15, + "grad_norm": 0.3031137108785239, + "learning_rate": 0.0001996972626860288, + "loss": 1.138, + "step": 1537 + }, + { + "epoch": 0.15, + "grad_norm": 0.27477763946901834, + "learning_rate": 0.0001996960314203634, + "loss": 1.1612, + "step": 1538 + }, + { + "epoch": 0.15, + "grad_norm": 0.2970092386217553, + "learning_rate": 0.0001996947976597421, + "loss": 1.0688, + "step": 1539 + }, + { + "epoch": 0.15, + "grad_norm": 0.3258862801086761, + "learning_rate": 0.00019969356140419584, + "loss": 1.1302, + "step": 1540 + }, + { + "epoch": 0.15, + "grad_norm": 0.2995257919569518, + "learning_rate": 0.00019969232265375556, + "loss": 1.0475, + "step": 1541 + }, + { + "epoch": 0.15, + "grad_norm": 0.26050135512792294, + "learning_rate": 0.00019969108140845224, + "loss": 1.1356, + "step": 1542 + }, + { + "epoch": 0.15, + "grad_norm": 0.258577882146825, + "learning_rate": 0.00019968983766831695, + "loss": 1.0545, + "step": 1543 + }, + { + "epoch": 0.15, + "grad_norm": 0.2627007677252112, + "learning_rate": 0.00019968859143338084, + "loss": 1.1386, + "step": 1544 + }, + { + "epoch": 0.15, + "grad_norm": 0.2633365267212909, + "learning_rate": 0.00019968734270367505, + "loss": 1.1784, + "step": 1545 + }, + { + "epoch": 0.15, + "grad_norm": 0.28411120690910896, + "learning_rate": 0.0001996860914792309, + "loss": 1.0888, + "step": 1546 + }, + { + "epoch": 0.15, + "grad_norm": 0.26646479947544477, + "learning_rate": 0.00019968483776007962, + "loss": 1.1659, + "step": 1547 + }, + { + "epoch": 0.15, + "grad_norm": 0.24786146098955286, + "learning_rate": 0.00019968358154625265, + "loss": 1.2185, + "step": 1548 + }, + { + "epoch": 0.15, + "grad_norm": 0.23607348527883257, + "learning_rate": 0.0001996823228377814, + "loss": 1.0735, + "step": 1549 + }, + { + "epoch": 0.15, + "grad_norm": 0.2570468565891483, + "learning_rate": 0.00019968106163469735, + "loss": 1.0616, + "step": 1550 + }, + { + "epoch": 0.15, + "grad_norm": 0.25852020457845143, + "learning_rate": 0.00019967979793703212, + "loss": 1.093, + "step": 1551 + }, + { + "epoch": 0.15, + "grad_norm": 0.28278864970040046, + "learning_rate": 0.00019967853174481727, + "loss": 1.0388, + "step": 1552 + }, + { + "epoch": 0.15, + "grad_norm": 0.2941535073545967, + "learning_rate": 0.00019967726305808453, + "loss": 1.0706, + "step": 1553 + }, + { + "epoch": 0.15, + "grad_norm": 0.2622528773036849, + "learning_rate": 0.00019967599187686562, + "loss": 1.0883, + "step": 1554 + }, + { + "epoch": 0.15, + "grad_norm": 0.27742024001398574, + "learning_rate": 0.00019967471820119242, + "loss": 1.0728, + "step": 1555 + }, + { + "epoch": 0.15, + "grad_norm": 0.3052593236911406, + "learning_rate": 0.00019967344203109671, + "loss": 1.1789, + "step": 1556 + }, + { + "epoch": 0.15, + "grad_norm": 0.28599434696605697, + "learning_rate": 0.0001996721633666105, + "loss": 1.1562, + "step": 1557 + }, + { + "epoch": 0.15, + "grad_norm": 0.24494683600954692, + "learning_rate": 0.00019967088220776573, + "loss": 1.2048, + "step": 1558 + }, + { + "epoch": 0.15, + "grad_norm": 0.25067138267095485, + "learning_rate": 0.0001996695985545945, + "loss": 1.1256, + "step": 1559 + }, + { + "epoch": 0.15, + "grad_norm": 0.3027501173185624, + "learning_rate": 0.00019966831240712893, + "loss": 1.1089, + "step": 1560 + }, + { + "epoch": 0.15, + "grad_norm": 0.29832312031178204, + "learning_rate": 0.0001996670237654012, + "loss": 1.1435, + "step": 1561 + }, + { + "epoch": 0.15, + "grad_norm": 0.2602686000520101, + "learning_rate": 0.00019966573262944357, + "loss": 1.2332, + "step": 1562 + }, + { + "epoch": 0.15, + "grad_norm": 0.2805057631189005, + "learning_rate": 0.00019966443899928831, + "loss": 1.0837, + "step": 1563 + }, + { + "epoch": 0.15, + "grad_norm": 0.38166616945412846, + "learning_rate": 0.00019966314287496787, + "loss": 1.059, + "step": 1564 + }, + { + "epoch": 0.15, + "grad_norm": 0.2764938914379305, + "learning_rate": 0.00019966184425651464, + "loss": 1.1139, + "step": 1565 + }, + { + "epoch": 0.15, + "grad_norm": 0.24932842836394434, + "learning_rate": 0.0001996605431439611, + "loss": 1.1142, + "step": 1566 + }, + { + "epoch": 0.15, + "grad_norm": 0.29196882597427254, + "learning_rate": 0.00019965923953733987, + "loss": 1.1607, + "step": 1567 + }, + { + "epoch": 0.15, + "grad_norm": 0.2709530754699249, + "learning_rate": 0.00019965793343668347, + "loss": 1.0495, + "step": 1568 + }, + { + "epoch": 0.15, + "grad_norm": 0.2598432982826659, + "learning_rate": 0.0001996566248420247, + "loss": 1.1288, + "step": 1569 + }, + { + "epoch": 0.15, + "grad_norm": 0.46312766196550453, + "learning_rate": 0.00019965531375339628, + "loss": 1.1307, + "step": 1570 + }, + { + "epoch": 0.15, + "grad_norm": 0.26262906846828266, + "learning_rate": 0.00019965400017083097, + "loss": 1.1543, + "step": 1571 + }, + { + "epoch": 0.15, + "grad_norm": 0.25314149624310883, + "learning_rate": 0.00019965268409436168, + "loss": 1.0466, + "step": 1572 + }, + { + "epoch": 0.15, + "grad_norm": 0.2687356408453828, + "learning_rate": 0.00019965136552402136, + "loss": 1.1159, + "step": 1573 + }, + { + "epoch": 0.15, + "grad_norm": 0.2708428789609682, + "learning_rate": 0.00019965004445984298, + "loss": 1.055, + "step": 1574 + }, + { + "epoch": 0.15, + "grad_norm": 0.3839155117397319, + "learning_rate": 0.0001996487209018596, + "loss": 1.0844, + "step": 1575 + }, + { + "epoch": 0.15, + "grad_norm": 0.7734934544648255, + "learning_rate": 0.00019964739485010436, + "loss": 1.1704, + "step": 1576 + }, + { + "epoch": 0.15, + "grad_norm": 0.2985808342595769, + "learning_rate": 0.00019964606630461042, + "loss": 1.1233, + "step": 1577 + }, + { + "epoch": 0.15, + "grad_norm": 0.28199357321934904, + "learning_rate": 0.00019964473526541107, + "loss": 1.1306, + "step": 1578 + }, + { + "epoch": 0.15, + "grad_norm": 0.2708719269403245, + "learning_rate": 0.0001996434017325396, + "loss": 1.2283, + "step": 1579 + }, + { + "epoch": 0.15, + "grad_norm": 0.2792985547025937, + "learning_rate": 0.00019964206570602936, + "loss": 1.1385, + "step": 1580 + }, + { + "epoch": 0.15, + "grad_norm": 0.23415050247231725, + "learning_rate": 0.0001996407271859138, + "loss": 1.0389, + "step": 1581 + }, + { + "epoch": 0.15, + "grad_norm": 0.2740160527199919, + "learning_rate": 0.00019963938617222643, + "loss": 1.1243, + "step": 1582 + }, + { + "epoch": 0.15, + "grad_norm": 0.27399387661110913, + "learning_rate": 0.0001996380426650008, + "loss": 1.1087, + "step": 1583 + }, + { + "epoch": 0.15, + "grad_norm": 0.2692759658009343, + "learning_rate": 0.0001996366966642705, + "loss": 1.1163, + "step": 1584 + }, + { + "epoch": 0.15, + "grad_norm": 0.9607464989916183, + "learning_rate": 0.0001996353481700693, + "loss": 1.0442, + "step": 1585 + }, + { + "epoch": 0.15, + "grad_norm": 0.24459484067831805, + "learning_rate": 0.00019963399718243084, + "loss": 1.0841, + "step": 1586 + }, + { + "epoch": 0.15, + "grad_norm": 0.2762521737160385, + "learning_rate": 0.00019963264370138903, + "loss": 1.1496, + "step": 1587 + }, + { + "epoch": 0.15, + "grad_norm": 0.25933505174178323, + "learning_rate": 0.0001996312877269777, + "loss": 1.2522, + "step": 1588 + }, + { + "epoch": 0.15, + "grad_norm": 0.27189774149978774, + "learning_rate": 0.00019962992925923073, + "loss": 1.1413, + "step": 1589 + }, + { + "epoch": 0.15, + "grad_norm": 3.576402306240423, + "learning_rate": 0.00019962856829818223, + "loss": 1.1702, + "step": 1590 + }, + { + "epoch": 0.15, + "grad_norm": 0.28151915869781446, + "learning_rate": 0.00019962720484386614, + "loss": 1.1608, + "step": 1591 + }, + { + "epoch": 0.15, + "grad_norm": 3.487170407807799, + "learning_rate": 0.00019962583889631663, + "loss": 1.1038, + "step": 1592 + }, + { + "epoch": 0.15, + "grad_norm": 0.25611542436265444, + "learning_rate": 0.00019962447045556792, + "loss": 1.0629, + "step": 1593 + }, + { + "epoch": 0.15, + "grad_norm": 0.24414987846237798, + "learning_rate": 0.00019962309952165425, + "loss": 1.0264, + "step": 1594 + }, + { + "epoch": 0.15, + "grad_norm": 0.26011148045017146, + "learning_rate": 0.00019962172609460982, + "loss": 1.0993, + "step": 1595 + }, + { + "epoch": 0.15, + "grad_norm": 0.2618849192631391, + "learning_rate": 0.00019962035017446916, + "loss": 1.1054, + "step": 1596 + }, + { + "epoch": 0.15, + "grad_norm": 0.24456979964789494, + "learning_rate": 0.0001996189717612666, + "loss": 1.1605, + "step": 1597 + }, + { + "epoch": 0.15, + "grad_norm": 0.2832924711395636, + "learning_rate": 0.00019961759085503666, + "loss": 1.1245, + "step": 1598 + }, + { + "epoch": 0.15, + "grad_norm": 0.24882178646084718, + "learning_rate": 0.00019961620745581387, + "loss": 1.0725, + "step": 1599 + }, + { + "epoch": 0.15, + "grad_norm": 0.22925300602806323, + "learning_rate": 0.00019961482156363296, + "loss": 1.0953, + "step": 1600 + }, + { + "epoch": 0.15, + "grad_norm": 0.2531531358177607, + "learning_rate": 0.00019961343317852846, + "loss": 1.0204, + "step": 1601 + }, + { + "epoch": 0.15, + "grad_norm": 0.2738032960535575, + "learning_rate": 0.00019961204230053525, + "loss": 1.1092, + "step": 1602 + }, + { + "epoch": 0.15, + "grad_norm": 0.2731400838916497, + "learning_rate": 0.00019961064892968806, + "loss": 1.1173, + "step": 1603 + }, + { + "epoch": 0.15, + "grad_norm": 0.2493741546517925, + "learning_rate": 0.00019960925306602176, + "loss": 1.0769, + "step": 1604 + }, + { + "epoch": 0.15, + "grad_norm": 0.25050950474007155, + "learning_rate": 0.0001996078547095713, + "loss": 1.133, + "step": 1605 + }, + { + "epoch": 0.15, + "grad_norm": 0.29817105037980673, + "learning_rate": 0.0001996064538603717, + "loss": 1.1855, + "step": 1606 + }, + { + "epoch": 0.15, + "grad_norm": 0.28263632425868207, + "learning_rate": 0.00019960505051845796, + "loss": 1.136, + "step": 1607 + }, + { + "epoch": 0.15, + "grad_norm": 0.23133396852491225, + "learning_rate": 0.00019960364468386526, + "loss": 0.9476, + "step": 1608 + }, + { + "epoch": 0.15, + "grad_norm": 0.2775442108767544, + "learning_rate": 0.00019960223635662874, + "loss": 1.1606, + "step": 1609 + }, + { + "epoch": 0.15, + "grad_norm": 0.2805090162171342, + "learning_rate": 0.00019960082553678365, + "loss": 1.143, + "step": 1610 + }, + { + "epoch": 0.15, + "grad_norm": 0.2537997461349602, + "learning_rate": 0.0001995994122243653, + "loss": 1.1834, + "step": 1611 + }, + { + "epoch": 0.15, + "grad_norm": 0.32560983069595756, + "learning_rate": 0.00019959799641940907, + "loss": 0.9919, + "step": 1612 + }, + { + "epoch": 0.15, + "grad_norm": 0.2777624913037504, + "learning_rate": 0.0001995965781219504, + "loss": 1.1872, + "step": 1613 + }, + { + "epoch": 0.15, + "grad_norm": 0.2758857420110973, + "learning_rate": 0.00019959515733202477, + "loss": 1.137, + "step": 1614 + }, + { + "epoch": 0.15, + "grad_norm": 0.256755743287946, + "learning_rate": 0.0001995937340496677, + "loss": 1.0922, + "step": 1615 + }, + { + "epoch": 0.15, + "grad_norm": 0.2517079524462437, + "learning_rate": 0.00019959230827491488, + "loss": 1.0859, + "step": 1616 + }, + { + "epoch": 0.15, + "grad_norm": 0.2724013994730525, + "learning_rate": 0.00019959088000780193, + "loss": 1.0288, + "step": 1617 + }, + { + "epoch": 0.15, + "grad_norm": 0.3008600441450719, + "learning_rate": 0.00019958944924836463, + "loss": 1.044, + "step": 1618 + }, + { + "epoch": 0.15, + "grad_norm": 0.29350285017468836, + "learning_rate": 0.00019958801599663877, + "loss": 1.1309, + "step": 1619 + }, + { + "epoch": 0.15, + "grad_norm": 0.26935905493782025, + "learning_rate": 0.0001995865802526602, + "loss": 1.1497, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 0.2619749727571851, + "learning_rate": 0.0001995851420164649, + "loss": 1.226, + "step": 1621 + }, + { + "epoch": 0.16, + "grad_norm": 0.2566559504175784, + "learning_rate": 0.00019958370128808883, + "loss": 1.126, + "step": 1622 + }, + { + "epoch": 0.16, + "grad_norm": 0.2950868154557528, + "learning_rate": 0.00019958225806756806, + "loss": 1.1165, + "step": 1623 + }, + { + "epoch": 0.16, + "grad_norm": 0.262703858238673, + "learning_rate": 0.00019958081235493867, + "loss": 1.1535, + "step": 1624 + }, + { + "epoch": 0.16, + "grad_norm": 0.27043560750628914, + "learning_rate": 0.00019957936415023687, + "loss": 1.1192, + "step": 1625 + }, + { + "epoch": 0.16, + "grad_norm": 0.263242304281921, + "learning_rate": 0.00019957791345349892, + "loss": 1.0326, + "step": 1626 + }, + { + "epoch": 0.16, + "grad_norm": 0.2815449484727248, + "learning_rate": 0.0001995764602647611, + "loss": 1.0835, + "step": 1627 + }, + { + "epoch": 0.16, + "grad_norm": 0.2868280654211211, + "learning_rate": 0.00019957500458405976, + "loss": 1.1983, + "step": 1628 + }, + { + "epoch": 0.16, + "grad_norm": 0.31775561559603943, + "learning_rate": 0.00019957354641143136, + "loss": 1.163, + "step": 1629 + }, + { + "epoch": 0.16, + "grad_norm": 0.2962143104484358, + "learning_rate": 0.00019957208574691238, + "loss": 1.2085, + "step": 1630 + }, + { + "epoch": 0.16, + "grad_norm": 0.33134737467969283, + "learning_rate": 0.0001995706225905394, + "loss": 1.1736, + "step": 1631 + }, + { + "epoch": 0.16, + "grad_norm": 0.2973914160477879, + "learning_rate": 0.00019956915694234895, + "loss": 1.1877, + "step": 1632 + }, + { + "epoch": 0.16, + "grad_norm": 0.23797305266943983, + "learning_rate": 0.00019956768880237781, + "loss": 1.13, + "step": 1633 + }, + { + "epoch": 0.16, + "grad_norm": 0.28430135940967705, + "learning_rate": 0.0001995662181706627, + "loss": 1.1628, + "step": 1634 + }, + { + "epoch": 0.16, + "grad_norm": 0.2606274355294148, + "learning_rate": 0.00019956474504724038, + "loss": 1.1124, + "step": 1635 + }, + { + "epoch": 0.16, + "grad_norm": 0.26540643513755174, + "learning_rate": 0.00019956326943214775, + "loss": 1.0509, + "step": 1636 + }, + { + "epoch": 0.16, + "grad_norm": 0.25782459518811457, + "learning_rate": 0.00019956179132542173, + "loss": 1.0932, + "step": 1637 + }, + { + "epoch": 0.16, + "grad_norm": 0.24697183220142635, + "learning_rate": 0.00019956031072709932, + "loss": 1.2274, + "step": 1638 + }, + { + "epoch": 0.16, + "grad_norm": 0.2609501474414402, + "learning_rate": 0.0001995588276372175, + "loss": 1.0552, + "step": 1639 + }, + { + "epoch": 0.16, + "grad_norm": 0.2649049635321031, + "learning_rate": 0.00019955734205581352, + "loss": 0.9704, + "step": 1640 + }, + { + "epoch": 0.16, + "grad_norm": 0.2593017975988553, + "learning_rate": 0.00019955585398292447, + "loss": 1.1551, + "step": 1641 + }, + { + "epoch": 0.16, + "grad_norm": 0.25956189155243276, + "learning_rate": 0.0001995543634185876, + "loss": 1.1989, + "step": 1642 + }, + { + "epoch": 0.16, + "grad_norm": 0.2560637809135954, + "learning_rate": 0.0001995528703628402, + "loss": 1.1312, + "step": 1643 + }, + { + "epoch": 0.16, + "grad_norm": 0.27697911347482446, + "learning_rate": 0.00019955137481571968, + "loss": 1.2054, + "step": 1644 + }, + { + "epoch": 0.16, + "grad_norm": 0.24778928635339068, + "learning_rate": 0.00019954987677726343, + "loss": 1.1358, + "step": 1645 + }, + { + "epoch": 0.16, + "grad_norm": 0.26610781310481363, + "learning_rate": 0.00019954837624750895, + "loss": 1.1007, + "step": 1646 + }, + { + "epoch": 0.16, + "grad_norm": 0.2799296986376897, + "learning_rate": 0.0001995468732264938, + "loss": 1.0621, + "step": 1647 + }, + { + "epoch": 0.16, + "grad_norm": 0.26342475793330683, + "learning_rate": 0.00019954536771425556, + "loss": 1.1325, + "step": 1648 + }, + { + "epoch": 0.16, + "grad_norm": 0.2809166549010256, + "learning_rate": 0.00019954385971083193, + "loss": 1.0778, + "step": 1649 + }, + { + "epoch": 0.16, + "grad_norm": 0.28395265589643537, + "learning_rate": 0.00019954234921626068, + "loss": 0.9792, + "step": 1650 + }, + { + "epoch": 0.16, + "grad_norm": 0.3037040271080715, + "learning_rate": 0.00019954083623057955, + "loss": 1.1754, + "step": 1651 + }, + { + "epoch": 0.16, + "grad_norm": 0.23645934008105318, + "learning_rate": 0.00019953932075382646, + "loss": 1.1307, + "step": 1652 + }, + { + "epoch": 0.16, + "grad_norm": 0.2723890058669645, + "learning_rate": 0.00019953780278603932, + "loss": 1.1161, + "step": 1653 + }, + { + "epoch": 0.16, + "grad_norm": 0.2607964386960627, + "learning_rate": 0.00019953628232725608, + "loss": 1.1741, + "step": 1654 + }, + { + "epoch": 0.16, + "grad_norm": 0.27487552426540823, + "learning_rate": 0.0001995347593775148, + "loss": 1.131, + "step": 1655 + }, + { + "epoch": 0.16, + "grad_norm": 0.24819637701868438, + "learning_rate": 0.00019953323393685367, + "loss": 1.1246, + "step": 1656 + }, + { + "epoch": 0.16, + "grad_norm": 0.2880503601951672, + "learning_rate": 0.00019953170600531074, + "loss": 1.1414, + "step": 1657 + }, + { + "epoch": 0.16, + "grad_norm": 0.2661262998136285, + "learning_rate": 0.00019953017558292438, + "loss": 1.1857, + "step": 1658 + }, + { + "epoch": 0.16, + "grad_norm": 0.2660986350757843, + "learning_rate": 0.00019952864266973278, + "loss": 1.1092, + "step": 1659 + }, + { + "epoch": 0.16, + "grad_norm": 0.2713659116488214, + "learning_rate": 0.00019952710726577435, + "loss": 1.0772, + "step": 1660 + }, + { + "epoch": 0.16, + "grad_norm": 0.29223182156019084, + "learning_rate": 0.00019952556937108753, + "loss": 1.0789, + "step": 1661 + }, + { + "epoch": 0.16, + "grad_norm": 0.2582755115758311, + "learning_rate": 0.00019952402898571077, + "loss": 1.1875, + "step": 1662 + }, + { + "epoch": 0.16, + "grad_norm": 0.2654852860667591, + "learning_rate": 0.00019952248610968264, + "loss": 1.0323, + "step": 1663 + }, + { + "epoch": 0.16, + "grad_norm": 0.2935124336052021, + "learning_rate": 0.00019952094074304175, + "loss": 0.9859, + "step": 1664 + }, + { + "epoch": 0.16, + "grad_norm": 0.2546178333136953, + "learning_rate": 0.00019951939288582676, + "loss": 1.0756, + "step": 1665 + }, + { + "epoch": 0.16, + "grad_norm": 0.3002327247319587, + "learning_rate": 0.0001995178425380764, + "loss": 1.1356, + "step": 1666 + }, + { + "epoch": 0.16, + "grad_norm": 0.2792232404834516, + "learning_rate": 0.00019951628969982953, + "loss": 1.1251, + "step": 1667 + }, + { + "epoch": 0.16, + "grad_norm": 0.28042254510601033, + "learning_rate": 0.00019951473437112495, + "loss": 1.0406, + "step": 1668 + }, + { + "epoch": 0.16, + "grad_norm": 0.2869293910747367, + "learning_rate": 0.0001995131765520016, + "loss": 1.0321, + "step": 1669 + }, + { + "epoch": 0.16, + "grad_norm": 0.26731773733638436, + "learning_rate": 0.00019951161624249844, + "loss": 1.1865, + "step": 1670 + }, + { + "epoch": 0.16, + "grad_norm": 0.26494146088310505, + "learning_rate": 0.00019951005344265462, + "loss": 1.105, + "step": 1671 + }, + { + "epoch": 0.16, + "grad_norm": 0.26633126397973417, + "learning_rate": 0.0001995084881525091, + "loss": 1.0366, + "step": 1672 + }, + { + "epoch": 0.16, + "grad_norm": 0.27563381490550426, + "learning_rate": 0.00019950692037210113, + "loss": 1.1346, + "step": 1673 + }, + { + "epoch": 0.16, + "grad_norm": 0.2808572907767721, + "learning_rate": 0.00019950535010146994, + "loss": 1.1304, + "step": 1674 + }, + { + "epoch": 0.16, + "grad_norm": 0.249082820217972, + "learning_rate": 0.00019950377734065486, + "loss": 1.1375, + "step": 1675 + }, + { + "epoch": 0.16, + "grad_norm": 0.242030163260417, + "learning_rate": 0.00019950220208969519, + "loss": 1.0647, + "step": 1676 + }, + { + "epoch": 0.16, + "grad_norm": 0.23873021173771233, + "learning_rate": 0.00019950062434863038, + "loss": 1.1427, + "step": 1677 + }, + { + "epoch": 0.16, + "grad_norm": 0.2526857334515431, + "learning_rate": 0.00019949904411749995, + "loss": 1.0652, + "step": 1678 + }, + { + "epoch": 0.16, + "grad_norm": 0.22005647549892257, + "learning_rate": 0.00019949746139634336, + "loss": 1.2141, + "step": 1679 + }, + { + "epoch": 0.16, + "grad_norm": 0.29830008364083893, + "learning_rate": 0.0001994958761852003, + "loss": 0.9724, + "step": 1680 + }, + { + "epoch": 0.16, + "grad_norm": 0.2336810810860809, + "learning_rate": 0.00019949428848411036, + "loss": 1.0718, + "step": 1681 + }, + { + "epoch": 0.16, + "grad_norm": 0.2596500357999934, + "learning_rate": 0.00019949269829311336, + "loss": 1.1449, + "step": 1682 + }, + { + "epoch": 0.16, + "grad_norm": 0.28287000384537997, + "learning_rate": 0.00019949110561224905, + "loss": 1.1513, + "step": 1683 + }, + { + "epoch": 0.16, + "grad_norm": 0.27286503590527955, + "learning_rate": 0.00019948951044155728, + "loss": 1.095, + "step": 1684 + }, + { + "epoch": 0.16, + "grad_norm": 0.26619367233558344, + "learning_rate": 0.000199487912781078, + "loss": 1.0861, + "step": 1685 + }, + { + "epoch": 0.16, + "grad_norm": 0.29588681836665753, + "learning_rate": 0.0001994863126308512, + "loss": 1.0361, + "step": 1686 + }, + { + "epoch": 0.16, + "grad_norm": 0.27036346205815504, + "learning_rate": 0.00019948470999091685, + "loss": 1.1066, + "step": 1687 + }, + { + "epoch": 0.16, + "grad_norm": 0.24182695997990206, + "learning_rate": 0.00019948310486131513, + "loss": 0.9643, + "step": 1688 + }, + { + "epoch": 0.16, + "grad_norm": 0.26214221449152814, + "learning_rate": 0.0001994814972420862, + "loss": 1.1246, + "step": 1689 + }, + { + "epoch": 0.16, + "grad_norm": 0.2804122217034823, + "learning_rate": 0.0001994798871332703, + "loss": 1.1795, + "step": 1690 + }, + { + "epoch": 0.16, + "grad_norm": 0.2594172377288309, + "learning_rate": 0.00019947827453490767, + "loss": 1.1388, + "step": 1691 + }, + { + "epoch": 0.16, + "grad_norm": 0.2893954998932536, + "learning_rate": 0.0001994766594470387, + "loss": 1.0079, + "step": 1692 + }, + { + "epoch": 0.16, + "grad_norm": 0.2936972888903826, + "learning_rate": 0.0001994750418697038, + "loss": 1.1258, + "step": 1693 + }, + { + "epoch": 0.16, + "grad_norm": 0.26900132587121695, + "learning_rate": 0.00019947342180294346, + "loss": 1.2546, + "step": 1694 + }, + { + "epoch": 0.16, + "grad_norm": 0.26577176120374074, + "learning_rate": 0.00019947179924679825, + "loss": 1.1571, + "step": 1695 + }, + { + "epoch": 0.16, + "grad_norm": 0.2734445337941885, + "learning_rate": 0.00019947017420130872, + "loss": 1.0859, + "step": 1696 + }, + { + "epoch": 0.16, + "grad_norm": 0.24000154406623206, + "learning_rate": 0.0001994685466665156, + "loss": 1.1031, + "step": 1697 + }, + { + "epoch": 0.16, + "grad_norm": 0.22779641522136282, + "learning_rate": 0.00019946691664245956, + "loss": 1.0854, + "step": 1698 + }, + { + "epoch": 0.16, + "grad_norm": 0.2479814495549998, + "learning_rate": 0.0001994652841291814, + "loss": 1.1121, + "step": 1699 + }, + { + "epoch": 0.16, + "grad_norm": 0.250268222311568, + "learning_rate": 0.00019946364912672203, + "loss": 1.0779, + "step": 1700 + }, + { + "epoch": 0.16, + "grad_norm": 0.24912184436362922, + "learning_rate": 0.00019946201163512233, + "loss": 1.0067, + "step": 1701 + }, + { + "epoch": 0.16, + "grad_norm": 0.3207294751787716, + "learning_rate": 0.00019946037165442327, + "loss": 1.303, + "step": 1702 + }, + { + "epoch": 0.16, + "grad_norm": 0.28907607252287926, + "learning_rate": 0.0001994587291846659, + "loss": 1.1877, + "step": 1703 + }, + { + "epoch": 0.16, + "grad_norm": 0.25848051240176273, + "learning_rate": 0.0001994570842258913, + "loss": 1.0081, + "step": 1704 + }, + { + "epoch": 0.16, + "grad_norm": 0.2756315421016595, + "learning_rate": 0.00019945543677814067, + "loss": 1.0329, + "step": 1705 + }, + { + "epoch": 0.16, + "grad_norm": 0.2505304319664102, + "learning_rate": 0.00019945378684145526, + "loss": 1.1502, + "step": 1706 + }, + { + "epoch": 0.16, + "grad_norm": 0.3140138337651653, + "learning_rate": 0.00019945213441587633, + "loss": 1.108, + "step": 1707 + }, + { + "epoch": 0.16, + "grad_norm": 0.2650198799790111, + "learning_rate": 0.0001994504795014452, + "loss": 1.0843, + "step": 1708 + }, + { + "epoch": 0.16, + "grad_norm": 0.27741715785219834, + "learning_rate": 0.00019944882209820333, + "loss": 1.0471, + "step": 1709 + }, + { + "epoch": 0.16, + "grad_norm": 0.28688576394081394, + "learning_rate": 0.0001994471622061922, + "loss": 1.0745, + "step": 1710 + }, + { + "epoch": 0.16, + "grad_norm": 0.3160625561308828, + "learning_rate": 0.0001994454998254533, + "loss": 1.0412, + "step": 1711 + }, + { + "epoch": 0.16, + "grad_norm": 0.29390242181574067, + "learning_rate": 0.0001994438349560283, + "loss": 1.0297, + "step": 1712 + }, + { + "epoch": 0.16, + "grad_norm": 0.2662667067917202, + "learning_rate": 0.00019944216759795885, + "loss": 1.0189, + "step": 1713 + }, + { + "epoch": 0.16, + "grad_norm": 0.2652127225373576, + "learning_rate": 0.00019944049775128661, + "loss": 1.0433, + "step": 1714 + }, + { + "epoch": 0.16, + "grad_norm": 0.28198238645107127, + "learning_rate": 0.00019943882541605343, + "loss": 1.1984, + "step": 1715 + }, + { + "epoch": 0.16, + "grad_norm": 0.2930815699661024, + "learning_rate": 0.00019943715059230117, + "loss": 1.0741, + "step": 1716 + }, + { + "epoch": 0.16, + "grad_norm": 0.32027322958606264, + "learning_rate": 0.0001994354732800717, + "loss": 1.097, + "step": 1717 + }, + { + "epoch": 0.16, + "grad_norm": 0.23663660962171493, + "learning_rate": 0.00019943379347940704, + "loss": 1.0879, + "step": 1718 + }, + { + "epoch": 0.16, + "grad_norm": 0.26819927151645284, + "learning_rate": 0.0001994321111903492, + "loss": 1.1009, + "step": 1719 + }, + { + "epoch": 0.16, + "grad_norm": 0.23575422949387753, + "learning_rate": 0.00019943042641294028, + "loss": 1.1155, + "step": 1720 + }, + { + "epoch": 0.16, + "grad_norm": 0.23615429167267799, + "learning_rate": 0.00019942873914722243, + "loss": 1.1978, + "step": 1721 + }, + { + "epoch": 0.16, + "grad_norm": 0.23075710530915816, + "learning_rate": 0.00019942704939323794, + "loss": 1.1802, + "step": 1722 + }, + { + "epoch": 0.16, + "grad_norm": 0.26619797669604706, + "learning_rate": 0.00019942535715102903, + "loss": 1.2049, + "step": 1723 + }, + { + "epoch": 0.16, + "grad_norm": 0.2766557794492051, + "learning_rate": 0.00019942366242063807, + "loss": 1.1062, + "step": 1724 + }, + { + "epoch": 0.17, + "grad_norm": 0.2650480870677421, + "learning_rate": 0.00019942196520210748, + "loss": 1.2134, + "step": 1725 + }, + { + "epoch": 0.17, + "grad_norm": 0.25567802198203454, + "learning_rate": 0.00019942026549547973, + "loss": 1.1747, + "step": 1726 + }, + { + "epoch": 0.17, + "grad_norm": 0.27010898900806307, + "learning_rate": 0.00019941856330079732, + "loss": 1.2341, + "step": 1727 + }, + { + "epoch": 0.17, + "grad_norm": 0.2672243888343421, + "learning_rate": 0.0001994168586181029, + "loss": 1.038, + "step": 1728 + }, + { + "epoch": 0.17, + "grad_norm": 0.27037675730748634, + "learning_rate": 0.00019941515144743913, + "loss": 1.1484, + "step": 1729 + }, + { + "epoch": 0.17, + "grad_norm": 0.27423511003272355, + "learning_rate": 0.00019941344178884868, + "loss": 1.1925, + "step": 1730 + }, + { + "epoch": 0.17, + "grad_norm": 0.25846818596543564, + "learning_rate": 0.0001994117296423744, + "loss": 1.0063, + "step": 1731 + }, + { + "epoch": 0.17, + "grad_norm": 0.2686589532404364, + "learning_rate": 0.0001994100150080591, + "loss": 1.0285, + "step": 1732 + }, + { + "epoch": 0.17, + "grad_norm": 0.2766088906498271, + "learning_rate": 0.00019940829788594569, + "loss": 1.0765, + "step": 1733 + }, + { + "epoch": 0.17, + "grad_norm": 0.26339315358801624, + "learning_rate": 0.00019940657827607715, + "loss": 1.1692, + "step": 1734 + }, + { + "epoch": 0.17, + "grad_norm": 0.2603560134384319, + "learning_rate": 0.0001994048561784965, + "loss": 1.0406, + "step": 1735 + }, + { + "epoch": 0.17, + "grad_norm": 0.2904525258177709, + "learning_rate": 0.0001994031315932469, + "loss": 0.9866, + "step": 1736 + }, + { + "epoch": 0.17, + "grad_norm": 0.254967597751732, + "learning_rate": 0.00019940140452037142, + "loss": 1.1711, + "step": 1737 + }, + { + "epoch": 0.17, + "grad_norm": 0.2587539173182572, + "learning_rate": 0.00019939967495991332, + "loss": 1.1377, + "step": 1738 + }, + { + "epoch": 0.17, + "grad_norm": 0.26239329017748414, + "learning_rate": 0.0001993979429119159, + "loss": 1.0846, + "step": 1739 + }, + { + "epoch": 0.17, + "grad_norm": 0.2641030306092387, + "learning_rate": 0.00019939620837642247, + "loss": 1.1515, + "step": 1740 + }, + { + "epoch": 0.17, + "grad_norm": 0.29629448536301656, + "learning_rate": 0.00019939447135347647, + "loss": 1.1464, + "step": 1741 + }, + { + "epoch": 0.17, + "grad_norm": 0.259586106165695, + "learning_rate": 0.00019939273184312137, + "loss": 1.0899, + "step": 1742 + }, + { + "epoch": 0.17, + "grad_norm": 0.26296382387837713, + "learning_rate": 0.0001993909898454007, + "loss": 1.0482, + "step": 1743 + }, + { + "epoch": 0.17, + "grad_norm": 0.28051523053769833, + "learning_rate": 0.000199389245360358, + "loss": 1.1547, + "step": 1744 + }, + { + "epoch": 0.17, + "grad_norm": 0.2671245457129155, + "learning_rate": 0.00019938749838803696, + "loss": 1.0592, + "step": 1745 + }, + { + "epoch": 0.17, + "grad_norm": 0.3181482392262407, + "learning_rate": 0.00019938574892848135, + "loss": 1.0635, + "step": 1746 + }, + { + "epoch": 0.17, + "grad_norm": 0.2899294422265618, + "learning_rate": 0.0001993839969817349, + "loss": 1.0768, + "step": 1747 + }, + { + "epoch": 0.17, + "grad_norm": 0.25002771184882505, + "learning_rate": 0.00019938224254784147, + "loss": 1.1619, + "step": 1748 + }, + { + "epoch": 0.17, + "grad_norm": 0.2568140049401059, + "learning_rate": 0.00019938048562684495, + "loss": 1.0229, + "step": 1749 + }, + { + "epoch": 0.17, + "grad_norm": 0.36292519666598, + "learning_rate": 0.00019937872621878934, + "loss": 1.1794, + "step": 1750 + }, + { + "epoch": 0.17, + "grad_norm": 0.2932029976728667, + "learning_rate": 0.0001993769643237186, + "loss": 1.1902, + "step": 1751 + }, + { + "epoch": 0.17, + "grad_norm": 0.2771533942225469, + "learning_rate": 0.00019937519994167694, + "loss": 1.0731, + "step": 1752 + }, + { + "epoch": 0.17, + "grad_norm": 0.30070233491956233, + "learning_rate": 0.00019937343307270842, + "loss": 1.1186, + "step": 1753 + }, + { + "epoch": 0.17, + "grad_norm": 0.23520740536849988, + "learning_rate": 0.00019937166371685727, + "loss": 1.0768, + "step": 1754 + }, + { + "epoch": 0.17, + "grad_norm": 0.2689615596060478, + "learning_rate": 0.0001993698918741678, + "loss": 1.0889, + "step": 1755 + }, + { + "epoch": 0.17, + "grad_norm": 0.2667096465377454, + "learning_rate": 0.0001993681175446843, + "loss": 1.1273, + "step": 1756 + }, + { + "epoch": 0.17, + "grad_norm": 0.2774024784883662, + "learning_rate": 0.00019936634072845126, + "loss": 1.0687, + "step": 1757 + }, + { + "epoch": 0.17, + "grad_norm": 0.2948555113393165, + "learning_rate": 0.00019936456142551306, + "loss": 1.1369, + "step": 1758 + }, + { + "epoch": 0.17, + "grad_norm": 0.2783762471401322, + "learning_rate": 0.00019936277963591428, + "loss": 1.1218, + "step": 1759 + }, + { + "epoch": 0.17, + "grad_norm": 0.24719261245618435, + "learning_rate": 0.00019936099535969946, + "loss": 1.1024, + "step": 1760 + }, + { + "epoch": 0.17, + "grad_norm": 0.3182614470412637, + "learning_rate": 0.00019935920859691332, + "loss": 1.1849, + "step": 1761 + }, + { + "epoch": 0.17, + "grad_norm": 0.23710818113640875, + "learning_rate": 0.00019935741934760053, + "loss": 1.0466, + "step": 1762 + }, + { + "epoch": 0.17, + "grad_norm": 0.2638247973270966, + "learning_rate": 0.00019935562761180586, + "loss": 1.046, + "step": 1763 + }, + { + "epoch": 0.17, + "grad_norm": 0.29051740614485994, + "learning_rate": 0.0001993538333895742, + "loss": 1.0124, + "step": 1764 + }, + { + "epoch": 0.17, + "grad_norm": 0.344576484247007, + "learning_rate": 0.0001993520366809504, + "loss": 1.0553, + "step": 1765 + }, + { + "epoch": 0.17, + "grad_norm": 0.2734046389788186, + "learning_rate": 0.00019935023748597942, + "loss": 1.1175, + "step": 1766 + }, + { + "epoch": 0.17, + "grad_norm": 0.2658344263426795, + "learning_rate": 0.00019934843580470633, + "loss": 1.1288, + "step": 1767 + }, + { + "epoch": 0.17, + "grad_norm": 0.24386557141888807, + "learning_rate": 0.0001993466316371762, + "loss": 1.0556, + "step": 1768 + }, + { + "epoch": 0.17, + "grad_norm": 0.2903532438407926, + "learning_rate": 0.00019934482498343417, + "loss": 1.1214, + "step": 1769 + }, + { + "epoch": 0.17, + "grad_norm": 0.23273429637205256, + "learning_rate": 0.00019934301584352543, + "loss": 1.1819, + "step": 1770 + }, + { + "epoch": 0.17, + "grad_norm": 0.26436150903465644, + "learning_rate": 0.0001993412042174953, + "loss": 1.2129, + "step": 1771 + }, + { + "epoch": 0.17, + "grad_norm": 0.272699055168821, + "learning_rate": 0.00019933939010538914, + "loss": 1.0845, + "step": 1772 + }, + { + "epoch": 0.17, + "grad_norm": 0.24313514050010182, + "learning_rate": 0.00019933757350725227, + "loss": 1.0947, + "step": 1773 + }, + { + "epoch": 0.17, + "grad_norm": 0.2743855476662608, + "learning_rate": 0.00019933575442313022, + "loss": 1.0721, + "step": 1774 + }, + { + "epoch": 0.17, + "grad_norm": 0.27484803850628725, + "learning_rate": 0.00019933393285306847, + "loss": 1.086, + "step": 1775 + }, + { + "epoch": 0.17, + "grad_norm": 0.2862674448464463, + "learning_rate": 0.0001993321087971126, + "loss": 1.1687, + "step": 1776 + }, + { + "epoch": 0.17, + "grad_norm": 0.2654433158884731, + "learning_rate": 0.00019933028225530832, + "loss": 1.1524, + "step": 1777 + }, + { + "epoch": 0.17, + "grad_norm": 0.28570985555687595, + "learning_rate": 0.00019932845322770127, + "loss": 1.1032, + "step": 1778 + }, + { + "epoch": 0.17, + "grad_norm": 0.2398384669725776, + "learning_rate": 0.00019932662171433726, + "loss": 1.1805, + "step": 1779 + }, + { + "epoch": 0.17, + "grad_norm": 0.27132648388882097, + "learning_rate": 0.00019932478771526212, + "loss": 1.1706, + "step": 1780 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490907055488986, + "learning_rate": 0.00019932295123052175, + "loss": 1.0303, + "step": 1781 + }, + { + "epoch": 0.17, + "grad_norm": 0.2483810784888273, + "learning_rate": 0.0001993211122601621, + "loss": 1.183, + "step": 1782 + }, + { + "epoch": 0.17, + "grad_norm": 0.23864167216181773, + "learning_rate": 0.00019931927080422921, + "loss": 1.0438, + "step": 1783 + }, + { + "epoch": 0.17, + "grad_norm": 0.27530929268313675, + "learning_rate": 0.0001993174268627691, + "loss": 1.0515, + "step": 1784 + }, + { + "epoch": 0.17, + "grad_norm": 0.24646180451212257, + "learning_rate": 0.00019931558043582802, + "loss": 1.1064, + "step": 1785 + }, + { + "epoch": 0.17, + "grad_norm": 0.3072867379708468, + "learning_rate": 0.00019931373152345206, + "loss": 1.0433, + "step": 1786 + }, + { + "epoch": 0.17, + "grad_norm": 0.2699984246251364, + "learning_rate": 0.0001993118801256876, + "loss": 1.2135, + "step": 1787 + }, + { + "epoch": 0.17, + "grad_norm": 0.29491554910347884, + "learning_rate": 0.00019931002624258093, + "loss": 1.0451, + "step": 1788 + }, + { + "epoch": 0.17, + "grad_norm": 0.2592446364845811, + "learning_rate": 0.00019930816987417843, + "loss": 1.0678, + "step": 1789 + }, + { + "epoch": 0.17, + "grad_norm": 0.27551160462797714, + "learning_rate": 0.00019930631102052656, + "loss": 1.099, + "step": 1790 + }, + { + "epoch": 0.17, + "grad_norm": 0.3004312874610158, + "learning_rate": 0.00019930444968167184, + "loss": 1.2349, + "step": 1791 + }, + { + "epoch": 0.17, + "grad_norm": 0.2902276940371733, + "learning_rate": 0.00019930258585766083, + "loss": 1.0191, + "step": 1792 + }, + { + "epoch": 0.17, + "grad_norm": 0.2880266394681308, + "learning_rate": 0.00019930071954854026, + "loss": 1.0946, + "step": 1793 + }, + { + "epoch": 0.17, + "grad_norm": 0.27739961453298684, + "learning_rate": 0.00019929885075435673, + "loss": 1.0883, + "step": 1794 + }, + { + "epoch": 0.17, + "grad_norm": 0.26237306161853147, + "learning_rate": 0.00019929697947515705, + "loss": 1.13, + "step": 1795 + }, + { + "epoch": 0.17, + "grad_norm": 0.25732915392100997, + "learning_rate": 0.0001992951057109881, + "loss": 1.0223, + "step": 1796 + }, + { + "epoch": 0.17, + "grad_norm": 0.26231079106015476, + "learning_rate": 0.00019929322946189669, + "loss": 1.2334, + "step": 1797 + }, + { + "epoch": 0.17, + "grad_norm": 0.2689310820422752, + "learning_rate": 0.00019929135072792979, + "loss": 1.0859, + "step": 1798 + }, + { + "epoch": 0.17, + "grad_norm": 0.28231810159636933, + "learning_rate": 0.00019928946950913446, + "loss": 1.1499, + "step": 1799 + }, + { + "epoch": 0.17, + "grad_norm": 0.26114756115035764, + "learning_rate": 0.00019928758580555777, + "loss": 1.0692, + "step": 1800 + }, + { + "epoch": 0.17, + "grad_norm": 0.21499560470777954, + "learning_rate": 0.00019928569961724684, + "loss": 1.0246, + "step": 1801 + }, + { + "epoch": 0.17, + "grad_norm": 0.2499670112727543, + "learning_rate": 0.00019928381094424887, + "loss": 1.0571, + "step": 1802 + }, + { + "epoch": 0.17, + "grad_norm": 0.2668951326275685, + "learning_rate": 0.00019928191978661112, + "loss": 1.0914, + "step": 1803 + }, + { + "epoch": 0.17, + "grad_norm": 0.2436534609761875, + "learning_rate": 0.00019928002614438096, + "loss": 1.115, + "step": 1804 + }, + { + "epoch": 0.17, + "grad_norm": 0.24543054408218462, + "learning_rate": 0.00019927813001760573, + "loss": 1.1398, + "step": 1805 + }, + { + "epoch": 0.17, + "grad_norm": 0.275192488837919, + "learning_rate": 0.0001992762314063329, + "loss": 1.157, + "step": 1806 + }, + { + "epoch": 0.17, + "grad_norm": 0.24910350712903606, + "learning_rate": 0.00019927433031061, + "loss": 1.1897, + "step": 1807 + }, + { + "epoch": 0.17, + "grad_norm": 0.2440628864265639, + "learning_rate": 0.0001992724267304846, + "loss": 1.1515, + "step": 1808 + }, + { + "epoch": 0.17, + "grad_norm": 0.2878309034150722, + "learning_rate": 0.0001992705206660043, + "loss": 1.2264, + "step": 1809 + }, + { + "epoch": 0.17, + "grad_norm": 0.2910816442764025, + "learning_rate": 0.00019926861211721684, + "loss": 1.0837, + "step": 1810 + }, + { + "epoch": 0.17, + "grad_norm": 0.26250435365108804, + "learning_rate": 0.00019926670108416997, + "loss": 1.2048, + "step": 1811 + }, + { + "epoch": 0.17, + "grad_norm": 0.2870815342486622, + "learning_rate": 0.00019926478756691153, + "loss": 1.1559, + "step": 1812 + }, + { + "epoch": 0.17, + "grad_norm": 0.27090400410537946, + "learning_rate": 0.0001992628715654894, + "loss": 1.2068, + "step": 1813 + }, + { + "epoch": 0.17, + "grad_norm": 0.28120763276720057, + "learning_rate": 0.0001992609530799515, + "loss": 1.0354, + "step": 1814 + }, + { + "epoch": 0.17, + "grad_norm": 0.2894660442297703, + "learning_rate": 0.0001992590321103459, + "loss": 1.0584, + "step": 1815 + }, + { + "epoch": 0.17, + "grad_norm": 0.2780933045178876, + "learning_rate": 0.00019925710865672063, + "loss": 1.2506, + "step": 1816 + }, + { + "epoch": 0.17, + "grad_norm": 0.28692036245587266, + "learning_rate": 0.0001992551827191238, + "loss": 1.1621, + "step": 1817 + }, + { + "epoch": 0.17, + "grad_norm": 0.2982135909302484, + "learning_rate": 0.00019925325429760368, + "loss": 1.0948, + "step": 1818 + }, + { + "epoch": 0.17, + "grad_norm": 0.2820173648746567, + "learning_rate": 0.0001992513233922085, + "loss": 1.1477, + "step": 1819 + }, + { + "epoch": 0.17, + "grad_norm": 0.2639471711581256, + "learning_rate": 0.00019924939000298656, + "loss": 1.0992, + "step": 1820 + }, + { + "epoch": 0.17, + "grad_norm": 0.3258628877911058, + "learning_rate": 0.00019924745412998625, + "loss": 1.0337, + "step": 1821 + }, + { + "epoch": 0.17, + "grad_norm": 0.32533873777010336, + "learning_rate": 0.00019924551577325605, + "loss": 1.036, + "step": 1822 + }, + { + "epoch": 0.17, + "grad_norm": 0.29923641265792245, + "learning_rate": 0.00019924357493284443, + "loss": 1.1199, + "step": 1823 + }, + { + "epoch": 0.17, + "grad_norm": 0.2871942608589218, + "learning_rate": 0.00019924163160879997, + "loss": 1.111, + "step": 1824 + }, + { + "epoch": 0.17, + "grad_norm": 0.32570726041176123, + "learning_rate": 0.0001992396858011713, + "loss": 1.208, + "step": 1825 + }, + { + "epoch": 0.17, + "grad_norm": 0.2745227229675084, + "learning_rate": 0.00019923773751000714, + "loss": 1.0936, + "step": 1826 + }, + { + "epoch": 0.17, + "grad_norm": 0.29321444725480744, + "learning_rate": 0.00019923578673535622, + "loss": 1.1939, + "step": 1827 + }, + { + "epoch": 0.17, + "grad_norm": 0.31320266548147835, + "learning_rate": 0.0001992338334772674, + "loss": 1.092, + "step": 1828 + }, + { + "epoch": 0.17, + "grad_norm": 0.27134903395742416, + "learning_rate": 0.0001992318777357895, + "loss": 1.1936, + "step": 1829 + }, + { + "epoch": 0.18, + "grad_norm": 0.25932835062459336, + "learning_rate": 0.0001992299195109715, + "loss": 1.1709, + "step": 1830 + }, + { + "epoch": 0.18, + "grad_norm": 0.2821426140168247, + "learning_rate": 0.0001992279588028624, + "loss": 1.0848, + "step": 1831 + }, + { + "epoch": 0.18, + "grad_norm": 0.27780532527873364, + "learning_rate": 0.00019922599561151126, + "loss": 1.0701, + "step": 1832 + }, + { + "epoch": 0.18, + "grad_norm": 0.3213413443644143, + "learning_rate": 0.00019922402993696725, + "loss": 1.2066, + "step": 1833 + }, + { + "epoch": 0.18, + "grad_norm": 0.24152642775118963, + "learning_rate": 0.00019922206177927948, + "loss": 1.0779, + "step": 1834 + }, + { + "epoch": 0.18, + "grad_norm": 0.26358149683834353, + "learning_rate": 0.00019922009113849728, + "loss": 1.0631, + "step": 1835 + }, + { + "epoch": 0.18, + "grad_norm": 0.2771938797580255, + "learning_rate": 0.00019921811801466995, + "loss": 1.0627, + "step": 1836 + }, + { + "epoch": 0.18, + "grad_norm": 0.24390434386104548, + "learning_rate": 0.00019921614240784688, + "loss": 1.0826, + "step": 1837 + }, + { + "epoch": 0.18, + "grad_norm": 0.2883536207722251, + "learning_rate": 0.00019921416431807748, + "loss": 1.0587, + "step": 1838 + }, + { + "epoch": 0.18, + "grad_norm": 0.26058794322850115, + "learning_rate": 0.00019921218374541124, + "loss": 1.0926, + "step": 1839 + }, + { + "epoch": 0.18, + "grad_norm": 0.253199761945334, + "learning_rate": 0.00019921020068989776, + "loss": 1.0659, + "step": 1840 + }, + { + "epoch": 0.18, + "grad_norm": 0.27605817699682705, + "learning_rate": 0.00019920821515158666, + "loss": 1.0807, + "step": 1841 + }, + { + "epoch": 0.18, + "grad_norm": 0.34296685061041043, + "learning_rate": 0.0001992062271305276, + "loss": 1.0399, + "step": 1842 + }, + { + "epoch": 0.18, + "grad_norm": 0.24791123495573145, + "learning_rate": 0.0001992042366267704, + "loss": 1.0986, + "step": 1843 + }, + { + "epoch": 0.18, + "grad_norm": 0.2730759255331824, + "learning_rate": 0.0001992022436403648, + "loss": 1.1249, + "step": 1844 + }, + { + "epoch": 0.18, + "grad_norm": 0.25825382605152414, + "learning_rate": 0.0001992002481713607, + "loss": 1.1515, + "step": 1845 + }, + { + "epoch": 0.18, + "grad_norm": 0.2709947998255462, + "learning_rate": 0.0001991982502198081, + "loss": 1.0644, + "step": 1846 + }, + { + "epoch": 0.18, + "grad_norm": 0.3041130032344396, + "learning_rate": 0.0001991962497857569, + "loss": 1.2027, + "step": 1847 + }, + { + "epoch": 0.18, + "grad_norm": 0.2514054383827712, + "learning_rate": 0.00019919424686925722, + "loss": 1.1181, + "step": 1848 + }, + { + "epoch": 0.18, + "grad_norm": 0.2671075622096655, + "learning_rate": 0.00019919224147035914, + "loss": 1.0748, + "step": 1849 + }, + { + "epoch": 0.18, + "grad_norm": 0.2563779371324541, + "learning_rate": 0.00019919023358911292, + "loss": 1.1708, + "step": 1850 + }, + { + "epoch": 0.18, + "grad_norm": 0.27518701288612535, + "learning_rate": 0.00019918822322556877, + "loss": 1.068, + "step": 1851 + }, + { + "epoch": 0.18, + "grad_norm": 0.23464980973730304, + "learning_rate": 0.00019918621037977693, + "loss": 1.115, + "step": 1852 + }, + { + "epoch": 0.18, + "grad_norm": 0.22754310999457095, + "learning_rate": 0.0001991841950517879, + "loss": 1.143, + "step": 1853 + }, + { + "epoch": 0.18, + "grad_norm": 0.2402610477626132, + "learning_rate": 0.00019918217724165205, + "loss": 1.045, + "step": 1854 + }, + { + "epoch": 0.18, + "grad_norm": 0.2466867783180613, + "learning_rate": 0.00019918015694941988, + "loss": 1.1413, + "step": 1855 + }, + { + "epoch": 0.18, + "grad_norm": 0.27021081362108734, + "learning_rate": 0.00019917813417514194, + "loss": 1.1366, + "step": 1856 + }, + { + "epoch": 0.18, + "grad_norm": 0.2290266939618748, + "learning_rate": 0.00019917610891886884, + "loss": 1.077, + "step": 1857 + }, + { + "epoch": 0.18, + "grad_norm": 0.2912099506203328, + "learning_rate": 0.0001991740811806513, + "loss": 1.2381, + "step": 1858 + }, + { + "epoch": 0.18, + "grad_norm": 0.25710098640782725, + "learning_rate": 0.00019917205096054005, + "loss": 1.1494, + "step": 1859 + }, + { + "epoch": 0.18, + "grad_norm": 0.23444875824179745, + "learning_rate": 0.00019917001825858592, + "loss": 1.1993, + "step": 1860 + }, + { + "epoch": 0.18, + "grad_norm": 0.27972502638604985, + "learning_rate": 0.00019916798307483973, + "loss": 1.0881, + "step": 1861 + }, + { + "epoch": 0.18, + "grad_norm": 0.22927317304767958, + "learning_rate": 0.00019916594540935246, + "loss": 1.1226, + "step": 1862 + }, + { + "epoch": 0.18, + "grad_norm": 0.2777824230056472, + "learning_rate": 0.00019916390526217507, + "loss": 1.2791, + "step": 1863 + }, + { + "epoch": 0.18, + "grad_norm": 0.29153525342371706, + "learning_rate": 0.0001991618626333586, + "loss": 1.1707, + "step": 1864 + }, + { + "epoch": 0.18, + "grad_norm": 0.26506907160815607, + "learning_rate": 0.00019915981752295422, + "loss": 1.1309, + "step": 1865 + }, + { + "epoch": 0.18, + "grad_norm": 0.24281279217203466, + "learning_rate": 0.00019915776993101311, + "loss": 1.117, + "step": 1866 + }, + { + "epoch": 0.18, + "grad_norm": 0.2915237131661395, + "learning_rate": 0.00019915571985758645, + "loss": 1.1615, + "step": 1867 + }, + { + "epoch": 0.18, + "grad_norm": 0.26598319618634586, + "learning_rate": 0.00019915366730272562, + "loss": 1.2443, + "step": 1868 + }, + { + "epoch": 0.18, + "grad_norm": 0.25083927497077174, + "learning_rate": 0.00019915161226648193, + "loss": 1.091, + "step": 1869 + }, + { + "epoch": 0.18, + "grad_norm": 0.24871843858798745, + "learning_rate": 0.00019914955474890683, + "loss": 1.2225, + "step": 1870 + }, + { + "epoch": 0.18, + "grad_norm": 0.2807205864271564, + "learning_rate": 0.00019914749475005182, + "loss": 1.0856, + "step": 1871 + }, + { + "epoch": 0.18, + "grad_norm": 0.2564207640831039, + "learning_rate": 0.00019914543226996846, + "loss": 1.1381, + "step": 1872 + }, + { + "epoch": 0.18, + "grad_norm": 0.2711542979084927, + "learning_rate": 0.00019914336730870828, + "loss": 1.1482, + "step": 1873 + }, + { + "epoch": 0.18, + "grad_norm": 0.24605707766197607, + "learning_rate": 0.00019914129986632308, + "loss": 1.0468, + "step": 1874 + }, + { + "epoch": 0.18, + "grad_norm": 0.25895981792303296, + "learning_rate": 0.00019913922994286453, + "loss": 1.1124, + "step": 1875 + }, + { + "epoch": 0.18, + "grad_norm": 0.26071698636999885, + "learning_rate": 0.00019913715753838444, + "loss": 1.0977, + "step": 1876 + }, + { + "epoch": 0.18, + "grad_norm": 0.26270406723403217, + "learning_rate": 0.00019913508265293468, + "loss": 0.9724, + "step": 1877 + }, + { + "epoch": 0.18, + "grad_norm": 0.2729621283459525, + "learning_rate": 0.00019913300528656718, + "loss": 1.0379, + "step": 1878 + }, + { + "epoch": 0.18, + "grad_norm": 0.2667009672221466, + "learning_rate": 0.00019913092543933392, + "loss": 1.201, + "step": 1879 + }, + { + "epoch": 0.18, + "grad_norm": 0.3062682088725694, + "learning_rate": 0.00019912884311128692, + "loss": 1.2133, + "step": 1880 + }, + { + "epoch": 0.18, + "grad_norm": 0.2471603219169982, + "learning_rate": 0.00019912675830247834, + "loss": 1.0426, + "step": 1881 + }, + { + "epoch": 0.18, + "grad_norm": 0.2706944754253154, + "learning_rate": 0.00019912467101296035, + "loss": 1.0739, + "step": 1882 + }, + { + "epoch": 0.18, + "grad_norm": 0.23877771124529687, + "learning_rate": 0.00019912258124278517, + "loss": 1.1371, + "step": 1883 + }, + { + "epoch": 0.18, + "grad_norm": 0.2501800385911424, + "learning_rate": 0.00019912048899200507, + "loss": 0.985, + "step": 1884 + }, + { + "epoch": 0.18, + "grad_norm": 0.2543490544283289, + "learning_rate": 0.00019911839426067245, + "loss": 1.0922, + "step": 1885 + }, + { + "epoch": 0.18, + "grad_norm": 0.2610356534075215, + "learning_rate": 0.0001991162970488397, + "loss": 1.062, + "step": 1886 + }, + { + "epoch": 0.18, + "grad_norm": 0.31809853336105126, + "learning_rate": 0.0001991141973565594, + "loss": 1.0172, + "step": 1887 + }, + { + "epoch": 0.18, + "grad_norm": 0.3037942696710212, + "learning_rate": 0.00019911209518388393, + "loss": 1.2201, + "step": 1888 + }, + { + "epoch": 0.18, + "grad_norm": 0.2612653091466676, + "learning_rate": 0.00019910999053086604, + "loss": 1.1529, + "step": 1889 + }, + { + "epoch": 0.18, + "grad_norm": 0.25927362339534166, + "learning_rate": 0.00019910788339755833, + "loss": 1.1727, + "step": 1890 + }, + { + "epoch": 0.18, + "grad_norm": 0.26339036209636246, + "learning_rate": 0.00019910577378401355, + "loss": 1.0759, + "step": 1891 + }, + { + "epoch": 0.18, + "grad_norm": 0.26180032269848985, + "learning_rate": 0.00019910366169028452, + "loss": 1.0782, + "step": 1892 + }, + { + "epoch": 0.18, + "grad_norm": 0.27889254421508974, + "learning_rate": 0.00019910154711642403, + "loss": 1.1011, + "step": 1893 + }, + { + "epoch": 0.18, + "grad_norm": 0.2549984542896029, + "learning_rate": 0.00019909943006248505, + "loss": 1.15, + "step": 1894 + }, + { + "epoch": 0.18, + "grad_norm": 0.2274655301809786, + "learning_rate": 0.0001990973105285206, + "loss": 1.1843, + "step": 1895 + }, + { + "epoch": 0.18, + "grad_norm": 0.259196981448463, + "learning_rate": 0.00019909518851458363, + "loss": 1.0451, + "step": 1896 + }, + { + "epoch": 0.18, + "grad_norm": 0.37452396398591586, + "learning_rate": 0.0001990930640207273, + "loss": 1.1319, + "step": 1897 + }, + { + "epoch": 0.18, + "grad_norm": 0.30511870306850003, + "learning_rate": 0.00019909093704700473, + "loss": 1.1613, + "step": 1898 + }, + { + "epoch": 0.18, + "grad_norm": 0.29479478078757926, + "learning_rate": 0.00019908880759346925, + "loss": 1.1725, + "step": 1899 + }, + { + "epoch": 0.18, + "grad_norm": 0.3086910524148668, + "learning_rate": 0.00019908667566017406, + "loss": 1.1686, + "step": 1900 + }, + { + "epoch": 0.18, + "grad_norm": 0.2974264651570825, + "learning_rate": 0.0001990845412471725, + "loss": 1.1257, + "step": 1901 + }, + { + "epoch": 0.18, + "grad_norm": 0.256728492572231, + "learning_rate": 0.00019908240435451805, + "loss": 1.0166, + "step": 1902 + }, + { + "epoch": 0.18, + "grad_norm": 0.23937868610540455, + "learning_rate": 0.00019908026498226418, + "loss": 1.1205, + "step": 1903 + }, + { + "epoch": 0.18, + "grad_norm": 0.272693028176549, + "learning_rate": 0.00019907812313046437, + "loss": 1.1055, + "step": 1904 + }, + { + "epoch": 0.18, + "grad_norm": 0.27354401730685546, + "learning_rate": 0.00019907597879917227, + "loss": 1.1253, + "step": 1905 + }, + { + "epoch": 0.18, + "grad_norm": 0.27305416183963116, + "learning_rate": 0.00019907383198844157, + "loss": 1.0841, + "step": 1906 + }, + { + "epoch": 0.18, + "grad_norm": 0.31852024294594067, + "learning_rate": 0.00019907168269832592, + "loss": 1.1546, + "step": 1907 + }, + { + "epoch": 0.18, + "grad_norm": 0.29709692567412604, + "learning_rate": 0.00019906953092887916, + "loss": 1.2313, + "step": 1908 + }, + { + "epoch": 0.18, + "grad_norm": 0.2617821076298669, + "learning_rate": 0.00019906737668015515, + "loss": 0.943, + "step": 1909 + }, + { + "epoch": 0.18, + "grad_norm": 0.2636246176724674, + "learning_rate": 0.00019906521995220774, + "loss": 1.0627, + "step": 1910 + }, + { + "epoch": 0.18, + "grad_norm": 0.2679413444345284, + "learning_rate": 0.00019906306074509095, + "loss": 1.1503, + "step": 1911 + }, + { + "epoch": 0.18, + "grad_norm": 0.24358831048061988, + "learning_rate": 0.0001990608990588588, + "loss": 1.0467, + "step": 1912 + }, + { + "epoch": 0.18, + "grad_norm": 0.2873239577441084, + "learning_rate": 0.0001990587348935654, + "loss": 1.2148, + "step": 1913 + }, + { + "epoch": 0.18, + "grad_norm": 0.2970349427468681, + "learning_rate": 0.00019905656824926492, + "loss": 1.1718, + "step": 1914 + }, + { + "epoch": 0.18, + "grad_norm": 0.328114785718262, + "learning_rate": 0.00019905439912601156, + "loss": 1.0894, + "step": 1915 + }, + { + "epoch": 0.18, + "grad_norm": 0.2768655282500371, + "learning_rate": 0.00019905222752385958, + "loss": 0.9798, + "step": 1916 + }, + { + "epoch": 0.18, + "grad_norm": 0.24581149927304233, + "learning_rate": 0.00019905005344286338, + "loss": 1.1947, + "step": 1917 + }, + { + "epoch": 0.18, + "grad_norm": 0.24905142815716402, + "learning_rate": 0.00019904787688307735, + "loss": 1.0603, + "step": 1918 + }, + { + "epoch": 0.18, + "grad_norm": 0.25006568481196073, + "learning_rate": 0.00019904569784455592, + "loss": 1.1451, + "step": 1919 + }, + { + "epoch": 0.18, + "grad_norm": 0.24640239497730002, + "learning_rate": 0.0001990435163273537, + "loss": 1.1513, + "step": 1920 + }, + { + "epoch": 0.18, + "grad_norm": 0.3158678665771197, + "learning_rate": 0.00019904133233152518, + "loss": 1.1675, + "step": 1921 + }, + { + "epoch": 0.18, + "grad_norm": 0.2925864535264506, + "learning_rate": 0.0001990391458571251, + "loss": 1.0303, + "step": 1922 + }, + { + "epoch": 0.18, + "grad_norm": 0.2919025168815423, + "learning_rate": 0.00019903695690420817, + "loss": 1.2033, + "step": 1923 + }, + { + "epoch": 0.18, + "grad_norm": 0.2537949728039277, + "learning_rate": 0.00019903476547282914, + "loss": 1.144, + "step": 1924 + }, + { + "epoch": 0.18, + "grad_norm": 0.2583763269969566, + "learning_rate": 0.00019903257156304285, + "loss": 1.1037, + "step": 1925 + }, + { + "epoch": 0.18, + "grad_norm": 0.2613319587588986, + "learning_rate": 0.00019903037517490422, + "loss": 1.0958, + "step": 1926 + }, + { + "epoch": 0.18, + "grad_norm": 0.2595089644687432, + "learning_rate": 0.00019902817630846822, + "loss": 1.1155, + "step": 1927 + }, + { + "epoch": 0.18, + "grad_norm": 0.25297594336763907, + "learning_rate": 0.00019902597496378985, + "loss": 1.1028, + "step": 1928 + }, + { + "epoch": 0.18, + "grad_norm": 0.30513489135176775, + "learning_rate": 0.00019902377114092425, + "loss": 1.1394, + "step": 1929 + }, + { + "epoch": 0.18, + "grad_norm": 0.3021684093102426, + "learning_rate": 0.00019902156483992653, + "loss": 0.9847, + "step": 1930 + }, + { + "epoch": 0.18, + "grad_norm": 0.2590031991887478, + "learning_rate": 0.00019901935606085193, + "loss": 1.036, + "step": 1931 + }, + { + "epoch": 0.18, + "grad_norm": 0.27555119779022413, + "learning_rate": 0.00019901714480375572, + "loss": 1.0828, + "step": 1932 + }, + { + "epoch": 0.18, + "grad_norm": 0.2590125935515052, + "learning_rate": 0.0001990149310686932, + "loss": 1.0109, + "step": 1933 + }, + { + "epoch": 0.19, + "grad_norm": 0.27535539380394297, + "learning_rate": 0.0001990127148557198, + "loss": 1.0645, + "step": 1934 + }, + { + "epoch": 0.19, + "grad_norm": 0.2604038154667027, + "learning_rate": 0.000199010496164891, + "loss": 1.1085, + "step": 1935 + }, + { + "epoch": 0.19, + "grad_norm": 0.24979308709369744, + "learning_rate": 0.0001990082749962623, + "loss": 1.084, + "step": 1936 + }, + { + "epoch": 0.19, + "grad_norm": 0.3032327587240259, + "learning_rate": 0.0001990060513498893, + "loss": 1.1131, + "step": 1937 + }, + { + "epoch": 0.19, + "grad_norm": 0.2678255255634505, + "learning_rate": 0.00019900382522582765, + "loss": 1.1368, + "step": 1938 + }, + { + "epoch": 0.19, + "grad_norm": 0.2180195498119384, + "learning_rate": 0.00019900159662413305, + "loss": 1.0595, + "step": 1939 + }, + { + "epoch": 0.19, + "grad_norm": 0.23825147073097672, + "learning_rate": 0.00019899936554486128, + "loss": 1.1574, + "step": 1940 + }, + { + "epoch": 0.19, + "grad_norm": 0.23730357351239173, + "learning_rate": 0.00019899713198806812, + "loss": 1.1184, + "step": 1941 + }, + { + "epoch": 0.19, + "grad_norm": 0.2805064829854749, + "learning_rate": 0.00019899489595380957, + "loss": 1.2007, + "step": 1942 + }, + { + "epoch": 0.19, + "grad_norm": 0.2473758737702457, + "learning_rate": 0.00019899265744214152, + "loss": 1.0602, + "step": 1943 + }, + { + "epoch": 0.19, + "grad_norm": 0.2718909446342109, + "learning_rate": 0.00019899041645312, + "loss": 1.1384, + "step": 1944 + }, + { + "epoch": 0.19, + "grad_norm": 0.2541735443801485, + "learning_rate": 0.0001989881729868011, + "loss": 1.0676, + "step": 1945 + }, + { + "epoch": 0.19, + "grad_norm": 0.2500138203165472, + "learning_rate": 0.00019898592704324094, + "loss": 1.0983, + "step": 1946 + }, + { + "epoch": 0.19, + "grad_norm": 0.2593377151457589, + "learning_rate": 0.00019898367862249575, + "loss": 1.0257, + "step": 1947 + }, + { + "epoch": 0.19, + "grad_norm": 0.2495745540042679, + "learning_rate": 0.00019898142772462182, + "loss": 1.0384, + "step": 1948 + }, + { + "epoch": 0.19, + "grad_norm": 0.2969538903134302, + "learning_rate": 0.00019897917434967544, + "loss": 1.1127, + "step": 1949 + }, + { + "epoch": 0.19, + "grad_norm": 0.26632242247343, + "learning_rate": 0.00019897691849771301, + "loss": 1.1186, + "step": 1950 + }, + { + "epoch": 0.19, + "grad_norm": 0.29353632758131, + "learning_rate": 0.00019897466016879098, + "loss": 1.0999, + "step": 1951 + }, + { + "epoch": 0.19, + "grad_norm": 0.2888464592603483, + "learning_rate": 0.00019897239936296588, + "loss": 1.0546, + "step": 1952 + }, + { + "epoch": 0.19, + "grad_norm": 0.2576028803170492, + "learning_rate": 0.00019897013608029428, + "loss": 1.0409, + "step": 1953 + }, + { + "epoch": 0.19, + "grad_norm": 0.310447192134622, + "learning_rate": 0.00019896787032083285, + "loss": 1.1755, + "step": 1954 + }, + { + "epoch": 0.19, + "grad_norm": 0.28394813161935395, + "learning_rate": 0.00019896560208463825, + "loss": 1.071, + "step": 1955 + }, + { + "epoch": 0.19, + "grad_norm": 0.23857662880995645, + "learning_rate": 0.00019896333137176726, + "loss": 0.9972, + "step": 1956 + }, + { + "epoch": 0.19, + "grad_norm": 0.27471986726786446, + "learning_rate": 0.00019896105818227673, + "loss": 1.1453, + "step": 1957 + }, + { + "epoch": 0.19, + "grad_norm": 0.26890257495666114, + "learning_rate": 0.00019895878251622348, + "loss": 1.0331, + "step": 1958 + }, + { + "epoch": 0.19, + "grad_norm": 0.2663478542354145, + "learning_rate": 0.00019895650437366452, + "loss": 1.0474, + "step": 1959 + }, + { + "epoch": 0.19, + "grad_norm": 0.2604279634361445, + "learning_rate": 0.00019895422375465686, + "loss": 1.1096, + "step": 1960 + }, + { + "epoch": 0.19, + "grad_norm": 0.2606384507083339, + "learning_rate": 0.00019895194065925754, + "loss": 1.0248, + "step": 1961 + }, + { + "epoch": 0.19, + "grad_norm": 0.27935688700196437, + "learning_rate": 0.00019894965508752375, + "loss": 1.2211, + "step": 1962 + }, + { + "epoch": 0.19, + "grad_norm": 0.2745748469246835, + "learning_rate": 0.00019894736703951263, + "loss": 1.0072, + "step": 1963 + }, + { + "epoch": 0.19, + "grad_norm": 0.23091984266360946, + "learning_rate": 0.00019894507651528148, + "loss": 1.043, + "step": 1964 + }, + { + "epoch": 0.19, + "grad_norm": 0.22878065896236086, + "learning_rate": 0.00019894278351488757, + "loss": 1.0798, + "step": 1965 + }, + { + "epoch": 0.19, + "grad_norm": 0.2554600035367144, + "learning_rate": 0.00019894048803838834, + "loss": 1.1627, + "step": 1966 + }, + { + "epoch": 0.19, + "grad_norm": 0.2737920223861705, + "learning_rate": 0.00019893819008584123, + "loss": 1.1778, + "step": 1967 + }, + { + "epoch": 0.19, + "grad_norm": 0.2708368675723523, + "learning_rate": 0.0001989358896573037, + "loss": 0.9277, + "step": 1968 + }, + { + "epoch": 0.19, + "grad_norm": 0.24550243352794962, + "learning_rate": 0.00019893358675283337, + "loss": 1.1226, + "step": 1969 + }, + { + "epoch": 0.19, + "grad_norm": 0.2642424699796039, + "learning_rate": 0.00019893128137248787, + "loss": 1.1078, + "step": 1970 + }, + { + "epoch": 0.19, + "grad_norm": 0.2573419222534839, + "learning_rate": 0.00019892897351632484, + "loss": 1.2793, + "step": 1971 + }, + { + "epoch": 0.19, + "grad_norm": 0.27224735814088064, + "learning_rate": 0.00019892666318440213, + "loss": 1.0788, + "step": 1972 + }, + { + "epoch": 0.19, + "grad_norm": 0.2766376630314068, + "learning_rate": 0.00019892435037677746, + "loss": 1.1132, + "step": 1973 + }, + { + "epoch": 0.19, + "grad_norm": 0.3067049268603965, + "learning_rate": 0.00019892203509350875, + "loss": 1.0906, + "step": 1974 + }, + { + "epoch": 0.19, + "grad_norm": 0.27698131540064885, + "learning_rate": 0.00019891971733465395, + "loss": 1.1791, + "step": 1975 + }, + { + "epoch": 0.19, + "grad_norm": 0.2874873201823343, + "learning_rate": 0.00019891739710027105, + "loss": 1.1604, + "step": 1976 + }, + { + "epoch": 0.19, + "grad_norm": 0.28015375821188654, + "learning_rate": 0.00019891507439041814, + "loss": 1.1313, + "step": 1977 + }, + { + "epoch": 0.19, + "grad_norm": 0.2690366043454793, + "learning_rate": 0.0001989127492051533, + "loss": 1.199, + "step": 1978 + }, + { + "epoch": 0.19, + "grad_norm": 0.23644122244331564, + "learning_rate": 0.00019891042154453477, + "loss": 1.0604, + "step": 1979 + }, + { + "epoch": 0.19, + "grad_norm": 0.23968738721131738, + "learning_rate": 0.00019890809140862077, + "loss": 1.0409, + "step": 1980 + }, + { + "epoch": 0.19, + "grad_norm": 0.24461539048772374, + "learning_rate": 0.0001989057587974696, + "loss": 1.103, + "step": 1981 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826392500904235, + "learning_rate": 0.0001989034237111397, + "loss": 1.1352, + "step": 1982 + }, + { + "epoch": 0.19, + "grad_norm": 0.2772470869290075, + "learning_rate": 0.0001989010861496894, + "loss": 1.1124, + "step": 1983 + }, + { + "epoch": 0.19, + "grad_norm": 0.2848620471084776, + "learning_rate": 0.00019889874611317732, + "loss": 1.0845, + "step": 1984 + }, + { + "epoch": 0.19, + "grad_norm": 0.2797268047479781, + "learning_rate": 0.00019889640360166194, + "loss": 1.1135, + "step": 1985 + }, + { + "epoch": 0.19, + "grad_norm": 0.2569875278431773, + "learning_rate": 0.00019889405861520188, + "loss": 1.1096, + "step": 1986 + }, + { + "epoch": 0.19, + "grad_norm": 0.2617687295928765, + "learning_rate": 0.0001988917111538559, + "loss": 1.0682, + "step": 1987 + }, + { + "epoch": 0.19, + "grad_norm": 0.28290160215390237, + "learning_rate": 0.00019888936121768266, + "loss": 1.1322, + "step": 1988 + }, + { + "epoch": 0.19, + "grad_norm": 0.2626538158195342, + "learning_rate": 0.00019888700880674103, + "loss": 1.1404, + "step": 1989 + }, + { + "epoch": 0.19, + "grad_norm": 0.27468104620820544, + "learning_rate": 0.00019888465392108986, + "loss": 1.211, + "step": 1990 + }, + { + "epoch": 0.19, + "grad_norm": 0.2684528257690631, + "learning_rate": 0.00019888229656078808, + "loss": 1.086, + "step": 1991 + }, + { + "epoch": 0.19, + "grad_norm": 0.2701652519028749, + "learning_rate": 0.00019887993672589466, + "loss": 1.1998, + "step": 1992 + }, + { + "epoch": 0.19, + "grad_norm": 0.26483806146239974, + "learning_rate": 0.00019887757441646868, + "loss": 1.0015, + "step": 1993 + }, + { + "epoch": 0.19, + "grad_norm": 0.25776041537869526, + "learning_rate": 0.00019887520963256927, + "loss": 1.1646, + "step": 1994 + }, + { + "epoch": 0.19, + "grad_norm": 0.2857416415267135, + "learning_rate": 0.00019887284237425558, + "loss": 1.1295, + "step": 1995 + }, + { + "epoch": 0.19, + "grad_norm": 0.2720535216646917, + "learning_rate": 0.00019887047264158692, + "loss": 1.0362, + "step": 1996 + }, + { + "epoch": 0.19, + "grad_norm": 0.2556899383886345, + "learning_rate": 0.0001988681004346225, + "loss": 1.239, + "step": 1997 + }, + { + "epoch": 0.19, + "grad_norm": 0.29555721414719655, + "learning_rate": 0.00019886572575342174, + "loss": 1.1347, + "step": 1998 + }, + { + "epoch": 0.19, + "grad_norm": 0.28921149656501194, + "learning_rate": 0.00019886334859804406, + "loss": 1.1826, + "step": 1999 + }, + { + "epoch": 0.19, + "grad_norm": 0.274012668267724, + "learning_rate": 0.00019886096896854896, + "loss": 1.0865, + "step": 2000 + }, + { + "epoch": 0.19, + "grad_norm": 0.2422334744946127, + "learning_rate": 0.00019885858686499594, + "loss": 1.0813, + "step": 2001 + }, + { + "epoch": 0.19, + "grad_norm": 0.30780112818652483, + "learning_rate": 0.00019885620228744468, + "loss": 1.0997, + "step": 2002 + }, + { + "epoch": 0.19, + "grad_norm": 0.24632043466385958, + "learning_rate": 0.00019885381523595484, + "loss": 1.0984, + "step": 2003 + }, + { + "epoch": 0.19, + "grad_norm": 0.27662537602831366, + "learning_rate": 0.00019885142571058614, + "loss": 1.1465, + "step": 2004 + }, + { + "epoch": 0.19, + "grad_norm": 0.24285561219836202, + "learning_rate": 0.00019884903371139838, + "loss": 1.0971, + "step": 2005 + }, + { + "epoch": 0.19, + "grad_norm": 0.24656395139808884, + "learning_rate": 0.00019884663923845142, + "loss": 1.143, + "step": 2006 + }, + { + "epoch": 0.19, + "grad_norm": 0.2629422784962355, + "learning_rate": 0.0001988442422918052, + "loss": 1.1407, + "step": 2007 + }, + { + "epoch": 0.19, + "grad_norm": 0.2811274440605523, + "learning_rate": 0.0001988418428715197, + "loss": 1.159, + "step": 2008 + }, + { + "epoch": 0.19, + "grad_norm": 0.28599512163427754, + "learning_rate": 0.00019883944097765497, + "loss": 1.2293, + "step": 2009 + }, + { + "epoch": 0.19, + "grad_norm": 0.2637511197971286, + "learning_rate": 0.0001988370366102711, + "loss": 1.0897, + "step": 2010 + }, + { + "epoch": 0.19, + "grad_norm": 0.2621878190951607, + "learning_rate": 0.00019883462976942826, + "loss": 1.0737, + "step": 2011 + }, + { + "epoch": 0.19, + "grad_norm": 0.2762514383867885, + "learning_rate": 0.0001988322204551867, + "loss": 1.1701, + "step": 2012 + }, + { + "epoch": 0.19, + "grad_norm": 0.24133383394031283, + "learning_rate": 0.00019882980866760673, + "loss": 1.0147, + "step": 2013 + }, + { + "epoch": 0.19, + "grad_norm": 0.26354770911760284, + "learning_rate": 0.00019882739440674863, + "loss": 1.1734, + "step": 2014 + }, + { + "epoch": 0.19, + "grad_norm": 0.28989400649087416, + "learning_rate": 0.00019882497767267294, + "loss": 0.9902, + "step": 2015 + }, + { + "epoch": 0.19, + "grad_norm": 0.27932170163937037, + "learning_rate": 0.00019882255846544005, + "loss": 1.1016, + "step": 2016 + }, + { + "epoch": 0.19, + "grad_norm": 0.25887068615609143, + "learning_rate": 0.00019882013678511052, + "loss": 1.0908, + "step": 2017 + }, + { + "epoch": 0.19, + "grad_norm": 0.28911454498855693, + "learning_rate": 0.000198817712631745, + "loss": 1.1229, + "step": 2018 + }, + { + "epoch": 0.19, + "grad_norm": 0.2546367208219804, + "learning_rate": 0.00019881528600540404, + "loss": 1.1906, + "step": 2019 + }, + { + "epoch": 0.19, + "grad_norm": 0.2736344196999536, + "learning_rate": 0.0001988128569061485, + "loss": 1.1621, + "step": 2020 + }, + { + "epoch": 0.19, + "grad_norm": 0.2818744152091652, + "learning_rate": 0.0001988104253340391, + "loss": 1.0773, + "step": 2021 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826631297854352, + "learning_rate": 0.00019880799128913672, + "loss": 1.043, + "step": 2022 + }, + { + "epoch": 0.19, + "grad_norm": 0.22842355541316772, + "learning_rate": 0.00019880555477150223, + "loss": 0.8834, + "step": 2023 + }, + { + "epoch": 0.19, + "grad_norm": 0.3137419246856175, + "learning_rate": 0.00019880311578119667, + "loss": 1.1337, + "step": 2024 + }, + { + "epoch": 0.19, + "grad_norm": 0.2733714490698032, + "learning_rate": 0.00019880067431828102, + "loss": 1.183, + "step": 2025 + }, + { + "epoch": 0.19, + "grad_norm": 0.26580523535512374, + "learning_rate": 0.00019879823038281642, + "loss": 1.121, + "step": 2026 + }, + { + "epoch": 0.19, + "grad_norm": 0.26858458548658276, + "learning_rate": 0.000198795783974864, + "loss": 1.0527, + "step": 2027 + }, + { + "epoch": 0.19, + "grad_norm": 0.2763698286676362, + "learning_rate": 0.00019879333509448496, + "loss": 1.0042, + "step": 2028 + }, + { + "epoch": 0.19, + "grad_norm": 0.24269742693449786, + "learning_rate": 0.00019879088374174066, + "loss": 1.1615, + "step": 2029 + }, + { + "epoch": 0.19, + "grad_norm": 0.30073652024639724, + "learning_rate": 0.0001987884299166924, + "loss": 1.121, + "step": 2030 + }, + { + "epoch": 0.19, + "grad_norm": 0.2556748923041058, + "learning_rate": 0.00019878597361940161, + "loss": 0.8961, + "step": 2031 + }, + { + "epoch": 0.19, + "grad_norm": 0.2591634855819998, + "learning_rate": 0.00019878351484992974, + "loss": 1.1487, + "step": 2032 + }, + { + "epoch": 0.19, + "grad_norm": 0.2600632642281181, + "learning_rate": 0.00019878105360833832, + "loss": 1.1916, + "step": 2033 + }, + { + "epoch": 0.19, + "grad_norm": 0.28116821489552873, + "learning_rate": 0.00019877858989468894, + "loss": 1.0512, + "step": 2034 + }, + { + "epoch": 0.19, + "grad_norm": 0.20261666938146597, + "learning_rate": 0.0001987761237090433, + "loss": 1.1838, + "step": 2035 + }, + { + "epoch": 0.19, + "grad_norm": 0.29561896466102555, + "learning_rate": 0.00019877365505146304, + "loss": 1.0852, + "step": 2036 + }, + { + "epoch": 0.19, + "grad_norm": 0.29527708211041853, + "learning_rate": 0.00019877118392201, + "loss": 1.1186, + "step": 2037 + }, + { + "epoch": 0.19, + "grad_norm": 0.250583627663694, + "learning_rate": 0.00019876871032074603, + "loss": 1.045, + "step": 2038 + }, + { + "epoch": 0.2, + "grad_norm": 0.3097749132789377, + "learning_rate": 0.000198766234247733, + "loss": 1.1162, + "step": 2039 + }, + { + "epoch": 0.2, + "grad_norm": 0.2557816245578032, + "learning_rate": 0.0001987637557030329, + "loss": 1.0323, + "step": 2040 + }, + { + "epoch": 0.2, + "grad_norm": 0.28034027348239304, + "learning_rate": 0.00019876127468670772, + "loss": 1.1111, + "step": 2041 + }, + { + "epoch": 0.2, + "grad_norm": 0.27069634417343275, + "learning_rate": 0.00019875879119881957, + "loss": 1.0432, + "step": 2042 + }, + { + "epoch": 0.2, + "grad_norm": 0.2579239339061907, + "learning_rate": 0.00019875630523943062, + "loss": 1.0104, + "step": 2043 + }, + { + "epoch": 0.2, + "grad_norm": 0.28379405935029695, + "learning_rate": 0.00019875381680860304, + "loss": 1.1044, + "step": 2044 + }, + { + "epoch": 0.2, + "grad_norm": 0.25276024384054346, + "learning_rate": 0.00019875132590639917, + "loss": 1.0816, + "step": 2045 + }, + { + "epoch": 0.2, + "grad_norm": 0.2529581499654312, + "learning_rate": 0.00019874883253288126, + "loss": 1.0982, + "step": 2046 + }, + { + "epoch": 0.2, + "grad_norm": 0.27524747771098035, + "learning_rate": 0.00019874633668811177, + "loss": 1.1365, + "step": 2047 + }, + { + "epoch": 0.2, + "grad_norm": 0.28979122397869284, + "learning_rate": 0.00019874383837215314, + "loss": 1.1472, + "step": 2048 + }, + { + "epoch": 0.2, + "grad_norm": 0.26208338201886544, + "learning_rate": 0.00019874133758506792, + "loss": 1.0844, + "step": 2049 + }, + { + "epoch": 0.2, + "grad_norm": 0.23244493332423305, + "learning_rate": 0.00019873883432691868, + "loss": 1.0652, + "step": 2050 + }, + { + "epoch": 0.2, + "grad_norm": 0.26326298210214855, + "learning_rate": 0.000198736328597768, + "loss": 1.1114, + "step": 2051 + }, + { + "epoch": 0.2, + "grad_norm": 0.25775993234870526, + "learning_rate": 0.0001987338203976787, + "loss": 0.9868, + "step": 2052 + }, + { + "epoch": 0.2, + "grad_norm": 0.2558696856240754, + "learning_rate": 0.00019873130972671347, + "loss": 1.0485, + "step": 2053 + }, + { + "epoch": 0.2, + "grad_norm": 0.25810577145871305, + "learning_rate": 0.00019872879658493515, + "loss": 1.0948, + "step": 2054 + }, + { + "epoch": 0.2, + "grad_norm": 0.26764993308160495, + "learning_rate": 0.00019872628097240667, + "loss": 1.1752, + "step": 2055 + }, + { + "epoch": 0.2, + "grad_norm": 0.2844642341098131, + "learning_rate": 0.00019872376288919093, + "loss": 1.1397, + "step": 2056 + }, + { + "epoch": 0.2, + "grad_norm": 0.27934363473211593, + "learning_rate": 0.00019872124233535102, + "loss": 1.2909, + "step": 2057 + }, + { + "epoch": 0.2, + "grad_norm": 0.28305414844226917, + "learning_rate": 0.00019871871931094996, + "loss": 1.1307, + "step": 2058 + }, + { + "epoch": 0.2, + "grad_norm": 0.2547448994013944, + "learning_rate": 0.0001987161938160509, + "loss": 1.0631, + "step": 2059 + }, + { + "epoch": 0.2, + "grad_norm": 0.2457309364326485, + "learning_rate": 0.00019871366585071706, + "loss": 1.0862, + "step": 2060 + }, + { + "epoch": 0.2, + "grad_norm": 0.2892837695062467, + "learning_rate": 0.00019871113541501168, + "loss": 1.1139, + "step": 2061 + }, + { + "epoch": 0.2, + "grad_norm": 0.297033511165508, + "learning_rate": 0.0001987086025089981, + "loss": 1.158, + "step": 2062 + }, + { + "epoch": 0.2, + "grad_norm": 0.28710128200720647, + "learning_rate": 0.00019870606713273968, + "loss": 1.0244, + "step": 2063 + }, + { + "epoch": 0.2, + "grad_norm": 0.29989733418311626, + "learning_rate": 0.00019870352928629993, + "loss": 1.0888, + "step": 2064 + }, + { + "epoch": 0.2, + "grad_norm": 0.2698108293542839, + "learning_rate": 0.00019870098896974234, + "loss": 1.0765, + "step": 2065 + }, + { + "epoch": 0.2, + "grad_norm": 0.2833643780662451, + "learning_rate": 0.00019869844618313046, + "loss": 1.0251, + "step": 2066 + }, + { + "epoch": 0.2, + "grad_norm": 0.3349028897187998, + "learning_rate": 0.00019869590092652791, + "loss": 1.1527, + "step": 2067 + }, + { + "epoch": 0.2, + "grad_norm": 0.26736402567126216, + "learning_rate": 0.0001986933531999984, + "loss": 1.1222, + "step": 2068 + }, + { + "epoch": 0.2, + "grad_norm": 0.24629779305771238, + "learning_rate": 0.00019869080300360576, + "loss": 1.0542, + "step": 2069 + }, + { + "epoch": 0.2, + "grad_norm": 0.2659353555548773, + "learning_rate": 0.00019868825033741373, + "loss": 1.1196, + "step": 2070 + }, + { + "epoch": 0.2, + "grad_norm": 0.2880977765710642, + "learning_rate": 0.00019868569520148618, + "loss": 1.1662, + "step": 2071 + }, + { + "epoch": 0.2, + "grad_norm": 0.26777705281811776, + "learning_rate": 0.0001986831375958871, + "loss": 1.1153, + "step": 2072 + }, + { + "epoch": 0.2, + "grad_norm": 0.2446161703167875, + "learning_rate": 0.0001986805775206805, + "loss": 1.1845, + "step": 2073 + }, + { + "epoch": 0.2, + "grad_norm": 0.2873479752588241, + "learning_rate": 0.00019867801497593042, + "loss": 1.19, + "step": 2074 + }, + { + "epoch": 0.2, + "grad_norm": 0.2574173866976346, + "learning_rate": 0.000198675449961701, + "loss": 1.1004, + "step": 2075 + }, + { + "epoch": 0.2, + "grad_norm": 0.31037055152728826, + "learning_rate": 0.00019867288247805642, + "loss": 1.1266, + "step": 2076 + }, + { + "epoch": 0.2, + "grad_norm": 0.25230875618755544, + "learning_rate": 0.00019867031252506095, + "loss": 1.0861, + "step": 2077 + }, + { + "epoch": 0.2, + "grad_norm": 0.28027907388788925, + "learning_rate": 0.0001986677401027789, + "loss": 1.0899, + "step": 2078 + }, + { + "epoch": 0.2, + "grad_norm": 0.2629017191349244, + "learning_rate": 0.00019866516521127462, + "loss": 1.1268, + "step": 2079 + }, + { + "epoch": 0.2, + "grad_norm": 0.2858944012914975, + "learning_rate": 0.0001986625878506126, + "loss": 1.1248, + "step": 2080 + }, + { + "epoch": 0.2, + "grad_norm": 0.3010750896726883, + "learning_rate": 0.00019866000802085728, + "loss": 1.109, + "step": 2081 + }, + { + "epoch": 0.2, + "grad_norm": 0.27335673435624314, + "learning_rate": 0.0001986574257220733, + "loss": 1.0929, + "step": 2082 + }, + { + "epoch": 0.2, + "grad_norm": 0.25277983760592904, + "learning_rate": 0.0001986548409543252, + "loss": 1.0946, + "step": 2083 + }, + { + "epoch": 0.2, + "grad_norm": 0.2522955007250379, + "learning_rate": 0.00019865225371767773, + "loss": 1.1279, + "step": 2084 + }, + { + "epoch": 0.2, + "grad_norm": 0.2876473859106391, + "learning_rate": 0.00019864966401219559, + "loss": 1.044, + "step": 2085 + }, + { + "epoch": 0.2, + "grad_norm": 0.5481907916561444, + "learning_rate": 0.00019864707183794362, + "loss": 1.3456, + "step": 2086 + }, + { + "epoch": 0.2, + "grad_norm": 0.26281784160346944, + "learning_rate": 0.00019864447719498667, + "loss": 1.2029, + "step": 2087 + }, + { + "epoch": 0.2, + "grad_norm": 0.27656267392775247, + "learning_rate": 0.00019864188008338968, + "loss": 1.1244, + "step": 2088 + }, + { + "epoch": 0.2, + "grad_norm": 0.2712507307178155, + "learning_rate": 0.00019863928050321765, + "loss": 1.2326, + "step": 2089 + }, + { + "epoch": 0.2, + "grad_norm": 0.27024578206691424, + "learning_rate": 0.00019863667845453563, + "loss": 1.1642, + "step": 2090 + }, + { + "epoch": 0.2, + "grad_norm": 0.305876067098806, + "learning_rate": 0.00019863407393740876, + "loss": 1.2, + "step": 2091 + }, + { + "epoch": 0.2, + "grad_norm": 0.2665786067215833, + "learning_rate": 0.00019863146695190217, + "loss": 1.1217, + "step": 2092 + }, + { + "epoch": 0.2, + "grad_norm": 0.23808439538640014, + "learning_rate": 0.00019862885749808115, + "loss": 1.089, + "step": 2093 + }, + { + "epoch": 0.2, + "grad_norm": 0.27236917331109767, + "learning_rate": 0.00019862624557601103, + "loss": 1.1333, + "step": 2094 + }, + { + "epoch": 0.2, + "grad_norm": 0.26305710425726253, + "learning_rate": 0.00019862363118575705, + "loss": 1.1396, + "step": 2095 + }, + { + "epoch": 0.2, + "grad_norm": 0.2302881958735561, + "learning_rate": 0.00019862101432738475, + "loss": 1.0263, + "step": 2096 + }, + { + "epoch": 0.2, + "grad_norm": 0.2954108631439019, + "learning_rate": 0.0001986183950009596, + "loss": 1.1058, + "step": 2097 + }, + { + "epoch": 0.2, + "grad_norm": 0.2948895500433461, + "learning_rate": 0.00019861577320654712, + "loss": 1.1621, + "step": 2098 + }, + { + "epoch": 0.2, + "grad_norm": 0.24612354772654924, + "learning_rate": 0.00019861314894421294, + "loss": 1.1682, + "step": 2099 + }, + { + "epoch": 0.2, + "grad_norm": 0.2770485117180903, + "learning_rate": 0.00019861052221402275, + "loss": 1.0537, + "step": 2100 + }, + { + "epoch": 0.2, + "grad_norm": 0.2803003393924788, + "learning_rate": 0.00019860789301604222, + "loss": 1.1575, + "step": 2101 + }, + { + "epoch": 0.2, + "grad_norm": 0.263398275519541, + "learning_rate": 0.00019860526135033723, + "loss": 1.1161, + "step": 2102 + }, + { + "epoch": 0.2, + "grad_norm": 0.2735697531308213, + "learning_rate": 0.0001986026272169736, + "loss": 1.1304, + "step": 2103 + }, + { + "epoch": 0.2, + "grad_norm": 0.2837690209815238, + "learning_rate": 0.00019859999061601726, + "loss": 0.9939, + "step": 2104 + }, + { + "epoch": 0.2, + "grad_norm": 0.2611549781543971, + "learning_rate": 0.00019859735154753418, + "loss": 1.0968, + "step": 2105 + }, + { + "epoch": 0.2, + "grad_norm": 0.2858960886543411, + "learning_rate": 0.0001985947100115904, + "loss": 1.1623, + "step": 2106 + }, + { + "epoch": 0.2, + "grad_norm": 0.3657978801967696, + "learning_rate": 0.00019859206600825207, + "loss": 1.2114, + "step": 2107 + }, + { + "epoch": 0.2, + "grad_norm": 0.24528859351726237, + "learning_rate": 0.0001985894195375853, + "loss": 1.1096, + "step": 2108 + }, + { + "epoch": 0.2, + "grad_norm": 0.309781272595587, + "learning_rate": 0.00019858677059965632, + "loss": 1.1382, + "step": 2109 + }, + { + "epoch": 0.2, + "grad_norm": 0.3015108916795954, + "learning_rate": 0.0001985841191945315, + "loss": 1.0789, + "step": 2110 + }, + { + "epoch": 0.2, + "grad_norm": 0.27510018422236365, + "learning_rate": 0.0001985814653222771, + "loss": 1.1214, + "step": 2111 + }, + { + "epoch": 0.2, + "grad_norm": 0.2504556220073607, + "learning_rate": 0.0001985788089829596, + "loss": 1.1829, + "step": 2112 + }, + { + "epoch": 0.2, + "grad_norm": 0.27607247184581263, + "learning_rate": 0.00019857615017664543, + "loss": 1.2014, + "step": 2113 + }, + { + "epoch": 0.2, + "grad_norm": 0.28257879262143415, + "learning_rate": 0.00019857348890340117, + "loss": 1.1302, + "step": 2114 + }, + { + "epoch": 0.2, + "grad_norm": 0.2961265516298664, + "learning_rate": 0.0001985708251632934, + "loss": 1.0324, + "step": 2115 + }, + { + "epoch": 0.2, + "grad_norm": 0.2942307299808682, + "learning_rate": 0.00019856815895638876, + "loss": 1.0799, + "step": 2116 + }, + { + "epoch": 0.2, + "grad_norm": 0.2541933617332, + "learning_rate": 0.000198565490282754, + "loss": 1.1498, + "step": 2117 + }, + { + "epoch": 0.2, + "grad_norm": 0.28011641730308906, + "learning_rate": 0.0001985628191424559, + "loss": 1.1392, + "step": 2118 + }, + { + "epoch": 0.2, + "grad_norm": 0.2807759455450216, + "learning_rate": 0.0001985601455355613, + "loss": 1.1776, + "step": 2119 + }, + { + "epoch": 0.2, + "grad_norm": 0.32430654597893255, + "learning_rate": 0.00019855746946213714, + "loss": 1.1778, + "step": 2120 + }, + { + "epoch": 0.2, + "grad_norm": 0.2525816278621571, + "learning_rate": 0.00019855479092225037, + "loss": 1.1537, + "step": 2121 + }, + { + "epoch": 0.2, + "grad_norm": 0.26640266147857056, + "learning_rate": 0.00019855210991596796, + "loss": 1.096, + "step": 2122 + }, + { + "epoch": 0.2, + "grad_norm": 0.25884771414681745, + "learning_rate": 0.00019854942644335712, + "loss": 1.1562, + "step": 2123 + }, + { + "epoch": 0.2, + "grad_norm": 0.27617724462201587, + "learning_rate": 0.00019854674050448493, + "loss": 1.1385, + "step": 2124 + }, + { + "epoch": 0.2, + "grad_norm": 0.2919548651872331, + "learning_rate": 0.00019854405209941863, + "loss": 1.0791, + "step": 2125 + }, + { + "epoch": 0.2, + "grad_norm": 0.24993403620332835, + "learning_rate": 0.00019854136122822547, + "loss": 1.0431, + "step": 2126 + }, + { + "epoch": 0.2, + "grad_norm": 0.23940290308480794, + "learning_rate": 0.0001985386678909728, + "loss": 1.0944, + "step": 2127 + }, + { + "epoch": 0.2, + "grad_norm": 0.2852028804707256, + "learning_rate": 0.00019853597208772808, + "loss": 1.0735, + "step": 2128 + }, + { + "epoch": 0.2, + "grad_norm": 0.2799825280793891, + "learning_rate": 0.0001985332738185587, + "loss": 1.1108, + "step": 2129 + }, + { + "epoch": 0.2, + "grad_norm": 0.2803961566058768, + "learning_rate": 0.00019853057308353225, + "loss": 1.1428, + "step": 2130 + }, + { + "epoch": 0.2, + "grad_norm": 0.2681024606858511, + "learning_rate": 0.00019852786988271628, + "loss": 1.1777, + "step": 2131 + }, + { + "epoch": 0.2, + "grad_norm": 0.28971370065149094, + "learning_rate": 0.0001985251642161784, + "loss": 1.1166, + "step": 2132 + }, + { + "epoch": 0.2, + "grad_norm": 0.2809462172886824, + "learning_rate": 0.0001985224560839864, + "loss": 1.1337, + "step": 2133 + }, + { + "epoch": 0.2, + "grad_norm": 0.2662105547019178, + "learning_rate": 0.00019851974548620803, + "loss": 1.2131, + "step": 2134 + }, + { + "epoch": 0.2, + "grad_norm": 0.2689850661970803, + "learning_rate": 0.0001985170324229111, + "loss": 1.1857, + "step": 2135 + }, + { + "epoch": 0.2, + "grad_norm": 0.2831472805779883, + "learning_rate": 0.00019851431689416353, + "loss": 1.1575, + "step": 2136 + }, + { + "epoch": 0.2, + "grad_norm": 0.2877033555483126, + "learning_rate": 0.00019851159890003323, + "loss": 1.0868, + "step": 2137 + }, + { + "epoch": 0.2, + "grad_norm": 0.29781126767542937, + "learning_rate": 0.00019850887844058827, + "loss": 1.1535, + "step": 2138 + }, + { + "epoch": 0.2, + "grad_norm": 0.2528619996193506, + "learning_rate": 0.00019850615551589672, + "loss": 1.0632, + "step": 2139 + }, + { + "epoch": 0.2, + "grad_norm": 0.2605060917972941, + "learning_rate": 0.00019850343012602672, + "loss": 1.1709, + "step": 2140 + }, + { + "epoch": 0.2, + "grad_norm": 0.2773145775379898, + "learning_rate": 0.0001985007022710465, + "loss": 1.1957, + "step": 2141 + }, + { + "epoch": 0.2, + "grad_norm": 0.28927051493387645, + "learning_rate": 0.00019849797195102426, + "loss": 1.0608, + "step": 2142 + }, + { + "epoch": 0.21, + "grad_norm": 0.31331821900541645, + "learning_rate": 0.0001984952391660284, + "loss": 1.1371, + "step": 2143 + }, + { + "epoch": 0.21, + "grad_norm": 0.2785760487723139, + "learning_rate": 0.00019849250391612726, + "loss": 1.1553, + "step": 2144 + }, + { + "epoch": 0.21, + "grad_norm": 0.28557512665641493, + "learning_rate": 0.0001984897662013893, + "loss": 1.1505, + "step": 2145 + }, + { + "epoch": 0.21, + "grad_norm": 0.2640270431957491, + "learning_rate": 0.00019848702602188304, + "loss": 1.0196, + "step": 2146 + }, + { + "epoch": 0.21, + "grad_norm": 0.26301343020312196, + "learning_rate": 0.00019848428337767708, + "loss": 1.0716, + "step": 2147 + }, + { + "epoch": 0.21, + "grad_norm": 0.24955497958144957, + "learning_rate": 0.00019848153826884004, + "loss": 1.1068, + "step": 2148 + }, + { + "epoch": 0.21, + "grad_norm": 0.25592481094445924, + "learning_rate": 0.00019847879069544058, + "loss": 1.0493, + "step": 2149 + }, + { + "epoch": 0.21, + "grad_norm": 0.2690607872687816, + "learning_rate": 0.0001984760406575475, + "loss": 1.1645, + "step": 2150 + }, + { + "epoch": 0.21, + "grad_norm": 0.29603325449239903, + "learning_rate": 0.00019847328815522964, + "loss": 1.0333, + "step": 2151 + }, + { + "epoch": 0.21, + "grad_norm": 0.25742762041890327, + "learning_rate": 0.00019847053318855582, + "loss": 1.2017, + "step": 2152 + }, + { + "epoch": 0.21, + "grad_norm": 0.30645008656891, + "learning_rate": 0.00019846777575759504, + "loss": 1.1346, + "step": 2153 + }, + { + "epoch": 0.21, + "grad_norm": 0.27044205667054494, + "learning_rate": 0.00019846501586241627, + "loss": 1.097, + "step": 2154 + }, + { + "epoch": 0.21, + "grad_norm": 0.23220679441493658, + "learning_rate": 0.00019846225350308864, + "loss": 1.0664, + "step": 2155 + }, + { + "epoch": 0.21, + "grad_norm": 0.26546624754158665, + "learning_rate": 0.00019845948867968117, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 0.21, + "grad_norm": 0.2798970718045841, + "learning_rate": 0.00019845672139226316, + "loss": 1.0244, + "step": 2157 + }, + { + "epoch": 0.21, + "grad_norm": 0.2784787690728781, + "learning_rate": 0.00019845395164090382, + "loss": 1.1114, + "step": 2158 + }, + { + "epoch": 0.21, + "grad_norm": 0.24443020869424956, + "learning_rate": 0.00019845117942567244, + "loss": 1.1341, + "step": 2159 + }, + { + "epoch": 0.21, + "grad_norm": 0.244756739484968, + "learning_rate": 0.00019844840474663843, + "loss": 1.0807, + "step": 2160 + }, + { + "epoch": 0.21, + "grad_norm": 0.2702201314078314, + "learning_rate": 0.00019844562760387122, + "loss": 1.1269, + "step": 2161 + }, + { + "epoch": 0.21, + "grad_norm": 0.29077240998538223, + "learning_rate": 0.00019844284799744032, + "loss": 1.1688, + "step": 2162 + }, + { + "epoch": 0.21, + "grad_norm": 0.2683927419703879, + "learning_rate": 0.00019844006592741525, + "loss": 1.0173, + "step": 2163 + }, + { + "epoch": 0.21, + "grad_norm": 0.3109361300962534, + "learning_rate": 0.0001984372813938657, + "loss": 1.148, + "step": 2164 + }, + { + "epoch": 0.21, + "grad_norm": 0.2501468531327423, + "learning_rate": 0.00019843449439686128, + "loss": 1.1907, + "step": 2165 + }, + { + "epoch": 0.21, + "grad_norm": 0.2664858040953975, + "learning_rate": 0.0001984317049364718, + "loss": 1.1097, + "step": 2166 + }, + { + "epoch": 0.21, + "grad_norm": 0.2549104447589198, + "learning_rate": 0.00019842891301276704, + "loss": 1.0737, + "step": 2167 + }, + { + "epoch": 0.21, + "grad_norm": 0.2908504505180112, + "learning_rate": 0.00019842611862581685, + "loss": 1.0539, + "step": 2168 + }, + { + "epoch": 0.21, + "grad_norm": 0.2959941920542236, + "learning_rate": 0.00019842332177569122, + "loss": 1.1418, + "step": 2169 + }, + { + "epoch": 0.21, + "grad_norm": 0.26475147358616613, + "learning_rate": 0.00019842052246246008, + "loss": 1.0866, + "step": 2170 + }, + { + "epoch": 0.21, + "grad_norm": 0.28262317137702664, + "learning_rate": 0.0001984177206861935, + "loss": 1.1431, + "step": 2171 + }, + { + "epoch": 0.21, + "grad_norm": 0.2615494504849684, + "learning_rate": 0.00019841491644696164, + "loss": 1.1576, + "step": 2172 + }, + { + "epoch": 0.21, + "grad_norm": 0.27419165343889973, + "learning_rate": 0.00019841210974483464, + "loss": 1.1325, + "step": 2173 + }, + { + "epoch": 0.21, + "grad_norm": 0.24040329342282296, + "learning_rate": 0.0001984093005798827, + "loss": 1.1437, + "step": 2174 + }, + { + "epoch": 0.21, + "grad_norm": 0.29409408115598895, + "learning_rate": 0.00019840648895217623, + "loss": 1.1064, + "step": 2175 + }, + { + "epoch": 0.21, + "grad_norm": 0.2523665722905447, + "learning_rate": 0.00019840367486178548, + "loss": 1.07, + "step": 2176 + }, + { + "epoch": 0.21, + "grad_norm": 0.25402746178920604, + "learning_rate": 0.00019840085830878095, + "loss": 1.0573, + "step": 2177 + }, + { + "epoch": 0.21, + "grad_norm": 0.24450222623833068, + "learning_rate": 0.00019839803929323305, + "loss": 1.1127, + "step": 2178 + }, + { + "epoch": 0.21, + "grad_norm": 0.24554337014527297, + "learning_rate": 0.00019839521781521245, + "loss": 1.0781, + "step": 2179 + }, + { + "epoch": 0.21, + "grad_norm": 0.29644405786503714, + "learning_rate": 0.00019839239387478962, + "loss": 1.1072, + "step": 2180 + }, + { + "epoch": 0.21, + "grad_norm": 0.23532298664328116, + "learning_rate": 0.00019838956747203533, + "loss": 0.9529, + "step": 2181 + }, + { + "epoch": 0.21, + "grad_norm": 0.255567234762623, + "learning_rate": 0.00019838673860702027, + "loss": 1.2165, + "step": 2182 + }, + { + "epoch": 0.21, + "grad_norm": 0.2565392465769596, + "learning_rate": 0.00019838390727981527, + "loss": 1.049, + "step": 2183 + }, + { + "epoch": 0.21, + "grad_norm": 0.28869850717775036, + "learning_rate": 0.00019838107349049111, + "loss": 1.2043, + "step": 2184 + }, + { + "epoch": 0.21, + "grad_norm": 0.2717398071388341, + "learning_rate": 0.0001983782372391188, + "loss": 1.1689, + "step": 2185 + }, + { + "epoch": 0.21, + "grad_norm": 0.2714997167115452, + "learning_rate": 0.00019837539852576923, + "loss": 1.0412, + "step": 2186 + }, + { + "epoch": 0.21, + "grad_norm": 0.2528223715764014, + "learning_rate": 0.0001983725573505135, + "loss": 1.0636, + "step": 2187 + }, + { + "epoch": 0.21, + "grad_norm": 0.24457509080188328, + "learning_rate": 0.0001983697137134227, + "loss": 1.0427, + "step": 2188 + }, + { + "epoch": 0.21, + "grad_norm": 0.2647502073171626, + "learning_rate": 0.00019836686761456803, + "loss": 1.1109, + "step": 2189 + }, + { + "epoch": 0.21, + "grad_norm": 0.25621160412291943, + "learning_rate": 0.00019836401905402062, + "loss": 1.1426, + "step": 2190 + }, + { + "epoch": 0.21, + "grad_norm": 0.2875669800942636, + "learning_rate": 0.00019836116803185184, + "loss": 1.0843, + "step": 2191 + }, + { + "epoch": 0.21, + "grad_norm": 0.26793735322362255, + "learning_rate": 0.000198358314548133, + "loss": 1.2198, + "step": 2192 + }, + { + "epoch": 0.21, + "grad_norm": 0.30265207091393975, + "learning_rate": 0.00019835545860293551, + "loss": 0.9996, + "step": 2193 + }, + { + "epoch": 0.21, + "grad_norm": 0.25384247043679864, + "learning_rate": 0.0001983526001963309, + "loss": 1.1222, + "step": 2194 + }, + { + "epoch": 0.21, + "grad_norm": 0.25800061296353, + "learning_rate": 0.00019834973932839062, + "loss": 1.0905, + "step": 2195 + }, + { + "epoch": 0.21, + "grad_norm": 0.27769824178444574, + "learning_rate": 0.00019834687599918632, + "loss": 1.0538, + "step": 2196 + }, + { + "epoch": 0.21, + "grad_norm": 0.2772132245071213, + "learning_rate": 0.00019834401020878963, + "loss": 1.0624, + "step": 2197 + }, + { + "epoch": 0.21, + "grad_norm": 0.26363087570930127, + "learning_rate": 0.0001983411419572723, + "loss": 0.9887, + "step": 2198 + }, + { + "epoch": 0.21, + "grad_norm": 0.2815107276805014, + "learning_rate": 0.00019833827124470608, + "loss": 1.1811, + "step": 2199 + }, + { + "epoch": 0.21, + "grad_norm": 0.2785544171403854, + "learning_rate": 0.0001983353980711628, + "loss": 1.1437, + "step": 2200 + }, + { + "epoch": 0.21, + "grad_norm": 0.3011117821316356, + "learning_rate": 0.0001983325224367144, + "loss": 1.0398, + "step": 2201 + }, + { + "epoch": 0.21, + "grad_norm": 0.2445670553372607, + "learning_rate": 0.00019832964434143282, + "loss": 1.101, + "step": 2202 + }, + { + "epoch": 0.21, + "grad_norm": 0.25914004062255874, + "learning_rate": 0.00019832676378539005, + "loss": 1.1808, + "step": 2203 + }, + { + "epoch": 0.21, + "grad_norm": 0.2754672779595424, + "learning_rate": 0.00019832388076865826, + "loss": 1.0929, + "step": 2204 + }, + { + "epoch": 0.21, + "grad_norm": 0.2565507943348922, + "learning_rate": 0.00019832099529130959, + "loss": 1.0699, + "step": 2205 + }, + { + "epoch": 0.21, + "grad_norm": 0.3343174008427606, + "learning_rate": 0.00019831810735341618, + "loss": 1.0145, + "step": 2206 + }, + { + "epoch": 0.21, + "grad_norm": 0.2804796337948149, + "learning_rate": 0.00019831521695505035, + "loss": 1.0897, + "step": 2207 + }, + { + "epoch": 0.21, + "grad_norm": 0.2525345630451486, + "learning_rate": 0.00019831232409628445, + "loss": 1.0794, + "step": 2208 + }, + { + "epoch": 0.21, + "grad_norm": 0.24602578478108195, + "learning_rate": 0.0001983094287771908, + "loss": 1.1439, + "step": 2209 + }, + { + "epoch": 0.21, + "grad_norm": 0.26742569319862003, + "learning_rate": 0.00019830653099784195, + "loss": 1.1399, + "step": 2210 + }, + { + "epoch": 0.21, + "grad_norm": 0.30626650715688947, + "learning_rate": 0.00019830363075831037, + "loss": 1.2276, + "step": 2211 + }, + { + "epoch": 0.21, + "grad_norm": 0.2978427240509176, + "learning_rate": 0.00019830072805866866, + "loss": 1.215, + "step": 2212 + }, + { + "epoch": 0.21, + "grad_norm": 0.2795761909559458, + "learning_rate": 0.00019829782289898943, + "loss": 1.2044, + "step": 2213 + }, + { + "epoch": 0.21, + "grad_norm": 0.28310847050083876, + "learning_rate": 0.0001982949152793454, + "loss": 1.1433, + "step": 2214 + }, + { + "epoch": 0.21, + "grad_norm": 0.2796955122824297, + "learning_rate": 0.00019829200519980937, + "loss": 1.0606, + "step": 2215 + }, + { + "epoch": 0.21, + "grad_norm": 0.2528523703698838, + "learning_rate": 0.0001982890926604541, + "loss": 1.068, + "step": 2216 + }, + { + "epoch": 0.21, + "grad_norm": 0.2523940552498862, + "learning_rate": 0.00019828617766135255, + "loss": 1.0647, + "step": 2217 + }, + { + "epoch": 0.21, + "grad_norm": 0.2970231511295705, + "learning_rate": 0.0001982832602025776, + "loss": 1.2357, + "step": 2218 + }, + { + "epoch": 0.21, + "grad_norm": 0.27974233774133495, + "learning_rate": 0.00019828034028420232, + "loss": 1.0735, + "step": 2219 + }, + { + "epoch": 0.21, + "grad_norm": 0.269451164229955, + "learning_rate": 0.00019827741790629975, + "loss": 1.0784, + "step": 2220 + }, + { + "epoch": 0.21, + "grad_norm": 0.2658199878337128, + "learning_rate": 0.00019827449306894304, + "loss": 1.0841, + "step": 2221 + }, + { + "epoch": 0.21, + "grad_norm": 0.257731802421506, + "learning_rate": 0.00019827156577220537, + "loss": 1.2333, + "step": 2222 + }, + { + "epoch": 0.21, + "grad_norm": 0.32039613850942644, + "learning_rate": 0.00019826863601616, + "loss": 1.1436, + "step": 2223 + }, + { + "epoch": 0.21, + "grad_norm": 0.23336247900235474, + "learning_rate": 0.00019826570380088025, + "loss": 1.1719, + "step": 2224 + }, + { + "epoch": 0.21, + "grad_norm": 0.28395673225685364, + "learning_rate": 0.0001982627691264395, + "loss": 1.183, + "step": 2225 + }, + { + "epoch": 0.21, + "grad_norm": 0.2848669382303132, + "learning_rate": 0.00019825983199291122, + "loss": 1.1098, + "step": 2226 + }, + { + "epoch": 0.21, + "grad_norm": 0.27510914460004676, + "learning_rate": 0.0001982568924003689, + "loss": 1.1285, + "step": 2227 + }, + { + "epoch": 0.21, + "grad_norm": 0.2921265346308141, + "learning_rate": 0.00019825395034888605, + "loss": 1.1692, + "step": 2228 + }, + { + "epoch": 0.21, + "grad_norm": 0.2761631545730766, + "learning_rate": 0.00019825100583853637, + "loss": 1.1872, + "step": 2229 + }, + { + "epoch": 0.21, + "grad_norm": 0.2592595323871488, + "learning_rate": 0.00019824805886939353, + "loss": 1.0289, + "step": 2230 + }, + { + "epoch": 0.21, + "grad_norm": 0.29309277372861997, + "learning_rate": 0.00019824510944153125, + "loss": 1.1123, + "step": 2231 + }, + { + "epoch": 0.21, + "grad_norm": 0.27306258003376527, + "learning_rate": 0.00019824215755502337, + "loss": 1.1453, + "step": 2232 + }, + { + "epoch": 0.21, + "grad_norm": 0.263424668387752, + "learning_rate": 0.00019823920320994373, + "loss": 1.1002, + "step": 2233 + }, + { + "epoch": 0.21, + "grad_norm": 0.2938351047841059, + "learning_rate": 0.00019823624640636633, + "loss": 1.127, + "step": 2234 + }, + { + "epoch": 0.21, + "grad_norm": 0.24927930137531826, + "learning_rate": 0.0001982332871443651, + "loss": 1.0708, + "step": 2235 + }, + { + "epoch": 0.21, + "grad_norm": 0.2786877424279345, + "learning_rate": 0.00019823032542401413, + "loss": 1.0868, + "step": 2236 + }, + { + "epoch": 0.21, + "grad_norm": 0.2798063724399594, + "learning_rate": 0.00019822736124538754, + "loss": 1.1573, + "step": 2237 + }, + { + "epoch": 0.21, + "grad_norm": 0.2633597183766863, + "learning_rate": 0.00019822439460855947, + "loss": 1.1058, + "step": 2238 + }, + { + "epoch": 0.21, + "grad_norm": 0.24785877289941977, + "learning_rate": 0.00019822142551360422, + "loss": 1.0471, + "step": 2239 + }, + { + "epoch": 0.21, + "grad_norm": 0.2713868952406667, + "learning_rate": 0.00019821845396059606, + "loss": 1.0428, + "step": 2240 + }, + { + "epoch": 0.21, + "grad_norm": 0.2811381014767392, + "learning_rate": 0.0001982154799496094, + "loss": 1.0762, + "step": 2241 + }, + { + "epoch": 0.21, + "grad_norm": 0.2725029043198944, + "learning_rate": 0.00019821250348071856, + "loss": 1.1293, + "step": 2242 + }, + { + "epoch": 0.21, + "grad_norm": 0.2827100509016029, + "learning_rate": 0.00019820952455399814, + "loss": 1.1447, + "step": 2243 + }, + { + "epoch": 0.21, + "grad_norm": 0.2758022374658988, + "learning_rate": 0.00019820654316952263, + "loss": 1.1659, + "step": 2244 + }, + { + "epoch": 0.21, + "grad_norm": 0.2633396491372797, + "learning_rate": 0.00019820355932736666, + "loss": 1.0462, + "step": 2245 + }, + { + "epoch": 0.21, + "grad_norm": 0.2731226166685037, + "learning_rate": 0.00019820057302760488, + "loss": 0.9548, + "step": 2246 + }, + { + "epoch": 0.21, + "grad_norm": 0.25556114167089006, + "learning_rate": 0.00019819758427031206, + "loss": 1.2312, + "step": 2247 + }, + { + "epoch": 0.22, + "grad_norm": 0.28848997593382414, + "learning_rate": 0.00019819459305556297, + "loss": 1.0739, + "step": 2248 + }, + { + "epoch": 0.22, + "grad_norm": 0.23351405165987268, + "learning_rate": 0.0001981915993834325, + "loss": 1.0641, + "step": 2249 + }, + { + "epoch": 0.22, + "grad_norm": 0.26791227089364905, + "learning_rate": 0.00019818860325399552, + "loss": 1.1015, + "step": 2250 + }, + { + "epoch": 0.22, + "grad_norm": 0.2820213434051579, + "learning_rate": 0.00019818560466732706, + "loss": 1.063, + "step": 2251 + }, + { + "epoch": 0.22, + "grad_norm": 0.24527078074648306, + "learning_rate": 0.00019818260362350213, + "loss": 1.1702, + "step": 2252 + }, + { + "epoch": 0.22, + "grad_norm": 0.2720420411260554, + "learning_rate": 0.0001981796001225958, + "loss": 1.0912, + "step": 2253 + }, + { + "epoch": 0.22, + "grad_norm": 0.2713012314046693, + "learning_rate": 0.00019817659416468332, + "loss": 1.0524, + "step": 2254 + }, + { + "epoch": 0.22, + "grad_norm": 0.26924822640795093, + "learning_rate": 0.00019817358574983983, + "loss": 1.0871, + "step": 2255 + }, + { + "epoch": 0.22, + "grad_norm": 0.27363733386951783, + "learning_rate": 0.0001981705748781407, + "loss": 1.0598, + "step": 2256 + }, + { + "epoch": 0.22, + "grad_norm": 0.24189809697792714, + "learning_rate": 0.0001981675615496612, + "loss": 1.084, + "step": 2257 + }, + { + "epoch": 0.22, + "grad_norm": 0.2706773118754261, + "learning_rate": 0.0001981645457644768, + "loss": 1.0637, + "step": 2258 + }, + { + "epoch": 0.22, + "grad_norm": 0.27820113281091335, + "learning_rate": 0.00019816152752266292, + "loss": 1.1624, + "step": 2259 + }, + { + "epoch": 0.22, + "grad_norm": 0.23093374468477146, + "learning_rate": 0.00019815850682429516, + "loss": 1.1735, + "step": 2260 + }, + { + "epoch": 0.22, + "grad_norm": 0.26705391917092386, + "learning_rate": 0.00019815548366944904, + "loss": 1.049, + "step": 2261 + }, + { + "epoch": 0.22, + "grad_norm": 0.28355313423369083, + "learning_rate": 0.00019815245805820028, + "loss": 1.0949, + "step": 2262 + }, + { + "epoch": 0.22, + "grad_norm": 0.2395712965169708, + "learning_rate": 0.00019814942999062457, + "loss": 1.05, + "step": 2263 + }, + { + "epoch": 0.22, + "grad_norm": 0.285804478616941, + "learning_rate": 0.00019814639946679768, + "loss": 1.1369, + "step": 2264 + }, + { + "epoch": 0.22, + "grad_norm": 0.25061529704124386, + "learning_rate": 0.00019814336648679546, + "loss": 1.0655, + "step": 2265 + }, + { + "epoch": 0.22, + "grad_norm": 0.2909795934470434, + "learning_rate": 0.0001981403310506938, + "loss": 1.1807, + "step": 2266 + }, + { + "epoch": 0.22, + "grad_norm": 0.23297851873356334, + "learning_rate": 0.00019813729315856869, + "loss": 1.152, + "step": 2267 + }, + { + "epoch": 0.22, + "grad_norm": 0.25889655866668293, + "learning_rate": 0.00019813425281049613, + "loss": 1.1054, + "step": 2268 + }, + { + "epoch": 0.22, + "grad_norm": 0.23977654506120644, + "learning_rate": 0.00019813121000655223, + "loss": 1.1002, + "step": 2269 + }, + { + "epoch": 0.22, + "grad_norm": 0.2549715877517098, + "learning_rate": 0.00019812816474681314, + "loss": 1.057, + "step": 2270 + }, + { + "epoch": 0.22, + "grad_norm": 0.26247547673776234, + "learning_rate": 0.00019812511703135504, + "loss": 1.0619, + "step": 2271 + }, + { + "epoch": 0.22, + "grad_norm": 0.2867669044848128, + "learning_rate": 0.00019812206686025424, + "loss": 1.1794, + "step": 2272 + }, + { + "epoch": 0.22, + "grad_norm": 0.27304725331072943, + "learning_rate": 0.000198119014233587, + "loss": 1.1895, + "step": 2273 + }, + { + "epoch": 0.22, + "grad_norm": 0.25831673662414345, + "learning_rate": 0.00019811595915142979, + "loss": 1.088, + "step": 2274 + }, + { + "epoch": 0.22, + "grad_norm": 0.23021022674148214, + "learning_rate": 0.00019811290161385906, + "loss": 1.0841, + "step": 2275 + }, + { + "epoch": 0.22, + "grad_norm": 0.2575794159303839, + "learning_rate": 0.00019810984162095129, + "loss": 1.0906, + "step": 2276 + }, + { + "epoch": 0.22, + "grad_norm": 0.2609400179291492, + "learning_rate": 0.00019810677917278305, + "loss": 1.1717, + "step": 2277 + }, + { + "epoch": 0.22, + "grad_norm": 0.28398870543915045, + "learning_rate": 0.00019810371426943105, + "loss": 1.1347, + "step": 2278 + }, + { + "epoch": 0.22, + "grad_norm": 0.26309231855699067, + "learning_rate": 0.0001981006469109719, + "loss": 1.1804, + "step": 2279 + }, + { + "epoch": 0.22, + "grad_norm": 0.2902283356657511, + "learning_rate": 0.00019809757709748243, + "loss": 1.1167, + "step": 2280 + }, + { + "epoch": 0.22, + "grad_norm": 0.25748170041372764, + "learning_rate": 0.00019809450482903942, + "loss": 1.1476, + "step": 2281 + }, + { + "epoch": 0.22, + "grad_norm": 0.2667597093950612, + "learning_rate": 0.0001980914301057198, + "loss": 1.1277, + "step": 2282 + }, + { + "epoch": 0.22, + "grad_norm": 0.27836946786170796, + "learning_rate": 0.0001980883529276005, + "loss": 1.1525, + "step": 2283 + }, + { + "epoch": 0.22, + "grad_norm": 0.3083167991873422, + "learning_rate": 0.0001980852732947585, + "loss": 1.1216, + "step": 2284 + }, + { + "epoch": 0.22, + "grad_norm": 0.2676745480686396, + "learning_rate": 0.00019808219120727086, + "loss": 1.1328, + "step": 2285 + }, + { + "epoch": 0.22, + "grad_norm": 0.25527058852259726, + "learning_rate": 0.0001980791066652148, + "loss": 1.065, + "step": 2286 + }, + { + "epoch": 0.22, + "grad_norm": 0.28337351811282757, + "learning_rate": 0.00019807601966866746, + "loss": 1.1723, + "step": 2287 + }, + { + "epoch": 0.22, + "grad_norm": 0.27319098266987507, + "learning_rate": 0.00019807293021770604, + "loss": 1.0549, + "step": 2288 + }, + { + "epoch": 0.22, + "grad_norm": 0.30841736311542484, + "learning_rate": 0.00019806983831240795, + "loss": 1.1445, + "step": 2289 + }, + { + "epoch": 0.22, + "grad_norm": 0.2555510965247522, + "learning_rate": 0.0001980667439528505, + "loss": 1.1424, + "step": 2290 + }, + { + "epoch": 0.22, + "grad_norm": 0.2641571799003314, + "learning_rate": 0.00019806364713911116, + "loss": 1.033, + "step": 2291 + }, + { + "epoch": 0.22, + "grad_norm": 0.2838900082793651, + "learning_rate": 0.0001980605478712674, + "loss": 1.0774, + "step": 2292 + }, + { + "epoch": 0.22, + "grad_norm": 0.31407713147896055, + "learning_rate": 0.00019805744614939682, + "loss": 1.2683, + "step": 2293 + }, + { + "epoch": 0.22, + "grad_norm": 0.27082803879903133, + "learning_rate": 0.00019805434197357703, + "loss": 1.1711, + "step": 2294 + }, + { + "epoch": 0.22, + "grad_norm": 0.27007517574821516, + "learning_rate": 0.0001980512353438857, + "loss": 1.1142, + "step": 2295 + }, + { + "epoch": 0.22, + "grad_norm": 0.25200965101215933, + "learning_rate": 0.00019804812626040056, + "loss": 1.1365, + "step": 2296 + }, + { + "epoch": 0.22, + "grad_norm": 0.2482335861309017, + "learning_rate": 0.00019804501472319946, + "loss": 1.0387, + "step": 2297 + }, + { + "epoch": 0.22, + "grad_norm": 0.27093750047305093, + "learning_rate": 0.0001980419007323602, + "loss": 1.0562, + "step": 2298 + }, + { + "epoch": 0.22, + "grad_norm": 0.2823581467965368, + "learning_rate": 0.00019803878428796082, + "loss": 1.2542, + "step": 2299 + }, + { + "epoch": 0.22, + "grad_norm": 0.27114630287941716, + "learning_rate": 0.00019803566539007924, + "loss": 1.1863, + "step": 2300 + }, + { + "epoch": 0.22, + "grad_norm": 0.27533847009087203, + "learning_rate": 0.0001980325440387935, + "loss": 0.9211, + "step": 2301 + }, + { + "epoch": 0.22, + "grad_norm": 0.24736978230602902, + "learning_rate": 0.00019802942023418175, + "loss": 1.1474, + "step": 2302 + }, + { + "epoch": 0.22, + "grad_norm": 0.24528908482065118, + "learning_rate": 0.00019802629397632212, + "loss": 1.0203, + "step": 2303 + }, + { + "epoch": 0.22, + "grad_norm": 0.3102766690223985, + "learning_rate": 0.00019802316526529293, + "loss": 1.1166, + "step": 2304 + }, + { + "epoch": 0.22, + "grad_norm": 0.2645211126197188, + "learning_rate": 0.00019802003410117238, + "loss": 1.09, + "step": 2305 + }, + { + "epoch": 0.22, + "grad_norm": 0.24314341601375852, + "learning_rate": 0.0001980169004840389, + "loss": 1.067, + "step": 2306 + }, + { + "epoch": 0.22, + "grad_norm": 0.28901370914218866, + "learning_rate": 0.00019801376441397087, + "loss": 1.125, + "step": 2307 + }, + { + "epoch": 0.22, + "grad_norm": 0.22977734256634133, + "learning_rate": 0.00019801062589104676, + "loss": 1.1017, + "step": 2308 + }, + { + "epoch": 0.22, + "grad_norm": 0.2763909341056602, + "learning_rate": 0.00019800748491534517, + "loss": 1.1466, + "step": 2309 + }, + { + "epoch": 0.22, + "grad_norm": 0.2638965585187832, + "learning_rate": 0.00019800434148694468, + "loss": 1.0884, + "step": 2310 + }, + { + "epoch": 0.22, + "grad_norm": 0.25365372028597144, + "learning_rate": 0.00019800119560592393, + "loss": 1.1063, + "step": 2311 + }, + { + "epoch": 0.22, + "grad_norm": 0.2946212685821378, + "learning_rate": 0.0001979980472723617, + "loss": 1.0209, + "step": 2312 + }, + { + "epoch": 0.22, + "grad_norm": 0.28135433865494547, + "learning_rate": 0.00019799489648633675, + "loss": 1.1626, + "step": 2313 + }, + { + "epoch": 0.22, + "grad_norm": 0.3064233206742547, + "learning_rate": 0.00019799174324792787, + "loss": 1.1433, + "step": 2314 + }, + { + "epoch": 0.22, + "grad_norm": 0.2800872851032662, + "learning_rate": 0.00019798858755721405, + "loss": 1.039, + "step": 2315 + }, + { + "epoch": 0.22, + "grad_norm": 0.25898676357853834, + "learning_rate": 0.00019798542941427426, + "loss": 1.1401, + "step": 2316 + }, + { + "epoch": 0.22, + "grad_norm": 0.26001355286806555, + "learning_rate": 0.00019798226881918753, + "loss": 1.0741, + "step": 2317 + }, + { + "epoch": 0.22, + "grad_norm": 0.31195060460939816, + "learning_rate": 0.00019797910577203293, + "loss": 1.155, + "step": 2318 + }, + { + "epoch": 0.22, + "grad_norm": 0.2853360314799912, + "learning_rate": 0.00019797594027288963, + "loss": 1.1006, + "step": 2319 + }, + { + "epoch": 0.22, + "grad_norm": 0.2282844809122414, + "learning_rate": 0.00019797277232183684, + "loss": 1.0532, + "step": 2320 + }, + { + "epoch": 0.22, + "grad_norm": 0.28551517768089857, + "learning_rate": 0.00019796960191895385, + "loss": 1.0486, + "step": 2321 + }, + { + "epoch": 0.22, + "grad_norm": 0.2590261403859847, + "learning_rate": 0.00019796642906432004, + "loss": 1.0397, + "step": 2322 + }, + { + "epoch": 0.22, + "grad_norm": 0.23480527152036285, + "learning_rate": 0.0001979632537580147, + "loss": 1.0072, + "step": 2323 + }, + { + "epoch": 0.22, + "grad_norm": 0.26973107870500546, + "learning_rate": 0.00019796007600011742, + "loss": 1.1077, + "step": 2324 + }, + { + "epoch": 0.22, + "grad_norm": 0.26492201462618237, + "learning_rate": 0.0001979568957907077, + "loss": 1.1157, + "step": 2325 + }, + { + "epoch": 0.22, + "grad_norm": 0.279425391462633, + "learning_rate": 0.00019795371312986504, + "loss": 1.0199, + "step": 2326 + }, + { + "epoch": 0.22, + "grad_norm": 0.2836141793634087, + "learning_rate": 0.00019795052801766915, + "loss": 1.1172, + "step": 2327 + }, + { + "epoch": 0.22, + "grad_norm": 0.2544431831518797, + "learning_rate": 0.0001979473404541998, + "loss": 1.0867, + "step": 2328 + }, + { + "epoch": 0.22, + "grad_norm": 0.2402666227462647, + "learning_rate": 0.0001979441504395366, + "loss": 1.0139, + "step": 2329 + }, + { + "epoch": 0.22, + "grad_norm": 0.2503197635519736, + "learning_rate": 0.00019794095797375953, + "loss": 1.0556, + "step": 2330 + }, + { + "epoch": 0.22, + "grad_norm": 0.2581127509866034, + "learning_rate": 0.00019793776305694846, + "loss": 0.9451, + "step": 2331 + }, + { + "epoch": 0.22, + "grad_norm": 0.24403281634446966, + "learning_rate": 0.0001979345656891833, + "loss": 1.1031, + "step": 2332 + }, + { + "epoch": 0.22, + "grad_norm": 0.2532440703626388, + "learning_rate": 0.00019793136587054405, + "loss": 1.0738, + "step": 2333 + }, + { + "epoch": 0.22, + "grad_norm": 0.2597097391411967, + "learning_rate": 0.00019792816360111087, + "loss": 1.1359, + "step": 2334 + }, + { + "epoch": 0.22, + "grad_norm": 0.27991032105875546, + "learning_rate": 0.00019792495888096382, + "loss": 0.9373, + "step": 2335 + }, + { + "epoch": 0.22, + "grad_norm": 0.2713405323422737, + "learning_rate": 0.00019792175171018313, + "loss": 1.0818, + "step": 2336 + }, + { + "epoch": 0.22, + "grad_norm": 0.2942256218211258, + "learning_rate": 0.00019791854208884907, + "loss": 1.1304, + "step": 2337 + }, + { + "epoch": 0.22, + "grad_norm": 0.24670357822710387, + "learning_rate": 0.00019791533001704194, + "loss": 1.0164, + "step": 2338 + }, + { + "epoch": 0.22, + "grad_norm": 0.25797879496591175, + "learning_rate": 0.00019791211549484216, + "loss": 1.0922, + "step": 2339 + }, + { + "epoch": 0.22, + "grad_norm": 0.28874847418524446, + "learning_rate": 0.00019790889852233016, + "loss": 1.2126, + "step": 2340 + }, + { + "epoch": 0.22, + "grad_norm": 0.2831045631432414, + "learning_rate": 0.00019790567909958644, + "loss": 1.1913, + "step": 2341 + }, + { + "epoch": 0.22, + "grad_norm": 0.25052218126412557, + "learning_rate": 0.00019790245722669153, + "loss": 1.1242, + "step": 2342 + }, + { + "epoch": 0.22, + "grad_norm": 0.2980993557396919, + "learning_rate": 0.00019789923290372614, + "loss": 1.0499, + "step": 2343 + }, + { + "epoch": 0.22, + "grad_norm": 0.26321626908133683, + "learning_rate": 0.00019789600613077092, + "loss": 1.0864, + "step": 2344 + }, + { + "epoch": 0.22, + "grad_norm": 0.26596294842052304, + "learning_rate": 0.0001978927769079066, + "loss": 1.0538, + "step": 2345 + }, + { + "epoch": 0.22, + "grad_norm": 0.25422065022309154, + "learning_rate": 0.00019788954523521402, + "loss": 1.2115, + "step": 2346 + }, + { + "epoch": 0.22, + "grad_norm": 0.23598171182692104, + "learning_rate": 0.00019788631111277406, + "loss": 1.0686, + "step": 2347 + }, + { + "epoch": 0.22, + "grad_norm": 0.2847052874921601, + "learning_rate": 0.00019788307454066763, + "loss": 1.0641, + "step": 2348 + }, + { + "epoch": 0.22, + "grad_norm": 0.25357051014186355, + "learning_rate": 0.00019787983551897576, + "loss": 1.0484, + "step": 2349 + }, + { + "epoch": 0.22, + "grad_norm": 0.2411514518029808, + "learning_rate": 0.00019787659404777946, + "loss": 1.133, + "step": 2350 + }, + { + "epoch": 0.22, + "grad_norm": 0.25446854039811434, + "learning_rate": 0.0001978733501271599, + "loss": 1.1672, + "step": 2351 + }, + { + "epoch": 0.23, + "grad_norm": 0.26777410057209505, + "learning_rate": 0.00019787010375719826, + "loss": 1.138, + "step": 2352 + }, + { + "epoch": 0.23, + "grad_norm": 0.27808638420934434, + "learning_rate": 0.0001978668549379757, + "loss": 1.1782, + "step": 2353 + }, + { + "epoch": 0.23, + "grad_norm": 0.2592972824957828, + "learning_rate": 0.00019786360366957367, + "loss": 1.1013, + "step": 2354 + }, + { + "epoch": 0.23, + "grad_norm": 0.25628836959733703, + "learning_rate": 0.0001978603499520734, + "loss": 0.9967, + "step": 2355 + }, + { + "epoch": 0.23, + "grad_norm": 0.2636173075017777, + "learning_rate": 0.0001978570937855564, + "loss": 1.1131, + "step": 2356 + }, + { + "epoch": 0.23, + "grad_norm": 0.2965203958377406, + "learning_rate": 0.0001978538351701041, + "loss": 1.1412, + "step": 2357 + }, + { + "epoch": 0.23, + "grad_norm": 0.2551730645828324, + "learning_rate": 0.0001978505741057981, + "loss": 1.1347, + "step": 2358 + }, + { + "epoch": 0.23, + "grad_norm": 0.3173189097283464, + "learning_rate": 0.00019784731059271996, + "loss": 1.1354, + "step": 2359 + }, + { + "epoch": 0.23, + "grad_norm": 0.287566565828461, + "learning_rate": 0.00019784404463095144, + "loss": 1.0052, + "step": 2360 + }, + { + "epoch": 0.23, + "grad_norm": 0.24804179463343937, + "learning_rate": 0.00019784077622057416, + "loss": 1.1699, + "step": 2361 + }, + { + "epoch": 0.23, + "grad_norm": 0.3073578708099378, + "learning_rate": 0.00019783750536166993, + "loss": 1.1715, + "step": 2362 + }, + { + "epoch": 0.23, + "grad_norm": 0.22734700217911738, + "learning_rate": 0.0001978342320543207, + "loss": 0.9563, + "step": 2363 + }, + { + "epoch": 0.23, + "grad_norm": 0.2590258564685437, + "learning_rate": 0.0001978309562986083, + "loss": 1.0816, + "step": 2364 + }, + { + "epoch": 0.23, + "grad_norm": 0.27557855270603093, + "learning_rate": 0.00019782767809461475, + "loss": 1.077, + "step": 2365 + }, + { + "epoch": 0.23, + "grad_norm": 0.25603874246559705, + "learning_rate": 0.00019782439744242205, + "loss": 1.146, + "step": 2366 + }, + { + "epoch": 0.23, + "grad_norm": 0.2761244679810728, + "learning_rate": 0.00019782111434211235, + "loss": 1.107, + "step": 2367 + }, + { + "epoch": 0.23, + "grad_norm": 0.26568572466382623, + "learning_rate": 0.00019781782879376775, + "loss": 1.1306, + "step": 2368 + }, + { + "epoch": 0.23, + "grad_norm": 0.2723644753582144, + "learning_rate": 0.00019781454079747054, + "loss": 1.1705, + "step": 2369 + }, + { + "epoch": 0.23, + "grad_norm": 0.26902633260641967, + "learning_rate": 0.00019781125035330297, + "loss": 1.0754, + "step": 2370 + }, + { + "epoch": 0.23, + "grad_norm": 0.2711384423626026, + "learning_rate": 0.0001978079574613474, + "loss": 1.1202, + "step": 2371 + }, + { + "epoch": 0.23, + "grad_norm": 0.28204033368789444, + "learning_rate": 0.0001978046621216862, + "loss": 1.1706, + "step": 2372 + }, + { + "epoch": 0.23, + "grad_norm": 0.26064152152322406, + "learning_rate": 0.00019780136433440184, + "loss": 1.0596, + "step": 2373 + }, + { + "epoch": 0.23, + "grad_norm": 0.2907116807628749, + "learning_rate": 0.00019779806409957692, + "loss": 1.1855, + "step": 2374 + }, + { + "epoch": 0.23, + "grad_norm": 0.2739091926383725, + "learning_rate": 0.00019779476141729396, + "loss": 1.1093, + "step": 2375 + }, + { + "epoch": 0.23, + "grad_norm": 0.24773766770558472, + "learning_rate": 0.00019779145628763564, + "loss": 1.0423, + "step": 2376 + }, + { + "epoch": 0.23, + "grad_norm": 0.31039070403873764, + "learning_rate": 0.00019778814871068465, + "loss": 1.125, + "step": 2377 + }, + { + "epoch": 0.23, + "grad_norm": 0.25711704080946696, + "learning_rate": 0.0001977848386865238, + "loss": 1.1752, + "step": 2378 + }, + { + "epoch": 0.23, + "grad_norm": 0.24583579381340756, + "learning_rate": 0.0001977815262152359, + "loss": 1.0188, + "step": 2379 + }, + { + "epoch": 0.23, + "grad_norm": 0.28912054275819865, + "learning_rate": 0.00019777821129690387, + "loss": 1.0406, + "step": 2380 + }, + { + "epoch": 0.23, + "grad_norm": 0.27568971892193483, + "learning_rate": 0.0001977748939316106, + "loss": 1.1377, + "step": 2381 + }, + { + "epoch": 0.23, + "grad_norm": 0.2703312754633334, + "learning_rate": 0.0001977715741194392, + "loss": 1.0889, + "step": 2382 + }, + { + "epoch": 0.23, + "grad_norm": 0.3244168867345363, + "learning_rate": 0.00019776825186047268, + "loss": 1.2365, + "step": 2383 + }, + { + "epoch": 0.23, + "grad_norm": 0.26694652233140037, + "learning_rate": 0.00019776492715479428, + "loss": 0.9792, + "step": 2384 + }, + { + "epoch": 0.23, + "grad_norm": 0.25774010566880007, + "learning_rate": 0.00019776160000248706, + "loss": 1.0835, + "step": 2385 + }, + { + "epoch": 0.23, + "grad_norm": 0.24706317723666119, + "learning_rate": 0.0001977582704036344, + "loss": 1.0586, + "step": 2386 + }, + { + "epoch": 0.23, + "grad_norm": 0.2990804163988819, + "learning_rate": 0.00019775493835831959, + "loss": 1.0996, + "step": 2387 + }, + { + "epoch": 0.23, + "grad_norm": 0.25726590762789603, + "learning_rate": 0.00019775160386662597, + "loss": 1.117, + "step": 2388 + }, + { + "epoch": 0.23, + "grad_norm": 0.2650577582941275, + "learning_rate": 0.00019774826692863705, + "loss": 1.0128, + "step": 2389 + }, + { + "epoch": 0.23, + "grad_norm": 0.2668462355680707, + "learning_rate": 0.00019774492754443635, + "loss": 1.0804, + "step": 2390 + }, + { + "epoch": 0.23, + "grad_norm": 0.2992539843834909, + "learning_rate": 0.00019774158571410737, + "loss": 1.2372, + "step": 2391 + }, + { + "epoch": 0.23, + "grad_norm": 0.2864988762111991, + "learning_rate": 0.00019773824143773377, + "loss": 1.0984, + "step": 2392 + }, + { + "epoch": 0.23, + "grad_norm": 0.304882593233717, + "learning_rate": 0.00019773489471539926, + "loss": 1.1334, + "step": 2393 + }, + { + "epoch": 0.23, + "grad_norm": 0.28019107709650354, + "learning_rate": 0.00019773154554718762, + "loss": 1.0708, + "step": 2394 + }, + { + "epoch": 0.23, + "grad_norm": 0.22866457976217683, + "learning_rate": 0.00019772819393318262, + "loss": 1.0887, + "step": 2395 + }, + { + "epoch": 0.23, + "grad_norm": 0.26481598326652184, + "learning_rate": 0.00019772483987346812, + "loss": 1.1488, + "step": 2396 + }, + { + "epoch": 0.23, + "grad_norm": 0.27476032047452265, + "learning_rate": 0.0001977214833681281, + "loss": 1.0884, + "step": 2397 + }, + { + "epoch": 0.23, + "grad_norm": 0.2671685597445549, + "learning_rate": 0.00019771812441724652, + "loss": 1.021, + "step": 2398 + }, + { + "epoch": 0.23, + "grad_norm": 0.2777768402686966, + "learning_rate": 0.00019771476302090754, + "loss": 1.0786, + "step": 2399 + }, + { + "epoch": 0.23, + "grad_norm": 0.25286519332057394, + "learning_rate": 0.00019771139917919512, + "loss": 0.9957, + "step": 2400 + }, + { + "epoch": 0.23, + "grad_norm": 0.25899496107583564, + "learning_rate": 0.00019770803289219355, + "loss": 1.232, + "step": 2401 + }, + { + "epoch": 0.23, + "grad_norm": 0.2554893399049988, + "learning_rate": 0.00019770466415998706, + "loss": 1.1702, + "step": 2402 + }, + { + "epoch": 0.23, + "grad_norm": 0.26615765059825036, + "learning_rate": 0.00019770129298265994, + "loss": 1.1589, + "step": 2403 + }, + { + "epoch": 0.23, + "grad_norm": 0.2867731760890007, + "learning_rate": 0.00019769791936029657, + "loss": 1.0231, + "step": 2404 + }, + { + "epoch": 0.23, + "grad_norm": 0.25865612120055026, + "learning_rate": 0.00019769454329298134, + "loss": 1.0742, + "step": 2405 + }, + { + "epoch": 0.23, + "grad_norm": 0.27168406194912653, + "learning_rate": 0.00019769116478079876, + "loss": 1.11, + "step": 2406 + }, + { + "epoch": 0.23, + "grad_norm": 0.29872484561116197, + "learning_rate": 0.00019768778382383344, + "loss": 1.0637, + "step": 2407 + }, + { + "epoch": 0.23, + "grad_norm": 0.27164584328580743, + "learning_rate": 0.0001976844004221699, + "loss": 1.0909, + "step": 2408 + }, + { + "epoch": 0.23, + "grad_norm": 0.2739762871753536, + "learning_rate": 0.00019768101457589283, + "loss": 1.0961, + "step": 2409 + }, + { + "epoch": 0.23, + "grad_norm": 0.25357273039005795, + "learning_rate": 0.00019767762628508702, + "loss": 1.0625, + "step": 2410 + }, + { + "epoch": 0.23, + "grad_norm": 0.26504750469407357, + "learning_rate": 0.00019767423554983718, + "loss": 1.0843, + "step": 2411 + }, + { + "epoch": 0.23, + "grad_norm": 0.274182183625254, + "learning_rate": 0.00019767084237022823, + "loss": 1.144, + "step": 2412 + }, + { + "epoch": 0.23, + "grad_norm": 0.282485486193323, + "learning_rate": 0.00019766744674634508, + "loss": 1.1495, + "step": 2413 + }, + { + "epoch": 0.23, + "grad_norm": 0.24729551379445136, + "learning_rate": 0.00019766404867827269, + "loss": 1.1148, + "step": 2414 + }, + { + "epoch": 0.23, + "grad_norm": 0.23229018534328089, + "learning_rate": 0.00019766064816609607, + "loss": 1.0309, + "step": 2415 + }, + { + "epoch": 0.23, + "grad_norm": 0.252259979313865, + "learning_rate": 0.00019765724520990038, + "loss": 1.1207, + "step": 2416 + }, + { + "epoch": 0.23, + "grad_norm": 0.23700561147186552, + "learning_rate": 0.00019765383980977074, + "loss": 1.1039, + "step": 2417 + }, + { + "epoch": 0.23, + "grad_norm": 0.26556810321631696, + "learning_rate": 0.0001976504319657924, + "loss": 1.1749, + "step": 2418 + }, + { + "epoch": 0.23, + "grad_norm": 0.23143332904854963, + "learning_rate": 0.00019764702167805064, + "loss": 1.1775, + "step": 2419 + }, + { + "epoch": 0.23, + "grad_norm": 0.27696732231432797, + "learning_rate": 0.00019764360894663076, + "loss": 1.0399, + "step": 2420 + }, + { + "epoch": 0.23, + "grad_norm": 0.27237322850717344, + "learning_rate": 0.00019764019377161823, + "loss": 1.0703, + "step": 2421 + }, + { + "epoch": 0.23, + "grad_norm": 0.2535297853960644, + "learning_rate": 0.00019763677615309847, + "loss": 1.0938, + "step": 2422 + }, + { + "epoch": 0.23, + "grad_norm": 0.26018058934136795, + "learning_rate": 0.00019763335609115703, + "loss": 1.0601, + "step": 2423 + }, + { + "epoch": 0.23, + "grad_norm": 0.3439807499205691, + "learning_rate": 0.0001976299335858795, + "loss": 1.04, + "step": 2424 + }, + { + "epoch": 0.23, + "grad_norm": 0.24885428737327994, + "learning_rate": 0.0001976265086373515, + "loss": 1.0878, + "step": 2425 + }, + { + "epoch": 0.23, + "grad_norm": 0.2518594237134699, + "learning_rate": 0.0001976230812456588, + "loss": 1.1054, + "step": 2426 + }, + { + "epoch": 0.23, + "grad_norm": 0.27045653240229783, + "learning_rate": 0.0001976196514108871, + "loss": 0.9974, + "step": 2427 + }, + { + "epoch": 0.23, + "grad_norm": 0.27897290427116067, + "learning_rate": 0.0001976162191331223, + "loss": 1.1238, + "step": 2428 + }, + { + "epoch": 0.23, + "grad_norm": 0.26488970207228696, + "learning_rate": 0.00019761278441245023, + "loss": 1.0545, + "step": 2429 + }, + { + "epoch": 0.23, + "grad_norm": 0.2676698182906196, + "learning_rate": 0.00019760934724895692, + "loss": 1.1392, + "step": 2430 + }, + { + "epoch": 0.23, + "grad_norm": 0.2554504815510488, + "learning_rate": 0.00019760590764272834, + "loss": 1.1628, + "step": 2431 + }, + { + "epoch": 0.23, + "grad_norm": 0.23803751811537502, + "learning_rate": 0.0001976024655938506, + "loss": 1.1082, + "step": 2432 + }, + { + "epoch": 0.23, + "grad_norm": 0.2843747176664285, + "learning_rate": 0.00019759902110240977, + "loss": 1.0482, + "step": 2433 + }, + { + "epoch": 0.23, + "grad_norm": 0.23996308884375614, + "learning_rate": 0.00019759557416849214, + "loss": 1.0972, + "step": 2434 + }, + { + "epoch": 0.23, + "grad_norm": 0.26431108924886854, + "learning_rate": 0.00019759212479218393, + "loss": 1.1708, + "step": 2435 + }, + { + "epoch": 0.23, + "grad_norm": 0.26205638413878823, + "learning_rate": 0.0001975886729735714, + "loss": 1.1489, + "step": 2436 + }, + { + "epoch": 0.23, + "grad_norm": 0.26602482241211184, + "learning_rate": 0.00019758521871274107, + "loss": 1.1432, + "step": 2437 + }, + { + "epoch": 0.23, + "grad_norm": 0.2611287789955635, + "learning_rate": 0.00019758176200977928, + "loss": 1.0747, + "step": 2438 + }, + { + "epoch": 0.23, + "grad_norm": 0.2512098131069008, + "learning_rate": 0.00019757830286477258, + "loss": 1.1516, + "step": 2439 + }, + { + "epoch": 0.23, + "grad_norm": 0.300485183834668, + "learning_rate": 0.0001975748412778075, + "loss": 1.0355, + "step": 2440 + }, + { + "epoch": 0.23, + "grad_norm": 0.24888115485358228, + "learning_rate": 0.00019757137724897073, + "loss": 1.1842, + "step": 2441 + }, + { + "epoch": 0.23, + "grad_norm": 0.24322023911969542, + "learning_rate": 0.0001975679107783489, + "loss": 1.1434, + "step": 2442 + }, + { + "epoch": 0.23, + "grad_norm": 0.25278550537075395, + "learning_rate": 0.00019756444186602877, + "loss": 1.1414, + "step": 2443 + }, + { + "epoch": 0.23, + "grad_norm": 0.2629822500133049, + "learning_rate": 0.0001975609705120972, + "loss": 1.1144, + "step": 2444 + }, + { + "epoch": 0.23, + "grad_norm": 0.25330431801210734, + "learning_rate": 0.00019755749671664102, + "loss": 1.1006, + "step": 2445 + }, + { + "epoch": 0.23, + "grad_norm": 0.27277917321975415, + "learning_rate": 0.00019755402047974717, + "loss": 1.1218, + "step": 2446 + }, + { + "epoch": 0.23, + "grad_norm": 0.24954891050559894, + "learning_rate": 0.00019755054180150262, + "loss": 1.168, + "step": 2447 + }, + { + "epoch": 0.23, + "grad_norm": 0.24441850562430112, + "learning_rate": 0.00019754706068199446, + "loss": 0.9717, + "step": 2448 + }, + { + "epoch": 0.23, + "grad_norm": 0.26980275249400415, + "learning_rate": 0.00019754357712130984, + "loss": 1.0781, + "step": 2449 + }, + { + "epoch": 0.23, + "grad_norm": 0.2644012899065647, + "learning_rate": 0.00019754009111953586, + "loss": 1.2219, + "step": 2450 + }, + { + "epoch": 0.23, + "grad_norm": 0.28502445348284167, + "learning_rate": 0.00019753660267675982, + "loss": 1.1411, + "step": 2451 + }, + { + "epoch": 0.23, + "grad_norm": 0.24105312734962844, + "learning_rate": 0.000197533111793069, + "loss": 1.0395, + "step": 2452 + }, + { + "epoch": 0.23, + "grad_norm": 0.28402705301365877, + "learning_rate": 0.0001975296184685507, + "loss": 1.1438, + "step": 2453 + }, + { + "epoch": 0.23, + "grad_norm": 0.251590448441616, + "learning_rate": 0.00019752612270329247, + "loss": 1.0017, + "step": 2454 + }, + { + "epoch": 0.23, + "grad_norm": 0.27385681060181105, + "learning_rate": 0.0001975226244973817, + "loss": 1.2012, + "step": 2455 + }, + { + "epoch": 0.23, + "grad_norm": 0.24837745278204337, + "learning_rate": 0.000197519123850906, + "loss": 1.1111, + "step": 2456 + }, + { + "epoch": 0.24, + "grad_norm": 0.25263755603908267, + "learning_rate": 0.0001975156207639529, + "loss": 1.1382, + "step": 2457 + }, + { + "epoch": 0.24, + "grad_norm": 0.2913062612133686, + "learning_rate": 0.0001975121152366101, + "loss": 1.144, + "step": 2458 + }, + { + "epoch": 0.24, + "grad_norm": 0.279969123492799, + "learning_rate": 0.00019750860726896536, + "loss": 1.1385, + "step": 2459 + }, + { + "epoch": 0.24, + "grad_norm": 0.24114428531190943, + "learning_rate": 0.00019750509686110643, + "loss": 1.0758, + "step": 2460 + }, + { + "epoch": 0.24, + "grad_norm": 0.24865099555297668, + "learning_rate": 0.00019750158401312117, + "loss": 1.0137, + "step": 2461 + }, + { + "epoch": 0.24, + "grad_norm": 0.30049591909461865, + "learning_rate": 0.0001974980687250975, + "loss": 1.324, + "step": 2462 + }, + { + "epoch": 0.24, + "grad_norm": 0.2535091796023302, + "learning_rate": 0.00019749455099712332, + "loss": 1.1444, + "step": 2463 + }, + { + "epoch": 0.24, + "grad_norm": 0.24893191882319649, + "learning_rate": 0.00019749103082928682, + "loss": 0.8933, + "step": 2464 + }, + { + "epoch": 0.24, + "grad_norm": 0.2930991676695541, + "learning_rate": 0.00019748750822167594, + "loss": 1.0118, + "step": 2465 + }, + { + "epoch": 0.24, + "grad_norm": 0.2436760336989602, + "learning_rate": 0.00019748398317437894, + "loss": 1.0733, + "step": 2466 + }, + { + "epoch": 0.24, + "grad_norm": 0.2815078161913315, + "learning_rate": 0.00019748045568748396, + "loss": 1.1311, + "step": 2467 + }, + { + "epoch": 0.24, + "grad_norm": 0.2707645703704046, + "learning_rate": 0.00019747692576107935, + "loss": 1.1313, + "step": 2468 + }, + { + "epoch": 0.24, + "grad_norm": 0.2605533882651966, + "learning_rate": 0.00019747339339525337, + "loss": 1.0691, + "step": 2469 + }, + { + "epoch": 0.24, + "grad_norm": 0.25404802860541503, + "learning_rate": 0.00019746985859009448, + "loss": 1.1801, + "step": 2470 + }, + { + "epoch": 0.24, + "grad_norm": 0.2769596865608125, + "learning_rate": 0.00019746632134569114, + "loss": 1.0646, + "step": 2471 + }, + { + "epoch": 0.24, + "grad_norm": 0.28755610236991974, + "learning_rate": 0.0001974627816621318, + "loss": 1.0567, + "step": 2472 + }, + { + "epoch": 0.24, + "grad_norm": 0.2561327873358053, + "learning_rate": 0.00019745923953950516, + "loss": 1.1097, + "step": 2473 + }, + { + "epoch": 0.24, + "grad_norm": 0.27123359888401705, + "learning_rate": 0.00019745569497789975, + "loss": 1.0804, + "step": 2474 + }, + { + "epoch": 0.24, + "grad_norm": 0.3011360941789264, + "learning_rate": 0.00019745214797740437, + "loss": 0.9762, + "step": 2475 + }, + { + "epoch": 0.24, + "grad_norm": 0.23621703283724582, + "learning_rate": 0.00019744859853810772, + "loss": 1.2314, + "step": 2476 + }, + { + "epoch": 0.24, + "grad_norm": 0.2585948578333519, + "learning_rate": 0.00019744504666009864, + "loss": 1.1219, + "step": 2477 + }, + { + "epoch": 0.24, + "grad_norm": 0.23886114739760986, + "learning_rate": 0.00019744149234346604, + "loss": 1.0854, + "step": 2478 + }, + { + "epoch": 0.24, + "grad_norm": 0.27090142697493064, + "learning_rate": 0.00019743793558829885, + "loss": 1.1247, + "step": 2479 + }, + { + "epoch": 0.24, + "grad_norm": 0.26214502250978616, + "learning_rate": 0.00019743437639468606, + "loss": 1.0928, + "step": 2480 + }, + { + "epoch": 0.24, + "grad_norm": 0.26399673853102246, + "learning_rate": 0.00019743081476271675, + "loss": 1.229, + "step": 2481 + }, + { + "epoch": 0.24, + "grad_norm": 0.3147492977324059, + "learning_rate": 0.00019742725069248014, + "loss": 1.1473, + "step": 2482 + }, + { + "epoch": 0.24, + "grad_norm": 0.26237262208219775, + "learning_rate": 0.0001974236841840653, + "loss": 1.0796, + "step": 2483 + }, + { + "epoch": 0.24, + "grad_norm": 0.2556663951288371, + "learning_rate": 0.00019742011523756154, + "loss": 1.0103, + "step": 2484 + }, + { + "epoch": 0.24, + "grad_norm": 0.2780031870200213, + "learning_rate": 0.0001974165438530582, + "loss": 1.1362, + "step": 2485 + }, + { + "epoch": 0.24, + "grad_norm": 0.27186052495109936, + "learning_rate": 0.0001974129700306446, + "loss": 1.0304, + "step": 2486 + }, + { + "epoch": 0.24, + "grad_norm": 0.28105904722620484, + "learning_rate": 0.0001974093937704102, + "loss": 1.1844, + "step": 2487 + }, + { + "epoch": 0.24, + "grad_norm": 0.25863957400723964, + "learning_rate": 0.00019740581507244449, + "loss": 1.1119, + "step": 2488 + }, + { + "epoch": 0.24, + "grad_norm": 0.28394214465783835, + "learning_rate": 0.00019740223393683706, + "loss": 1.1119, + "step": 2489 + }, + { + "epoch": 0.24, + "grad_norm": 0.26485094517848373, + "learning_rate": 0.00019739865036367751, + "loss": 1.1412, + "step": 2490 + }, + { + "epoch": 0.24, + "grad_norm": 0.28470688440068453, + "learning_rate": 0.0001973950643530555, + "loss": 1.0705, + "step": 2491 + }, + { + "epoch": 0.24, + "grad_norm": 0.26331398514634713, + "learning_rate": 0.00019739147590506085, + "loss": 1.052, + "step": 2492 + }, + { + "epoch": 0.24, + "grad_norm": 0.25097699443135785, + "learning_rate": 0.00019738788501978325, + "loss": 1.0758, + "step": 2493 + }, + { + "epoch": 0.24, + "grad_norm": 0.24766896766086546, + "learning_rate": 0.00019738429169731262, + "loss": 1.0952, + "step": 2494 + }, + { + "epoch": 0.24, + "grad_norm": 0.28022175227433616, + "learning_rate": 0.00019738069593773893, + "loss": 1.0738, + "step": 2495 + }, + { + "epoch": 0.24, + "grad_norm": 0.2522932834715685, + "learning_rate": 0.0001973770977411521, + "loss": 1.1787, + "step": 2496 + }, + { + "epoch": 0.24, + "grad_norm": 0.2926767224083834, + "learning_rate": 0.0001973734971076422, + "loss": 1.1114, + "step": 2497 + }, + { + "epoch": 0.24, + "grad_norm": 0.2915842765013931, + "learning_rate": 0.00019736989403729935, + "loss": 1.0235, + "step": 2498 + }, + { + "epoch": 0.24, + "grad_norm": 0.2462394398490279, + "learning_rate": 0.0001973662885302137, + "loss": 1.0616, + "step": 2499 + }, + { + "epoch": 0.24, + "grad_norm": 0.2608600216455592, + "learning_rate": 0.00019736268058647547, + "loss": 1.2348, + "step": 2500 + }, + { + "epoch": 0.24, + "grad_norm": 0.27127694754468956, + "learning_rate": 0.000197359070206175, + "loss": 1.2243, + "step": 2501 + }, + { + "epoch": 0.24, + "grad_norm": 0.27181980704622394, + "learning_rate": 0.00019735545738940258, + "loss": 1.0685, + "step": 2502 + }, + { + "epoch": 0.24, + "grad_norm": 0.26463599414343286, + "learning_rate": 0.00019735184213624866, + "loss": 1.083, + "step": 2503 + }, + { + "epoch": 0.24, + "grad_norm": 0.2858932841784855, + "learning_rate": 0.00019734822444680372, + "loss": 1.1853, + "step": 2504 + }, + { + "epoch": 0.24, + "grad_norm": 0.29334417625161935, + "learning_rate": 0.00019734460432115826, + "loss": 1.149, + "step": 2505 + }, + { + "epoch": 0.24, + "grad_norm": 0.30013981243424426, + "learning_rate": 0.00019734098175940292, + "loss": 1.1454, + "step": 2506 + }, + { + "epoch": 0.24, + "grad_norm": 0.25388848360987426, + "learning_rate": 0.00019733735676162833, + "loss": 1.0328, + "step": 2507 + }, + { + "epoch": 0.24, + "grad_norm": 0.2911267465827057, + "learning_rate": 0.0001973337293279252, + "loss": 1.0848, + "step": 2508 + }, + { + "epoch": 0.24, + "grad_norm": 0.3184503213445064, + "learning_rate": 0.00019733009945838435, + "loss": 1.1812, + "step": 2509 + }, + { + "epoch": 0.24, + "grad_norm": 0.24860610501636035, + "learning_rate": 0.00019732646715309656, + "loss": 1.0944, + "step": 2510 + }, + { + "epoch": 0.24, + "grad_norm": 0.2523314374067835, + "learning_rate": 0.00019732283241215276, + "loss": 1.0563, + "step": 2511 + }, + { + "epoch": 0.24, + "grad_norm": 0.26493511638695577, + "learning_rate": 0.00019731919523564395, + "loss": 1.0797, + "step": 2512 + }, + { + "epoch": 0.24, + "grad_norm": 0.34257964691618786, + "learning_rate": 0.00019731555562366108, + "loss": 1.0442, + "step": 2513 + }, + { + "epoch": 0.24, + "grad_norm": 0.278969034097831, + "learning_rate": 0.0001973119135762953, + "loss": 1.085, + "step": 2514 + }, + { + "epoch": 0.24, + "grad_norm": 0.2821218023861396, + "learning_rate": 0.00019730826909363771, + "loss": 1.1072, + "step": 2515 + }, + { + "epoch": 0.24, + "grad_norm": 0.2550975254765698, + "learning_rate": 0.00019730462217577955, + "loss": 1.0796, + "step": 2516 + }, + { + "epoch": 0.24, + "grad_norm": 0.24932942154107635, + "learning_rate": 0.00019730097282281202, + "loss": 1.0744, + "step": 2517 + }, + { + "epoch": 0.24, + "grad_norm": 0.278741555046221, + "learning_rate": 0.00019729732103482652, + "loss": 1.2485, + "step": 2518 + }, + { + "epoch": 0.24, + "grad_norm": 0.28093405939217236, + "learning_rate": 0.0001972936668119144, + "loss": 1.107, + "step": 2519 + }, + { + "epoch": 0.24, + "grad_norm": 0.25703049143132695, + "learning_rate": 0.00019729001015416714, + "loss": 1.1391, + "step": 2520 + }, + { + "epoch": 0.24, + "grad_norm": 0.26010024870942694, + "learning_rate": 0.00019728635106167622, + "loss": 1.0808, + "step": 2521 + }, + { + "epoch": 0.24, + "grad_norm": 0.2640446473647651, + "learning_rate": 0.00019728268953453324, + "loss": 1.0537, + "step": 2522 + }, + { + "epoch": 0.24, + "grad_norm": 0.26949297336392625, + "learning_rate": 0.0001972790255728298, + "loss": 1.0032, + "step": 2523 + }, + { + "epoch": 0.24, + "grad_norm": 0.27078666739320834, + "learning_rate": 0.00019727535917665764, + "loss": 1.0726, + "step": 2524 + }, + { + "epoch": 0.24, + "grad_norm": 0.2270614158675183, + "learning_rate": 0.00019727169034610843, + "loss": 1.084, + "step": 2525 + }, + { + "epoch": 0.24, + "grad_norm": 0.2772909155669702, + "learning_rate": 0.00019726801908127403, + "loss": 1.0104, + "step": 2526 + }, + { + "epoch": 0.24, + "grad_norm": 0.26805528081435065, + "learning_rate": 0.00019726434538224638, + "loss": 1.0985, + "step": 2527 + }, + { + "epoch": 0.24, + "grad_norm": 0.3021364267615302, + "learning_rate": 0.00019726066924911732, + "loss": 1.1961, + "step": 2528 + }, + { + "epoch": 0.24, + "grad_norm": 0.2742459883930204, + "learning_rate": 0.0001972569906819789, + "loss": 0.8326, + "step": 2529 + }, + { + "epoch": 0.24, + "grad_norm": 0.2965953559002404, + "learning_rate": 0.00019725330968092315, + "loss": 1.107, + "step": 2530 + }, + { + "epoch": 0.24, + "grad_norm": 0.2684835136930391, + "learning_rate": 0.0001972496262460422, + "loss": 1.0751, + "step": 2531 + }, + { + "epoch": 0.24, + "grad_norm": 0.2718707210872007, + "learning_rate": 0.00019724594037742824, + "loss": 1.0515, + "step": 2532 + }, + { + "epoch": 0.24, + "grad_norm": 0.25725745750943513, + "learning_rate": 0.00019724225207517354, + "loss": 1.1485, + "step": 2533 + }, + { + "epoch": 0.24, + "grad_norm": 0.2890461500320697, + "learning_rate": 0.0001972385613393703, + "loss": 1.1419, + "step": 2534 + }, + { + "epoch": 0.24, + "grad_norm": 0.2440614465912653, + "learning_rate": 0.000197234868170111, + "loss": 1.0766, + "step": 2535 + }, + { + "epoch": 0.24, + "grad_norm": 0.2631890740774793, + "learning_rate": 0.00019723117256748802, + "loss": 1.0583, + "step": 2536 + }, + { + "epoch": 0.24, + "grad_norm": 0.2654794914552144, + "learning_rate": 0.0001972274745315938, + "loss": 1.103, + "step": 2537 + }, + { + "epoch": 0.24, + "grad_norm": 0.25772820159308557, + "learning_rate": 0.00019722377406252095, + "loss": 1.1091, + "step": 2538 + }, + { + "epoch": 0.24, + "grad_norm": 0.2759349478590902, + "learning_rate": 0.00019722007116036204, + "loss": 1.0997, + "step": 2539 + }, + { + "epoch": 0.24, + "grad_norm": 0.29182935407079413, + "learning_rate": 0.00019721636582520978, + "loss": 0.9975, + "step": 2540 + }, + { + "epoch": 0.24, + "grad_norm": 0.2706259485506027, + "learning_rate": 0.00019721265805715686, + "loss": 1.0848, + "step": 2541 + }, + { + "epoch": 0.24, + "grad_norm": 0.27778012967187665, + "learning_rate": 0.00019720894785629604, + "loss": 1.1421, + "step": 2542 + }, + { + "epoch": 0.24, + "grad_norm": 0.24224758551615272, + "learning_rate": 0.00019720523522272023, + "loss": 1.1094, + "step": 2543 + }, + { + "epoch": 0.24, + "grad_norm": 0.27897485799673416, + "learning_rate": 0.0001972015201565223, + "loss": 0.9736, + "step": 2544 + }, + { + "epoch": 0.24, + "grad_norm": 0.25755355735125685, + "learning_rate": 0.00019719780265779527, + "loss": 1.1512, + "step": 2545 + }, + { + "epoch": 0.24, + "grad_norm": 0.26839822619910775, + "learning_rate": 0.00019719408272663211, + "loss": 1.0875, + "step": 2546 + }, + { + "epoch": 0.24, + "grad_norm": 0.3230859730143896, + "learning_rate": 0.00019719036036312595, + "loss": 1.1185, + "step": 2547 + }, + { + "epoch": 0.24, + "grad_norm": 0.3000203530525595, + "learning_rate": 0.00019718663556736997, + "loss": 0.9699, + "step": 2548 + }, + { + "epoch": 0.24, + "grad_norm": 0.2673877046628615, + "learning_rate": 0.00019718290833945732, + "loss": 1.0584, + "step": 2549 + }, + { + "epoch": 0.24, + "grad_norm": 0.28638413313248523, + "learning_rate": 0.00019717917867948136, + "loss": 1.1215, + "step": 2550 + }, + { + "epoch": 0.24, + "grad_norm": 0.27932539137807955, + "learning_rate": 0.00019717544658753533, + "loss": 1.0187, + "step": 2551 + }, + { + "epoch": 0.24, + "grad_norm": 0.29577216947674934, + "learning_rate": 0.00019717171206371268, + "loss": 1.0782, + "step": 2552 + }, + { + "epoch": 0.24, + "grad_norm": 0.29132618922289466, + "learning_rate": 0.00019716797510810688, + "loss": 1.1716, + "step": 2553 + }, + { + "epoch": 0.24, + "grad_norm": 0.30978702723114215, + "learning_rate": 0.00019716423572081144, + "loss": 1.176, + "step": 2554 + }, + { + "epoch": 0.24, + "grad_norm": 0.26767419284947913, + "learning_rate": 0.0001971604939019199, + "loss": 0.9729, + "step": 2555 + }, + { + "epoch": 0.24, + "grad_norm": 0.24950178862061026, + "learning_rate": 0.000197156749651526, + "loss": 1.1032, + "step": 2556 + }, + { + "epoch": 0.24, + "grad_norm": 0.2676153474407372, + "learning_rate": 0.00019715300296972333, + "loss": 1.0011, + "step": 2557 + }, + { + "epoch": 0.24, + "grad_norm": 0.24233870876913635, + "learning_rate": 0.00019714925385660572, + "loss": 1.0203, + "step": 2558 + }, + { + "epoch": 0.24, + "grad_norm": 0.26922786191679665, + "learning_rate": 0.00019714550231226697, + "loss": 1.0598, + "step": 2559 + }, + { + "epoch": 0.24, + "grad_norm": 0.25818023471817164, + "learning_rate": 0.000197141748336801, + "loss": 1.1347, + "step": 2560 + }, + { + "epoch": 0.25, + "grad_norm": 0.2618807690441127, + "learning_rate": 0.00019713799193030166, + "loss": 1.1211, + "step": 2561 + }, + { + "epoch": 0.25, + "grad_norm": 0.29741704565126115, + "learning_rate": 0.00019713423309286309, + "loss": 1.0597, + "step": 2562 + }, + { + "epoch": 0.25, + "grad_norm": 0.28884125218560003, + "learning_rate": 0.00019713047182457928, + "loss": 1.0358, + "step": 2563 + }, + { + "epoch": 0.25, + "grad_norm": 0.26649025648335334, + "learning_rate": 0.00019712670812554434, + "loss": 1.162, + "step": 2564 + }, + { + "epoch": 0.25, + "grad_norm": 0.2947735363660406, + "learning_rate": 0.00019712294199585248, + "loss": 1.0586, + "step": 2565 + }, + { + "epoch": 0.25, + "grad_norm": 0.25677075754484635, + "learning_rate": 0.000197119173435598, + "loss": 1.1841, + "step": 2566 + }, + { + "epoch": 0.25, + "grad_norm": 0.2637205544668129, + "learning_rate": 0.00019711540244487515, + "loss": 1.0261, + "step": 2567 + }, + { + "epoch": 0.25, + "grad_norm": 0.24823082695362084, + "learning_rate": 0.0001971116290237783, + "loss": 1.0277, + "step": 2568 + }, + { + "epoch": 0.25, + "grad_norm": 0.25229952638255043, + "learning_rate": 0.0001971078531724019, + "loss": 1.0751, + "step": 2569 + }, + { + "epoch": 0.25, + "grad_norm": 0.30254391196407815, + "learning_rate": 0.00019710407489084047, + "loss": 1.1231, + "step": 2570 + }, + { + "epoch": 0.25, + "grad_norm": 0.3021948667516261, + "learning_rate": 0.00019710029417918854, + "loss": 1.0033, + "step": 2571 + }, + { + "epoch": 0.25, + "grad_norm": 0.23588733965722716, + "learning_rate": 0.00019709651103754067, + "loss": 1.1007, + "step": 2572 + }, + { + "epoch": 0.25, + "grad_norm": 0.24382613129524622, + "learning_rate": 0.00019709272546599164, + "loss": 1.0672, + "step": 2573 + }, + { + "epoch": 0.25, + "grad_norm": 0.2733353690652961, + "learning_rate": 0.00019708893746463613, + "loss": 1.1596, + "step": 2574 + }, + { + "epoch": 0.25, + "grad_norm": 0.26912681562116336, + "learning_rate": 0.00019708514703356894, + "loss": 1.1428, + "step": 2575 + }, + { + "epoch": 0.25, + "grad_norm": 0.2956110006161434, + "learning_rate": 0.00019708135417288491, + "loss": 1.0804, + "step": 2576 + }, + { + "epoch": 0.25, + "grad_norm": 0.2630689064650718, + "learning_rate": 0.000197077558882679, + "loss": 1.0586, + "step": 2577 + }, + { + "epoch": 0.25, + "grad_norm": 0.2689950863610693, + "learning_rate": 0.00019707376116304617, + "loss": 1.1105, + "step": 2578 + }, + { + "epoch": 0.25, + "grad_norm": 0.2910234057476756, + "learning_rate": 0.00019706996101408146, + "loss": 1.0988, + "step": 2579 + }, + { + "epoch": 0.25, + "grad_norm": 0.27854677846223636, + "learning_rate": 0.00019706615843587995, + "loss": 1.1806, + "step": 2580 + }, + { + "epoch": 0.25, + "grad_norm": 0.24191714015622726, + "learning_rate": 0.00019706235342853683, + "loss": 1.0358, + "step": 2581 + }, + { + "epoch": 0.25, + "grad_norm": 0.2654110236015743, + "learning_rate": 0.00019705854599214734, + "loss": 1.1535, + "step": 2582 + }, + { + "epoch": 0.25, + "grad_norm": 0.24454579378976135, + "learning_rate": 0.0001970547361268067, + "loss": 1.0781, + "step": 2583 + }, + { + "epoch": 0.25, + "grad_norm": 0.24819542572155914, + "learning_rate": 0.00019705092383261028, + "loss": 1.1531, + "step": 2584 + }, + { + "epoch": 0.25, + "grad_norm": 0.31712338038836446, + "learning_rate": 0.00019704710910965352, + "loss": 1.2182, + "step": 2585 + }, + { + "epoch": 0.25, + "grad_norm": 0.2770504057135462, + "learning_rate": 0.00019704329195803188, + "loss": 0.9587, + "step": 2586 + }, + { + "epoch": 0.25, + "grad_norm": 0.2381187276476534, + "learning_rate": 0.00019703947237784087, + "loss": 0.9862, + "step": 2587 + }, + { + "epoch": 0.25, + "grad_norm": 0.29302550861588544, + "learning_rate": 0.00019703565036917605, + "loss": 1.0806, + "step": 2588 + }, + { + "epoch": 0.25, + "grad_norm": 0.26005985628816053, + "learning_rate": 0.0001970318259321331, + "loss": 0.9799, + "step": 2589 + }, + { + "epoch": 0.25, + "grad_norm": 0.27882886267706336, + "learning_rate": 0.0001970279990668077, + "loss": 1.1686, + "step": 2590 + }, + { + "epoch": 0.25, + "grad_norm": 0.26205774855080516, + "learning_rate": 0.0001970241697732957, + "loss": 1.1234, + "step": 2591 + }, + { + "epoch": 0.25, + "grad_norm": 0.26668517374001244, + "learning_rate": 0.00019702033805169285, + "loss": 1.0884, + "step": 2592 + }, + { + "epoch": 0.25, + "grad_norm": 0.25715862286863395, + "learning_rate": 0.00019701650390209504, + "loss": 1.2244, + "step": 2593 + }, + { + "epoch": 0.25, + "grad_norm": 0.3058673752020222, + "learning_rate": 0.00019701266732459827, + "loss": 1.0458, + "step": 2594 + }, + { + "epoch": 0.25, + "grad_norm": 0.2529497741528999, + "learning_rate": 0.00019700882831929852, + "loss": 1.3244, + "step": 2595 + }, + { + "epoch": 0.25, + "grad_norm": 0.2943742754766903, + "learning_rate": 0.0001970049868862919, + "loss": 1.0521, + "step": 2596 + }, + { + "epoch": 0.25, + "grad_norm": 0.25522996903552037, + "learning_rate": 0.0001970011430256745, + "loss": 1.0895, + "step": 2597 + }, + { + "epoch": 0.25, + "grad_norm": 0.2671836556503752, + "learning_rate": 0.00019699729673754255, + "loss": 1.1154, + "step": 2598 + }, + { + "epoch": 0.25, + "grad_norm": 0.3007590432393228, + "learning_rate": 0.00019699344802199224, + "loss": 1.2871, + "step": 2599 + }, + { + "epoch": 0.25, + "grad_norm": 0.2991338944551984, + "learning_rate": 0.00019698959687911998, + "loss": 1.0738, + "step": 2600 + }, + { + "epoch": 0.25, + "grad_norm": 0.26091410208452626, + "learning_rate": 0.00019698574330902208, + "loss": 1.2443, + "step": 2601 + }, + { + "epoch": 0.25, + "grad_norm": 0.27563048614294183, + "learning_rate": 0.00019698188731179502, + "loss": 1.0806, + "step": 2602 + }, + { + "epoch": 0.25, + "grad_norm": 0.26585816626226483, + "learning_rate": 0.00019697802888753526, + "loss": 0.9916, + "step": 2603 + }, + { + "epoch": 0.25, + "grad_norm": 0.27522846970536974, + "learning_rate": 0.0001969741680363394, + "loss": 1.0998, + "step": 2604 + }, + { + "epoch": 0.25, + "grad_norm": 0.2602657313030757, + "learning_rate": 0.00019697030475830402, + "loss": 1.0419, + "step": 2605 + }, + { + "epoch": 0.25, + "grad_norm": 0.2453226312120246, + "learning_rate": 0.00019696643905352582, + "loss": 0.9631, + "step": 2606 + }, + { + "epoch": 0.25, + "grad_norm": 0.29471825123420187, + "learning_rate": 0.00019696257092210155, + "loss": 1.0666, + "step": 2607 + }, + { + "epoch": 0.25, + "grad_norm": 0.2902443561039826, + "learning_rate": 0.000196958700364128, + "loss": 1.0988, + "step": 2608 + }, + { + "epoch": 0.25, + "grad_norm": 0.2538912185129694, + "learning_rate": 0.00019695482737970202, + "loss": 1.1586, + "step": 2609 + }, + { + "epoch": 0.25, + "grad_norm": 0.28579871153918274, + "learning_rate": 0.0001969509519689206, + "loss": 1.0839, + "step": 2610 + }, + { + "epoch": 0.25, + "grad_norm": 0.23553918792552284, + "learning_rate": 0.00019694707413188062, + "loss": 1.0165, + "step": 2611 + }, + { + "epoch": 0.25, + "grad_norm": 0.28381053637003834, + "learning_rate": 0.0001969431938686792, + "loss": 1.0899, + "step": 2612 + }, + { + "epoch": 0.25, + "grad_norm": 0.2605318408738508, + "learning_rate": 0.00019693931117941346, + "loss": 1.1033, + "step": 2613 + }, + { + "epoch": 0.25, + "eval_loss": 1.1338403224945068, + "eval_runtime": 4230.6339, + "eval_samples_per_second": 19.765, + "eval_steps_per_second": 2.471, + "step": 2613 + }, + { + "epoch": 0.25, + "grad_norm": 0.255252333620048, + "learning_rate": 0.00019693542606418052, + "loss": 1.1488, + "step": 2614 + }, + { + "epoch": 0.25, + "grad_norm": 0.30633153587103285, + "learning_rate": 0.00019693153852307757, + "loss": 1.0757, + "step": 2615 + }, + { + "epoch": 0.25, + "grad_norm": 0.27505703870757664, + "learning_rate": 0.000196927648556202, + "loss": 1.1299, + "step": 2616 + }, + { + "epoch": 0.25, + "grad_norm": 0.2713935955775416, + "learning_rate": 0.00019692375616365112, + "loss": 1.0189, + "step": 2617 + }, + { + "epoch": 0.25, + "grad_norm": 0.2684321316986978, + "learning_rate": 0.00019691986134552227, + "loss": 1.1241, + "step": 2618 + }, + { + "epoch": 0.25, + "grad_norm": 0.25621412932428106, + "learning_rate": 0.00019691596410191303, + "loss": 1.0671, + "step": 2619 + }, + { + "epoch": 0.25, + "grad_norm": 0.26289530369330816, + "learning_rate": 0.00019691206443292085, + "loss": 0.9654, + "step": 2620 + }, + { + "epoch": 0.25, + "grad_norm": 0.2650174904695066, + "learning_rate": 0.00019690816233864337, + "loss": 1.0996, + "step": 2621 + }, + { + "epoch": 0.25, + "grad_norm": 0.2575327507563281, + "learning_rate": 0.0001969042578191782, + "loss": 1.1166, + "step": 2622 + }, + { + "epoch": 0.25, + "grad_norm": 0.26454046915240054, + "learning_rate": 0.00019690035087462307, + "loss": 0.9949, + "step": 2623 + }, + { + "epoch": 0.25, + "grad_norm": 0.2805971262563047, + "learning_rate": 0.0001968964415050758, + "loss": 1.0859, + "step": 2624 + }, + { + "epoch": 0.25, + "grad_norm": 0.21973703426240196, + "learning_rate": 0.00019689252971063416, + "loss": 1.1679, + "step": 2625 + }, + { + "epoch": 0.25, + "grad_norm": 0.3139171336564937, + "learning_rate": 0.00019688861549139607, + "loss": 1.2573, + "step": 2626 + }, + { + "epoch": 0.25, + "grad_norm": 0.3076875510868648, + "learning_rate": 0.0001968846988474595, + "loss": 1.1237, + "step": 2627 + }, + { + "epoch": 0.25, + "grad_norm": 0.28812414941928133, + "learning_rate": 0.00019688077977892245, + "loss": 1.0934, + "step": 2628 + }, + { + "epoch": 0.25, + "grad_norm": 0.31182412232961415, + "learning_rate": 0.00019687685828588297, + "loss": 1.0891, + "step": 2629 + }, + { + "epoch": 0.25, + "grad_norm": 0.2625520463054526, + "learning_rate": 0.00019687293436843926, + "loss": 1.1112, + "step": 2630 + }, + { + "epoch": 0.25, + "grad_norm": 0.25837520805297015, + "learning_rate": 0.00019686900802668946, + "loss": 0.9673, + "step": 2631 + }, + { + "epoch": 0.25, + "grad_norm": 0.27460647995152365, + "learning_rate": 0.00019686507926073188, + "loss": 1.1495, + "step": 2632 + }, + { + "epoch": 0.25, + "grad_norm": 0.26861650854795094, + "learning_rate": 0.00019686114807066478, + "loss": 1.0432, + "step": 2633 + }, + { + "epoch": 0.25, + "grad_norm": 0.28385341495610805, + "learning_rate": 0.0001968572144565866, + "loss": 1.1461, + "step": 2634 + }, + { + "epoch": 0.25, + "grad_norm": 0.271570918418974, + "learning_rate": 0.00019685327841859572, + "loss": 1.185, + "step": 2635 + }, + { + "epoch": 0.25, + "grad_norm": 0.2673269371926348, + "learning_rate": 0.00019684933995679074, + "loss": 1.1282, + "step": 2636 + }, + { + "epoch": 0.25, + "grad_norm": 0.2403632191882647, + "learning_rate": 0.0001968453990712701, + "loss": 1.0608, + "step": 2637 + }, + { + "epoch": 0.25, + "grad_norm": 0.2586849785633008, + "learning_rate": 0.00019684145576213252, + "loss": 1.1015, + "step": 2638 + }, + { + "epoch": 0.25, + "grad_norm": 0.2649189546870879, + "learning_rate": 0.00019683751002947663, + "loss": 1.1428, + "step": 2639 + }, + { + "epoch": 0.25, + "grad_norm": 0.2524600423422356, + "learning_rate": 0.0001968335618734012, + "loss": 1.0497, + "step": 2640 + }, + { + "epoch": 0.25, + "grad_norm": 0.25016994107064466, + "learning_rate": 0.00019682961129400503, + "loss": 1.1158, + "step": 2641 + }, + { + "epoch": 0.25, + "grad_norm": 0.26421571146347217, + "learning_rate": 0.000196825658291387, + "loss": 1.0628, + "step": 2642 + }, + { + "epoch": 0.25, + "grad_norm": 0.282173839472887, + "learning_rate": 0.00019682170286564597, + "loss": 0.998, + "step": 2643 + }, + { + "epoch": 0.25, + "grad_norm": 0.2323812037769926, + "learning_rate": 0.00019681774501688102, + "loss": 1.0137, + "step": 2644 + }, + { + "epoch": 0.25, + "grad_norm": 0.2901381801505397, + "learning_rate": 0.0001968137847451911, + "loss": 1.0866, + "step": 2645 + }, + { + "epoch": 0.25, + "grad_norm": 0.26576724197777407, + "learning_rate": 0.0001968098220506754, + "loss": 1.0927, + "step": 2646 + }, + { + "epoch": 0.25, + "grad_norm": 0.2696230704622509, + "learning_rate": 0.0001968058569334331, + "loss": 1.1894, + "step": 2647 + }, + { + "epoch": 0.25, + "grad_norm": 0.26017036741055427, + "learning_rate": 0.00019680188939356336, + "loss": 0.9647, + "step": 2648 + }, + { + "epoch": 0.25, + "grad_norm": 0.2677987402758377, + "learning_rate": 0.0001967979194311655, + "loss": 1.0862, + "step": 2649 + }, + { + "epoch": 0.25, + "grad_norm": 0.28711631837028534, + "learning_rate": 0.00019679394704633888, + "loss": 1.2582, + "step": 2650 + }, + { + "epoch": 0.25, + "grad_norm": 0.26765916334917605, + "learning_rate": 0.00019678997223918288, + "loss": 1.0659, + "step": 2651 + }, + { + "epoch": 0.25, + "grad_norm": 0.26777059535455733, + "learning_rate": 0.000196785995009797, + "loss": 1.1978, + "step": 2652 + }, + { + "epoch": 0.25, + "grad_norm": 0.22518302167201712, + "learning_rate": 0.00019678201535828076, + "loss": 1.0836, + "step": 2653 + }, + { + "epoch": 0.25, + "grad_norm": 0.2639486825941676, + "learning_rate": 0.00019677803328473377, + "loss": 1.1509, + "step": 2654 + }, + { + "epoch": 0.25, + "grad_norm": 0.23538041430872067, + "learning_rate": 0.00019677404878925566, + "loss": 0.9895, + "step": 2655 + }, + { + "epoch": 0.25, + "grad_norm": 0.295994436523556, + "learning_rate": 0.00019677006187194618, + "loss": 1.0979, + "step": 2656 + }, + { + "epoch": 0.25, + "grad_norm": 0.28307792530010506, + "learning_rate": 0.00019676607253290508, + "loss": 1.0768, + "step": 2657 + }, + { + "epoch": 0.25, + "grad_norm": 0.2761519436955159, + "learning_rate": 0.0001967620807722322, + "loss": 1.0224, + "step": 2658 + }, + { + "epoch": 0.25, + "grad_norm": 0.2879025567486724, + "learning_rate": 0.00019675808659002744, + "loss": 1.1083, + "step": 2659 + }, + { + "epoch": 0.25, + "grad_norm": 0.3004898697546929, + "learning_rate": 0.00019675408998639076, + "loss": 1.1356, + "step": 2660 + }, + { + "epoch": 0.25, + "grad_norm": 0.31015042588546826, + "learning_rate": 0.00019675009096142214, + "loss": 1.1486, + "step": 2661 + }, + { + "epoch": 0.25, + "grad_norm": 0.2811558967690104, + "learning_rate": 0.0001967460895152217, + "loss": 1.0928, + "step": 2662 + }, + { + "epoch": 0.25, + "grad_norm": 0.2633365965205071, + "learning_rate": 0.00019674208564788957, + "loss": 1.1153, + "step": 2663 + }, + { + "epoch": 0.25, + "grad_norm": 0.2860499750304183, + "learning_rate": 0.00019673807935952596, + "loss": 1.1082, + "step": 2664 + }, + { + "epoch": 0.25, + "grad_norm": 0.29758088608843136, + "learning_rate": 0.0001967340706502311, + "loss": 1.2912, + "step": 2665 + }, + { + "epoch": 0.26, + "grad_norm": 0.2568418533290371, + "learning_rate": 0.00019673005952010534, + "loss": 1.1485, + "step": 2666 + }, + { + "epoch": 0.26, + "grad_norm": 0.26096215810194917, + "learning_rate": 0.00019672604596924904, + "loss": 1.0748, + "step": 2667 + }, + { + "epoch": 0.26, + "grad_norm": 0.28867342954233294, + "learning_rate": 0.00019672202999776266, + "loss": 1.0794, + "step": 2668 + }, + { + "epoch": 0.26, + "grad_norm": 0.21142091669081053, + "learning_rate": 0.0001967180116057467, + "loss": 1.0324, + "step": 2669 + }, + { + "epoch": 0.26, + "grad_norm": 0.2592661813230898, + "learning_rate": 0.00019671399079330168, + "loss": 1.1463, + "step": 2670 + }, + { + "epoch": 0.26, + "grad_norm": 0.23960804899506707, + "learning_rate": 0.00019670996756052827, + "loss": 1.0908, + "step": 2671 + }, + { + "epoch": 0.26, + "grad_norm": 0.24684475939341896, + "learning_rate": 0.00019670594190752713, + "loss": 1.036, + "step": 2672 + }, + { + "epoch": 0.26, + "grad_norm": 0.28350271509951525, + "learning_rate": 0.00019670191383439907, + "loss": 1.0251, + "step": 2673 + }, + { + "epoch": 0.26, + "grad_norm": 0.5535529462100653, + "learning_rate": 0.00019669788334124476, + "loss": 1.052, + "step": 2674 + }, + { + "epoch": 0.26, + "grad_norm": 0.3077728064262205, + "learning_rate": 0.0001966938504281652, + "loss": 1.116, + "step": 2675 + }, + { + "epoch": 0.26, + "grad_norm": 0.2656350026116821, + "learning_rate": 0.00019668981509526128, + "loss": 1.2018, + "step": 2676 + }, + { + "epoch": 0.26, + "grad_norm": 0.2479927609636305, + "learning_rate": 0.00019668577734263394, + "loss": 1.104, + "step": 2677 + }, + { + "epoch": 0.26, + "grad_norm": 0.2777215172332179, + "learning_rate": 0.00019668173717038426, + "loss": 1.1844, + "step": 2678 + }, + { + "epoch": 0.26, + "grad_norm": 0.2829530059947927, + "learning_rate": 0.00019667769457861335, + "loss": 1.0842, + "step": 2679 + }, + { + "epoch": 0.26, + "grad_norm": 0.2550816083760124, + "learning_rate": 0.00019667364956742236, + "loss": 1.1394, + "step": 2680 + }, + { + "epoch": 0.26, + "grad_norm": 0.2753775575550518, + "learning_rate": 0.00019666960213691255, + "loss": 1.1293, + "step": 2681 + }, + { + "epoch": 0.26, + "grad_norm": 0.2767717867669593, + "learning_rate": 0.0001966655522871852, + "loss": 1.0783, + "step": 2682 + }, + { + "epoch": 0.26, + "grad_norm": 0.26993060617287373, + "learning_rate": 0.00019666150001834164, + "loss": 1.0235, + "step": 2683 + }, + { + "epoch": 0.26, + "grad_norm": 0.2870195128020576, + "learning_rate": 0.00019665744533048328, + "loss": 1.0422, + "step": 2684 + }, + { + "epoch": 0.26, + "grad_norm": 0.27640315301309837, + "learning_rate": 0.0001966533882237116, + "loss": 1.1165, + "step": 2685 + }, + { + "epoch": 0.26, + "grad_norm": 0.2694095090611699, + "learning_rate": 0.00019664932869812814, + "loss": 1.0239, + "step": 2686 + }, + { + "epoch": 0.26, + "grad_norm": 0.32106995191546245, + "learning_rate": 0.0001966452667538345, + "loss": 1.1492, + "step": 2687 + }, + { + "epoch": 0.26, + "grad_norm": 0.27424915699699154, + "learning_rate": 0.00019664120239093233, + "loss": 1.1685, + "step": 2688 + }, + { + "epoch": 0.26, + "grad_norm": 0.29135122016294623, + "learning_rate": 0.0001966371356095233, + "loss": 1.1547, + "step": 2689 + }, + { + "epoch": 0.26, + "grad_norm": 0.26527031382403177, + "learning_rate": 0.00019663306640970926, + "loss": 1.1219, + "step": 2690 + }, + { + "epoch": 0.26, + "grad_norm": 0.27127353395064424, + "learning_rate": 0.00019662899479159197, + "loss": 0.9899, + "step": 2691 + }, + { + "epoch": 0.26, + "grad_norm": 0.3033029254082761, + "learning_rate": 0.00019662492075527336, + "loss": 1.1249, + "step": 2692 + }, + { + "epoch": 0.26, + "grad_norm": 0.26657961186508666, + "learning_rate": 0.00019662084430085538, + "loss": 1.1842, + "step": 2693 + }, + { + "epoch": 0.26, + "grad_norm": 0.28599030101340955, + "learning_rate": 0.00019661676542844007, + "loss": 1.1839, + "step": 2694 + }, + { + "epoch": 0.26, + "grad_norm": 0.30087762030890813, + "learning_rate": 0.00019661268413812946, + "loss": 1.0863, + "step": 2695 + }, + { + "epoch": 0.26, + "grad_norm": 0.325088281089176, + "learning_rate": 0.00019660860043002574, + "loss": 1.049, + "step": 2696 + }, + { + "epoch": 0.26, + "grad_norm": 0.2828097114488758, + "learning_rate": 0.00019660451430423103, + "loss": 1.1857, + "step": 2697 + }, + { + "epoch": 0.26, + "grad_norm": 0.2701302351874178, + "learning_rate": 0.00019660042576084767, + "loss": 1.1144, + "step": 2698 + }, + { + "epoch": 0.26, + "grad_norm": 0.2835386842245424, + "learning_rate": 0.00019659633479997794, + "loss": 1.1066, + "step": 2699 + }, + { + "epoch": 0.26, + "grad_norm": 0.243379840423929, + "learning_rate": 0.00019659224142172424, + "loss": 1.0204, + "step": 2700 + }, + { + "epoch": 0.26, + "grad_norm": 0.24266059096127535, + "learning_rate": 0.00019658814562618896, + "loss": 1.1088, + "step": 2701 + }, + { + "epoch": 0.26, + "grad_norm": 0.2617930016670921, + "learning_rate": 0.00019658404741347462, + "loss": 1.1113, + "step": 2702 + }, + { + "epoch": 0.26, + "grad_norm": 0.2547097198503651, + "learning_rate": 0.00019657994678368385, + "loss": 1.0444, + "step": 2703 + }, + { + "epoch": 0.26, + "grad_norm": 0.2625250549016416, + "learning_rate": 0.00019657584373691917, + "loss": 1.1396, + "step": 2704 + }, + { + "epoch": 0.26, + "grad_norm": 0.26689206957202594, + "learning_rate": 0.0001965717382732833, + "loss": 1.1596, + "step": 2705 + }, + { + "epoch": 0.26, + "grad_norm": 0.2903595268276037, + "learning_rate": 0.000196567630392879, + "loss": 1.0474, + "step": 2706 + }, + { + "epoch": 0.26, + "grad_norm": 0.2572495572058499, + "learning_rate": 0.00019656352009580908, + "loss": 1.0761, + "step": 2707 + }, + { + "epoch": 0.26, + "grad_norm": 0.2470130498530773, + "learning_rate": 0.00019655940738217635, + "loss": 1.0546, + "step": 2708 + }, + { + "epoch": 0.26, + "grad_norm": 0.2930008567798905, + "learning_rate": 0.00019655529225208378, + "loss": 1.1107, + "step": 2709 + }, + { + "epoch": 0.26, + "grad_norm": 0.25989997031388734, + "learning_rate": 0.00019655117470563434, + "loss": 1.0839, + "step": 2710 + }, + { + "epoch": 0.26, + "grad_norm": 0.30098329989662703, + "learning_rate": 0.00019654705474293107, + "loss": 1.1056, + "step": 2711 + }, + { + "epoch": 0.26, + "grad_norm": 0.2424595664908627, + "learning_rate": 0.00019654293236407707, + "loss": 1.0923, + "step": 2712 + }, + { + "epoch": 0.26, + "grad_norm": 0.28519149075066125, + "learning_rate": 0.00019653880756917552, + "loss": 1.0625, + "step": 2713 + }, + { + "epoch": 0.26, + "grad_norm": 0.31144165591888, + "learning_rate": 0.00019653468035832965, + "loss": 1.2098, + "step": 2714 + }, + { + "epoch": 0.26, + "grad_norm": 0.2714581157840239, + "learning_rate": 0.0001965305507316427, + "loss": 1.092, + "step": 2715 + }, + { + "epoch": 0.26, + "grad_norm": 0.28440142688999087, + "learning_rate": 0.0001965264186892181, + "loss": 1.0165, + "step": 2716 + }, + { + "epoch": 0.26, + "grad_norm": 0.24262222631275346, + "learning_rate": 0.00019652228423115917, + "loss": 1.0364, + "step": 2717 + }, + { + "epoch": 0.26, + "grad_norm": 0.2743500722654148, + "learning_rate": 0.00019651814735756942, + "loss": 1.0864, + "step": 2718 + }, + { + "epoch": 0.26, + "grad_norm": 0.28526861621648486, + "learning_rate": 0.00019651400806855237, + "loss": 1.1124, + "step": 2719 + }, + { + "epoch": 0.26, + "grad_norm": 0.31205779575841586, + "learning_rate": 0.00019650986636421164, + "loss": 1.102, + "step": 2720 + }, + { + "epoch": 0.26, + "grad_norm": 0.2661120177899195, + "learning_rate": 0.00019650572224465084, + "loss": 1.1081, + "step": 2721 + }, + { + "epoch": 0.26, + "grad_norm": 0.29878434646252183, + "learning_rate": 0.00019650157570997364, + "loss": 1.1004, + "step": 2722 + }, + { + "epoch": 0.26, + "grad_norm": 0.2612778406780732, + "learning_rate": 0.00019649742676028394, + "loss": 1.2379, + "step": 2723 + }, + { + "epoch": 0.26, + "grad_norm": 0.2933566914172725, + "learning_rate": 0.00019649327539568543, + "loss": 1.162, + "step": 2724 + }, + { + "epoch": 0.26, + "grad_norm": 0.24094136996286655, + "learning_rate": 0.0001964891216162821, + "loss": 1.0807, + "step": 2725 + }, + { + "epoch": 0.26, + "grad_norm": 0.2943459316325484, + "learning_rate": 0.00019648496542217783, + "loss": 0.998, + "step": 2726 + }, + { + "epoch": 0.26, + "grad_norm": 0.22961741180973028, + "learning_rate": 0.00019648080681347664, + "loss": 1.1305, + "step": 2727 + }, + { + "epoch": 0.26, + "grad_norm": 0.2701859846638287, + "learning_rate": 0.00019647664579028267, + "loss": 1.0889, + "step": 2728 + }, + { + "epoch": 0.26, + "grad_norm": 0.26857093567006685, + "learning_rate": 0.0001964724823527, + "loss": 1.2013, + "step": 2729 + }, + { + "epoch": 0.26, + "grad_norm": 0.31949382152840944, + "learning_rate": 0.0001964683165008328, + "loss": 1.1718, + "step": 2730 + }, + { + "epoch": 0.26, + "grad_norm": 0.28367090867227224, + "learning_rate": 0.00019646414823478535, + "loss": 1.1491, + "step": 2731 + }, + { + "epoch": 0.26, + "grad_norm": 0.2504701281061607, + "learning_rate": 0.000196459977554662, + "loss": 1.1017, + "step": 2732 + }, + { + "epoch": 0.26, + "grad_norm": 0.25221421853430986, + "learning_rate": 0.00019645580446056706, + "loss": 1.1185, + "step": 2733 + }, + { + "epoch": 0.26, + "grad_norm": 0.28143888782957394, + "learning_rate": 0.000196451628952605, + "loss": 1.1759, + "step": 2734 + }, + { + "epoch": 0.26, + "grad_norm": 0.2578191227270372, + "learning_rate": 0.00019644745103088033, + "loss": 1.1787, + "step": 2735 + }, + { + "epoch": 0.26, + "grad_norm": 0.26056470647250435, + "learning_rate": 0.00019644327069549754, + "loss": 1.1649, + "step": 2736 + }, + { + "epoch": 0.26, + "grad_norm": 0.27176880852893454, + "learning_rate": 0.00019643908794656135, + "loss": 1.1057, + "step": 2737 + }, + { + "epoch": 0.26, + "grad_norm": 0.2882235242635738, + "learning_rate": 0.00019643490278417632, + "loss": 1.1081, + "step": 2738 + }, + { + "epoch": 0.26, + "grad_norm": 0.27263458978440536, + "learning_rate": 0.00019643071520844725, + "loss": 1.0712, + "step": 2739 + }, + { + "epoch": 0.26, + "grad_norm": 0.25866965428198324, + "learning_rate": 0.00019642652521947894, + "loss": 1.1159, + "step": 2740 + }, + { + "epoch": 0.26, + "grad_norm": 0.2630902327301495, + "learning_rate": 0.00019642233281737625, + "loss": 1.0603, + "step": 2741 + }, + { + "epoch": 0.26, + "grad_norm": 0.25820714973330133, + "learning_rate": 0.00019641813800224406, + "loss": 0.9922, + "step": 2742 + }, + { + "epoch": 0.26, + "grad_norm": 0.2724772043679332, + "learning_rate": 0.00019641394077418736, + "loss": 1.0461, + "step": 2743 + }, + { + "epoch": 0.26, + "grad_norm": 0.27088818855216434, + "learning_rate": 0.00019640974113331123, + "loss": 1.1959, + "step": 2744 + }, + { + "epoch": 0.26, + "grad_norm": 0.2733607733632018, + "learning_rate": 0.00019640553907972072, + "loss": 1.1886, + "step": 2745 + }, + { + "epoch": 0.26, + "grad_norm": 0.24676149904011196, + "learning_rate": 0.000196401334613521, + "loss": 1.0969, + "step": 2746 + }, + { + "epoch": 0.26, + "grad_norm": 0.27834584403561474, + "learning_rate": 0.00019639712773481728, + "loss": 1.1597, + "step": 2747 + }, + { + "epoch": 0.26, + "grad_norm": 0.25755736145056124, + "learning_rate": 0.0001963929184437149, + "loss": 0.9978, + "step": 2748 + }, + { + "epoch": 0.26, + "grad_norm": 0.31079398376305944, + "learning_rate": 0.00019638870674031913, + "loss": 1.1268, + "step": 2749 + }, + { + "epoch": 0.26, + "grad_norm": 0.2774704968716273, + "learning_rate": 0.0001963844926247354, + "loss": 1.1641, + "step": 2750 + }, + { + "epoch": 0.26, + "grad_norm": 0.2781649290866802, + "learning_rate": 0.00019638027609706916, + "loss": 1.0707, + "step": 2751 + }, + { + "epoch": 0.26, + "grad_norm": 0.2552137884918176, + "learning_rate": 0.00019637605715742593, + "loss": 1.2287, + "step": 2752 + }, + { + "epoch": 0.26, + "grad_norm": 0.2599068780978502, + "learning_rate": 0.00019637183580591133, + "loss": 1.0331, + "step": 2753 + }, + { + "epoch": 0.26, + "grad_norm": 0.27220701541938597, + "learning_rate": 0.00019636761204263093, + "loss": 1.0395, + "step": 2754 + }, + { + "epoch": 0.26, + "grad_norm": 0.24397827889832657, + "learning_rate": 0.0001963633858676905, + "loss": 1.1166, + "step": 2755 + }, + { + "epoch": 0.26, + "grad_norm": 0.2497024411579182, + "learning_rate": 0.00019635915728119575, + "loss": 1.0847, + "step": 2756 + }, + { + "epoch": 0.26, + "grad_norm": 0.2589117765444782, + "learning_rate": 0.00019635492628325256, + "loss": 1.0086, + "step": 2757 + }, + { + "epoch": 0.26, + "grad_norm": 0.29313802184742693, + "learning_rate": 0.00019635069287396678, + "loss": 1.1229, + "step": 2758 + }, + { + "epoch": 0.26, + "grad_norm": 0.2618259520734273, + "learning_rate": 0.00019634645705344435, + "loss": 1.1459, + "step": 2759 + }, + { + "epoch": 0.26, + "grad_norm": 0.2454773691105509, + "learning_rate": 0.0001963422188217913, + "loss": 0.968, + "step": 2760 + }, + { + "epoch": 0.26, + "grad_norm": 0.28372490824875446, + "learning_rate": 0.00019633797817911365, + "loss": 1.1127, + "step": 2761 + }, + { + "epoch": 0.26, + "grad_norm": 0.3273752692424284, + "learning_rate": 0.00019633373512551754, + "loss": 1.1669, + "step": 2762 + }, + { + "epoch": 0.26, + "grad_norm": 0.2366079234217832, + "learning_rate": 0.0001963294896611092, + "loss": 1.0848, + "step": 2763 + }, + { + "epoch": 0.26, + "grad_norm": 0.5261995752384835, + "learning_rate": 0.00019632524178599483, + "loss": 1.1333, + "step": 2764 + }, + { + "epoch": 0.26, + "grad_norm": 0.26665251541131973, + "learning_rate": 0.00019632099150028074, + "loss": 1.0651, + "step": 2765 + }, + { + "epoch": 0.26, + "grad_norm": 0.2584540431624957, + "learning_rate": 0.0001963167388040733, + "loss": 1.107, + "step": 2766 + }, + { + "epoch": 0.26, + "grad_norm": 0.2634462477386324, + "learning_rate": 0.00019631248369747893, + "loss": 1.124, + "step": 2767 + }, + { + "epoch": 0.26, + "grad_norm": 0.29816863160086016, + "learning_rate": 0.00019630822618060413, + "loss": 1.1173, + "step": 2768 + }, + { + "epoch": 0.26, + "grad_norm": 0.25963330208582736, + "learning_rate": 0.00019630396625355546, + "loss": 1.0285, + "step": 2769 + }, + { + "epoch": 0.27, + "grad_norm": 0.2887065947096526, + "learning_rate": 0.00019629970391643947, + "loss": 1.1987, + "step": 2770 + }, + { + "epoch": 0.27, + "grad_norm": 0.27628742653188076, + "learning_rate": 0.0001962954391693629, + "loss": 1.067, + "step": 2771 + }, + { + "epoch": 0.27, + "grad_norm": 0.2707072531440452, + "learning_rate": 0.00019629117201243242, + "loss": 0.9484, + "step": 2772 + }, + { + "epoch": 0.27, + "grad_norm": 0.2973518749444581, + "learning_rate": 0.0001962869024457549, + "loss": 1.0383, + "step": 2773 + }, + { + "epoch": 0.27, + "grad_norm": 0.26365187418455077, + "learning_rate": 0.0001962826304694371, + "loss": 1.0947, + "step": 2774 + }, + { + "epoch": 0.27, + "grad_norm": 0.2591589967435732, + "learning_rate": 0.00019627835608358596, + "loss": 1.07, + "step": 2775 + }, + { + "epoch": 0.27, + "grad_norm": 0.26492792550338157, + "learning_rate": 0.00019627407928830842, + "loss": 1.1614, + "step": 2776 + }, + { + "epoch": 0.27, + "grad_norm": 0.25376976035220183, + "learning_rate": 0.00019626980008371158, + "loss": 1.0263, + "step": 2777 + }, + { + "epoch": 0.27, + "grad_norm": 0.23447258204438867, + "learning_rate": 0.0001962655184699025, + "loss": 1.1122, + "step": 2778 + }, + { + "epoch": 0.27, + "grad_norm": 0.2562093782282185, + "learning_rate": 0.00019626123444698828, + "loss": 1.0457, + "step": 2779 + }, + { + "epoch": 0.27, + "grad_norm": 0.2723192435370688, + "learning_rate": 0.00019625694801507618, + "loss": 1.0636, + "step": 2780 + }, + { + "epoch": 0.27, + "grad_norm": 0.29084143851467165, + "learning_rate": 0.00019625265917427346, + "loss": 1.1492, + "step": 2781 + }, + { + "epoch": 0.27, + "grad_norm": 0.2759691568719512, + "learning_rate": 0.00019624836792468746, + "loss": 1.0947, + "step": 2782 + }, + { + "epoch": 0.27, + "grad_norm": 0.2847571657532783, + "learning_rate": 0.00019624407426642557, + "loss": 1.0998, + "step": 2783 + }, + { + "epoch": 0.27, + "grad_norm": 0.2821857453688257, + "learning_rate": 0.00019623977819959522, + "loss": 1.0525, + "step": 2784 + }, + { + "epoch": 0.27, + "grad_norm": 0.2710683595054374, + "learning_rate": 0.00019623547972430394, + "loss": 1.1189, + "step": 2785 + }, + { + "epoch": 0.27, + "grad_norm": 0.2650861466646489, + "learning_rate": 0.00019623117884065932, + "loss": 1.0532, + "step": 2786 + }, + { + "epoch": 0.27, + "grad_norm": 0.28632313517563235, + "learning_rate": 0.00019622687554876893, + "loss": 1.0432, + "step": 2787 + }, + { + "epoch": 0.27, + "grad_norm": 0.2927818066954993, + "learning_rate": 0.00019622256984874053, + "loss": 1.0847, + "step": 2788 + }, + { + "epoch": 0.27, + "grad_norm": 0.2631443348972877, + "learning_rate": 0.00019621826174068185, + "loss": 1.1038, + "step": 2789 + }, + { + "epoch": 0.27, + "grad_norm": 0.26694650397049624, + "learning_rate": 0.00019621395122470066, + "loss": 0.9954, + "step": 2790 + }, + { + "epoch": 0.27, + "grad_norm": 0.27909721930965364, + "learning_rate": 0.00019620963830090492, + "loss": 0.9486, + "step": 2791 + }, + { + "epoch": 0.27, + "grad_norm": 0.24079022570344805, + "learning_rate": 0.0001962053229694025, + "loss": 0.9676, + "step": 2792 + }, + { + "epoch": 0.27, + "grad_norm": 0.2559668456069455, + "learning_rate": 0.0001962010052303014, + "loss": 1.1104, + "step": 2793 + }, + { + "epoch": 0.27, + "grad_norm": 0.25016354937216784, + "learning_rate": 0.0001961966850837097, + "loss": 1.1744, + "step": 2794 + }, + { + "epoch": 0.27, + "grad_norm": 0.25412464453761213, + "learning_rate": 0.0001961923625297355, + "loss": 1.1563, + "step": 2795 + }, + { + "epoch": 0.27, + "grad_norm": 0.25700418834836664, + "learning_rate": 0.00019618803756848695, + "loss": 1.155, + "step": 2796 + }, + { + "epoch": 0.27, + "grad_norm": 0.29832287781586436, + "learning_rate": 0.0001961837102000723, + "loss": 1.0929, + "step": 2797 + }, + { + "epoch": 0.27, + "grad_norm": 0.24969396689469361, + "learning_rate": 0.00019617938042459988, + "loss": 1.0692, + "step": 2798 + }, + { + "epoch": 0.27, + "grad_norm": 0.27702787637233217, + "learning_rate": 0.00019617504824217803, + "loss": 1.0812, + "step": 2799 + }, + { + "epoch": 0.27, + "grad_norm": 0.2900431406784246, + "learning_rate": 0.00019617071365291512, + "loss": 1.1213, + "step": 2800 + }, + { + "epoch": 0.27, + "grad_norm": 0.2650773606437207, + "learning_rate": 0.0001961663766569197, + "loss": 1.0438, + "step": 2801 + }, + { + "epoch": 0.27, + "grad_norm": 0.23346920952561745, + "learning_rate": 0.00019616203725430023, + "loss": 0.9972, + "step": 2802 + }, + { + "epoch": 0.27, + "grad_norm": 0.2739900839005771, + "learning_rate": 0.00019615769544516532, + "loss": 1.0435, + "step": 2803 + }, + { + "epoch": 0.27, + "grad_norm": 0.3190423287534823, + "learning_rate": 0.00019615335122962372, + "loss": 1.1342, + "step": 2804 + }, + { + "epoch": 0.27, + "grad_norm": 0.2830709781599123, + "learning_rate": 0.00019614900460778403, + "loss": 1.0853, + "step": 2805 + }, + { + "epoch": 0.27, + "grad_norm": 0.26452593616904946, + "learning_rate": 0.00019614465557975507, + "loss": 1.1323, + "step": 2806 + }, + { + "epoch": 0.27, + "grad_norm": 0.2592995682321865, + "learning_rate": 0.00019614030414564568, + "loss": 1.068, + "step": 2807 + }, + { + "epoch": 0.27, + "grad_norm": 0.2827474115219191, + "learning_rate": 0.00019613595030556477, + "loss": 1.1488, + "step": 2808 + }, + { + "epoch": 0.27, + "grad_norm": 0.2649963078693095, + "learning_rate": 0.0001961315940596213, + "loss": 0.996, + "step": 2809 + }, + { + "epoch": 0.27, + "grad_norm": 0.26346285838705263, + "learning_rate": 0.00019612723540792426, + "loss": 1.1175, + "step": 2810 + }, + { + "epoch": 0.27, + "grad_norm": 0.28954499176521176, + "learning_rate": 0.00019612287435058273, + "loss": 1.1733, + "step": 2811 + }, + { + "epoch": 0.27, + "grad_norm": 0.2761795119898941, + "learning_rate": 0.00019611851088770585, + "loss": 1.1321, + "step": 2812 + }, + { + "epoch": 0.27, + "grad_norm": 0.277378324643611, + "learning_rate": 0.00019611414501940284, + "loss": 1.291, + "step": 2813 + }, + { + "epoch": 0.27, + "grad_norm": 0.29898305586569235, + "learning_rate": 0.00019610977674578296, + "loss": 1.1056, + "step": 2814 + }, + { + "epoch": 0.27, + "grad_norm": 0.2728332232087226, + "learning_rate": 0.00019610540606695547, + "loss": 1.1051, + "step": 2815 + }, + { + "epoch": 0.27, + "grad_norm": 0.3020216145302391, + "learning_rate": 0.0001961010329830298, + "loss": 0.9909, + "step": 2816 + }, + { + "epoch": 0.27, + "grad_norm": 0.2614179979610706, + "learning_rate": 0.00019609665749411543, + "loss": 1.0927, + "step": 2817 + }, + { + "epoch": 0.27, + "grad_norm": 0.2486123416932773, + "learning_rate": 0.00019609227960032177, + "loss": 1.1096, + "step": 2818 + }, + { + "epoch": 0.27, + "grad_norm": 0.31135834548170077, + "learning_rate": 0.00019608789930175845, + "loss": 1.1178, + "step": 2819 + }, + { + "epoch": 0.27, + "grad_norm": 0.277755259399668, + "learning_rate": 0.00019608351659853503, + "loss": 1.1473, + "step": 2820 + }, + { + "epoch": 0.27, + "grad_norm": 0.2584745367908558, + "learning_rate": 0.00019607913149076125, + "loss": 1.0122, + "step": 2821 + }, + { + "epoch": 0.27, + "grad_norm": 0.2619607556881495, + "learning_rate": 0.0001960747439785468, + "loss": 1.128, + "step": 2822 + }, + { + "epoch": 0.27, + "grad_norm": 0.29050519695473265, + "learning_rate": 0.00019607035406200152, + "loss": 1.0491, + "step": 2823 + }, + { + "epoch": 0.27, + "grad_norm": 0.21812110204389035, + "learning_rate": 0.00019606596174123525, + "loss": 1.1075, + "step": 2824 + }, + { + "epoch": 0.27, + "grad_norm": 0.2839251249015717, + "learning_rate": 0.00019606156701635792, + "loss": 1.1837, + "step": 2825 + }, + { + "epoch": 0.27, + "grad_norm": 0.2958308386052112, + "learning_rate": 0.0001960571698874795, + "loss": 1.0984, + "step": 2826 + }, + { + "epoch": 0.27, + "grad_norm": 0.2671536081657146, + "learning_rate": 0.00019605277035470998, + "loss": 1.0137, + "step": 2827 + }, + { + "epoch": 0.27, + "grad_norm": 0.260771384058964, + "learning_rate": 0.00019604836841815958, + "loss": 1.0675, + "step": 2828 + }, + { + "epoch": 0.27, + "grad_norm": 0.2615419961091573, + "learning_rate": 0.00019604396407793835, + "loss": 1.055, + "step": 2829 + }, + { + "epoch": 0.27, + "grad_norm": 0.27494047722891274, + "learning_rate": 0.0001960395573341566, + "loss": 1.1479, + "step": 2830 + }, + { + "epoch": 0.27, + "grad_norm": 0.3121477873782516, + "learning_rate": 0.00019603514818692454, + "loss": 1.0033, + "step": 2831 + }, + { + "epoch": 0.27, + "grad_norm": 0.26737638033087213, + "learning_rate": 0.00019603073663635256, + "loss": 1.1077, + "step": 2832 + }, + { + "epoch": 0.27, + "grad_norm": 0.24631307751232157, + "learning_rate": 0.00019602632268255103, + "loss": 1.0545, + "step": 2833 + }, + { + "epoch": 0.27, + "grad_norm": 0.3013621704790624, + "learning_rate": 0.00019602190632563043, + "loss": 1.0969, + "step": 2834 + }, + { + "epoch": 0.27, + "grad_norm": 0.2783637530154318, + "learning_rate": 0.00019601748756570126, + "loss": 1.0622, + "step": 2835 + }, + { + "epoch": 0.27, + "grad_norm": 0.2574957740041804, + "learning_rate": 0.00019601306640287415, + "loss": 1.101, + "step": 2836 + }, + { + "epoch": 0.27, + "grad_norm": 0.26247517047321034, + "learning_rate": 0.00019600864283725967, + "loss": 1.0651, + "step": 2837 + }, + { + "epoch": 0.27, + "grad_norm": 0.25876279601882096, + "learning_rate": 0.0001960042168689686, + "loss": 1.062, + "step": 2838 + }, + { + "epoch": 0.27, + "grad_norm": 0.25523429862063185, + "learning_rate": 0.00019599978849811164, + "loss": 1.1267, + "step": 2839 + }, + { + "epoch": 0.27, + "grad_norm": 0.23537001704505256, + "learning_rate": 0.00019599535772479968, + "loss": 1.0823, + "step": 2840 + }, + { + "epoch": 0.27, + "grad_norm": 0.2462574908056951, + "learning_rate": 0.00019599092454914351, + "loss": 1.0301, + "step": 2841 + }, + { + "epoch": 0.27, + "grad_norm": 0.2721311709805547, + "learning_rate": 0.00019598648897125416, + "loss": 1.067, + "step": 2842 + }, + { + "epoch": 0.27, + "grad_norm": 0.3069559448268241, + "learning_rate": 0.0001959820509912426, + "loss": 1.013, + "step": 2843 + }, + { + "epoch": 0.27, + "grad_norm": 0.2702912452683172, + "learning_rate": 0.00019597761060921985, + "loss": 1.1137, + "step": 2844 + }, + { + "epoch": 0.27, + "grad_norm": 0.2797213146710235, + "learning_rate": 0.00019597316782529715, + "loss": 1.0939, + "step": 2845 + }, + { + "epoch": 0.27, + "grad_norm": 0.2787776466517135, + "learning_rate": 0.00019596872263958552, + "loss": 1.1058, + "step": 2846 + }, + { + "epoch": 0.27, + "grad_norm": 0.26116233626175234, + "learning_rate": 0.00019596427505219635, + "loss": 1.0094, + "step": 2847 + }, + { + "epoch": 0.27, + "grad_norm": 0.2693281423942913, + "learning_rate": 0.0001959598250632409, + "loss": 1.1635, + "step": 2848 + }, + { + "epoch": 0.27, + "grad_norm": 0.2506743951314745, + "learning_rate": 0.00019595537267283047, + "loss": 1.1075, + "step": 2849 + }, + { + "epoch": 0.27, + "grad_norm": 0.24106080467021557, + "learning_rate": 0.00019595091788107656, + "loss": 0.9761, + "step": 2850 + }, + { + "epoch": 0.27, + "grad_norm": 0.2860007338075784, + "learning_rate": 0.0001959464606880906, + "loss": 1.0474, + "step": 2851 + }, + { + "epoch": 0.27, + "grad_norm": 0.2359433024977078, + "learning_rate": 0.00019594200109398417, + "loss": 1.0804, + "step": 2852 + }, + { + "epoch": 0.27, + "grad_norm": 0.24715768704519478, + "learning_rate": 0.0001959375390988689, + "loss": 1.1157, + "step": 2853 + }, + { + "epoch": 0.27, + "grad_norm": 0.2586351713329031, + "learning_rate": 0.0001959330747028564, + "loss": 1.0448, + "step": 2854 + }, + { + "epoch": 0.27, + "grad_norm": 0.25499580779707726, + "learning_rate": 0.00019592860790605842, + "loss": 1.1657, + "step": 2855 + }, + { + "epoch": 0.27, + "grad_norm": 0.29734674634840746, + "learning_rate": 0.0001959241387085867, + "loss": 1.1446, + "step": 2856 + }, + { + "epoch": 0.27, + "grad_norm": 0.2647465723279589, + "learning_rate": 0.00019591966711055315, + "loss": 1.1668, + "step": 2857 + }, + { + "epoch": 0.27, + "grad_norm": 0.26368966136330935, + "learning_rate": 0.00019591519311206964, + "loss": 1.1992, + "step": 2858 + }, + { + "epoch": 0.27, + "grad_norm": 0.3126849078518487, + "learning_rate": 0.00019591071671324817, + "loss": 1.141, + "step": 2859 + }, + { + "epoch": 0.27, + "grad_norm": 0.25583665345229606, + "learning_rate": 0.00019590623791420071, + "loss": 1.1441, + "step": 2860 + }, + { + "epoch": 0.27, + "grad_norm": 0.23927224607942188, + "learning_rate": 0.00019590175671503938, + "loss": 1.1212, + "step": 2861 + }, + { + "epoch": 0.27, + "grad_norm": 0.28570558886340036, + "learning_rate": 0.00019589727311587632, + "loss": 1.0623, + "step": 2862 + }, + { + "epoch": 0.27, + "grad_norm": 0.23141130803687024, + "learning_rate": 0.00019589278711682373, + "loss": 1.1051, + "step": 2863 + }, + { + "epoch": 0.27, + "grad_norm": 0.31126189932366843, + "learning_rate": 0.00019588829871799388, + "loss": 1.098, + "step": 2864 + }, + { + "epoch": 0.27, + "grad_norm": 0.2368691173217574, + "learning_rate": 0.00019588380791949906, + "loss": 0.937, + "step": 2865 + }, + { + "epoch": 0.27, + "grad_norm": 0.2905469407114906, + "learning_rate": 0.0001958793147214517, + "loss": 1.1837, + "step": 2866 + }, + { + "epoch": 0.27, + "grad_norm": 0.25681129260211033, + "learning_rate": 0.00019587481912396426, + "loss": 1.0659, + "step": 2867 + }, + { + "epoch": 0.27, + "grad_norm": 0.3115969168588661, + "learning_rate": 0.0001958703211271492, + "loss": 1.1245, + "step": 2868 + }, + { + "epoch": 0.27, + "grad_norm": 0.2629500967052983, + "learning_rate": 0.0001958658207311191, + "loss": 1.0451, + "step": 2869 + }, + { + "epoch": 0.27, + "grad_norm": 0.26897588144338785, + "learning_rate": 0.0001958613179359866, + "loss": 1.0232, + "step": 2870 + }, + { + "epoch": 0.27, + "grad_norm": 0.2745821063034373, + "learning_rate": 0.00019585681274186434, + "loss": 1.1058, + "step": 2871 + }, + { + "epoch": 0.27, + "grad_norm": 0.28134411874328125, + "learning_rate": 0.00019585230514886513, + "loss": 1.0646, + "step": 2872 + }, + { + "epoch": 0.27, + "grad_norm": 0.2739086818209478, + "learning_rate": 0.0001958477951571017, + "loss": 1.0774, + "step": 2873 + }, + { + "epoch": 0.27, + "grad_norm": 0.24039032053442347, + "learning_rate": 0.000195843282766687, + "loss": 1.1367, + "step": 2874 + }, + { + "epoch": 0.28, + "grad_norm": 0.2729224048445481, + "learning_rate": 0.00019583876797773391, + "loss": 1.0894, + "step": 2875 + }, + { + "epoch": 0.28, + "grad_norm": 0.2548563216514712, + "learning_rate": 0.0001958342507903554, + "loss": 1.1571, + "step": 2876 + }, + { + "epoch": 0.28, + "grad_norm": 0.2611894266017868, + "learning_rate": 0.00019582973120466454, + "loss": 1.1219, + "step": 2877 + }, + { + "epoch": 0.28, + "grad_norm": 0.2517135134023402, + "learning_rate": 0.00019582520922077444, + "loss": 1.1457, + "step": 2878 + }, + { + "epoch": 0.28, + "grad_norm": 0.25685092446562596, + "learning_rate": 0.00019582068483879822, + "loss": 1.0947, + "step": 2879 + }, + { + "epoch": 0.28, + "grad_norm": 0.2531800123875422, + "learning_rate": 0.00019581615805884918, + "loss": 1.0824, + "step": 2880 + }, + { + "epoch": 0.28, + "grad_norm": 0.29746440693794585, + "learning_rate": 0.00019581162888104056, + "loss": 1.127, + "step": 2881 + }, + { + "epoch": 0.28, + "grad_norm": 0.2951013215212664, + "learning_rate": 0.0001958070973054857, + "loss": 0.9708, + "step": 2882 + }, + { + "epoch": 0.28, + "grad_norm": 0.26703604465532904, + "learning_rate": 0.00019580256333229804, + "loss": 1.0996, + "step": 2883 + }, + { + "epoch": 0.28, + "grad_norm": 0.3054149559628321, + "learning_rate": 0.00019579802696159098, + "loss": 1.1972, + "step": 2884 + }, + { + "epoch": 0.28, + "grad_norm": 0.2554844575267877, + "learning_rate": 0.00019579348819347814, + "loss": 1.1815, + "step": 2885 + }, + { + "epoch": 0.28, + "grad_norm": 0.27725302476518116, + "learning_rate": 0.00019578894702807303, + "loss": 1.1779, + "step": 2886 + }, + { + "epoch": 0.28, + "grad_norm": 0.2812121223614431, + "learning_rate": 0.0001957844034654893, + "loss": 1.0931, + "step": 2887 + }, + { + "epoch": 0.28, + "grad_norm": 0.28657879183155893, + "learning_rate": 0.0001957798575058407, + "loss": 1.209, + "step": 2888 + }, + { + "epoch": 0.28, + "grad_norm": 0.32607385402952277, + "learning_rate": 0.00019577530914924096, + "loss": 1.083, + "step": 2889 + }, + { + "epoch": 0.28, + "grad_norm": 0.2622053283912726, + "learning_rate": 0.00019577075839580395, + "loss": 1.1331, + "step": 2890 + }, + { + "epoch": 0.28, + "grad_norm": 0.27330452773103864, + "learning_rate": 0.00019576620524564347, + "loss": 1.1223, + "step": 2891 + }, + { + "epoch": 0.28, + "grad_norm": 0.28722568397660336, + "learning_rate": 0.00019576164969887353, + "loss": 1.0848, + "step": 2892 + }, + { + "epoch": 0.28, + "grad_norm": 0.25111702949361947, + "learning_rate": 0.00019575709175560815, + "loss": 1.1015, + "step": 2893 + }, + { + "epoch": 0.28, + "grad_norm": 0.27784956399299426, + "learning_rate": 0.00019575253141596136, + "loss": 1.0712, + "step": 2894 + }, + { + "epoch": 0.28, + "grad_norm": 0.29027995612195606, + "learning_rate": 0.00019574796868004728, + "loss": 1.0522, + "step": 2895 + }, + { + "epoch": 0.28, + "grad_norm": 0.2662180849874586, + "learning_rate": 0.00019574340354798012, + "loss": 1.0711, + "step": 2896 + }, + { + "epoch": 0.28, + "grad_norm": 0.272996240282476, + "learning_rate": 0.00019573883601987409, + "loss": 1.1081, + "step": 2897 + }, + { + "epoch": 0.28, + "grad_norm": 0.2500926481674787, + "learning_rate": 0.00019573426609584353, + "loss": 1.0818, + "step": 2898 + }, + { + "epoch": 0.28, + "grad_norm": 0.2690599907288768, + "learning_rate": 0.00019572969377600278, + "loss": 1.1512, + "step": 2899 + }, + { + "epoch": 0.28, + "grad_norm": 0.26895706721452967, + "learning_rate": 0.00019572511906046632, + "loss": 1.106, + "step": 2900 + }, + { + "epoch": 0.28, + "grad_norm": 0.28870985265507426, + "learning_rate": 0.00019572054194934855, + "loss": 1.0406, + "step": 2901 + }, + { + "epoch": 0.28, + "grad_norm": 0.2569564435809099, + "learning_rate": 0.00019571596244276408, + "loss": 1.1162, + "step": 2902 + }, + { + "epoch": 0.28, + "grad_norm": 0.26799107456705956, + "learning_rate": 0.0001957113805408275, + "loss": 1.0301, + "step": 2903 + }, + { + "epoch": 0.28, + "grad_norm": 0.2476133822775531, + "learning_rate": 0.00019570679624365348, + "loss": 1.1889, + "step": 2904 + }, + { + "epoch": 0.28, + "grad_norm": 0.28688265263367885, + "learning_rate": 0.00019570220955135673, + "loss": 1.0879, + "step": 2905 + }, + { + "epoch": 0.28, + "grad_norm": 0.279647668676218, + "learning_rate": 0.000195697620464052, + "loss": 1.1741, + "step": 2906 + }, + { + "epoch": 0.28, + "grad_norm": 0.26565186704575483, + "learning_rate": 0.0001956930289818542, + "loss": 1.0533, + "step": 2907 + }, + { + "epoch": 0.28, + "grad_norm": 0.27848478850209946, + "learning_rate": 0.00019568843510487822, + "loss": 1.0685, + "step": 2908 + }, + { + "epoch": 0.28, + "grad_norm": 0.27824963257277385, + "learning_rate": 0.00019568383883323902, + "loss": 1.154, + "step": 2909 + }, + { + "epoch": 0.28, + "grad_norm": 0.2889721158688345, + "learning_rate": 0.0001956792401670516, + "loss": 1.0229, + "step": 2910 + }, + { + "epoch": 0.28, + "grad_norm": 0.2433224132508536, + "learning_rate": 0.00019567463910643106, + "loss": 1.0934, + "step": 2911 + }, + { + "epoch": 0.28, + "grad_norm": 0.2558852060337151, + "learning_rate": 0.00019567003565149256, + "loss": 1.035, + "step": 2912 + }, + { + "epoch": 0.28, + "grad_norm": 0.25619363373049336, + "learning_rate": 0.0001956654298023513, + "loss": 1.1415, + "step": 2913 + }, + { + "epoch": 0.28, + "grad_norm": 0.2913829589793719, + "learning_rate": 0.0001956608215591225, + "loss": 1.0902, + "step": 2914 + }, + { + "epoch": 0.28, + "grad_norm": 0.25636509940637525, + "learning_rate": 0.00019565621092192156, + "loss": 1.0804, + "step": 2915 + }, + { + "epoch": 0.28, + "grad_norm": 0.2911863913856473, + "learning_rate": 0.00019565159789086377, + "loss": 1.0234, + "step": 2916 + }, + { + "epoch": 0.28, + "grad_norm": 0.3098448701698118, + "learning_rate": 0.00019564698246606467, + "loss": 1.018, + "step": 2917 + }, + { + "epoch": 0.28, + "grad_norm": 0.28894296908926365, + "learning_rate": 0.00019564236464763971, + "loss": 1.0444, + "step": 2918 + }, + { + "epoch": 0.28, + "grad_norm": 0.2969290728405071, + "learning_rate": 0.00019563774443570448, + "loss": 1.0826, + "step": 2919 + }, + { + "epoch": 0.28, + "grad_norm": 0.26737166147445435, + "learning_rate": 0.00019563312183037458, + "loss": 1.1668, + "step": 2920 + }, + { + "epoch": 0.28, + "grad_norm": 0.28206692002567496, + "learning_rate": 0.0001956284968317657, + "loss": 1.0922, + "step": 2921 + }, + { + "epoch": 0.28, + "grad_norm": 0.24812211567721631, + "learning_rate": 0.0001956238694399936, + "loss": 1.1432, + "step": 2922 + }, + { + "epoch": 0.28, + "grad_norm": 0.2599840736277116, + "learning_rate": 0.00019561923965517405, + "loss": 1.0521, + "step": 2923 + }, + { + "epoch": 0.28, + "grad_norm": 0.2543940271199818, + "learning_rate": 0.00019561460747742295, + "loss": 1.1435, + "step": 2924 + }, + { + "epoch": 0.28, + "grad_norm": 0.27672619084132033, + "learning_rate": 0.0001956099729068562, + "loss": 1.0804, + "step": 2925 + }, + { + "epoch": 0.28, + "grad_norm": 0.257274251896411, + "learning_rate": 0.0001956053359435898, + "loss": 1.1605, + "step": 2926 + }, + { + "epoch": 0.28, + "grad_norm": 0.2700086089277536, + "learning_rate": 0.00019560069658773976, + "loss": 1.1006, + "step": 2927 + }, + { + "epoch": 0.28, + "grad_norm": 0.2729066392922699, + "learning_rate": 0.00019559605483942223, + "loss": 0.9848, + "step": 2928 + }, + { + "epoch": 0.28, + "grad_norm": 0.2819343853836847, + "learning_rate": 0.0001955914106987533, + "loss": 1.0987, + "step": 2929 + }, + { + "epoch": 0.28, + "grad_norm": 0.28151027604586293, + "learning_rate": 0.00019558676416584929, + "loss": 1.1282, + "step": 2930 + }, + { + "epoch": 0.28, + "grad_norm": 0.26677614820796297, + "learning_rate": 0.0001955821152408264, + "loss": 1.1034, + "step": 2931 + }, + { + "epoch": 0.28, + "grad_norm": 0.3059666303412851, + "learning_rate": 0.00019557746392380104, + "loss": 1.1612, + "step": 2932 + }, + { + "epoch": 0.28, + "grad_norm": 0.2819684123621462, + "learning_rate": 0.00019557281021488957, + "loss": 1.0681, + "step": 2933 + }, + { + "epoch": 0.28, + "grad_norm": 0.23809460047261669, + "learning_rate": 0.00019556815411420842, + "loss": 1.0539, + "step": 2934 + }, + { + "epoch": 0.28, + "grad_norm": 0.26195198259872626, + "learning_rate": 0.0001955634956218742, + "loss": 1.0663, + "step": 2935 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776958285854396, + "learning_rate": 0.00019555883473800344, + "loss": 1.2042, + "step": 2936 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776878056257854, + "learning_rate": 0.00019555417146271275, + "loss": 1.0723, + "step": 2937 + }, + { + "epoch": 0.28, + "grad_norm": 0.22289840395374008, + "learning_rate": 0.00019554950579611888, + "loss": 1.101, + "step": 2938 + }, + { + "epoch": 0.28, + "grad_norm": 0.2765820638955678, + "learning_rate": 0.00019554483773833855, + "loss": 1.1081, + "step": 2939 + }, + { + "epoch": 0.28, + "grad_norm": 0.313800298123316, + "learning_rate": 0.00019554016728948865, + "loss": 1.1322, + "step": 2940 + }, + { + "epoch": 0.28, + "grad_norm": 0.25205600427022146, + "learning_rate": 0.00019553549444968602, + "loss": 1.0419, + "step": 2941 + }, + { + "epoch": 0.28, + "grad_norm": 0.271795514200843, + "learning_rate": 0.00019553081921904757, + "loss": 1.1375, + "step": 2942 + }, + { + "epoch": 0.28, + "grad_norm": 0.28101236125294443, + "learning_rate": 0.00019552614159769034, + "loss": 1.0403, + "step": 2943 + }, + { + "epoch": 0.28, + "grad_norm": 0.25207664771282795, + "learning_rate": 0.0001955214615857314, + "loss": 1.0785, + "step": 2944 + }, + { + "epoch": 0.28, + "grad_norm": 0.27380543511172994, + "learning_rate": 0.00019551677918328784, + "loss": 1.1187, + "step": 2945 + }, + { + "epoch": 0.28, + "grad_norm": 0.2732262776480482, + "learning_rate": 0.00019551209439047683, + "loss": 1.1426, + "step": 2946 + }, + { + "epoch": 0.28, + "grad_norm": 0.25773244127240097, + "learning_rate": 0.00019550740720741564, + "loss": 1.019, + "step": 2947 + }, + { + "epoch": 0.28, + "grad_norm": 0.2649249368780316, + "learning_rate": 0.0001955027176342216, + "loss": 1.0553, + "step": 2948 + }, + { + "epoch": 0.28, + "grad_norm": 0.2778435224973774, + "learning_rate": 0.00019549802567101198, + "loss": 1.1484, + "step": 2949 + }, + { + "epoch": 0.28, + "grad_norm": 0.2835832433902629, + "learning_rate": 0.00019549333131790427, + "loss": 1.1626, + "step": 2950 + }, + { + "epoch": 0.28, + "grad_norm": 0.2684901729578864, + "learning_rate": 0.00019548863457501592, + "loss": 1.0469, + "step": 2951 + }, + { + "epoch": 0.28, + "grad_norm": 0.2917738619766324, + "learning_rate": 0.0001954839354424645, + "loss": 1.0706, + "step": 2952 + }, + { + "epoch": 0.28, + "grad_norm": 0.2719180117243129, + "learning_rate": 0.00019547923392036756, + "loss": 1.069, + "step": 2953 + }, + { + "epoch": 0.28, + "grad_norm": 0.32365658454747653, + "learning_rate": 0.00019547453000884278, + "loss": 1.2248, + "step": 2954 + }, + { + "epoch": 0.28, + "grad_norm": 0.22919570649820376, + "learning_rate": 0.0001954698237080079, + "loss": 0.9762, + "step": 2955 + }, + { + "epoch": 0.28, + "grad_norm": 0.27049554069580856, + "learning_rate": 0.00019546511501798068, + "loss": 1.0445, + "step": 2956 + }, + { + "epoch": 0.28, + "grad_norm": 0.2321876330280462, + "learning_rate": 0.00019546040393887896, + "loss": 0.9582, + "step": 2957 + }, + { + "epoch": 0.28, + "grad_norm": 0.2866826620321833, + "learning_rate": 0.00019545569047082063, + "loss": 1.0803, + "step": 2958 + }, + { + "epoch": 0.28, + "grad_norm": 0.2513540583580118, + "learning_rate": 0.00019545097461392364, + "loss": 1.083, + "step": 2959 + }, + { + "epoch": 0.28, + "grad_norm": 0.26654262390528605, + "learning_rate": 0.00019544625636830606, + "loss": 1.0319, + "step": 2960 + }, + { + "epoch": 0.28, + "grad_norm": 0.2413056145935159, + "learning_rate": 0.00019544153573408592, + "loss": 1.158, + "step": 2961 + }, + { + "epoch": 0.28, + "grad_norm": 0.26314675374807356, + "learning_rate": 0.00019543681271138135, + "loss": 1.1581, + "step": 2962 + }, + { + "epoch": 0.28, + "grad_norm": 0.302120035744808, + "learning_rate": 0.00019543208730031056, + "loss": 1.0621, + "step": 2963 + }, + { + "epoch": 0.28, + "grad_norm": 0.2784231038369151, + "learning_rate": 0.0001954273595009918, + "loss": 1.1521, + "step": 2964 + }, + { + "epoch": 0.28, + "grad_norm": 0.3301489966327534, + "learning_rate": 0.00019542262931354342, + "loss": 1.1991, + "step": 2965 + }, + { + "epoch": 0.28, + "grad_norm": 0.2625920950709, + "learning_rate": 0.00019541789673808378, + "loss": 1.1439, + "step": 2966 + }, + { + "epoch": 0.28, + "grad_norm": 0.2799876119710994, + "learning_rate": 0.00019541316177473127, + "loss": 1.2343, + "step": 2967 + }, + { + "epoch": 0.28, + "grad_norm": 0.25861702387999425, + "learning_rate": 0.00019540842442360444, + "loss": 1.0334, + "step": 2968 + }, + { + "epoch": 0.28, + "grad_norm": 0.26861453164120885, + "learning_rate": 0.00019540368468482183, + "loss": 1.0876, + "step": 2969 + }, + { + "epoch": 0.28, + "grad_norm": 0.2790820406297911, + "learning_rate": 0.00019539894255850203, + "loss": 1.2192, + "step": 2970 + }, + { + "epoch": 0.28, + "grad_norm": 0.26958726146743006, + "learning_rate": 0.00019539419804476377, + "loss": 1.071, + "step": 2971 + }, + { + "epoch": 0.28, + "grad_norm": 0.2677639222097805, + "learning_rate": 0.00019538945114372573, + "loss": 1.2223, + "step": 2972 + }, + { + "epoch": 0.28, + "grad_norm": 0.26245494649962503, + "learning_rate": 0.00019538470185550674, + "loss": 1.105, + "step": 2973 + }, + { + "epoch": 0.28, + "grad_norm": 0.2693830042475195, + "learning_rate": 0.00019537995018022563, + "loss": 1.1118, + "step": 2974 + }, + { + "epoch": 0.28, + "grad_norm": 0.24631907921671092, + "learning_rate": 0.0001953751961180013, + "loss": 1.0035, + "step": 2975 + }, + { + "epoch": 0.28, + "grad_norm": 0.30613365197670445, + "learning_rate": 0.00019537043966895277, + "loss": 1.0775, + "step": 2976 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776873480898374, + "learning_rate": 0.00019536568083319903, + "loss": 1.0197, + "step": 2977 + }, + { + "epoch": 0.28, + "grad_norm": 0.28885110017090304, + "learning_rate": 0.00019536091961085922, + "loss": 1.1091, + "step": 2978 + }, + { + "epoch": 0.29, + "grad_norm": 0.2567410212869977, + "learning_rate": 0.00019535615600205247, + "loss": 1.0443, + "step": 2979 + }, + { + "epoch": 0.29, + "grad_norm": 0.28945345981693216, + "learning_rate": 0.00019535139000689795, + "loss": 1.0625, + "step": 2980 + }, + { + "epoch": 0.29, + "grad_norm": 0.2774312492094651, + "learning_rate": 0.000195346621625515, + "loss": 1.1077, + "step": 2981 + }, + { + "epoch": 0.29, + "grad_norm": 0.26395699634012787, + "learning_rate": 0.00019534185085802293, + "loss": 1.0201, + "step": 2982 + }, + { + "epoch": 0.29, + "grad_norm": 0.28496537247736714, + "learning_rate": 0.0001953370777045411, + "loss": 1.1266, + "step": 2983 + }, + { + "epoch": 0.29, + "grad_norm": 0.3243637973540252, + "learning_rate": 0.00019533230216518897, + "loss": 1.1888, + "step": 2984 + }, + { + "epoch": 0.29, + "grad_norm": 0.29081576092276984, + "learning_rate": 0.00019532752424008607, + "loss": 1.0315, + "step": 2985 + }, + { + "epoch": 0.29, + "grad_norm": 0.2850948882845963, + "learning_rate": 0.00019532274392935198, + "loss": 1.0013, + "step": 2986 + }, + { + "epoch": 0.29, + "grad_norm": 0.2923430922930697, + "learning_rate": 0.0001953179612331063, + "loss": 0.9897, + "step": 2987 + }, + { + "epoch": 0.29, + "grad_norm": 0.29321038881690914, + "learning_rate": 0.00019531317615146873, + "loss": 1.1548, + "step": 2988 + }, + { + "epoch": 0.29, + "grad_norm": 0.28328990204826776, + "learning_rate": 0.00019530838868455906, + "loss": 1.0857, + "step": 2989 + }, + { + "epoch": 0.29, + "grad_norm": 0.25769647168620946, + "learning_rate": 0.00019530359883249701, + "loss": 1.025, + "step": 2990 + }, + { + "epoch": 0.29, + "grad_norm": 0.2796837930311345, + "learning_rate": 0.00019529880659540256, + "loss": 1.1879, + "step": 2991 + }, + { + "epoch": 0.29, + "grad_norm": 0.2787744820467792, + "learning_rate": 0.00019529401197339557, + "loss": 1.1248, + "step": 2992 + }, + { + "epoch": 0.29, + "grad_norm": 0.269889282961234, + "learning_rate": 0.00019528921496659603, + "loss": 1.0331, + "step": 2993 + }, + { + "epoch": 0.29, + "grad_norm": 0.28222433704753624, + "learning_rate": 0.00019528441557512398, + "loss": 1.0523, + "step": 2994 + }, + { + "epoch": 0.29, + "grad_norm": 0.27863870289129955, + "learning_rate": 0.00019527961379909957, + "loss": 1.1745, + "step": 2995 + }, + { + "epoch": 0.29, + "grad_norm": 0.31883034906636215, + "learning_rate": 0.00019527480963864294, + "loss": 1.0541, + "step": 2996 + }, + { + "epoch": 0.29, + "grad_norm": 0.27111306738362173, + "learning_rate": 0.0001952700030938743, + "loss": 1.1796, + "step": 2997 + }, + { + "epoch": 0.29, + "grad_norm": 0.27408759182730913, + "learning_rate": 0.00019526519416491401, + "loss": 1.0041, + "step": 2998 + }, + { + "epoch": 0.29, + "grad_norm": 0.24522235103299234, + "learning_rate": 0.0001952603828518823, + "loss": 0.8971, + "step": 2999 + }, + { + "epoch": 0.29, + "grad_norm": 0.2583032592016034, + "learning_rate": 0.00019525556915489967, + "loss": 0.9294, + "step": 3000 + }, + { + "epoch": 0.29, + "grad_norm": 0.25624359117015355, + "learning_rate": 0.00019525075307408655, + "loss": 0.9701, + "step": 3001 + }, + { + "epoch": 0.29, + "grad_norm": 0.28720231190552714, + "learning_rate": 0.0001952459346095635, + "loss": 1.0356, + "step": 3002 + }, + { + "epoch": 0.29, + "grad_norm": 0.29696988241107114, + "learning_rate": 0.00019524111376145105, + "loss": 1.0428, + "step": 3003 + }, + { + "epoch": 0.29, + "grad_norm": 0.26750283784091977, + "learning_rate": 0.00019523629052986988, + "loss": 1.0176, + "step": 3004 + }, + { + "epoch": 0.29, + "grad_norm": 0.28397737724131106, + "learning_rate": 0.00019523146491494067, + "loss": 1.0977, + "step": 3005 + }, + { + "epoch": 0.29, + "grad_norm": 0.30294167237041875, + "learning_rate": 0.0001952266369167842, + "loss": 1.1875, + "step": 3006 + }, + { + "epoch": 0.29, + "grad_norm": 0.2580269760608724, + "learning_rate": 0.00019522180653552132, + "loss": 1.0923, + "step": 3007 + }, + { + "epoch": 0.29, + "grad_norm": 0.2788977172196587, + "learning_rate": 0.00019521697377127285, + "loss": 1.1057, + "step": 3008 + }, + { + "epoch": 0.29, + "grad_norm": 0.26291500633347537, + "learning_rate": 0.00019521213862415979, + "loss": 1.1551, + "step": 3009 + }, + { + "epoch": 0.29, + "grad_norm": 0.2760993256298351, + "learning_rate": 0.00019520730109430314, + "loss": 1.0357, + "step": 3010 + }, + { + "epoch": 0.29, + "grad_norm": 0.2851223724038372, + "learning_rate": 0.0001952024611818239, + "loss": 1.1088, + "step": 3011 + }, + { + "epoch": 0.29, + "grad_norm": 0.2613411081969399, + "learning_rate": 0.00019519761888684326, + "loss": 1.0058, + "step": 3012 + }, + { + "epoch": 0.29, + "grad_norm": 0.2374352044099212, + "learning_rate": 0.0001951927742094824, + "loss": 1.0545, + "step": 3013 + }, + { + "epoch": 0.29, + "grad_norm": 0.27080671804410106, + "learning_rate": 0.00019518792714986254, + "loss": 1.1475, + "step": 3014 + }, + { + "epoch": 0.29, + "grad_norm": 0.2993708318879525, + "learning_rate": 0.00019518307770810496, + "loss": 1.0931, + "step": 3015 + }, + { + "epoch": 0.29, + "grad_norm": 0.29400732138156965, + "learning_rate": 0.00019517822588433102, + "loss": 1.0799, + "step": 3016 + }, + { + "epoch": 0.29, + "grad_norm": 0.25464534887846263, + "learning_rate": 0.0001951733716786622, + "loss": 0.993, + "step": 3017 + }, + { + "epoch": 0.29, + "grad_norm": 0.2652448178700676, + "learning_rate": 0.0001951685150912199, + "loss": 1.1835, + "step": 3018 + }, + { + "epoch": 0.29, + "grad_norm": 0.3077147987981465, + "learning_rate": 0.00019516365612212572, + "loss": 1.0706, + "step": 3019 + }, + { + "epoch": 0.29, + "grad_norm": 0.30685934393160413, + "learning_rate": 0.00019515879477150123, + "loss": 1.0244, + "step": 3020 + }, + { + "epoch": 0.29, + "grad_norm": 0.2856291230251649, + "learning_rate": 0.00019515393103946812, + "loss": 1.0963, + "step": 3021 + }, + { + "epoch": 0.29, + "grad_norm": 0.2767228605351276, + "learning_rate": 0.00019514906492614805, + "loss": 0.9945, + "step": 3022 + }, + { + "epoch": 0.29, + "grad_norm": 0.27035783848571304, + "learning_rate": 0.00019514419643166283, + "loss": 1.1075, + "step": 3023 + }, + { + "epoch": 0.29, + "grad_norm": 0.2990769279659998, + "learning_rate": 0.0001951393255561343, + "loss": 1.1556, + "step": 3024 + }, + { + "epoch": 0.29, + "grad_norm": 0.2844357764929915, + "learning_rate": 0.00019513445229968438, + "loss": 0.9933, + "step": 3025 + }, + { + "epoch": 0.29, + "grad_norm": 0.3070197133609208, + "learning_rate": 0.000195129576662435, + "loss": 1.0885, + "step": 3026 + }, + { + "epoch": 0.29, + "grad_norm": 0.2820008198156176, + "learning_rate": 0.0001951246986445082, + "loss": 1.0819, + "step": 3027 + }, + { + "epoch": 0.29, + "grad_norm": 0.27229352040640303, + "learning_rate": 0.00019511981824602598, + "loss": 1.046, + "step": 3028 + }, + { + "epoch": 0.29, + "grad_norm": 0.2739544259171004, + "learning_rate": 0.00019511493546711054, + "loss": 1.0647, + "step": 3029 + }, + { + "epoch": 0.29, + "grad_norm": 0.2620879572449313, + "learning_rate": 0.00019511005030788407, + "loss": 1.1027, + "step": 3030 + }, + { + "epoch": 0.29, + "grad_norm": 0.251134914749705, + "learning_rate": 0.00019510516276846884, + "loss": 1.0464, + "step": 3031 + }, + { + "epoch": 0.29, + "grad_norm": 0.23480951173895892, + "learning_rate": 0.0001951002728489871, + "loss": 1.0258, + "step": 3032 + }, + { + "epoch": 0.29, + "grad_norm": 0.2637238517781191, + "learning_rate": 0.0001950953805495613, + "loss": 1.0609, + "step": 3033 + }, + { + "epoch": 0.29, + "grad_norm": 0.23941970843056096, + "learning_rate": 0.0001950904858703138, + "loss": 0.9995, + "step": 3034 + }, + { + "epoch": 0.29, + "grad_norm": 0.2674317195338645, + "learning_rate": 0.00019508558881136716, + "loss": 1.1166, + "step": 3035 + }, + { + "epoch": 0.29, + "grad_norm": 0.2671056081371556, + "learning_rate": 0.0001950806893728439, + "loss": 1.1134, + "step": 3036 + }, + { + "epoch": 0.29, + "grad_norm": 0.25637597126512307, + "learning_rate": 0.0001950757875548666, + "loss": 1.0847, + "step": 3037 + }, + { + "epoch": 0.29, + "grad_norm": 0.26941442259152115, + "learning_rate": 0.000195070883357558, + "loss": 1.1351, + "step": 3038 + }, + { + "epoch": 0.29, + "grad_norm": 0.26139367123922513, + "learning_rate": 0.00019506597678104078, + "loss": 1.1819, + "step": 3039 + }, + { + "epoch": 0.29, + "grad_norm": 0.2730769424356869, + "learning_rate": 0.00019506106782543774, + "loss": 1.0862, + "step": 3040 + }, + { + "epoch": 0.29, + "grad_norm": 0.2564183571077773, + "learning_rate": 0.00019505615649087173, + "loss": 1.057, + "step": 3041 + }, + { + "epoch": 0.29, + "grad_norm": 0.27496154581521765, + "learning_rate": 0.00019505124277746568, + "loss": 1.0365, + "step": 3042 + }, + { + "epoch": 0.29, + "grad_norm": 0.2906578614460428, + "learning_rate": 0.00019504632668534253, + "loss": 1.0765, + "step": 3043 + }, + { + "epoch": 0.29, + "grad_norm": 0.24889624819261374, + "learning_rate": 0.00019504140821462534, + "loss": 1.0847, + "step": 3044 + }, + { + "epoch": 0.29, + "grad_norm": 0.26592584153440635, + "learning_rate": 0.00019503648736543715, + "loss": 1.0803, + "step": 3045 + }, + { + "epoch": 0.29, + "grad_norm": 0.2944881481822344, + "learning_rate": 0.00019503156413790113, + "loss": 1.0591, + "step": 3046 + }, + { + "epoch": 0.29, + "grad_norm": 0.2918642575968384, + "learning_rate": 0.00019502663853214052, + "loss": 1.0976, + "step": 3047 + }, + { + "epoch": 0.29, + "grad_norm": 0.26006791100294435, + "learning_rate": 0.00019502171054827856, + "loss": 1.1608, + "step": 3048 + }, + { + "epoch": 0.29, + "grad_norm": 0.3177927171422205, + "learning_rate": 0.00019501678018643854, + "loss": 1.1429, + "step": 3049 + }, + { + "epoch": 0.29, + "grad_norm": 0.26535744168889774, + "learning_rate": 0.0001950118474467439, + "loss": 1.0564, + "step": 3050 + }, + { + "epoch": 0.29, + "grad_norm": 0.30516176813295376, + "learning_rate": 0.00019500691232931806, + "loss": 1.085, + "step": 3051 + }, + { + "epoch": 0.29, + "grad_norm": 0.3062397007408206, + "learning_rate": 0.00019500197483428454, + "loss": 1.202, + "step": 3052 + }, + { + "epoch": 0.29, + "grad_norm": 0.30600809859823863, + "learning_rate": 0.0001949970349617669, + "loss": 1.0306, + "step": 3053 + }, + { + "epoch": 0.29, + "grad_norm": 0.26553989672712847, + "learning_rate": 0.00019499209271188874, + "loss": 1.1253, + "step": 3054 + }, + { + "epoch": 0.29, + "grad_norm": 0.3115655362592651, + "learning_rate": 0.00019498714808477375, + "loss": 1.0844, + "step": 3055 + }, + { + "epoch": 0.29, + "grad_norm": 0.2733439826555348, + "learning_rate": 0.00019498220108054573, + "loss": 1.0594, + "step": 3056 + }, + { + "epoch": 0.29, + "grad_norm": 0.27824607858191935, + "learning_rate": 0.00019497725169932839, + "loss": 1.1842, + "step": 3057 + }, + { + "epoch": 0.29, + "grad_norm": 0.29361249395554595, + "learning_rate": 0.00019497229994124563, + "loss": 1.156, + "step": 3058 + }, + { + "epoch": 0.29, + "grad_norm": 0.2909096666085275, + "learning_rate": 0.00019496734580642139, + "loss": 1.0713, + "step": 3059 + }, + { + "epoch": 0.29, + "grad_norm": 0.3008406748557583, + "learning_rate": 0.00019496238929497968, + "loss": 0.9974, + "step": 3060 + }, + { + "epoch": 0.29, + "grad_norm": 0.2557208115014124, + "learning_rate": 0.00019495743040704445, + "loss": 1.1056, + "step": 3061 + }, + { + "epoch": 0.29, + "grad_norm": 0.2619314513826558, + "learning_rate": 0.00019495246914273985, + "loss": 1.121, + "step": 3062 + }, + { + "epoch": 0.29, + "grad_norm": 0.2777726951083541, + "learning_rate": 0.00019494750550219, + "loss": 1.1167, + "step": 3063 + }, + { + "epoch": 0.29, + "grad_norm": 0.27592070412234426, + "learning_rate": 0.00019494253948551922, + "loss": 1.1754, + "step": 3064 + }, + { + "epoch": 0.29, + "grad_norm": 0.27823194441497656, + "learning_rate": 0.0001949375710928517, + "loss": 1.0839, + "step": 3065 + }, + { + "epoch": 0.29, + "grad_norm": 0.2982548750310373, + "learning_rate": 0.00019493260032431176, + "loss": 1.1597, + "step": 3066 + }, + { + "epoch": 0.29, + "grad_norm": 0.2859522245926003, + "learning_rate": 0.00019492762718002386, + "loss": 1.1175, + "step": 3067 + }, + { + "epoch": 0.29, + "grad_norm": 0.25703939560705324, + "learning_rate": 0.00019492265166011244, + "loss": 1.0775, + "step": 3068 + }, + { + "epoch": 0.29, + "grad_norm": 0.28439233978147976, + "learning_rate": 0.000194917673764702, + "loss": 1.1447, + "step": 3069 + }, + { + "epoch": 0.29, + "grad_norm": 0.2794544690967338, + "learning_rate": 0.00019491269349391712, + "loss": 1.1776, + "step": 3070 + }, + { + "epoch": 0.29, + "grad_norm": 0.2438574837351192, + "learning_rate": 0.00019490771084788242, + "loss": 1.1019, + "step": 3071 + }, + { + "epoch": 0.29, + "grad_norm": 0.24023569609710485, + "learning_rate": 0.00019490272582672262, + "loss": 1.1135, + "step": 3072 + }, + { + "epoch": 0.29, + "grad_norm": 0.29972615964686367, + "learning_rate": 0.00019489773843056244, + "loss": 1.069, + "step": 3073 + }, + { + "epoch": 0.29, + "grad_norm": 0.2589384956974427, + "learning_rate": 0.00019489274865952676, + "loss": 1.2025, + "step": 3074 + }, + { + "epoch": 0.29, + "grad_norm": 0.2597338399690944, + "learning_rate": 0.00019488775651374038, + "loss": 1.0932, + "step": 3075 + }, + { + "epoch": 0.29, + "grad_norm": 0.31065520260111457, + "learning_rate": 0.00019488276199332825, + "loss": 1.2195, + "step": 3076 + }, + { + "epoch": 0.29, + "grad_norm": 0.25453729205708414, + "learning_rate": 0.0001948777650984154, + "loss": 1.107, + "step": 3077 + }, + { + "epoch": 0.29, + "grad_norm": 0.29862291936090024, + "learning_rate": 0.00019487276582912683, + "loss": 1.1301, + "step": 3078 + }, + { + "epoch": 0.29, + "grad_norm": 0.3221818146284146, + "learning_rate": 0.00019486776418558766, + "loss": 1.1191, + "step": 3079 + }, + { + "epoch": 0.29, + "grad_norm": 0.3782095978448149, + "learning_rate": 0.0001948627601679231, + "loss": 1.0482, + "step": 3080 + }, + { + "epoch": 0.29, + "grad_norm": 0.28378395622830654, + "learning_rate": 0.0001948577537762583, + "loss": 1.1376, + "step": 3081 + }, + { + "epoch": 0.29, + "grad_norm": 0.3319042029552207, + "learning_rate": 0.00019485274501071864, + "loss": 1.0665, + "step": 3082 + }, + { + "epoch": 0.29, + "grad_norm": 0.26347510714336875, + "learning_rate": 0.00019484773387142942, + "loss": 1.1515, + "step": 3083 + }, + { + "epoch": 0.3, + "grad_norm": 0.3101715271811368, + "learning_rate": 0.000194842720358516, + "loss": 1.1255, + "step": 3084 + }, + { + "epoch": 0.3, + "grad_norm": 0.25293432322802967, + "learning_rate": 0.00019483770447210397, + "loss": 1.0296, + "step": 3085 + }, + { + "epoch": 0.3, + "grad_norm": 0.28621886274041924, + "learning_rate": 0.00019483268621231875, + "loss": 0.9487, + "step": 3086 + }, + { + "epoch": 0.3, + "grad_norm": 0.2747791927139919, + "learning_rate": 0.00019482766557928592, + "loss": 1.0543, + "step": 3087 + }, + { + "epoch": 0.3, + "grad_norm": 0.26223503943348814, + "learning_rate": 0.00019482264257313122, + "loss": 1.0122, + "step": 3088 + }, + { + "epoch": 0.3, + "grad_norm": 0.27709490802994835, + "learning_rate": 0.00019481761719398027, + "loss": 1.119, + "step": 3089 + }, + { + "epoch": 0.3, + "grad_norm": 0.26196843932694963, + "learning_rate": 0.00019481258944195886, + "loss": 1.0707, + "step": 3090 + }, + { + "epoch": 0.3, + "grad_norm": 0.2782018365147699, + "learning_rate": 0.00019480755931719281, + "loss": 1.038, + "step": 3091 + }, + { + "epoch": 0.3, + "grad_norm": 0.24361390545083103, + "learning_rate": 0.00019480252681980802, + "loss": 1.056, + "step": 3092 + }, + { + "epoch": 0.3, + "grad_norm": 0.2674864730406895, + "learning_rate": 0.0001947974919499304, + "loss": 1.1546, + "step": 3093 + }, + { + "epoch": 0.3, + "grad_norm": 0.25038494984812143, + "learning_rate": 0.00019479245470768595, + "loss": 1.0509, + "step": 3094 + }, + { + "epoch": 0.3, + "grad_norm": 0.27600843103065575, + "learning_rate": 0.00019478741509320076, + "loss": 1.1192, + "step": 3095 + }, + { + "epoch": 0.3, + "grad_norm": 0.2736297017538167, + "learning_rate": 0.00019478237310660093, + "loss": 1.1505, + "step": 3096 + }, + { + "epoch": 0.3, + "grad_norm": 0.2545876917992688, + "learning_rate": 0.00019477732874801265, + "loss": 1.0992, + "step": 3097 + }, + { + "epoch": 0.3, + "grad_norm": 0.2539192301177348, + "learning_rate": 0.0001947722820175622, + "loss": 1.044, + "step": 3098 + }, + { + "epoch": 0.3, + "grad_norm": 0.25814263130227105, + "learning_rate": 0.00019476723291537575, + "loss": 1.1827, + "step": 3099 + }, + { + "epoch": 0.3, + "grad_norm": 0.2976356116278242, + "learning_rate": 0.0001947621814415798, + "loss": 0.9778, + "step": 3100 + }, + { + "epoch": 0.3, + "grad_norm": 0.2625055643669641, + "learning_rate": 0.00019475712759630068, + "loss": 1.0887, + "step": 3101 + }, + { + "epoch": 0.3, + "grad_norm": 0.26312176570889856, + "learning_rate": 0.00019475207137966487, + "loss": 1.0807, + "step": 3102 + }, + { + "epoch": 0.3, + "grad_norm": 0.2618322867889844, + "learning_rate": 0.00019474701279179895, + "loss": 1.2045, + "step": 3103 + }, + { + "epoch": 0.3, + "grad_norm": 0.2891550174279668, + "learning_rate": 0.00019474195183282947, + "loss": 1.0771, + "step": 3104 + }, + { + "epoch": 0.3, + "grad_norm": 0.3202983567379544, + "learning_rate": 0.00019473688850288312, + "loss": 1.1852, + "step": 3105 + }, + { + "epoch": 0.3, + "grad_norm": 0.25021772062444586, + "learning_rate": 0.0001947318228020866, + "loss": 1.1832, + "step": 3106 + }, + { + "epoch": 0.3, + "grad_norm": 0.2930599815597174, + "learning_rate": 0.00019472675473056666, + "loss": 1.0763, + "step": 3107 + }, + { + "epoch": 0.3, + "grad_norm": 0.30676853669698495, + "learning_rate": 0.00019472168428845014, + "loss": 1.0405, + "step": 3108 + }, + { + "epoch": 0.3, + "grad_norm": 0.2648409443563144, + "learning_rate": 0.00019471661147586395, + "loss": 1.1125, + "step": 3109 + }, + { + "epoch": 0.3, + "grad_norm": 0.26584314244912965, + "learning_rate": 0.00019471153629293503, + "loss": 1.0697, + "step": 3110 + }, + { + "epoch": 0.3, + "grad_norm": 0.37140190776674137, + "learning_rate": 0.0001947064587397904, + "loss": 1.0939, + "step": 3111 + }, + { + "epoch": 0.3, + "grad_norm": 0.259017588096064, + "learning_rate": 0.00019470137881655712, + "loss": 1.0809, + "step": 3112 + }, + { + "epoch": 0.3, + "grad_norm": 0.26998271747435276, + "learning_rate": 0.00019469629652336232, + "loss": 1.0425, + "step": 3113 + }, + { + "epoch": 0.3, + "grad_norm": 0.25718343306878455, + "learning_rate": 0.0001946912118603332, + "loss": 1.1163, + "step": 3114 + }, + { + "epoch": 0.3, + "grad_norm": 0.2568678014483129, + "learning_rate": 0.00019468612482759695, + "loss": 0.9441, + "step": 3115 + }, + { + "epoch": 0.3, + "grad_norm": 0.2765085464241682, + "learning_rate": 0.00019468103542528094, + "loss": 1.0876, + "step": 3116 + }, + { + "epoch": 0.3, + "grad_norm": 0.23905715647271397, + "learning_rate": 0.0001946759436535125, + "loss": 1.1495, + "step": 3117 + }, + { + "epoch": 0.3, + "grad_norm": 0.23414722392686665, + "learning_rate": 0.00019467084951241907, + "loss": 1.0045, + "step": 3118 + }, + { + "epoch": 0.3, + "grad_norm": 0.2825675728276391, + "learning_rate": 0.00019466575300212816, + "loss": 1.0469, + "step": 3119 + }, + { + "epoch": 0.3, + "grad_norm": 0.27456256708811555, + "learning_rate": 0.00019466065412276727, + "loss": 0.952, + "step": 3120 + }, + { + "epoch": 0.3, + "grad_norm": 0.23902984443073147, + "learning_rate": 0.00019465555287446402, + "loss": 1.1261, + "step": 3121 + }, + { + "epoch": 0.3, + "grad_norm": 0.2860878827274362, + "learning_rate": 0.00019465044925734605, + "loss": 0.9592, + "step": 3122 + }, + { + "epoch": 0.3, + "grad_norm": 0.27511877405249713, + "learning_rate": 0.00019464534327154112, + "loss": 1.0913, + "step": 3123 + }, + { + "epoch": 0.3, + "grad_norm": 0.25798267940905334, + "learning_rate": 0.000194640234917177, + "loss": 1.0157, + "step": 3124 + }, + { + "epoch": 0.3, + "grad_norm": 0.284100415728136, + "learning_rate": 0.00019463512419438153, + "loss": 1.1027, + "step": 3125 + }, + { + "epoch": 0.3, + "grad_norm": 0.2382259658744238, + "learning_rate": 0.00019463001110328257, + "loss": 1.1828, + "step": 3126 + }, + { + "epoch": 0.3, + "grad_norm": 0.31222857704200785, + "learning_rate": 0.0001946248956440081, + "loss": 1.1124, + "step": 3127 + }, + { + "epoch": 0.3, + "grad_norm": 0.2625066692419136, + "learning_rate": 0.00019461977781668618, + "loss": 1.0737, + "step": 3128 + }, + { + "epoch": 0.3, + "grad_norm": 0.2793854323707662, + "learning_rate": 0.00019461465762144487, + "loss": 1.1363, + "step": 3129 + }, + { + "epoch": 0.3, + "grad_norm": 0.27108476180470265, + "learning_rate": 0.00019460953505841223, + "loss": 1.1485, + "step": 3130 + }, + { + "epoch": 0.3, + "grad_norm": 0.26903383341011894, + "learning_rate": 0.0001946044101277166, + "loss": 1.0214, + "step": 3131 + }, + { + "epoch": 0.3, + "grad_norm": 0.30317173630025673, + "learning_rate": 0.00019459928282948607, + "loss": 1.0941, + "step": 3132 + }, + { + "epoch": 0.3, + "grad_norm": 0.2833482336806812, + "learning_rate": 0.00019459415316384906, + "loss": 1.1549, + "step": 3133 + }, + { + "epoch": 0.3, + "grad_norm": 0.2652521067523786, + "learning_rate": 0.00019458902113093395, + "loss": 1.0997, + "step": 3134 + }, + { + "epoch": 0.3, + "grad_norm": 0.2796165666023849, + "learning_rate": 0.0001945838867308691, + "loss": 1.1974, + "step": 3135 + }, + { + "epoch": 0.3, + "grad_norm": 0.2795419726491581, + "learning_rate": 0.00019457874996378304, + "loss": 1.0421, + "step": 3136 + }, + { + "epoch": 0.3, + "grad_norm": 0.27461888378734867, + "learning_rate": 0.00019457361082980432, + "loss": 1.0375, + "step": 3137 + }, + { + "epoch": 0.3, + "grad_norm": 0.2751564840738016, + "learning_rate": 0.00019456846932906156, + "loss": 0.9755, + "step": 3138 + }, + { + "epoch": 0.3, + "grad_norm": 0.27520201040938214, + "learning_rate": 0.00019456332546168343, + "loss": 0.9982, + "step": 3139 + }, + { + "epoch": 0.3, + "grad_norm": 0.28743296092030146, + "learning_rate": 0.00019455817922779868, + "loss": 0.8786, + "step": 3140 + }, + { + "epoch": 0.3, + "grad_norm": 0.2791638788893473, + "learning_rate": 0.000194553030627536, + "loss": 1.1424, + "step": 3141 + }, + { + "epoch": 0.3, + "grad_norm": 0.2605528838298395, + "learning_rate": 0.00019454787966102435, + "loss": 1.0785, + "step": 3142 + }, + { + "epoch": 0.3, + "grad_norm": 0.2652776678480821, + "learning_rate": 0.00019454272632839255, + "loss": 1.0047, + "step": 3143 + }, + { + "epoch": 0.3, + "grad_norm": 0.2770395767937961, + "learning_rate": 0.00019453757062976964, + "loss": 1.1224, + "step": 3144 + }, + { + "epoch": 0.3, + "grad_norm": 0.2774545055078794, + "learning_rate": 0.00019453241256528462, + "loss": 1.0218, + "step": 3145 + }, + { + "epoch": 0.3, + "grad_norm": 0.29292827749120715, + "learning_rate": 0.00019452725213506654, + "loss": 1.1559, + "step": 3146 + }, + { + "epoch": 0.3, + "grad_norm": 0.2769766053696637, + "learning_rate": 0.00019452208933924459, + "loss": 1.0685, + "step": 3147 + }, + { + "epoch": 0.3, + "grad_norm": 0.24168206829639038, + "learning_rate": 0.00019451692417794792, + "loss": 1.092, + "step": 3148 + }, + { + "epoch": 0.3, + "grad_norm": 0.3243097289376712, + "learning_rate": 0.00019451175665130584, + "loss": 1.1109, + "step": 3149 + }, + { + "epoch": 0.3, + "grad_norm": 0.2851118834219872, + "learning_rate": 0.00019450658675944764, + "loss": 1.0859, + "step": 3150 + }, + { + "epoch": 0.3, + "grad_norm": 0.26646697928523183, + "learning_rate": 0.00019450141450250272, + "loss": 1.092, + "step": 3151 + }, + { + "epoch": 0.3, + "grad_norm": 0.31149135380894666, + "learning_rate": 0.0001944962398806005, + "loss": 1.0533, + "step": 3152 + }, + { + "epoch": 0.3, + "grad_norm": 0.31641860365957436, + "learning_rate": 0.00019449106289387048, + "loss": 1.0906, + "step": 3153 + }, + { + "epoch": 0.3, + "grad_norm": 0.2861114243921771, + "learning_rate": 0.00019448588354244227, + "loss": 1.1436, + "step": 3154 + }, + { + "epoch": 0.3, + "grad_norm": 0.2678359187873902, + "learning_rate": 0.0001944807018264454, + "loss": 1.0751, + "step": 3155 + }, + { + "epoch": 0.3, + "grad_norm": 0.2795381844318716, + "learning_rate": 0.00019447551774600958, + "loss": 1.0243, + "step": 3156 + }, + { + "epoch": 0.3, + "grad_norm": 0.2646096339194622, + "learning_rate": 0.00019447033130126458, + "loss": 1.0279, + "step": 3157 + }, + { + "epoch": 0.3, + "grad_norm": 0.295589829682441, + "learning_rate": 0.00019446514249234017, + "loss": 1.0735, + "step": 3158 + }, + { + "epoch": 0.3, + "grad_norm": 0.2980645819323912, + "learning_rate": 0.0001944599513193662, + "loss": 1.145, + "step": 3159 + }, + { + "epoch": 0.3, + "grad_norm": 0.3115102825868531, + "learning_rate": 0.00019445475778247256, + "loss": 1.1983, + "step": 3160 + }, + { + "epoch": 0.3, + "grad_norm": 0.2813320240238785, + "learning_rate": 0.00019444956188178927, + "loss": 1.1677, + "step": 3161 + }, + { + "epoch": 0.3, + "grad_norm": 0.23562367846120347, + "learning_rate": 0.00019444436361744632, + "loss": 1.1973, + "step": 3162 + }, + { + "epoch": 0.3, + "grad_norm": 0.2928636477767414, + "learning_rate": 0.0001944391629895738, + "loss": 1.1904, + "step": 3163 + }, + { + "epoch": 0.3, + "grad_norm": 0.2938628446043182, + "learning_rate": 0.0001944339599983019, + "loss": 1.1338, + "step": 3164 + }, + { + "epoch": 0.3, + "grad_norm": 0.2698493025290344, + "learning_rate": 0.00019442875464376077, + "loss": 1.085, + "step": 3165 + }, + { + "epoch": 0.3, + "grad_norm": 0.30254516206760285, + "learning_rate": 0.00019442354692608075, + "loss": 1.21, + "step": 3166 + }, + { + "epoch": 0.3, + "grad_norm": 0.26210762133120497, + "learning_rate": 0.0001944183368453921, + "loss": 1.0347, + "step": 3167 + }, + { + "epoch": 0.3, + "grad_norm": 0.2307520308777141, + "learning_rate": 0.00019441312440182524, + "loss": 1.0734, + "step": 3168 + }, + { + "epoch": 0.3, + "grad_norm": 0.256456644213979, + "learning_rate": 0.0001944079095955106, + "loss": 1.1074, + "step": 3169 + }, + { + "epoch": 0.3, + "grad_norm": 0.27238535013463366, + "learning_rate": 0.00019440269242657868, + "loss": 1.1605, + "step": 3170 + }, + { + "epoch": 0.3, + "grad_norm": 0.27536976098604626, + "learning_rate": 0.00019439747289516009, + "loss": 1.2405, + "step": 3171 + }, + { + "epoch": 0.3, + "grad_norm": 0.2537359030538808, + "learning_rate": 0.00019439225100138536, + "loss": 1.0907, + "step": 3172 + }, + { + "epoch": 0.3, + "grad_norm": 0.27291761429286016, + "learning_rate": 0.00019438702674538525, + "loss": 1.0893, + "step": 3173 + }, + { + "epoch": 0.3, + "grad_norm": 0.2650287732828565, + "learning_rate": 0.00019438180012729047, + "loss": 1.0594, + "step": 3174 + }, + { + "epoch": 0.3, + "grad_norm": 0.2566176933125399, + "learning_rate": 0.00019437657114723184, + "loss": 1.0371, + "step": 3175 + }, + { + "epoch": 0.3, + "grad_norm": 0.27936305636263997, + "learning_rate": 0.0001943713398053402, + "loss": 1.1312, + "step": 3176 + }, + { + "epoch": 0.3, + "grad_norm": 0.29167361844632245, + "learning_rate": 0.00019436610610174646, + "loss": 1.116, + "step": 3177 + }, + { + "epoch": 0.3, + "grad_norm": 0.28843098771577746, + "learning_rate": 0.00019436087003658163, + "loss": 1.0541, + "step": 3178 + }, + { + "epoch": 0.3, + "grad_norm": 0.32516725974523086, + "learning_rate": 0.0001943556316099767, + "loss": 1.1834, + "step": 3179 + }, + { + "epoch": 0.3, + "grad_norm": 0.2541028278026109, + "learning_rate": 0.0001943503908220628, + "loss": 1.1216, + "step": 3180 + }, + { + "epoch": 0.3, + "grad_norm": 0.2834034653405795, + "learning_rate": 0.00019434514767297108, + "loss": 1.2544, + "step": 3181 + }, + { + "epoch": 0.3, + "grad_norm": 0.30931710915213423, + "learning_rate": 0.00019433990216283274, + "loss": 1.0865, + "step": 3182 + }, + { + "epoch": 0.3, + "grad_norm": 0.2777726505717253, + "learning_rate": 0.00019433465429177904, + "loss": 1.0133, + "step": 3183 + }, + { + "epoch": 0.3, + "grad_norm": 0.2893309338829881, + "learning_rate": 0.00019432940405994135, + "loss": 1.1005, + "step": 3184 + }, + { + "epoch": 0.3, + "grad_norm": 0.2718000624288272, + "learning_rate": 0.00019432415146745103, + "loss": 0.9954, + "step": 3185 + }, + { + "epoch": 0.3, + "grad_norm": 0.2963971328585791, + "learning_rate": 0.00019431889651443953, + "loss": 1.0576, + "step": 3186 + }, + { + "epoch": 0.3, + "grad_norm": 0.2767798358909039, + "learning_rate": 0.00019431363920103837, + "loss": 1.1268, + "step": 3187 + }, + { + "epoch": 0.3, + "grad_norm": 0.31700853184697214, + "learning_rate": 0.00019430837952737914, + "loss": 1.061, + "step": 3188 + }, + { + "epoch": 0.31, + "grad_norm": 0.25763423615478015, + "learning_rate": 0.0001943031174935934, + "loss": 1.0927, + "step": 3189 + }, + { + "epoch": 0.31, + "grad_norm": 0.2610139408028157, + "learning_rate": 0.00019429785309981292, + "loss": 1.0666, + "step": 3190 + }, + { + "epoch": 0.31, + "grad_norm": 0.2884216831748455, + "learning_rate": 0.00019429258634616941, + "loss": 1.064, + "step": 3191 + }, + { + "epoch": 0.31, + "grad_norm": 0.2502879838444048, + "learning_rate": 0.00019428731723279463, + "loss": 1.1431, + "step": 3192 + }, + { + "epoch": 0.31, + "grad_norm": 0.25880669047725313, + "learning_rate": 0.0001942820457598205, + "loss": 1.0648, + "step": 3193 + }, + { + "epoch": 0.31, + "grad_norm": 0.251763418883559, + "learning_rate": 0.0001942767719273789, + "loss": 1.095, + "step": 3194 + }, + { + "epoch": 0.31, + "grad_norm": 0.28454723290611905, + "learning_rate": 0.00019427149573560183, + "loss": 1.0639, + "step": 3195 + }, + { + "epoch": 0.31, + "grad_norm": 0.2740113209524548, + "learning_rate": 0.00019426621718462137, + "loss": 1.0383, + "step": 3196 + }, + { + "epoch": 0.31, + "grad_norm": 0.2894651427378075, + "learning_rate": 0.00019426093627456954, + "loss": 1.0393, + "step": 3197 + }, + { + "epoch": 0.31, + "grad_norm": 0.272356851821048, + "learning_rate": 0.00019425565300557857, + "loss": 1.0492, + "step": 3198 + }, + { + "epoch": 0.31, + "grad_norm": 0.2834745514772499, + "learning_rate": 0.00019425036737778063, + "loss": 1.1115, + "step": 3199 + }, + { + "epoch": 0.31, + "grad_norm": 0.290735855354118, + "learning_rate": 0.00019424507939130802, + "loss": 1.1519, + "step": 3200 + }, + { + "epoch": 0.31, + "grad_norm": 0.26309247215034526, + "learning_rate": 0.00019423978904629303, + "loss": 1.1589, + "step": 3201 + }, + { + "epoch": 0.31, + "grad_norm": 0.3034788416160877, + "learning_rate": 0.00019423449634286812, + "loss": 1.0927, + "step": 3202 + }, + { + "epoch": 0.31, + "grad_norm": 0.24953378202378695, + "learning_rate": 0.00019422920128116573, + "loss": 1.0734, + "step": 3203 + }, + { + "epoch": 0.31, + "grad_norm": 0.30391403166399206, + "learning_rate": 0.00019422390386131835, + "loss": 1.1223, + "step": 3204 + }, + { + "epoch": 0.31, + "grad_norm": 0.26768113646614333, + "learning_rate": 0.00019421860408345856, + "loss": 1.074, + "step": 3205 + }, + { + "epoch": 0.31, + "grad_norm": 0.26968925174933783, + "learning_rate": 0.000194213301947719, + "loss": 1.1126, + "step": 3206 + }, + { + "epoch": 0.31, + "grad_norm": 0.25338870451527706, + "learning_rate": 0.0001942079974542323, + "loss": 1.2239, + "step": 3207 + }, + { + "epoch": 0.31, + "grad_norm": 0.27878381417056575, + "learning_rate": 0.0001942026906031313, + "loss": 1.184, + "step": 3208 + }, + { + "epoch": 0.31, + "grad_norm": 0.25893716244148623, + "learning_rate": 0.00019419738139454874, + "loss": 1.1045, + "step": 3209 + }, + { + "epoch": 0.31, + "grad_norm": 0.2657479758737288, + "learning_rate": 0.0001941920698286175, + "loss": 1.0898, + "step": 3210 + }, + { + "epoch": 0.31, + "grad_norm": 0.274528853814968, + "learning_rate": 0.00019418675590547054, + "loss": 1.2649, + "step": 3211 + }, + { + "epoch": 0.31, + "grad_norm": 0.28015720519786214, + "learning_rate": 0.00019418143962524084, + "loss": 1.1167, + "step": 3212 + }, + { + "epoch": 0.31, + "grad_norm": 0.26974290585106064, + "learning_rate": 0.00019417612098806137, + "loss": 1.0439, + "step": 3213 + }, + { + "epoch": 0.31, + "grad_norm": 0.26730225585717254, + "learning_rate": 0.00019417079999406532, + "loss": 1.1091, + "step": 3214 + }, + { + "epoch": 0.31, + "grad_norm": 0.24578940286887485, + "learning_rate": 0.0001941654766433858, + "loss": 1.1115, + "step": 3215 + }, + { + "epoch": 0.31, + "grad_norm": 0.24916795650749093, + "learning_rate": 0.00019416015093615604, + "loss": 0.9763, + "step": 3216 + }, + { + "epoch": 0.31, + "grad_norm": 0.3032785764886797, + "learning_rate": 0.00019415482287250935, + "loss": 1.0748, + "step": 3217 + }, + { + "epoch": 0.31, + "grad_norm": 0.2840877489578721, + "learning_rate": 0.00019414949245257903, + "loss": 1.0943, + "step": 3218 + }, + { + "epoch": 0.31, + "grad_norm": 0.2721658275584308, + "learning_rate": 0.0001941441596764985, + "loss": 1.0798, + "step": 3219 + }, + { + "epoch": 0.31, + "grad_norm": 0.27711342122729105, + "learning_rate": 0.00019413882454440118, + "loss": 1.0857, + "step": 3220 + }, + { + "epoch": 0.31, + "grad_norm": 0.264240156198682, + "learning_rate": 0.00019413348705642065, + "loss": 1.1476, + "step": 3221 + }, + { + "epoch": 0.31, + "grad_norm": 0.2672728525407875, + "learning_rate": 0.00019412814721269042, + "loss": 1.1006, + "step": 3222 + }, + { + "epoch": 0.31, + "grad_norm": 0.2790227898377365, + "learning_rate": 0.00019412280501334418, + "loss": 1.0214, + "step": 3223 + }, + { + "epoch": 0.31, + "grad_norm": 0.26686766116618904, + "learning_rate": 0.00019411746045851553, + "loss": 1.0939, + "step": 3224 + }, + { + "epoch": 0.31, + "grad_norm": 0.26689843862224927, + "learning_rate": 0.00019411211354833832, + "loss": 1.1118, + "step": 3225 + }, + { + "epoch": 0.31, + "grad_norm": 0.2645315881506633, + "learning_rate": 0.00019410676428294633, + "loss": 1.064, + "step": 3226 + }, + { + "epoch": 0.31, + "grad_norm": 0.27750130260616634, + "learning_rate": 0.00019410141266247338, + "loss": 1.1626, + "step": 3227 + }, + { + "epoch": 0.31, + "grad_norm": 0.2739213945232073, + "learning_rate": 0.0001940960586870535, + "loss": 1.0854, + "step": 3228 + }, + { + "epoch": 0.31, + "grad_norm": 0.28287550537217365, + "learning_rate": 0.00019409070235682055, + "loss": 1.0474, + "step": 3229 + }, + { + "epoch": 0.31, + "grad_norm": 0.2731751302387, + "learning_rate": 0.0001940853436719087, + "loss": 0.9977, + "step": 3230 + }, + { + "epoch": 0.31, + "grad_norm": 0.2950771929234802, + "learning_rate": 0.00019407998263245194, + "loss": 1.1031, + "step": 3231 + }, + { + "epoch": 0.31, + "grad_norm": 0.274205168506372, + "learning_rate": 0.0001940746192385845, + "loss": 1.0148, + "step": 3232 + }, + { + "epoch": 0.31, + "grad_norm": 0.24694847878806808, + "learning_rate": 0.0001940692534904406, + "loss": 1.0189, + "step": 3233 + }, + { + "epoch": 0.31, + "grad_norm": 0.2884458695976643, + "learning_rate": 0.00019406388538815454, + "loss": 1.0534, + "step": 3234 + }, + { + "epoch": 0.31, + "grad_norm": 0.25614325988090403, + "learning_rate": 0.0001940585149318606, + "loss": 1.1938, + "step": 3235 + }, + { + "epoch": 0.31, + "grad_norm": 0.28634252057649984, + "learning_rate": 0.0001940531421216932, + "loss": 1.1215, + "step": 3236 + }, + { + "epoch": 0.31, + "grad_norm": 0.26353357452427006, + "learning_rate": 0.00019404776695778684, + "loss": 1.0671, + "step": 3237 + }, + { + "epoch": 0.31, + "grad_norm": 0.252301189841136, + "learning_rate": 0.00019404238944027596, + "loss": 1.0318, + "step": 3238 + }, + { + "epoch": 0.31, + "grad_norm": 0.2921004823006027, + "learning_rate": 0.0001940370095692952, + "loss": 1.1743, + "step": 3239 + }, + { + "epoch": 0.31, + "grad_norm": 0.3010241530589875, + "learning_rate": 0.0001940316273449792, + "loss": 1.1167, + "step": 3240 + }, + { + "epoch": 0.31, + "grad_norm": 0.26470231728020255, + "learning_rate": 0.00019402624276746263, + "loss": 1.1322, + "step": 3241 + }, + { + "epoch": 0.31, + "grad_norm": 0.26518892722531195, + "learning_rate": 0.00019402085583688022, + "loss": 1.043, + "step": 3242 + }, + { + "epoch": 0.31, + "grad_norm": 0.28690580948021155, + "learning_rate": 0.0001940154665533668, + "loss": 1.0667, + "step": 3243 + }, + { + "epoch": 0.31, + "grad_norm": 0.2810395515843624, + "learning_rate": 0.00019401007491705725, + "loss": 0.9801, + "step": 3244 + }, + { + "epoch": 0.31, + "grad_norm": 0.2572499963401515, + "learning_rate": 0.00019400468092808647, + "loss": 1.159, + "step": 3245 + }, + { + "epoch": 0.31, + "grad_norm": 0.2644492737063085, + "learning_rate": 0.00019399928458658952, + "loss": 1.119, + "step": 3246 + }, + { + "epoch": 0.31, + "grad_norm": 0.2693774572143553, + "learning_rate": 0.00019399388589270134, + "loss": 1.1763, + "step": 3247 + }, + { + "epoch": 0.31, + "grad_norm": 0.3074129041443745, + "learning_rate": 0.00019398848484655714, + "loss": 1.1109, + "step": 3248 + }, + { + "epoch": 0.31, + "grad_norm": 0.22885608780661518, + "learning_rate": 0.00019398308144829202, + "loss": 1.1484, + "step": 3249 + }, + { + "epoch": 0.31, + "grad_norm": 0.2431406554688736, + "learning_rate": 0.0001939776756980412, + "loss": 1.1024, + "step": 3250 + }, + { + "epoch": 0.31, + "grad_norm": 0.2508813769274511, + "learning_rate": 0.00019397226759594003, + "loss": 1.1161, + "step": 3251 + }, + { + "epoch": 0.31, + "grad_norm": 0.259498174018731, + "learning_rate": 0.00019396685714212378, + "loss": 1.1121, + "step": 3252 + }, + { + "epoch": 0.31, + "grad_norm": 0.25690080144023086, + "learning_rate": 0.00019396144433672787, + "loss": 1.1951, + "step": 3253 + }, + { + "epoch": 0.31, + "grad_norm": 0.2703955152553523, + "learning_rate": 0.00019395602917988774, + "loss": 1.0848, + "step": 3254 + }, + { + "epoch": 0.31, + "grad_norm": 0.3248488191794343, + "learning_rate": 0.00019395061167173895, + "loss": 1.1507, + "step": 3255 + }, + { + "epoch": 0.31, + "grad_norm": 0.28991024777070024, + "learning_rate": 0.00019394519181241705, + "loss": 1.1015, + "step": 3256 + }, + { + "epoch": 0.31, + "grad_norm": 0.2764459982278797, + "learning_rate": 0.00019393976960205772, + "loss": 0.9972, + "step": 3257 + }, + { + "epoch": 0.31, + "grad_norm": 0.26862234853504874, + "learning_rate": 0.00019393434504079657, + "loss": 1.1453, + "step": 3258 + }, + { + "epoch": 0.31, + "grad_norm": 0.27971969886015396, + "learning_rate": 0.00019392891812876944, + "loss": 1.0825, + "step": 3259 + }, + { + "epoch": 0.31, + "grad_norm": 0.2770468588755897, + "learning_rate": 0.00019392348886611207, + "loss": 1.0536, + "step": 3260 + }, + { + "epoch": 0.31, + "grad_norm": 0.271609510880968, + "learning_rate": 0.00019391805725296038, + "loss": 1.0481, + "step": 3261 + }, + { + "epoch": 0.31, + "grad_norm": 0.24387465904724714, + "learning_rate": 0.00019391262328945027, + "loss": 1.0953, + "step": 3262 + }, + { + "epoch": 0.31, + "grad_norm": 0.306543464049848, + "learning_rate": 0.00019390718697571776, + "loss": 1.1486, + "step": 3263 + }, + { + "epoch": 0.31, + "grad_norm": 0.30439534530685314, + "learning_rate": 0.00019390174831189887, + "loss": 1.0251, + "step": 3264 + }, + { + "epoch": 0.31, + "grad_norm": 0.2702567545507524, + "learning_rate": 0.0001938963072981297, + "loss": 1.0219, + "step": 3265 + }, + { + "epoch": 0.31, + "grad_norm": 0.24990979733983004, + "learning_rate": 0.00019389086393454644, + "loss": 0.9841, + "step": 3266 + }, + { + "epoch": 0.31, + "grad_norm": 0.30401365328102387, + "learning_rate": 0.0001938854182212853, + "loss": 1.0608, + "step": 3267 + }, + { + "epoch": 0.31, + "grad_norm": 0.3051517353842027, + "learning_rate": 0.00019387997015848254, + "loss": 1.0624, + "step": 3268 + }, + { + "epoch": 0.31, + "grad_norm": 0.28665306616118164, + "learning_rate": 0.00019387451974627455, + "loss": 1.0742, + "step": 3269 + }, + { + "epoch": 0.31, + "grad_norm": 0.3231516980435964, + "learning_rate": 0.0001938690669847977, + "loss": 1.1872, + "step": 3270 + }, + { + "epoch": 0.31, + "grad_norm": 0.2956462625808931, + "learning_rate": 0.00019386361187418848, + "loss": 1.1729, + "step": 3271 + }, + { + "epoch": 0.31, + "grad_norm": 0.2657542153528369, + "learning_rate": 0.00019385815441458335, + "loss": 1.0359, + "step": 3272 + }, + { + "epoch": 0.31, + "grad_norm": 0.3012539639681833, + "learning_rate": 0.0001938526946061189, + "loss": 1.224, + "step": 3273 + }, + { + "epoch": 0.31, + "grad_norm": 0.23858077659378998, + "learning_rate": 0.00019384723244893182, + "loss": 0.9866, + "step": 3274 + }, + { + "epoch": 0.31, + "grad_norm": 0.28237967640012435, + "learning_rate": 0.00019384176794315876, + "loss": 1.1095, + "step": 3275 + }, + { + "epoch": 0.31, + "grad_norm": 0.2599923150922761, + "learning_rate": 0.0001938363010889365, + "loss": 1.1204, + "step": 3276 + }, + { + "epoch": 0.31, + "grad_norm": 0.2869091154710219, + "learning_rate": 0.00019383083188640178, + "loss": 1.0668, + "step": 3277 + }, + { + "epoch": 0.31, + "grad_norm": 0.31710794291071975, + "learning_rate": 0.00019382536033569155, + "loss": 1.0612, + "step": 3278 + }, + { + "epoch": 0.31, + "grad_norm": 0.29559893780311114, + "learning_rate": 0.0001938198864369427, + "loss": 1.077, + "step": 3279 + }, + { + "epoch": 0.31, + "grad_norm": 0.2898590799989514, + "learning_rate": 0.00019381441019029224, + "loss": 0.9822, + "step": 3280 + }, + { + "epoch": 0.31, + "grad_norm": 0.2653384099576511, + "learning_rate": 0.00019380893159587722, + "loss": 1.0328, + "step": 3281 + }, + { + "epoch": 0.31, + "grad_norm": 0.3059551368813422, + "learning_rate": 0.00019380345065383468, + "loss": 1.1349, + "step": 3282 + }, + { + "epoch": 0.31, + "grad_norm": 0.24886086837418994, + "learning_rate": 0.0001937979673643019, + "loss": 1.068, + "step": 3283 + }, + { + "epoch": 0.31, + "grad_norm": 0.23919413973427292, + "learning_rate": 0.000193792481727416, + "loss": 1.1181, + "step": 3284 + }, + { + "epoch": 0.31, + "grad_norm": 0.25404095588714376, + "learning_rate": 0.0001937869937433143, + "loss": 1.045, + "step": 3285 + }, + { + "epoch": 0.31, + "grad_norm": 0.223637661799907, + "learning_rate": 0.00019378150341213416, + "loss": 1.0104, + "step": 3286 + }, + { + "epoch": 0.31, + "grad_norm": 0.2779252243873626, + "learning_rate": 0.00019377601073401293, + "loss": 1.1353, + "step": 3287 + }, + { + "epoch": 0.31, + "grad_norm": 0.27508596502440386, + "learning_rate": 0.0001937705157090881, + "loss": 1.0028, + "step": 3288 + }, + { + "epoch": 0.31, + "grad_norm": 0.2829961966551457, + "learning_rate": 0.0001937650183374972, + "loss": 1.1252, + "step": 3289 + }, + { + "epoch": 0.31, + "grad_norm": 0.2547718896278451, + "learning_rate": 0.00019375951861937775, + "loss": 1.1001, + "step": 3290 + }, + { + "epoch": 0.31, + "grad_norm": 0.2386176095295127, + "learning_rate": 0.00019375401655486745, + "loss": 0.9711, + "step": 3291 + }, + { + "epoch": 0.31, + "grad_norm": 0.26234578344158516, + "learning_rate": 0.00019374851214410397, + "loss": 1.0339, + "step": 3292 + }, + { + "epoch": 0.32, + "grad_norm": 0.2919799926949464, + "learning_rate": 0.00019374300538722503, + "loss": 1.0436, + "step": 3293 + }, + { + "epoch": 0.32, + "grad_norm": 0.26522986184686614, + "learning_rate": 0.00019373749628436848, + "loss": 1.1032, + "step": 3294 + }, + { + "epoch": 0.32, + "grad_norm": 0.27747724141686647, + "learning_rate": 0.00019373198483567215, + "loss": 1.1199, + "step": 3295 + }, + { + "epoch": 0.32, + "grad_norm": 0.251558053545879, + "learning_rate": 0.00019372647104127401, + "loss": 1.0908, + "step": 3296 + }, + { + "epoch": 0.32, + "grad_norm": 0.2866203569622034, + "learning_rate": 0.00019372095490131206, + "loss": 1.1136, + "step": 3297 + }, + { + "epoch": 0.32, + "grad_norm": 0.26519901166294035, + "learning_rate": 0.00019371543641592427, + "loss": 1.0744, + "step": 3298 + }, + { + "epoch": 0.32, + "grad_norm": 0.2670922090903857, + "learning_rate": 0.0001937099155852488, + "loss": 1.0004, + "step": 3299 + }, + { + "epoch": 0.32, + "grad_norm": 0.2582252671213613, + "learning_rate": 0.0001937043924094238, + "loss": 1.0158, + "step": 3300 + }, + { + "epoch": 0.32, + "grad_norm": 0.2780671456060699, + "learning_rate": 0.00019369886688858746, + "loss": 1.0033, + "step": 3301 + }, + { + "epoch": 0.32, + "grad_norm": 0.26115496930707605, + "learning_rate": 0.00019369333902287812, + "loss": 1.1021, + "step": 3302 + }, + { + "epoch": 0.32, + "grad_norm": 0.2806226279994436, + "learning_rate": 0.00019368780881243408, + "loss": 1.0112, + "step": 3303 + }, + { + "epoch": 0.32, + "grad_norm": 0.2544906876251208, + "learning_rate": 0.00019368227625739376, + "loss": 1.1054, + "step": 3304 + }, + { + "epoch": 0.32, + "grad_norm": 0.2856721128498456, + "learning_rate": 0.00019367674135789559, + "loss": 1.1403, + "step": 3305 + }, + { + "epoch": 0.32, + "grad_norm": 0.28275774405141135, + "learning_rate": 0.00019367120411407807, + "loss": 1.1926, + "step": 3306 + }, + { + "epoch": 0.32, + "grad_norm": 0.30945016209372866, + "learning_rate": 0.00019366566452607984, + "loss": 1.0632, + "step": 3307 + }, + { + "epoch": 0.32, + "grad_norm": 0.28757906013236484, + "learning_rate": 0.00019366012259403945, + "loss": 1.0334, + "step": 3308 + }, + { + "epoch": 0.32, + "grad_norm": 0.28623289799348706, + "learning_rate": 0.00019365457831809564, + "loss": 1.0065, + "step": 3309 + }, + { + "epoch": 0.32, + "grad_norm": 0.31283779658466215, + "learning_rate": 0.00019364903169838714, + "loss": 1.1444, + "step": 3310 + }, + { + "epoch": 0.32, + "grad_norm": 0.25353872879290523, + "learning_rate": 0.0001936434827350528, + "loss": 1.1551, + "step": 3311 + }, + { + "epoch": 0.32, + "grad_norm": 0.28420385173740026, + "learning_rate": 0.00019363793142823142, + "loss": 1.0866, + "step": 3312 + }, + { + "epoch": 0.32, + "grad_norm": 0.2756468967799023, + "learning_rate": 0.00019363237777806193, + "loss": 1.1621, + "step": 3313 + }, + { + "epoch": 0.32, + "grad_norm": 0.2725051208626741, + "learning_rate": 0.0001936268217846834, + "loss": 1.0359, + "step": 3314 + }, + { + "epoch": 0.32, + "grad_norm": 0.2785478077945499, + "learning_rate": 0.0001936212634482348, + "loss": 1.1558, + "step": 3315 + }, + { + "epoch": 0.32, + "grad_norm": 0.2506394562927538, + "learning_rate": 0.00019361570276885522, + "loss": 1.1897, + "step": 3316 + }, + { + "epoch": 0.32, + "grad_norm": 0.2606058706252523, + "learning_rate": 0.00019361013974668385, + "loss": 1.1776, + "step": 3317 + }, + { + "epoch": 0.32, + "grad_norm": 0.2496048710743572, + "learning_rate": 0.0001936045743818599, + "loss": 0.9721, + "step": 3318 + }, + { + "epoch": 0.32, + "grad_norm": 0.26993436748357164, + "learning_rate": 0.00019359900667452264, + "loss": 1.104, + "step": 3319 + }, + { + "epoch": 0.32, + "grad_norm": 0.255141548047229, + "learning_rate": 0.0001935934366248114, + "loss": 1.0802, + "step": 3320 + }, + { + "epoch": 0.32, + "grad_norm": 0.27647316322378807, + "learning_rate": 0.00019358786423286564, + "loss": 1.0241, + "step": 3321 + }, + { + "epoch": 0.32, + "grad_norm": 0.2923601566113196, + "learning_rate": 0.00019358228949882474, + "loss": 1.0406, + "step": 3322 + }, + { + "epoch": 0.32, + "grad_norm": 0.24263042595969897, + "learning_rate": 0.00019357671242282821, + "loss": 0.9864, + "step": 3323 + }, + { + "epoch": 0.32, + "grad_norm": 0.27774846850692353, + "learning_rate": 0.00019357113300501566, + "loss": 1.0937, + "step": 3324 + }, + { + "epoch": 0.32, + "grad_norm": 0.27923854319931557, + "learning_rate": 0.0001935655512455267, + "loss": 1.0601, + "step": 3325 + }, + { + "epoch": 0.32, + "grad_norm": 0.3061443475173794, + "learning_rate": 0.000193559967144501, + "loss": 1.2078, + "step": 3326 + }, + { + "epoch": 0.32, + "grad_norm": 0.2846307312850083, + "learning_rate": 0.00019355438070207834, + "loss": 1.0301, + "step": 3327 + }, + { + "epoch": 0.32, + "grad_norm": 0.27792518753199547, + "learning_rate": 0.0001935487919183985, + "loss": 1.0656, + "step": 3328 + }, + { + "epoch": 0.32, + "grad_norm": 0.2845450991292036, + "learning_rate": 0.00019354320079360132, + "loss": 1.0507, + "step": 3329 + }, + { + "epoch": 0.32, + "grad_norm": 0.31509060671206557, + "learning_rate": 0.0001935376073278268, + "loss": 1.1854, + "step": 3330 + }, + { + "epoch": 0.32, + "grad_norm": 0.27538788262531033, + "learning_rate": 0.00019353201152121484, + "loss": 1.0761, + "step": 3331 + }, + { + "epoch": 0.32, + "grad_norm": 0.3085113970962017, + "learning_rate": 0.00019352641337390552, + "loss": 1.0149, + "step": 3332 + }, + { + "epoch": 0.32, + "grad_norm": 0.2649187417930221, + "learning_rate": 0.00019352081288603895, + "loss": 0.9846, + "step": 3333 + }, + { + "epoch": 0.32, + "grad_norm": 0.2702951480669255, + "learning_rate": 0.0001935152100577552, + "loss": 1.1289, + "step": 3334 + }, + { + "epoch": 0.32, + "grad_norm": 0.2587123130961769, + "learning_rate": 0.00019350960488919458, + "loss": 1.1603, + "step": 3335 + }, + { + "epoch": 0.32, + "grad_norm": 0.26093692625675025, + "learning_rate": 0.00019350399738049735, + "loss": 1.189, + "step": 3336 + }, + { + "epoch": 0.32, + "grad_norm": 0.2571900526712086, + "learning_rate": 0.0001934983875318038, + "loss": 1.1219, + "step": 3337 + }, + { + "epoch": 0.32, + "grad_norm": 0.2823069020729405, + "learning_rate": 0.0001934927753432543, + "loss": 1.0871, + "step": 3338 + }, + { + "epoch": 0.32, + "grad_norm": 0.29397830877053815, + "learning_rate": 0.00019348716081498942, + "loss": 1.0944, + "step": 3339 + }, + { + "epoch": 0.32, + "grad_norm": 0.29592501153858697, + "learning_rate": 0.00019348154394714952, + "loss": 1.0847, + "step": 3340 + }, + { + "epoch": 0.32, + "grad_norm": 0.27462052813299775, + "learning_rate": 0.00019347592473987528, + "loss": 1.1752, + "step": 3341 + }, + { + "epoch": 0.32, + "grad_norm": 0.24616995149088777, + "learning_rate": 0.00019347030319330727, + "loss": 1.1025, + "step": 3342 + }, + { + "epoch": 0.32, + "grad_norm": 0.27129435980716926, + "learning_rate": 0.00019346467930758614, + "loss": 1.1367, + "step": 3343 + }, + { + "epoch": 0.32, + "grad_norm": 0.27336188709510006, + "learning_rate": 0.0001934590530828527, + "loss": 1.1193, + "step": 3344 + }, + { + "epoch": 0.32, + "grad_norm": 0.27826604163520996, + "learning_rate": 0.0001934534245192477, + "loss": 1.1751, + "step": 3345 + }, + { + "epoch": 0.32, + "grad_norm": 0.2998027219687268, + "learning_rate": 0.00019344779361691203, + "loss": 1.1368, + "step": 3346 + }, + { + "epoch": 0.32, + "grad_norm": 0.28056799541807836, + "learning_rate": 0.0001934421603759866, + "loss": 1.0334, + "step": 3347 + }, + { + "epoch": 0.32, + "grad_norm": 0.24723826046732345, + "learning_rate": 0.00019343652479661237, + "loss": 1.0881, + "step": 3348 + }, + { + "epoch": 0.32, + "grad_norm": 0.3324987905537479, + "learning_rate": 0.00019343088687893037, + "loss": 1.0454, + "step": 3349 + }, + { + "epoch": 0.32, + "grad_norm": 0.2612587061661008, + "learning_rate": 0.00019342524662308174, + "loss": 1.1285, + "step": 3350 + }, + { + "epoch": 0.32, + "grad_norm": 0.3175396328178754, + "learning_rate": 0.0001934196040292076, + "loss": 1.1002, + "step": 3351 + }, + { + "epoch": 0.32, + "grad_norm": 0.2982419959527945, + "learning_rate": 0.00019341395909744914, + "loss": 1.1169, + "step": 3352 + }, + { + "epoch": 0.32, + "grad_norm": 0.2665630466583381, + "learning_rate": 0.00019340831182794763, + "loss": 1.0946, + "step": 3353 + }, + { + "epoch": 0.32, + "grad_norm": 0.2747715419393994, + "learning_rate": 0.00019340266222084445, + "loss": 1.0806, + "step": 3354 + }, + { + "epoch": 0.32, + "grad_norm": 0.26678051390620816, + "learning_rate": 0.00019339701027628093, + "loss": 1.0541, + "step": 3355 + }, + { + "epoch": 0.32, + "grad_norm": 0.2598902398444097, + "learning_rate": 0.00019339135599439852, + "loss": 1.0699, + "step": 3356 + }, + { + "epoch": 0.32, + "grad_norm": 0.254933790104563, + "learning_rate": 0.00019338569937533872, + "loss": 1.174, + "step": 3357 + }, + { + "epoch": 0.32, + "grad_norm": 0.3128276440616595, + "learning_rate": 0.00019338004041924314, + "loss": 1.1016, + "step": 3358 + }, + { + "epoch": 0.32, + "grad_norm": 0.296430797427781, + "learning_rate": 0.00019337437912625332, + "loss": 1.108, + "step": 3359 + }, + { + "epoch": 0.32, + "grad_norm": 0.2970936872836418, + "learning_rate": 0.00019336871549651102, + "loss": 1.0936, + "step": 3360 + }, + { + "epoch": 0.32, + "grad_norm": 0.30049312775342935, + "learning_rate": 0.0001933630495301579, + "loss": 1.1077, + "step": 3361 + }, + { + "epoch": 0.32, + "grad_norm": 0.25869029149150324, + "learning_rate": 0.0001933573812273358, + "loss": 1.1524, + "step": 3362 + }, + { + "epoch": 0.32, + "grad_norm": 0.25180878439069226, + "learning_rate": 0.00019335171058818657, + "loss": 0.9934, + "step": 3363 + }, + { + "epoch": 0.32, + "grad_norm": 0.2582499973976482, + "learning_rate": 0.0001933460376128521, + "loss": 1.0993, + "step": 3364 + }, + { + "epoch": 0.32, + "grad_norm": 0.27195751975656457, + "learning_rate": 0.0001933403623014744, + "loss": 1.0789, + "step": 3365 + }, + { + "epoch": 0.32, + "grad_norm": 0.2725473211575861, + "learning_rate": 0.00019333468465419545, + "loss": 1.1782, + "step": 3366 + }, + { + "epoch": 0.32, + "grad_norm": 0.2696499000977069, + "learning_rate": 0.00019332900467115735, + "loss": 1.079, + "step": 3367 + }, + { + "epoch": 0.32, + "grad_norm": 0.26468748026222183, + "learning_rate": 0.00019332332235250227, + "loss": 1.0032, + "step": 3368 + }, + { + "epoch": 0.32, + "grad_norm": 0.27136388855246457, + "learning_rate": 0.00019331763769837239, + "loss": 1.1859, + "step": 3369 + }, + { + "epoch": 0.32, + "grad_norm": 0.3147503114706967, + "learning_rate": 0.00019331195070890997, + "loss": 0.9721, + "step": 3370 + }, + { + "epoch": 0.32, + "grad_norm": 0.26623718570346866, + "learning_rate": 0.00019330626138425733, + "loss": 1.1342, + "step": 3371 + }, + { + "epoch": 0.32, + "grad_norm": 0.238352475784514, + "learning_rate": 0.0001933005697245569, + "loss": 1.0439, + "step": 3372 + }, + { + "epoch": 0.32, + "grad_norm": 0.28164020535606377, + "learning_rate": 0.00019329487572995104, + "loss": 1.0531, + "step": 3373 + }, + { + "epoch": 0.32, + "grad_norm": 0.2798281524042237, + "learning_rate": 0.0001932891794005823, + "loss": 1.0947, + "step": 3374 + }, + { + "epoch": 0.32, + "grad_norm": 0.2593240400068639, + "learning_rate": 0.0001932834807365932, + "loss": 1.3053, + "step": 3375 + }, + { + "epoch": 0.32, + "grad_norm": 0.2956714093140169, + "learning_rate": 0.0001932777797381264, + "loss": 1.1146, + "step": 3376 + }, + { + "epoch": 0.32, + "grad_norm": 0.24933340729423376, + "learning_rate": 0.0001932720764053245, + "loss": 1.0169, + "step": 3377 + }, + { + "epoch": 0.32, + "grad_norm": 0.2746804152121392, + "learning_rate": 0.0001932663707383303, + "loss": 1.2153, + "step": 3378 + }, + { + "epoch": 0.32, + "grad_norm": 0.2438039762627365, + "learning_rate": 0.00019326066273728652, + "loss": 1.1174, + "step": 3379 + }, + { + "epoch": 0.32, + "grad_norm": 0.2667417249557865, + "learning_rate": 0.00019325495240233608, + "loss": 1.1442, + "step": 3380 + }, + { + "epoch": 0.32, + "grad_norm": 0.282105093224216, + "learning_rate": 0.0001932492397336218, + "loss": 1.0328, + "step": 3381 + }, + { + "epoch": 0.32, + "grad_norm": 0.2700271902826537, + "learning_rate": 0.00019324352473128675, + "loss": 1.1098, + "step": 3382 + }, + { + "epoch": 0.32, + "grad_norm": 0.2810010796311137, + "learning_rate": 0.00019323780739547382, + "loss": 1.1228, + "step": 3383 + }, + { + "epoch": 0.32, + "grad_norm": 0.27543586143634324, + "learning_rate": 0.00019323208772632623, + "loss": 1.0899, + "step": 3384 + }, + { + "epoch": 0.32, + "grad_norm": 0.28148476530118366, + "learning_rate": 0.00019322636572398705, + "loss": 1.0954, + "step": 3385 + }, + { + "epoch": 0.32, + "grad_norm": 0.24691162930671992, + "learning_rate": 0.00019322064138859943, + "loss": 1.0638, + "step": 3386 + }, + { + "epoch": 0.32, + "grad_norm": 0.2766771037796505, + "learning_rate": 0.0001932149147203067, + "loss": 1.0988, + "step": 3387 + }, + { + "epoch": 0.32, + "grad_norm": 0.27879746565463825, + "learning_rate": 0.00019320918571925214, + "loss": 1.1285, + "step": 3388 + }, + { + "epoch": 0.32, + "grad_norm": 0.22878400180565586, + "learning_rate": 0.00019320345438557913, + "loss": 1.0721, + "step": 3389 + }, + { + "epoch": 0.32, + "grad_norm": 0.28209510083017475, + "learning_rate": 0.0001931977207194311, + "loss": 1.0092, + "step": 3390 + }, + { + "epoch": 0.32, + "grad_norm": 0.2718261063085623, + "learning_rate": 0.00019319198472095154, + "loss": 1.118, + "step": 3391 + }, + { + "epoch": 0.32, + "grad_norm": 0.29842791503503757, + "learning_rate": 0.00019318624639028397, + "loss": 1.0923, + "step": 3392 + }, + { + "epoch": 0.32, + "grad_norm": 0.24518361819708703, + "learning_rate": 0.00019318050572757206, + "loss": 1.0292, + "step": 3393 + }, + { + "epoch": 0.32, + "grad_norm": 0.24792628965630142, + "learning_rate": 0.00019317476273295937, + "loss": 1.0833, + "step": 3394 + }, + { + "epoch": 0.32, + "grad_norm": 0.29078999381758364, + "learning_rate": 0.00019316901740658974, + "loss": 1.1031, + "step": 3395 + }, + { + "epoch": 0.32, + "grad_norm": 0.2567096843477097, + "learning_rate": 0.00019316326974860688, + "loss": 1.0013, + "step": 3396 + }, + { + "epoch": 0.32, + "grad_norm": 0.2554741348697831, + "learning_rate": 0.00019315751975915464, + "loss": 1.0173, + "step": 3397 + }, + { + "epoch": 0.33, + "grad_norm": 0.28242380704250564, + "learning_rate": 0.00019315176743837692, + "loss": 1.1378, + "step": 3398 + }, + { + "epoch": 0.33, + "grad_norm": 0.31001384373991414, + "learning_rate": 0.00019314601278641767, + "loss": 1.1999, + "step": 3399 + }, + { + "epoch": 0.33, + "grad_norm": 0.2833937019206038, + "learning_rate": 0.0001931402558034209, + "loss": 1.1491, + "step": 3400 + }, + { + "epoch": 0.33, + "grad_norm": 0.3026457744535161, + "learning_rate": 0.00019313449648953075, + "loss": 1.2268, + "step": 3401 + }, + { + "epoch": 0.33, + "grad_norm": 0.26414264616917044, + "learning_rate": 0.00019312873484489122, + "loss": 1.0955, + "step": 3402 + }, + { + "epoch": 0.33, + "grad_norm": 0.24532958588409323, + "learning_rate": 0.0001931229708696466, + "loss": 1.0478, + "step": 3403 + }, + { + "epoch": 0.33, + "grad_norm": 0.30331695478379483, + "learning_rate": 0.00019311720456394115, + "loss": 1.0953, + "step": 3404 + }, + { + "epoch": 0.33, + "grad_norm": 0.28074250597379036, + "learning_rate": 0.00019311143592791908, + "loss": 1.1166, + "step": 3405 + }, + { + "epoch": 0.33, + "grad_norm": 0.2409279290445585, + "learning_rate": 0.00019310566496172482, + "loss": 1.1022, + "step": 3406 + }, + { + "epoch": 0.33, + "grad_norm": 0.27337622876374895, + "learning_rate": 0.00019309989166550276, + "loss": 1.0369, + "step": 3407 + }, + { + "epoch": 0.33, + "grad_norm": 0.27239368467190694, + "learning_rate": 0.00019309411603939746, + "loss": 1.0825, + "step": 3408 + }, + { + "epoch": 0.33, + "grad_norm": 0.2510203565258273, + "learning_rate": 0.00019308833808355335, + "loss": 1.1297, + "step": 3409 + }, + { + "epoch": 0.33, + "grad_norm": 0.3019608047256465, + "learning_rate": 0.0001930825577981151, + "loss": 1.0537, + "step": 3410 + }, + { + "epoch": 0.33, + "grad_norm": 0.27782305027611853, + "learning_rate": 0.00019307677518322732, + "loss": 1.0568, + "step": 3411 + }, + { + "epoch": 0.33, + "grad_norm": 0.27292941976306373, + "learning_rate": 0.00019307099023903475, + "loss": 1.1049, + "step": 3412 + }, + { + "epoch": 0.33, + "grad_norm": 0.31644950630512886, + "learning_rate": 0.00019306520296568213, + "loss": 0.9, + "step": 3413 + }, + { + "epoch": 0.33, + "grad_norm": 0.25546779805105374, + "learning_rate": 0.00019305941336331437, + "loss": 1.1242, + "step": 3414 + }, + { + "epoch": 0.33, + "grad_norm": 0.2792316426100012, + "learning_rate": 0.00019305362143207629, + "loss": 1.0101, + "step": 3415 + }, + { + "epoch": 0.33, + "grad_norm": 0.28856119813600223, + "learning_rate": 0.00019304782717211282, + "loss": 1.0683, + "step": 3416 + }, + { + "epoch": 0.33, + "grad_norm": 0.2538864469655074, + "learning_rate": 0.00019304203058356903, + "loss": 1.0736, + "step": 3417 + }, + { + "epoch": 0.33, + "grad_norm": 0.27697511534173397, + "learning_rate": 0.00019303623166658994, + "loss": 1.0237, + "step": 3418 + }, + { + "epoch": 0.33, + "grad_norm": 0.2816777125591762, + "learning_rate": 0.00019303043042132067, + "loss": 0.9735, + "step": 3419 + }, + { + "epoch": 0.33, + "grad_norm": 0.270336650155053, + "learning_rate": 0.00019302462684790643, + "loss": 1.0452, + "step": 3420 + }, + { + "epoch": 0.33, + "grad_norm": 0.3171677773770914, + "learning_rate": 0.00019301882094649244, + "loss": 1.0691, + "step": 3421 + }, + { + "epoch": 0.33, + "grad_norm": 0.30382729652830914, + "learning_rate": 0.00019301301271722397, + "loss": 1.1365, + "step": 3422 + }, + { + "epoch": 0.33, + "grad_norm": 0.2811965071217192, + "learning_rate": 0.00019300720216024642, + "loss": 1.0391, + "step": 3423 + }, + { + "epoch": 0.33, + "grad_norm": 0.2525300639055227, + "learning_rate": 0.00019300138927570517, + "loss": 1.1168, + "step": 3424 + }, + { + "epoch": 0.33, + "grad_norm": 0.2566849695783432, + "learning_rate": 0.00019299557406374574, + "loss": 1.0444, + "step": 3425 + }, + { + "epoch": 0.33, + "grad_norm": 0.29423746734781697, + "learning_rate": 0.00019298975652451357, + "loss": 1.1018, + "step": 3426 + }, + { + "epoch": 0.33, + "grad_norm": 0.27988383125452326, + "learning_rate": 0.00019298393665815434, + "loss": 1.0843, + "step": 3427 + }, + { + "epoch": 0.33, + "grad_norm": 0.2914047802879343, + "learning_rate": 0.00019297811446481364, + "loss": 1.1238, + "step": 3428 + }, + { + "epoch": 0.33, + "grad_norm": 0.26554271237108157, + "learning_rate": 0.0001929722899446372, + "loss": 1.0654, + "step": 3429 + }, + { + "epoch": 0.33, + "grad_norm": 0.2626759072606683, + "learning_rate": 0.00019296646309777078, + "loss": 1.0299, + "step": 3430 + }, + { + "epoch": 0.33, + "grad_norm": 0.24310810286273438, + "learning_rate": 0.00019296063392436016, + "loss": 1.1926, + "step": 3431 + }, + { + "epoch": 0.33, + "grad_norm": 0.2681389299412346, + "learning_rate": 0.0001929548024245513, + "loss": 1.0535, + "step": 3432 + }, + { + "epoch": 0.33, + "grad_norm": 0.2705554100674985, + "learning_rate": 0.00019294896859849007, + "loss": 1.1006, + "step": 3433 + }, + { + "epoch": 0.33, + "grad_norm": 0.29956584728704405, + "learning_rate": 0.00019294313244632246, + "loss": 1.1618, + "step": 3434 + }, + { + "epoch": 0.33, + "grad_norm": 0.25979972995620976, + "learning_rate": 0.00019293729396819455, + "loss": 1.1341, + "step": 3435 + }, + { + "epoch": 0.33, + "grad_norm": 0.31840805453945864, + "learning_rate": 0.0001929314531642525, + "loss": 1.0505, + "step": 3436 + }, + { + "epoch": 0.33, + "grad_norm": 0.30601505649557453, + "learning_rate": 0.0001929256100346424, + "loss": 1.1247, + "step": 3437 + }, + { + "epoch": 0.33, + "grad_norm": 0.255162069191932, + "learning_rate": 0.0001929197645795105, + "loss": 1.0914, + "step": 3438 + }, + { + "epoch": 0.33, + "grad_norm": 0.27220081852340894, + "learning_rate": 0.00019291391679900308, + "loss": 1.1204, + "step": 3439 + }, + { + "epoch": 0.33, + "grad_norm": 0.2680103064629152, + "learning_rate": 0.00019290806669326651, + "loss": 1.1374, + "step": 3440 + }, + { + "epoch": 0.33, + "grad_norm": 0.26907374179829274, + "learning_rate": 0.0001929022142624472, + "loss": 1.104, + "step": 3441 + }, + { + "epoch": 0.33, + "grad_norm": 0.25948281052343475, + "learning_rate": 0.00019289635950669158, + "loss": 1.1315, + "step": 3442 + }, + { + "epoch": 0.33, + "grad_norm": 0.25383699970065504, + "learning_rate": 0.00019289050242614616, + "loss": 1.0563, + "step": 3443 + }, + { + "epoch": 0.33, + "grad_norm": 0.28630705685350616, + "learning_rate": 0.00019288464302095757, + "loss": 1.0699, + "step": 3444 + }, + { + "epoch": 0.33, + "grad_norm": 0.2622608246535195, + "learning_rate": 0.00019287878129127238, + "loss": 1.0172, + "step": 3445 + }, + { + "epoch": 0.33, + "grad_norm": 0.27001089784818566, + "learning_rate": 0.00019287291723723735, + "loss": 0.9808, + "step": 3446 + }, + { + "epoch": 0.33, + "grad_norm": 0.3040076339208099, + "learning_rate": 0.00019286705085899916, + "loss": 1.2147, + "step": 3447 + }, + { + "epoch": 0.33, + "grad_norm": 0.2996187859781569, + "learning_rate": 0.00019286118215670471, + "loss": 1.1517, + "step": 3448 + }, + { + "epoch": 0.33, + "grad_norm": 0.33317490146125284, + "learning_rate": 0.00019285531113050075, + "loss": 1.0603, + "step": 3449 + }, + { + "epoch": 0.33, + "grad_norm": 0.2900875599565296, + "learning_rate": 0.00019284943778053433, + "loss": 1.1299, + "step": 3450 + }, + { + "epoch": 0.33, + "grad_norm": 0.28251347088813633, + "learning_rate": 0.00019284356210695234, + "loss": 1.1064, + "step": 3451 + }, + { + "epoch": 0.33, + "grad_norm": 0.28341004906992046, + "learning_rate": 0.00019283768410990185, + "loss": 1.0892, + "step": 3452 + }, + { + "epoch": 0.33, + "grad_norm": 0.2883977918976394, + "learning_rate": 0.00019283180378953, + "loss": 1.1733, + "step": 3453 + }, + { + "epoch": 0.33, + "grad_norm": 0.260390677587717, + "learning_rate": 0.0001928259211459839, + "loss": 0.9542, + "step": 3454 + }, + { + "epoch": 0.33, + "grad_norm": 0.29136611867774026, + "learning_rate": 0.0001928200361794108, + "loss": 1.0044, + "step": 3455 + }, + { + "epoch": 0.33, + "grad_norm": 0.26446944983636694, + "learning_rate": 0.00019281414888995795, + "loss": 1.083, + "step": 3456 + }, + { + "epoch": 0.33, + "grad_norm": 0.2483864576667101, + "learning_rate": 0.0001928082592777727, + "loss": 1.0629, + "step": 3457 + }, + { + "epoch": 0.33, + "grad_norm": 0.24787693382866996, + "learning_rate": 0.00019280236734300243, + "loss": 1.0367, + "step": 3458 + }, + { + "epoch": 0.33, + "grad_norm": 0.3038463165988747, + "learning_rate": 0.00019279647308579457, + "loss": 1.0523, + "step": 3459 + }, + { + "epoch": 0.33, + "grad_norm": 0.278602231033513, + "learning_rate": 0.00019279057650629667, + "loss": 1.168, + "step": 3460 + }, + { + "epoch": 0.33, + "grad_norm": 0.2488098065303038, + "learning_rate": 0.0001927846776046563, + "loss": 1.0209, + "step": 3461 + }, + { + "epoch": 0.33, + "grad_norm": 0.2758374803041767, + "learning_rate": 0.00019277877638102103, + "loss": 1.2443, + "step": 3462 + }, + { + "epoch": 0.33, + "grad_norm": 0.26309237485251713, + "learning_rate": 0.00019277287283553856, + "loss": 0.9914, + "step": 3463 + }, + { + "epoch": 0.33, + "grad_norm": 0.23795915412482324, + "learning_rate": 0.00019276696696835668, + "loss": 1.1086, + "step": 3464 + }, + { + "epoch": 0.33, + "grad_norm": 0.3224730190795291, + "learning_rate": 0.0001927610587796231, + "loss": 1.2036, + "step": 3465 + }, + { + "epoch": 0.33, + "grad_norm": 0.282804393013691, + "learning_rate": 0.00019275514826948577, + "loss": 1.137, + "step": 3466 + }, + { + "epoch": 0.33, + "grad_norm": 0.26066639065374003, + "learning_rate": 0.00019274923543809253, + "loss": 1.127, + "step": 3467 + }, + { + "epoch": 0.33, + "grad_norm": 0.24788428251360412, + "learning_rate": 0.00019274332028559142, + "loss": 1.0519, + "step": 3468 + }, + { + "epoch": 0.33, + "grad_norm": 0.290526276121983, + "learning_rate": 0.0001927374028121304, + "loss": 1.1116, + "step": 3469 + }, + { + "epoch": 0.33, + "grad_norm": 0.2673170033387275, + "learning_rate": 0.00019273148301785759, + "loss": 1.1649, + "step": 3470 + }, + { + "epoch": 0.33, + "grad_norm": 0.264616160691623, + "learning_rate": 0.00019272556090292115, + "loss": 1.1035, + "step": 3471 + }, + { + "epoch": 0.33, + "grad_norm": 0.22386779170530668, + "learning_rate": 0.00019271963646746927, + "loss": 1.0443, + "step": 3472 + }, + { + "epoch": 0.33, + "grad_norm": 0.29588656285029913, + "learning_rate": 0.00019271370971165022, + "loss": 1.1571, + "step": 3473 + }, + { + "epoch": 0.33, + "grad_norm": 0.26869130366990424, + "learning_rate": 0.00019270778063561233, + "loss": 0.8483, + "step": 3474 + }, + { + "epoch": 0.33, + "grad_norm": 0.2442917412236581, + "learning_rate": 0.00019270184923950395, + "loss": 1.1119, + "step": 3475 + }, + { + "epoch": 0.33, + "grad_norm": 0.27434823182121076, + "learning_rate": 0.00019269591552347352, + "loss": 1.0498, + "step": 3476 + }, + { + "epoch": 0.33, + "grad_norm": 0.26614974020743387, + "learning_rate": 0.00019268997948766956, + "loss": 1.0791, + "step": 3477 + }, + { + "epoch": 0.33, + "grad_norm": 0.2959411757891701, + "learning_rate": 0.00019268404113224059, + "loss": 1.1704, + "step": 3478 + }, + { + "epoch": 0.33, + "grad_norm": 0.26576214735143877, + "learning_rate": 0.00019267810045733527, + "loss": 1.0451, + "step": 3479 + }, + { + "epoch": 0.33, + "grad_norm": 0.26797549883523514, + "learning_rate": 0.00019267215746310222, + "loss": 1.1066, + "step": 3480 + }, + { + "epoch": 0.33, + "grad_norm": 0.26415504530857153, + "learning_rate": 0.0001926662121496902, + "loss": 0.9906, + "step": 3481 + }, + { + "epoch": 0.33, + "grad_norm": 0.2811191242637275, + "learning_rate": 0.000192660264517248, + "loss": 1.2062, + "step": 3482 + }, + { + "epoch": 0.33, + "grad_norm": 0.26967275386109973, + "learning_rate": 0.0001926543145659244, + "loss": 1.1001, + "step": 3483 + }, + { + "epoch": 0.33, + "grad_norm": 0.24658725442432267, + "learning_rate": 0.00019264836229586837, + "loss": 1.0698, + "step": 3484 + }, + { + "epoch": 0.33, + "grad_norm": 0.2716734887510684, + "learning_rate": 0.00019264240770722885, + "loss": 1.0129, + "step": 3485 + }, + { + "epoch": 0.33, + "grad_norm": 0.24634587706914243, + "learning_rate": 0.00019263645080015485, + "loss": 0.9294, + "step": 3486 + }, + { + "epoch": 0.33, + "grad_norm": 0.26665979653082733, + "learning_rate": 0.00019263049157479544, + "loss": 1.0361, + "step": 3487 + }, + { + "epoch": 0.33, + "grad_norm": 0.262098888665953, + "learning_rate": 0.0001926245300312998, + "loss": 1.0617, + "step": 3488 + }, + { + "epoch": 0.33, + "grad_norm": 0.27380325184122906, + "learning_rate": 0.00019261856616981703, + "loss": 1.0235, + "step": 3489 + }, + { + "epoch": 0.33, + "grad_norm": 0.25372809019534937, + "learning_rate": 0.00019261259999049646, + "loss": 1.0733, + "step": 3490 + }, + { + "epoch": 0.33, + "grad_norm": 0.2703271495420422, + "learning_rate": 0.00019260663149348736, + "loss": 1.1621, + "step": 3491 + }, + { + "epoch": 0.33, + "grad_norm": 0.285631485877341, + "learning_rate": 0.00019260066067893915, + "loss": 1.119, + "step": 3492 + }, + { + "epoch": 0.33, + "grad_norm": 0.28067090620266755, + "learning_rate": 0.00019259468754700114, + "loss": 1.1283, + "step": 3493 + }, + { + "epoch": 0.33, + "grad_norm": 0.25704313202950074, + "learning_rate": 0.00019258871209782292, + "loss": 1.1286, + "step": 3494 + }, + { + "epoch": 0.33, + "grad_norm": 0.29208835332994176, + "learning_rate": 0.00019258273433155399, + "loss": 1.1172, + "step": 3495 + }, + { + "epoch": 0.33, + "grad_norm": 0.26926728161587604, + "learning_rate": 0.00019257675424834395, + "loss": 1.0684, + "step": 3496 + }, + { + "epoch": 0.33, + "grad_norm": 0.2732931154751818, + "learning_rate": 0.00019257077184834244, + "loss": 1.1041, + "step": 3497 + }, + { + "epoch": 0.33, + "grad_norm": 0.26420765126059936, + "learning_rate": 0.00019256478713169917, + "loss": 1.0886, + "step": 3498 + }, + { + "epoch": 0.33, + "grad_norm": 0.26082579867661154, + "learning_rate": 0.00019255880009856396, + "loss": 1.0275, + "step": 3499 + }, + { + "epoch": 0.33, + "grad_norm": 0.28137431081329883, + "learning_rate": 0.0001925528107490866, + "loss": 1.0764, + "step": 3500 + }, + { + "epoch": 0.33, + "grad_norm": 0.26907207440559944, + "learning_rate": 0.00019254681908341696, + "loss": 1.0703, + "step": 3501 + }, + { + "epoch": 0.34, + "grad_norm": 0.31906228924539476, + "learning_rate": 0.00019254082510170503, + "loss": 1.0448, + "step": 3502 + }, + { + "epoch": 0.34, + "grad_norm": 0.27170761831755913, + "learning_rate": 0.0001925348288041008, + "loss": 1.1028, + "step": 3503 + }, + { + "epoch": 0.34, + "grad_norm": 0.2855022956911831, + "learning_rate": 0.00019252883019075433, + "loss": 1.0736, + "step": 3504 + }, + { + "epoch": 0.34, + "grad_norm": 0.2857537178954334, + "learning_rate": 0.0001925228292618157, + "loss": 1.0172, + "step": 3505 + }, + { + "epoch": 0.34, + "grad_norm": 0.26288096894393326, + "learning_rate": 0.0001925168260174351, + "loss": 1.0262, + "step": 3506 + }, + { + "epoch": 0.34, + "grad_norm": 0.2682150517518947, + "learning_rate": 0.00019251082045776283, + "loss": 1.0227, + "step": 3507 + }, + { + "epoch": 0.34, + "grad_norm": 0.28175497164571117, + "learning_rate": 0.00019250481258294911, + "loss": 1.0507, + "step": 3508 + }, + { + "epoch": 0.34, + "grad_norm": 0.28227600468816066, + "learning_rate": 0.00019249880239314435, + "loss": 1.1972, + "step": 3509 + }, + { + "epoch": 0.34, + "grad_norm": 0.2933560845393136, + "learning_rate": 0.0001924927898884989, + "loss": 1.0141, + "step": 3510 + }, + { + "epoch": 0.34, + "grad_norm": 0.2659979972720397, + "learning_rate": 0.0001924867750691633, + "loss": 1.1605, + "step": 3511 + }, + { + "epoch": 0.34, + "grad_norm": 0.28458190404309464, + "learning_rate": 0.00019248075793528794, + "loss": 1.1147, + "step": 3512 + }, + { + "epoch": 0.34, + "grad_norm": 0.30482251375794783, + "learning_rate": 0.00019247473848702358, + "loss": 1.2108, + "step": 3513 + }, + { + "epoch": 0.34, + "grad_norm": 0.30589628132765245, + "learning_rate": 0.00019246871672452072, + "loss": 1.1377, + "step": 3514 + }, + { + "epoch": 0.34, + "grad_norm": 0.2515859551958094, + "learning_rate": 0.00019246269264793013, + "loss": 1.0792, + "step": 3515 + }, + { + "epoch": 0.34, + "grad_norm": 0.2822974779571469, + "learning_rate": 0.00019245666625740252, + "loss": 1.1263, + "step": 3516 + }, + { + "epoch": 0.34, + "grad_norm": 0.26320433675688415, + "learning_rate": 0.00019245063755308873, + "loss": 1.0395, + "step": 3517 + }, + { + "epoch": 0.34, + "grad_norm": 0.27747602120555126, + "learning_rate": 0.00019244460653513966, + "loss": 1.0819, + "step": 3518 + }, + { + "epoch": 0.34, + "grad_norm": 0.27754855559046226, + "learning_rate": 0.00019243857320370622, + "loss": 1.1293, + "step": 3519 + }, + { + "epoch": 0.34, + "grad_norm": 0.2718497648773705, + "learning_rate": 0.00019243253755893934, + "loss": 1.0904, + "step": 3520 + }, + { + "epoch": 0.34, + "grad_norm": 0.24497317555871573, + "learning_rate": 0.00019242649960099018, + "loss": 1.0817, + "step": 3521 + }, + { + "epoch": 0.34, + "grad_norm": 0.2832235216362736, + "learning_rate": 0.00019242045933000974, + "loss": 1.1896, + "step": 3522 + }, + { + "epoch": 0.34, + "grad_norm": 0.2804650893498078, + "learning_rate": 0.00019241441674614925, + "loss": 1.0998, + "step": 3523 + }, + { + "epoch": 0.34, + "grad_norm": 0.2746480898368063, + "learning_rate": 0.00019240837184955986, + "loss": 1.1329, + "step": 3524 + }, + { + "epoch": 0.34, + "grad_norm": 0.27811869745054746, + "learning_rate": 0.0001924023246403929, + "loss": 1.2432, + "step": 3525 + }, + { + "epoch": 0.34, + "grad_norm": 0.2438034625107248, + "learning_rate": 0.0001923962751187997, + "loss": 1.0454, + "step": 3526 + }, + { + "epoch": 0.34, + "grad_norm": 0.29872712215291863, + "learning_rate": 0.00019239022328493166, + "loss": 1.0986, + "step": 3527 + }, + { + "epoch": 0.34, + "grad_norm": 0.28352993745178234, + "learning_rate": 0.00019238416913894022, + "loss": 1.0937, + "step": 3528 + }, + { + "epoch": 0.34, + "grad_norm": 0.2573940016691036, + "learning_rate": 0.00019237811268097685, + "loss": 1.061, + "step": 3529 + }, + { + "epoch": 0.34, + "grad_norm": 0.26547291167111237, + "learning_rate": 0.00019237205391119317, + "loss": 1.1062, + "step": 3530 + }, + { + "epoch": 0.34, + "grad_norm": 0.2434569652472095, + "learning_rate": 0.0001923659928297408, + "loss": 0.994, + "step": 3531 + }, + { + "epoch": 0.34, + "grad_norm": 0.24604489384038644, + "learning_rate": 0.00019235992943677138, + "loss": 1.0407, + "step": 3532 + }, + { + "epoch": 0.34, + "grad_norm": 0.2566897641461854, + "learning_rate": 0.0001923538637324367, + "loss": 1.0814, + "step": 3533 + }, + { + "epoch": 0.34, + "grad_norm": 0.2833118574755522, + "learning_rate": 0.00019234779571688856, + "loss": 1.0133, + "step": 3534 + }, + { + "epoch": 0.34, + "grad_norm": 0.2641589772291687, + "learning_rate": 0.00019234172539027875, + "loss": 1.0779, + "step": 3535 + }, + { + "epoch": 0.34, + "grad_norm": 0.26348102788720923, + "learning_rate": 0.00019233565275275926, + "loss": 1.0349, + "step": 3536 + }, + { + "epoch": 0.34, + "grad_norm": 0.26442584979280237, + "learning_rate": 0.00019232957780448203, + "loss": 1.0908, + "step": 3537 + }, + { + "epoch": 0.34, + "grad_norm": 0.21497138036257077, + "learning_rate": 0.00019232350054559908, + "loss": 1.0206, + "step": 3538 + }, + { + "epoch": 0.34, + "grad_norm": 0.28584115929317455, + "learning_rate": 0.00019231742097626248, + "loss": 1.0948, + "step": 3539 + }, + { + "epoch": 0.34, + "grad_norm": 0.28169323923794704, + "learning_rate": 0.00019231133909662442, + "loss": 1.1113, + "step": 3540 + }, + { + "epoch": 0.34, + "grad_norm": 0.27099757848452277, + "learning_rate": 0.0001923052549068371, + "loss": 1.1069, + "step": 3541 + }, + { + "epoch": 0.34, + "grad_norm": 0.2576793427814868, + "learning_rate": 0.00019229916840705276, + "loss": 0.9917, + "step": 3542 + }, + { + "epoch": 0.34, + "grad_norm": 0.261636637446303, + "learning_rate": 0.0001922930795974237, + "loss": 1.0584, + "step": 3543 + }, + { + "epoch": 0.34, + "grad_norm": 0.27616031671725766, + "learning_rate": 0.0001922869884781023, + "loss": 1.0674, + "step": 3544 + }, + { + "epoch": 0.34, + "grad_norm": 0.2603326493271169, + "learning_rate": 0.0001922808950492411, + "loss": 0.9925, + "step": 3545 + }, + { + "epoch": 0.34, + "grad_norm": 0.2611098162609211, + "learning_rate": 0.00019227479931099243, + "loss": 1.1144, + "step": 3546 + }, + { + "epoch": 0.34, + "grad_norm": 0.28213224868364634, + "learning_rate": 0.00019226870126350893, + "loss": 0.9999, + "step": 3547 + }, + { + "epoch": 0.34, + "grad_norm": 0.26909330897869277, + "learning_rate": 0.00019226260090694322, + "loss": 1.1827, + "step": 3548 + }, + { + "epoch": 0.34, + "grad_norm": 0.2204883513538407, + "learning_rate": 0.00019225649824144788, + "loss": 1.0633, + "step": 3549 + }, + { + "epoch": 0.34, + "grad_norm": 0.2800738248678148, + "learning_rate": 0.00019225039326717575, + "loss": 1.0618, + "step": 3550 + }, + { + "epoch": 0.34, + "grad_norm": 0.3026024017055852, + "learning_rate": 0.0001922442859842795, + "loss": 1.0372, + "step": 3551 + }, + { + "epoch": 0.34, + "grad_norm": 0.25052538631469556, + "learning_rate": 0.00019223817639291206, + "loss": 1.1017, + "step": 3552 + }, + { + "epoch": 0.34, + "grad_norm": 0.2534940085440102, + "learning_rate": 0.00019223206449322627, + "loss": 1.063, + "step": 3553 + }, + { + "epoch": 0.34, + "grad_norm": 0.2829769188702781, + "learning_rate": 0.0001922259502853751, + "loss": 1.0918, + "step": 3554 + }, + { + "epoch": 0.34, + "grad_norm": 0.2515647647546932, + "learning_rate": 0.0001922198337695116, + "loss": 1.1084, + "step": 3555 + }, + { + "epoch": 0.34, + "grad_norm": 0.2716176574803204, + "learning_rate": 0.00019221371494578874, + "loss": 1.1048, + "step": 3556 + }, + { + "epoch": 0.34, + "grad_norm": 0.2544586203933426, + "learning_rate": 0.00019220759381435976, + "loss": 0.9334, + "step": 3557 + }, + { + "epoch": 0.34, + "grad_norm": 0.25686564637963666, + "learning_rate": 0.00019220147037537775, + "loss": 1.1342, + "step": 3558 + }, + { + "epoch": 0.34, + "grad_norm": 0.2856418475083659, + "learning_rate": 0.00019219534462899603, + "loss": 1.0993, + "step": 3559 + }, + { + "epoch": 0.34, + "grad_norm": 0.25892037090159264, + "learning_rate": 0.00019218921657536785, + "loss": 1.1242, + "step": 3560 + }, + { + "epoch": 0.34, + "grad_norm": 0.22637468596243374, + "learning_rate": 0.00019218308621464657, + "loss": 0.9695, + "step": 3561 + }, + { + "epoch": 0.34, + "grad_norm": 0.24205716240527825, + "learning_rate": 0.00019217695354698566, + "loss": 0.9977, + "step": 3562 + }, + { + "epoch": 0.34, + "grad_norm": 0.28419771649107817, + "learning_rate": 0.00019217081857253855, + "loss": 1.1305, + "step": 3563 + }, + { + "epoch": 0.34, + "grad_norm": 0.2997025561180733, + "learning_rate": 0.00019216468129145878, + "loss": 1.1392, + "step": 3564 + }, + { + "epoch": 0.34, + "grad_norm": 0.2575883782828923, + "learning_rate": 0.00019215854170389992, + "loss": 1.1146, + "step": 3565 + }, + { + "epoch": 0.34, + "grad_norm": 0.2378156355843756, + "learning_rate": 0.00019215239981001565, + "loss": 1.1623, + "step": 3566 + }, + { + "epoch": 0.34, + "grad_norm": 0.22766346459594225, + "learning_rate": 0.00019214625560995963, + "loss": 1.1813, + "step": 3567 + }, + { + "epoch": 0.34, + "grad_norm": 0.2815971957249664, + "learning_rate": 0.0001921401091038857, + "loss": 1.0946, + "step": 3568 + }, + { + "epoch": 0.34, + "grad_norm": 0.24854801853444775, + "learning_rate": 0.0001921339602919476, + "loss": 1.1996, + "step": 3569 + }, + { + "epoch": 0.34, + "grad_norm": 0.28777374446942766, + "learning_rate": 0.00019212780917429923, + "loss": 1.2163, + "step": 3570 + }, + { + "epoch": 0.34, + "grad_norm": 0.2755310826907456, + "learning_rate": 0.00019212165575109452, + "loss": 1.2163, + "step": 3571 + }, + { + "epoch": 0.34, + "grad_norm": 0.23202223527427898, + "learning_rate": 0.00019211550002248755, + "loss": 1.0259, + "step": 3572 + }, + { + "epoch": 0.34, + "grad_norm": 0.25126040011653317, + "learning_rate": 0.00019210934198863225, + "loss": 1.1719, + "step": 3573 + }, + { + "epoch": 0.34, + "grad_norm": 0.31182851203253326, + "learning_rate": 0.00019210318164968276, + "loss": 0.9726, + "step": 3574 + }, + { + "epoch": 0.34, + "grad_norm": 0.30037163533010824, + "learning_rate": 0.00019209701900579332, + "loss": 1.1423, + "step": 3575 + }, + { + "epoch": 0.34, + "grad_norm": 0.2769715350587386, + "learning_rate": 0.00019209085405711806, + "loss": 1.023, + "step": 3576 + }, + { + "epoch": 0.34, + "grad_norm": 0.2737139517166542, + "learning_rate": 0.0001920846868038113, + "loss": 1.1156, + "step": 3577 + }, + { + "epoch": 0.34, + "grad_norm": 0.27215926972723625, + "learning_rate": 0.00019207851724602738, + "loss": 1.2292, + "step": 3578 + }, + { + "epoch": 0.34, + "grad_norm": 0.25589677103292885, + "learning_rate": 0.0001920723453839207, + "loss": 1.066, + "step": 3579 + }, + { + "epoch": 0.34, + "grad_norm": 0.27063989588024306, + "learning_rate": 0.00019206617121764573, + "loss": 1.1828, + "step": 3580 + }, + { + "epoch": 0.34, + "grad_norm": 0.2621987437844738, + "learning_rate": 0.00019205999474735695, + "loss": 1.113, + "step": 3581 + }, + { + "epoch": 0.34, + "grad_norm": 0.27664343721303936, + "learning_rate": 0.00019205381597320895, + "loss": 1.0935, + "step": 3582 + }, + { + "epoch": 0.34, + "grad_norm": 0.2521436526457091, + "learning_rate": 0.00019204763489535633, + "loss": 1.0474, + "step": 3583 + }, + { + "epoch": 0.34, + "grad_norm": 0.29893749568707256, + "learning_rate": 0.00019204145151395383, + "loss": 1.0565, + "step": 3584 + }, + { + "epoch": 0.34, + "grad_norm": 0.2865188185320406, + "learning_rate": 0.00019203526582915615, + "loss": 1.0099, + "step": 3585 + }, + { + "epoch": 0.34, + "grad_norm": 0.30840096394075645, + "learning_rate": 0.0001920290778411181, + "loss": 1.0744, + "step": 3586 + }, + { + "epoch": 0.34, + "grad_norm": 0.27719489340614456, + "learning_rate": 0.00019202288754999454, + "loss": 1.1818, + "step": 3587 + }, + { + "epoch": 0.34, + "grad_norm": 0.26038266306240404, + "learning_rate": 0.00019201669495594036, + "loss": 1.1681, + "step": 3588 + }, + { + "epoch": 0.34, + "grad_norm": 0.2535864656959433, + "learning_rate": 0.00019201050005911057, + "loss": 1.0594, + "step": 3589 + }, + { + "epoch": 0.34, + "grad_norm": 0.26996214011774267, + "learning_rate": 0.0001920043028596602, + "loss": 1.0319, + "step": 3590 + }, + { + "epoch": 0.34, + "grad_norm": 0.293644543486573, + "learning_rate": 0.00019199810335774432, + "loss": 1.0956, + "step": 3591 + }, + { + "epoch": 0.34, + "grad_norm": 0.24802640655532923, + "learning_rate": 0.0001919919015535181, + "loss": 1.028, + "step": 3592 + }, + { + "epoch": 0.34, + "grad_norm": 0.288494054514753, + "learning_rate": 0.0001919856974471367, + "loss": 1.2163, + "step": 3593 + }, + { + "epoch": 0.34, + "grad_norm": 0.28835626422546046, + "learning_rate": 0.00019197949103875542, + "loss": 1.1595, + "step": 3594 + }, + { + "epoch": 0.34, + "grad_norm": 0.2834452761231032, + "learning_rate": 0.00019197328232852957, + "loss": 1.2283, + "step": 3595 + }, + { + "epoch": 0.34, + "grad_norm": 0.2712391659872608, + "learning_rate": 0.00019196707131661456, + "loss": 0.9865, + "step": 3596 + }, + { + "epoch": 0.34, + "grad_norm": 0.28960322854085857, + "learning_rate": 0.00019196085800316577, + "loss": 1.0389, + "step": 3597 + }, + { + "epoch": 0.34, + "grad_norm": 0.26819478685827985, + "learning_rate": 0.00019195464238833872, + "loss": 1.0676, + "step": 3598 + }, + { + "epoch": 0.34, + "grad_norm": 0.27909258767693773, + "learning_rate": 0.00019194842447228894, + "loss": 1.1414, + "step": 3599 + }, + { + "epoch": 0.34, + "grad_norm": 0.29782335526237375, + "learning_rate": 0.00019194220425517203, + "loss": 1.0944, + "step": 3600 + }, + { + "epoch": 0.34, + "grad_norm": 0.2990681773116163, + "learning_rate": 0.00019193598173714368, + "loss": 1.2351, + "step": 3601 + }, + { + "epoch": 0.34, + "grad_norm": 0.25309601233066126, + "learning_rate": 0.00019192975691835967, + "loss": 0.9833, + "step": 3602 + }, + { + "epoch": 0.34, + "grad_norm": 0.2530498249236964, + "learning_rate": 0.00019192352979897564, + "loss": 1.1116, + "step": 3603 + }, + { + "epoch": 0.34, + "grad_norm": 0.261372825515641, + "learning_rate": 0.00019191730037914755, + "loss": 1.1451, + "step": 3604 + }, + { + "epoch": 0.34, + "grad_norm": 0.2224404725086543, + "learning_rate": 0.00019191106865903125, + "loss": 1.1346, + "step": 3605 + }, + { + "epoch": 0.34, + "grad_norm": 0.24745135431170612, + "learning_rate": 0.00019190483463878266, + "loss": 1.0474, + "step": 3606 + }, + { + "epoch": 0.35, + "grad_norm": 0.2680064496452726, + "learning_rate": 0.00019189859831855786, + "loss": 1.0726, + "step": 3607 + }, + { + "epoch": 0.35, + "grad_norm": 0.2728343814388248, + "learning_rate": 0.00019189235969851285, + "loss": 1.143, + "step": 3608 + }, + { + "epoch": 0.35, + "grad_norm": 0.2758236497986754, + "learning_rate": 0.0001918861187788038, + "loss": 1.1073, + "step": 3609 + }, + { + "epoch": 0.35, + "grad_norm": 0.24891564132409086, + "learning_rate": 0.00019187987555958688, + "loss": 0.9501, + "step": 3610 + }, + { + "epoch": 0.35, + "grad_norm": 0.28175698358764867, + "learning_rate": 0.00019187363004101834, + "loss": 1.119, + "step": 3611 + }, + { + "epoch": 0.35, + "grad_norm": 0.29465454747771924, + "learning_rate": 0.00019186738222325446, + "loss": 1.1071, + "step": 3612 + }, + { + "epoch": 0.35, + "grad_norm": 0.28563894659906613, + "learning_rate": 0.00019186113210645158, + "loss": 0.9567, + "step": 3613 + }, + { + "epoch": 0.35, + "grad_norm": 0.2665261670344539, + "learning_rate": 0.00019185487969076618, + "loss": 1.0447, + "step": 3614 + }, + { + "epoch": 0.35, + "grad_norm": 0.29057112065403007, + "learning_rate": 0.00019184862497635466, + "loss": 1.094, + "step": 3615 + }, + { + "epoch": 0.35, + "grad_norm": 0.24917921323756612, + "learning_rate": 0.0001918423679633736, + "loss": 1.0332, + "step": 3616 + }, + { + "epoch": 0.35, + "grad_norm": 0.25724912043463755, + "learning_rate": 0.0001918361086519795, + "loss": 1.1526, + "step": 3617 + }, + { + "epoch": 0.35, + "grad_norm": 0.30458714457974434, + "learning_rate": 0.00019182984704232912, + "loss": 1.1059, + "step": 3618 + }, + { + "epoch": 0.35, + "grad_norm": 0.233003803525938, + "learning_rate": 0.00019182358313457907, + "loss": 1.0059, + "step": 3619 + }, + { + "epoch": 0.35, + "grad_norm": 0.3095013736710616, + "learning_rate": 0.00019181731692888615, + "loss": 1.2226, + "step": 3620 + }, + { + "epoch": 0.35, + "grad_norm": 0.3112580093485402, + "learning_rate": 0.0001918110484254072, + "loss": 1.0111, + "step": 3621 + }, + { + "epoch": 0.35, + "grad_norm": 0.2615583637323915, + "learning_rate": 0.00019180477762429905, + "loss": 1.0417, + "step": 3622 + }, + { + "epoch": 0.35, + "grad_norm": 0.25331226852742855, + "learning_rate": 0.00019179850452571864, + "loss": 1.1169, + "step": 3623 + }, + { + "epoch": 0.35, + "grad_norm": 0.2653623351810319, + "learning_rate": 0.00019179222912982295, + "loss": 1.0001, + "step": 3624 + }, + { + "epoch": 0.35, + "grad_norm": 0.3027616299847975, + "learning_rate": 0.00019178595143676903, + "loss": 1.0122, + "step": 3625 + }, + { + "epoch": 0.35, + "grad_norm": 0.2596463436944122, + "learning_rate": 0.00019177967144671403, + "loss": 1.0603, + "step": 3626 + }, + { + "epoch": 0.35, + "grad_norm": 0.26322578302738187, + "learning_rate": 0.00019177338915981503, + "loss": 1.1179, + "step": 3627 + }, + { + "epoch": 0.35, + "grad_norm": 0.26304450867502877, + "learning_rate": 0.00019176710457622932, + "loss": 1.1117, + "step": 3628 + }, + { + "epoch": 0.35, + "grad_norm": 0.26025672008441436, + "learning_rate": 0.00019176081769611413, + "loss": 1.0988, + "step": 3629 + }, + { + "epoch": 0.35, + "grad_norm": 0.264676786106681, + "learning_rate": 0.00019175452851962678, + "loss": 1.1449, + "step": 3630 + }, + { + "epoch": 0.35, + "grad_norm": 0.28753505819216246, + "learning_rate": 0.00019174823704692473, + "loss": 0.9785, + "step": 3631 + }, + { + "epoch": 0.35, + "grad_norm": 0.27286886659302656, + "learning_rate": 0.00019174194327816534, + "loss": 1.0712, + "step": 3632 + }, + { + "epoch": 0.35, + "grad_norm": 0.2851788136509597, + "learning_rate": 0.0001917356472135062, + "loss": 1.1066, + "step": 3633 + }, + { + "epoch": 0.35, + "grad_norm": 0.2596390703916512, + "learning_rate": 0.00019172934885310484, + "loss": 1.03, + "step": 3634 + }, + { + "epoch": 0.35, + "grad_norm": 0.27793648005875066, + "learning_rate": 0.00019172304819711886, + "loss": 1.0643, + "step": 3635 + }, + { + "epoch": 0.35, + "grad_norm": 0.2785181884043944, + "learning_rate": 0.00019171674524570595, + "loss": 1.1464, + "step": 3636 + }, + { + "epoch": 0.35, + "grad_norm": 0.30776612944819687, + "learning_rate": 0.00019171043999902386, + "loss": 1.0927, + "step": 3637 + }, + { + "epoch": 0.35, + "grad_norm": 0.2477006043242119, + "learning_rate": 0.00019170413245723036, + "loss": 1.0933, + "step": 3638 + }, + { + "epoch": 0.35, + "grad_norm": 0.28543848973925623, + "learning_rate": 0.00019169782262048332, + "loss": 1.0738, + "step": 3639 + }, + { + "epoch": 0.35, + "grad_norm": 0.23369616317344152, + "learning_rate": 0.00019169151048894066, + "loss": 1.0539, + "step": 3640 + }, + { + "epoch": 0.35, + "grad_norm": 0.26607663383639485, + "learning_rate": 0.00019168519606276027, + "loss": 1.0068, + "step": 3641 + }, + { + "epoch": 0.35, + "grad_norm": 0.27489969296500305, + "learning_rate": 0.00019167887934210028, + "loss": 1.1252, + "step": 3642 + }, + { + "epoch": 0.35, + "grad_norm": 0.26853165269525164, + "learning_rate": 0.00019167256032711868, + "loss": 1.1099, + "step": 3643 + }, + { + "epoch": 0.35, + "grad_norm": 0.26605252525945594, + "learning_rate": 0.00019166623901797366, + "loss": 1.063, + "step": 3644 + }, + { + "epoch": 0.35, + "grad_norm": 0.2857813746118553, + "learning_rate": 0.0001916599154148234, + "loss": 1.1668, + "step": 3645 + }, + { + "epoch": 0.35, + "grad_norm": 0.3004731926385603, + "learning_rate": 0.00019165358951782612, + "loss": 1.1223, + "step": 3646 + }, + { + "epoch": 0.35, + "grad_norm": 0.30149091959544416, + "learning_rate": 0.0001916472613271402, + "loss": 1.0124, + "step": 3647 + }, + { + "epoch": 0.35, + "grad_norm": 0.24733679580665724, + "learning_rate": 0.00019164093084292393, + "loss": 1.1462, + "step": 3648 + }, + { + "epoch": 0.35, + "grad_norm": 0.21990278220565118, + "learning_rate": 0.00019163459806533582, + "loss": 1.0147, + "step": 3649 + }, + { + "epoch": 0.35, + "grad_norm": 0.26246347077998894, + "learning_rate": 0.00019162826299453427, + "loss": 1.0042, + "step": 3650 + }, + { + "epoch": 0.35, + "grad_norm": 0.2552863623874917, + "learning_rate": 0.00019162192563067785, + "loss": 1.0509, + "step": 3651 + }, + { + "epoch": 0.35, + "grad_norm": 0.26792837644378786, + "learning_rate": 0.00019161558597392516, + "loss": 1.0793, + "step": 3652 + }, + { + "epoch": 0.35, + "grad_norm": 0.26602863063633964, + "learning_rate": 0.00019160924402443488, + "loss": 1.0786, + "step": 3653 + }, + { + "epoch": 0.35, + "grad_norm": 0.2748453626108522, + "learning_rate": 0.00019160289978236567, + "loss": 1.1711, + "step": 3654 + }, + { + "epoch": 0.35, + "grad_norm": 0.2892103795799046, + "learning_rate": 0.00019159655324787634, + "loss": 1.0855, + "step": 3655 + }, + { + "epoch": 0.35, + "grad_norm": 0.29356881676510077, + "learning_rate": 0.00019159020442112567, + "loss": 1.0765, + "step": 3656 + }, + { + "epoch": 0.35, + "grad_norm": 0.25797959221770866, + "learning_rate": 0.0001915838533022726, + "loss": 1.0762, + "step": 3657 + }, + { + "epoch": 0.35, + "grad_norm": 0.24086139476910498, + "learning_rate": 0.00019157749989147602, + "loss": 1.0265, + "step": 3658 + }, + { + "epoch": 0.35, + "grad_norm": 0.29161814627285754, + "learning_rate": 0.00019157114418889498, + "loss": 1.0909, + "step": 3659 + }, + { + "epoch": 0.35, + "grad_norm": 0.2899831061438026, + "learning_rate": 0.0001915647861946885, + "loss": 1.0793, + "step": 3660 + }, + { + "epoch": 0.35, + "grad_norm": 0.2444771404216185, + "learning_rate": 0.00019155842590901564, + "loss": 1.0686, + "step": 3661 + }, + { + "epoch": 0.35, + "grad_norm": 0.2757863183788381, + "learning_rate": 0.0001915520633320357, + "loss": 1.1698, + "step": 3662 + }, + { + "epoch": 0.35, + "grad_norm": 0.2850756152744979, + "learning_rate": 0.00019154569846390783, + "loss": 1.2098, + "step": 3663 + }, + { + "epoch": 0.35, + "grad_norm": 0.2229292059572456, + "learning_rate": 0.00019153933130479128, + "loss": 1.1241, + "step": 3664 + }, + { + "epoch": 0.35, + "grad_norm": 0.2763336437474556, + "learning_rate": 0.00019153296185484545, + "loss": 1.0689, + "step": 3665 + }, + { + "epoch": 0.35, + "grad_norm": 0.26355818754319077, + "learning_rate": 0.00019152659011422975, + "loss": 1.1286, + "step": 3666 + }, + { + "epoch": 0.35, + "grad_norm": 0.2834332018069187, + "learning_rate": 0.0001915202160831036, + "loss": 1.1524, + "step": 3667 + }, + { + "epoch": 0.35, + "grad_norm": 0.2829528091832065, + "learning_rate": 0.0001915138397616265, + "loss": 1.1356, + "step": 3668 + }, + { + "epoch": 0.35, + "grad_norm": 0.2549124521154805, + "learning_rate": 0.00019150746114995809, + "loss": 1.0091, + "step": 3669 + }, + { + "epoch": 0.35, + "grad_norm": 0.2787455502134115, + "learning_rate": 0.00019150108024825792, + "loss": 1.1973, + "step": 3670 + }, + { + "epoch": 0.35, + "grad_norm": 0.3045237462568005, + "learning_rate": 0.00019149469705668574, + "loss": 1.0021, + "step": 3671 + }, + { + "epoch": 0.35, + "grad_norm": 0.2563873686787024, + "learning_rate": 0.00019148831157540122, + "loss": 1.0652, + "step": 3672 + }, + { + "epoch": 0.35, + "grad_norm": 0.26639470855690445, + "learning_rate": 0.0001914819238045642, + "loss": 1.0402, + "step": 3673 + }, + { + "epoch": 0.35, + "grad_norm": 0.24615772856913087, + "learning_rate": 0.00019147553374433457, + "loss": 1.1036, + "step": 3674 + }, + { + "epoch": 0.35, + "grad_norm": 0.26050952114504417, + "learning_rate": 0.00019146914139487223, + "loss": 1.082, + "step": 3675 + }, + { + "epoch": 0.35, + "grad_norm": 0.2711202214111707, + "learning_rate": 0.00019146274675633712, + "loss": 1.022, + "step": 3676 + }, + { + "epoch": 0.35, + "grad_norm": 0.27642386341556874, + "learning_rate": 0.0001914563498288893, + "loss": 1.1196, + "step": 3677 + }, + { + "epoch": 0.35, + "grad_norm": 0.26017602194644773, + "learning_rate": 0.00019144995061268886, + "loss": 1.0743, + "step": 3678 + }, + { + "epoch": 0.35, + "grad_norm": 0.2802611591626233, + "learning_rate": 0.00019144354910789592, + "loss": 1.1542, + "step": 3679 + }, + { + "epoch": 0.35, + "grad_norm": 0.30109865674446373, + "learning_rate": 0.00019143714531467067, + "loss": 1.0761, + "step": 3680 + }, + { + "epoch": 0.35, + "grad_norm": 0.29232637101985814, + "learning_rate": 0.00019143073923317343, + "loss": 1.1051, + "step": 3681 + }, + { + "epoch": 0.35, + "grad_norm": 0.26726588819259994, + "learning_rate": 0.00019142433086356445, + "loss": 1.1052, + "step": 3682 + }, + { + "epoch": 0.35, + "grad_norm": 0.3170937940560179, + "learning_rate": 0.00019141792020600416, + "loss": 1.0991, + "step": 3683 + }, + { + "epoch": 0.35, + "grad_norm": 0.26117278170269886, + "learning_rate": 0.00019141150726065292, + "loss": 1.1537, + "step": 3684 + }, + { + "epoch": 0.35, + "grad_norm": 0.23933035001939557, + "learning_rate": 0.00019140509202767128, + "loss": 1.0034, + "step": 3685 + }, + { + "epoch": 0.35, + "grad_norm": 0.261470828936781, + "learning_rate": 0.00019139867450721978, + "loss": 1.0356, + "step": 3686 + }, + { + "epoch": 0.35, + "grad_norm": 0.28669585869759606, + "learning_rate": 0.000191392254699459, + "loss": 1.1138, + "step": 3687 + }, + { + "epoch": 0.35, + "grad_norm": 0.28522961501456195, + "learning_rate": 0.00019138583260454962, + "loss": 1.1261, + "step": 3688 + }, + { + "epoch": 0.35, + "grad_norm": 0.24515845894099997, + "learning_rate": 0.00019137940822265234, + "loss": 1.0393, + "step": 3689 + }, + { + "epoch": 0.35, + "grad_norm": 0.30181790008831666, + "learning_rate": 0.00019137298155392794, + "loss": 1.1297, + "step": 3690 + }, + { + "epoch": 0.35, + "grad_norm": 0.23025123557623256, + "learning_rate": 0.0001913665525985372, + "loss": 0.9475, + "step": 3691 + }, + { + "epoch": 0.35, + "grad_norm": 0.2916644640665306, + "learning_rate": 0.0001913601213566411, + "loss": 1.0052, + "step": 3692 + }, + { + "epoch": 0.35, + "grad_norm": 0.2914800928299035, + "learning_rate": 0.00019135368782840058, + "loss": 1.0656, + "step": 3693 + }, + { + "epoch": 0.35, + "grad_norm": 0.2638508110242151, + "learning_rate": 0.00019134725201397655, + "loss": 1.1046, + "step": 3694 + }, + { + "epoch": 0.35, + "grad_norm": 0.2862216325349102, + "learning_rate": 0.00019134081391353018, + "loss": 1.1441, + "step": 3695 + }, + { + "epoch": 0.35, + "grad_norm": 0.31225707110177847, + "learning_rate": 0.00019133437352722253, + "loss": 1.143, + "step": 3696 + }, + { + "epoch": 0.35, + "grad_norm": 0.30593521073577173, + "learning_rate": 0.00019132793085521477, + "loss": 1.1625, + "step": 3697 + }, + { + "epoch": 0.35, + "grad_norm": 0.29258514429430604, + "learning_rate": 0.00019132148589766812, + "loss": 1.1549, + "step": 3698 + }, + { + "epoch": 0.35, + "grad_norm": 0.25691353350560925, + "learning_rate": 0.00019131503865474388, + "loss": 1.086, + "step": 3699 + }, + { + "epoch": 0.35, + "grad_norm": 0.28413967582202954, + "learning_rate": 0.00019130858912660346, + "loss": 1.0516, + "step": 3700 + }, + { + "epoch": 0.35, + "grad_norm": 0.2486682279993249, + "learning_rate": 0.0001913021373134082, + "loss": 1.144, + "step": 3701 + }, + { + "epoch": 0.35, + "grad_norm": 0.2799664249634028, + "learning_rate": 0.00019129568321531957, + "loss": 1.203, + "step": 3702 + }, + { + "epoch": 0.35, + "grad_norm": 0.2360258518505738, + "learning_rate": 0.00019128922683249905, + "loss": 1.0592, + "step": 3703 + }, + { + "epoch": 0.35, + "grad_norm": 0.28969445762878065, + "learning_rate": 0.0001912827681651083, + "loss": 1.2039, + "step": 3704 + }, + { + "epoch": 0.35, + "grad_norm": 0.2925284783203515, + "learning_rate": 0.0001912763072133089, + "loss": 1.0849, + "step": 3705 + }, + { + "epoch": 0.35, + "grad_norm": 0.33511999769210443, + "learning_rate": 0.00019126984397726252, + "loss": 1.0263, + "step": 3706 + }, + { + "epoch": 0.35, + "grad_norm": 0.2898231101162137, + "learning_rate": 0.00019126337845713098, + "loss": 1.042, + "step": 3707 + }, + { + "epoch": 0.35, + "grad_norm": 0.2787807763701908, + "learning_rate": 0.000191256910653076, + "loss": 0.9801, + "step": 3708 + }, + { + "epoch": 0.35, + "grad_norm": 0.2655169550180212, + "learning_rate": 0.0001912504405652595, + "loss": 1.024, + "step": 3709 + }, + { + "epoch": 0.35, + "grad_norm": 0.23484914814404617, + "learning_rate": 0.00019124396819384336, + "loss": 1.083, + "step": 3710 + }, + { + "epoch": 0.36, + "grad_norm": 0.2399809754737731, + "learning_rate": 0.00019123749353898957, + "loss": 0.8806, + "step": 3711 + }, + { + "epoch": 0.36, + "grad_norm": 0.25071481306083293, + "learning_rate": 0.00019123101660086018, + "loss": 1.0832, + "step": 3712 + }, + { + "epoch": 0.36, + "grad_norm": 0.26019964031949383, + "learning_rate": 0.00019122453737961724, + "loss": 1.105, + "step": 3713 + }, + { + "epoch": 0.36, + "grad_norm": 0.27184915954233857, + "learning_rate": 0.00019121805587542294, + "loss": 1.1295, + "step": 3714 + }, + { + "epoch": 0.36, + "grad_norm": 0.3232217248521111, + "learning_rate": 0.00019121157208843947, + "loss": 1.0362, + "step": 3715 + }, + { + "epoch": 0.36, + "grad_norm": 0.2484485517327626, + "learning_rate": 0.00019120508601882906, + "loss": 1.0417, + "step": 3716 + }, + { + "epoch": 0.36, + "grad_norm": 0.28882594941774437, + "learning_rate": 0.00019119859766675407, + "loss": 1.0759, + "step": 3717 + }, + { + "epoch": 0.36, + "grad_norm": 0.26746773610204805, + "learning_rate": 0.00019119210703237685, + "loss": 0.9896, + "step": 3718 + }, + { + "epoch": 0.36, + "grad_norm": 0.3030903175749709, + "learning_rate": 0.00019118561411585986, + "loss": 1.0477, + "step": 3719 + }, + { + "epoch": 0.36, + "grad_norm": 0.2647502528125401, + "learning_rate": 0.00019117911891736552, + "loss": 1.0581, + "step": 3720 + }, + { + "epoch": 0.36, + "grad_norm": 0.31861248281328974, + "learning_rate": 0.00019117262143705647, + "loss": 1.144, + "step": 3721 + }, + { + "epoch": 0.36, + "grad_norm": 0.25783949986971005, + "learning_rate": 0.00019116612167509526, + "loss": 1.2301, + "step": 3722 + }, + { + "epoch": 0.36, + "grad_norm": 0.24441169216486786, + "learning_rate": 0.00019115961963164454, + "loss": 1.118, + "step": 3723 + }, + { + "epoch": 0.36, + "grad_norm": 0.2659364840495592, + "learning_rate": 0.00019115311530686706, + "loss": 1.0523, + "step": 3724 + }, + { + "epoch": 0.36, + "grad_norm": 0.2522530337190297, + "learning_rate": 0.00019114660870092558, + "loss": 0.9005, + "step": 3725 + }, + { + "epoch": 0.36, + "grad_norm": 0.2647152274884048, + "learning_rate": 0.00019114009981398294, + "loss": 1.0931, + "step": 3726 + }, + { + "epoch": 0.36, + "grad_norm": 0.2702016934536367, + "learning_rate": 0.000191133588646202, + "loss": 1.0832, + "step": 3727 + }, + { + "epoch": 0.36, + "grad_norm": 0.24668859082298314, + "learning_rate": 0.00019112707519774576, + "loss": 0.9758, + "step": 3728 + }, + { + "epoch": 0.36, + "grad_norm": 0.2672446623933551, + "learning_rate": 0.0001911205594687772, + "loss": 1.149, + "step": 3729 + }, + { + "epoch": 0.36, + "grad_norm": 0.2600484573038422, + "learning_rate": 0.00019111404145945933, + "loss": 1.057, + "step": 3730 + }, + { + "epoch": 0.36, + "grad_norm": 0.31076021763085576, + "learning_rate": 0.00019110752116995535, + "loss": 1.1161, + "step": 3731 + }, + { + "epoch": 0.36, + "grad_norm": 0.2626503992229002, + "learning_rate": 0.00019110099860042835, + "loss": 1.1429, + "step": 3732 + }, + { + "epoch": 0.36, + "grad_norm": 0.2550015497297485, + "learning_rate": 0.00019109447375104165, + "loss": 1.207, + "step": 3733 + }, + { + "epoch": 0.36, + "grad_norm": 0.29381829699592593, + "learning_rate": 0.00019108794662195847, + "loss": 1.071, + "step": 3734 + }, + { + "epoch": 0.36, + "grad_norm": 0.2582343525213059, + "learning_rate": 0.00019108141721334217, + "loss": 1.0302, + "step": 3735 + }, + { + "epoch": 0.36, + "grad_norm": 0.3033664358520454, + "learning_rate": 0.00019107488552535617, + "loss": 1.0559, + "step": 3736 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804498013242838, + "learning_rate": 0.00019106835155816395, + "loss": 1.1039, + "step": 3737 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804541814983532, + "learning_rate": 0.00019106181531192894, + "loss": 1.1296, + "step": 3738 + }, + { + "epoch": 0.36, + "grad_norm": 0.30659613447065276, + "learning_rate": 0.0001910552767868148, + "loss": 1.1612, + "step": 3739 + }, + { + "epoch": 0.36, + "grad_norm": 0.2891921914653932, + "learning_rate": 0.00019104873598298517, + "loss": 1.0669, + "step": 3740 + }, + { + "epoch": 0.36, + "grad_norm": 0.28558383598746206, + "learning_rate": 0.00019104219290060366, + "loss": 1.2211, + "step": 3741 + }, + { + "epoch": 0.36, + "grad_norm": 0.29494252698624446, + "learning_rate": 0.00019103564753983405, + "loss": 0.9863, + "step": 3742 + }, + { + "epoch": 0.36, + "grad_norm": 0.2556255730742099, + "learning_rate": 0.00019102909990084018, + "loss": 1.0618, + "step": 3743 + }, + { + "epoch": 0.36, + "grad_norm": 0.2506965201053014, + "learning_rate": 0.00019102254998378584, + "loss": 0.9738, + "step": 3744 + }, + { + "epoch": 0.36, + "grad_norm": 0.29913681320353497, + "learning_rate": 0.00019101599778883498, + "loss": 1.0876, + "step": 3745 + }, + { + "epoch": 0.36, + "grad_norm": 0.2773312706953958, + "learning_rate": 0.0001910094433161516, + "loss": 1.0741, + "step": 3746 + }, + { + "epoch": 0.36, + "grad_norm": 0.2917033659607459, + "learning_rate": 0.0001910028865658997, + "loss": 1.1748, + "step": 3747 + }, + { + "epoch": 0.36, + "grad_norm": 0.2296887419120594, + "learning_rate": 0.00019099632753824335, + "loss": 0.9938, + "step": 3748 + }, + { + "epoch": 0.36, + "grad_norm": 0.2704584908164929, + "learning_rate": 0.00019098976623334673, + "loss": 1.0166, + "step": 3749 + }, + { + "epoch": 0.36, + "grad_norm": 0.21917669304416104, + "learning_rate": 0.00019098320265137402, + "loss": 1.0756, + "step": 3750 + }, + { + "epoch": 0.36, + "grad_norm": 0.26280707594546593, + "learning_rate": 0.0001909766367924895, + "loss": 1.2615, + "step": 3751 + }, + { + "epoch": 0.36, + "grad_norm": 0.29377384953319624, + "learning_rate": 0.00019097006865685743, + "loss": 1.0886, + "step": 3752 + }, + { + "epoch": 0.36, + "grad_norm": 0.23301855169248875, + "learning_rate": 0.00019096349824464226, + "loss": 0.9808, + "step": 3753 + }, + { + "epoch": 0.36, + "grad_norm": 0.253520755622324, + "learning_rate": 0.00019095692555600832, + "loss": 1.1587, + "step": 3754 + }, + { + "epoch": 0.36, + "grad_norm": 0.2754796183392769, + "learning_rate": 0.00019095035059112024, + "loss": 1.1064, + "step": 3755 + }, + { + "epoch": 0.36, + "grad_norm": 0.2747255493731151, + "learning_rate": 0.00019094377335014242, + "loss": 1.055, + "step": 3756 + }, + { + "epoch": 0.36, + "grad_norm": 0.28773064135087933, + "learning_rate": 0.00019093719383323952, + "loss": 1.1881, + "step": 3757 + }, + { + "epoch": 0.36, + "grad_norm": 0.2806691047427897, + "learning_rate": 0.0001909306120405762, + "loss": 1.1801, + "step": 3758 + }, + { + "epoch": 0.36, + "grad_norm": 0.3132693357401042, + "learning_rate": 0.00019092402797231715, + "loss": 1.0493, + "step": 3759 + }, + { + "epoch": 0.36, + "grad_norm": 0.25209467298228705, + "learning_rate": 0.00019091744162862717, + "loss": 1.03, + "step": 3760 + }, + { + "epoch": 0.36, + "grad_norm": 0.31060570827727507, + "learning_rate": 0.0001909108530096711, + "loss": 1.21, + "step": 3761 + }, + { + "epoch": 0.36, + "grad_norm": 0.25830756717657666, + "learning_rate": 0.00019090426211561376, + "loss": 1.0167, + "step": 3762 + }, + { + "epoch": 0.36, + "grad_norm": 0.2535702002191736, + "learning_rate": 0.00019089766894662014, + "loss": 1.0515, + "step": 3763 + }, + { + "epoch": 0.36, + "grad_norm": 0.278969068298096, + "learning_rate": 0.00019089107350285522, + "loss": 1.1331, + "step": 3764 + }, + { + "epoch": 0.36, + "grad_norm": 0.2807324463593699, + "learning_rate": 0.00019088447578448407, + "loss": 1.0715, + "step": 3765 + }, + { + "epoch": 0.36, + "grad_norm": 0.28955522038757464, + "learning_rate": 0.0001908778757916718, + "loss": 1.0668, + "step": 3766 + }, + { + "epoch": 0.36, + "grad_norm": 0.2595700503045936, + "learning_rate": 0.00019087127352458358, + "loss": 1.1446, + "step": 3767 + }, + { + "epoch": 0.36, + "grad_norm": 0.28444433757950754, + "learning_rate": 0.00019086466898338462, + "loss": 1.0789, + "step": 3768 + }, + { + "epoch": 0.36, + "grad_norm": 0.2787597254903445, + "learning_rate": 0.00019085806216824017, + "loss": 0.9789, + "step": 3769 + }, + { + "epoch": 0.36, + "grad_norm": 0.2941116656712117, + "learning_rate": 0.0001908514530793157, + "loss": 1.0751, + "step": 3770 + }, + { + "epoch": 0.36, + "grad_norm": 0.22941662757128714, + "learning_rate": 0.00019084484171677646, + "loss": 1.1191, + "step": 3771 + }, + { + "epoch": 0.36, + "grad_norm": 0.24999713030985823, + "learning_rate": 0.00019083822808078798, + "loss": 1.1661, + "step": 3772 + }, + { + "epoch": 0.36, + "grad_norm": 0.28806317138126275, + "learning_rate": 0.00019083161217151574, + "loss": 1.138, + "step": 3773 + }, + { + "epoch": 0.36, + "grad_norm": 0.26401411481347825, + "learning_rate": 0.00019082499398912533, + "loss": 1.0933, + "step": 3774 + }, + { + "epoch": 0.36, + "grad_norm": 0.27780628237715865, + "learning_rate": 0.00019081837353378237, + "loss": 1.1479, + "step": 3775 + }, + { + "epoch": 0.36, + "grad_norm": 0.2567041443182185, + "learning_rate": 0.00019081175080565253, + "loss": 0.9848, + "step": 3776 + }, + { + "epoch": 0.36, + "grad_norm": 0.33134929043752975, + "learning_rate": 0.00019080512580490154, + "loss": 1.005, + "step": 3777 + }, + { + "epoch": 0.36, + "grad_norm": 0.25790224327167643, + "learning_rate": 0.0001907984985316952, + "loss": 1.063, + "step": 3778 + }, + { + "epoch": 0.36, + "grad_norm": 0.2542673197030738, + "learning_rate": 0.0001907918689861994, + "loss": 1.1054, + "step": 3779 + }, + { + "epoch": 0.36, + "grad_norm": 0.29841436435986246, + "learning_rate": 0.00019078523716858, + "loss": 1.0715, + "step": 3780 + }, + { + "epoch": 0.36, + "grad_norm": 0.2607650355473665, + "learning_rate": 0.000190778603079003, + "loss": 1.0562, + "step": 3781 + }, + { + "epoch": 0.36, + "grad_norm": 0.27320429258836315, + "learning_rate": 0.00019077196671763436, + "loss": 1.0277, + "step": 3782 + }, + { + "epoch": 0.36, + "grad_norm": 0.2548164654391468, + "learning_rate": 0.00019076532808464026, + "loss": 1.0822, + "step": 3783 + }, + { + "epoch": 0.36, + "grad_norm": 0.3085428982977869, + "learning_rate": 0.00019075868718018677, + "loss": 1.1456, + "step": 3784 + }, + { + "epoch": 0.36, + "grad_norm": 0.31897081017855045, + "learning_rate": 0.0001907520440044401, + "loss": 1.0907, + "step": 3785 + }, + { + "epoch": 0.36, + "grad_norm": 0.2491436372725657, + "learning_rate": 0.00019074539855756646, + "loss": 1.0872, + "step": 3786 + }, + { + "epoch": 0.36, + "grad_norm": 0.2844533490165419, + "learning_rate": 0.00019073875083973222, + "loss": 1.0052, + "step": 3787 + }, + { + "epoch": 0.36, + "grad_norm": 0.3018741634697304, + "learning_rate": 0.0001907321008511037, + "loss": 0.9965, + "step": 3788 + }, + { + "epoch": 0.36, + "grad_norm": 0.2423698565672897, + "learning_rate": 0.0001907254485918473, + "loss": 1.0812, + "step": 3789 + }, + { + "epoch": 0.36, + "grad_norm": 0.2975822751648389, + "learning_rate": 0.0001907187940621296, + "loss": 0.9894, + "step": 3790 + }, + { + "epoch": 0.36, + "grad_norm": 0.29200730112195994, + "learning_rate": 0.000190712137262117, + "loss": 1.0714, + "step": 3791 + }, + { + "epoch": 0.36, + "grad_norm": 0.28370419210343656, + "learning_rate": 0.0001907054781919762, + "loss": 1.077, + "step": 3792 + }, + { + "epoch": 0.36, + "grad_norm": 0.2722830323949, + "learning_rate": 0.0001906988168518738, + "loss": 1.0713, + "step": 3793 + }, + { + "epoch": 0.36, + "grad_norm": 0.24874541049005577, + "learning_rate": 0.00019069215324197646, + "loss": 1.0738, + "step": 3794 + }, + { + "epoch": 0.36, + "grad_norm": 0.28546406332135904, + "learning_rate": 0.00019068548736245102, + "loss": 1.1608, + "step": 3795 + }, + { + "epoch": 0.36, + "grad_norm": 0.29760842933931714, + "learning_rate": 0.00019067881921346427, + "loss": 1.0152, + "step": 3796 + }, + { + "epoch": 0.36, + "grad_norm": 0.2656364460449629, + "learning_rate": 0.00019067214879518306, + "loss": 1.1331, + "step": 3797 + }, + { + "epoch": 0.36, + "grad_norm": 0.2807658825912433, + "learning_rate": 0.00019066547610777437, + "loss": 1.0905, + "step": 3798 + }, + { + "epoch": 0.36, + "grad_norm": 0.2821813716967829, + "learning_rate": 0.00019065880115140513, + "loss": 1.0497, + "step": 3799 + }, + { + "epoch": 0.36, + "grad_norm": 0.26730901537371593, + "learning_rate": 0.00019065212392624243, + "loss": 1.0252, + "step": 3800 + }, + { + "epoch": 0.36, + "grad_norm": 0.26067574808177124, + "learning_rate": 0.00019064544443245335, + "loss": 0.9627, + "step": 3801 + }, + { + "epoch": 0.36, + "grad_norm": 0.26226327026433305, + "learning_rate": 0.00019063876267020507, + "loss": 1.0757, + "step": 3802 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804595303992103, + "learning_rate": 0.00019063207863966478, + "loss": 1.1859, + "step": 3803 + }, + { + "epoch": 0.36, + "grad_norm": 0.265318805788959, + "learning_rate": 0.00019062539234099973, + "loss": 1.0262, + "step": 3804 + }, + { + "epoch": 0.36, + "grad_norm": 0.2530307087999173, + "learning_rate": 0.00019061870377437733, + "loss": 1.0954, + "step": 3805 + }, + { + "epoch": 0.36, + "grad_norm": 0.2734697145712888, + "learning_rate": 0.00019061201293996488, + "loss": 1.0704, + "step": 3806 + }, + { + "epoch": 0.36, + "grad_norm": 0.24462035828995993, + "learning_rate": 0.00019060531983792987, + "loss": 1.1139, + "step": 3807 + }, + { + "epoch": 0.36, + "grad_norm": 0.2894517260409347, + "learning_rate": 0.00019059862446843982, + "loss": 1.0643, + "step": 3808 + }, + { + "epoch": 0.36, + "grad_norm": 0.28221499051147075, + "learning_rate": 0.00019059192683166222, + "loss": 1.0379, + "step": 3809 + }, + { + "epoch": 0.36, + "grad_norm": 0.2596597115877193, + "learning_rate": 0.00019058522692776473, + "loss": 1.1256, + "step": 3810 + }, + { + "epoch": 0.36, + "grad_norm": 0.26056578947550624, + "learning_rate": 0.00019057852475691498, + "loss": 1.002, + "step": 3811 + }, + { + "epoch": 0.36, + "grad_norm": 0.27913230085219015, + "learning_rate": 0.00019057182031928074, + "loss": 1.1348, + "step": 3812 + }, + { + "epoch": 0.36, + "grad_norm": 0.27929429057142885, + "learning_rate": 0.00019056511361502975, + "loss": 1.1987, + "step": 3813 + }, + { + "epoch": 0.36, + "grad_norm": 0.235530597825584, + "learning_rate": 0.0001905584046443299, + "loss": 1.1361, + "step": 3814 + }, + { + "epoch": 0.36, + "grad_norm": 0.2522023187837088, + "learning_rate": 0.00019055169340734908, + "loss": 1.0713, + "step": 3815 + }, + { + "epoch": 0.37, + "grad_norm": 0.280427344983831, + "learning_rate": 0.0001905449799042552, + "loss": 1.0557, + "step": 3816 + }, + { + "epoch": 0.37, + "grad_norm": 0.2969054368145568, + "learning_rate": 0.0001905382641352163, + "loss": 1.1357, + "step": 3817 + }, + { + "epoch": 0.37, + "grad_norm": 0.2440950945037354, + "learning_rate": 0.00019053154610040044, + "loss": 0.9862, + "step": 3818 + }, + { + "epoch": 0.37, + "grad_norm": 0.26826804773960794, + "learning_rate": 0.0001905248257999757, + "loss": 1.1239, + "step": 3819 + }, + { + "epoch": 0.37, + "grad_norm": 0.2505950276337422, + "learning_rate": 0.00019051810323411034, + "loss": 1.0886, + "step": 3820 + }, + { + "epoch": 0.37, + "grad_norm": 0.27676795581151226, + "learning_rate": 0.00019051137840297256, + "loss": 1.1414, + "step": 3821 + }, + { + "epoch": 0.37, + "grad_norm": 0.2571438975717034, + "learning_rate": 0.00019050465130673067, + "loss": 1.0289, + "step": 3822 + }, + { + "epoch": 0.37, + "grad_norm": 0.25111218618944997, + "learning_rate": 0.00019049792194555294, + "loss": 0.9651, + "step": 3823 + }, + { + "epoch": 0.37, + "grad_norm": 0.2610987856579739, + "learning_rate": 0.00019049119031960788, + "loss": 1.1635, + "step": 3824 + }, + { + "epoch": 0.37, + "grad_norm": 0.2432839279077218, + "learning_rate": 0.00019048445642906388, + "loss": 1.1106, + "step": 3825 + }, + { + "epoch": 0.37, + "grad_norm": 0.25411198588402173, + "learning_rate": 0.00019047772027408954, + "loss": 1.0766, + "step": 3826 + }, + { + "epoch": 0.37, + "grad_norm": 0.2611685254036077, + "learning_rate": 0.00019047098185485335, + "loss": 1.0616, + "step": 3827 + }, + { + "epoch": 0.37, + "grad_norm": 0.2486133581840728, + "learning_rate": 0.00019046424117152402, + "loss": 0.9964, + "step": 3828 + }, + { + "epoch": 0.37, + "grad_norm": 0.3298032342837613, + "learning_rate": 0.00019045749822427016, + "loss": 0.9625, + "step": 3829 + }, + { + "epoch": 0.37, + "grad_norm": 0.29292144210717325, + "learning_rate": 0.00019045075301326057, + "loss": 1.0938, + "step": 3830 + }, + { + "epoch": 0.37, + "grad_norm": 0.2906727228238908, + "learning_rate": 0.00019044400553866405, + "loss": 1.0653, + "step": 3831 + }, + { + "epoch": 0.37, + "grad_norm": 0.25691594404144225, + "learning_rate": 0.00019043725580064939, + "loss": 1.0817, + "step": 3832 + }, + { + "epoch": 0.37, + "grad_norm": 0.2787203095759583, + "learning_rate": 0.00019043050379938565, + "loss": 1.0643, + "step": 3833 + }, + { + "epoch": 0.37, + "grad_norm": 0.26242429857133936, + "learning_rate": 0.00019042374953504165, + "loss": 1.0383, + "step": 3834 + }, + { + "epoch": 0.37, + "grad_norm": 0.2750170989226645, + "learning_rate": 0.00019041699300778654, + "loss": 1.0113, + "step": 3835 + }, + { + "epoch": 0.37, + "grad_norm": 0.24689367063544376, + "learning_rate": 0.00019041023421778933, + "loss": 1.0589, + "step": 3836 + }, + { + "epoch": 0.37, + "grad_norm": 0.3101949979537771, + "learning_rate": 0.0001904034731652192, + "loss": 1.0844, + "step": 3837 + }, + { + "epoch": 0.37, + "grad_norm": 0.3026108279779279, + "learning_rate": 0.00019039670985024533, + "loss": 0.9748, + "step": 3838 + }, + { + "epoch": 0.37, + "grad_norm": 0.28436273819827806, + "learning_rate": 0.00019038994427303697, + "loss": 1.13, + "step": 3839 + }, + { + "epoch": 0.37, + "grad_norm": 0.26672874828160903, + "learning_rate": 0.00019038317643376346, + "loss": 0.9425, + "step": 3840 + }, + { + "epoch": 0.37, + "grad_norm": 0.2551937537263912, + "learning_rate": 0.00019037640633259417, + "loss": 1.1193, + "step": 3841 + }, + { + "epoch": 0.37, + "grad_norm": 0.2421616493910976, + "learning_rate": 0.00019036963396969848, + "loss": 1.1925, + "step": 3842 + }, + { + "epoch": 0.37, + "grad_norm": 0.27403481599431856, + "learning_rate": 0.00019036285934524594, + "loss": 0.9328, + "step": 3843 + }, + { + "epoch": 0.37, + "grad_norm": 0.2736343566787731, + "learning_rate": 0.00019035608245940603, + "loss": 1.1229, + "step": 3844 + }, + { + "epoch": 0.37, + "grad_norm": 0.2984644122142435, + "learning_rate": 0.00019034930331234836, + "loss": 1.0706, + "step": 3845 + }, + { + "epoch": 0.37, + "grad_norm": 0.25600173944542287, + "learning_rate": 0.00019034252190424264, + "loss": 1.1189, + "step": 3846 + }, + { + "epoch": 0.37, + "grad_norm": 0.27750211746463055, + "learning_rate": 0.0001903357382352585, + "loss": 1.2051, + "step": 3847 + }, + { + "epoch": 0.37, + "grad_norm": 0.308006852697485, + "learning_rate": 0.00019032895230556573, + "loss": 1.1935, + "step": 3848 + }, + { + "epoch": 0.37, + "grad_norm": 0.2606236441816338, + "learning_rate": 0.00019032216411533415, + "loss": 1.1713, + "step": 3849 + }, + { + "epoch": 0.37, + "grad_norm": 0.28436108514736663, + "learning_rate": 0.00019031537366473369, + "loss": 1.0714, + "step": 3850 + }, + { + "epoch": 0.37, + "grad_norm": 0.3140913304294229, + "learning_rate": 0.00019030858095393422, + "loss": 1.109, + "step": 3851 + }, + { + "epoch": 0.37, + "grad_norm": 0.27820186741488906, + "learning_rate": 0.00019030178598310573, + "loss": 1.1456, + "step": 3852 + }, + { + "epoch": 0.37, + "grad_norm": 0.25313171010488306, + "learning_rate": 0.00019029498875241832, + "loss": 1.0996, + "step": 3853 + }, + { + "epoch": 0.37, + "grad_norm": 0.26632647868200215, + "learning_rate": 0.00019028818926204207, + "loss": 1.0288, + "step": 3854 + }, + { + "epoch": 0.37, + "grad_norm": 0.2802151300023979, + "learning_rate": 0.00019028138751214714, + "loss": 1.1178, + "step": 3855 + }, + { + "epoch": 0.37, + "grad_norm": 0.28551319793457147, + "learning_rate": 0.00019027458350290375, + "loss": 1.1742, + "step": 3856 + }, + { + "epoch": 0.37, + "grad_norm": 0.29354399099059836, + "learning_rate": 0.00019026777723448214, + "loss": 1.1434, + "step": 3857 + }, + { + "epoch": 0.37, + "grad_norm": 0.28766227685882445, + "learning_rate": 0.00019026096870705274, + "loss": 1.1227, + "step": 3858 + }, + { + "epoch": 0.37, + "grad_norm": 0.25123810673486535, + "learning_rate": 0.0001902541579207858, + "loss": 1.0514, + "step": 3859 + }, + { + "epoch": 0.37, + "grad_norm": 0.3035800237589507, + "learning_rate": 0.00019024734487585186, + "loss": 1.0958, + "step": 3860 + }, + { + "epoch": 0.37, + "grad_norm": 0.24205745440797985, + "learning_rate": 0.0001902405295724214, + "loss": 1.1692, + "step": 3861 + }, + { + "epoch": 0.37, + "grad_norm": 0.27865447448135033, + "learning_rate": 0.00019023371201066497, + "loss": 1.0144, + "step": 3862 + }, + { + "epoch": 0.37, + "grad_norm": 0.2660377758295032, + "learning_rate": 0.0001902268921907532, + "loss": 0.9894, + "step": 3863 + }, + { + "epoch": 0.37, + "grad_norm": 0.2726012616002911, + "learning_rate": 0.00019022007011285674, + "loss": 1.0454, + "step": 3864 + }, + { + "epoch": 0.37, + "grad_norm": 0.2726846785840144, + "learning_rate": 0.0001902132457771463, + "loss": 1.0598, + "step": 3865 + }, + { + "epoch": 0.37, + "grad_norm": 0.25701844846319727, + "learning_rate": 0.0001902064191837927, + "loss": 1.1679, + "step": 3866 + }, + { + "epoch": 0.37, + "grad_norm": 0.2921405168078913, + "learning_rate": 0.00019019959033296678, + "loss": 1.0524, + "step": 3867 + }, + { + "epoch": 0.37, + "grad_norm": 0.27129522822886254, + "learning_rate": 0.00019019275922483943, + "loss": 1.105, + "step": 3868 + }, + { + "epoch": 0.37, + "grad_norm": 0.2718161044722209, + "learning_rate": 0.0001901859258595816, + "loss": 1.0613, + "step": 3869 + }, + { + "epoch": 0.37, + "grad_norm": 0.2536896991586868, + "learning_rate": 0.00019017909023736428, + "loss": 0.9996, + "step": 3870 + }, + { + "epoch": 0.37, + "grad_norm": 0.29175405932195664, + "learning_rate": 0.00019017225235835853, + "loss": 1.1386, + "step": 3871 + }, + { + "epoch": 0.37, + "grad_norm": 0.30144877670952286, + "learning_rate": 0.00019016541222273553, + "loss": 1.1468, + "step": 3872 + }, + { + "epoch": 0.37, + "grad_norm": 0.2931426576366692, + "learning_rate": 0.00019015856983066644, + "loss": 1.1721, + "step": 3873 + }, + { + "epoch": 0.37, + "grad_norm": 0.2916770600223048, + "learning_rate": 0.00019015172518232242, + "loss": 1.0945, + "step": 3874 + }, + { + "epoch": 0.37, + "grad_norm": 0.27166976489578, + "learning_rate": 0.00019014487827787483, + "loss": 1.0891, + "step": 3875 + }, + { + "epoch": 0.37, + "grad_norm": 0.2839705782430795, + "learning_rate": 0.00019013802911749505, + "loss": 1.1197, + "step": 3876 + }, + { + "epoch": 0.37, + "grad_norm": 0.298375581966575, + "learning_rate": 0.0001901311777013544, + "loss": 1.0689, + "step": 3877 + }, + { + "epoch": 0.37, + "grad_norm": 0.3013484850412901, + "learning_rate": 0.0001901243240296244, + "loss": 1.0352, + "step": 3878 + }, + { + "epoch": 0.37, + "grad_norm": 0.25624648167577585, + "learning_rate": 0.00019011746810247658, + "loss": 1.0749, + "step": 3879 + }, + { + "epoch": 0.37, + "grad_norm": 0.25490116870005675, + "learning_rate": 0.00019011060992008244, + "loss": 1.1551, + "step": 3880 + }, + { + "epoch": 0.37, + "grad_norm": 0.28069672688477176, + "learning_rate": 0.00019010374948261367, + "loss": 0.9975, + "step": 3881 + }, + { + "epoch": 0.37, + "grad_norm": 0.2651997155968314, + "learning_rate": 0.0001900968867902419, + "loss": 1.1731, + "step": 3882 + }, + { + "epoch": 0.37, + "grad_norm": 0.3115313345415178, + "learning_rate": 0.00019009002184313897, + "loss": 1.1077, + "step": 3883 + }, + { + "epoch": 0.37, + "grad_norm": 0.2775305592186655, + "learning_rate": 0.00019008315464147662, + "loss": 1.1736, + "step": 3884 + }, + { + "epoch": 0.37, + "grad_norm": 0.28786278339401167, + "learning_rate": 0.0001900762851854267, + "loss": 1.0521, + "step": 3885 + }, + { + "epoch": 0.37, + "grad_norm": 0.23797474326903245, + "learning_rate": 0.0001900694134751611, + "loss": 1.1405, + "step": 3886 + }, + { + "epoch": 0.37, + "grad_norm": 0.2605654562908968, + "learning_rate": 0.00019006253951085186, + "loss": 1.029, + "step": 3887 + }, + { + "epoch": 0.37, + "grad_norm": 0.25694650291010046, + "learning_rate": 0.00019005566329267096, + "loss": 1.0665, + "step": 3888 + }, + { + "epoch": 0.37, + "grad_norm": 0.26716631196903606, + "learning_rate": 0.0001900487848207905, + "loss": 1.1494, + "step": 3889 + }, + { + "epoch": 0.37, + "grad_norm": 0.3116260242272765, + "learning_rate": 0.00019004190409538255, + "loss": 1.117, + "step": 3890 + }, + { + "epoch": 0.37, + "grad_norm": 0.26774746733662874, + "learning_rate": 0.00019003502111661943, + "loss": 1.1987, + "step": 3891 + }, + { + "epoch": 0.37, + "grad_norm": 0.25363750120087714, + "learning_rate": 0.0001900281358846733, + "loss": 1.0831, + "step": 3892 + }, + { + "epoch": 0.37, + "grad_norm": 0.28339718118220925, + "learning_rate": 0.00019002124839971647, + "loss": 1.1161, + "step": 3893 + }, + { + "epoch": 0.37, + "grad_norm": 0.2541180184606548, + "learning_rate": 0.00019001435866192133, + "loss": 1.037, + "step": 3894 + }, + { + "epoch": 0.37, + "grad_norm": 0.2662101976131836, + "learning_rate": 0.0001900074666714603, + "loss": 1.0495, + "step": 3895 + }, + { + "epoch": 0.37, + "grad_norm": 0.24244962084116864, + "learning_rate": 0.00019000057242850584, + "loss": 1.143, + "step": 3896 + }, + { + "epoch": 0.37, + "grad_norm": 0.2815830104446293, + "learning_rate": 0.00018999367593323048, + "loss": 1.1087, + "step": 3897 + }, + { + "epoch": 0.37, + "grad_norm": 0.2661433168734344, + "learning_rate": 0.00018998677718580687, + "loss": 1.0699, + "step": 3898 + }, + { + "epoch": 0.37, + "grad_norm": 0.278452400141501, + "learning_rate": 0.00018997987618640756, + "loss": 0.9984, + "step": 3899 + }, + { + "epoch": 0.37, + "grad_norm": 0.2950352883266951, + "learning_rate": 0.00018997297293520533, + "loss": 1.0725, + "step": 3900 + }, + { + "epoch": 0.37, + "grad_norm": 0.24855309318205562, + "learning_rate": 0.00018996606743237288, + "loss": 1.1091, + "step": 3901 + }, + { + "epoch": 0.37, + "grad_norm": 0.26157079774866104, + "learning_rate": 0.00018995915967808305, + "loss": 1.137, + "step": 3902 + }, + { + "epoch": 0.37, + "grad_norm": 0.2817964268191354, + "learning_rate": 0.00018995224967250873, + "loss": 1.0887, + "step": 3903 + }, + { + "epoch": 0.37, + "grad_norm": 0.303055127418326, + "learning_rate": 0.00018994533741582283, + "loss": 1.0509, + "step": 3904 + }, + { + "epoch": 0.37, + "grad_norm": 0.29319713632683003, + "learning_rate": 0.00018993842290819833, + "loss": 1.1527, + "step": 3905 + }, + { + "epoch": 0.37, + "grad_norm": 0.28460911802279176, + "learning_rate": 0.00018993150614980824, + "loss": 1.1663, + "step": 3906 + }, + { + "epoch": 0.37, + "grad_norm": 0.24323458888587937, + "learning_rate": 0.00018992458714082574, + "loss": 1.107, + "step": 3907 + }, + { + "epoch": 0.37, + "grad_norm": 0.2507473651775092, + "learning_rate": 0.0001899176658814239, + "loss": 1.1291, + "step": 3908 + }, + { + "epoch": 0.37, + "grad_norm": 0.26851442265034603, + "learning_rate": 0.00018991074237177595, + "loss": 1.0296, + "step": 3909 + }, + { + "epoch": 0.37, + "grad_norm": 0.27910401854051703, + "learning_rate": 0.0001899038166120552, + "loss": 1.0243, + "step": 3910 + }, + { + "epoch": 0.37, + "grad_norm": 0.2643335688729103, + "learning_rate": 0.0001898968886024349, + "loss": 1.1176, + "step": 3911 + }, + { + "epoch": 0.37, + "grad_norm": 0.2845122162938141, + "learning_rate": 0.0001898899583430885, + "loss": 1.1254, + "step": 3912 + }, + { + "epoch": 0.37, + "grad_norm": 0.2631105012541486, + "learning_rate": 0.00018988302583418937, + "loss": 1.0436, + "step": 3913 + }, + { + "epoch": 0.37, + "grad_norm": 0.26527130872751753, + "learning_rate": 0.00018987609107591104, + "loss": 1.0063, + "step": 3914 + }, + { + "epoch": 0.37, + "grad_norm": 0.26612171965346043, + "learning_rate": 0.00018986915406842708, + "loss": 1.1018, + "step": 3915 + }, + { + "epoch": 0.37, + "grad_norm": 0.29895893771415827, + "learning_rate": 0.000189862214811911, + "loss": 1.0477, + "step": 3916 + }, + { + "epoch": 0.37, + "grad_norm": 0.298508391495162, + "learning_rate": 0.00018985527330653653, + "loss": 1.1202, + "step": 3917 + }, + { + "epoch": 0.37, + "grad_norm": 0.2862203670774759, + "learning_rate": 0.0001898483295524774, + "loss": 1.1676, + "step": 3918 + }, + { + "epoch": 0.37, + "grad_norm": 0.7409275636159137, + "learning_rate": 0.00018984138354990736, + "loss": 1.4645, + "step": 3919 + }, + { + "epoch": 0.38, + "grad_norm": 0.26519753642569927, + "learning_rate": 0.0001898344352990002, + "loss": 1.0743, + "step": 3920 + }, + { + "epoch": 0.38, + "grad_norm": 0.29786591251006633, + "learning_rate": 0.00018982748479992988, + "loss": 1.1278, + "step": 3921 + }, + { + "epoch": 0.38, + "grad_norm": 0.278707089601725, + "learning_rate": 0.00018982053205287024, + "loss": 1.0982, + "step": 3922 + }, + { + "epoch": 0.38, + "grad_norm": 0.233468164464521, + "learning_rate": 0.00018981357705799538, + "loss": 1.0338, + "step": 3923 + }, + { + "epoch": 0.38, + "grad_norm": 0.32242913731155076, + "learning_rate": 0.0001898066198154793, + "loss": 1.1282, + "step": 3924 + }, + { + "epoch": 0.38, + "grad_norm": 0.2712636448427821, + "learning_rate": 0.00018979966032549612, + "loss": 1.0868, + "step": 3925 + }, + { + "epoch": 0.38, + "grad_norm": 0.27812674663111897, + "learning_rate": 0.00018979269858822, + "loss": 0.9507, + "step": 3926 + }, + { + "epoch": 0.38, + "grad_norm": 0.31036023474014135, + "learning_rate": 0.00018978573460382516, + "loss": 1.066, + "step": 3927 + }, + { + "epoch": 0.38, + "grad_norm": 0.2741036928468329, + "learning_rate": 0.00018977876837248587, + "loss": 1.114, + "step": 3928 + }, + { + "epoch": 0.38, + "grad_norm": 0.32960721822124844, + "learning_rate": 0.0001897717998943765, + "loss": 0.9916, + "step": 3929 + }, + { + "epoch": 0.38, + "grad_norm": 0.25938355281972847, + "learning_rate": 0.0001897648291696714, + "loss": 1.0566, + "step": 3930 + }, + { + "epoch": 0.38, + "grad_norm": 0.27023081057739146, + "learning_rate": 0.00018975785619854504, + "loss": 1.1365, + "step": 3931 + }, + { + "epoch": 0.38, + "grad_norm": 0.30667049605255553, + "learning_rate": 0.00018975088098117194, + "loss": 1.0995, + "step": 3932 + }, + { + "epoch": 0.38, + "grad_norm": 0.24394315516483825, + "learning_rate": 0.00018974390351772665, + "loss": 1.0412, + "step": 3933 + }, + { + "epoch": 0.38, + "grad_norm": 0.2801905314214369, + "learning_rate": 0.00018973692380838371, + "loss": 1.0848, + "step": 3934 + }, + { + "epoch": 0.38, + "grad_norm": 0.2381280727680932, + "learning_rate": 0.00018972994185331788, + "loss": 1.0596, + "step": 3935 + }, + { + "epoch": 0.38, + "grad_norm": 0.23812773003536764, + "learning_rate": 0.00018972295765270388, + "loss": 1.0666, + "step": 3936 + }, + { + "epoch": 0.38, + "grad_norm": 0.32096602227564136, + "learning_rate": 0.00018971597120671647, + "loss": 1.0948, + "step": 3937 + }, + { + "epoch": 0.38, + "grad_norm": 0.26277005671959386, + "learning_rate": 0.0001897089825155305, + "loss": 0.992, + "step": 3938 + }, + { + "epoch": 0.38, + "grad_norm": 0.25895544694478345, + "learning_rate": 0.00018970199157932084, + "loss": 1.0121, + "step": 3939 + }, + { + "epoch": 0.38, + "grad_norm": 0.33276374338059417, + "learning_rate": 0.0001896949983982625, + "loss": 1.1811, + "step": 3940 + }, + { + "epoch": 0.38, + "grad_norm": 0.2899952697366727, + "learning_rate": 0.00018968800297253043, + "loss": 1.0459, + "step": 3941 + }, + { + "epoch": 0.38, + "grad_norm": 0.28332378815166603, + "learning_rate": 0.0001896810053022997, + "loss": 1.1406, + "step": 3942 + }, + { + "epoch": 0.38, + "grad_norm": 0.2563768370833164, + "learning_rate": 0.00018967400538774548, + "loss": 1.0918, + "step": 3943 + }, + { + "epoch": 0.38, + "grad_norm": 0.2527720587986432, + "learning_rate": 0.00018966700322904293, + "loss": 1.0922, + "step": 3944 + }, + { + "epoch": 0.38, + "grad_norm": 0.25592462221706, + "learning_rate": 0.00018965999882636725, + "loss": 1.0781, + "step": 3945 + }, + { + "epoch": 0.38, + "grad_norm": 0.3089279882540109, + "learning_rate": 0.00018965299217989375, + "loss": 1.1559, + "step": 3946 + }, + { + "epoch": 0.38, + "grad_norm": 0.25525835490586896, + "learning_rate": 0.00018964598328979776, + "loss": 1.1349, + "step": 3947 + }, + { + "epoch": 0.38, + "grad_norm": 0.2841608784955134, + "learning_rate": 0.00018963897215625472, + "loss": 0.984, + "step": 3948 + }, + { + "epoch": 0.38, + "grad_norm": 0.272016327051106, + "learning_rate": 0.00018963195877944007, + "loss": 1.058, + "step": 3949 + }, + { + "epoch": 0.38, + "grad_norm": 0.3037390334949688, + "learning_rate": 0.0001896249431595293, + "loss": 1.0846, + "step": 3950 + }, + { + "epoch": 0.38, + "grad_norm": 0.2497613489471199, + "learning_rate": 0.000189617925296698, + "loss": 1.0793, + "step": 3951 + }, + { + "epoch": 0.38, + "grad_norm": 0.27852691608884916, + "learning_rate": 0.00018961090519112182, + "loss": 1.1262, + "step": 3952 + }, + { + "epoch": 0.38, + "grad_norm": 0.2844330777819289, + "learning_rate": 0.0001896038828429764, + "loss": 1.0902, + "step": 3953 + }, + { + "epoch": 0.38, + "grad_norm": 0.30372667850264373, + "learning_rate": 0.0001895968582524375, + "loss": 0.9642, + "step": 3954 + }, + { + "epoch": 0.38, + "grad_norm": 0.24532996718177777, + "learning_rate": 0.00018958983141968095, + "loss": 0.9768, + "step": 3955 + }, + { + "epoch": 0.38, + "grad_norm": 0.27188695012581215, + "learning_rate": 0.0001895828023448825, + "loss": 1.0809, + "step": 3956 + }, + { + "epoch": 0.38, + "grad_norm": 0.28620857758847607, + "learning_rate": 0.00018957577102821817, + "loss": 1.1338, + "step": 3957 + }, + { + "epoch": 0.38, + "grad_norm": 0.2967269096451259, + "learning_rate": 0.00018956873746986386, + "loss": 1.0498, + "step": 3958 + }, + { + "epoch": 0.38, + "grad_norm": 0.28197346617437485, + "learning_rate": 0.00018956170166999558, + "loss": 1.1001, + "step": 3959 + }, + { + "epoch": 0.38, + "grad_norm": 0.32447316500456774, + "learning_rate": 0.00018955466362878943, + "loss": 1.1346, + "step": 3960 + }, + { + "epoch": 0.38, + "grad_norm": 0.29206790559618895, + "learning_rate": 0.00018954762334642158, + "loss": 1.1531, + "step": 3961 + }, + { + "epoch": 0.38, + "grad_norm": 0.31059482758918283, + "learning_rate": 0.00018954058082306817, + "loss": 1.0824, + "step": 3962 + }, + { + "epoch": 0.38, + "grad_norm": 0.2599356809697041, + "learning_rate": 0.0001895335360589054, + "loss": 1.0651, + "step": 3963 + }, + { + "epoch": 0.38, + "grad_norm": 0.28568670159243, + "learning_rate": 0.00018952648905410966, + "loss": 1.111, + "step": 3964 + }, + { + "epoch": 0.38, + "grad_norm": 0.2741580155840068, + "learning_rate": 0.0001895194398088573, + "loss": 1.0367, + "step": 3965 + }, + { + "epoch": 0.38, + "grad_norm": 0.2460551214193779, + "learning_rate": 0.00018951238832332464, + "loss": 1.1306, + "step": 3966 + }, + { + "epoch": 0.38, + "grad_norm": 0.28164939664440597, + "learning_rate": 0.00018950533459768823, + "loss": 1.0736, + "step": 3967 + }, + { + "epoch": 0.38, + "grad_norm": 0.2935359891153058, + "learning_rate": 0.00018949827863212456, + "loss": 1.1287, + "step": 3968 + }, + { + "epoch": 0.38, + "grad_norm": 0.27127546853360357, + "learning_rate": 0.00018949122042681023, + "loss": 0.9919, + "step": 3969 + }, + { + "epoch": 0.38, + "grad_norm": 0.3153198801953517, + "learning_rate": 0.00018948415998192182, + "loss": 1.1561, + "step": 3970 + }, + { + "epoch": 0.38, + "grad_norm": 0.2639733592331646, + "learning_rate": 0.0001894770972976361, + "loss": 1.0033, + "step": 3971 + }, + { + "epoch": 0.38, + "grad_norm": 0.26313646288953435, + "learning_rate": 0.0001894700323741298, + "loss": 1.1798, + "step": 3972 + }, + { + "epoch": 0.38, + "grad_norm": 0.2711515260554288, + "learning_rate": 0.0001894629652115797, + "loss": 1.1126, + "step": 3973 + }, + { + "epoch": 0.38, + "grad_norm": 0.272492854127571, + "learning_rate": 0.0001894558958101627, + "loss": 0.9806, + "step": 3974 + }, + { + "epoch": 0.38, + "grad_norm": 0.27502865859424286, + "learning_rate": 0.00018944882417005565, + "loss": 1.088, + "step": 3975 + }, + { + "epoch": 0.38, + "grad_norm": 0.3343809998581711, + "learning_rate": 0.00018944175029143558, + "loss": 1.1771, + "step": 3976 + }, + { + "epoch": 0.38, + "grad_norm": 0.2894367965617339, + "learning_rate": 0.0001894346741744795, + "loss": 1.1597, + "step": 3977 + }, + { + "epoch": 0.38, + "grad_norm": 0.25286298932737533, + "learning_rate": 0.00018942759581936446, + "loss": 1.0403, + "step": 3978 + }, + { + "epoch": 0.38, + "grad_norm": 0.27494211784286493, + "learning_rate": 0.00018942051522626764, + "loss": 1.1405, + "step": 3979 + }, + { + "epoch": 0.38, + "grad_norm": 0.28818353355011783, + "learning_rate": 0.00018941343239536624, + "loss": 1.0295, + "step": 3980 + }, + { + "epoch": 0.38, + "grad_norm": 0.25536310488024094, + "learning_rate": 0.0001894063473268375, + "loss": 1.0547, + "step": 3981 + }, + { + "epoch": 0.38, + "grad_norm": 0.32569914313859244, + "learning_rate": 0.00018939926002085872, + "loss": 1.2414, + "step": 3982 + }, + { + "epoch": 0.38, + "grad_norm": 0.2869364080234295, + "learning_rate": 0.0001893921704776073, + "loss": 0.9819, + "step": 3983 + }, + { + "epoch": 0.38, + "grad_norm": 0.2657350930953723, + "learning_rate": 0.0001893850786972606, + "loss": 1.0408, + "step": 3984 + }, + { + "epoch": 0.38, + "grad_norm": 0.29775662133242436, + "learning_rate": 0.0001893779846799961, + "loss": 1.1624, + "step": 3985 + }, + { + "epoch": 0.38, + "grad_norm": 0.27355863349278053, + "learning_rate": 0.00018937088842599142, + "loss": 1.2827, + "step": 3986 + }, + { + "epoch": 0.38, + "grad_norm": 0.2732303129004138, + "learning_rate": 0.00018936378993542408, + "loss": 1.0976, + "step": 3987 + }, + { + "epoch": 0.38, + "grad_norm": 0.3021647282609973, + "learning_rate": 0.00018935668920847171, + "loss": 1.0696, + "step": 3988 + }, + { + "epoch": 0.38, + "grad_norm": 0.272227355179039, + "learning_rate": 0.00018934958624531207, + "loss": 1.0169, + "step": 3989 + }, + { + "epoch": 0.38, + "grad_norm": 0.2958029148879499, + "learning_rate": 0.00018934248104612283, + "loss": 1.1677, + "step": 3990 + }, + { + "epoch": 0.38, + "grad_norm": 0.2961165738395803, + "learning_rate": 0.00018933537361108188, + "loss": 1.1413, + "step": 3991 + }, + { + "epoch": 0.38, + "grad_norm": 0.3276060272777321, + "learning_rate": 0.00018932826394036707, + "loss": 1.1431, + "step": 3992 + }, + { + "epoch": 0.38, + "grad_norm": 0.24414858635640996, + "learning_rate": 0.00018932115203415631, + "loss": 0.9642, + "step": 3993 + }, + { + "epoch": 0.38, + "grad_norm": 0.24286855215872527, + "learning_rate": 0.0001893140378926276, + "loss": 1.0458, + "step": 3994 + }, + { + "epoch": 0.38, + "grad_norm": 0.2846207625112956, + "learning_rate": 0.0001893069215159589, + "loss": 1.1316, + "step": 3995 + }, + { + "epoch": 0.38, + "grad_norm": 0.28248248879909615, + "learning_rate": 0.00018929980290432842, + "loss": 1.1581, + "step": 3996 + }, + { + "epoch": 0.38, + "grad_norm": 0.27218202374232026, + "learning_rate": 0.00018929268205791422, + "loss": 1.2227, + "step": 3997 + }, + { + "epoch": 0.38, + "grad_norm": 0.26852560065398273, + "learning_rate": 0.00018928555897689456, + "loss": 0.9302, + "step": 3998 + }, + { + "epoch": 0.38, + "grad_norm": 0.2776665044902241, + "learning_rate": 0.00018927843366144765, + "loss": 1.0248, + "step": 3999 + }, + { + "epoch": 0.38, + "grad_norm": 0.27797522607330477, + "learning_rate": 0.00018927130611175183, + "loss": 1.166, + "step": 4000 + }, + { + "epoch": 0.38, + "grad_norm": 0.23621425637102375, + "learning_rate": 0.00018926417632798547, + "loss": 1.1106, + "step": 4001 + }, + { + "epoch": 0.38, + "grad_norm": 0.2926545007076596, + "learning_rate": 0.000189257044310327, + "loss": 1.1281, + "step": 4002 + }, + { + "epoch": 0.38, + "grad_norm": 0.27712375321038807, + "learning_rate": 0.00018924991005895493, + "loss": 1.0626, + "step": 4003 + }, + { + "epoch": 0.38, + "grad_norm": 0.231003959100629, + "learning_rate": 0.0001892427735740477, + "loss": 0.9582, + "step": 4004 + }, + { + "epoch": 0.38, + "grad_norm": 0.29692646912091747, + "learning_rate": 0.00018923563485578405, + "loss": 1.0728, + "step": 4005 + }, + { + "epoch": 0.38, + "grad_norm": 0.26325491425794023, + "learning_rate": 0.0001892284939043425, + "loss": 1.1163, + "step": 4006 + }, + { + "epoch": 0.38, + "grad_norm": 0.27243932891523137, + "learning_rate": 0.00018922135071990185, + "loss": 1.0374, + "step": 4007 + }, + { + "epoch": 0.38, + "grad_norm": 0.28198690740674986, + "learning_rate": 0.0001892142053026408, + "loss": 1.1425, + "step": 4008 + }, + { + "epoch": 0.38, + "grad_norm": 0.28509269273338117, + "learning_rate": 0.00018920705765273818, + "loss": 1.3779, + "step": 4009 + }, + { + "epoch": 0.38, + "grad_norm": 0.28126037107058494, + "learning_rate": 0.0001891999077703729, + "loss": 1.1616, + "step": 4010 + }, + { + "epoch": 0.38, + "grad_norm": 0.3282196814898794, + "learning_rate": 0.00018919275565572387, + "loss": 1.1184, + "step": 4011 + }, + { + "epoch": 0.38, + "grad_norm": 0.2796592489259632, + "learning_rate": 0.00018918560130897006, + "loss": 1.1493, + "step": 4012 + }, + { + "epoch": 0.38, + "grad_norm": 0.35705040515515696, + "learning_rate": 0.00018917844473029054, + "loss": 1.035, + "step": 4013 + }, + { + "epoch": 0.38, + "grad_norm": 0.26954678750551575, + "learning_rate": 0.00018917128591986439, + "loss": 1.1217, + "step": 4014 + }, + { + "epoch": 0.38, + "grad_norm": 0.268134645202258, + "learning_rate": 0.00018916412487787076, + "loss": 1.1415, + "step": 4015 + }, + { + "epoch": 0.38, + "grad_norm": 0.2869059780412037, + "learning_rate": 0.0001891569616044889, + "loss": 1.1142, + "step": 4016 + }, + { + "epoch": 0.38, + "grad_norm": 0.2866248425276922, + "learning_rate": 0.000189149796099898, + "loss": 0.968, + "step": 4017 + }, + { + "epoch": 0.38, + "grad_norm": 0.2854847382998254, + "learning_rate": 0.00018914262836427744, + "loss": 1.1804, + "step": 4018 + }, + { + "epoch": 0.38, + "grad_norm": 0.28804702455957, + "learning_rate": 0.00018913545839780658, + "loss": 0.9394, + "step": 4019 + }, + { + "epoch": 0.38, + "grad_norm": 0.26449776920625534, + "learning_rate": 0.00018912828620066486, + "loss": 1.1066, + "step": 4020 + }, + { + "epoch": 0.38, + "grad_norm": 0.2946737668152781, + "learning_rate": 0.00018912111177303177, + "loss": 1.2112, + "step": 4021 + }, + { + "epoch": 0.38, + "grad_norm": 0.2901645389196644, + "learning_rate": 0.00018911393511508685, + "loss": 1.039, + "step": 4022 + }, + { + "epoch": 0.38, + "grad_norm": 0.27790893974884995, + "learning_rate": 0.00018910675622700967, + "loss": 1.058, + "step": 4023 + }, + { + "epoch": 0.38, + "grad_norm": 0.27176601010684964, + "learning_rate": 0.00018909957510897992, + "loss": 1.1212, + "step": 4024 + }, + { + "epoch": 0.39, + "grad_norm": 0.27542708140447214, + "learning_rate": 0.00018909239176117732, + "loss": 1.0686, + "step": 4025 + }, + { + "epoch": 0.39, + "grad_norm": 0.27230675704373336, + "learning_rate": 0.0001890852061837816, + "loss": 1.1652, + "step": 4026 + }, + { + "epoch": 0.39, + "grad_norm": 0.2557788458829725, + "learning_rate": 0.00018907801837697265, + "loss": 0.955, + "step": 4027 + }, + { + "epoch": 0.39, + "grad_norm": 0.2601254941936526, + "learning_rate": 0.00018907082834093028, + "loss": 1.0526, + "step": 4028 + }, + { + "epoch": 0.39, + "grad_norm": 0.266059316761016, + "learning_rate": 0.00018906363607583445, + "loss": 1.0664, + "step": 4029 + }, + { + "epoch": 0.39, + "grad_norm": 0.26037123171682397, + "learning_rate": 0.00018905644158186515, + "loss": 1.027, + "step": 4030 + }, + { + "epoch": 0.39, + "grad_norm": 0.2563328237402112, + "learning_rate": 0.00018904924485920247, + "loss": 1.1243, + "step": 4031 + }, + { + "epoch": 0.39, + "grad_norm": 0.28925250146928455, + "learning_rate": 0.0001890420459080264, + "loss": 1.1726, + "step": 4032 + }, + { + "epoch": 0.39, + "grad_norm": 0.28975657876816185, + "learning_rate": 0.0001890348447285172, + "loss": 1.0362, + "step": 4033 + }, + { + "epoch": 0.39, + "grad_norm": 0.266527827382827, + "learning_rate": 0.00018902764132085507, + "loss": 1.1282, + "step": 4034 + }, + { + "epoch": 0.39, + "grad_norm": 0.2819868839387067, + "learning_rate": 0.00018902043568522027, + "loss": 1.0481, + "step": 4035 + }, + { + "epoch": 0.39, + "grad_norm": 0.26260787323629486, + "learning_rate": 0.0001890132278217931, + "loss": 1.0494, + "step": 4036 + }, + { + "epoch": 0.39, + "grad_norm": 0.29231684987402756, + "learning_rate": 0.00018900601773075396, + "loss": 1.0668, + "step": 4037 + }, + { + "epoch": 0.39, + "grad_norm": 0.2689538216980018, + "learning_rate": 0.00018899880541228332, + "loss": 1.0549, + "step": 4038 + }, + { + "epoch": 0.39, + "grad_norm": 0.2748948697066311, + "learning_rate": 0.0001889915908665616, + "loss": 1.0691, + "step": 4039 + }, + { + "epoch": 0.39, + "grad_norm": 0.26070699821281274, + "learning_rate": 0.00018898437409376942, + "loss": 1.0906, + "step": 4040 + }, + { + "epoch": 0.39, + "grad_norm": 0.29273143034067534, + "learning_rate": 0.00018897715509408734, + "loss": 1.0254, + "step": 4041 + }, + { + "epoch": 0.39, + "grad_norm": 0.3032571667610144, + "learning_rate": 0.00018896993386769602, + "loss": 0.9515, + "step": 4042 + }, + { + "epoch": 0.39, + "grad_norm": 0.2567507749894945, + "learning_rate": 0.0001889627104147762, + "loss": 1.054, + "step": 4043 + }, + { + "epoch": 0.39, + "grad_norm": 0.3048904231170557, + "learning_rate": 0.00018895548473550866, + "loss": 1.0738, + "step": 4044 + }, + { + "epoch": 0.39, + "grad_norm": 0.27093062547894164, + "learning_rate": 0.00018894825683007417, + "loss": 1.1361, + "step": 4045 + }, + { + "epoch": 0.39, + "grad_norm": 0.2907476290882117, + "learning_rate": 0.00018894102669865368, + "loss": 1.2182, + "step": 4046 + }, + { + "epoch": 0.39, + "grad_norm": 0.27932158114529015, + "learning_rate": 0.0001889337943414281, + "loss": 1.0292, + "step": 4047 + }, + { + "epoch": 0.39, + "grad_norm": 0.303163185427342, + "learning_rate": 0.00018892655975857842, + "loss": 1.0809, + "step": 4048 + }, + { + "epoch": 0.39, + "grad_norm": 0.28389248942158896, + "learning_rate": 0.0001889193229502857, + "loss": 0.9802, + "step": 4049 + }, + { + "epoch": 0.39, + "grad_norm": 0.23743556853791362, + "learning_rate": 0.000188912083916731, + "loss": 1.0415, + "step": 4050 + }, + { + "epoch": 0.39, + "grad_norm": 0.2607618534899767, + "learning_rate": 0.00018890484265809558, + "loss": 1.1187, + "step": 4051 + }, + { + "epoch": 0.39, + "grad_norm": 0.282981189123625, + "learning_rate": 0.00018889759917456057, + "loss": 1.1592, + "step": 4052 + }, + { + "epoch": 0.39, + "grad_norm": 0.2875209522933082, + "learning_rate": 0.00018889035346630726, + "loss": 1.1722, + "step": 4053 + }, + { + "epoch": 0.39, + "grad_norm": 0.28469142917045703, + "learning_rate": 0.000188883105533517, + "loss": 1.0714, + "step": 4054 + }, + { + "epoch": 0.39, + "grad_norm": 0.27563667585065, + "learning_rate": 0.00018887585537637116, + "loss": 0.9172, + "step": 4055 + }, + { + "epoch": 0.39, + "grad_norm": 0.2735767551268766, + "learning_rate": 0.00018886860299505118, + "loss": 1.1189, + "step": 4056 + }, + { + "epoch": 0.39, + "grad_norm": 0.30194229419916335, + "learning_rate": 0.00018886134838973857, + "loss": 1.0886, + "step": 4057 + }, + { + "epoch": 0.39, + "grad_norm": 0.2932088348927251, + "learning_rate": 0.00018885409156061488, + "loss": 1.115, + "step": 4058 + }, + { + "epoch": 0.39, + "grad_norm": 0.27110810019888465, + "learning_rate": 0.00018884683250786167, + "loss": 1.0398, + "step": 4059 + }, + { + "epoch": 0.39, + "grad_norm": 0.33237664113893134, + "learning_rate": 0.00018883957123166066, + "loss": 1.1189, + "step": 4060 + }, + { + "epoch": 0.39, + "grad_norm": 0.2507604291402785, + "learning_rate": 0.00018883230773219354, + "loss": 1.0053, + "step": 4061 + }, + { + "epoch": 0.39, + "grad_norm": 0.2972413921137987, + "learning_rate": 0.00018882504200964207, + "loss": 1.1487, + "step": 4062 + }, + { + "epoch": 0.39, + "grad_norm": 0.26170579658272336, + "learning_rate": 0.00018881777406418816, + "loss": 1.1655, + "step": 4063 + }, + { + "epoch": 0.39, + "grad_norm": 0.30453124626017863, + "learning_rate": 0.00018881050389601357, + "loss": 0.9793, + "step": 4064 + }, + { + "epoch": 0.39, + "grad_norm": 0.27274778300480246, + "learning_rate": 0.00018880323150530034, + "loss": 1.019, + "step": 4065 + }, + { + "epoch": 0.39, + "grad_norm": 0.2597880316659366, + "learning_rate": 0.0001887959568922304, + "loss": 1.13, + "step": 4066 + }, + { + "epoch": 0.39, + "grad_norm": 0.30696703463261027, + "learning_rate": 0.00018878868005698586, + "loss": 1.068, + "step": 4067 + }, + { + "epoch": 0.39, + "grad_norm": 0.254482856569276, + "learning_rate": 0.0001887814009997488, + "loss": 1.156, + "step": 4068 + }, + { + "epoch": 0.39, + "grad_norm": 0.29100192996613405, + "learning_rate": 0.00018877411972070135, + "loss": 1.1195, + "step": 4069 + }, + { + "epoch": 0.39, + "grad_norm": 0.2592354961216204, + "learning_rate": 0.0001887668362200258, + "loss": 1.1087, + "step": 4070 + }, + { + "epoch": 0.39, + "grad_norm": 0.28232033919806615, + "learning_rate": 0.00018875955049790438, + "loss": 1.0374, + "step": 4071 + }, + { + "epoch": 0.39, + "grad_norm": 0.2710379764058582, + "learning_rate": 0.00018875226255451942, + "loss": 1.0692, + "step": 4072 + }, + { + "epoch": 0.39, + "grad_norm": 0.3211139224345676, + "learning_rate": 0.00018874497239005332, + "loss": 1.0422, + "step": 4073 + }, + { + "epoch": 0.39, + "grad_norm": 0.26011924754300747, + "learning_rate": 0.0001887376800046885, + "loss": 1.0197, + "step": 4074 + }, + { + "epoch": 0.39, + "grad_norm": 0.2596113402217534, + "learning_rate": 0.00018873038539860747, + "loss": 1.0965, + "step": 4075 + }, + { + "epoch": 0.39, + "grad_norm": 0.2666964093730143, + "learning_rate": 0.0001887230885719928, + "loss": 1.1003, + "step": 4076 + }, + { + "epoch": 0.39, + "grad_norm": 0.24755453906410413, + "learning_rate": 0.00018871578952502703, + "loss": 1.0753, + "step": 4077 + }, + { + "epoch": 0.39, + "grad_norm": 0.25330256715469024, + "learning_rate": 0.0001887084882578929, + "loss": 1.0962, + "step": 4078 + }, + { + "epoch": 0.39, + "grad_norm": 0.2664417858327983, + "learning_rate": 0.00018870118477077309, + "loss": 1.1026, + "step": 4079 + }, + { + "epoch": 0.39, + "grad_norm": 0.2887769686304296, + "learning_rate": 0.00018869387906385044, + "loss": 1.085, + "step": 4080 + }, + { + "epoch": 0.39, + "grad_norm": 0.2945622145717073, + "learning_rate": 0.00018868657113730764, + "loss": 1.0453, + "step": 4081 + }, + { + "epoch": 0.39, + "grad_norm": 0.2914962906932826, + "learning_rate": 0.0001886792609913277, + "loss": 1.1669, + "step": 4082 + }, + { + "epoch": 0.39, + "grad_norm": 0.2982094102526812, + "learning_rate": 0.00018867194862609354, + "loss": 1.1398, + "step": 4083 + }, + { + "epoch": 0.39, + "grad_norm": 0.2943744623045212, + "learning_rate": 0.0001886646340417881, + "loss": 1.1049, + "step": 4084 + }, + { + "epoch": 0.39, + "grad_norm": 0.2808334985745818, + "learning_rate": 0.0001886573172385945, + "loss": 1.0434, + "step": 4085 + }, + { + "epoch": 0.39, + "grad_norm": 0.28765243366041743, + "learning_rate": 0.0001886499982166958, + "loss": 1.1537, + "step": 4086 + }, + { + "epoch": 0.39, + "grad_norm": 0.2838538473305344, + "learning_rate": 0.0001886426769762752, + "loss": 1.1621, + "step": 4087 + }, + { + "epoch": 0.39, + "grad_norm": 0.2779123375968445, + "learning_rate": 0.00018863535351751586, + "loss": 1.0426, + "step": 4088 + }, + { + "epoch": 0.39, + "grad_norm": 0.26234781180349137, + "learning_rate": 0.00018862802784060115, + "loss": 1.1284, + "step": 4089 + }, + { + "epoch": 0.39, + "grad_norm": 0.27792379078088625, + "learning_rate": 0.00018862069994571428, + "loss": 0.9789, + "step": 4090 + }, + { + "epoch": 0.39, + "grad_norm": 0.3213818567797798, + "learning_rate": 0.00018861336983303875, + "loss": 1.0857, + "step": 4091 + }, + { + "epoch": 0.39, + "grad_norm": 0.2942679485146761, + "learning_rate": 0.0001886060375027579, + "loss": 1.0244, + "step": 4092 + }, + { + "epoch": 0.39, + "grad_norm": 0.2934982143646054, + "learning_rate": 0.0001885987029550553, + "loss": 1.0908, + "step": 4093 + }, + { + "epoch": 0.39, + "grad_norm": 0.2705085832069158, + "learning_rate": 0.00018859136619011447, + "loss": 1.0971, + "step": 4094 + }, + { + "epoch": 0.39, + "grad_norm": 0.2645736238106595, + "learning_rate": 0.00018858402720811905, + "loss": 1.0806, + "step": 4095 + }, + { + "epoch": 0.39, + "grad_norm": 0.25805002703777125, + "learning_rate": 0.00018857668600925264, + "loss": 1.0815, + "step": 4096 + }, + { + "epoch": 0.39, + "grad_norm": 0.2768106728894951, + "learning_rate": 0.00018856934259369902, + "loss": 1.0963, + "step": 4097 + }, + { + "epoch": 0.39, + "grad_norm": 0.2952517992902994, + "learning_rate": 0.00018856199696164194, + "loss": 1.0913, + "step": 4098 + }, + { + "epoch": 0.39, + "grad_norm": 0.25015646961672283, + "learning_rate": 0.0001885546491132652, + "loss": 1.0521, + "step": 4099 + }, + { + "epoch": 0.39, + "grad_norm": 0.2933146156325368, + "learning_rate": 0.00018854729904875273, + "loss": 1.1188, + "step": 4100 + }, + { + "epoch": 0.39, + "grad_norm": 0.2906550664804429, + "learning_rate": 0.00018853994676828846, + "loss": 1.1433, + "step": 4101 + }, + { + "epoch": 0.39, + "grad_norm": 0.26884740955424163, + "learning_rate": 0.00018853259227205634, + "loss": 1.1072, + "step": 4102 + }, + { + "epoch": 0.39, + "grad_norm": 0.3113311790033681, + "learning_rate": 0.0001885252355602405, + "loss": 1.1282, + "step": 4103 + }, + { + "epoch": 0.39, + "grad_norm": 0.28155446924580935, + "learning_rate": 0.00018851787663302498, + "loss": 1.083, + "step": 4104 + }, + { + "epoch": 0.39, + "grad_norm": 0.29809592194674794, + "learning_rate": 0.00018851051549059397, + "loss": 1.0569, + "step": 4105 + }, + { + "epoch": 0.39, + "grad_norm": 0.24384181868224059, + "learning_rate": 0.0001885031521331317, + "loss": 1.0561, + "step": 4106 + }, + { + "epoch": 0.39, + "grad_norm": 0.27108319846787104, + "learning_rate": 0.0001884957865608224, + "loss": 1.058, + "step": 4107 + }, + { + "epoch": 0.39, + "grad_norm": 0.2943767607152993, + "learning_rate": 0.00018848841877385045, + "loss": 1.1393, + "step": 4108 + }, + { + "epoch": 0.39, + "grad_norm": 0.29474696198802675, + "learning_rate": 0.00018848104877240015, + "loss": 1.1804, + "step": 4109 + }, + { + "epoch": 0.39, + "grad_norm": 0.2951250932743141, + "learning_rate": 0.00018847367655665606, + "loss": 1.2261, + "step": 4110 + }, + { + "epoch": 0.39, + "grad_norm": 0.2608593829479291, + "learning_rate": 0.0001884663021268026, + "loss": 1.0769, + "step": 4111 + }, + { + "epoch": 0.39, + "grad_norm": 0.29600065532000475, + "learning_rate": 0.0001884589254830243, + "loss": 1.0772, + "step": 4112 + }, + { + "epoch": 0.39, + "grad_norm": 0.2990313407932539, + "learning_rate": 0.0001884515466255058, + "loss": 1.1775, + "step": 4113 + }, + { + "epoch": 0.39, + "grad_norm": 0.2843669340987528, + "learning_rate": 0.00018844416555443178, + "loss": 1.1374, + "step": 4114 + }, + { + "epoch": 0.39, + "grad_norm": 0.2508179755628321, + "learning_rate": 0.00018843678226998693, + "loss": 1.1355, + "step": 4115 + }, + { + "epoch": 0.39, + "grad_norm": 0.2754853279028242, + "learning_rate": 0.000188429396772356, + "loss": 1.0528, + "step": 4116 + }, + { + "epoch": 0.39, + "grad_norm": 0.2692652041119292, + "learning_rate": 0.00018842200906172386, + "loss": 1.081, + "step": 4117 + }, + { + "epoch": 0.39, + "grad_norm": 0.2581020541470848, + "learning_rate": 0.00018841461913827537, + "loss": 1.1674, + "step": 4118 + }, + { + "epoch": 0.39, + "grad_norm": 0.30950720631732437, + "learning_rate": 0.0001884072270021955, + "loss": 1.1665, + "step": 4119 + }, + { + "epoch": 0.39, + "grad_norm": 0.2485313737562046, + "learning_rate": 0.00018839983265366917, + "loss": 1.1825, + "step": 4120 + }, + { + "epoch": 0.39, + "grad_norm": 0.2644284997612591, + "learning_rate": 0.0001883924360928815, + "loss": 1.1264, + "step": 4121 + }, + { + "epoch": 0.39, + "grad_norm": 0.29112561286000443, + "learning_rate": 0.0001883850373200175, + "loss": 1.1339, + "step": 4122 + }, + { + "epoch": 0.39, + "grad_norm": 0.27090825993266066, + "learning_rate": 0.00018837763633526247, + "loss": 1.1998, + "step": 4123 + }, + { + "epoch": 0.39, + "grad_norm": 0.3146596625378274, + "learning_rate": 0.0001883702331388015, + "loss": 1.1325, + "step": 4124 + }, + { + "epoch": 0.39, + "grad_norm": 0.31943559497066387, + "learning_rate": 0.00018836282773081992, + "loss": 1.0685, + "step": 4125 + }, + { + "epoch": 0.39, + "grad_norm": 0.3067968110428308, + "learning_rate": 0.00018835542011150303, + "loss": 1.1561, + "step": 4126 + }, + { + "epoch": 0.39, + "grad_norm": 0.27469103129370837, + "learning_rate": 0.00018834801028103627, + "loss": 1.0606, + "step": 4127 + }, + { + "epoch": 0.39, + "grad_norm": 0.2565388108324173, + "learning_rate": 0.00018834059823960497, + "loss": 1.1264, + "step": 4128 + }, + { + "epoch": 0.4, + "grad_norm": 0.26620235976121565, + "learning_rate": 0.0001883331839873947, + "loss": 0.9498, + "step": 4129 + }, + { + "epoch": 0.4, + "grad_norm": 0.2736769679639783, + "learning_rate": 0.00018832576752459099, + "loss": 1.1824, + "step": 4130 + }, + { + "epoch": 0.4, + "grad_norm": 0.28039373078224655, + "learning_rate": 0.00018831834885137943, + "loss": 1.0567, + "step": 4131 + }, + { + "epoch": 0.4, + "grad_norm": 0.28360004650576354, + "learning_rate": 0.00018831092796794572, + "loss": 1.1355, + "step": 4132 + }, + { + "epoch": 0.4, + "grad_norm": 0.3007266751162003, + "learning_rate": 0.0001883035048744755, + "loss": 1.1805, + "step": 4133 + }, + { + "epoch": 0.4, + "grad_norm": 0.27426609206190533, + "learning_rate": 0.00018829607957115458, + "loss": 1.1591, + "step": 4134 + }, + { + "epoch": 0.4, + "grad_norm": 0.2975062542878613, + "learning_rate": 0.00018828865205816877, + "loss": 1.1326, + "step": 4135 + }, + { + "epoch": 0.4, + "grad_norm": 0.24138546192014423, + "learning_rate": 0.00018828122233570396, + "loss": 1.0062, + "step": 4136 + }, + { + "epoch": 0.4, + "grad_norm": 0.3259428242836633, + "learning_rate": 0.00018827379040394607, + "loss": 1.0638, + "step": 4137 + }, + { + "epoch": 0.4, + "grad_norm": 0.2893519126284774, + "learning_rate": 0.00018826635626308113, + "loss": 1.0938, + "step": 4138 + }, + { + "epoch": 0.4, + "grad_norm": 0.2649524956937564, + "learning_rate": 0.00018825891991329513, + "loss": 1.0869, + "step": 4139 + }, + { + "epoch": 0.4, + "grad_norm": 0.28942290188391856, + "learning_rate": 0.00018825148135477417, + "loss": 0.9653, + "step": 4140 + }, + { + "epoch": 0.4, + "grad_norm": 0.303875931112067, + "learning_rate": 0.00018824404058770443, + "loss": 1.1875, + "step": 4141 + }, + { + "epoch": 0.4, + "grad_norm": 0.27790521276702573, + "learning_rate": 0.00018823659761227216, + "loss": 1.1442, + "step": 4142 + }, + { + "epoch": 0.4, + "grad_norm": 0.2499769206341798, + "learning_rate": 0.00018822915242866354, + "loss": 0.9754, + "step": 4143 + }, + { + "epoch": 0.4, + "grad_norm": 0.30156640795095413, + "learning_rate": 0.00018822170503706494, + "loss": 1.1384, + "step": 4144 + }, + { + "epoch": 0.4, + "grad_norm": 0.2551627106250161, + "learning_rate": 0.00018821425543766275, + "loss": 0.9835, + "step": 4145 + }, + { + "epoch": 0.4, + "grad_norm": 0.2699256367377609, + "learning_rate": 0.00018820680363064335, + "loss": 1.0811, + "step": 4146 + }, + { + "epoch": 0.4, + "grad_norm": 0.2685887519613943, + "learning_rate": 0.00018819934961619323, + "loss": 1.1884, + "step": 4147 + }, + { + "epoch": 0.4, + "grad_norm": 0.27522442156799526, + "learning_rate": 0.000188191893394499, + "loss": 1.1393, + "step": 4148 + }, + { + "epoch": 0.4, + "grad_norm": 0.25866933190885216, + "learning_rate": 0.0001881844349657472, + "loss": 1.0642, + "step": 4149 + }, + { + "epoch": 0.4, + "grad_norm": 0.2767798366498473, + "learning_rate": 0.00018817697433012447, + "loss": 1.1294, + "step": 4150 + }, + { + "epoch": 0.4, + "grad_norm": 0.25741272329454395, + "learning_rate": 0.00018816951148781756, + "loss": 1.2252, + "step": 4151 + }, + { + "epoch": 0.4, + "grad_norm": 0.2703985849781623, + "learning_rate": 0.00018816204643901322, + "loss": 1.0734, + "step": 4152 + }, + { + "epoch": 0.4, + "grad_norm": 0.3049033588778724, + "learning_rate": 0.00018815457918389822, + "loss": 1.053, + "step": 4153 + }, + { + "epoch": 0.4, + "grad_norm": 0.22456734765420605, + "learning_rate": 0.00018814710972265953, + "loss": 0.9969, + "step": 4154 + }, + { + "epoch": 0.4, + "grad_norm": 0.2773921790872893, + "learning_rate": 0.00018813963805548397, + "loss": 1.0376, + "step": 4155 + }, + { + "epoch": 0.4, + "grad_norm": 0.34700507029136146, + "learning_rate": 0.0001881321641825586, + "loss": 1.1532, + "step": 4156 + }, + { + "epoch": 0.4, + "grad_norm": 0.3113422889257847, + "learning_rate": 0.00018812468810407043, + "loss": 1.0595, + "step": 4157 + }, + { + "epoch": 0.4, + "grad_norm": 0.29393485580712975, + "learning_rate": 0.00018811720982020655, + "loss": 1.1448, + "step": 4158 + }, + { + "epoch": 0.4, + "grad_norm": 0.2958948874029899, + "learning_rate": 0.00018810972933115412, + "loss": 1.1131, + "step": 4159 + }, + { + "epoch": 0.4, + "grad_norm": 0.2465662069981653, + "learning_rate": 0.00018810224663710033, + "loss": 0.8469, + "step": 4160 + }, + { + "epoch": 0.4, + "grad_norm": 0.2876082268093369, + "learning_rate": 0.00018809476173823247, + "loss": 1.0811, + "step": 4161 + }, + { + "epoch": 0.4, + "grad_norm": 0.3097135244404772, + "learning_rate": 0.0001880872746347378, + "loss": 1.1378, + "step": 4162 + }, + { + "epoch": 0.4, + "grad_norm": 0.2793139934409337, + "learning_rate": 0.00018807978532680374, + "loss": 1.0819, + "step": 4163 + }, + { + "epoch": 0.4, + "grad_norm": 0.2657493887824555, + "learning_rate": 0.0001880722938146177, + "loss": 1.1998, + "step": 4164 + }, + { + "epoch": 0.4, + "grad_norm": 0.2504323188479509, + "learning_rate": 0.00018806480009836716, + "loss": 1.136, + "step": 4165 + }, + { + "epoch": 0.4, + "grad_norm": 0.2770886792479756, + "learning_rate": 0.00018805730417823964, + "loss": 1.0626, + "step": 4166 + }, + { + "epoch": 0.4, + "grad_norm": 0.28327442888376786, + "learning_rate": 0.00018804980605442273, + "loss": 1.2524, + "step": 4167 + }, + { + "epoch": 0.4, + "grad_norm": 0.2738803155422817, + "learning_rate": 0.00018804230572710411, + "loss": 1.2178, + "step": 4168 + }, + { + "epoch": 0.4, + "grad_norm": 0.2621464862724536, + "learning_rate": 0.00018803480319647145, + "loss": 1.1003, + "step": 4169 + }, + { + "epoch": 0.4, + "grad_norm": 0.2829875367760781, + "learning_rate": 0.0001880272984627125, + "loss": 1.1652, + "step": 4170 + }, + { + "epoch": 0.4, + "grad_norm": 0.25110962790800634, + "learning_rate": 0.00018801979152601508, + "loss": 1.0741, + "step": 4171 + }, + { + "epoch": 0.4, + "grad_norm": 0.3021846850869496, + "learning_rate": 0.0001880122823865671, + "loss": 1.1427, + "step": 4172 + }, + { + "epoch": 0.4, + "grad_norm": 0.25970989561092755, + "learning_rate": 0.00018800477104455638, + "loss": 1.1105, + "step": 4173 + }, + { + "epoch": 0.4, + "grad_norm": 0.3037823815880924, + "learning_rate": 0.00018799725750017098, + "loss": 1.04, + "step": 4174 + }, + { + "epoch": 0.4, + "grad_norm": 0.28340725697761254, + "learning_rate": 0.00018798974175359892, + "loss": 1.0287, + "step": 4175 + }, + { + "epoch": 0.4, + "grad_norm": 0.2907181694596812, + "learning_rate": 0.00018798222380502825, + "loss": 1.1459, + "step": 4176 + }, + { + "epoch": 0.4, + "grad_norm": 0.2798317614131016, + "learning_rate": 0.00018797470365464718, + "loss": 1.0269, + "step": 4177 + }, + { + "epoch": 0.4, + "grad_norm": 0.2451699568307834, + "learning_rate": 0.0001879671813026438, + "loss": 1.0953, + "step": 4178 + }, + { + "epoch": 0.4, + "grad_norm": 0.26115132881368164, + "learning_rate": 0.00018795965674920647, + "loss": 0.95, + "step": 4179 + }, + { + "epoch": 0.4, + "grad_norm": 0.28240121314882904, + "learning_rate": 0.00018795212999452344, + "loss": 1.0437, + "step": 4180 + }, + { + "epoch": 0.4, + "grad_norm": 0.30408179576379274, + "learning_rate": 0.00018794460103878306, + "loss": 1.1509, + "step": 4181 + }, + { + "epoch": 0.4, + "grad_norm": 0.2613648521070335, + "learning_rate": 0.00018793706988217378, + "loss": 1.0649, + "step": 4182 + }, + { + "epoch": 0.4, + "grad_norm": 0.2807710295201798, + "learning_rate": 0.00018792953652488405, + "loss": 1.2016, + "step": 4183 + }, + { + "epoch": 0.4, + "grad_norm": 0.27595401782163975, + "learning_rate": 0.0001879220009671024, + "loss": 1.12, + "step": 4184 + }, + { + "epoch": 0.4, + "grad_norm": 0.28286269456870644, + "learning_rate": 0.00018791446320901747, + "loss": 1.0609, + "step": 4185 + }, + { + "epoch": 0.4, + "grad_norm": 0.28563194802618863, + "learning_rate": 0.0001879069232508178, + "loss": 1.0778, + "step": 4186 + }, + { + "epoch": 0.4, + "grad_norm": 0.270002889739277, + "learning_rate": 0.00018789938109269215, + "loss": 1.0788, + "step": 4187 + }, + { + "epoch": 0.4, + "grad_norm": 0.25196325847643697, + "learning_rate": 0.00018789183673482924, + "loss": 1.1108, + "step": 4188 + }, + { + "epoch": 0.4, + "grad_norm": 0.25601666120815847, + "learning_rate": 0.00018788429017741785, + "loss": 1.043, + "step": 4189 + }, + { + "epoch": 0.4, + "grad_norm": 0.2743217941272287, + "learning_rate": 0.0001878767414206469, + "loss": 1.0379, + "step": 4190 + }, + { + "epoch": 0.4, + "grad_norm": 0.2971315421501434, + "learning_rate": 0.00018786919046470527, + "loss": 1.1175, + "step": 4191 + }, + { + "epoch": 0.4, + "grad_norm": 0.24701289964452056, + "learning_rate": 0.0001878616373097819, + "loss": 1.047, + "step": 4192 + }, + { + "epoch": 0.4, + "grad_norm": 0.28093581228292824, + "learning_rate": 0.00018785408195606587, + "loss": 1.1656, + "step": 4193 + }, + { + "epoch": 0.4, + "grad_norm": 0.2738956970254292, + "learning_rate": 0.0001878465244037462, + "loss": 1.0897, + "step": 4194 + }, + { + "epoch": 0.4, + "grad_norm": 0.3300931758890875, + "learning_rate": 0.00018783896465301205, + "loss": 1.1008, + "step": 4195 + }, + { + "epoch": 0.4, + "grad_norm": 0.2339082478589511, + "learning_rate": 0.0001878314027040526, + "loss": 1.1647, + "step": 4196 + }, + { + "epoch": 0.4, + "grad_norm": 0.2575164996133438, + "learning_rate": 0.0001878238385570571, + "loss": 1.1316, + "step": 4197 + }, + { + "epoch": 0.4, + "grad_norm": 0.28065885782328104, + "learning_rate": 0.00018781627221221484, + "loss": 1.1204, + "step": 4198 + }, + { + "epoch": 0.4, + "grad_norm": 0.27240538013841803, + "learning_rate": 0.0001878087036697152, + "loss": 1.1995, + "step": 4199 + }, + { + "epoch": 0.4, + "grad_norm": 0.28177346756683214, + "learning_rate": 0.00018780113292974756, + "loss": 1.0969, + "step": 4200 + }, + { + "epoch": 0.4, + "grad_norm": 0.2869344753327611, + "learning_rate": 0.00018779355999250135, + "loss": 1.02, + "step": 4201 + }, + { + "epoch": 0.4, + "grad_norm": 0.317258761790818, + "learning_rate": 0.00018778598485816618, + "loss": 1.0979, + "step": 4202 + }, + { + "epoch": 0.4, + "grad_norm": 0.28088629903560547, + "learning_rate": 0.00018777840752693152, + "loss": 1.0249, + "step": 4203 + }, + { + "epoch": 0.4, + "grad_norm": 0.2796415407726609, + "learning_rate": 0.00018777082799898705, + "loss": 1.2466, + "step": 4204 + }, + { + "epoch": 0.4, + "grad_norm": 0.28573228367442755, + "learning_rate": 0.00018776324627452247, + "loss": 1.1008, + "step": 4205 + }, + { + "epoch": 0.4, + "grad_norm": 0.36653558077222376, + "learning_rate": 0.0001877556623537275, + "loss": 1.1889, + "step": 4206 + }, + { + "epoch": 0.4, + "grad_norm": 0.257060176220037, + "learning_rate": 0.00018774807623679192, + "loss": 1.096, + "step": 4207 + }, + { + "epoch": 0.4, + "grad_norm": 0.26689142065335364, + "learning_rate": 0.00018774048792390559, + "loss": 1.1396, + "step": 4208 + }, + { + "epoch": 0.4, + "grad_norm": 0.27797803343333893, + "learning_rate": 0.0001877328974152584, + "loss": 1.0059, + "step": 4209 + }, + { + "epoch": 0.4, + "grad_norm": 0.26890403496189735, + "learning_rate": 0.00018772530471104028, + "loss": 1.1074, + "step": 4210 + }, + { + "epoch": 0.4, + "grad_norm": 0.29380901154741923, + "learning_rate": 0.00018771770981144132, + "loss": 1.1014, + "step": 4211 + }, + { + "epoch": 0.4, + "grad_norm": 0.2777299581369351, + "learning_rate": 0.00018771011271665153, + "loss": 1.0938, + "step": 4212 + }, + { + "epoch": 0.4, + "grad_norm": 0.26403372932978386, + "learning_rate": 0.00018770251342686104, + "loss": 1.1595, + "step": 4213 + }, + { + "epoch": 0.4, + "grad_norm": 0.28636002615157274, + "learning_rate": 0.00018769491194226006, + "loss": 1.0455, + "step": 4214 + }, + { + "epoch": 0.4, + "grad_norm": 0.2844974569194305, + "learning_rate": 0.00018768730826303876, + "loss": 1.0772, + "step": 4215 + }, + { + "epoch": 0.4, + "grad_norm": 0.25117753184917296, + "learning_rate": 0.0001876797023893875, + "loss": 1.1215, + "step": 4216 + }, + { + "epoch": 0.4, + "grad_norm": 0.25290828237696705, + "learning_rate": 0.00018767209432149652, + "loss": 1.0751, + "step": 4217 + }, + { + "epoch": 0.4, + "grad_norm": 0.2641125806261801, + "learning_rate": 0.0001876644840595563, + "loss": 1.1248, + "step": 4218 + }, + { + "epoch": 0.4, + "grad_norm": 0.26789458127024735, + "learning_rate": 0.00018765687160375732, + "loss": 1.058, + "step": 4219 + }, + { + "epoch": 0.4, + "grad_norm": 0.2656234103720025, + "learning_rate": 0.00018764925695428998, + "loss": 1.0255, + "step": 4220 + }, + { + "epoch": 0.4, + "grad_norm": 0.24975067381842872, + "learning_rate": 0.00018764164011134495, + "loss": 1.1212, + "step": 4221 + }, + { + "epoch": 0.4, + "grad_norm": 0.2395528550635482, + "learning_rate": 0.00018763402107511276, + "loss": 0.9989, + "step": 4222 + }, + { + "epoch": 0.4, + "grad_norm": 0.2532908262725297, + "learning_rate": 0.00018762639984578412, + "loss": 1.1901, + "step": 4223 + }, + { + "epoch": 0.4, + "grad_norm": 0.31719585665339967, + "learning_rate": 0.00018761877642354977, + "loss": 1.2269, + "step": 4224 + }, + { + "epoch": 0.4, + "grad_norm": 0.31874436060610234, + "learning_rate": 0.00018761115080860046, + "loss": 0.9891, + "step": 4225 + }, + { + "epoch": 0.4, + "grad_norm": 0.29050294955105876, + "learning_rate": 0.00018760352300112705, + "loss": 1.1358, + "step": 4226 + }, + { + "epoch": 0.4, + "grad_norm": 0.27399249257978747, + "learning_rate": 0.00018759589300132041, + "loss": 1.0868, + "step": 4227 + }, + { + "epoch": 0.4, + "grad_norm": 0.2696276778720872, + "learning_rate": 0.00018758826080937148, + "loss": 1.1893, + "step": 4228 + }, + { + "epoch": 0.4, + "grad_norm": 0.28473533529326145, + "learning_rate": 0.00018758062642547133, + "loss": 1.1131, + "step": 4229 + }, + { + "epoch": 0.4, + "grad_norm": 0.27137640986272094, + "learning_rate": 0.00018757298984981092, + "loss": 0.976, + "step": 4230 + }, + { + "epoch": 0.4, + "grad_norm": 0.3035287993677886, + "learning_rate": 0.0001875653510825814, + "loss": 1.2036, + "step": 4231 + }, + { + "epoch": 0.4, + "grad_norm": 0.2730310702863054, + "learning_rate": 0.00018755771012397393, + "loss": 1.1172, + "step": 4232 + }, + { + "epoch": 0.4, + "grad_norm": 0.2515152904223668, + "learning_rate": 0.00018755006697417976, + "loss": 1.0117, + "step": 4233 + }, + { + "epoch": 0.41, + "grad_norm": 0.2628436719293594, + "learning_rate": 0.00018754242163339014, + "loss": 1.0782, + "step": 4234 + }, + { + "epoch": 0.41, + "grad_norm": 0.26021326272006884, + "learning_rate": 0.0001875347741017964, + "loss": 1.1058, + "step": 4235 + }, + { + "epoch": 0.41, + "grad_norm": 0.2739556248198109, + "learning_rate": 0.0001875271243795899, + "loss": 0.9982, + "step": 4236 + }, + { + "epoch": 0.41, + "grad_norm": 0.30134796736592306, + "learning_rate": 0.00018751947246696212, + "loss": 1.0949, + "step": 4237 + }, + { + "epoch": 0.41, + "grad_norm": 0.271764675869869, + "learning_rate": 0.00018751181836410455, + "loss": 1.0182, + "step": 4238 + }, + { + "epoch": 0.41, + "grad_norm": 0.24250081027639273, + "learning_rate": 0.0001875041620712087, + "loss": 1.0884, + "step": 4239 + }, + { + "epoch": 0.41, + "grad_norm": 0.2869641568770455, + "learning_rate": 0.0001874965035884662, + "loss": 1.1567, + "step": 4240 + }, + { + "epoch": 0.41, + "grad_norm": 0.2969381271520231, + "learning_rate": 0.00018748884291606874, + "loss": 1.0765, + "step": 4241 + }, + { + "epoch": 0.41, + "grad_norm": 0.26178640355726157, + "learning_rate": 0.00018748118005420798, + "loss": 1.1142, + "step": 4242 + }, + { + "epoch": 0.41, + "grad_norm": 0.2800257202847653, + "learning_rate": 0.0001874735150030757, + "loss": 1.1046, + "step": 4243 + }, + { + "epoch": 0.41, + "grad_norm": 0.2944863333696688, + "learning_rate": 0.00018746584776286376, + "loss": 1.038, + "step": 4244 + }, + { + "epoch": 0.41, + "grad_norm": 0.30216624143418275, + "learning_rate": 0.00018745817833376398, + "loss": 1.2037, + "step": 4245 + }, + { + "epoch": 0.41, + "grad_norm": 0.2663185997548643, + "learning_rate": 0.00018745050671596834, + "loss": 1.1279, + "step": 4246 + }, + { + "epoch": 0.41, + "grad_norm": 0.2447130737177253, + "learning_rate": 0.00018744283290966882, + "loss": 1.0917, + "step": 4247 + }, + { + "epoch": 0.41, + "grad_norm": 0.290268659831609, + "learning_rate": 0.00018743515691505743, + "loss": 1.031, + "step": 4248 + }, + { + "epoch": 0.41, + "grad_norm": 0.28648517218701464, + "learning_rate": 0.0001874274787323263, + "loss": 1.0971, + "step": 4249 + }, + { + "epoch": 0.41, + "grad_norm": 0.2682660047938872, + "learning_rate": 0.00018741979836166755, + "loss": 1.1424, + "step": 4250 + }, + { + "epoch": 0.41, + "grad_norm": 0.3030211943382245, + "learning_rate": 0.00018741211580327344, + "loss": 1.254, + "step": 4251 + }, + { + "epoch": 0.41, + "grad_norm": 0.3098473572044177, + "learning_rate": 0.00018740443105733613, + "loss": 0.9803, + "step": 4252 + }, + { + "epoch": 0.41, + "grad_norm": 0.2797263417102469, + "learning_rate": 0.00018739674412404807, + "loss": 1.0723, + "step": 4253 + }, + { + "epoch": 0.41, + "grad_norm": 0.27611130556479674, + "learning_rate": 0.00018738905500360154, + "loss": 1.0926, + "step": 4254 + }, + { + "epoch": 0.41, + "grad_norm": 0.2787572653472917, + "learning_rate": 0.00018738136369618897, + "loss": 1.0504, + "step": 4255 + }, + { + "epoch": 0.41, + "grad_norm": 0.28676439879084453, + "learning_rate": 0.00018737367020200285, + "loss": 1.0677, + "step": 4256 + }, + { + "epoch": 0.41, + "grad_norm": 0.2898408979989211, + "learning_rate": 0.00018736597452123575, + "loss": 1.0629, + "step": 4257 + }, + { + "epoch": 0.41, + "grad_norm": 0.265340677178945, + "learning_rate": 0.00018735827665408022, + "loss": 1.0293, + "step": 4258 + }, + { + "epoch": 0.41, + "grad_norm": 0.2910287914202519, + "learning_rate": 0.0001873505766007289, + "loss": 0.9698, + "step": 4259 + }, + { + "epoch": 0.41, + "grad_norm": 0.2899174476618089, + "learning_rate": 0.00018734287436137452, + "loss": 1.1953, + "step": 4260 + }, + { + "epoch": 0.41, + "grad_norm": 0.2566780706363218, + "learning_rate": 0.0001873351699362098, + "loss": 1.0803, + "step": 4261 + }, + { + "epoch": 0.41, + "grad_norm": 0.25026487846526846, + "learning_rate": 0.00018732746332542758, + "loss": 1.0512, + "step": 4262 + }, + { + "epoch": 0.41, + "grad_norm": 0.2634865581228525, + "learning_rate": 0.00018731975452922073, + "loss": 1.0902, + "step": 4263 + }, + { + "epoch": 0.41, + "grad_norm": 0.2739235925675129, + "learning_rate": 0.0001873120435477821, + "loss": 1.2044, + "step": 4264 + }, + { + "epoch": 0.41, + "grad_norm": 0.3392678199356469, + "learning_rate": 0.00018730433038130473, + "loss": 1.1055, + "step": 4265 + }, + { + "epoch": 0.41, + "grad_norm": 0.2394900592357321, + "learning_rate": 0.0001872966150299816, + "loss": 1.0701, + "step": 4266 + }, + { + "epoch": 0.41, + "grad_norm": 0.2700640038637131, + "learning_rate": 0.00018728889749400584, + "loss": 1.0162, + "step": 4267 + }, + { + "epoch": 0.41, + "grad_norm": 0.2912889000502306, + "learning_rate": 0.00018728117777357055, + "loss": 1.1419, + "step": 4268 + }, + { + "epoch": 0.41, + "grad_norm": 0.2760619669010758, + "learning_rate": 0.00018727345586886892, + "loss": 1.0408, + "step": 4269 + }, + { + "epoch": 0.41, + "grad_norm": 0.2927376911193287, + "learning_rate": 0.00018726573178009422, + "loss": 1.0906, + "step": 4270 + }, + { + "epoch": 0.41, + "grad_norm": 0.25519561721394524, + "learning_rate": 0.00018725800550743976, + "loss": 1.0385, + "step": 4271 + }, + { + "epoch": 0.41, + "grad_norm": 0.2830277309204389, + "learning_rate": 0.00018725027705109886, + "loss": 1.0272, + "step": 4272 + }, + { + "epoch": 0.41, + "grad_norm": 0.2664819202376281, + "learning_rate": 0.00018724254641126493, + "loss": 1.1779, + "step": 4273 + }, + { + "epoch": 0.41, + "grad_norm": 0.2574469843738646, + "learning_rate": 0.00018723481358813145, + "loss": 1.0864, + "step": 4274 + }, + { + "epoch": 0.41, + "grad_norm": 0.2757193054824571, + "learning_rate": 0.00018722707858189193, + "loss": 1.0145, + "step": 4275 + }, + { + "epoch": 0.41, + "grad_norm": 0.29257610913040466, + "learning_rate": 0.00018721934139273998, + "loss": 0.9679, + "step": 4276 + }, + { + "epoch": 0.41, + "grad_norm": 0.289091197361481, + "learning_rate": 0.00018721160202086914, + "loss": 1.092, + "step": 4277 + }, + { + "epoch": 0.41, + "grad_norm": 0.24344559699327611, + "learning_rate": 0.0001872038604664732, + "loss": 1.1082, + "step": 4278 + }, + { + "epoch": 0.41, + "grad_norm": 0.2810770812752845, + "learning_rate": 0.0001871961167297458, + "loss": 1.0673, + "step": 4279 + }, + { + "epoch": 0.41, + "grad_norm": 0.2861371909971846, + "learning_rate": 0.0001871883708108808, + "loss": 1.0974, + "step": 4280 + }, + { + "epoch": 0.41, + "grad_norm": 0.31326751338971903, + "learning_rate": 0.00018718062271007204, + "loss": 1.1654, + "step": 4281 + }, + { + "epoch": 0.41, + "grad_norm": 0.2695673863745955, + "learning_rate": 0.00018717287242751341, + "loss": 1.045, + "step": 4282 + }, + { + "epoch": 0.41, + "grad_norm": 0.3078415478161515, + "learning_rate": 0.00018716511996339885, + "loss": 1.0805, + "step": 4283 + }, + { + "epoch": 0.41, + "grad_norm": 0.26900716775364764, + "learning_rate": 0.00018715736531792237, + "loss": 1.1007, + "step": 4284 + }, + { + "epoch": 0.41, + "grad_norm": 0.28023295676688076, + "learning_rate": 0.00018714960849127804, + "loss": 1.0815, + "step": 4285 + }, + { + "epoch": 0.41, + "grad_norm": 0.2898989628594484, + "learning_rate": 0.00018714184948366, + "loss": 1.0418, + "step": 4286 + }, + { + "epoch": 0.41, + "grad_norm": 0.26061571588826044, + "learning_rate": 0.00018713408829526242, + "loss": 1.1429, + "step": 4287 + }, + { + "epoch": 0.41, + "grad_norm": 0.2745641023863704, + "learning_rate": 0.0001871263249262795, + "loss": 1.1773, + "step": 4288 + }, + { + "epoch": 0.41, + "grad_norm": 0.26470950342417765, + "learning_rate": 0.00018711855937690556, + "loss": 1.1311, + "step": 4289 + }, + { + "epoch": 0.41, + "grad_norm": 0.29468498976148555, + "learning_rate": 0.00018711079164733491, + "loss": 1.0414, + "step": 4290 + }, + { + "epoch": 0.41, + "grad_norm": 0.26226927485345436, + "learning_rate": 0.00018710302173776194, + "loss": 1.0876, + "step": 4291 + }, + { + "epoch": 0.41, + "grad_norm": 0.24393119618376538, + "learning_rate": 0.00018709524964838115, + "loss": 1.0936, + "step": 4292 + }, + { + "epoch": 0.41, + "grad_norm": 0.2871335693752671, + "learning_rate": 0.00018708747537938696, + "loss": 1.0862, + "step": 4293 + }, + { + "epoch": 0.41, + "grad_norm": 0.25454898572621976, + "learning_rate": 0.00018707969893097399, + "loss": 0.9553, + "step": 4294 + }, + { + "epoch": 0.41, + "grad_norm": 0.2612510649191512, + "learning_rate": 0.0001870719203033368, + "loss": 1.015, + "step": 4295 + }, + { + "epoch": 0.41, + "grad_norm": 0.2735824234461878, + "learning_rate": 0.0001870641394966701, + "loss": 1.1366, + "step": 4296 + }, + { + "epoch": 0.41, + "grad_norm": 0.2561249258130775, + "learning_rate": 0.00018705635651116857, + "loss": 0.9771, + "step": 4297 + }, + { + "epoch": 0.41, + "grad_norm": 0.2724097621208985, + "learning_rate": 0.00018704857134702705, + "loss": 1.0373, + "step": 4298 + }, + { + "epoch": 0.41, + "grad_norm": 0.2624090546858682, + "learning_rate": 0.00018704078400444028, + "loss": 1.0986, + "step": 4299 + }, + { + "epoch": 0.41, + "grad_norm": 0.2933893818570987, + "learning_rate": 0.0001870329944836032, + "loss": 1.0659, + "step": 4300 + }, + { + "epoch": 0.41, + "grad_norm": 0.25403733960977537, + "learning_rate": 0.00018702520278471074, + "loss": 1.1726, + "step": 4301 + }, + { + "epoch": 0.41, + "grad_norm": 0.27542609220308545, + "learning_rate": 0.00018701740890795788, + "loss": 1.1237, + "step": 4302 + }, + { + "epoch": 0.41, + "grad_norm": 0.2536691041486995, + "learning_rate": 0.0001870096128535397, + "loss": 0.9981, + "step": 4303 + }, + { + "epoch": 0.41, + "grad_norm": 0.2732550236694586, + "learning_rate": 0.00018700181462165126, + "loss": 1.0504, + "step": 4304 + }, + { + "epoch": 0.41, + "grad_norm": 0.26940609611280475, + "learning_rate": 0.0001869940142124877, + "loss": 1.0977, + "step": 4305 + }, + { + "epoch": 0.41, + "grad_norm": 0.2912691081559738, + "learning_rate": 0.0001869862116262443, + "loss": 1.0703, + "step": 4306 + }, + { + "epoch": 0.41, + "grad_norm": 0.27335596729191, + "learning_rate": 0.00018697840686311628, + "loss": 1.1206, + "step": 4307 + }, + { + "epoch": 0.41, + "grad_norm": 0.2882836785046496, + "learning_rate": 0.00018697059992329895, + "loss": 1.0942, + "step": 4308 + }, + { + "epoch": 0.41, + "grad_norm": 0.2736335086651974, + "learning_rate": 0.0001869627908069877, + "loss": 1.09, + "step": 4309 + }, + { + "epoch": 0.41, + "grad_norm": 0.25546265700852766, + "learning_rate": 0.00018695497951437795, + "loss": 1.1653, + "step": 4310 + }, + { + "epoch": 0.41, + "grad_norm": 0.27099441480074316, + "learning_rate": 0.0001869471660456652, + "loss": 1.0797, + "step": 4311 + }, + { + "epoch": 0.41, + "grad_norm": 0.2734354890739324, + "learning_rate": 0.00018693935040104497, + "loss": 1.0753, + "step": 4312 + }, + { + "epoch": 0.41, + "grad_norm": 0.3009809400535224, + "learning_rate": 0.00018693153258071286, + "loss": 1.1689, + "step": 4313 + }, + { + "epoch": 0.41, + "grad_norm": 0.29754474015094556, + "learning_rate": 0.00018692371258486451, + "loss": 1.0379, + "step": 4314 + }, + { + "epoch": 0.41, + "grad_norm": 0.2996282403269194, + "learning_rate": 0.00018691589041369564, + "loss": 1.1081, + "step": 4315 + }, + { + "epoch": 0.41, + "grad_norm": 0.24314970149389548, + "learning_rate": 0.000186908066067402, + "loss": 1.0464, + "step": 4316 + }, + { + "epoch": 0.41, + "grad_norm": 0.24701089925472527, + "learning_rate": 0.00018690023954617932, + "loss": 1.1741, + "step": 4317 + }, + { + "epoch": 0.41, + "grad_norm": 0.27607286639486517, + "learning_rate": 0.0001868924108502236, + "loss": 1.0594, + "step": 4318 + }, + { + "epoch": 0.41, + "grad_norm": 0.3017023954760616, + "learning_rate": 0.00018688457997973065, + "loss": 1.0294, + "step": 4319 + }, + { + "epoch": 0.41, + "grad_norm": 0.24937940795309205, + "learning_rate": 0.00018687674693489647, + "loss": 1.0421, + "step": 4320 + }, + { + "epoch": 0.41, + "grad_norm": 0.23892217344372102, + "learning_rate": 0.00018686891171591712, + "loss": 1.0908, + "step": 4321 + }, + { + "epoch": 0.41, + "grad_norm": 0.28068063323508174, + "learning_rate": 0.00018686107432298868, + "loss": 1.1319, + "step": 4322 + }, + { + "epoch": 0.41, + "grad_norm": 0.26786810608469297, + "learning_rate": 0.00018685323475630723, + "loss": 1.0355, + "step": 4323 + }, + { + "epoch": 0.41, + "grad_norm": 0.2435421171517636, + "learning_rate": 0.000186845393016069, + "loss": 1.0141, + "step": 4324 + }, + { + "epoch": 0.41, + "grad_norm": 0.27884414310128647, + "learning_rate": 0.00018683754910247025, + "loss": 1.1473, + "step": 4325 + }, + { + "epoch": 0.41, + "grad_norm": 0.2890938734276742, + "learning_rate": 0.00018682970301570726, + "loss": 1.1203, + "step": 4326 + }, + { + "epoch": 0.41, + "grad_norm": 0.2769615698559789, + "learning_rate": 0.00018682185475597636, + "loss": 1.0541, + "step": 4327 + }, + { + "epoch": 0.41, + "grad_norm": 0.2785590604016616, + "learning_rate": 0.00018681400432347397, + "loss": 1.0544, + "step": 4328 + }, + { + "epoch": 0.41, + "grad_norm": 0.29418313155446657, + "learning_rate": 0.00018680615171839658, + "loss": 1.1175, + "step": 4329 + }, + { + "epoch": 0.41, + "grad_norm": 0.3025538424684769, + "learning_rate": 0.00018679829694094068, + "loss": 1.2113, + "step": 4330 + }, + { + "epoch": 0.41, + "grad_norm": 0.2910730421005133, + "learning_rate": 0.00018679043999130288, + "loss": 1.132, + "step": 4331 + }, + { + "epoch": 0.41, + "grad_norm": 0.29253393692155966, + "learning_rate": 0.00018678258086967975, + "loss": 1.095, + "step": 4332 + }, + { + "epoch": 0.41, + "grad_norm": 0.28305224842813903, + "learning_rate": 0.00018677471957626797, + "loss": 1.1026, + "step": 4333 + }, + { + "epoch": 0.41, + "grad_norm": 0.2924260897779223, + "learning_rate": 0.0001867668561112643, + "loss": 1.0386, + "step": 4334 + }, + { + "epoch": 0.41, + "grad_norm": 0.293768434817011, + "learning_rate": 0.00018675899047486557, + "loss": 1.0448, + "step": 4335 + }, + { + "epoch": 0.41, + "grad_norm": 0.2941288478482527, + "learning_rate": 0.00018675112266726854, + "loss": 1.0662, + "step": 4336 + }, + { + "epoch": 0.41, + "grad_norm": 0.2957044806998864, + "learning_rate": 0.00018674325268867016, + "loss": 0.9872, + "step": 4337 + }, + { + "epoch": 0.42, + "grad_norm": 0.3170426741111006, + "learning_rate": 0.00018673538053926735, + "loss": 1.0532, + "step": 4338 + }, + { + "epoch": 0.42, + "grad_norm": 0.3000921754928047, + "learning_rate": 0.00018672750621925714, + "loss": 1.1413, + "step": 4339 + }, + { + "epoch": 0.42, + "grad_norm": 0.2998372677155844, + "learning_rate": 0.00018671962972883658, + "loss": 1.1425, + "step": 4340 + }, + { + "epoch": 0.42, + "grad_norm": 0.2645728037896027, + "learning_rate": 0.00018671175106820277, + "loss": 1.0747, + "step": 4341 + }, + { + "epoch": 0.42, + "grad_norm": 0.28183953656999067, + "learning_rate": 0.00018670387023755295, + "loss": 0.9737, + "step": 4342 + }, + { + "epoch": 0.42, + "grad_norm": 0.256299564446901, + "learning_rate": 0.00018669598723708422, + "loss": 1.1139, + "step": 4343 + }, + { + "epoch": 0.42, + "grad_norm": 0.26935203909823185, + "learning_rate": 0.00018668810206699395, + "loss": 1.0717, + "step": 4344 + }, + { + "epoch": 0.42, + "grad_norm": 0.23352656548172643, + "learning_rate": 0.00018668021472747944, + "loss": 1.0427, + "step": 4345 + }, + { + "epoch": 0.42, + "grad_norm": 0.2778191273219274, + "learning_rate": 0.00018667232521873807, + "loss": 1.0442, + "step": 4346 + }, + { + "epoch": 0.42, + "grad_norm": 0.2708092397068009, + "learning_rate": 0.00018666443354096733, + "loss": 1.1642, + "step": 4347 + }, + { + "epoch": 0.42, + "grad_norm": 0.2628086430153175, + "learning_rate": 0.00018665653969436466, + "loss": 1.0567, + "step": 4348 + }, + { + "epoch": 0.42, + "grad_norm": 0.2689137058024899, + "learning_rate": 0.00018664864367912758, + "loss": 1.055, + "step": 4349 + }, + { + "epoch": 0.42, + "grad_norm": 0.27522982747891467, + "learning_rate": 0.00018664074549545377, + "loss": 0.9796, + "step": 4350 + }, + { + "epoch": 0.42, + "grad_norm": 0.2585842007123571, + "learning_rate": 0.00018663284514354084, + "loss": 1.0592, + "step": 4351 + }, + { + "epoch": 0.42, + "grad_norm": 0.2698306914125685, + "learning_rate": 0.0001866249426235865, + "loss": 1.084, + "step": 4352 + }, + { + "epoch": 0.42, + "grad_norm": 0.30565305866824105, + "learning_rate": 0.00018661703793578855, + "loss": 1.0316, + "step": 4353 + }, + { + "epoch": 0.42, + "grad_norm": 0.26114362713963013, + "learning_rate": 0.00018660913108034478, + "loss": 1.1677, + "step": 4354 + }, + { + "epoch": 0.42, + "grad_norm": 0.3016235809178839, + "learning_rate": 0.00018660122205745313, + "loss": 1.1933, + "step": 4355 + }, + { + "epoch": 0.42, + "grad_norm": 0.2863853851725708, + "learning_rate": 0.0001865933108673114, + "loss": 0.9809, + "step": 4356 + }, + { + "epoch": 0.42, + "grad_norm": 0.2912386308622707, + "learning_rate": 0.00018658539751011767, + "loss": 1.1289, + "step": 4357 + }, + { + "epoch": 0.42, + "grad_norm": 0.2776875205740512, + "learning_rate": 0.00018657748198606995, + "loss": 1.0565, + "step": 4358 + }, + { + "epoch": 0.42, + "grad_norm": 0.2748712764200445, + "learning_rate": 0.00018656956429536633, + "loss": 1.0782, + "step": 4359 + }, + { + "epoch": 0.42, + "grad_norm": 0.255166953661312, + "learning_rate": 0.00018656164443820494, + "loss": 1.0291, + "step": 4360 + }, + { + "epoch": 0.42, + "grad_norm": 0.2665620217061101, + "learning_rate": 0.00018655372241478403, + "loss": 1.114, + "step": 4361 + }, + { + "epoch": 0.42, + "grad_norm": 0.2544779490108305, + "learning_rate": 0.00018654579822530179, + "loss": 1.2265, + "step": 4362 + }, + { + "epoch": 0.42, + "grad_norm": 0.23854453003136591, + "learning_rate": 0.00018653787186995654, + "loss": 1.033, + "step": 4363 + }, + { + "epoch": 0.42, + "grad_norm": 0.2710547109595493, + "learning_rate": 0.00018652994334894668, + "loss": 1.0567, + "step": 4364 + }, + { + "epoch": 0.42, + "grad_norm": 0.27823107428416105, + "learning_rate": 0.00018652201266247063, + "loss": 1.0357, + "step": 4365 + }, + { + "epoch": 0.42, + "grad_norm": 0.3065920295229727, + "learning_rate": 0.0001865140798107268, + "loss": 0.9394, + "step": 4366 + }, + { + "epoch": 0.42, + "grad_norm": 0.29060026851055215, + "learning_rate": 0.00018650614479391378, + "loss": 1.0582, + "step": 4367 + }, + { + "epoch": 0.42, + "grad_norm": 0.2798363947842491, + "learning_rate": 0.00018649820761223012, + "loss": 1.1008, + "step": 4368 + }, + { + "epoch": 0.42, + "grad_norm": 0.2524281791152147, + "learning_rate": 0.00018649026826587442, + "loss": 1.0788, + "step": 4369 + }, + { + "epoch": 0.42, + "grad_norm": 0.3175783594422539, + "learning_rate": 0.00018648232675504543, + "loss": 1.0091, + "step": 4370 + }, + { + "epoch": 0.42, + "grad_norm": 0.2605770620877395, + "learning_rate": 0.00018647438307994185, + "loss": 1.0315, + "step": 4371 + }, + { + "epoch": 0.42, + "grad_norm": 0.3074184313253669, + "learning_rate": 0.0001864664372407625, + "loss": 1.11, + "step": 4372 + }, + { + "epoch": 0.42, + "grad_norm": 0.2804678028999845, + "learning_rate": 0.0001864584892377062, + "loss": 1.0521, + "step": 4373 + }, + { + "epoch": 0.42, + "grad_norm": 0.25285663187552027, + "learning_rate": 0.00018645053907097187, + "loss": 1.0407, + "step": 4374 + }, + { + "epoch": 0.42, + "grad_norm": 0.32876163956484417, + "learning_rate": 0.00018644258674075848, + "loss": 0.9872, + "step": 4375 + }, + { + "epoch": 0.42, + "grad_norm": 0.25777775379310247, + "learning_rate": 0.000186434632247265, + "loss": 1.0567, + "step": 4376 + }, + { + "epoch": 0.42, + "grad_norm": 0.29826590288671295, + "learning_rate": 0.00018642667559069055, + "loss": 1.0757, + "step": 4377 + }, + { + "epoch": 0.42, + "grad_norm": 0.27317079924980964, + "learning_rate": 0.0001864187167712342, + "loss": 1.1448, + "step": 4378 + }, + { + "epoch": 0.42, + "grad_norm": 0.3070229873978444, + "learning_rate": 0.00018641075578909518, + "loss": 1.0379, + "step": 4379 + }, + { + "epoch": 0.42, + "grad_norm": 0.2718896024874906, + "learning_rate": 0.0001864027926444727, + "loss": 1.1072, + "step": 4380 + }, + { + "epoch": 0.42, + "grad_norm": 0.2620818390598612, + "learning_rate": 0.00018639482733756601, + "loss": 1.0537, + "step": 4381 + }, + { + "epoch": 0.42, + "grad_norm": 0.2289284295029631, + "learning_rate": 0.00018638685986857448, + "loss": 1.0194, + "step": 4382 + }, + { + "epoch": 0.42, + "grad_norm": 0.29452954070001014, + "learning_rate": 0.00018637889023769748, + "loss": 1.1051, + "step": 4383 + }, + { + "epoch": 0.42, + "grad_norm": 0.24409495634240713, + "learning_rate": 0.00018637091844513445, + "loss": 1.089, + "step": 4384 + }, + { + "epoch": 0.42, + "grad_norm": 0.2444141149494506, + "learning_rate": 0.00018636294449108493, + "loss": 0.9955, + "step": 4385 + }, + { + "epoch": 0.42, + "grad_norm": 0.25886299860296796, + "learning_rate": 0.00018635496837574844, + "loss": 1.0442, + "step": 4386 + }, + { + "epoch": 0.42, + "grad_norm": 0.2788912752950406, + "learning_rate": 0.00018634699009932462, + "loss": 1.1965, + "step": 4387 + }, + { + "epoch": 0.42, + "grad_norm": 0.2671936525699648, + "learning_rate": 0.00018633900966201304, + "loss": 1.0077, + "step": 4388 + }, + { + "epoch": 0.42, + "grad_norm": 0.28071558942931124, + "learning_rate": 0.00018633102706401355, + "loss": 1.1462, + "step": 4389 + }, + { + "epoch": 0.42, + "grad_norm": 0.34829423214490346, + "learning_rate": 0.00018632304230552582, + "loss": 1.0623, + "step": 4390 + }, + { + "epoch": 0.42, + "grad_norm": 0.24191132707171756, + "learning_rate": 0.0001863150553867497, + "loss": 1.0765, + "step": 4391 + }, + { + "epoch": 0.42, + "grad_norm": 0.27536728708801206, + "learning_rate": 0.00018630706630788505, + "loss": 1.0193, + "step": 4392 + }, + { + "epoch": 0.42, + "grad_norm": 0.2697499088562797, + "learning_rate": 0.00018629907506913186, + "loss": 1.1339, + "step": 4393 + }, + { + "epoch": 0.42, + "grad_norm": 0.26546348358382443, + "learning_rate": 0.00018629108167069006, + "loss": 1.1509, + "step": 4394 + }, + { + "epoch": 0.42, + "grad_norm": 0.2627657556993223, + "learning_rate": 0.00018628308611275972, + "loss": 1.1698, + "step": 4395 + }, + { + "epoch": 0.42, + "grad_norm": 0.2681938025996225, + "learning_rate": 0.00018627508839554093, + "loss": 1.0299, + "step": 4396 + }, + { + "epoch": 0.42, + "grad_norm": 0.26016966609609554, + "learning_rate": 0.00018626708851923382, + "loss": 1.132, + "step": 4397 + }, + { + "epoch": 0.42, + "grad_norm": 0.28096390529787246, + "learning_rate": 0.0001862590864840386, + "loss": 1.1004, + "step": 4398 + }, + { + "epoch": 0.42, + "grad_norm": 0.2914134240329839, + "learning_rate": 0.00018625108229015555, + "loss": 1.0259, + "step": 4399 + }, + { + "epoch": 0.42, + "grad_norm": 0.27633727064382735, + "learning_rate": 0.00018624307593778495, + "loss": 1.0006, + "step": 4400 + }, + { + "epoch": 0.42, + "grad_norm": 0.26565720029010786, + "learning_rate": 0.00018623506742712715, + "loss": 1.0687, + "step": 4401 + }, + { + "epoch": 0.42, + "grad_norm": 0.24736946692607406, + "learning_rate": 0.00018622705675838263, + "loss": 1.1022, + "step": 4402 + }, + { + "epoch": 0.42, + "grad_norm": 0.23869319091908384, + "learning_rate": 0.0001862190439317518, + "loss": 1.1136, + "step": 4403 + }, + { + "epoch": 0.42, + "grad_norm": 0.26085938439573125, + "learning_rate": 0.0001862110289474352, + "loss": 1.0647, + "step": 4404 + }, + { + "epoch": 0.42, + "grad_norm": 0.2615746057557681, + "learning_rate": 0.00018620301180563342, + "loss": 1.026, + "step": 4405 + }, + { + "epoch": 0.42, + "grad_norm": 0.265385119223784, + "learning_rate": 0.0001861949925065471, + "loss": 1.0762, + "step": 4406 + }, + { + "epoch": 0.42, + "grad_norm": 0.2621600644598475, + "learning_rate": 0.00018618697105037693, + "loss": 1.0342, + "step": 4407 + }, + { + "epoch": 0.42, + "grad_norm": 0.28497896413460505, + "learning_rate": 0.00018617894743732361, + "loss": 1.0353, + "step": 4408 + }, + { + "epoch": 0.42, + "grad_norm": 0.2537528026008887, + "learning_rate": 0.00018617092166758802, + "loss": 0.9979, + "step": 4409 + }, + { + "epoch": 0.42, + "grad_norm": 0.273548636864498, + "learning_rate": 0.00018616289374137092, + "loss": 1.1967, + "step": 4410 + }, + { + "epoch": 0.42, + "grad_norm": 0.32894219686067755, + "learning_rate": 0.0001861548636588733, + "loss": 1.082, + "step": 4411 + }, + { + "epoch": 0.42, + "grad_norm": 0.26569438454996513, + "learning_rate": 0.00018614683142029602, + "loss": 0.8886, + "step": 4412 + }, + { + "epoch": 0.42, + "grad_norm": 0.25788494163824816, + "learning_rate": 0.00018613879702584013, + "loss": 1.0712, + "step": 4413 + }, + { + "epoch": 0.42, + "grad_norm": 0.272910401506832, + "learning_rate": 0.00018613076047570678, + "loss": 1.0169, + "step": 4414 + }, + { + "epoch": 0.42, + "grad_norm": 0.27161074974964666, + "learning_rate": 0.00018612272177009694, + "loss": 1.1233, + "step": 4415 + }, + { + "epoch": 0.42, + "grad_norm": 0.28453465725372484, + "learning_rate": 0.0001861146809092119, + "loss": 1.0644, + "step": 4416 + }, + { + "epoch": 0.42, + "grad_norm": 0.26695328078689184, + "learning_rate": 0.00018610663789325288, + "loss": 1.0859, + "step": 4417 + }, + { + "epoch": 0.42, + "grad_norm": 0.28644342117192206, + "learning_rate": 0.00018609859272242108, + "loss": 1.1014, + "step": 4418 + }, + { + "epoch": 0.42, + "grad_norm": 0.28232507692715614, + "learning_rate": 0.0001860905453969179, + "loss": 1.0183, + "step": 4419 + }, + { + "epoch": 0.42, + "grad_norm": 0.28914880496465856, + "learning_rate": 0.0001860824959169447, + "loss": 1.0779, + "step": 4420 + }, + { + "epoch": 0.42, + "grad_norm": 0.27341424599975833, + "learning_rate": 0.000186074444282703, + "loss": 0.9407, + "step": 4421 + }, + { + "epoch": 0.42, + "grad_norm": 0.31013888814670343, + "learning_rate": 0.00018606639049439415, + "loss": 1.1336, + "step": 4422 + }, + { + "epoch": 0.42, + "grad_norm": 0.2804627470142664, + "learning_rate": 0.00018605833455221984, + "loss": 1.1237, + "step": 4423 + }, + { + "epoch": 0.42, + "grad_norm": 0.27060408840367484, + "learning_rate": 0.00018605027645638163, + "loss": 1.0914, + "step": 4424 + }, + { + "epoch": 0.42, + "grad_norm": 0.2949572609779446, + "learning_rate": 0.00018604221620708113, + "loss": 1.0142, + "step": 4425 + }, + { + "epoch": 0.42, + "grad_norm": 0.2978775593613743, + "learning_rate": 0.00018603415380452013, + "loss": 1.0299, + "step": 4426 + }, + { + "epoch": 0.42, + "grad_norm": 0.27422717885209724, + "learning_rate": 0.00018602608924890034, + "loss": 0.9475, + "step": 4427 + }, + { + "epoch": 0.42, + "grad_norm": 0.24881221794442596, + "learning_rate": 0.0001860180225404236, + "loss": 1.0083, + "step": 4428 + }, + { + "epoch": 0.42, + "grad_norm": 0.28296329045097385, + "learning_rate": 0.00018600995367929182, + "loss": 1.1519, + "step": 4429 + }, + { + "epoch": 0.42, + "grad_norm": 0.3132917134168698, + "learning_rate": 0.00018600188266570687, + "loss": 1.1355, + "step": 4430 + }, + { + "epoch": 0.42, + "grad_norm": 0.2656930447471748, + "learning_rate": 0.00018599380949987072, + "loss": 1.1187, + "step": 4431 + }, + { + "epoch": 0.42, + "grad_norm": 0.29159128798152506, + "learning_rate": 0.0001859857341819855, + "loss": 1.0272, + "step": 4432 + }, + { + "epoch": 0.42, + "grad_norm": 0.26189338837080134, + "learning_rate": 0.00018597765671225322, + "loss": 1.0971, + "step": 4433 + }, + { + "epoch": 0.42, + "grad_norm": 0.28052538093117707, + "learning_rate": 0.00018596957709087603, + "loss": 1.0194, + "step": 4434 + }, + { + "epoch": 0.42, + "grad_norm": 0.2652637453080487, + "learning_rate": 0.0001859614953180562, + "loss": 1.0746, + "step": 4435 + }, + { + "epoch": 0.42, + "grad_norm": 0.2829916437751392, + "learning_rate": 0.00018595341139399584, + "loss": 1.1021, + "step": 4436 + }, + { + "epoch": 0.42, + "grad_norm": 0.25812189396799773, + "learning_rate": 0.0001859453253188974, + "loss": 1.0858, + "step": 4437 + }, + { + "epoch": 0.42, + "grad_norm": 0.26939995257598304, + "learning_rate": 0.00018593723709296316, + "loss": 1.0994, + "step": 4438 + }, + { + "epoch": 0.42, + "grad_norm": 0.2860192381686427, + "learning_rate": 0.00018592914671639553, + "loss": 1.0819, + "step": 4439 + }, + { + "epoch": 0.42, + "grad_norm": 0.2791379239990837, + "learning_rate": 0.00018592105418939705, + "loss": 1.0618, + "step": 4440 + }, + { + "epoch": 0.42, + "grad_norm": 0.2740072559598784, + "learning_rate": 0.00018591295951217015, + "loss": 1.1346, + "step": 4441 + }, + { + "epoch": 0.42, + "grad_norm": 0.2889247502983577, + "learning_rate": 0.00018590486268491748, + "loss": 1.0459, + "step": 4442 + }, + { + "epoch": 0.43, + "grad_norm": 0.2978227410800553, + "learning_rate": 0.0001858967637078416, + "loss": 1.0501, + "step": 4443 + }, + { + "epoch": 0.43, + "grad_norm": 0.3018285492782609, + "learning_rate": 0.00018588866258114524, + "loss": 1.0894, + "step": 4444 + }, + { + "epoch": 0.43, + "grad_norm": 0.28346415215862286, + "learning_rate": 0.0001858805593050311, + "loss": 0.9718, + "step": 4445 + }, + { + "epoch": 0.43, + "grad_norm": 0.26039184338131427, + "learning_rate": 0.000185872453879702, + "loss": 1.1089, + "step": 4446 + }, + { + "epoch": 0.43, + "grad_norm": 0.3206288442226837, + "learning_rate": 0.0001858643463053608, + "loss": 1.0651, + "step": 4447 + }, + { + "epoch": 0.43, + "grad_norm": 0.265449658618547, + "learning_rate": 0.00018585623658221034, + "loss": 1.0637, + "step": 4448 + }, + { + "epoch": 0.43, + "grad_norm": 0.29272693696344226, + "learning_rate": 0.0001858481247104536, + "loss": 1.0931, + "step": 4449 + }, + { + "epoch": 0.43, + "grad_norm": 0.29578928026152973, + "learning_rate": 0.0001858400106902936, + "loss": 1.1793, + "step": 4450 + }, + { + "epoch": 0.43, + "grad_norm": 0.27791665518502323, + "learning_rate": 0.00018583189452193338, + "loss": 1.0318, + "step": 4451 + }, + { + "epoch": 0.43, + "grad_norm": 0.28708172386603614, + "learning_rate": 0.00018582377620557602, + "loss": 1.0001, + "step": 4452 + }, + { + "epoch": 0.43, + "grad_norm": 0.2945398920234109, + "learning_rate": 0.0001858156557414248, + "loss": 1.0574, + "step": 4453 + }, + { + "epoch": 0.43, + "grad_norm": 0.23921340949830877, + "learning_rate": 0.0001858075331296828, + "loss": 1.0104, + "step": 4454 + }, + { + "epoch": 0.43, + "grad_norm": 0.3568337762102039, + "learning_rate": 0.00018579940837055338, + "loss": 0.9811, + "step": 4455 + }, + { + "epoch": 0.43, + "grad_norm": 0.3247650997857825, + "learning_rate": 0.00018579128146423984, + "loss": 1.1677, + "step": 4456 + }, + { + "epoch": 0.43, + "grad_norm": 0.3235988446494336, + "learning_rate": 0.00018578315241094554, + "loss": 1.1063, + "step": 4457 + }, + { + "epoch": 0.43, + "grad_norm": 0.2910859031590426, + "learning_rate": 0.00018577502121087396, + "loss": 1.1122, + "step": 4458 + }, + { + "epoch": 0.43, + "grad_norm": 0.2855175850221935, + "learning_rate": 0.00018576688786422856, + "loss": 1.0616, + "step": 4459 + }, + { + "epoch": 0.43, + "grad_norm": 0.24829230055660786, + "learning_rate": 0.0001857587523712129, + "loss": 1.0424, + "step": 4460 + }, + { + "epoch": 0.43, + "grad_norm": 0.2840641696870963, + "learning_rate": 0.00018575061473203054, + "loss": 1.2132, + "step": 4461 + }, + { + "epoch": 0.43, + "grad_norm": 0.27078441085947547, + "learning_rate": 0.0001857424749468852, + "loss": 1.0858, + "step": 4462 + }, + { + "epoch": 0.43, + "grad_norm": 0.285656535390425, + "learning_rate": 0.0001857343330159805, + "loss": 1.0003, + "step": 4463 + }, + { + "epoch": 0.43, + "grad_norm": 0.2687099265187277, + "learning_rate": 0.00018572618893952024, + "loss": 1.0664, + "step": 4464 + }, + { + "epoch": 0.43, + "grad_norm": 0.3181574270847497, + "learning_rate": 0.00018571804271770822, + "loss": 0.9845, + "step": 4465 + }, + { + "epoch": 0.43, + "grad_norm": 0.25226801914556535, + "learning_rate": 0.0001857098943507483, + "loss": 1.1452, + "step": 4466 + }, + { + "epoch": 0.43, + "grad_norm": 0.3055598706754259, + "learning_rate": 0.00018570174383884442, + "loss": 1.0501, + "step": 4467 + }, + { + "epoch": 0.43, + "grad_norm": 0.2382755864156548, + "learning_rate": 0.00018569359118220056, + "loss": 1.0189, + "step": 4468 + }, + { + "epoch": 0.43, + "grad_norm": 0.26719151556229226, + "learning_rate": 0.00018568543638102072, + "loss": 1.1856, + "step": 4469 + }, + { + "epoch": 0.43, + "grad_norm": 0.2671695573995491, + "learning_rate": 0.00018567727943550897, + "loss": 1.0382, + "step": 4470 + }, + { + "epoch": 0.43, + "grad_norm": 0.27765193324490084, + "learning_rate": 0.00018566912034586946, + "loss": 1.2756, + "step": 4471 + }, + { + "epoch": 0.43, + "grad_norm": 0.27186513834244824, + "learning_rate": 0.00018566095911230638, + "loss": 1.0309, + "step": 4472 + }, + { + "epoch": 0.43, + "grad_norm": 0.29391579590132144, + "learning_rate": 0.00018565279573502392, + "loss": 1.1029, + "step": 4473 + }, + { + "epoch": 0.43, + "grad_norm": 0.30436200911544314, + "learning_rate": 0.00018564463021422645, + "loss": 1.0607, + "step": 4474 + }, + { + "epoch": 0.43, + "grad_norm": 0.29885017590562324, + "learning_rate": 0.00018563646255011828, + "loss": 1.1022, + "step": 4475 + }, + { + "epoch": 0.43, + "grad_norm": 0.30524559756359343, + "learning_rate": 0.0001856282927429038, + "loss": 1.1297, + "step": 4476 + }, + { + "epoch": 0.43, + "grad_norm": 0.2862736844725348, + "learning_rate": 0.0001856201207927875, + "loss": 1.0444, + "step": 4477 + }, + { + "epoch": 0.43, + "grad_norm": 0.27357248947278107, + "learning_rate": 0.00018561194669997386, + "loss": 1.1338, + "step": 4478 + }, + { + "epoch": 0.43, + "grad_norm": 0.2642596700408733, + "learning_rate": 0.00018560377046466747, + "loss": 1.0775, + "step": 4479 + }, + { + "epoch": 0.43, + "grad_norm": 0.28175880695705074, + "learning_rate": 0.00018559559208707288, + "loss": 1.1622, + "step": 4480 + }, + { + "epoch": 0.43, + "grad_norm": 0.29569453857541267, + "learning_rate": 0.00018558741156739483, + "loss": 1.0761, + "step": 4481 + }, + { + "epoch": 0.43, + "grad_norm": 0.2751922546444621, + "learning_rate": 0.000185579228905838, + "loss": 1.0507, + "step": 4482 + }, + { + "epoch": 0.43, + "grad_norm": 0.27122561641213205, + "learning_rate": 0.00018557104410260722, + "loss": 1.174, + "step": 4483 + }, + { + "epoch": 0.43, + "grad_norm": 0.2912196997412804, + "learning_rate": 0.00018556285715790724, + "loss": 1.0137, + "step": 4484 + }, + { + "epoch": 0.43, + "grad_norm": 0.2696109196990019, + "learning_rate": 0.00018555466807194303, + "loss": 0.8727, + "step": 4485 + }, + { + "epoch": 0.43, + "grad_norm": 0.27947099836689787, + "learning_rate": 0.00018554647684491943, + "loss": 1.1609, + "step": 4486 + }, + { + "epoch": 0.43, + "grad_norm": 0.26675781929194203, + "learning_rate": 0.00018553828347704152, + "loss": 1.0248, + "step": 4487 + }, + { + "epoch": 0.43, + "grad_norm": 0.2588683196044053, + "learning_rate": 0.00018553008796851428, + "loss": 1.0563, + "step": 4488 + }, + { + "epoch": 0.43, + "grad_norm": 0.27999097742810836, + "learning_rate": 0.00018552189031954285, + "loss": 1.0313, + "step": 4489 + }, + { + "epoch": 0.43, + "grad_norm": 0.2895515425733008, + "learning_rate": 0.00018551369053033237, + "loss": 1.0168, + "step": 4490 + }, + { + "epoch": 0.43, + "grad_norm": 0.2545597330861161, + "learning_rate": 0.00018550548860108804, + "loss": 1.0956, + "step": 4491 + }, + { + "epoch": 0.43, + "grad_norm": 0.26512014338659784, + "learning_rate": 0.00018549728453201513, + "loss": 1.1138, + "step": 4492 + }, + { + "epoch": 0.43, + "grad_norm": 0.27434370289340704, + "learning_rate": 0.0001854890783233189, + "loss": 1.0605, + "step": 4493 + }, + { + "epoch": 0.43, + "grad_norm": 0.2936485774839461, + "learning_rate": 0.0001854808699752048, + "loss": 1.0747, + "step": 4494 + }, + { + "epoch": 0.43, + "grad_norm": 0.27403335709343807, + "learning_rate": 0.00018547265948787818, + "loss": 1.0193, + "step": 4495 + }, + { + "epoch": 0.43, + "grad_norm": 0.28508114808522955, + "learning_rate": 0.00018546444686154455, + "loss": 1.0224, + "step": 4496 + }, + { + "epoch": 0.43, + "grad_norm": 0.3342643589174626, + "learning_rate": 0.00018545623209640941, + "loss": 1.0092, + "step": 4497 + }, + { + "epoch": 0.43, + "grad_norm": 0.26332940264787036, + "learning_rate": 0.0001854480151926784, + "loss": 1.1283, + "step": 4498 + }, + { + "epoch": 0.43, + "grad_norm": 0.30979393657373167, + "learning_rate": 0.00018543979615055705, + "loss": 1.1229, + "step": 4499 + }, + { + "epoch": 0.43, + "grad_norm": 0.29782251228163314, + "learning_rate": 0.00018543157497025113, + "loss": 1.0053, + "step": 4500 + }, + { + "epoch": 0.43, + "grad_norm": 0.26274351011419134, + "learning_rate": 0.00018542335165196635, + "loss": 1.1258, + "step": 4501 + }, + { + "epoch": 0.43, + "grad_norm": 0.3085288258220842, + "learning_rate": 0.00018541512619590854, + "loss": 1.0965, + "step": 4502 + }, + { + "epoch": 0.43, + "grad_norm": 0.30190633074116374, + "learning_rate": 0.00018540689860228348, + "loss": 1.2104, + "step": 4503 + }, + { + "epoch": 0.43, + "grad_norm": 0.2754205944826528, + "learning_rate": 0.0001853986688712971, + "loss": 1.1199, + "step": 4504 + }, + { + "epoch": 0.43, + "grad_norm": 0.2590596310721408, + "learning_rate": 0.00018539043700315538, + "loss": 1.1025, + "step": 4505 + }, + { + "epoch": 0.43, + "grad_norm": 0.24485764570082005, + "learning_rate": 0.0001853822029980643, + "loss": 0.9729, + "step": 4506 + }, + { + "epoch": 0.43, + "grad_norm": 0.27774076617481674, + "learning_rate": 0.00018537396685622994, + "loss": 1.103, + "step": 4507 + }, + { + "epoch": 0.43, + "grad_norm": 0.2682689587887445, + "learning_rate": 0.00018536572857785842, + "loss": 1.105, + "step": 4508 + }, + { + "epoch": 0.43, + "grad_norm": 0.29122611914479773, + "learning_rate": 0.00018535748816315585, + "loss": 1.1096, + "step": 4509 + }, + { + "epoch": 0.43, + "grad_norm": 0.23291917772580553, + "learning_rate": 0.0001853492456123285, + "loss": 1.1044, + "step": 4510 + }, + { + "epoch": 0.43, + "grad_norm": 0.33929513760098673, + "learning_rate": 0.00018534100092558266, + "loss": 1.1069, + "step": 4511 + }, + { + "epoch": 0.43, + "grad_norm": 0.2541464302266371, + "learning_rate": 0.00018533275410312464, + "loss": 1.043, + "step": 4512 + }, + { + "epoch": 0.43, + "grad_norm": 0.2729757201162812, + "learning_rate": 0.0001853245051451608, + "loss": 1.0917, + "step": 4513 + }, + { + "epoch": 0.43, + "grad_norm": 0.2881511710111317, + "learning_rate": 0.00018531625405189761, + "loss": 1.0334, + "step": 4514 + }, + { + "epoch": 0.43, + "grad_norm": 0.3004052066232476, + "learning_rate": 0.00018530800082354153, + "loss": 1.2131, + "step": 4515 + }, + { + "epoch": 0.43, + "grad_norm": 0.296301949311051, + "learning_rate": 0.0001852997454602991, + "loss": 1.153, + "step": 4516 + }, + { + "epoch": 0.43, + "grad_norm": 0.26666605205862975, + "learning_rate": 0.00018529148796237696, + "loss": 1.0267, + "step": 4517 + }, + { + "epoch": 0.43, + "grad_norm": 0.27466066458804184, + "learning_rate": 0.00018528322832998172, + "loss": 1.0631, + "step": 4518 + }, + { + "epoch": 0.43, + "grad_norm": 0.2913602893847047, + "learning_rate": 0.0001852749665633201, + "loss": 1.1576, + "step": 4519 + }, + { + "epoch": 0.43, + "grad_norm": 0.2631891203792342, + "learning_rate": 0.00018526670266259885, + "loss": 1.0626, + "step": 4520 + }, + { + "epoch": 0.43, + "grad_norm": 0.33347156766562963, + "learning_rate": 0.00018525843662802477, + "loss": 1.0737, + "step": 4521 + }, + { + "epoch": 0.43, + "grad_norm": 0.2790606063839219, + "learning_rate": 0.00018525016845980473, + "loss": 1.0466, + "step": 4522 + }, + { + "epoch": 0.43, + "grad_norm": 0.2836733588666177, + "learning_rate": 0.00018524189815814565, + "loss": 1.0932, + "step": 4523 + }, + { + "epoch": 0.43, + "grad_norm": 0.3028905196190124, + "learning_rate": 0.0001852336257232545, + "loss": 1.0616, + "step": 4524 + }, + { + "epoch": 0.43, + "grad_norm": 0.2845873599717972, + "learning_rate": 0.00018522535115533828, + "loss": 1.0551, + "step": 4525 + }, + { + "epoch": 0.43, + "grad_norm": 0.24471061186570162, + "learning_rate": 0.0001852170744546041, + "loss": 1.0166, + "step": 4526 + }, + { + "epoch": 0.43, + "grad_norm": 0.2877270171258153, + "learning_rate": 0.00018520879562125905, + "loss": 1.0145, + "step": 4527 + }, + { + "epoch": 0.43, + "grad_norm": 0.2770560293900152, + "learning_rate": 0.00018520051465551038, + "loss": 1.1089, + "step": 4528 + }, + { + "epoch": 0.43, + "grad_norm": 0.27958349827422824, + "learning_rate": 0.00018519223155756526, + "loss": 1.1539, + "step": 4529 + }, + { + "epoch": 0.43, + "grad_norm": 0.24834333728465557, + "learning_rate": 0.000185183946327631, + "loss": 1.1034, + "step": 4530 + }, + { + "epoch": 0.43, + "grad_norm": 0.2749131019387281, + "learning_rate": 0.00018517565896591494, + "loss": 0.9705, + "step": 4531 + }, + { + "epoch": 0.43, + "grad_norm": 0.2717292532230968, + "learning_rate": 0.00018516736947262453, + "loss": 1.0913, + "step": 4532 + }, + { + "epoch": 0.43, + "grad_norm": 0.22308007763116086, + "learning_rate": 0.00018515907784796712, + "loss": 1.0885, + "step": 4533 + }, + { + "epoch": 0.43, + "grad_norm": 0.2723580969909008, + "learning_rate": 0.00018515078409215029, + "loss": 1.0822, + "step": 4534 + }, + { + "epoch": 0.43, + "grad_norm": 0.2723449059110442, + "learning_rate": 0.00018514248820538157, + "loss": 1.1214, + "step": 4535 + }, + { + "epoch": 0.43, + "grad_norm": 0.2985439043563554, + "learning_rate": 0.0001851341901878686, + "loss": 1.0749, + "step": 4536 + }, + { + "epoch": 0.43, + "grad_norm": 0.26995390366164945, + "learning_rate": 0.000185125890039819, + "loss": 1.1233, + "step": 4537 + }, + { + "epoch": 0.43, + "grad_norm": 0.2852650589577388, + "learning_rate": 0.00018511758776144048, + "loss": 1.135, + "step": 4538 + }, + { + "epoch": 0.43, + "grad_norm": 0.25495506751354075, + "learning_rate": 0.0001851092833529408, + "loss": 1.0329, + "step": 4539 + }, + { + "epoch": 0.43, + "grad_norm": 0.2788212963607499, + "learning_rate": 0.0001851009768145279, + "loss": 1.0452, + "step": 4540 + }, + { + "epoch": 0.43, + "grad_norm": 0.29286510945056726, + "learning_rate": 0.00018509266814640952, + "loss": 1.0701, + "step": 4541 + }, + { + "epoch": 0.43, + "grad_norm": 0.298753776585502, + "learning_rate": 0.00018508435734879367, + "loss": 1.0797, + "step": 4542 + }, + { + "epoch": 0.43, + "grad_norm": 0.2893400312544143, + "learning_rate": 0.00018507604442188826, + "loss": 1.1236, + "step": 4543 + }, + { + "epoch": 0.43, + "grad_norm": 0.297108222760269, + "learning_rate": 0.0001850677293659014, + "loss": 1.1958, + "step": 4544 + }, + { + "epoch": 0.43, + "grad_norm": 0.3026573746982335, + "learning_rate": 0.00018505941218104112, + "loss": 1.0817, + "step": 4545 + }, + { + "epoch": 0.43, + "grad_norm": 0.27275443083535883, + "learning_rate": 0.00018505109286751564, + "loss": 1.0752, + "step": 4546 + }, + { + "epoch": 0.44, + "grad_norm": 0.27885346389765003, + "learning_rate": 0.00018504277142553308, + "loss": 1.0219, + "step": 4547 + }, + { + "epoch": 0.44, + "grad_norm": 0.3180339146414116, + "learning_rate": 0.00018503444785530172, + "loss": 1.0987, + "step": 4548 + }, + { + "epoch": 0.44, + "grad_norm": 0.2808155483149112, + "learning_rate": 0.00018502612215702988, + "loss": 1.073, + "step": 4549 + }, + { + "epoch": 0.44, + "grad_norm": 0.25391611727242125, + "learning_rate": 0.00018501779433092587, + "loss": 1.0876, + "step": 4550 + }, + { + "epoch": 0.44, + "grad_norm": 0.2261609383943027, + "learning_rate": 0.00018500946437719813, + "loss": 1.0297, + "step": 4551 + }, + { + "epoch": 0.44, + "grad_norm": 0.2754443425644491, + "learning_rate": 0.00018500113229605512, + "loss": 1.082, + "step": 4552 + }, + { + "epoch": 0.44, + "grad_norm": 0.27586769729030647, + "learning_rate": 0.00018499279808770536, + "loss": 1.137, + "step": 4553 + }, + { + "epoch": 0.44, + "grad_norm": 0.31905598519632444, + "learning_rate": 0.0001849844617523574, + "loss": 1.1845, + "step": 4554 + }, + { + "epoch": 0.44, + "grad_norm": 0.23729193980262855, + "learning_rate": 0.00018497612329021988, + "loss": 0.9061, + "step": 4555 + }, + { + "epoch": 0.44, + "grad_norm": 0.2782155585428631, + "learning_rate": 0.00018496778270150145, + "loss": 1.1527, + "step": 4556 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710366956832406, + "learning_rate": 0.0001849594399864109, + "loss": 1.0705, + "step": 4557 + }, + { + "epoch": 0.44, + "grad_norm": 0.22796189873292866, + "learning_rate": 0.00018495109514515693, + "loss": 1.0695, + "step": 4558 + }, + { + "epoch": 0.44, + "grad_norm": 0.2831241401573468, + "learning_rate": 0.00018494274817794842, + "loss": 0.9582, + "step": 4559 + }, + { + "epoch": 0.44, + "grad_norm": 0.2824780217557077, + "learning_rate": 0.0001849343990849943, + "loss": 1.0315, + "step": 4560 + }, + { + "epoch": 0.44, + "grad_norm": 0.270556872472652, + "learning_rate": 0.0001849260478665034, + "loss": 1.0019, + "step": 4561 + }, + { + "epoch": 0.44, + "grad_norm": 0.25499089437909833, + "learning_rate": 0.00018491769452268482, + "loss": 1.0728, + "step": 4562 + }, + { + "epoch": 0.44, + "grad_norm": 0.2908571742460526, + "learning_rate": 0.00018490933905374754, + "loss": 1.0077, + "step": 4563 + }, + { + "epoch": 0.44, + "grad_norm": 0.28487382701620256, + "learning_rate": 0.0001849009814599007, + "loss": 1.0629, + "step": 4564 + }, + { + "epoch": 0.44, + "grad_norm": 0.26175206299618, + "learning_rate": 0.00018489262174135345, + "loss": 1.1802, + "step": 4565 + }, + { + "epoch": 0.44, + "grad_norm": 0.2881582041583295, + "learning_rate": 0.00018488425989831496, + "loss": 0.9577, + "step": 4566 + }, + { + "epoch": 0.44, + "grad_norm": 0.2833504131778664, + "learning_rate": 0.00018487589593099455, + "loss": 1.2034, + "step": 4567 + }, + { + "epoch": 0.44, + "grad_norm": 0.2623734320038995, + "learning_rate": 0.00018486752983960146, + "loss": 1.1153, + "step": 4568 + }, + { + "epoch": 0.44, + "grad_norm": 0.2522517398867482, + "learning_rate": 0.00018485916162434515, + "loss": 1.0618, + "step": 4569 + }, + { + "epoch": 0.44, + "grad_norm": 0.24153711019667667, + "learning_rate": 0.00018485079128543496, + "loss": 1.0822, + "step": 4570 + }, + { + "epoch": 0.44, + "grad_norm": 0.2882476151134348, + "learning_rate": 0.0001848424188230804, + "loss": 1.1657, + "step": 4571 + }, + { + "epoch": 0.44, + "grad_norm": 0.2964951820277572, + "learning_rate": 0.00018483404423749096, + "loss": 1.1222, + "step": 4572 + }, + { + "epoch": 0.44, + "grad_norm": 0.27784406217795626, + "learning_rate": 0.00018482566752887628, + "loss": 1.0545, + "step": 4573 + }, + { + "epoch": 0.44, + "grad_norm": 0.26707431862694997, + "learning_rate": 0.00018481728869744596, + "loss": 1.0567, + "step": 4574 + }, + { + "epoch": 0.44, + "grad_norm": 0.3017539840667631, + "learning_rate": 0.00018480890774340964, + "loss": 1.1777, + "step": 4575 + }, + { + "epoch": 0.44, + "grad_norm": 0.3113524234554564, + "learning_rate": 0.00018480052466697715, + "loss": 1.1013, + "step": 4576 + }, + { + "epoch": 0.44, + "grad_norm": 0.308306701323019, + "learning_rate": 0.00018479213946835822, + "loss": 1.1211, + "step": 4577 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710413287763368, + "learning_rate": 0.00018478375214776272, + "loss": 1.1882, + "step": 4578 + }, + { + "epoch": 0.44, + "grad_norm": 0.31364976820788937, + "learning_rate": 0.00018477536270540052, + "loss": 1.0807, + "step": 4579 + }, + { + "epoch": 0.44, + "grad_norm": 0.25048911528630063, + "learning_rate": 0.00018476697114148158, + "loss": 1.0959, + "step": 4580 + }, + { + "epoch": 0.44, + "grad_norm": 0.2663984503966, + "learning_rate": 0.00018475857745621594, + "loss": 1.0236, + "step": 4581 + }, + { + "epoch": 0.44, + "grad_norm": 0.26614213923311075, + "learning_rate": 0.00018475018164981362, + "loss": 1.0751, + "step": 4582 + }, + { + "epoch": 0.44, + "grad_norm": 0.2814084782252928, + "learning_rate": 0.00018474178372248474, + "loss": 0.9935, + "step": 4583 + }, + { + "epoch": 0.44, + "grad_norm": 0.25798031430362306, + "learning_rate": 0.00018473338367443946, + "loss": 0.9938, + "step": 4584 + }, + { + "epoch": 0.44, + "grad_norm": 0.2865428384395744, + "learning_rate": 0.00018472498150588803, + "loss": 1.0878, + "step": 4585 + }, + { + "epoch": 0.44, + "grad_norm": 0.3077652321307214, + "learning_rate": 0.00018471657721704066, + "loss": 1.1567, + "step": 4586 + }, + { + "epoch": 0.44, + "grad_norm": 0.31452880319912974, + "learning_rate": 0.0001847081708081077, + "loss": 1.1304, + "step": 4587 + }, + { + "epoch": 0.44, + "grad_norm": 0.2763324776356194, + "learning_rate": 0.00018469976227929955, + "loss": 1.0634, + "step": 4588 + }, + { + "epoch": 0.44, + "grad_norm": 0.29691042342244073, + "learning_rate": 0.0001846913516308266, + "loss": 1.1236, + "step": 4589 + }, + { + "epoch": 0.44, + "grad_norm": 0.2846334629692399, + "learning_rate": 0.00018468293886289935, + "loss": 0.9717, + "step": 4590 + }, + { + "epoch": 0.44, + "grad_norm": 0.2668935421303119, + "learning_rate": 0.00018467452397572833, + "loss": 1.082, + "step": 4591 + }, + { + "epoch": 0.44, + "grad_norm": 0.28376985073992644, + "learning_rate": 0.00018466610696952416, + "loss": 1.088, + "step": 4592 + }, + { + "epoch": 0.44, + "grad_norm": 0.3014664421717131, + "learning_rate": 0.00018465768784449742, + "loss": 0.9671, + "step": 4593 + }, + { + "epoch": 0.44, + "grad_norm": 0.27756706076476795, + "learning_rate": 0.00018464926660085885, + "loss": 1.0486, + "step": 4594 + }, + { + "epoch": 0.44, + "grad_norm": 0.31276907084202593, + "learning_rate": 0.00018464084323881918, + "loss": 1.1944, + "step": 4595 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710547392306981, + "learning_rate": 0.00018463241775858923, + "loss": 1.0873, + "step": 4596 + }, + { + "epoch": 0.44, + "grad_norm": 0.2726412664274147, + "learning_rate": 0.00018462399016037982, + "loss": 0.975, + "step": 4597 + }, + { + "epoch": 0.44, + "grad_norm": 0.28160638881053474, + "learning_rate": 0.00018461556044440186, + "loss": 1.092, + "step": 4598 + }, + { + "epoch": 0.44, + "grad_norm": 0.2725964852327896, + "learning_rate": 0.00018460712861086633, + "loss": 1.1128, + "step": 4599 + }, + { + "epoch": 0.44, + "grad_norm": 0.27984134666173177, + "learning_rate": 0.00018459869465998425, + "loss": 1.1321, + "step": 4600 + }, + { + "epoch": 0.44, + "grad_norm": 0.2863490711350883, + "learning_rate": 0.00018459025859196663, + "loss": 1.1366, + "step": 4601 + }, + { + "epoch": 0.44, + "grad_norm": 0.2879506583130578, + "learning_rate": 0.00018458182040702466, + "loss": 1.1197, + "step": 4602 + }, + { + "epoch": 0.44, + "grad_norm": 0.26719713518673577, + "learning_rate": 0.00018457338010536946, + "loss": 0.9944, + "step": 4603 + }, + { + "epoch": 0.44, + "grad_norm": 0.3121012835925408, + "learning_rate": 0.0001845649376872123, + "loss": 1.198, + "step": 4604 + }, + { + "epoch": 0.44, + "grad_norm": 0.30448939179247264, + "learning_rate": 0.0001845564931527644, + "loss": 0.9822, + "step": 4605 + }, + { + "epoch": 0.44, + "grad_norm": 0.2735559381750156, + "learning_rate": 0.00018454804650223713, + "loss": 1.0978, + "step": 4606 + }, + { + "epoch": 0.44, + "grad_norm": 0.2555127503868371, + "learning_rate": 0.0001845395977358418, + "loss": 1.0871, + "step": 4607 + }, + { + "epoch": 0.44, + "grad_norm": 0.2784512056488346, + "learning_rate": 0.00018453114685379, + "loss": 1.1117, + "step": 4608 + }, + { + "epoch": 0.44, + "grad_norm": 0.27412619492911333, + "learning_rate": 0.0001845226938562931, + "loss": 0.9858, + "step": 4609 + }, + { + "epoch": 0.44, + "grad_norm": 0.3198996291332153, + "learning_rate": 0.00018451423874356261, + "loss": 1.0908, + "step": 4610 + }, + { + "epoch": 0.44, + "grad_norm": 0.2893727277713405, + "learning_rate": 0.00018450578151581022, + "loss": 1.1565, + "step": 4611 + }, + { + "epoch": 0.44, + "grad_norm": 0.3012087634762126, + "learning_rate": 0.00018449732217324754, + "loss": 1.1766, + "step": 4612 + }, + { + "epoch": 0.44, + "grad_norm": 0.29522825483614856, + "learning_rate": 0.00018448886071608625, + "loss": 1.2191, + "step": 4613 + }, + { + "epoch": 0.44, + "grad_norm": 0.27761023884520314, + "learning_rate": 0.00018448039714453814, + "loss": 1.2029, + "step": 4614 + }, + { + "epoch": 0.44, + "grad_norm": 0.22548736584501833, + "learning_rate": 0.000184471931458815, + "loss": 1.0772, + "step": 4615 + }, + { + "epoch": 0.44, + "grad_norm": 0.28551218282274554, + "learning_rate": 0.00018446346365912867, + "loss": 1.1001, + "step": 4616 + }, + { + "epoch": 0.44, + "grad_norm": 0.29640261684492736, + "learning_rate": 0.0001844549937456911, + "loss": 1.0567, + "step": 4617 + }, + { + "epoch": 0.44, + "grad_norm": 0.26167728449746586, + "learning_rate": 0.0001844465217187142, + "loss": 1.0648, + "step": 4618 + }, + { + "epoch": 0.44, + "grad_norm": 0.27822656705075727, + "learning_rate": 0.00018443804757841003, + "loss": 1.1127, + "step": 4619 + }, + { + "epoch": 0.44, + "grad_norm": 0.2904755993031546, + "learning_rate": 0.00018442957132499069, + "loss": 1.0909, + "step": 4620 + }, + { + "epoch": 0.44, + "grad_norm": 0.28052003011517346, + "learning_rate": 0.00018442109295866823, + "loss": 1.0709, + "step": 4621 + }, + { + "epoch": 0.44, + "grad_norm": 0.25770690828327825, + "learning_rate": 0.00018441261247965487, + "loss": 1.1385, + "step": 4622 + }, + { + "epoch": 0.44, + "grad_norm": 0.256212600739427, + "learning_rate": 0.00018440412988816283, + "loss": 1.0721, + "step": 4623 + }, + { + "epoch": 0.44, + "grad_norm": 0.2749581272974963, + "learning_rate": 0.0001843956451844044, + "loss": 1.1453, + "step": 4624 + }, + { + "epoch": 0.44, + "grad_norm": 0.3046942593317576, + "learning_rate": 0.0001843871583685919, + "loss": 1.189, + "step": 4625 + }, + { + "epoch": 0.44, + "grad_norm": 0.28752913128850266, + "learning_rate": 0.00018437866944093773, + "loss": 1.2043, + "step": 4626 + }, + { + "epoch": 0.44, + "grad_norm": 0.2900180388718663, + "learning_rate": 0.00018437017840165434, + "loss": 1.1533, + "step": 4627 + }, + { + "epoch": 0.44, + "grad_norm": 0.26073951801493656, + "learning_rate": 0.0001843616852509542, + "loss": 1.0558, + "step": 4628 + }, + { + "epoch": 0.44, + "grad_norm": 0.3077527251913004, + "learning_rate": 0.00018435318998904986, + "loss": 1.0387, + "step": 4629 + }, + { + "epoch": 0.44, + "grad_norm": 0.29483737359579054, + "learning_rate": 0.00018434469261615393, + "loss": 1.0913, + "step": 4630 + }, + { + "epoch": 0.44, + "grad_norm": 0.2933924970818525, + "learning_rate": 0.00018433619313247906, + "loss": 1.0347, + "step": 4631 + }, + { + "epoch": 0.44, + "grad_norm": 0.26212171413860036, + "learning_rate": 0.00018432769153823797, + "loss": 1.0702, + "step": 4632 + }, + { + "epoch": 0.44, + "grad_norm": 0.2546684009326573, + "learning_rate": 0.00018431918783364337, + "loss": 1.028, + "step": 4633 + }, + { + "epoch": 0.44, + "grad_norm": 0.24838393365899913, + "learning_rate": 0.00018431068201890812, + "loss": 1.0165, + "step": 4634 + }, + { + "epoch": 0.44, + "grad_norm": 0.3099036311014584, + "learning_rate": 0.00018430217409424505, + "loss": 0.9548, + "step": 4635 + }, + { + "epoch": 0.44, + "grad_norm": 0.2759430315688091, + "learning_rate": 0.00018429366405986713, + "loss": 1.149, + "step": 4636 + }, + { + "epoch": 0.44, + "grad_norm": 0.2953749611748448, + "learning_rate": 0.00018428515191598726, + "loss": 1.1464, + "step": 4637 + }, + { + "epoch": 0.44, + "grad_norm": 0.2654249301676195, + "learning_rate": 0.0001842766376628185, + "loss": 1.0327, + "step": 4638 + }, + { + "epoch": 0.44, + "grad_norm": 0.2802886640388649, + "learning_rate": 0.0001842681213005739, + "loss": 1.1143, + "step": 4639 + }, + { + "epoch": 0.44, + "grad_norm": 0.29564078608147715, + "learning_rate": 0.00018425960282946661, + "loss": 0.8881, + "step": 4640 + }, + { + "epoch": 0.44, + "grad_norm": 0.2925251530239038, + "learning_rate": 0.00018425108224970983, + "loss": 1.1731, + "step": 4641 + }, + { + "epoch": 0.44, + "grad_norm": 0.2900983438301286, + "learning_rate": 0.00018424255956151674, + "loss": 1.177, + "step": 4642 + }, + { + "epoch": 0.44, + "grad_norm": 0.31019044957642683, + "learning_rate": 0.00018423403476510065, + "loss": 1.1355, + "step": 4643 + }, + { + "epoch": 0.44, + "grad_norm": 0.3213591456521516, + "learning_rate": 0.00018422550786067492, + "loss": 1.077, + "step": 4644 + }, + { + "epoch": 0.44, + "grad_norm": 0.2677075004163226, + "learning_rate": 0.0001842169788484529, + "loss": 1.0312, + "step": 4645 + }, + { + "epoch": 0.44, + "grad_norm": 0.25726221862210036, + "learning_rate": 0.0001842084477286481, + "loss": 0.9965, + "step": 4646 + }, + { + "epoch": 0.44, + "grad_norm": 0.29706048227507303, + "learning_rate": 0.00018419991450147394, + "loss": 1.0823, + "step": 4647 + }, + { + "epoch": 0.44, + "grad_norm": 0.2567670140513427, + "learning_rate": 0.000184191379167144, + "loss": 1.0802, + "step": 4648 + }, + { + "epoch": 0.44, + "grad_norm": 0.2730948766094166, + "learning_rate": 0.00018418284172587188, + "loss": 1.0743, + "step": 4649 + }, + { + "epoch": 0.44, + "grad_norm": 0.2576764085593571, + "learning_rate": 0.00018417430217787124, + "loss": 1.0818, + "step": 4650 + }, + { + "epoch": 0.44, + "grad_norm": 0.28479973480576726, + "learning_rate": 0.00018416576052335582, + "loss": 1.0513, + "step": 4651 + }, + { + "epoch": 0.45, + "grad_norm": 0.28752546745083385, + "learning_rate": 0.0001841572167625393, + "loss": 1.1685, + "step": 4652 + }, + { + "epoch": 0.45, + "grad_norm": 0.2846736795661952, + "learning_rate": 0.00018414867089563557, + "loss": 1.2087, + "step": 4653 + }, + { + "epoch": 0.45, + "grad_norm": 0.27455162051832954, + "learning_rate": 0.00018414012292285845, + "loss": 1.0672, + "step": 4654 + }, + { + "epoch": 0.45, + "grad_norm": 0.26145375283055017, + "learning_rate": 0.00018413157284442186, + "loss": 1.0861, + "step": 4655 + }, + { + "epoch": 0.45, + "grad_norm": 0.2871748977574908, + "learning_rate": 0.0001841230206605398, + "loss": 1.168, + "step": 4656 + }, + { + "epoch": 0.45, + "grad_norm": 0.26086270240508314, + "learning_rate": 0.00018411446637142632, + "loss": 1.1131, + "step": 4657 + }, + { + "epoch": 0.45, + "grad_norm": 0.26968127971681205, + "learning_rate": 0.0001841059099772954, + "loss": 1.0536, + "step": 4658 + }, + { + "epoch": 0.45, + "grad_norm": 0.2665866276637478, + "learning_rate": 0.00018409735147836124, + "loss": 1.2219, + "step": 4659 + }, + { + "epoch": 0.45, + "grad_norm": 0.2788104688731091, + "learning_rate": 0.000184088790874838, + "loss": 1.0498, + "step": 4660 + }, + { + "epoch": 0.45, + "grad_norm": 0.3042257990864148, + "learning_rate": 0.00018408022816693994, + "loss": 1.1032, + "step": 4661 + }, + { + "epoch": 0.45, + "grad_norm": 0.23891458103453578, + "learning_rate": 0.0001840716633548813, + "loss": 1.1667, + "step": 4662 + }, + { + "epoch": 0.45, + "grad_norm": 0.2668685305184029, + "learning_rate": 0.00018406309643887649, + "loss": 1.0313, + "step": 4663 + }, + { + "epoch": 0.45, + "grad_norm": 0.26923096074860575, + "learning_rate": 0.0001840545274191398, + "loss": 1.096, + "step": 4664 + }, + { + "epoch": 0.45, + "grad_norm": 0.26120691585456396, + "learning_rate": 0.0001840459562958858, + "loss": 1.0147, + "step": 4665 + }, + { + "epoch": 0.45, + "grad_norm": 0.279287076136616, + "learning_rate": 0.0001840373830693289, + "loss": 1.12, + "step": 4666 + }, + { + "epoch": 0.45, + "grad_norm": 0.2517878873784173, + "learning_rate": 0.00018402880773968363, + "loss": 1.0207, + "step": 4667 + }, + { + "epoch": 0.45, + "grad_norm": 0.2554752277219163, + "learning_rate": 0.00018402023030716469, + "loss": 1.1272, + "step": 4668 + }, + { + "epoch": 0.45, + "grad_norm": 0.24985971217131164, + "learning_rate": 0.00018401165077198666, + "loss": 1.0905, + "step": 4669 + }, + { + "epoch": 0.45, + "grad_norm": 0.2515290610443281, + "learning_rate": 0.0001840030691343643, + "loss": 1.1553, + "step": 4670 + }, + { + "epoch": 0.45, + "grad_norm": 0.2632716739172169, + "learning_rate": 0.00018399448539451228, + "loss": 1.0571, + "step": 4671 + }, + { + "epoch": 0.45, + "grad_norm": 0.29388108923128464, + "learning_rate": 0.00018398589955264552, + "loss": 1.0905, + "step": 4672 + }, + { + "epoch": 0.45, + "grad_norm": 0.34065302184228613, + "learning_rate": 0.00018397731160897882, + "loss": 1.118, + "step": 4673 + }, + { + "epoch": 0.45, + "grad_norm": 0.26474232885623084, + "learning_rate": 0.00018396872156372713, + "loss": 1.1804, + "step": 4674 + }, + { + "epoch": 0.45, + "grad_norm": 0.23096264177639642, + "learning_rate": 0.00018396012941710542, + "loss": 1.0124, + "step": 4675 + }, + { + "epoch": 0.45, + "grad_norm": 0.26143495936449157, + "learning_rate": 0.00018395153516932868, + "loss": 1.0354, + "step": 4676 + }, + { + "epoch": 0.45, + "grad_norm": 0.2909688243868478, + "learning_rate": 0.00018394293882061203, + "loss": 1.0682, + "step": 4677 + }, + { + "epoch": 0.45, + "grad_norm": 0.2786275126719699, + "learning_rate": 0.00018393434037117056, + "loss": 1.1167, + "step": 4678 + }, + { + "epoch": 0.45, + "grad_norm": 0.31697489334219714, + "learning_rate": 0.0001839257398212195, + "loss": 1.0991, + "step": 4679 + }, + { + "epoch": 0.45, + "grad_norm": 0.27077228765384886, + "learning_rate": 0.00018391713717097404, + "loss": 0.9453, + "step": 4680 + }, + { + "epoch": 0.45, + "grad_norm": 0.2762723786687761, + "learning_rate": 0.0001839085324206495, + "loss": 1.1394, + "step": 4681 + }, + { + "epoch": 0.45, + "grad_norm": 0.2904037438324207, + "learning_rate": 0.00018389992557046116, + "loss": 0.9806, + "step": 4682 + }, + { + "epoch": 0.45, + "grad_norm": 0.33416394497118107, + "learning_rate": 0.00018389131662062449, + "loss": 1.0519, + "step": 4683 + }, + { + "epoch": 0.45, + "grad_norm": 0.280572824730302, + "learning_rate": 0.00018388270557135488, + "loss": 1.0679, + "step": 4684 + }, + { + "epoch": 0.45, + "grad_norm": 0.318148944370186, + "learning_rate": 0.00018387409242286786, + "loss": 1.0326, + "step": 4685 + }, + { + "epoch": 0.45, + "grad_norm": 0.2592469384169005, + "learning_rate": 0.00018386547717537895, + "loss": 1.154, + "step": 4686 + }, + { + "epoch": 0.45, + "grad_norm": 0.24639073939502068, + "learning_rate": 0.00018385685982910376, + "loss": 1.0404, + "step": 4687 + }, + { + "epoch": 0.45, + "grad_norm": 0.305988694893456, + "learning_rate": 0.00018384824038425796, + "loss": 1.0658, + "step": 4688 + }, + { + "epoch": 0.45, + "grad_norm": 0.29161415294102544, + "learning_rate": 0.00018383961884105724, + "loss": 1.0198, + "step": 4689 + }, + { + "epoch": 0.45, + "grad_norm": 0.2679552986606862, + "learning_rate": 0.00018383099519971737, + "loss": 1.081, + "step": 4690 + }, + { + "epoch": 0.45, + "grad_norm": 0.27720828972286177, + "learning_rate": 0.00018382236946045416, + "loss": 1.1043, + "step": 4691 + }, + { + "epoch": 0.45, + "grad_norm": 0.26516410427337717, + "learning_rate": 0.0001838137416234835, + "loss": 0.9742, + "step": 4692 + }, + { + "epoch": 0.45, + "grad_norm": 0.27803791295610497, + "learning_rate": 0.00018380511168902128, + "loss": 1.0465, + "step": 4693 + }, + { + "epoch": 0.45, + "grad_norm": 0.3744328239378393, + "learning_rate": 0.00018379647965728344, + "loss": 1.1089, + "step": 4694 + }, + { + "epoch": 0.45, + "grad_norm": 0.2973942314834355, + "learning_rate": 0.00018378784552848605, + "loss": 1.0153, + "step": 4695 + }, + { + "epoch": 0.45, + "grad_norm": 0.26659388968085823, + "learning_rate": 0.00018377920930284515, + "loss": 1.1312, + "step": 4696 + }, + { + "epoch": 0.45, + "grad_norm": 0.25981005331538953, + "learning_rate": 0.0001837705709805769, + "loss": 1.0054, + "step": 4697 + }, + { + "epoch": 0.45, + "grad_norm": 0.22213075334205962, + "learning_rate": 0.00018376193056189745, + "loss": 1.015, + "step": 4698 + }, + { + "epoch": 0.45, + "grad_norm": 0.275135826725204, + "learning_rate": 0.00018375328804702304, + "loss": 0.965, + "step": 4699 + }, + { + "epoch": 0.45, + "grad_norm": 0.29035632968889363, + "learning_rate": 0.00018374464343617, + "loss": 1.0289, + "step": 4700 + }, + { + "epoch": 0.45, + "grad_norm": 0.27764015636676737, + "learning_rate": 0.0001837359967295546, + "loss": 1.0597, + "step": 4701 + }, + { + "epoch": 0.45, + "grad_norm": 0.25677252798632294, + "learning_rate": 0.00018372734792739323, + "loss": 1.0364, + "step": 4702 + }, + { + "epoch": 0.45, + "grad_norm": 0.3040774439028437, + "learning_rate": 0.0001837186970299024, + "loss": 1.157, + "step": 4703 + }, + { + "epoch": 0.45, + "grad_norm": 0.2693075199539607, + "learning_rate": 0.00018371004403729853, + "loss": 1.0576, + "step": 4704 + }, + { + "epoch": 0.45, + "grad_norm": 0.29196540371409324, + "learning_rate": 0.0001837013889497982, + "loss": 1.1876, + "step": 4705 + }, + { + "epoch": 0.45, + "grad_norm": 0.2926321409398997, + "learning_rate": 0.00018369273176761802, + "loss": 1.1449, + "step": 4706 + }, + { + "epoch": 0.45, + "grad_norm": 0.26895329592482214, + "learning_rate": 0.00018368407249097466, + "loss": 1.0901, + "step": 4707 + }, + { + "epoch": 0.45, + "grad_norm": 0.2606964589821753, + "learning_rate": 0.00018367541112008476, + "loss": 1.1665, + "step": 4708 + }, + { + "epoch": 0.45, + "grad_norm": 0.3222647861912937, + "learning_rate": 0.0001836667476551651, + "loss": 1.0423, + "step": 4709 + }, + { + "epoch": 0.45, + "grad_norm": 0.28474742820692894, + "learning_rate": 0.00018365808209643253, + "loss": 1.0235, + "step": 4710 + }, + { + "epoch": 0.45, + "grad_norm": 0.29857635596649795, + "learning_rate": 0.00018364941444410385, + "loss": 1.1574, + "step": 4711 + }, + { + "epoch": 0.45, + "grad_norm": 0.3222102111839974, + "learning_rate": 0.00018364074469839602, + "loss": 1.1667, + "step": 4712 + }, + { + "epoch": 0.45, + "grad_norm": 0.31112582358071006, + "learning_rate": 0.00018363207285952595, + "loss": 1.152, + "step": 4713 + }, + { + "epoch": 0.45, + "grad_norm": 0.26706361932201095, + "learning_rate": 0.00018362339892771072, + "loss": 1.1072, + "step": 4714 + }, + { + "epoch": 0.45, + "grad_norm": 0.3008566236074501, + "learning_rate": 0.00018361472290316736, + "loss": 1.0795, + "step": 4715 + }, + { + "epoch": 0.45, + "grad_norm": 0.2802364960499926, + "learning_rate": 0.00018360604478611303, + "loss": 1.062, + "step": 4716 + }, + { + "epoch": 0.45, + "grad_norm": 0.25442275739177317, + "learning_rate": 0.00018359736457676488, + "loss": 1.0775, + "step": 4717 + }, + { + "epoch": 0.45, + "grad_norm": 0.28931573899470037, + "learning_rate": 0.00018358868227534014, + "loss": 1.1024, + "step": 4718 + }, + { + "epoch": 0.45, + "grad_norm": 0.256028110974589, + "learning_rate": 0.0001835799978820561, + "loss": 1.1924, + "step": 4719 + }, + { + "epoch": 0.45, + "grad_norm": 0.2645246955746137, + "learning_rate": 0.00018357131139713008, + "loss": 1.1301, + "step": 4720 + }, + { + "epoch": 0.45, + "grad_norm": 0.3130330679624715, + "learning_rate": 0.0001835626228207795, + "loss": 1.0055, + "step": 4721 + }, + { + "epoch": 0.45, + "grad_norm": 0.28521400562617844, + "learning_rate": 0.00018355393215322173, + "loss": 1.1705, + "step": 4722 + }, + { + "epoch": 0.45, + "grad_norm": 0.23137236678760192, + "learning_rate": 0.0001835452393946743, + "loss": 1.0553, + "step": 4723 + }, + { + "epoch": 0.45, + "grad_norm": 0.2716401502501313, + "learning_rate": 0.00018353654454535473, + "loss": 0.9819, + "step": 4724 + }, + { + "epoch": 0.45, + "grad_norm": 0.25530206196101923, + "learning_rate": 0.00018352784760548066, + "loss": 1.0459, + "step": 4725 + }, + { + "epoch": 0.45, + "grad_norm": 0.3028518361428745, + "learning_rate": 0.0001835191485752697, + "loss": 1.1051, + "step": 4726 + }, + { + "epoch": 0.45, + "grad_norm": 0.266666477668348, + "learning_rate": 0.00018351044745493957, + "loss": 1.0546, + "step": 4727 + }, + { + "epoch": 0.45, + "grad_norm": 0.270858102859276, + "learning_rate": 0.000183501744244708, + "loss": 1.1381, + "step": 4728 + }, + { + "epoch": 0.45, + "grad_norm": 0.2626211922077772, + "learning_rate": 0.0001834930389447928, + "loss": 1.0571, + "step": 4729 + }, + { + "epoch": 0.45, + "grad_norm": 0.25526998893008135, + "learning_rate": 0.00018348433155541182, + "loss": 1.0052, + "step": 4730 + }, + { + "epoch": 0.45, + "grad_norm": 0.2578903383304173, + "learning_rate": 0.000183475622076783, + "loss": 0.9924, + "step": 4731 + }, + { + "epoch": 0.45, + "grad_norm": 0.2413218945569206, + "learning_rate": 0.00018346691050912423, + "loss": 1.1513, + "step": 4732 + }, + { + "epoch": 0.45, + "grad_norm": 0.2446885085320821, + "learning_rate": 0.0001834581968526536, + "loss": 1.0376, + "step": 4733 + }, + { + "epoch": 0.45, + "grad_norm": 0.3680136627218759, + "learning_rate": 0.00018344948110758912, + "loss": 0.9561, + "step": 4734 + }, + { + "epoch": 0.45, + "grad_norm": 0.2723843485796278, + "learning_rate": 0.00018344076327414896, + "loss": 1.0291, + "step": 4735 + }, + { + "epoch": 0.45, + "grad_norm": 0.3027706624868208, + "learning_rate": 0.00018343204335255123, + "loss": 1.1075, + "step": 4736 + }, + { + "epoch": 0.45, + "grad_norm": 0.27483257758567703, + "learning_rate": 0.00018342332134301418, + "loss": 1.0461, + "step": 4737 + }, + { + "epoch": 0.45, + "grad_norm": 0.30639774434247813, + "learning_rate": 0.00018341459724575612, + "loss": 1.1396, + "step": 4738 + }, + { + "epoch": 0.45, + "grad_norm": 0.30130628173745716, + "learning_rate": 0.00018340587106099532, + "loss": 1.0496, + "step": 4739 + }, + { + "epoch": 0.45, + "grad_norm": 0.2951096190186332, + "learning_rate": 0.00018339714278895017, + "loss": 1.0468, + "step": 4740 + }, + { + "epoch": 0.45, + "grad_norm": 0.255641750420738, + "learning_rate": 0.0001833884124298391, + "loss": 0.9368, + "step": 4741 + }, + { + "epoch": 0.45, + "grad_norm": 0.28182596587401804, + "learning_rate": 0.00018337967998388062, + "loss": 1.1615, + "step": 4742 + }, + { + "epoch": 0.45, + "grad_norm": 0.2654967829673649, + "learning_rate": 0.00018337094545129327, + "loss": 1.1393, + "step": 4743 + }, + { + "epoch": 0.45, + "grad_norm": 0.2809627165424522, + "learning_rate": 0.00018336220883229557, + "loss": 1.0288, + "step": 4744 + }, + { + "epoch": 0.45, + "grad_norm": 0.3063917931895973, + "learning_rate": 0.0001833534701271062, + "loss": 1.1096, + "step": 4745 + }, + { + "epoch": 0.45, + "grad_norm": 0.2978525520912866, + "learning_rate": 0.00018334472933594388, + "loss": 1.1158, + "step": 4746 + }, + { + "epoch": 0.45, + "grad_norm": 0.2957907023562577, + "learning_rate": 0.00018333598645902733, + "loss": 1.0369, + "step": 4747 + }, + { + "epoch": 0.45, + "grad_norm": 0.23971767785327205, + "learning_rate": 0.00018332724149657534, + "loss": 1.0324, + "step": 4748 + }, + { + "epoch": 0.45, + "grad_norm": 0.25958963718826317, + "learning_rate": 0.00018331849444880676, + "loss": 1.0323, + "step": 4749 + }, + { + "epoch": 0.45, + "grad_norm": 0.2702035936636161, + "learning_rate": 0.00018330974531594046, + "loss": 1.146, + "step": 4750 + }, + { + "epoch": 0.45, + "grad_norm": 0.30569382745175333, + "learning_rate": 0.00018330099409819548, + "loss": 1.1357, + "step": 4751 + }, + { + "epoch": 0.45, + "grad_norm": 0.2820062684104576, + "learning_rate": 0.00018329224079579072, + "loss": 1.1124, + "step": 4752 + }, + { + "epoch": 0.45, + "grad_norm": 0.2829441998287804, + "learning_rate": 0.0001832834854089453, + "loss": 1.0883, + "step": 4753 + }, + { + "epoch": 0.45, + "grad_norm": 0.30058225164600477, + "learning_rate": 0.00018327472793787833, + "loss": 1.1226, + "step": 4754 + }, + { + "epoch": 0.45, + "grad_norm": 0.2806946975067004, + "learning_rate": 0.00018326596838280897, + "loss": 1.0858, + "step": 4755 + }, + { + "epoch": 0.46, + "grad_norm": 0.26835184504189624, + "learning_rate": 0.0001832572067439564, + "loss": 1.0372, + "step": 4756 + }, + { + "epoch": 0.46, + "grad_norm": 0.2284738744280186, + "learning_rate": 0.00018324844302153992, + "loss": 0.9814, + "step": 4757 + }, + { + "epoch": 0.46, + "grad_norm": 0.2678772918055495, + "learning_rate": 0.00018323967721577881, + "loss": 1.1667, + "step": 4758 + }, + { + "epoch": 0.46, + "grad_norm": 0.2842258914883526, + "learning_rate": 0.00018323090932689248, + "loss": 1.0497, + "step": 4759 + }, + { + "epoch": 0.46, + "grad_norm": 0.2909027276625095, + "learning_rate": 0.00018322213935510035, + "loss": 1.083, + "step": 4760 + }, + { + "epoch": 0.46, + "grad_norm": 0.2861834538414527, + "learning_rate": 0.00018321336730062185, + "loss": 1.2472, + "step": 4761 + }, + { + "epoch": 0.46, + "grad_norm": 0.3061550280271623, + "learning_rate": 0.00018320459316367656, + "loss": 1.0457, + "step": 4762 + }, + { + "epoch": 0.46, + "grad_norm": 0.29646221655536087, + "learning_rate": 0.00018319581694448402, + "loss": 1.2093, + "step": 4763 + }, + { + "epoch": 0.46, + "grad_norm": 0.24948311138239332, + "learning_rate": 0.00018318703864326387, + "loss": 1.0602, + "step": 4764 + }, + { + "epoch": 0.46, + "grad_norm": 0.2972189282962595, + "learning_rate": 0.0001831782582602358, + "loss": 1.0545, + "step": 4765 + }, + { + "epoch": 0.46, + "grad_norm": 0.2766121044623654, + "learning_rate": 0.00018316947579561955, + "loss": 1.1347, + "step": 4766 + }, + { + "epoch": 0.46, + "grad_norm": 0.24848006867021413, + "learning_rate": 0.0001831606912496349, + "loss": 0.9744, + "step": 4767 + }, + { + "epoch": 0.46, + "grad_norm": 0.31237542451843275, + "learning_rate": 0.00018315190462250166, + "loss": 1.1843, + "step": 4768 + }, + { + "epoch": 0.46, + "grad_norm": 0.29004056616059715, + "learning_rate": 0.00018314311591443978, + "loss": 1.1249, + "step": 4769 + }, + { + "epoch": 0.46, + "grad_norm": 0.26116055277908656, + "learning_rate": 0.00018313432512566914, + "loss": 1.1072, + "step": 4770 + }, + { + "epoch": 0.46, + "grad_norm": 0.28883119567906423, + "learning_rate": 0.0001831255322564098, + "loss": 1.0457, + "step": 4771 + }, + { + "epoch": 0.46, + "grad_norm": 0.303739333682482, + "learning_rate": 0.00018311673730688174, + "loss": 1.0541, + "step": 4772 + }, + { + "epoch": 0.46, + "grad_norm": 0.29152935043740263, + "learning_rate": 0.00018310794027730513, + "loss": 0.9989, + "step": 4773 + }, + { + "epoch": 0.46, + "grad_norm": 0.2642591264939651, + "learning_rate": 0.00018309914116790006, + "loss": 1.0887, + "step": 4774 + }, + { + "epoch": 0.46, + "grad_norm": 0.2737816497235381, + "learning_rate": 0.00018309033997888677, + "loss": 0.9973, + "step": 4775 + }, + { + "epoch": 0.46, + "grad_norm": 0.27257528194967345, + "learning_rate": 0.0001830815367104855, + "loss": 1.0495, + "step": 4776 + }, + { + "epoch": 0.46, + "grad_norm": 0.28425805359861567, + "learning_rate": 0.00018307273136291654, + "loss": 1.0373, + "step": 4777 + }, + { + "epoch": 0.46, + "grad_norm": 0.29026318888377095, + "learning_rate": 0.00018306392393640025, + "loss": 1.0727, + "step": 4778 + }, + { + "epoch": 0.46, + "grad_norm": 0.27166227285923494, + "learning_rate": 0.0001830551144311571, + "loss": 1.0168, + "step": 4779 + }, + { + "epoch": 0.46, + "grad_norm": 0.2755929576909545, + "learning_rate": 0.00018304630284740752, + "loss": 1.1526, + "step": 4780 + }, + { + "epoch": 0.46, + "grad_norm": 0.2899920317630215, + "learning_rate": 0.00018303748918537197, + "loss": 1.1746, + "step": 4781 + }, + { + "epoch": 0.46, + "grad_norm": 0.2986410508568978, + "learning_rate": 0.00018302867344527113, + "loss": 0.9996, + "step": 4782 + }, + { + "epoch": 0.46, + "grad_norm": 0.265948409051619, + "learning_rate": 0.00018301985562732548, + "loss": 1.0769, + "step": 4783 + }, + { + "epoch": 0.46, + "grad_norm": 0.28694971143700526, + "learning_rate": 0.0001830110357317558, + "loss": 1.1081, + "step": 4784 + }, + { + "epoch": 0.46, + "grad_norm": 0.27742537509438936, + "learning_rate": 0.00018300221375878282, + "loss": 1.0131, + "step": 4785 + }, + { + "epoch": 0.46, + "grad_norm": 0.2625193602157892, + "learning_rate": 0.00018299338970862724, + "loss": 1.1648, + "step": 4786 + }, + { + "epoch": 0.46, + "grad_norm": 0.2738231281645282, + "learning_rate": 0.00018298456358150996, + "loss": 1.0869, + "step": 4787 + }, + { + "epoch": 0.46, + "grad_norm": 0.28533535787671566, + "learning_rate": 0.00018297573537765175, + "loss": 1.1657, + "step": 4788 + }, + { + "epoch": 0.46, + "grad_norm": 0.24149533195194994, + "learning_rate": 0.00018296690509727367, + "loss": 1.1252, + "step": 4789 + }, + { + "epoch": 0.46, + "grad_norm": 0.2513397282973885, + "learning_rate": 0.00018295807274059663, + "loss": 1.1539, + "step": 4790 + }, + { + "epoch": 0.46, + "grad_norm": 0.26483605282680023, + "learning_rate": 0.00018294923830784168, + "loss": 1.1032, + "step": 4791 + }, + { + "epoch": 0.46, + "grad_norm": 0.24791217448830763, + "learning_rate": 0.0001829404017992299, + "loss": 1.019, + "step": 4792 + }, + { + "epoch": 0.46, + "grad_norm": 0.2666231822052287, + "learning_rate": 0.00018293156321498247, + "loss": 1.0347, + "step": 4793 + }, + { + "epoch": 0.46, + "grad_norm": 0.29874993409578854, + "learning_rate": 0.0001829227225553205, + "loss": 1.178, + "step": 4794 + }, + { + "epoch": 0.46, + "grad_norm": 0.2738977464194366, + "learning_rate": 0.00018291387982046536, + "loss": 1.1841, + "step": 4795 + }, + { + "epoch": 0.46, + "grad_norm": 0.26913204839505855, + "learning_rate": 0.00018290503501063819, + "loss": 1.1249, + "step": 4796 + }, + { + "epoch": 0.46, + "grad_norm": 0.2642140627592377, + "learning_rate": 0.00018289618812606046, + "loss": 1.1228, + "step": 4797 + }, + { + "epoch": 0.46, + "grad_norm": 0.2923403068856712, + "learning_rate": 0.00018288733916695351, + "loss": 1.0207, + "step": 4798 + }, + { + "epoch": 0.46, + "grad_norm": 0.2574187448061279, + "learning_rate": 0.0001828784881335388, + "loss": 1.022, + "step": 4799 + }, + { + "epoch": 0.46, + "grad_norm": 0.2705462102994065, + "learning_rate": 0.00018286963502603786, + "loss": 1.1524, + "step": 4800 + }, + { + "epoch": 0.46, + "grad_norm": 0.2570338839265344, + "learning_rate": 0.0001828607798446722, + "loss": 1.0328, + "step": 4801 + }, + { + "epoch": 0.46, + "grad_norm": 0.3210236176915547, + "learning_rate": 0.00018285192258966343, + "loss": 1.1115, + "step": 4802 + }, + { + "epoch": 0.46, + "grad_norm": 0.28608741562007683, + "learning_rate": 0.00018284306326123327, + "loss": 1.1011, + "step": 4803 + }, + { + "epoch": 0.46, + "grad_norm": 0.2945292323022467, + "learning_rate": 0.00018283420185960338, + "loss": 1.0293, + "step": 4804 + }, + { + "epoch": 0.46, + "grad_norm": 0.2921369853346895, + "learning_rate": 0.00018282533838499552, + "loss": 1.0956, + "step": 4805 + }, + { + "epoch": 0.46, + "grad_norm": 0.2508835050525997, + "learning_rate": 0.0001828164728376315, + "loss": 0.9944, + "step": 4806 + }, + { + "epoch": 0.46, + "grad_norm": 0.31162618222530697, + "learning_rate": 0.00018280760521773322, + "loss": 1.1071, + "step": 4807 + }, + { + "epoch": 0.46, + "grad_norm": 0.26840278702905573, + "learning_rate": 0.00018279873552552256, + "loss": 1.0196, + "step": 4808 + }, + { + "epoch": 0.46, + "grad_norm": 0.28949640843261154, + "learning_rate": 0.0001827898637612215, + "loss": 1.0671, + "step": 4809 + }, + { + "epoch": 0.46, + "grad_norm": 0.26451664824875853, + "learning_rate": 0.00018278098992505207, + "loss": 1.037, + "step": 4810 + }, + { + "epoch": 0.46, + "grad_norm": 0.3163920468320238, + "learning_rate": 0.00018277211401723634, + "loss": 1.0914, + "step": 4811 + }, + { + "epoch": 0.46, + "grad_norm": 0.293156846011273, + "learning_rate": 0.00018276323603799645, + "loss": 1.0247, + "step": 4812 + }, + { + "epoch": 0.46, + "grad_norm": 0.3167908615898505, + "learning_rate": 0.00018275435598755457, + "loss": 1.0896, + "step": 4813 + }, + { + "epoch": 0.46, + "grad_norm": 0.29195900263136115, + "learning_rate": 0.0001827454738661329, + "loss": 1.106, + "step": 4814 + }, + { + "epoch": 0.46, + "grad_norm": 0.24880439045988673, + "learning_rate": 0.00018273658967395378, + "loss": 1.0249, + "step": 4815 + }, + { + "epoch": 0.46, + "grad_norm": 0.27945847139446783, + "learning_rate": 0.00018272770341123948, + "loss": 1.1502, + "step": 4816 + }, + { + "epoch": 0.46, + "grad_norm": 0.2826003957306108, + "learning_rate": 0.0001827188150782124, + "loss": 1.0295, + "step": 4817 + }, + { + "epoch": 0.46, + "grad_norm": 0.2733756686159319, + "learning_rate": 0.000182709924675095, + "loss": 1.1557, + "step": 4818 + }, + { + "epoch": 0.46, + "grad_norm": 0.2719146164107582, + "learning_rate": 0.00018270103220210975, + "loss": 1.1701, + "step": 4819 + }, + { + "epoch": 0.46, + "grad_norm": 0.27143925540828834, + "learning_rate": 0.0001826921376594792, + "loss": 1.094, + "step": 4820 + }, + { + "epoch": 0.46, + "grad_norm": 0.3064338525945154, + "learning_rate": 0.00018268324104742592, + "loss": 1.1517, + "step": 4821 + }, + { + "epoch": 0.46, + "grad_norm": 0.31652916261666003, + "learning_rate": 0.00018267434236617257, + "loss": 1.1124, + "step": 4822 + }, + { + "epoch": 0.46, + "grad_norm": 0.28456338644193074, + "learning_rate": 0.00018266544161594185, + "loss": 1.1934, + "step": 4823 + }, + { + "epoch": 0.46, + "grad_norm": 0.24199134880492237, + "learning_rate": 0.0001826565387969565, + "loss": 0.9981, + "step": 4824 + }, + { + "epoch": 0.46, + "grad_norm": 0.26907228489033597, + "learning_rate": 0.00018264763390943932, + "loss": 0.9906, + "step": 4825 + }, + { + "epoch": 0.46, + "grad_norm": 0.2885973158602934, + "learning_rate": 0.00018263872695361316, + "loss": 1.0922, + "step": 4826 + }, + { + "epoch": 0.46, + "grad_norm": 0.2618403285446365, + "learning_rate": 0.00018262981792970093, + "loss": 1.0619, + "step": 4827 + }, + { + "epoch": 0.46, + "grad_norm": 0.2508955108209627, + "learning_rate": 0.00018262090683792556, + "loss": 1.0296, + "step": 4828 + }, + { + "epoch": 0.46, + "grad_norm": 0.2805927803935353, + "learning_rate": 0.00018261199367851008, + "loss": 1.0554, + "step": 4829 + }, + { + "epoch": 0.46, + "grad_norm": 0.2703573079658423, + "learning_rate": 0.00018260307845167754, + "loss": 1.0561, + "step": 4830 + }, + { + "epoch": 0.46, + "grad_norm": 0.2674244714124627, + "learning_rate": 0.00018259416115765103, + "loss": 1.1403, + "step": 4831 + }, + { + "epoch": 0.46, + "grad_norm": 0.28142267132464976, + "learning_rate": 0.00018258524179665377, + "loss": 1.142, + "step": 4832 + }, + { + "epoch": 0.46, + "grad_norm": 0.28299880159536434, + "learning_rate": 0.00018257632036890891, + "loss": 1.0733, + "step": 4833 + }, + { + "epoch": 0.46, + "grad_norm": 0.26287599628896613, + "learning_rate": 0.0001825673968746397, + "loss": 1.036, + "step": 4834 + }, + { + "epoch": 0.46, + "grad_norm": 0.27196439255395516, + "learning_rate": 0.00018255847131406954, + "loss": 1.0408, + "step": 4835 + }, + { + "epoch": 0.46, + "grad_norm": 0.27846018633896047, + "learning_rate": 0.00018254954368742172, + "loss": 1.0871, + "step": 4836 + }, + { + "epoch": 0.46, + "grad_norm": 0.29707264756040697, + "learning_rate": 0.00018254061399491968, + "loss": 1.0648, + "step": 4837 + }, + { + "epoch": 0.46, + "grad_norm": 0.26505419324604984, + "learning_rate": 0.00018253168223678694, + "loss": 1.1028, + "step": 4838 + }, + { + "epoch": 0.46, + "grad_norm": 0.255410178872581, + "learning_rate": 0.00018252274841324697, + "loss": 1.0632, + "step": 4839 + }, + { + "epoch": 0.46, + "grad_norm": 0.31740617878546407, + "learning_rate": 0.00018251381252452334, + "loss": 1.0536, + "step": 4840 + }, + { + "epoch": 0.46, + "grad_norm": 0.283151238470317, + "learning_rate": 0.0001825048745708397, + "loss": 1.0617, + "step": 4841 + }, + { + "epoch": 0.46, + "grad_norm": 0.2754257741606846, + "learning_rate": 0.0001824959345524197, + "loss": 1.1151, + "step": 4842 + }, + { + "epoch": 0.46, + "grad_norm": 0.27843031061325174, + "learning_rate": 0.00018248699246948714, + "loss": 1.0461, + "step": 4843 + }, + { + "epoch": 0.46, + "grad_norm": 0.28725916243319277, + "learning_rate": 0.00018247804832226573, + "loss": 1.1349, + "step": 4844 + }, + { + "epoch": 0.46, + "grad_norm": 0.2656780830493282, + "learning_rate": 0.00018246910211097933, + "loss": 1.0609, + "step": 4845 + }, + { + "epoch": 0.46, + "grad_norm": 0.2735044242141262, + "learning_rate": 0.0001824601538358518, + "loss": 1.1276, + "step": 4846 + }, + { + "epoch": 0.46, + "grad_norm": 0.2758621380929105, + "learning_rate": 0.00018245120349710708, + "loss": 0.9056, + "step": 4847 + }, + { + "epoch": 0.46, + "grad_norm": 0.2399901693338529, + "learning_rate": 0.00018244225109496922, + "loss": 1.0778, + "step": 4848 + }, + { + "epoch": 0.46, + "grad_norm": 0.24569048440162722, + "learning_rate": 0.0001824332966296622, + "loss": 1.1481, + "step": 4849 + }, + { + "epoch": 0.46, + "grad_norm": 0.24697964457020974, + "learning_rate": 0.00018242434010141013, + "loss": 1.0456, + "step": 4850 + }, + { + "epoch": 0.46, + "grad_norm": 0.26864292763639014, + "learning_rate": 0.0001824153815104371, + "loss": 1.0611, + "step": 4851 + }, + { + "epoch": 0.46, + "grad_norm": 0.32764299735013946, + "learning_rate": 0.0001824064208569674, + "loss": 1.1193, + "step": 4852 + }, + { + "epoch": 0.46, + "grad_norm": 0.2841433629068385, + "learning_rate": 0.00018239745814122523, + "loss": 1.1592, + "step": 4853 + }, + { + "epoch": 0.46, + "grad_norm": 0.2803323057557425, + "learning_rate": 0.00018238849336343487, + "loss": 1.103, + "step": 4854 + }, + { + "epoch": 0.46, + "grad_norm": 0.2403783130731807, + "learning_rate": 0.00018237952652382067, + "loss": 1.1279, + "step": 4855 + }, + { + "epoch": 0.46, + "grad_norm": 0.2644107034178308, + "learning_rate": 0.00018237055762260708, + "loss": 1.0965, + "step": 4856 + }, + { + "epoch": 0.46, + "grad_norm": 0.30697640163453677, + "learning_rate": 0.0001823615866600185, + "loss": 1.2236, + "step": 4857 + }, + { + "epoch": 0.46, + "grad_norm": 0.2735457493491036, + "learning_rate": 0.00018235261363627945, + "loss": 1.0618, + "step": 4858 + }, + { + "epoch": 0.46, + "grad_norm": 0.2406737920100741, + "learning_rate": 0.00018234363855161448, + "loss": 1.004, + "step": 4859 + }, + { + "epoch": 0.46, + "grad_norm": 0.2484692248494346, + "learning_rate": 0.00018233466140624822, + "loss": 0.9887, + "step": 4860 + }, + { + "epoch": 0.47, + "grad_norm": 0.2896830439213678, + "learning_rate": 0.00018232568220040532, + "loss": 1.1294, + "step": 4861 + }, + { + "epoch": 0.47, + "grad_norm": 0.26324509988169137, + "learning_rate": 0.00018231670093431042, + "loss": 1.1409, + "step": 4862 + }, + { + "epoch": 0.47, + "grad_norm": 0.2545994960965595, + "learning_rate": 0.00018230771760818844, + "loss": 1.0028, + "step": 4863 + }, + { + "epoch": 0.47, + "grad_norm": 0.2708183217546789, + "learning_rate": 0.000182298732222264, + "loss": 1.0793, + "step": 4864 + }, + { + "epoch": 0.47, + "grad_norm": 0.2917108244983868, + "learning_rate": 0.00018228974477676216, + "loss": 1.0369, + "step": 4865 + }, + { + "epoch": 0.47, + "grad_norm": 0.261745568565937, + "learning_rate": 0.0001822807552719077, + "loss": 0.962, + "step": 4866 + }, + { + "epoch": 0.47, + "grad_norm": 0.2743799469305386, + "learning_rate": 0.0001822717637079256, + "loss": 1.2131, + "step": 4867 + }, + { + "epoch": 0.47, + "grad_norm": 0.2979549118848966, + "learning_rate": 0.0001822627700850409, + "loss": 1.0951, + "step": 4868 + }, + { + "epoch": 0.47, + "grad_norm": 0.2589301792916495, + "learning_rate": 0.00018225377440347874, + "loss": 1.1224, + "step": 4869 + }, + { + "epoch": 0.47, + "grad_norm": 0.27151217602723077, + "learning_rate": 0.00018224477666346414, + "loss": 1.2002, + "step": 4870 + }, + { + "epoch": 0.47, + "grad_norm": 0.2678272434705896, + "learning_rate": 0.00018223577686522232, + "loss": 1.0903, + "step": 4871 + }, + { + "epoch": 0.47, + "grad_norm": 0.28664523382221585, + "learning_rate": 0.0001822267750089785, + "loss": 1.1004, + "step": 4872 + }, + { + "epoch": 0.47, + "grad_norm": 0.2947186333319964, + "learning_rate": 0.00018221777109495797, + "loss": 1.0248, + "step": 4873 + }, + { + "epoch": 0.47, + "grad_norm": 0.2911818485318527, + "learning_rate": 0.00018220876512338604, + "loss": 1.1243, + "step": 4874 + }, + { + "epoch": 0.47, + "grad_norm": 0.2389316378471969, + "learning_rate": 0.0001821997570944881, + "loss": 1.0098, + "step": 4875 + }, + { + "epoch": 0.47, + "grad_norm": 0.29112691824378567, + "learning_rate": 0.00018219074700848956, + "loss": 1.1284, + "step": 4876 + }, + { + "epoch": 0.47, + "grad_norm": 0.32234639877925986, + "learning_rate": 0.00018218173486561593, + "loss": 1.1974, + "step": 4877 + }, + { + "epoch": 0.47, + "grad_norm": 0.28693422182466444, + "learning_rate": 0.00018217272066609275, + "loss": 1.07, + "step": 4878 + }, + { + "epoch": 0.47, + "grad_norm": 0.24948119387159542, + "learning_rate": 0.00018216370441014558, + "loss": 1.0697, + "step": 4879 + }, + { + "epoch": 0.47, + "grad_norm": 0.2766198420685631, + "learning_rate": 0.00018215468609800007, + "loss": 1.1055, + "step": 4880 + }, + { + "epoch": 0.47, + "grad_norm": 0.2913825743782603, + "learning_rate": 0.0001821456657298819, + "loss": 1.0749, + "step": 4881 + }, + { + "epoch": 0.47, + "grad_norm": 0.2688271558729764, + "learning_rate": 0.00018213664330601683, + "loss": 0.9326, + "step": 4882 + }, + { + "epoch": 0.47, + "grad_norm": 0.30551493928355505, + "learning_rate": 0.00018212761882663062, + "loss": 1.1667, + "step": 4883 + }, + { + "epoch": 0.47, + "grad_norm": 0.3272843823903593, + "learning_rate": 0.00018211859229194918, + "loss": 1.0908, + "step": 4884 + }, + { + "epoch": 0.47, + "grad_norm": 0.2683326547425987, + "learning_rate": 0.00018210956370219832, + "loss": 1.1501, + "step": 4885 + }, + { + "epoch": 0.47, + "grad_norm": 0.2508457951874583, + "learning_rate": 0.00018210053305760403, + "loss": 1.0294, + "step": 4886 + }, + { + "epoch": 0.47, + "grad_norm": 0.3066517541478313, + "learning_rate": 0.0001820915003583923, + "loss": 1.1289, + "step": 4887 + }, + { + "epoch": 0.47, + "grad_norm": 0.2979558635215368, + "learning_rate": 0.0001820824656047892, + "loss": 1.0369, + "step": 4888 + }, + { + "epoch": 0.47, + "grad_norm": 0.24297953137434605, + "learning_rate": 0.0001820734287970208, + "loss": 1.0952, + "step": 4889 + }, + { + "epoch": 0.47, + "grad_norm": 0.2642818716032307, + "learning_rate": 0.00018206438993531324, + "loss": 1.1495, + "step": 4890 + }, + { + "epoch": 0.47, + "grad_norm": 0.28250099631091213, + "learning_rate": 0.0001820553490198928, + "loss": 1.2023, + "step": 4891 + }, + { + "epoch": 0.47, + "grad_norm": 0.24244241524853985, + "learning_rate": 0.00018204630605098563, + "loss": 1.1123, + "step": 4892 + }, + { + "epoch": 0.47, + "grad_norm": 0.28117721265077, + "learning_rate": 0.00018203726102881807, + "loss": 1.1124, + "step": 4893 + }, + { + "epoch": 0.47, + "grad_norm": 0.24950954002584746, + "learning_rate": 0.00018202821395361656, + "loss": 1.0812, + "step": 4894 + }, + { + "epoch": 0.47, + "grad_norm": 0.29484743456459533, + "learning_rate": 0.0001820191648256074, + "loss": 1.019, + "step": 4895 + }, + { + "epoch": 0.47, + "grad_norm": 0.26478910448375825, + "learning_rate": 0.00018201011364501712, + "loss": 1.061, + "step": 4896 + }, + { + "epoch": 0.47, + "grad_norm": 0.2802200200065187, + "learning_rate": 0.00018200106041207218, + "loss": 1.2153, + "step": 4897 + }, + { + "epoch": 0.47, + "grad_norm": 0.2931006305889275, + "learning_rate": 0.00018199200512699918, + "loss": 1.1586, + "step": 4898 + }, + { + "epoch": 0.47, + "grad_norm": 0.28693848326326754, + "learning_rate": 0.00018198294779002473, + "loss": 0.8363, + "step": 4899 + }, + { + "epoch": 0.47, + "grad_norm": 0.2488053532878251, + "learning_rate": 0.00018197388840137548, + "loss": 1.0084, + "step": 4900 + }, + { + "epoch": 0.47, + "grad_norm": 0.30279834945613493, + "learning_rate": 0.00018196482696127814, + "loss": 1.0889, + "step": 4901 + }, + { + "epoch": 0.47, + "grad_norm": 0.27315676586595977, + "learning_rate": 0.0001819557634699595, + "loss": 1.194, + "step": 4902 + }, + { + "epoch": 0.47, + "grad_norm": 0.27870113697280036, + "learning_rate": 0.0001819466979276464, + "loss": 1.0237, + "step": 4903 + }, + { + "epoch": 0.47, + "grad_norm": 0.26068511407455874, + "learning_rate": 0.00018193763033456565, + "loss": 1.141, + "step": 4904 + }, + { + "epoch": 0.47, + "grad_norm": 0.25403699988609485, + "learning_rate": 0.0001819285606909442, + "loss": 0.985, + "step": 4905 + }, + { + "epoch": 0.47, + "grad_norm": 0.25520183475888464, + "learning_rate": 0.00018191948899700904, + "loss": 0.9452, + "step": 4906 + }, + { + "epoch": 0.47, + "grad_norm": 0.2890128875837949, + "learning_rate": 0.00018191041525298719, + "loss": 0.9948, + "step": 4907 + }, + { + "epoch": 0.47, + "grad_norm": 0.2826684783407896, + "learning_rate": 0.00018190133945910573, + "loss": 1.1965, + "step": 4908 + }, + { + "epoch": 0.47, + "grad_norm": 0.27927424785621435, + "learning_rate": 0.00018189226161559175, + "loss": 1.0749, + "step": 4909 + }, + { + "epoch": 0.47, + "grad_norm": 0.28389430568060914, + "learning_rate": 0.00018188318172267245, + "loss": 1.1575, + "step": 4910 + }, + { + "epoch": 0.47, + "grad_norm": 0.21938938517365983, + "learning_rate": 0.0001818740997805751, + "loss": 1.0206, + "step": 4911 + }, + { + "epoch": 0.47, + "grad_norm": 0.29366370308740425, + "learning_rate": 0.00018186501578952693, + "loss": 1.0559, + "step": 4912 + }, + { + "epoch": 0.47, + "grad_norm": 0.29917686177952224, + "learning_rate": 0.0001818559297497553, + "loss": 1.164, + "step": 4913 + }, + { + "epoch": 0.47, + "grad_norm": 0.24934981554280172, + "learning_rate": 0.00018184684166148754, + "loss": 0.978, + "step": 4914 + }, + { + "epoch": 0.47, + "grad_norm": 0.25613501770670105, + "learning_rate": 0.00018183775152495117, + "loss": 1.1179, + "step": 4915 + }, + { + "epoch": 0.47, + "grad_norm": 0.28150065228278026, + "learning_rate": 0.00018182865934037362, + "loss": 1.0, + "step": 4916 + }, + { + "epoch": 0.47, + "grad_norm": 0.2736305655565639, + "learning_rate": 0.00018181956510798246, + "loss": 0.9436, + "step": 4917 + }, + { + "epoch": 0.47, + "grad_norm": 0.23963815077384734, + "learning_rate": 0.00018181046882800525, + "loss": 1.1872, + "step": 4918 + }, + { + "epoch": 0.47, + "grad_norm": 0.26272710687943224, + "learning_rate": 0.00018180137050066963, + "loss": 1.1077, + "step": 4919 + }, + { + "epoch": 0.47, + "grad_norm": 0.2491631914332162, + "learning_rate": 0.00018179227012620332, + "loss": 1.0311, + "step": 4920 + }, + { + "epoch": 0.47, + "grad_norm": 0.2850474055751791, + "learning_rate": 0.00018178316770483405, + "loss": 1.072, + "step": 4921 + }, + { + "epoch": 0.47, + "grad_norm": 0.2563447984596202, + "learning_rate": 0.0001817740632367896, + "loss": 1.0942, + "step": 4922 + }, + { + "epoch": 0.47, + "grad_norm": 0.28963605095483796, + "learning_rate": 0.00018176495672229782, + "loss": 1.1151, + "step": 4923 + }, + { + "epoch": 0.47, + "grad_norm": 0.29799210873069965, + "learning_rate": 0.0001817558481615866, + "loss": 1.0651, + "step": 4924 + }, + { + "epoch": 0.47, + "grad_norm": 0.2900355369937844, + "learning_rate": 0.0001817467375548839, + "loss": 1.0395, + "step": 4925 + }, + { + "epoch": 0.47, + "grad_norm": 0.26790765666005045, + "learning_rate": 0.00018173762490241777, + "loss": 1.0195, + "step": 4926 + }, + { + "epoch": 0.47, + "grad_norm": 0.27029423974434824, + "learning_rate": 0.00018172851020441616, + "loss": 1.0376, + "step": 4927 + }, + { + "epoch": 0.47, + "grad_norm": 0.29781631086031485, + "learning_rate": 0.00018171939346110723, + "loss": 1.1626, + "step": 4928 + }, + { + "epoch": 0.47, + "grad_norm": 0.24153000189250098, + "learning_rate": 0.0001817102746727191, + "loss": 1.1416, + "step": 4929 + }, + { + "epoch": 0.47, + "grad_norm": 0.2832270079995993, + "learning_rate": 0.00018170115383948001, + "loss": 1.0366, + "step": 4930 + }, + { + "epoch": 0.47, + "grad_norm": 0.28980962720099673, + "learning_rate": 0.0001816920309616182, + "loss": 1.1199, + "step": 4931 + }, + { + "epoch": 0.47, + "grad_norm": 0.2885127084279082, + "learning_rate": 0.00018168290603936198, + "loss": 0.9624, + "step": 4932 + }, + { + "epoch": 0.47, + "grad_norm": 0.264979114595309, + "learning_rate": 0.00018167377907293966, + "loss": 1.1314, + "step": 4933 + }, + { + "epoch": 0.47, + "grad_norm": 0.29190107567170176, + "learning_rate": 0.00018166465006257972, + "loss": 1.1603, + "step": 4934 + }, + { + "epoch": 0.47, + "grad_norm": 0.29218193365264744, + "learning_rate": 0.0001816555190085106, + "loss": 1.1024, + "step": 4935 + }, + { + "epoch": 0.47, + "grad_norm": 0.290286097531124, + "learning_rate": 0.00018164638591096078, + "loss": 1.0179, + "step": 4936 + }, + { + "epoch": 0.47, + "grad_norm": 0.3045929894370889, + "learning_rate": 0.00018163725077015883, + "loss": 1.1458, + "step": 4937 + }, + { + "epoch": 0.47, + "grad_norm": 0.3096182938272493, + "learning_rate": 0.0001816281135863334, + "loss": 1.0445, + "step": 4938 + }, + { + "epoch": 0.47, + "grad_norm": 0.2989704049787465, + "learning_rate": 0.00018161897435971312, + "loss": 1.0597, + "step": 4939 + }, + { + "epoch": 0.47, + "grad_norm": 0.33521100047691355, + "learning_rate": 0.00018160983309052671, + "loss": 1.0351, + "step": 4940 + }, + { + "epoch": 0.47, + "grad_norm": 0.27937510049751796, + "learning_rate": 0.00018160068977900293, + "loss": 1.0309, + "step": 4941 + }, + { + "epoch": 0.47, + "grad_norm": 0.30607280379755825, + "learning_rate": 0.00018159154442537058, + "loss": 1.0614, + "step": 4942 + }, + { + "epoch": 0.47, + "grad_norm": 0.2942443003492716, + "learning_rate": 0.0001815823970298586, + "loss": 1.0296, + "step": 4943 + }, + { + "epoch": 0.47, + "grad_norm": 0.27168124667388854, + "learning_rate": 0.00018157324759269583, + "loss": 1.2025, + "step": 4944 + }, + { + "epoch": 0.47, + "grad_norm": 0.3040911262976451, + "learning_rate": 0.00018156409611411127, + "loss": 1.1002, + "step": 4945 + }, + { + "epoch": 0.47, + "grad_norm": 0.2932905809708139, + "learning_rate": 0.00018155494259433397, + "loss": 1.1174, + "step": 4946 + }, + { + "epoch": 0.47, + "grad_norm": 0.27057969270876014, + "learning_rate": 0.00018154578703359294, + "loss": 1.1873, + "step": 4947 + }, + { + "epoch": 0.47, + "grad_norm": 0.23291125191630732, + "learning_rate": 0.00018153662943211737, + "loss": 1.0286, + "step": 4948 + }, + { + "epoch": 0.47, + "grad_norm": 0.2935414983637233, + "learning_rate": 0.00018152746979013638, + "loss": 1.0808, + "step": 4949 + }, + { + "epoch": 0.47, + "grad_norm": 0.28416378913000195, + "learning_rate": 0.00018151830810787925, + "loss": 1.1802, + "step": 4950 + }, + { + "epoch": 0.47, + "grad_norm": 0.2804898105323499, + "learning_rate": 0.00018150914438557522, + "loss": 1.0825, + "step": 4951 + }, + { + "epoch": 0.47, + "grad_norm": 0.2908103000886573, + "learning_rate": 0.0001814999786234536, + "loss": 1.1133, + "step": 4952 + }, + { + "epoch": 0.47, + "grad_norm": 0.25695695427126014, + "learning_rate": 0.0001814908108217438, + "loss": 1.1093, + "step": 4953 + }, + { + "epoch": 0.47, + "grad_norm": 0.27219006337780743, + "learning_rate": 0.0001814816409806753, + "loss": 1.1098, + "step": 4954 + }, + { + "epoch": 0.47, + "grad_norm": 0.3013175495467479, + "learning_rate": 0.00018147246910047747, + "loss": 1.0026, + "step": 4955 + }, + { + "epoch": 0.47, + "grad_norm": 0.24847902288191978, + "learning_rate": 0.0001814632951813799, + "loss": 0.999, + "step": 4956 + }, + { + "epoch": 0.47, + "grad_norm": 0.2691757208602004, + "learning_rate": 0.00018145411922361219, + "loss": 1.0259, + "step": 4957 + }, + { + "epoch": 0.47, + "grad_norm": 0.2601243300778845, + "learning_rate": 0.00018144494122740394, + "loss": 0.9758, + "step": 4958 + }, + { + "epoch": 0.47, + "grad_norm": 0.27967276767781657, + "learning_rate": 0.00018143576119298484, + "loss": 1.0271, + "step": 4959 + }, + { + "epoch": 0.47, + "grad_norm": 0.3093700281906271, + "learning_rate": 0.00018142657912058465, + "loss": 0.9627, + "step": 4960 + }, + { + "epoch": 0.47, + "grad_norm": 0.2951333334350976, + "learning_rate": 0.00018141739501043315, + "loss": 1.0788, + "step": 4961 + }, + { + "epoch": 0.47, + "grad_norm": 0.2754995868240653, + "learning_rate": 0.00018140820886276018, + "loss": 0.987, + "step": 4962 + }, + { + "epoch": 0.47, + "grad_norm": 0.2650597795296595, + "learning_rate": 0.0001813990206777956, + "loss": 1.108, + "step": 4963 + }, + { + "epoch": 0.47, + "grad_norm": 0.2680629620506122, + "learning_rate": 0.00018138983045576937, + "loss": 1.0824, + "step": 4964 + }, + { + "epoch": 0.48, + "grad_norm": 0.2745197902744774, + "learning_rate": 0.00018138063819691147, + "loss": 1.087, + "step": 4965 + }, + { + "epoch": 0.48, + "grad_norm": 0.2951908263527352, + "learning_rate": 0.00018137144390145194, + "loss": 1.0986, + "step": 4966 + }, + { + "epoch": 0.48, + "grad_norm": 0.3006255204787449, + "learning_rate": 0.00018136224756962093, + "loss": 1.0538, + "step": 4967 + }, + { + "epoch": 0.48, + "grad_norm": 0.28348543906022167, + "learning_rate": 0.00018135304920164854, + "loss": 1.0905, + "step": 4968 + }, + { + "epoch": 0.48, + "grad_norm": 0.2572498672154097, + "learning_rate": 0.00018134384879776497, + "loss": 1.1133, + "step": 4969 + }, + { + "epoch": 0.48, + "grad_norm": 0.30267258785481194, + "learning_rate": 0.00018133464635820042, + "loss": 1.1097, + "step": 4970 + }, + { + "epoch": 0.48, + "grad_norm": 0.2645184466065096, + "learning_rate": 0.00018132544188318526, + "loss": 1.111, + "step": 4971 + }, + { + "epoch": 0.48, + "grad_norm": 0.2981463331584167, + "learning_rate": 0.0001813162353729498, + "loss": 1.0594, + "step": 4972 + }, + { + "epoch": 0.48, + "grad_norm": 0.3028092831683072, + "learning_rate": 0.0001813070268277244, + "loss": 1.227, + "step": 4973 + }, + { + "epoch": 0.48, + "grad_norm": 0.276916640360599, + "learning_rate": 0.00018129781624773961, + "loss": 1.1563, + "step": 4974 + }, + { + "epoch": 0.48, + "grad_norm": 0.26497515296266566, + "learning_rate": 0.00018128860363322586, + "loss": 1.0489, + "step": 4975 + }, + { + "epoch": 0.48, + "grad_norm": 0.2870089379191648, + "learning_rate": 0.00018127938898441373, + "loss": 1.1085, + "step": 4976 + }, + { + "epoch": 0.48, + "grad_norm": 0.2804072923871364, + "learning_rate": 0.00018127017230153378, + "loss": 1.1697, + "step": 4977 + }, + { + "epoch": 0.48, + "grad_norm": 0.26600268228720364, + "learning_rate": 0.0001812609535848167, + "loss": 1.0651, + "step": 4978 + }, + { + "epoch": 0.48, + "grad_norm": 0.3017636575581846, + "learning_rate": 0.0001812517328344932, + "loss": 1.0479, + "step": 4979 + }, + { + "epoch": 0.48, + "grad_norm": 0.2848522851547357, + "learning_rate": 0.000181242510050794, + "loss": 1.0961, + "step": 4980 + }, + { + "epoch": 0.48, + "grad_norm": 0.29706092141646745, + "learning_rate": 0.00018123328523394992, + "loss": 1.0572, + "step": 4981 + }, + { + "epoch": 0.48, + "grad_norm": 0.24623214065631238, + "learning_rate": 0.00018122405838419186, + "loss": 1.0209, + "step": 4982 + }, + { + "epoch": 0.48, + "grad_norm": 0.27148089077495985, + "learning_rate": 0.00018121482950175067, + "loss": 1.0425, + "step": 4983 + }, + { + "epoch": 0.48, + "grad_norm": 0.27374692095758846, + "learning_rate": 0.00018120559858685734, + "loss": 1.0984, + "step": 4984 + }, + { + "epoch": 0.48, + "grad_norm": 0.28314322190570707, + "learning_rate": 0.00018119636563974285, + "loss": 1.0245, + "step": 4985 + }, + { + "epoch": 0.48, + "grad_norm": 0.27988607306147967, + "learning_rate": 0.0001811871306606383, + "loss": 1.0786, + "step": 4986 + }, + { + "epoch": 0.48, + "grad_norm": 0.28485376383594274, + "learning_rate": 0.0001811778936497748, + "loss": 1.0639, + "step": 4987 + }, + { + "epoch": 0.48, + "grad_norm": 0.23607692697159477, + "learning_rate": 0.00018116865460738343, + "loss": 1.0506, + "step": 4988 + }, + { + "epoch": 0.48, + "grad_norm": 0.29135188062241957, + "learning_rate": 0.0001811594135336955, + "loss": 1.0689, + "step": 4989 + }, + { + "epoch": 0.48, + "grad_norm": 0.26704486335325794, + "learning_rate": 0.00018115017042894227, + "loss": 1.1443, + "step": 4990 + }, + { + "epoch": 0.48, + "grad_norm": 0.26331319852301815, + "learning_rate": 0.00018114092529335497, + "loss": 1.0934, + "step": 4991 + }, + { + "epoch": 0.48, + "grad_norm": 0.26377752564879786, + "learning_rate": 0.00018113167812716506, + "loss": 0.9711, + "step": 4992 + }, + { + "epoch": 0.48, + "grad_norm": 0.26534776082769385, + "learning_rate": 0.0001811224289306039, + "loss": 1.0002, + "step": 4993 + }, + { + "epoch": 0.48, + "grad_norm": 0.2711195707498488, + "learning_rate": 0.00018111317770390297, + "loss": 1.2213, + "step": 4994 + }, + { + "epoch": 0.48, + "grad_norm": 0.24845192655050088, + "learning_rate": 0.0001811039244472938, + "loss": 1.1603, + "step": 4995 + }, + { + "epoch": 0.48, + "grad_norm": 0.2774236359347863, + "learning_rate": 0.00018109466916100793, + "loss": 1.0153, + "step": 4996 + }, + { + "epoch": 0.48, + "grad_norm": 0.291748829551597, + "learning_rate": 0.000181085411845277, + "loss": 1.139, + "step": 4997 + }, + { + "epoch": 0.48, + "grad_norm": 0.2707487926064819, + "learning_rate": 0.0001810761525003327, + "loss": 0.9485, + "step": 4998 + }, + { + "epoch": 0.48, + "grad_norm": 0.31763719053401285, + "learning_rate": 0.0001810668911264067, + "loss": 1.0691, + "step": 4999 + }, + { + "epoch": 0.48, + "grad_norm": 0.271398772008147, + "learning_rate": 0.00018105762772373086, + "loss": 1.0501, + "step": 5000 + }, + { + "epoch": 0.48, + "grad_norm": 0.2723475184642883, + "learning_rate": 0.00018104836229253688, + "loss": 1.0046, + "step": 5001 + }, + { + "epoch": 0.48, + "grad_norm": 0.23738205025152834, + "learning_rate": 0.00018103909483305672, + "loss": 1.1247, + "step": 5002 + }, + { + "epoch": 0.48, + "grad_norm": 0.28212585900181886, + "learning_rate": 0.00018102982534552226, + "loss": 0.966, + "step": 5003 + }, + { + "epoch": 0.48, + "grad_norm": 0.2923303986208393, + "learning_rate": 0.00018102055383016554, + "loss": 1.177, + "step": 5004 + }, + { + "epoch": 0.48, + "grad_norm": 0.261063011725201, + "learning_rate": 0.0001810112802872185, + "loss": 0.9956, + "step": 5005 + }, + { + "epoch": 0.48, + "grad_norm": 0.28452157947524576, + "learning_rate": 0.0001810020047169133, + "loss": 1.1531, + "step": 5006 + }, + { + "epoch": 0.48, + "grad_norm": 0.30435514362331084, + "learning_rate": 0.00018099272711948197, + "loss": 1.101, + "step": 5007 + }, + { + "epoch": 0.48, + "grad_norm": 0.3171881336122615, + "learning_rate": 0.0001809834474951568, + "loss": 1.1477, + "step": 5008 + }, + { + "epoch": 0.48, + "grad_norm": 0.3021432074002858, + "learning_rate": 0.00018097416584416992, + "loss": 1.1089, + "step": 5009 + }, + { + "epoch": 0.48, + "grad_norm": 0.2584355712285805, + "learning_rate": 0.00018096488216675364, + "loss": 1.0171, + "step": 5010 + }, + { + "epoch": 0.48, + "grad_norm": 0.2784662384579421, + "learning_rate": 0.00018095559646314033, + "loss": 1.1563, + "step": 5011 + }, + { + "epoch": 0.48, + "grad_norm": 0.2590883277277557, + "learning_rate": 0.00018094630873356234, + "loss": 0.9934, + "step": 5012 + }, + { + "epoch": 0.48, + "grad_norm": 0.31013602501461507, + "learning_rate": 0.0001809370189782521, + "loss": 1.0347, + "step": 5013 + }, + { + "epoch": 0.48, + "grad_norm": 0.2908179871344749, + "learning_rate": 0.00018092772719744207, + "loss": 1.1276, + "step": 5014 + }, + { + "epoch": 0.48, + "grad_norm": 0.27894866086973424, + "learning_rate": 0.0001809184333913648, + "loss": 1.1603, + "step": 5015 + }, + { + "epoch": 0.48, + "grad_norm": 0.30799725337128564, + "learning_rate": 0.0001809091375602529, + "loss": 0.9902, + "step": 5016 + }, + { + "epoch": 0.48, + "grad_norm": 0.26185924608898803, + "learning_rate": 0.00018089983970433896, + "loss": 1.0665, + "step": 5017 + }, + { + "epoch": 0.48, + "grad_norm": 0.23371144120926612, + "learning_rate": 0.0001808905398238557, + "loss": 1.0455, + "step": 5018 + }, + { + "epoch": 0.48, + "grad_norm": 0.27647347653440757, + "learning_rate": 0.00018088123791903588, + "loss": 1.1008, + "step": 5019 + }, + { + "epoch": 0.48, + "grad_norm": 0.2619795224095397, + "learning_rate": 0.0001808719339901122, + "loss": 1.111, + "step": 5020 + }, + { + "epoch": 0.48, + "grad_norm": 0.28812383140009523, + "learning_rate": 0.00018086262803731758, + "loss": 1.1625, + "step": 5021 + }, + { + "epoch": 0.48, + "grad_norm": 0.29790943849759494, + "learning_rate": 0.00018085332006088486, + "loss": 1.0231, + "step": 5022 + }, + { + "epoch": 0.48, + "grad_norm": 0.2584586663721738, + "learning_rate": 0.00018084401006104699, + "loss": 0.9985, + "step": 5023 + }, + { + "epoch": 0.48, + "grad_norm": 0.3204107870274487, + "learning_rate": 0.00018083469803803696, + "loss": 1.0371, + "step": 5024 + }, + { + "epoch": 0.48, + "grad_norm": 0.32851014719630056, + "learning_rate": 0.0001808253839920878, + "loss": 1.0865, + "step": 5025 + }, + { + "epoch": 0.48, + "grad_norm": 0.28813789079186414, + "learning_rate": 0.00018081606792343262, + "loss": 1.0732, + "step": 5026 + }, + { + "epoch": 0.48, + "grad_norm": 0.2778577229940317, + "learning_rate": 0.00018080674983230455, + "loss": 0.9632, + "step": 5027 + }, + { + "epoch": 0.48, + "grad_norm": 0.25371536354442387, + "learning_rate": 0.00018079742971893677, + "loss": 1.0637, + "step": 5028 + }, + { + "epoch": 0.48, + "grad_norm": 0.2608878940566144, + "learning_rate": 0.00018078810758356256, + "loss": 1.1195, + "step": 5029 + }, + { + "epoch": 0.48, + "grad_norm": 0.2880007088671419, + "learning_rate": 0.00018077878342641514, + "loss": 1.0541, + "step": 5030 + }, + { + "epoch": 0.48, + "grad_norm": 0.27719099372774664, + "learning_rate": 0.0001807694572477279, + "loss": 1.14, + "step": 5031 + }, + { + "epoch": 0.48, + "grad_norm": 0.26236080400560896, + "learning_rate": 0.00018076012904773427, + "loss": 1.0992, + "step": 5032 + }, + { + "epoch": 0.48, + "grad_norm": 0.28662749205806315, + "learning_rate": 0.00018075079882666763, + "loss": 1.1681, + "step": 5033 + }, + { + "epoch": 0.48, + "grad_norm": 0.24357430650710507, + "learning_rate": 0.0001807414665847615, + "loss": 1.0774, + "step": 5034 + }, + { + "epoch": 0.48, + "grad_norm": 0.22511154512758563, + "learning_rate": 0.00018073213232224945, + "loss": 0.9704, + "step": 5035 + }, + { + "epoch": 0.48, + "grad_norm": 0.25551533972771157, + "learning_rate": 0.000180722796039365, + "loss": 1.1366, + "step": 5036 + }, + { + "epoch": 0.48, + "grad_norm": 0.27262724569941255, + "learning_rate": 0.0001807134577363419, + "loss": 0.9719, + "step": 5037 + }, + { + "epoch": 0.48, + "grad_norm": 0.2640695671336433, + "learning_rate": 0.00018070411741341377, + "loss": 1.0389, + "step": 5038 + }, + { + "epoch": 0.48, + "grad_norm": 0.25865882131840123, + "learning_rate": 0.00018069477507081438, + "loss": 1.0127, + "step": 5039 + }, + { + "epoch": 0.48, + "grad_norm": 0.3041466915560575, + "learning_rate": 0.00018068543070877752, + "loss": 1.1345, + "step": 5040 + }, + { + "epoch": 0.48, + "grad_norm": 0.280532311208517, + "learning_rate": 0.00018067608432753706, + "loss": 0.98, + "step": 5041 + }, + { + "epoch": 0.48, + "grad_norm": 0.3067608848042604, + "learning_rate": 0.0001806667359273269, + "loss": 1.1467, + "step": 5042 + }, + { + "epoch": 0.48, + "grad_norm": 0.28794591958027166, + "learning_rate": 0.00018065738550838094, + "loss": 1.1362, + "step": 5043 + }, + { + "epoch": 0.48, + "grad_norm": 0.25975883602188704, + "learning_rate": 0.00018064803307093325, + "loss": 1.0846, + "step": 5044 + }, + { + "epoch": 0.48, + "grad_norm": 0.28326599642955586, + "learning_rate": 0.00018063867861521784, + "loss": 1.0675, + "step": 5045 + }, + { + "epoch": 0.48, + "grad_norm": 0.2558529563820642, + "learning_rate": 0.00018062932214146882, + "loss": 1.1281, + "step": 5046 + }, + { + "epoch": 0.48, + "grad_norm": 0.28221647075585127, + "learning_rate": 0.0001806199636499203, + "loss": 1.0499, + "step": 5047 + }, + { + "epoch": 0.48, + "grad_norm": 0.25047524197955967, + "learning_rate": 0.00018061060314080658, + "loss": 1.1112, + "step": 5048 + }, + { + "epoch": 0.48, + "grad_norm": 0.25460102742828555, + "learning_rate": 0.00018060124061436184, + "loss": 1.139, + "step": 5049 + }, + { + "epoch": 0.48, + "grad_norm": 0.2684967059066573, + "learning_rate": 0.00018059187607082037, + "loss": 1.0121, + "step": 5050 + }, + { + "epoch": 0.48, + "grad_norm": 0.2740879178050141, + "learning_rate": 0.00018058250951041656, + "loss": 1.2213, + "step": 5051 + }, + { + "epoch": 0.48, + "grad_norm": 0.23814111073074376, + "learning_rate": 0.0001805731409333848, + "loss": 0.946, + "step": 5052 + }, + { + "epoch": 0.48, + "grad_norm": 0.2846550890405717, + "learning_rate": 0.00018056377033995959, + "loss": 1.0843, + "step": 5053 + }, + { + "epoch": 0.48, + "grad_norm": 0.26517227533600335, + "learning_rate": 0.00018055439773037536, + "loss": 1.0066, + "step": 5054 + }, + { + "epoch": 0.48, + "grad_norm": 0.27053031387998794, + "learning_rate": 0.0001805450231048667, + "loss": 0.9459, + "step": 5055 + }, + { + "epoch": 0.48, + "grad_norm": 0.3019675939625031, + "learning_rate": 0.00018053564646366822, + "loss": 1.2236, + "step": 5056 + }, + { + "epoch": 0.48, + "grad_norm": 0.25798882391808686, + "learning_rate": 0.00018052626780701457, + "loss": 1.1254, + "step": 5057 + }, + { + "epoch": 0.48, + "grad_norm": 0.27311813604505014, + "learning_rate": 0.00018051688713514047, + "loss": 1.0087, + "step": 5058 + }, + { + "epoch": 0.48, + "grad_norm": 0.24271338025668723, + "learning_rate": 0.00018050750444828067, + "loss": 1.0985, + "step": 5059 + }, + { + "epoch": 0.48, + "grad_norm": 0.28513204364807454, + "learning_rate": 0.00018049811974666996, + "loss": 0.968, + "step": 5060 + }, + { + "epoch": 0.48, + "grad_norm": 0.2684639300431927, + "learning_rate": 0.00018048873303054324, + "loss": 1.0187, + "step": 5061 + }, + { + "epoch": 0.48, + "grad_norm": 0.28668337168219155, + "learning_rate": 0.00018047934430013535, + "loss": 1.1447, + "step": 5062 + }, + { + "epoch": 0.48, + "grad_norm": 0.2526134197715635, + "learning_rate": 0.0001804699535556813, + "loss": 1.1222, + "step": 5063 + }, + { + "epoch": 0.48, + "grad_norm": 0.2734155691633195, + "learning_rate": 0.0001804605607974161, + "loss": 1.0668, + "step": 5064 + }, + { + "epoch": 0.48, + "grad_norm": 0.29009495050980677, + "learning_rate": 0.0001804511660255748, + "loss": 1.143, + "step": 5065 + }, + { + "epoch": 0.48, + "grad_norm": 0.24381730481982744, + "learning_rate": 0.0001804417692403925, + "loss": 1.2, + "step": 5066 + }, + { + "epoch": 0.48, + "grad_norm": 0.28667698702842986, + "learning_rate": 0.00018043237044210438, + "loss": 0.9782, + "step": 5067 + }, + { + "epoch": 0.48, + "grad_norm": 0.2290214469722657, + "learning_rate": 0.00018042296963094562, + "loss": 1.0481, + "step": 5068 + }, + { + "epoch": 0.48, + "grad_norm": 0.25712013816908574, + "learning_rate": 0.00018041356680715152, + "loss": 1.0808, + "step": 5069 + }, + { + "epoch": 0.49, + "grad_norm": 0.27191198243103304, + "learning_rate": 0.00018040416197095737, + "loss": 1.0839, + "step": 5070 + }, + { + "epoch": 0.49, + "grad_norm": 0.2896020255383838, + "learning_rate": 0.00018039475512259855, + "loss": 1.0719, + "step": 5071 + }, + { + "epoch": 0.49, + "grad_norm": 0.2722755649971534, + "learning_rate": 0.0001803853462623104, + "loss": 1.111, + "step": 5072 + }, + { + "epoch": 0.49, + "grad_norm": 0.296096353606665, + "learning_rate": 0.0001803759353903285, + "loss": 1.1522, + "step": 5073 + }, + { + "epoch": 0.49, + "grad_norm": 0.2516486412186821, + "learning_rate": 0.0001803665225068883, + "loss": 1.0666, + "step": 5074 + }, + { + "epoch": 0.49, + "grad_norm": 0.28162660247716065, + "learning_rate": 0.00018035710761222533, + "loss": 1.0581, + "step": 5075 + }, + { + "epoch": 0.49, + "grad_norm": 0.2573369100267432, + "learning_rate": 0.00018034769070657524, + "loss": 1.0313, + "step": 5076 + }, + { + "epoch": 0.49, + "grad_norm": 0.25964129622911775, + "learning_rate": 0.00018033827179017372, + "loss": 1.0417, + "step": 5077 + }, + { + "epoch": 0.49, + "grad_norm": 0.2584869469852085, + "learning_rate": 0.00018032885086325645, + "loss": 1.0705, + "step": 5078 + }, + { + "epoch": 0.49, + "grad_norm": 0.2803573003244443, + "learning_rate": 0.0001803194279260592, + "loss": 1.1134, + "step": 5079 + }, + { + "epoch": 0.49, + "grad_norm": 0.25948244844278057, + "learning_rate": 0.00018031000297881778, + "loss": 1.1251, + "step": 5080 + }, + { + "epoch": 0.49, + "grad_norm": 0.27477615836677105, + "learning_rate": 0.00018030057602176806, + "loss": 1.0987, + "step": 5081 + }, + { + "epoch": 0.49, + "grad_norm": 0.2949596444499225, + "learning_rate": 0.00018029114705514596, + "loss": 1.1267, + "step": 5082 + }, + { + "epoch": 0.49, + "grad_norm": 0.3126152488004548, + "learning_rate": 0.00018028171607918747, + "loss": 1.1043, + "step": 5083 + }, + { + "epoch": 0.49, + "grad_norm": 0.3158944201979104, + "learning_rate": 0.00018027228309412853, + "loss": 1.1333, + "step": 5084 + }, + { + "epoch": 0.49, + "grad_norm": 0.23721139976987007, + "learning_rate": 0.00018026284810020532, + "loss": 1.0958, + "step": 5085 + }, + { + "epoch": 0.49, + "grad_norm": 0.2620591461537738, + "learning_rate": 0.00018025341109765384, + "loss": 1.0396, + "step": 5086 + }, + { + "epoch": 0.49, + "grad_norm": 0.29187944723111464, + "learning_rate": 0.00018024397208671035, + "loss": 1.1689, + "step": 5087 + }, + { + "epoch": 0.49, + "grad_norm": 0.2740567842867544, + "learning_rate": 0.000180234531067611, + "loss": 1.0742, + "step": 5088 + }, + { + "epoch": 0.49, + "grad_norm": 0.28997111975839956, + "learning_rate": 0.00018022508804059207, + "loss": 1.1359, + "step": 5089 + }, + { + "epoch": 0.49, + "grad_norm": 0.264169160568814, + "learning_rate": 0.00018021564300588994, + "loss": 0.9856, + "step": 5090 + }, + { + "epoch": 0.49, + "grad_norm": 0.25799277848982, + "learning_rate": 0.0001802061959637409, + "loss": 1.0802, + "step": 5091 + }, + { + "epoch": 0.49, + "grad_norm": 0.27789264617161874, + "learning_rate": 0.0001801967469143814, + "loss": 1.1495, + "step": 5092 + }, + { + "epoch": 0.49, + "grad_norm": 0.284905774011098, + "learning_rate": 0.0001801872958580479, + "loss": 1.1043, + "step": 5093 + }, + { + "epoch": 0.49, + "grad_norm": 0.23840852278323477, + "learning_rate": 0.00018017784279497693, + "loss": 1.0545, + "step": 5094 + }, + { + "epoch": 0.49, + "grad_norm": 0.29365200369529876, + "learning_rate": 0.00018016838772540506, + "loss": 1.0454, + "step": 5095 + }, + { + "epoch": 0.49, + "grad_norm": 0.2905391396194257, + "learning_rate": 0.0001801589306495689, + "loss": 1.0989, + "step": 5096 + }, + { + "epoch": 0.49, + "grad_norm": 0.30717926924064015, + "learning_rate": 0.00018014947156770513, + "loss": 1.0989, + "step": 5097 + }, + { + "epoch": 0.49, + "grad_norm": 0.24633770830565768, + "learning_rate": 0.00018014001048005044, + "loss": 0.9554, + "step": 5098 + }, + { + "epoch": 0.49, + "grad_norm": 0.269877103345373, + "learning_rate": 0.00018013054738684166, + "loss": 1.1133, + "step": 5099 + }, + { + "epoch": 0.49, + "grad_norm": 0.2995678521690841, + "learning_rate": 0.00018012108228831556, + "loss": 1.1077, + "step": 5100 + }, + { + "epoch": 0.49, + "grad_norm": 0.24440039782838777, + "learning_rate": 0.000180111615184709, + "loss": 1.0012, + "step": 5101 + }, + { + "epoch": 0.49, + "grad_norm": 0.27251554211773565, + "learning_rate": 0.00018010214607625894, + "loss": 1.1004, + "step": 5102 + }, + { + "epoch": 0.49, + "grad_norm": 0.2810534155604437, + "learning_rate": 0.0001800926749632023, + "loss": 1.0193, + "step": 5103 + }, + { + "epoch": 0.49, + "grad_norm": 0.27100726353528887, + "learning_rate": 0.0001800832018457762, + "loss": 1.0602, + "step": 5104 + }, + { + "epoch": 0.49, + "grad_norm": 0.26564758624041535, + "learning_rate": 0.00018007372672421756, + "loss": 1.079, + "step": 5105 + }, + { + "epoch": 0.49, + "grad_norm": 0.2974717566929695, + "learning_rate": 0.00018006424959876363, + "loss": 1.1326, + "step": 5106 + }, + { + "epoch": 0.49, + "grad_norm": 0.30488571850565943, + "learning_rate": 0.00018005477046965153, + "loss": 1.0309, + "step": 5107 + }, + { + "epoch": 0.49, + "grad_norm": 0.2145246313949886, + "learning_rate": 0.0001800452893371185, + "loss": 1.0762, + "step": 5108 + }, + { + "epoch": 0.49, + "grad_norm": 0.2636249075639135, + "learning_rate": 0.00018003580620140177, + "loss": 1.153, + "step": 5109 + }, + { + "epoch": 0.49, + "grad_norm": 0.27782368506119703, + "learning_rate": 0.0001800263210627387, + "loss": 1.117, + "step": 5110 + }, + { + "epoch": 0.49, + "grad_norm": 0.3007327930211106, + "learning_rate": 0.00018001683392136666, + "loss": 1.1193, + "step": 5111 + }, + { + "epoch": 0.49, + "grad_norm": 0.28929437068723235, + "learning_rate": 0.00018000734477752306, + "loss": 1.0561, + "step": 5112 + }, + { + "epoch": 0.49, + "grad_norm": 0.28549761815962377, + "learning_rate": 0.00017999785363144536, + "loss": 1.1016, + "step": 5113 + }, + { + "epoch": 0.49, + "grad_norm": 0.2863390229819487, + "learning_rate": 0.0001799883604833711, + "loss": 1.1143, + "step": 5114 + }, + { + "epoch": 0.49, + "grad_norm": 0.2758612911370492, + "learning_rate": 0.00017997886533353786, + "loss": 1.1496, + "step": 5115 + }, + { + "epoch": 0.49, + "grad_norm": 0.26382558236784287, + "learning_rate": 0.00017996936818218324, + "loss": 1.0343, + "step": 5116 + }, + { + "epoch": 0.49, + "grad_norm": 0.2695740841257572, + "learning_rate": 0.00017995986902954493, + "loss": 1.1589, + "step": 5117 + }, + { + "epoch": 0.49, + "grad_norm": 0.315794471443836, + "learning_rate": 0.00017995036787586064, + "loss": 0.9799, + "step": 5118 + }, + { + "epoch": 0.49, + "grad_norm": 0.2707540246275962, + "learning_rate": 0.00017994086472136815, + "loss": 1.089, + "step": 5119 + }, + { + "epoch": 0.49, + "grad_norm": 0.3337912375773768, + "learning_rate": 0.0001799313595663053, + "loss": 1.0462, + "step": 5120 + }, + { + "epoch": 0.49, + "grad_norm": 0.30849482259896643, + "learning_rate": 0.0001799218524109099, + "loss": 0.9913, + "step": 5121 + }, + { + "epoch": 0.49, + "grad_norm": 0.2794905681957685, + "learning_rate": 0.00017991234325541995, + "loss": 1.0798, + "step": 5122 + }, + { + "epoch": 0.49, + "grad_norm": 0.26906629376110763, + "learning_rate": 0.00017990283210007335, + "loss": 0.9918, + "step": 5123 + }, + { + "epoch": 0.49, + "grad_norm": 0.2640810736886688, + "learning_rate": 0.00017989331894510818, + "loss": 1.0668, + "step": 5124 + }, + { + "epoch": 0.49, + "grad_norm": 0.3403000076240741, + "learning_rate": 0.0001798838037907625, + "loss": 1.1871, + "step": 5125 + }, + { + "epoch": 0.49, + "grad_norm": 0.29390271181136557, + "learning_rate": 0.00017987428663727441, + "loss": 1.0834, + "step": 5126 + }, + { + "epoch": 0.49, + "grad_norm": 0.29242522687517936, + "learning_rate": 0.00017986476748488214, + "loss": 0.9395, + "step": 5127 + }, + { + "epoch": 0.49, + "grad_norm": 0.30030389910750893, + "learning_rate": 0.00017985524633382381, + "loss": 1.0064, + "step": 5128 + }, + { + "epoch": 0.49, + "grad_norm": 0.26767260775313606, + "learning_rate": 0.00017984572318433778, + "loss": 1.2037, + "step": 5129 + }, + { + "epoch": 0.49, + "grad_norm": 0.2883783786958614, + "learning_rate": 0.00017983619803666235, + "loss": 1.0901, + "step": 5130 + }, + { + "epoch": 0.49, + "grad_norm": 0.2615336186142248, + "learning_rate": 0.00017982667089103588, + "loss": 1.1084, + "step": 5131 + }, + { + "epoch": 0.49, + "grad_norm": 0.2806392247590465, + "learning_rate": 0.0001798171417476968, + "loss": 1.0532, + "step": 5132 + }, + { + "epoch": 0.49, + "grad_norm": 0.2563049389592253, + "learning_rate": 0.0001798076106068836, + "loss": 1.0789, + "step": 5133 + }, + { + "epoch": 0.49, + "grad_norm": 0.32787128373934044, + "learning_rate": 0.0001797980774688348, + "loss": 1.0862, + "step": 5134 + }, + { + "epoch": 0.49, + "grad_norm": 0.26980316995262976, + "learning_rate": 0.00017978854233378891, + "loss": 1.2006, + "step": 5135 + }, + { + "epoch": 0.49, + "grad_norm": 0.2359678414179539, + "learning_rate": 0.00017977900520198465, + "loss": 0.9747, + "step": 5136 + }, + { + "epoch": 0.49, + "grad_norm": 0.28300881314834403, + "learning_rate": 0.00017976946607366063, + "loss": 1.0696, + "step": 5137 + }, + { + "epoch": 0.49, + "grad_norm": 0.27366802597990486, + "learning_rate": 0.0001797599249490556, + "loss": 0.9662, + "step": 5138 + }, + { + "epoch": 0.49, + "grad_norm": 0.2714308944537401, + "learning_rate": 0.00017975038182840828, + "loss": 1.083, + "step": 5139 + }, + { + "epoch": 0.49, + "grad_norm": 0.2644809976461488, + "learning_rate": 0.00017974083671195757, + "loss": 1.081, + "step": 5140 + }, + { + "epoch": 0.49, + "grad_norm": 0.2813538147596539, + "learning_rate": 0.0001797312895999423, + "loss": 1.1335, + "step": 5141 + }, + { + "epoch": 0.49, + "grad_norm": 0.2910923380431433, + "learning_rate": 0.0001797217404926014, + "loss": 1.0232, + "step": 5142 + }, + { + "epoch": 0.49, + "grad_norm": 0.26419502096815084, + "learning_rate": 0.00017971218939017382, + "loss": 1.106, + "step": 5143 + }, + { + "epoch": 0.49, + "grad_norm": 0.30598831197367454, + "learning_rate": 0.00017970263629289864, + "loss": 1.1303, + "step": 5144 + }, + { + "epoch": 0.49, + "grad_norm": 0.24615417333770134, + "learning_rate": 0.00017969308120101488, + "loss": 0.969, + "step": 5145 + }, + { + "epoch": 0.49, + "grad_norm": 0.2592721166455555, + "learning_rate": 0.00017968352411476166, + "loss": 1.0971, + "step": 5146 + }, + { + "epoch": 0.49, + "grad_norm": 0.2868843022304741, + "learning_rate": 0.00017967396503437816, + "loss": 0.986, + "step": 5147 + }, + { + "epoch": 0.49, + "grad_norm": 0.2936385306501026, + "learning_rate": 0.00017966440396010366, + "loss": 1.0833, + "step": 5148 + }, + { + "epoch": 0.49, + "grad_norm": 0.2947617623317851, + "learning_rate": 0.00017965484089217735, + "loss": 1.0711, + "step": 5149 + }, + { + "epoch": 0.49, + "grad_norm": 0.2628043083883782, + "learning_rate": 0.0001796452758308386, + "loss": 1.1146, + "step": 5150 + }, + { + "epoch": 0.49, + "grad_norm": 0.31452969252198026, + "learning_rate": 0.00017963570877632676, + "loss": 1.1144, + "step": 5151 + }, + { + "epoch": 0.49, + "grad_norm": 0.2596821467826261, + "learning_rate": 0.00017962613972888125, + "loss": 1.0938, + "step": 5152 + }, + { + "epoch": 0.49, + "grad_norm": 0.27739044259675993, + "learning_rate": 0.00017961656868874156, + "loss": 1.0211, + "step": 5153 + }, + { + "epoch": 0.49, + "grad_norm": 0.2937922180821596, + "learning_rate": 0.0001796069956561472, + "loss": 1.0647, + "step": 5154 + }, + { + "epoch": 0.49, + "grad_norm": 0.27965187717460716, + "learning_rate": 0.00017959742063133774, + "loss": 1.0117, + "step": 5155 + }, + { + "epoch": 0.49, + "grad_norm": 0.2840025373723606, + "learning_rate": 0.00017958784361455282, + "loss": 0.9706, + "step": 5156 + }, + { + "epoch": 0.49, + "grad_norm": 0.3178568601522771, + "learning_rate": 0.00017957826460603205, + "loss": 1.018, + "step": 5157 + }, + { + "epoch": 0.49, + "grad_norm": 0.2860032058314914, + "learning_rate": 0.00017956868360601526, + "loss": 1.1566, + "step": 5158 + }, + { + "epoch": 0.49, + "grad_norm": 0.28443581551498176, + "learning_rate": 0.00017955910061474213, + "loss": 1.0321, + "step": 5159 + }, + { + "epoch": 0.49, + "grad_norm": 0.27187438960276544, + "learning_rate": 0.0001795495156324525, + "loss": 1.0505, + "step": 5160 + }, + { + "epoch": 0.49, + "grad_norm": 0.29288869429900377, + "learning_rate": 0.00017953992865938622, + "loss": 1.0511, + "step": 5161 + }, + { + "epoch": 0.49, + "grad_norm": 0.30056027600724244, + "learning_rate": 0.00017953033969578326, + "loss": 1.1062, + "step": 5162 + }, + { + "epoch": 0.49, + "grad_norm": 0.26821431519549427, + "learning_rate": 0.00017952074874188356, + "loss": 1.092, + "step": 5163 + }, + { + "epoch": 0.49, + "grad_norm": 0.24906638557874644, + "learning_rate": 0.00017951115579792717, + "loss": 1.1022, + "step": 5164 + }, + { + "epoch": 0.49, + "grad_norm": 0.26309218013010355, + "learning_rate": 0.0001795015608641541, + "loss": 1.0265, + "step": 5165 + }, + { + "epoch": 0.49, + "grad_norm": 0.26268394260057937, + "learning_rate": 0.00017949196394080453, + "loss": 1.0653, + "step": 5166 + }, + { + "epoch": 0.49, + "grad_norm": 0.2896832205130746, + "learning_rate": 0.00017948236502811859, + "loss": 1.0854, + "step": 5167 + }, + { + "epoch": 0.49, + "grad_norm": 0.2625237394571958, + "learning_rate": 0.00017947276412633652, + "loss": 0.9619, + "step": 5168 + }, + { + "epoch": 0.49, + "grad_norm": 0.27986610289816, + "learning_rate": 0.0001794631612356986, + "loss": 1.1716, + "step": 5169 + }, + { + "epoch": 0.49, + "grad_norm": 0.28211340979754457, + "learning_rate": 0.0001794535563564451, + "loss": 1.1021, + "step": 5170 + }, + { + "epoch": 0.49, + "grad_norm": 0.24926141918189934, + "learning_rate": 0.00017944394948881642, + "loss": 1.056, + "step": 5171 + }, + { + "epoch": 0.49, + "grad_norm": 0.2869730653075155, + "learning_rate": 0.00017943434063305298, + "loss": 0.9492, + "step": 5172 + }, + { + "epoch": 0.49, + "grad_norm": 0.29187379432960536, + "learning_rate": 0.00017942472978939525, + "loss": 1.1833, + "step": 5173 + }, + { + "epoch": 0.5, + "grad_norm": 0.28973741108012707, + "learning_rate": 0.00017941511695808372, + "loss": 1.0489, + "step": 5174 + }, + { + "epoch": 0.5, + "grad_norm": 0.27689121010931406, + "learning_rate": 0.000179405502139359, + "loss": 1.1769, + "step": 5175 + }, + { + "epoch": 0.5, + "grad_norm": 0.3067616467203378, + "learning_rate": 0.00017939588533346168, + "loss": 1.1473, + "step": 5176 + }, + { + "epoch": 0.5, + "grad_norm": 0.2757139090006968, + "learning_rate": 0.0001793862665406324, + "loss": 1.1183, + "step": 5177 + }, + { + "epoch": 0.5, + "grad_norm": 0.27386836919563157, + "learning_rate": 0.00017937664576111198, + "loss": 1.0154, + "step": 5178 + }, + { + "epoch": 0.5, + "grad_norm": 0.2918051909413474, + "learning_rate": 0.00017936702299514105, + "loss": 1.1863, + "step": 5179 + }, + { + "epoch": 0.5, + "grad_norm": 0.2964159774568591, + "learning_rate": 0.00017935739824296052, + "loss": 1.0043, + "step": 5180 + }, + { + "epoch": 0.5, + "grad_norm": 0.31639186034892197, + "learning_rate": 0.0001793477715048112, + "loss": 1.1946, + "step": 5181 + }, + { + "epoch": 0.5, + "grad_norm": 0.3060832768680089, + "learning_rate": 0.00017933814278093407, + "loss": 0.9377, + "step": 5182 + }, + { + "epoch": 0.5, + "grad_norm": 0.3453844813102695, + "learning_rate": 0.00017932851207157002, + "loss": 1.0465, + "step": 5183 + }, + { + "epoch": 0.5, + "grad_norm": 0.25905466276473943, + "learning_rate": 0.0001793188793769601, + "loss": 1.1672, + "step": 5184 + }, + { + "epoch": 0.5, + "grad_norm": 0.29227551232364024, + "learning_rate": 0.00017930924469734537, + "loss": 1.0898, + "step": 5185 + }, + { + "epoch": 0.5, + "grad_norm": 0.2513616979563531, + "learning_rate": 0.00017929960803296697, + "loss": 1.0656, + "step": 5186 + }, + { + "epoch": 0.5, + "grad_norm": 0.275133905403024, + "learning_rate": 0.00017928996938406603, + "loss": 1.002, + "step": 5187 + }, + { + "epoch": 0.5, + "grad_norm": 0.28951906430321944, + "learning_rate": 0.00017928032875088375, + "loss": 1.1197, + "step": 5188 + }, + { + "epoch": 0.5, + "grad_norm": 0.26163177751335326, + "learning_rate": 0.00017927068613366145, + "loss": 1.1226, + "step": 5189 + }, + { + "epoch": 0.5, + "grad_norm": 0.308746805614135, + "learning_rate": 0.00017926104153264042, + "loss": 1.0741, + "step": 5190 + }, + { + "epoch": 0.5, + "grad_norm": 0.29075350653382204, + "learning_rate": 0.00017925139494806198, + "loss": 1.1404, + "step": 5191 + }, + { + "epoch": 0.5, + "grad_norm": 0.2760956753624024, + "learning_rate": 0.0001792417463801676, + "loss": 1.132, + "step": 5192 + }, + { + "epoch": 0.5, + "grad_norm": 0.30467872599141255, + "learning_rate": 0.0001792320958291987, + "loss": 1.1227, + "step": 5193 + }, + { + "epoch": 0.5, + "grad_norm": 0.269599432070437, + "learning_rate": 0.0001792224432953968, + "loss": 1.1915, + "step": 5194 + }, + { + "epoch": 0.5, + "grad_norm": 0.2540692313113988, + "learning_rate": 0.00017921278877900348, + "loss": 1.1215, + "step": 5195 + }, + { + "epoch": 0.5, + "grad_norm": 0.25608501554671015, + "learning_rate": 0.0001792031322802603, + "loss": 1.0529, + "step": 5196 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646934249716702, + "learning_rate": 0.00017919347379940904, + "loss": 1.1269, + "step": 5197 + }, + { + "epoch": 0.5, + "grad_norm": 0.2458734180042231, + "learning_rate": 0.00017918381333669126, + "loss": 1.0294, + "step": 5198 + }, + { + "epoch": 0.5, + "grad_norm": 0.2629321130779039, + "learning_rate": 0.0001791741508923488, + "loss": 1.0545, + "step": 5199 + }, + { + "epoch": 0.5, + "grad_norm": 0.29932796860263766, + "learning_rate": 0.00017916448646662346, + "loss": 1.1029, + "step": 5200 + }, + { + "epoch": 0.5, + "grad_norm": 0.2794636942689881, + "learning_rate": 0.00017915482005975708, + "loss": 0.9605, + "step": 5201 + }, + { + "epoch": 0.5, + "grad_norm": 0.2805023432544276, + "learning_rate": 0.00017914515167199158, + "loss": 1.0897, + "step": 5202 + }, + { + "epoch": 0.5, + "grad_norm": 0.25784189464219454, + "learning_rate": 0.00017913548130356894, + "loss": 1.013, + "step": 5203 + }, + { + "epoch": 0.5, + "grad_norm": 0.31553870897854386, + "learning_rate": 0.00017912580895473114, + "loss": 1.1689, + "step": 5204 + }, + { + "epoch": 0.5, + "grad_norm": 0.31827443288264134, + "learning_rate": 0.00017911613462572024, + "loss": 1.0521, + "step": 5205 + }, + { + "epoch": 0.5, + "grad_norm": 0.3025879844146086, + "learning_rate": 0.00017910645831677836, + "loss": 0.997, + "step": 5206 + }, + { + "epoch": 0.5, + "grad_norm": 0.26057117438515826, + "learning_rate": 0.0001790967800281476, + "loss": 1.1074, + "step": 5207 + }, + { + "epoch": 0.5, + "grad_norm": 0.2611725357612145, + "learning_rate": 0.00017908709976007024, + "loss": 1.0784, + "step": 5208 + }, + { + "epoch": 0.5, + "grad_norm": 0.2344376304389919, + "learning_rate": 0.0001790774175127885, + "loss": 1.0707, + "step": 5209 + }, + { + "epoch": 0.5, + "grad_norm": 0.29122733484693347, + "learning_rate": 0.00017906773328654472, + "loss": 1.1213, + "step": 5210 + }, + { + "epoch": 0.5, + "grad_norm": 0.2759187534926237, + "learning_rate": 0.00017905804708158118, + "loss": 1.0693, + "step": 5211 + }, + { + "epoch": 0.5, + "grad_norm": 0.2777254558995927, + "learning_rate": 0.00017904835889814033, + "loss": 1.0366, + "step": 5212 + }, + { + "epoch": 0.5, + "grad_norm": 0.2714206402525035, + "learning_rate": 0.00017903866873646463, + "loss": 1.1107, + "step": 5213 + }, + { + "epoch": 0.5, + "grad_norm": 0.3012099609994761, + "learning_rate": 0.0001790289765967966, + "loss": 1.0848, + "step": 5214 + }, + { + "epoch": 0.5, + "grad_norm": 0.3032845339156945, + "learning_rate": 0.00017901928247937872, + "loss": 1.1453, + "step": 5215 + }, + { + "epoch": 0.5, + "grad_norm": 0.26396959858562596, + "learning_rate": 0.00017900958638445365, + "loss": 1.1375, + "step": 5216 + }, + { + "epoch": 0.5, + "grad_norm": 0.2727644440686032, + "learning_rate": 0.00017899988831226402, + "loss": 1.0937, + "step": 5217 + }, + { + "epoch": 0.5, + "grad_norm": 0.3053107974870675, + "learning_rate": 0.00017899018826305252, + "loss": 1.1792, + "step": 5218 + }, + { + "epoch": 0.5, + "grad_norm": 0.26793488874015303, + "learning_rate": 0.00017898048623706195, + "loss": 1.1571, + "step": 5219 + }, + { + "epoch": 0.5, + "grad_norm": 0.2597411208953679, + "learning_rate": 0.00017897078223453504, + "loss": 1.052, + "step": 5220 + }, + { + "epoch": 0.5, + "grad_norm": 0.31195341891857165, + "learning_rate": 0.0001789610762557147, + "loss": 1.0902, + "step": 5221 + }, + { + "epoch": 0.5, + "grad_norm": 0.2744676529459866, + "learning_rate": 0.0001789513683008438, + "loss": 1.0829, + "step": 5222 + }, + { + "epoch": 0.5, + "grad_norm": 0.26093271474594537, + "learning_rate": 0.00017894165837016528, + "loss": 0.9847, + "step": 5223 + }, + { + "epoch": 0.5, + "grad_norm": 0.290962254361299, + "learning_rate": 0.00017893194646392214, + "loss": 1.0686, + "step": 5224 + }, + { + "epoch": 0.5, + "grad_norm": 0.26375436627341603, + "learning_rate": 0.00017892223258235746, + "loss": 1.1454, + "step": 5225 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646502633063915, + "learning_rate": 0.00017891251672571428, + "loss": 1.0035, + "step": 5226 + }, + { + "epoch": 0.5, + "eval_loss": 1.130812406539917, + "eval_runtime": 4229.0908, + "eval_samples_per_second": 19.772, + "eval_steps_per_second": 2.472, + "step": 5226 + }, + { + "epoch": 0.5, + "grad_norm": 0.27086856649700053, + "learning_rate": 0.00017890279889423577, + "loss": 1.2004, + "step": 5227 + }, + { + "epoch": 0.5, + "grad_norm": 0.26907032517761026, + "learning_rate": 0.00017889307908816514, + "loss": 1.1269, + "step": 5228 + }, + { + "epoch": 0.5, + "grad_norm": 0.2559621504798168, + "learning_rate": 0.00017888335730774563, + "loss": 1.0099, + "step": 5229 + }, + { + "epoch": 0.5, + "grad_norm": 0.2886629743550709, + "learning_rate": 0.00017887363355322054, + "loss": 1.0698, + "step": 5230 + }, + { + "epoch": 0.5, + "grad_norm": 0.3107523248743581, + "learning_rate": 0.00017886390782483318, + "loss": 1.086, + "step": 5231 + }, + { + "epoch": 0.5, + "grad_norm": 0.2731347287540774, + "learning_rate": 0.00017885418012282696, + "loss": 1.1274, + "step": 5232 + }, + { + "epoch": 0.5, + "grad_norm": 0.3032030305104551, + "learning_rate": 0.00017884445044744532, + "loss": 1.0157, + "step": 5233 + }, + { + "epoch": 0.5, + "grad_norm": 0.29204175835395896, + "learning_rate": 0.00017883471879893176, + "loss": 1.0925, + "step": 5234 + }, + { + "epoch": 0.5, + "grad_norm": 0.27875381267269567, + "learning_rate": 0.00017882498517752984, + "loss": 1.0, + "step": 5235 + }, + { + "epoch": 0.5, + "grad_norm": 0.2744649218236222, + "learning_rate": 0.00017881524958348311, + "loss": 1.1309, + "step": 5236 + }, + { + "epoch": 0.5, + "grad_norm": 0.2638597475413694, + "learning_rate": 0.00017880551201703522, + "loss": 1.1368, + "step": 5237 + }, + { + "epoch": 0.5, + "grad_norm": 0.29740497162072044, + "learning_rate": 0.00017879577247842984, + "loss": 1.163, + "step": 5238 + }, + { + "epoch": 0.5, + "grad_norm": 0.2792113343183714, + "learning_rate": 0.00017878603096791078, + "loss": 0.9453, + "step": 5239 + }, + { + "epoch": 0.5, + "grad_norm": 0.3252881204864866, + "learning_rate": 0.00017877628748572176, + "loss": 0.9687, + "step": 5240 + }, + { + "epoch": 0.5, + "grad_norm": 0.2993197408644714, + "learning_rate": 0.00017876654203210666, + "loss": 1.1889, + "step": 5241 + }, + { + "epoch": 0.5, + "grad_norm": 0.25588679975602296, + "learning_rate": 0.0001787567946073093, + "loss": 1.0073, + "step": 5242 + }, + { + "epoch": 0.5, + "grad_norm": 0.25844292628782845, + "learning_rate": 0.00017874704521157368, + "loss": 1.1023, + "step": 5243 + }, + { + "epoch": 0.5, + "grad_norm": 0.2701484124438951, + "learning_rate": 0.00017873729384514374, + "loss": 1.1446, + "step": 5244 + }, + { + "epoch": 0.5, + "grad_norm": 0.28163616631177285, + "learning_rate": 0.00017872754050826358, + "loss": 1.1451, + "step": 5245 + }, + { + "epoch": 0.5, + "grad_norm": 0.24357842637206142, + "learning_rate": 0.00017871778520117722, + "loss": 1.0879, + "step": 5246 + }, + { + "epoch": 0.5, + "grad_norm": 0.2854538423239079, + "learning_rate": 0.0001787080279241288, + "loss": 1.1346, + "step": 5247 + }, + { + "epoch": 0.5, + "grad_norm": 0.28966646255088585, + "learning_rate": 0.00017869826867736253, + "loss": 1.1342, + "step": 5248 + }, + { + "epoch": 0.5, + "grad_norm": 0.269137927932761, + "learning_rate": 0.0001786885074611226, + "loss": 1.2118, + "step": 5249 + }, + { + "epoch": 0.5, + "grad_norm": 0.24965315368924026, + "learning_rate": 0.00017867874427565336, + "loss": 1.0445, + "step": 5250 + }, + { + "epoch": 0.5, + "grad_norm": 0.3268586937562193, + "learning_rate": 0.00017866897912119907, + "loss": 1.0391, + "step": 5251 + }, + { + "epoch": 0.5, + "grad_norm": 0.2628697700403388, + "learning_rate": 0.00017865921199800415, + "loss": 1.0788, + "step": 5252 + }, + { + "epoch": 0.5, + "grad_norm": 0.25466346365515374, + "learning_rate": 0.00017864944290631301, + "loss": 1.1108, + "step": 5253 + }, + { + "epoch": 0.5, + "grad_norm": 0.2785430342878654, + "learning_rate": 0.00017863967184637014, + "loss": 0.9847, + "step": 5254 + }, + { + "epoch": 0.5, + "grad_norm": 0.2821653592665947, + "learning_rate": 0.00017862989881842003, + "loss": 1.1659, + "step": 5255 + }, + { + "epoch": 0.5, + "grad_norm": 0.28381789349374786, + "learning_rate": 0.0001786201238227073, + "loss": 1.0368, + "step": 5256 + }, + { + "epoch": 0.5, + "grad_norm": 0.28526194687906037, + "learning_rate": 0.00017861034685947658, + "loss": 1.0789, + "step": 5257 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646173880017078, + "learning_rate": 0.0001786005679289725, + "loss": 1.0564, + "step": 5258 + }, + { + "epoch": 0.5, + "grad_norm": 0.23710824870546326, + "learning_rate": 0.0001785907870314398, + "loss": 0.9625, + "step": 5259 + }, + { + "epoch": 0.5, + "grad_norm": 0.26259679119084095, + "learning_rate": 0.0001785810041671233, + "loss": 1.1021, + "step": 5260 + }, + { + "epoch": 0.5, + "grad_norm": 0.2709420502635169, + "learning_rate": 0.00017857121933626777, + "loss": 1.0062, + "step": 5261 + }, + { + "epoch": 0.5, + "grad_norm": 0.2613609827266697, + "learning_rate": 0.0001785614325391181, + "loss": 1.0678, + "step": 5262 + }, + { + "epoch": 0.5, + "grad_norm": 0.28813131561794453, + "learning_rate": 0.00017855164377591918, + "loss": 0.9172, + "step": 5263 + }, + { + "epoch": 0.5, + "grad_norm": 0.27234699286481917, + "learning_rate": 0.000178541853046916, + "loss": 1.1471, + "step": 5264 + }, + { + "epoch": 0.5, + "grad_norm": 0.2506386238760721, + "learning_rate": 0.0001785320603523536, + "loss": 1.0259, + "step": 5265 + }, + { + "epoch": 0.5, + "grad_norm": 0.2540092486293724, + "learning_rate": 0.00017852226569247708, + "loss": 1.0877, + "step": 5266 + }, + { + "epoch": 0.5, + "grad_norm": 0.29602747120892153, + "learning_rate": 0.00017851246906753145, + "loss": 1.109, + "step": 5267 + }, + { + "epoch": 0.5, + "grad_norm": 0.2992977885795131, + "learning_rate": 0.00017850267047776197, + "loss": 1.0069, + "step": 5268 + }, + { + "epoch": 0.5, + "grad_norm": 0.28770619817428295, + "learning_rate": 0.0001784928699234138, + "loss": 1.0456, + "step": 5269 + }, + { + "epoch": 0.5, + "grad_norm": 0.2621648485300015, + "learning_rate": 0.00017848306740473227, + "loss": 1.0159, + "step": 5270 + }, + { + "epoch": 0.5, + "grad_norm": 0.27475331658348695, + "learning_rate": 0.00017847326292196261, + "loss": 1.026, + "step": 5271 + }, + { + "epoch": 0.5, + "grad_norm": 0.2883171040107314, + "learning_rate": 0.00017846345647535026, + "loss": 1.0636, + "step": 5272 + }, + { + "epoch": 0.5, + "grad_norm": 0.2679836068828528, + "learning_rate": 0.0001784536480651406, + "loss": 1.1411, + "step": 5273 + }, + { + "epoch": 0.5, + "grad_norm": 0.2718315189359781, + "learning_rate": 0.00017844383769157905, + "loss": 1.0714, + "step": 5274 + }, + { + "epoch": 0.5, + "grad_norm": 0.2679104057163504, + "learning_rate": 0.0001784340253549112, + "loss": 1.0169, + "step": 5275 + }, + { + "epoch": 0.5, + "grad_norm": 0.28568786629626025, + "learning_rate": 0.00017842421105538256, + "loss": 1.0771, + "step": 5276 + }, + { + "epoch": 0.5, + "grad_norm": 0.2548173055279772, + "learning_rate": 0.00017841439479323877, + "loss": 1.1359, + "step": 5277 + }, + { + "epoch": 0.5, + "grad_norm": 0.24999339748919044, + "learning_rate": 0.00017840457656872544, + "loss": 1.0132, + "step": 5278 + }, + { + "epoch": 0.51, + "grad_norm": 0.3003717665696886, + "learning_rate": 0.00017839475638208832, + "loss": 1.0838, + "step": 5279 + }, + { + "epoch": 0.51, + "grad_norm": 0.31887733553865744, + "learning_rate": 0.00017838493423357314, + "loss": 1.0885, + "step": 5280 + }, + { + "epoch": 0.51, + "grad_norm": 0.2625967483202807, + "learning_rate": 0.00017837511012342572, + "loss": 1.1534, + "step": 5281 + }, + { + "epoch": 0.51, + "grad_norm": 0.31614913625651914, + "learning_rate": 0.0001783652840518919, + "loss": 1.0403, + "step": 5282 + }, + { + "epoch": 0.51, + "grad_norm": 0.26033042065792134, + "learning_rate": 0.00017835545601921764, + "loss": 1.0581, + "step": 5283 + }, + { + "epoch": 0.51, + "grad_norm": 0.2856070132728536, + "learning_rate": 0.00017834562602564883, + "loss": 1.0107, + "step": 5284 + }, + { + "epoch": 0.51, + "grad_norm": 0.2536339900536725, + "learning_rate": 0.00017833579407143147, + "loss": 0.9525, + "step": 5285 + }, + { + "epoch": 0.51, + "grad_norm": 0.26624532509069715, + "learning_rate": 0.00017832596015681165, + "loss": 1.0309, + "step": 5286 + }, + { + "epoch": 0.51, + "grad_norm": 0.26639398423968824, + "learning_rate": 0.00017831612428203543, + "loss": 1.103, + "step": 5287 + }, + { + "epoch": 0.51, + "grad_norm": 0.2854143682160096, + "learning_rate": 0.00017830628644734898, + "loss": 1.1898, + "step": 5288 + }, + { + "epoch": 0.51, + "grad_norm": 0.2647981085190368, + "learning_rate": 0.0001782964466529985, + "loss": 1.0559, + "step": 5289 + }, + { + "epoch": 0.51, + "grad_norm": 0.2692642057535311, + "learning_rate": 0.00017828660489923025, + "loss": 1.1121, + "step": 5290 + }, + { + "epoch": 0.51, + "grad_norm": 0.2790653912792434, + "learning_rate": 0.00017827676118629054, + "loss": 1.119, + "step": 5291 + }, + { + "epoch": 0.51, + "grad_norm": 0.2731285711487075, + "learning_rate": 0.00017826691551442564, + "loss": 1.1326, + "step": 5292 + }, + { + "epoch": 0.51, + "grad_norm": 0.27848520893646905, + "learning_rate": 0.000178257067883882, + "loss": 1.0876, + "step": 5293 + }, + { + "epoch": 0.51, + "grad_norm": 0.27766691943147104, + "learning_rate": 0.00017824721829490608, + "loss": 1.0075, + "step": 5294 + }, + { + "epoch": 0.51, + "grad_norm": 0.24812476086316548, + "learning_rate": 0.00017823736674774432, + "loss": 0.9859, + "step": 5295 + }, + { + "epoch": 0.51, + "grad_norm": 0.24136690327253105, + "learning_rate": 0.00017822751324264328, + "loss": 0.9295, + "step": 5296 + }, + { + "epoch": 0.51, + "grad_norm": 0.22887560800126114, + "learning_rate": 0.00017821765777984957, + "loss": 1.1376, + "step": 5297 + }, + { + "epoch": 0.51, + "grad_norm": 0.2746004567986452, + "learning_rate": 0.0001782078003596098, + "loss": 1.0057, + "step": 5298 + }, + { + "epoch": 0.51, + "grad_norm": 0.2425791039008265, + "learning_rate": 0.0001781979409821707, + "loss": 1.1405, + "step": 5299 + }, + { + "epoch": 0.51, + "grad_norm": 0.2718956254441623, + "learning_rate": 0.00017818807964777898, + "loss": 1.0884, + "step": 5300 + }, + { + "epoch": 0.51, + "grad_norm": 0.2616875209648741, + "learning_rate": 0.0001781782163566814, + "loss": 1.0947, + "step": 5301 + }, + { + "epoch": 0.51, + "grad_norm": 0.24842391055912055, + "learning_rate": 0.00017816835110912485, + "loss": 0.9207, + "step": 5302 + }, + { + "epoch": 0.51, + "grad_norm": 0.2747180515251053, + "learning_rate": 0.00017815848390535617, + "loss": 0.9877, + "step": 5303 + }, + { + "epoch": 0.51, + "grad_norm": 0.2875077951068339, + "learning_rate": 0.00017814861474562232, + "loss": 1.0289, + "step": 5304 + }, + { + "epoch": 0.51, + "grad_norm": 0.2763116859597204, + "learning_rate": 0.00017813874363017027, + "loss": 1.0357, + "step": 5305 + }, + { + "epoch": 0.51, + "grad_norm": 0.28069395829776694, + "learning_rate": 0.00017812887055924703, + "loss": 1.1048, + "step": 5306 + }, + { + "epoch": 0.51, + "grad_norm": 0.30047513289045635, + "learning_rate": 0.00017811899553309975, + "loss": 1.0333, + "step": 5307 + }, + { + "epoch": 0.51, + "grad_norm": 0.2650398736977853, + "learning_rate": 0.00017810911855197547, + "loss": 1.0565, + "step": 5308 + }, + { + "epoch": 0.51, + "grad_norm": 0.24640154980602844, + "learning_rate": 0.0001780992396161214, + "loss": 1.1352, + "step": 5309 + }, + { + "epoch": 0.51, + "grad_norm": 0.28475244026032503, + "learning_rate": 0.00017808935872578482, + "loss": 1.0479, + "step": 5310 + }, + { + "epoch": 0.51, + "grad_norm": 0.2756255403166274, + "learning_rate": 0.00017807947588121295, + "loss": 1.0636, + "step": 5311 + }, + { + "epoch": 0.51, + "grad_norm": 0.3137323153824364, + "learning_rate": 0.00017806959108265308, + "loss": 1.2068, + "step": 5312 + }, + { + "epoch": 0.51, + "grad_norm": 0.2842470117681127, + "learning_rate": 0.00017805970433035266, + "loss": 1.229, + "step": 5313 + }, + { + "epoch": 0.51, + "grad_norm": 0.2981813888336136, + "learning_rate": 0.00017804981562455908, + "loss": 1.1881, + "step": 5314 + }, + { + "epoch": 0.51, + "grad_norm": 0.28279186295473696, + "learning_rate": 0.00017803992496551982, + "loss": 1.0685, + "step": 5315 + }, + { + "epoch": 0.51, + "grad_norm": 0.2702456382826126, + "learning_rate": 0.0001780300323534824, + "loss": 0.9998, + "step": 5316 + }, + { + "epoch": 0.51, + "grad_norm": 0.276914892943855, + "learning_rate": 0.00017802013778869436, + "loss": 1.0531, + "step": 5317 + }, + { + "epoch": 0.51, + "grad_norm": 0.27960841961549454, + "learning_rate": 0.0001780102412714033, + "loss": 1.1359, + "step": 5318 + }, + { + "epoch": 0.51, + "grad_norm": 0.25797705830410184, + "learning_rate": 0.00017800034280185699, + "loss": 1.0185, + "step": 5319 + }, + { + "epoch": 0.51, + "grad_norm": 0.2857057108150563, + "learning_rate": 0.00017799044238030307, + "loss": 1.0631, + "step": 5320 + }, + { + "epoch": 0.51, + "grad_norm": 0.25584721058778875, + "learning_rate": 0.0001779805400069893, + "loss": 0.987, + "step": 5321 + }, + { + "epoch": 0.51, + "grad_norm": 0.2325654159439427, + "learning_rate": 0.0001779706356821635, + "loss": 1.0651, + "step": 5322 + }, + { + "epoch": 0.51, + "grad_norm": 0.2970569353859898, + "learning_rate": 0.00017796072940607353, + "loss": 1.13, + "step": 5323 + }, + { + "epoch": 0.51, + "grad_norm": 0.2929022741608403, + "learning_rate": 0.00017795082117896734, + "loss": 1.0719, + "step": 5324 + }, + { + "epoch": 0.51, + "grad_norm": 0.2609795745084068, + "learning_rate": 0.00017794091100109283, + "loss": 1.0067, + "step": 5325 + }, + { + "epoch": 0.51, + "grad_norm": 0.30762587611696396, + "learning_rate": 0.0001779309988726981, + "loss": 1.147, + "step": 5326 + }, + { + "epoch": 0.51, + "grad_norm": 0.2661981298944252, + "learning_rate": 0.00017792108479403106, + "loss": 1.0734, + "step": 5327 + }, + { + "epoch": 0.51, + "grad_norm": 0.2764342632822514, + "learning_rate": 0.00017791116876533994, + "loss": 1.1101, + "step": 5328 + }, + { + "epoch": 0.51, + "grad_norm": 0.24938502770088783, + "learning_rate": 0.00017790125078687288, + "loss": 1.0666, + "step": 5329 + }, + { + "epoch": 0.51, + "grad_norm": 0.3336408049693267, + "learning_rate": 0.000177891330858878, + "loss": 1.0902, + "step": 5330 + }, + { + "epoch": 0.51, + "grad_norm": 0.3218184473435925, + "learning_rate": 0.00017788140898160367, + "loss": 1.0904, + "step": 5331 + }, + { + "epoch": 0.51, + "grad_norm": 0.2789740899659255, + "learning_rate": 0.0001778714851552981, + "loss": 1.0243, + "step": 5332 + }, + { + "epoch": 0.51, + "grad_norm": 0.266597050786924, + "learning_rate": 0.00017786155938020968, + "loss": 1.0069, + "step": 5333 + }, + { + "epoch": 0.51, + "grad_norm": 0.28223064418935384, + "learning_rate": 0.00017785163165658685, + "loss": 1.1401, + "step": 5334 + }, + { + "epoch": 0.51, + "grad_norm": 0.272562171370601, + "learning_rate": 0.00017784170198467797, + "loss": 1.0984, + "step": 5335 + }, + { + "epoch": 0.51, + "grad_norm": 0.23364083669723149, + "learning_rate": 0.00017783177036473155, + "loss": 1.0585, + "step": 5336 + }, + { + "epoch": 0.51, + "grad_norm": 0.2423799752683352, + "learning_rate": 0.0001778218367969962, + "loss": 0.9586, + "step": 5337 + }, + { + "epoch": 0.51, + "grad_norm": 0.28056034525742624, + "learning_rate": 0.00017781190128172045, + "loss": 1.0374, + "step": 5338 + }, + { + "epoch": 0.51, + "grad_norm": 0.31575138951832793, + "learning_rate": 0.000177801963819153, + "loss": 1.1025, + "step": 5339 + }, + { + "epoch": 0.51, + "grad_norm": 0.2756564243357627, + "learning_rate": 0.00017779202440954247, + "loss": 1.073, + "step": 5340 + }, + { + "epoch": 0.51, + "grad_norm": 0.2855725418706688, + "learning_rate": 0.00017778208305313766, + "loss": 0.9841, + "step": 5341 + }, + { + "epoch": 0.51, + "grad_norm": 0.2622842600538124, + "learning_rate": 0.00017777213975018734, + "loss": 0.9923, + "step": 5342 + }, + { + "epoch": 0.51, + "grad_norm": 0.27868325397356525, + "learning_rate": 0.00017776219450094032, + "loss": 1.0104, + "step": 5343 + }, + { + "epoch": 0.51, + "grad_norm": 0.2910306279110282, + "learning_rate": 0.00017775224730564554, + "loss": 0.9543, + "step": 5344 + }, + { + "epoch": 0.51, + "grad_norm": 0.30979629764973976, + "learning_rate": 0.0001777422981645519, + "loss": 1.1145, + "step": 5345 + }, + { + "epoch": 0.51, + "grad_norm": 0.30299227741795537, + "learning_rate": 0.00017773234707790838, + "loss": 1.0723, + "step": 5346 + }, + { + "epoch": 0.51, + "grad_norm": 0.2925383103599133, + "learning_rate": 0.00017772239404596402, + "loss": 1.1494, + "step": 5347 + }, + { + "epoch": 0.51, + "grad_norm": 0.25939612879354135, + "learning_rate": 0.00017771243906896793, + "loss": 1.0362, + "step": 5348 + }, + { + "epoch": 0.51, + "grad_norm": 0.292894121185352, + "learning_rate": 0.00017770248214716918, + "loss": 1.0974, + "step": 5349 + }, + { + "epoch": 0.51, + "grad_norm": 0.2934306234314348, + "learning_rate": 0.000177692523280817, + "loss": 1.0471, + "step": 5350 + }, + { + "epoch": 0.51, + "grad_norm": 0.2620038040723942, + "learning_rate": 0.0001776825624701606, + "loss": 1.05, + "step": 5351 + }, + { + "epoch": 0.51, + "grad_norm": 0.258906223183488, + "learning_rate": 0.00017767259971544923, + "loss": 1.0016, + "step": 5352 + }, + { + "epoch": 0.51, + "grad_norm": 0.2693727580254897, + "learning_rate": 0.00017766263501693222, + "loss": 1.1004, + "step": 5353 + }, + { + "epoch": 0.51, + "grad_norm": 0.2656780441277641, + "learning_rate": 0.000177652668374859, + "loss": 1.1314, + "step": 5354 + }, + { + "epoch": 0.51, + "grad_norm": 0.26233985901713786, + "learning_rate": 0.00017764269978947893, + "loss": 1.0327, + "step": 5355 + }, + { + "epoch": 0.51, + "grad_norm": 0.2816892908241542, + "learning_rate": 0.00017763272926104152, + "loss": 1.1384, + "step": 5356 + }, + { + "epoch": 0.51, + "grad_norm": 0.29263486374255826, + "learning_rate": 0.00017762275678979625, + "loss": 1.1649, + "step": 5357 + }, + { + "epoch": 0.51, + "grad_norm": 0.25054193247048423, + "learning_rate": 0.00017761278237599272, + "loss": 0.9422, + "step": 5358 + }, + { + "epoch": 0.51, + "grad_norm": 0.2728514545749119, + "learning_rate": 0.00017760280601988052, + "loss": 1.1264, + "step": 5359 + }, + { + "epoch": 0.51, + "grad_norm": 0.2787897298192036, + "learning_rate": 0.00017759282772170933, + "loss": 1.069, + "step": 5360 + }, + { + "epoch": 0.51, + "grad_norm": 0.28051149945852427, + "learning_rate": 0.00017758284748172889, + "loss": 1.0434, + "step": 5361 + }, + { + "epoch": 0.51, + "grad_norm": 0.2434867215928191, + "learning_rate": 0.0001775728653001889, + "loss": 1.094, + "step": 5362 + }, + { + "epoch": 0.51, + "grad_norm": 0.298894712000934, + "learning_rate": 0.00017756288117733922, + "loss": 1.0858, + "step": 5363 + }, + { + "epoch": 0.51, + "grad_norm": 0.2989152889611858, + "learning_rate": 0.00017755289511342968, + "loss": 1.0862, + "step": 5364 + }, + { + "epoch": 0.51, + "grad_norm": 0.26061563505956564, + "learning_rate": 0.0001775429071087102, + "loss": 1.1229, + "step": 5365 + }, + { + "epoch": 0.51, + "grad_norm": 0.3078317769181313, + "learning_rate": 0.00017753291716343075, + "loss": 1.0443, + "step": 5366 + }, + { + "epoch": 0.51, + "grad_norm": 0.271922412205937, + "learning_rate": 0.00017752292527784132, + "loss": 1.0569, + "step": 5367 + }, + { + "epoch": 0.51, + "grad_norm": 0.2981868268132513, + "learning_rate": 0.00017751293145219194, + "loss": 1.1245, + "step": 5368 + }, + { + "epoch": 0.51, + "grad_norm": 0.28791587214094305, + "learning_rate": 0.00017750293568673275, + "loss": 1.1032, + "step": 5369 + }, + { + "epoch": 0.51, + "grad_norm": 0.2770400531641322, + "learning_rate": 0.00017749293798171388, + "loss": 1.1548, + "step": 5370 + }, + { + "epoch": 0.51, + "grad_norm": 0.31035234976544707, + "learning_rate": 0.00017748293833738554, + "loss": 1.0248, + "step": 5371 + }, + { + "epoch": 0.51, + "grad_norm": 0.32563997547553536, + "learning_rate": 0.000177472936753998, + "loss": 1.0142, + "step": 5372 + }, + { + "epoch": 0.51, + "grad_norm": 0.2965449735119415, + "learning_rate": 0.0001774629332318015, + "loss": 1.074, + "step": 5373 + }, + { + "epoch": 0.51, + "grad_norm": 0.24827461718041824, + "learning_rate": 0.00017745292777104638, + "loss": 1.0601, + "step": 5374 + }, + { + "epoch": 0.51, + "grad_norm": 0.28164100400780073, + "learning_rate": 0.00017744292037198312, + "loss": 1.1411, + "step": 5375 + }, + { + "epoch": 0.51, + "grad_norm": 0.255726016665569, + "learning_rate": 0.00017743291103486207, + "loss": 1.0318, + "step": 5376 + }, + { + "epoch": 0.51, + "grad_norm": 0.25998427104390703, + "learning_rate": 0.0001774228997599338, + "loss": 1.0986, + "step": 5377 + }, + { + "epoch": 0.51, + "grad_norm": 0.28575851348545084, + "learning_rate": 0.00017741288654744874, + "loss": 1.0325, + "step": 5378 + }, + { + "epoch": 0.51, + "grad_norm": 0.31525644009615983, + "learning_rate": 0.0001774028713976576, + "loss": 1.0881, + "step": 5379 + }, + { + "epoch": 0.51, + "grad_norm": 0.27772393103651905, + "learning_rate": 0.00017739285431081093, + "loss": 1.0819, + "step": 5380 + }, + { + "epoch": 0.51, + "grad_norm": 0.28458960847371934, + "learning_rate": 0.00017738283528715944, + "loss": 1.053, + "step": 5381 + }, + { + "epoch": 0.51, + "grad_norm": 0.2579176716986825, + "learning_rate": 0.00017737281432695387, + "loss": 1.0221, + "step": 5382 + }, + { + "epoch": 0.51, + "grad_norm": 0.2682071919326544, + "learning_rate": 0.000177362791430445, + "loss": 1.0632, + "step": 5383 + }, + { + "epoch": 0.52, + "grad_norm": 0.2817647505413598, + "learning_rate": 0.00017735276659788365, + "loss": 1.0892, + "step": 5384 + }, + { + "epoch": 0.52, + "grad_norm": 0.27370042339496314, + "learning_rate": 0.0001773427398295207, + "loss": 1.1051, + "step": 5385 + }, + { + "epoch": 0.52, + "grad_norm": 0.29120157753379705, + "learning_rate": 0.00017733271112560707, + "loss": 1.057, + "step": 5386 + }, + { + "epoch": 0.52, + "grad_norm": 0.274235384389006, + "learning_rate": 0.00017732268048639376, + "loss": 1.1645, + "step": 5387 + }, + { + "epoch": 0.52, + "grad_norm": 0.2554060673031894, + "learning_rate": 0.00017731264791213177, + "loss": 1.1206, + "step": 5388 + }, + { + "epoch": 0.52, + "grad_norm": 0.29038444135208097, + "learning_rate": 0.00017730261340307216, + "loss": 1.0847, + "step": 5389 + }, + { + "epoch": 0.52, + "grad_norm": 0.2927716873164211, + "learning_rate": 0.00017729257695946608, + "loss": 1.1842, + "step": 5390 + }, + { + "epoch": 0.52, + "grad_norm": 0.28575627634335404, + "learning_rate": 0.00017728253858156467, + "loss": 0.9855, + "step": 5391 + }, + { + "epoch": 0.52, + "grad_norm": 0.2741064932408415, + "learning_rate": 0.0001772724982696192, + "loss": 1.0681, + "step": 5392 + }, + { + "epoch": 0.52, + "grad_norm": 0.3221555396964685, + "learning_rate": 0.00017726245602388087, + "loss": 1.1304, + "step": 5393 + }, + { + "epoch": 0.52, + "grad_norm": 0.29940916808744256, + "learning_rate": 0.00017725241184460101, + "loss": 1.1972, + "step": 5394 + }, + { + "epoch": 0.52, + "grad_norm": 0.2555325051925566, + "learning_rate": 0.000177242365732031, + "loss": 1.0663, + "step": 5395 + }, + { + "epoch": 0.52, + "grad_norm": 0.29478956996412253, + "learning_rate": 0.00017723231768642227, + "loss": 1.1792, + "step": 5396 + }, + { + "epoch": 0.52, + "grad_norm": 0.2691856309392673, + "learning_rate": 0.0001772222677080262, + "loss": 1.0935, + "step": 5397 + }, + { + "epoch": 0.52, + "grad_norm": 0.26554481426737436, + "learning_rate": 0.00017721221579709438, + "loss": 1.1013, + "step": 5398 + }, + { + "epoch": 0.52, + "grad_norm": 0.2974199816061973, + "learning_rate": 0.00017720216195387834, + "loss": 1.1026, + "step": 5399 + }, + { + "epoch": 0.52, + "grad_norm": 0.25988903587070844, + "learning_rate": 0.00017719210617862967, + "loss": 0.9989, + "step": 5400 + }, + { + "epoch": 0.52, + "grad_norm": 0.2644737230686996, + "learning_rate": 0.00017718204847160004, + "loss": 1.1928, + "step": 5401 + }, + { + "epoch": 0.52, + "grad_norm": 0.321621116578329, + "learning_rate": 0.0001771719888330411, + "loss": 1.0699, + "step": 5402 + }, + { + "epoch": 0.52, + "grad_norm": 0.3117270874799254, + "learning_rate": 0.00017716192726320468, + "loss": 1.0265, + "step": 5403 + }, + { + "epoch": 0.52, + "grad_norm": 0.26397074260532366, + "learning_rate": 0.0001771518637623425, + "loss": 1.0595, + "step": 5404 + }, + { + "epoch": 0.52, + "grad_norm": 0.26070999892086566, + "learning_rate": 0.00017714179833070646, + "loss": 0.9732, + "step": 5405 + }, + { + "epoch": 0.52, + "grad_norm": 0.29516458483721847, + "learning_rate": 0.00017713173096854846, + "loss": 1.244, + "step": 5406 + }, + { + "epoch": 0.52, + "grad_norm": 0.28006511868535894, + "learning_rate": 0.0001771216616761204, + "loss": 1.0229, + "step": 5407 + }, + { + "epoch": 0.52, + "grad_norm": 0.2785160995661278, + "learning_rate": 0.0001771115904536743, + "loss": 1.0974, + "step": 5408 + }, + { + "epoch": 0.52, + "grad_norm": 0.3039976285784444, + "learning_rate": 0.00017710151730146215, + "loss": 1.1096, + "step": 5409 + }, + { + "epoch": 0.52, + "grad_norm": 0.2725502855772708, + "learning_rate": 0.0001770914422197361, + "loss": 1.0443, + "step": 5410 + }, + { + "epoch": 0.52, + "grad_norm": 0.2968618423089058, + "learning_rate": 0.00017708136520874822, + "loss": 1.0383, + "step": 5411 + }, + { + "epoch": 0.52, + "grad_norm": 0.2787402009547091, + "learning_rate": 0.00017707128626875078, + "loss": 1.1659, + "step": 5412 + }, + { + "epoch": 0.52, + "grad_norm": 0.2873480043549193, + "learning_rate": 0.00017706120539999595, + "loss": 0.9287, + "step": 5413 + }, + { + "epoch": 0.52, + "grad_norm": 0.27057828580034904, + "learning_rate": 0.00017705112260273602, + "loss": 0.9655, + "step": 5414 + }, + { + "epoch": 0.52, + "grad_norm": 0.2705987981572107, + "learning_rate": 0.00017704103787722332, + "loss": 1.1033, + "step": 5415 + }, + { + "epoch": 0.52, + "grad_norm": 0.2883167160040527, + "learning_rate": 0.00017703095122371024, + "loss": 1.1945, + "step": 5416 + }, + { + "epoch": 0.52, + "grad_norm": 0.28040572260172153, + "learning_rate": 0.00017702086264244918, + "loss": 1.1136, + "step": 5417 + }, + { + "epoch": 0.52, + "grad_norm": 0.23953916000277492, + "learning_rate": 0.0001770107721336926, + "loss": 1.127, + "step": 5418 + }, + { + "epoch": 0.52, + "grad_norm": 0.2666697798476311, + "learning_rate": 0.0001770006796976931, + "loss": 1.1178, + "step": 5419 + }, + { + "epoch": 0.52, + "grad_norm": 0.26595583947693385, + "learning_rate": 0.00017699058533470318, + "loss": 1.1537, + "step": 5420 + }, + { + "epoch": 0.52, + "grad_norm": 0.3018520026520265, + "learning_rate": 0.00017698048904497547, + "loss": 1.0672, + "step": 5421 + }, + { + "epoch": 0.52, + "grad_norm": 0.2585809987651865, + "learning_rate": 0.00017697039082876264, + "loss": 0.9036, + "step": 5422 + }, + { + "epoch": 0.52, + "grad_norm": 0.30520095394033386, + "learning_rate": 0.0001769602906863174, + "loss": 1.1266, + "step": 5423 + }, + { + "epoch": 0.52, + "grad_norm": 0.26703193999809494, + "learning_rate": 0.00017695018861789254, + "loss": 1.0113, + "step": 5424 + }, + { + "epoch": 0.52, + "grad_norm": 0.2665069045940571, + "learning_rate": 0.00017694008462374082, + "loss": 1.1435, + "step": 5425 + }, + { + "epoch": 0.52, + "grad_norm": 0.3066957091926963, + "learning_rate": 0.00017692997870411513, + "loss": 1.1477, + "step": 5426 + }, + { + "epoch": 0.52, + "grad_norm": 0.3060426948221189, + "learning_rate": 0.0001769198708592684, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 0.52, + "grad_norm": 0.31035289985606923, + "learning_rate": 0.00017690976108945353, + "loss": 1.1531, + "step": 5428 + }, + { + "epoch": 0.52, + "grad_norm": 0.27105532382356223, + "learning_rate": 0.00017689964939492358, + "loss": 1.1738, + "step": 5429 + }, + { + "epoch": 0.52, + "grad_norm": 0.30441191653566924, + "learning_rate": 0.00017688953577593158, + "loss": 1.272, + "step": 5430 + }, + { + "epoch": 0.52, + "grad_norm": 0.30963973002090833, + "learning_rate": 0.0001768794202327306, + "loss": 1.2062, + "step": 5431 + }, + { + "epoch": 0.52, + "grad_norm": 0.27923640796457394, + "learning_rate": 0.0001768693027655738, + "loss": 1.1393, + "step": 5432 + }, + { + "epoch": 0.52, + "grad_norm": 0.2714745780737401, + "learning_rate": 0.00017685918337471442, + "loss": 1.0229, + "step": 5433 + }, + { + "epoch": 0.52, + "grad_norm": 0.2746763330279501, + "learning_rate": 0.00017684906206040567, + "loss": 1.025, + "step": 5434 + }, + { + "epoch": 0.52, + "grad_norm": 0.25420475121865566, + "learning_rate": 0.0001768389388229008, + "loss": 1.0173, + "step": 5435 + }, + { + "epoch": 0.52, + "grad_norm": 0.23858794911194617, + "learning_rate": 0.00017682881366245322, + "loss": 1.0893, + "step": 5436 + }, + { + "epoch": 0.52, + "grad_norm": 0.30224482212537046, + "learning_rate": 0.0001768186865793163, + "loss": 1.1222, + "step": 5437 + }, + { + "epoch": 0.52, + "grad_norm": 0.23278958691465643, + "learning_rate": 0.00017680855757374345, + "loss": 1.0458, + "step": 5438 + }, + { + "epoch": 0.52, + "grad_norm": 0.26428873295787486, + "learning_rate": 0.0001767984266459882, + "loss": 1.0792, + "step": 5439 + }, + { + "epoch": 0.52, + "grad_norm": 0.2594486178953975, + "learning_rate": 0.00017678829379630406, + "loss": 1.0737, + "step": 5440 + }, + { + "epoch": 0.52, + "grad_norm": 0.2743596761285101, + "learning_rate": 0.0001767781590249446, + "loss": 1.0216, + "step": 5441 + }, + { + "epoch": 0.52, + "grad_norm": 0.3014933928943263, + "learning_rate": 0.00017676802233216346, + "loss": 1.0686, + "step": 5442 + }, + { + "epoch": 0.52, + "grad_norm": 0.30975481238566765, + "learning_rate": 0.00017675788371821432, + "loss": 1.1826, + "step": 5443 + }, + { + "epoch": 0.52, + "grad_norm": 0.24533869531288546, + "learning_rate": 0.00017674774318335085, + "loss": 1.162, + "step": 5444 + }, + { + "epoch": 0.52, + "grad_norm": 0.27791112618813957, + "learning_rate": 0.0001767376007278269, + "loss": 0.9609, + "step": 5445 + }, + { + "epoch": 0.52, + "grad_norm": 0.31278080093016536, + "learning_rate": 0.00017672745635189633, + "loss": 1.1661, + "step": 5446 + }, + { + "epoch": 0.52, + "grad_norm": 0.29874080218471216, + "learning_rate": 0.00017671731005581287, + "loss": 1.1068, + "step": 5447 + }, + { + "epoch": 0.52, + "grad_norm": 0.2569503493201101, + "learning_rate": 0.0001767071618398305, + "loss": 1.0538, + "step": 5448 + }, + { + "epoch": 0.52, + "grad_norm": 0.29826323414906175, + "learning_rate": 0.00017669701170420322, + "loss": 1.1264, + "step": 5449 + }, + { + "epoch": 0.52, + "grad_norm": 0.2812826097473197, + "learning_rate": 0.00017668685964918504, + "loss": 1.0982, + "step": 5450 + }, + { + "epoch": 0.52, + "grad_norm": 0.2973251253473092, + "learning_rate": 0.00017667670567502998, + "loss": 1.0728, + "step": 5451 + }, + { + "epoch": 0.52, + "grad_norm": 0.24818251204984365, + "learning_rate": 0.0001766665497819922, + "loss": 1.0567, + "step": 5452 + }, + { + "epoch": 0.52, + "grad_norm": 0.2884278666490467, + "learning_rate": 0.00017665639197032582, + "loss": 1.0685, + "step": 5453 + }, + { + "epoch": 0.52, + "grad_norm": 0.262999153831761, + "learning_rate": 0.00017664623224028503, + "loss": 0.9473, + "step": 5454 + }, + { + "epoch": 0.52, + "grad_norm": 0.3152269030127414, + "learning_rate": 0.0001766360705921241, + "loss": 1.169, + "step": 5455 + }, + { + "epoch": 0.52, + "grad_norm": 0.25251227559062966, + "learning_rate": 0.00017662590702609737, + "loss": 1.0352, + "step": 5456 + }, + { + "epoch": 0.52, + "grad_norm": 0.32106633074490737, + "learning_rate": 0.00017661574154245914, + "loss": 1.0894, + "step": 5457 + }, + { + "epoch": 0.52, + "grad_norm": 0.2693010524825926, + "learning_rate": 0.00017660557414146384, + "loss": 1.0703, + "step": 5458 + }, + { + "epoch": 0.52, + "grad_norm": 0.33501583797096984, + "learning_rate": 0.0001765954048233659, + "loss": 1.2, + "step": 5459 + }, + { + "epoch": 0.52, + "grad_norm": 0.2766667108631253, + "learning_rate": 0.0001765852335884198, + "loss": 1.1096, + "step": 5460 + }, + { + "epoch": 0.52, + "grad_norm": 0.28642621560137427, + "learning_rate": 0.0001765750604368801, + "loss": 1.0589, + "step": 5461 + }, + { + "epoch": 0.52, + "grad_norm": 0.29446092888521896, + "learning_rate": 0.0001765648853690014, + "loss": 1.0758, + "step": 5462 + }, + { + "epoch": 0.52, + "grad_norm": 0.26574617011082774, + "learning_rate": 0.00017655470838503834, + "loss": 1.0517, + "step": 5463 + }, + { + "epoch": 0.52, + "grad_norm": 0.29380273713342564, + "learning_rate": 0.00017654452948524555, + "loss": 1.1355, + "step": 5464 + }, + { + "epoch": 0.52, + "grad_norm": 0.26610600805445106, + "learning_rate": 0.00017653434866987783, + "loss": 1.156, + "step": 5465 + }, + { + "epoch": 0.52, + "grad_norm": 0.23226425198871892, + "learning_rate": 0.00017652416593918994, + "loss": 1.1446, + "step": 5466 + }, + { + "epoch": 0.52, + "grad_norm": 0.3136944620672886, + "learning_rate": 0.00017651398129343667, + "loss": 1.1359, + "step": 5467 + }, + { + "epoch": 0.52, + "grad_norm": 0.29970255604775625, + "learning_rate": 0.00017650379473287296, + "loss": 1.0718, + "step": 5468 + }, + { + "epoch": 0.52, + "grad_norm": 0.2782035372010567, + "learning_rate": 0.0001764936062577537, + "loss": 1.0908, + "step": 5469 + }, + { + "epoch": 0.52, + "grad_norm": 0.2616477257641661, + "learning_rate": 0.00017648341586833387, + "loss": 0.9107, + "step": 5470 + }, + { + "epoch": 0.52, + "grad_norm": 0.2663523876553537, + "learning_rate": 0.00017647322356486848, + "loss": 0.9981, + "step": 5471 + }, + { + "epoch": 0.52, + "grad_norm": 0.26092658264988006, + "learning_rate": 0.0001764630293476126, + "loss": 1.1115, + "step": 5472 + }, + { + "epoch": 0.52, + "grad_norm": 0.27133782980711774, + "learning_rate": 0.0001764528332168214, + "loss": 1.1506, + "step": 5473 + }, + { + "epoch": 0.52, + "grad_norm": 0.29494184409148894, + "learning_rate": 0.00017644263517274997, + "loss": 1.0381, + "step": 5474 + }, + { + "epoch": 0.52, + "grad_norm": 0.293344464237028, + "learning_rate": 0.00017643243521565355, + "loss": 1.0955, + "step": 5475 + }, + { + "epoch": 0.52, + "grad_norm": 0.3113070820581294, + "learning_rate": 0.0001764222333457874, + "loss": 0.9733, + "step": 5476 + }, + { + "epoch": 0.52, + "grad_norm": 0.2686929294263848, + "learning_rate": 0.00017641202956340685, + "loss": 1.141, + "step": 5477 + }, + { + "epoch": 0.52, + "grad_norm": 0.2963778388036239, + "learning_rate": 0.0001764018238687672, + "loss": 1.049, + "step": 5478 + }, + { + "epoch": 0.52, + "grad_norm": 0.27712005782523735, + "learning_rate": 0.00017639161626212393, + "loss": 0.9666, + "step": 5479 + }, + { + "epoch": 0.52, + "grad_norm": 0.25074497020772385, + "learning_rate": 0.00017638140674373245, + "loss": 1.091, + "step": 5480 + }, + { + "epoch": 0.52, + "grad_norm": 0.27481911625527383, + "learning_rate": 0.00017637119531384822, + "loss": 0.9804, + "step": 5481 + }, + { + "epoch": 0.52, + "grad_norm": 0.30326990661840897, + "learning_rate": 0.00017636098197272687, + "loss": 1.0196, + "step": 5482 + }, + { + "epoch": 0.52, + "grad_norm": 0.2536248159519646, + "learning_rate": 0.00017635076672062395, + "loss": 1.0655, + "step": 5483 + }, + { + "epoch": 0.52, + "grad_norm": 0.24871071887818122, + "learning_rate": 0.0001763405495577951, + "loss": 1.0337, + "step": 5484 + }, + { + "epoch": 0.52, + "grad_norm": 0.27076559378833587, + "learning_rate": 0.00017633033048449607, + "loss": 1.0868, + "step": 5485 + }, + { + "epoch": 0.52, + "grad_norm": 0.26447743099546034, + "learning_rate": 0.00017632010950098247, + "loss": 1.1067, + "step": 5486 + }, + { + "epoch": 0.52, + "grad_norm": 0.2354573802397408, + "learning_rate": 0.00017630988660751018, + "loss": 0.9972, + "step": 5487 + }, + { + "epoch": 0.53, + "grad_norm": 0.31657477736419243, + "learning_rate": 0.00017629966180433503, + "loss": 1.1436, + "step": 5488 + }, + { + "epoch": 0.53, + "grad_norm": 0.31018143648114505, + "learning_rate": 0.0001762894350917129, + "loss": 1.1186, + "step": 5489 + }, + { + "epoch": 0.53, + "grad_norm": 0.2928001553899392, + "learning_rate": 0.00017627920646989971, + "loss": 1.1422, + "step": 5490 + }, + { + "epoch": 0.53, + "grad_norm": 0.30032975451874694, + "learning_rate": 0.00017626897593915142, + "loss": 1.0623, + "step": 5491 + }, + { + "epoch": 0.53, + "grad_norm": 0.3019231833502392, + "learning_rate": 0.0001762587434997241, + "loss": 1.0632, + "step": 5492 + }, + { + "epoch": 0.53, + "grad_norm": 0.2647986946052283, + "learning_rate": 0.0001762485091518738, + "loss": 1.0093, + "step": 5493 + }, + { + "epoch": 0.53, + "grad_norm": 0.2490169986270707, + "learning_rate": 0.0001762382728958566, + "loss": 1.1866, + "step": 5494 + }, + { + "epoch": 0.53, + "grad_norm": 0.27623003094440113, + "learning_rate": 0.00017622803473192874, + "loss": 1.0377, + "step": 5495 + }, + { + "epoch": 0.53, + "grad_norm": 0.29379957830634423, + "learning_rate": 0.0001762177946603464, + "loss": 1.1129, + "step": 5496 + }, + { + "epoch": 0.53, + "grad_norm": 0.26006713944566145, + "learning_rate": 0.00017620755268136584, + "loss": 1.0786, + "step": 5497 + }, + { + "epoch": 0.53, + "grad_norm": 0.2984517115544308, + "learning_rate": 0.00017619730879524337, + "loss": 1.145, + "step": 5498 + }, + { + "epoch": 0.53, + "grad_norm": 0.2634404683614875, + "learning_rate": 0.00017618706300223536, + "loss": 1.0225, + "step": 5499 + }, + { + "epoch": 0.53, + "grad_norm": 0.2770749610740461, + "learning_rate": 0.00017617681530259822, + "loss": 1.0321, + "step": 5500 + }, + { + "epoch": 0.53, + "grad_norm": 0.29909952213865876, + "learning_rate": 0.00017616656569658843, + "loss": 0.9445, + "step": 5501 + }, + { + "epoch": 0.53, + "grad_norm": 0.28919886120746485, + "learning_rate": 0.00017615631418446242, + "loss": 1.0648, + "step": 5502 + }, + { + "epoch": 0.53, + "grad_norm": 0.28680714124802115, + "learning_rate": 0.00017614606076647683, + "loss": 1.1729, + "step": 5503 + }, + { + "epoch": 0.53, + "grad_norm": 0.3018876386617487, + "learning_rate": 0.00017613580544288817, + "loss": 0.9817, + "step": 5504 + }, + { + "epoch": 0.53, + "grad_norm": 0.28707084465144506, + "learning_rate": 0.00017612554821395314, + "loss": 1.1636, + "step": 5505 + }, + { + "epoch": 0.53, + "grad_norm": 0.31719224150490105, + "learning_rate": 0.00017611528907992844, + "loss": 1.0808, + "step": 5506 + }, + { + "epoch": 0.53, + "grad_norm": 0.2625161917732417, + "learning_rate": 0.00017610502804107082, + "loss": 1.1023, + "step": 5507 + }, + { + "epoch": 0.53, + "grad_norm": 0.2872356902170121, + "learning_rate": 0.00017609476509763698, + "loss": 1.2038, + "step": 5508 + }, + { + "epoch": 0.53, + "grad_norm": 0.25936712334845474, + "learning_rate": 0.00017608450024988382, + "loss": 0.9567, + "step": 5509 + }, + { + "epoch": 0.53, + "grad_norm": 0.2791045831669703, + "learning_rate": 0.0001760742334980683, + "loss": 1.1003, + "step": 5510 + }, + { + "epoch": 0.53, + "grad_norm": 0.2647336003050076, + "learning_rate": 0.00017606396484244721, + "loss": 1.1102, + "step": 5511 + }, + { + "epoch": 0.53, + "grad_norm": 0.27574681080006713, + "learning_rate": 0.00017605369428327761, + "loss": 1.048, + "step": 5512 + }, + { + "epoch": 0.53, + "grad_norm": 0.27994961262509854, + "learning_rate": 0.00017604342182081653, + "loss": 0.9866, + "step": 5513 + }, + { + "epoch": 0.53, + "grad_norm": 0.261516937266442, + "learning_rate": 0.000176033147455321, + "loss": 1.0791, + "step": 5514 + }, + { + "epoch": 0.53, + "grad_norm": 0.2995061891654347, + "learning_rate": 0.0001760228711870482, + "loss": 1.1309, + "step": 5515 + }, + { + "epoch": 0.53, + "grad_norm": 0.2820497610571214, + "learning_rate": 0.00017601259301625524, + "loss": 1.0, + "step": 5516 + }, + { + "epoch": 0.53, + "grad_norm": 0.24255160815288854, + "learning_rate": 0.0001760023129431994, + "loss": 1.0015, + "step": 5517 + }, + { + "epoch": 0.53, + "grad_norm": 0.25094239441876454, + "learning_rate": 0.0001759920309681379, + "loss": 1.0133, + "step": 5518 + }, + { + "epoch": 0.53, + "grad_norm": 0.283208208913356, + "learning_rate": 0.00017598174709132803, + "loss": 1.1402, + "step": 5519 + }, + { + "epoch": 0.53, + "grad_norm": 0.2826249410012862, + "learning_rate": 0.00017597146131302722, + "loss": 1.0725, + "step": 5520 + }, + { + "epoch": 0.53, + "grad_norm": 0.28680753301102535, + "learning_rate": 0.00017596117363349282, + "loss": 1.1058, + "step": 5521 + }, + { + "epoch": 0.53, + "grad_norm": 0.2915298750178331, + "learning_rate": 0.00017595088405298234, + "loss": 1.0234, + "step": 5522 + }, + { + "epoch": 0.53, + "grad_norm": 0.29543680871161176, + "learning_rate": 0.00017594059257175325, + "loss": 1.1357, + "step": 5523 + }, + { + "epoch": 0.53, + "grad_norm": 0.2734660068763002, + "learning_rate": 0.0001759302991900631, + "loss": 0.9803, + "step": 5524 + }, + { + "epoch": 0.53, + "grad_norm": 0.29698197284064065, + "learning_rate": 0.0001759200039081695, + "loss": 1.0425, + "step": 5525 + }, + { + "epoch": 0.53, + "grad_norm": 0.28542474538774965, + "learning_rate": 0.00017590970672633007, + "loss": 1.0379, + "step": 5526 + }, + { + "epoch": 0.53, + "grad_norm": 0.2891844662811193, + "learning_rate": 0.00017589940764480252, + "loss": 1.0073, + "step": 5527 + }, + { + "epoch": 0.53, + "grad_norm": 0.28262400994939874, + "learning_rate": 0.00017588910666384462, + "loss": 1.0782, + "step": 5528 + }, + { + "epoch": 0.53, + "grad_norm": 0.2780628600213389, + "learning_rate": 0.00017587880378371412, + "loss": 1.0688, + "step": 5529 + }, + { + "epoch": 0.53, + "grad_norm": 0.2884519639543894, + "learning_rate": 0.00017586849900466883, + "loss": 1.0724, + "step": 5530 + }, + { + "epoch": 0.53, + "grad_norm": 0.2815574093742316, + "learning_rate": 0.00017585819232696675, + "loss": 1.0518, + "step": 5531 + }, + { + "epoch": 0.53, + "grad_norm": 0.28038754616537154, + "learning_rate": 0.00017584788375086565, + "loss": 1.0102, + "step": 5532 + }, + { + "epoch": 0.53, + "grad_norm": 0.2525746581044452, + "learning_rate": 0.00017583757327662363, + "loss": 1.0554, + "step": 5533 + }, + { + "epoch": 0.53, + "grad_norm": 0.2922109980484516, + "learning_rate": 0.00017582726090449867, + "loss": 1.1027, + "step": 5534 + }, + { + "epoch": 0.53, + "grad_norm": 0.2751484254089623, + "learning_rate": 0.00017581694663474886, + "loss": 1.0159, + "step": 5535 + }, + { + "epoch": 0.53, + "grad_norm": 0.26863647079539626, + "learning_rate": 0.00017580663046763231, + "loss": 1.0621, + "step": 5536 + }, + { + "epoch": 0.53, + "grad_norm": 0.2985191792690513, + "learning_rate": 0.00017579631240340716, + "loss": 1.1758, + "step": 5537 + }, + { + "epoch": 0.53, + "grad_norm": 0.27614555889342746, + "learning_rate": 0.00017578599244233168, + "loss": 1.061, + "step": 5538 + }, + { + "epoch": 0.53, + "grad_norm": 0.25905510867782044, + "learning_rate": 0.00017577567058466414, + "loss": 1.1777, + "step": 5539 + }, + { + "epoch": 0.53, + "grad_norm": 0.3278506380359264, + "learning_rate": 0.00017576534683066278, + "loss": 1.1552, + "step": 5540 + }, + { + "epoch": 0.53, + "grad_norm": 0.273031516195577, + "learning_rate": 0.000175755021180586, + "loss": 1.1249, + "step": 5541 + }, + { + "epoch": 0.53, + "grad_norm": 0.29918764475128873, + "learning_rate": 0.00017574469363469222, + "loss": 1.0937, + "step": 5542 + }, + { + "epoch": 0.53, + "grad_norm": 0.2753312938615492, + "learning_rate": 0.00017573436419323986, + "loss": 1.1123, + "step": 5543 + }, + { + "epoch": 0.53, + "grad_norm": 0.3076985822417243, + "learning_rate": 0.00017572403285648743, + "loss": 1.1392, + "step": 5544 + }, + { + "epoch": 0.53, + "grad_norm": 0.2858977995760822, + "learning_rate": 0.00017571369962469352, + "loss": 1.0683, + "step": 5545 + }, + { + "epoch": 0.53, + "grad_norm": 0.2826635982664086, + "learning_rate": 0.00017570336449811667, + "loss": 1.01, + "step": 5546 + }, + { + "epoch": 0.53, + "grad_norm": 0.25331688616473097, + "learning_rate": 0.00017569302747701558, + "loss": 1.0202, + "step": 5547 + }, + { + "epoch": 0.53, + "grad_norm": 0.25299991837835445, + "learning_rate": 0.00017568268856164886, + "loss": 0.9968, + "step": 5548 + }, + { + "epoch": 0.53, + "grad_norm": 0.26907253785105184, + "learning_rate": 0.00017567234775227533, + "loss": 1.0216, + "step": 5549 + }, + { + "epoch": 0.53, + "grad_norm": 0.2638154646751891, + "learning_rate": 0.0001756620050491537, + "loss": 1.169, + "step": 5550 + }, + { + "epoch": 0.53, + "grad_norm": 0.2674914565002557, + "learning_rate": 0.0001756516604525429, + "loss": 1.0112, + "step": 5551 + }, + { + "epoch": 0.53, + "grad_norm": 0.27204031327263634, + "learning_rate": 0.00017564131396270168, + "loss": 1.0392, + "step": 5552 + }, + { + "epoch": 0.53, + "grad_norm": 0.2808788802646591, + "learning_rate": 0.0001756309655798891, + "loss": 1.1142, + "step": 5553 + }, + { + "epoch": 0.53, + "grad_norm": 0.2737824930333972, + "learning_rate": 0.00017562061530436405, + "loss": 1.1227, + "step": 5554 + }, + { + "epoch": 0.53, + "grad_norm": 0.28525600932665507, + "learning_rate": 0.00017561026313638557, + "loss": 1.0276, + "step": 5555 + }, + { + "epoch": 0.53, + "grad_norm": 0.26176420613090035, + "learning_rate": 0.00017559990907621274, + "loss": 1.1253, + "step": 5556 + }, + { + "epoch": 0.53, + "grad_norm": 0.25918345855645814, + "learning_rate": 0.00017558955312410468, + "loss": 1.1035, + "step": 5557 + }, + { + "epoch": 0.53, + "grad_norm": 0.29529919222022755, + "learning_rate": 0.00017557919528032054, + "loss": 1.1018, + "step": 5558 + }, + { + "epoch": 0.53, + "grad_norm": 0.28939151113442757, + "learning_rate": 0.00017556883554511953, + "loss": 1.0248, + "step": 5559 + }, + { + "epoch": 0.53, + "grad_norm": 0.3092935541500706, + "learning_rate": 0.00017555847391876093, + "loss": 1.133, + "step": 5560 + }, + { + "epoch": 0.53, + "grad_norm": 0.268156083959369, + "learning_rate": 0.00017554811040150403, + "loss": 1.1213, + "step": 5561 + }, + { + "epoch": 0.53, + "grad_norm": 0.2621845882914305, + "learning_rate": 0.0001755377449936082, + "loss": 1.0878, + "step": 5562 + }, + { + "epoch": 0.53, + "grad_norm": 0.2661667850236094, + "learning_rate": 0.0001755273776953328, + "loss": 1.0683, + "step": 5563 + }, + { + "epoch": 0.53, + "grad_norm": 0.25877418531921775, + "learning_rate": 0.00017551700850693732, + "loss": 1.1574, + "step": 5564 + }, + { + "epoch": 0.53, + "grad_norm": 0.28288655943020663, + "learning_rate": 0.00017550663742868126, + "loss": 1.1328, + "step": 5565 + }, + { + "epoch": 0.53, + "grad_norm": 0.26185751105253613, + "learning_rate": 0.00017549626446082412, + "loss": 0.973, + "step": 5566 + }, + { + "epoch": 0.53, + "grad_norm": 0.295683213989208, + "learning_rate": 0.0001754858896036255, + "loss": 0.9697, + "step": 5567 + }, + { + "epoch": 0.53, + "grad_norm": 0.3264307309216949, + "learning_rate": 0.0001754755128573451, + "loss": 1.0159, + "step": 5568 + }, + { + "epoch": 0.53, + "grad_norm": 0.2753013510908348, + "learning_rate": 0.00017546513422224253, + "loss": 1.0423, + "step": 5569 + }, + { + "epoch": 0.53, + "grad_norm": 0.25968975647863907, + "learning_rate": 0.00017545475369857755, + "loss": 1.1648, + "step": 5570 + }, + { + "epoch": 0.53, + "grad_norm": 0.2804203410262089, + "learning_rate": 0.00017544437128660993, + "loss": 1.0268, + "step": 5571 + }, + { + "epoch": 0.53, + "grad_norm": 0.31239158563885117, + "learning_rate": 0.0001754339869865995, + "loss": 1.1495, + "step": 5572 + }, + { + "epoch": 0.53, + "grad_norm": 0.2733500389923679, + "learning_rate": 0.00017542360079880615, + "loss": 1.1128, + "step": 5573 + }, + { + "epoch": 0.53, + "grad_norm": 0.24320614149769154, + "learning_rate": 0.00017541321272348978, + "loss": 1.0674, + "step": 5574 + }, + { + "epoch": 0.53, + "grad_norm": 0.3181218743470043, + "learning_rate": 0.00017540282276091039, + "loss": 1.0091, + "step": 5575 + }, + { + "epoch": 0.53, + "grad_norm": 0.3031447060986254, + "learning_rate": 0.00017539243091132793, + "loss": 1.1715, + "step": 5576 + }, + { + "epoch": 0.53, + "grad_norm": 0.34380612538960487, + "learning_rate": 0.00017538203717500252, + "loss": 0.9964, + "step": 5577 + }, + { + "epoch": 0.53, + "grad_norm": 0.2661056584426639, + "learning_rate": 0.00017537164155219428, + "loss": 1.0928, + "step": 5578 + }, + { + "epoch": 0.53, + "grad_norm": 0.26522368104967226, + "learning_rate": 0.0001753612440431633, + "loss": 1.0348, + "step": 5579 + }, + { + "epoch": 0.53, + "grad_norm": 0.28303690003405435, + "learning_rate": 0.00017535084464816985, + "loss": 1.0077, + "step": 5580 + }, + { + "epoch": 0.53, + "grad_norm": 0.29888676081156723, + "learning_rate": 0.00017534044336747418, + "loss": 0.9651, + "step": 5581 + }, + { + "epoch": 0.53, + "grad_norm": 0.23610813701308117, + "learning_rate": 0.00017533004020133653, + "loss": 1.0869, + "step": 5582 + }, + { + "epoch": 0.53, + "grad_norm": 0.2473777150573363, + "learning_rate": 0.00017531963515001725, + "loss": 1.0581, + "step": 5583 + }, + { + "epoch": 0.53, + "grad_norm": 0.2878283161585639, + "learning_rate": 0.00017530922821377683, + "loss": 1.1182, + "step": 5584 + }, + { + "epoch": 0.53, + "grad_norm": 0.2895723267837339, + "learning_rate": 0.0001752988193928756, + "loss": 1.0193, + "step": 5585 + }, + { + "epoch": 0.53, + "grad_norm": 0.25924626472676504, + "learning_rate": 0.00017528840868757413, + "loss": 1.0764, + "step": 5586 + }, + { + "epoch": 0.53, + "grad_norm": 0.2697046584787591, + "learning_rate": 0.00017527799609813287, + "loss": 1.2123, + "step": 5587 + }, + { + "epoch": 0.53, + "grad_norm": 0.2779570564976789, + "learning_rate": 0.00017526758162481247, + "loss": 1.0463, + "step": 5588 + }, + { + "epoch": 0.53, + "grad_norm": 0.2659291393898864, + "learning_rate": 0.00017525716526787353, + "loss": 0.8657, + "step": 5589 + }, + { + "epoch": 0.53, + "grad_norm": 0.24940778956498255, + "learning_rate": 0.00017524674702757676, + "loss": 0.9349, + "step": 5590 + }, + { + "epoch": 0.53, + "grad_norm": 0.3512612443921498, + "learning_rate": 0.00017523632690418281, + "loss": 1.0309, + "step": 5591 + }, + { + "epoch": 0.53, + "grad_norm": 0.27949443291870935, + "learning_rate": 0.0001752259048979525, + "loss": 1.0867, + "step": 5592 + }, + { + "epoch": 0.54, + "grad_norm": 0.25241117168172916, + "learning_rate": 0.00017521548100914668, + "loss": 0.9595, + "step": 5593 + }, + { + "epoch": 0.54, + "grad_norm": 0.31177379926003695, + "learning_rate": 0.00017520505523802615, + "loss": 1.1147, + "step": 5594 + }, + { + "epoch": 0.54, + "grad_norm": 0.3035376452103113, + "learning_rate": 0.00017519462758485186, + "loss": 1.1091, + "step": 5595 + }, + { + "epoch": 0.54, + "grad_norm": 0.28334338339825155, + "learning_rate": 0.00017518419804988473, + "loss": 1.0027, + "step": 5596 + }, + { + "epoch": 0.54, + "grad_norm": 0.27635425273795633, + "learning_rate": 0.00017517376663338583, + "loss": 1.0832, + "step": 5597 + }, + { + "epoch": 0.54, + "grad_norm": 0.27479092014996065, + "learning_rate": 0.00017516333333561615, + "loss": 1.0254, + "step": 5598 + }, + { + "epoch": 0.54, + "grad_norm": 0.2923804540268794, + "learning_rate": 0.00017515289815683683, + "loss": 1.0939, + "step": 5599 + }, + { + "epoch": 0.54, + "grad_norm": 0.2689789333872261, + "learning_rate": 0.000175142461097309, + "loss": 1.0924, + "step": 5600 + }, + { + "epoch": 0.54, + "grad_norm": 0.2712229566662846, + "learning_rate": 0.00017513202215729384, + "loss": 1.1212, + "step": 5601 + }, + { + "epoch": 0.54, + "grad_norm": 0.28499521398979466, + "learning_rate": 0.0001751215813370526, + "loss": 1.1271, + "step": 5602 + }, + { + "epoch": 0.54, + "grad_norm": 0.272591489298431, + "learning_rate": 0.00017511113863684662, + "loss": 1.1602, + "step": 5603 + }, + { + "epoch": 0.54, + "grad_norm": 0.3008741005719719, + "learning_rate": 0.00017510069405693714, + "loss": 1.115, + "step": 5604 + }, + { + "epoch": 0.54, + "grad_norm": 0.2531166951692093, + "learning_rate": 0.00017509024759758561, + "loss": 1.0802, + "step": 5605 + }, + { + "epoch": 0.54, + "grad_norm": 0.24148412845313758, + "learning_rate": 0.00017507979925905347, + "loss": 0.953, + "step": 5606 + }, + { + "epoch": 0.54, + "grad_norm": 0.2534357412942829, + "learning_rate": 0.00017506934904160213, + "loss": 1.0784, + "step": 5607 + }, + { + "epoch": 0.54, + "grad_norm": 0.270455450479634, + "learning_rate": 0.00017505889694549316, + "loss": 1.0495, + "step": 5608 + }, + { + "epoch": 0.54, + "grad_norm": 0.2718743775512695, + "learning_rate": 0.00017504844297098812, + "loss": 1.1477, + "step": 5609 + }, + { + "epoch": 0.54, + "grad_norm": 0.2627508977482193, + "learning_rate": 0.00017503798711834863, + "loss": 0.9724, + "step": 5610 + }, + { + "epoch": 0.54, + "grad_norm": 0.28688718615510195, + "learning_rate": 0.00017502752938783637, + "loss": 1.0391, + "step": 5611 + }, + { + "epoch": 0.54, + "grad_norm": 0.3088622269070572, + "learning_rate": 0.000175017069779713, + "loss": 1.07, + "step": 5612 + }, + { + "epoch": 0.54, + "grad_norm": 0.2519490699405952, + "learning_rate": 0.00017500660829424035, + "loss": 0.9973, + "step": 5613 + }, + { + "epoch": 0.54, + "grad_norm": 0.2593258598031295, + "learning_rate": 0.00017499614493168017, + "loss": 1.1488, + "step": 5614 + }, + { + "epoch": 0.54, + "grad_norm": 0.28609091063527664, + "learning_rate": 0.00017498567969229432, + "loss": 1.1316, + "step": 5615 + }, + { + "epoch": 0.54, + "grad_norm": 0.2876241038098421, + "learning_rate": 0.00017497521257634472, + "loss": 1.142, + "step": 5616 + }, + { + "epoch": 0.54, + "grad_norm": 0.2699264999096545, + "learning_rate": 0.0001749647435840933, + "loss": 1.1133, + "step": 5617 + }, + { + "epoch": 0.54, + "grad_norm": 0.23212212244543703, + "learning_rate": 0.00017495427271580207, + "loss": 1.0177, + "step": 5618 + }, + { + "epoch": 0.54, + "grad_norm": 0.3191035435173965, + "learning_rate": 0.00017494379997173306, + "loss": 1.0753, + "step": 5619 + }, + { + "epoch": 0.54, + "grad_norm": 0.28382503691739164, + "learning_rate": 0.00017493332535214835, + "loss": 1.0277, + "step": 5620 + }, + { + "epoch": 0.54, + "grad_norm": 0.2620982386743405, + "learning_rate": 0.00017492284885731006, + "loss": 1.1087, + "step": 5621 + }, + { + "epoch": 0.54, + "grad_norm": 0.24048768135309437, + "learning_rate": 0.00017491237048748042, + "loss": 1.0179, + "step": 5622 + }, + { + "epoch": 0.54, + "grad_norm": 0.29297621702120474, + "learning_rate": 0.00017490189024292157, + "loss": 1.031, + "step": 5623 + }, + { + "epoch": 0.54, + "grad_norm": 0.24169868093525804, + "learning_rate": 0.00017489140812389591, + "loss": 1.1275, + "step": 5624 + }, + { + "epoch": 0.54, + "grad_norm": 0.3265517682655214, + "learning_rate": 0.00017488092413066566, + "loss": 1.09, + "step": 5625 + }, + { + "epoch": 0.54, + "grad_norm": 0.2889070937315453, + "learning_rate": 0.00017487043826349324, + "loss": 1.0827, + "step": 5626 + }, + { + "epoch": 0.54, + "grad_norm": 0.2616726142921512, + "learning_rate": 0.00017485995052264107, + "loss": 1.1226, + "step": 5627 + }, + { + "epoch": 0.54, + "grad_norm": 0.2823144964930083, + "learning_rate": 0.00017484946090837153, + "loss": 0.9189, + "step": 5628 + }, + { + "epoch": 0.54, + "grad_norm": 0.26247896162782614, + "learning_rate": 0.0001748389694209472, + "loss": 1.0863, + "step": 5629 + }, + { + "epoch": 0.54, + "grad_norm": 0.29529068834973227, + "learning_rate": 0.0001748284760606307, + "loss": 1.1572, + "step": 5630 + }, + { + "epoch": 0.54, + "grad_norm": 0.2725314072487987, + "learning_rate": 0.00017481798082768447, + "loss": 1.0527, + "step": 5631 + }, + { + "epoch": 0.54, + "grad_norm": 0.3268266589015598, + "learning_rate": 0.0001748074837223713, + "loss": 1.1146, + "step": 5632 + }, + { + "epoch": 0.54, + "grad_norm": 0.31412407097134504, + "learning_rate": 0.0001747969847449538, + "loss": 1.0571, + "step": 5633 + }, + { + "epoch": 0.54, + "grad_norm": 0.3129044080305532, + "learning_rate": 0.0001747864838956948, + "loss": 1.1354, + "step": 5634 + }, + { + "epoch": 0.54, + "grad_norm": 0.26484366612789245, + "learning_rate": 0.00017477598117485697, + "loss": 1.0219, + "step": 5635 + }, + { + "epoch": 0.54, + "grad_norm": 0.2881609277060793, + "learning_rate": 0.00017476547658270327, + "loss": 1.0661, + "step": 5636 + }, + { + "epoch": 0.54, + "grad_norm": 0.2785791649184835, + "learning_rate": 0.0001747549701194965, + "loss": 1.009, + "step": 5637 + }, + { + "epoch": 0.54, + "grad_norm": 0.2781943063291394, + "learning_rate": 0.00017474446178549963, + "loss": 1.1618, + "step": 5638 + }, + { + "epoch": 0.54, + "grad_norm": 0.286700570393826, + "learning_rate": 0.00017473395158097566, + "loss": 1.1184, + "step": 5639 + }, + { + "epoch": 0.54, + "grad_norm": 0.332805537003659, + "learning_rate": 0.00017472343950618755, + "loss": 1.0486, + "step": 5640 + }, + { + "epoch": 0.54, + "grad_norm": 0.27971090508186786, + "learning_rate": 0.0001747129255613984, + "loss": 1.1044, + "step": 5641 + }, + { + "epoch": 0.54, + "grad_norm": 0.28836371050426973, + "learning_rate": 0.00017470240974687133, + "loss": 1.0402, + "step": 5642 + }, + { + "epoch": 0.54, + "grad_norm": 0.30716094757551443, + "learning_rate": 0.00017469189206286952, + "loss": 1.113, + "step": 5643 + }, + { + "epoch": 0.54, + "grad_norm": 0.2743312373131173, + "learning_rate": 0.00017468137250965617, + "loss": 1.0299, + "step": 5644 + }, + { + "epoch": 0.54, + "grad_norm": 0.2609975564796487, + "learning_rate": 0.00017467085108749454, + "loss": 1.104, + "step": 5645 + }, + { + "epoch": 0.54, + "grad_norm": 0.29364822283678577, + "learning_rate": 0.0001746603277966479, + "loss": 1.0865, + "step": 5646 + }, + { + "epoch": 0.54, + "grad_norm": 0.2745578477787385, + "learning_rate": 0.00017464980263737968, + "loss": 1.1527, + "step": 5647 + }, + { + "epoch": 0.54, + "grad_norm": 0.26626720698396383, + "learning_rate": 0.00017463927560995321, + "loss": 1.0189, + "step": 5648 + }, + { + "epoch": 0.54, + "grad_norm": 0.27709851029020705, + "learning_rate": 0.000174628746714632, + "loss": 1.1241, + "step": 5649 + }, + { + "epoch": 0.54, + "grad_norm": 0.306265199776124, + "learning_rate": 0.00017461821595167945, + "loss": 1.1816, + "step": 5650 + }, + { + "epoch": 0.54, + "grad_norm": 0.23094636122975018, + "learning_rate": 0.00017460768332135918, + "loss": 1.0002, + "step": 5651 + }, + { + "epoch": 0.54, + "grad_norm": 0.25739454457437366, + "learning_rate": 0.00017459714882393473, + "loss": 1.0748, + "step": 5652 + }, + { + "epoch": 0.54, + "grad_norm": 0.24839067246041194, + "learning_rate": 0.00017458661245966974, + "loss": 1.1297, + "step": 5653 + }, + { + "epoch": 0.54, + "grad_norm": 0.2592947317889122, + "learning_rate": 0.0001745760742288279, + "loss": 1.1097, + "step": 5654 + }, + { + "epoch": 0.54, + "grad_norm": 0.29910161986761424, + "learning_rate": 0.00017456553413167293, + "loss": 1.0156, + "step": 5655 + }, + { + "epoch": 0.54, + "grad_norm": 0.2740550204065586, + "learning_rate": 0.00017455499216846864, + "loss": 1.1618, + "step": 5656 + }, + { + "epoch": 0.54, + "grad_norm": 0.2773348179265606, + "learning_rate": 0.00017454444833947877, + "loss": 1.0988, + "step": 5657 + }, + { + "epoch": 0.54, + "grad_norm": 0.2813065691260808, + "learning_rate": 0.00017453390264496728, + "loss": 0.9826, + "step": 5658 + }, + { + "epoch": 0.54, + "grad_norm": 0.2775051391562762, + "learning_rate": 0.000174523355085198, + "loss": 1.0356, + "step": 5659 + }, + { + "epoch": 0.54, + "grad_norm": 0.28015989674393643, + "learning_rate": 0.00017451280566043492, + "loss": 1.1507, + "step": 5660 + }, + { + "epoch": 0.54, + "grad_norm": 0.3011891640480555, + "learning_rate": 0.00017450225437094208, + "loss": 1.16, + "step": 5661 + }, + { + "epoch": 0.54, + "grad_norm": 0.38100561539213623, + "learning_rate": 0.00017449170121698347, + "loss": 1.0236, + "step": 5662 + }, + { + "epoch": 0.54, + "grad_norm": 0.27806057374613613, + "learning_rate": 0.00017448114619882321, + "loss": 1.0843, + "step": 5663 + }, + { + "epoch": 0.54, + "grad_norm": 0.2851575165624332, + "learning_rate": 0.0001744705893167255, + "loss": 1.1458, + "step": 5664 + }, + { + "epoch": 0.54, + "grad_norm": 0.2913077381839934, + "learning_rate": 0.00017446003057095447, + "loss": 1.1039, + "step": 5665 + }, + { + "epoch": 0.54, + "grad_norm": 0.28664285006948, + "learning_rate": 0.00017444946996177433, + "loss": 1.1214, + "step": 5666 + }, + { + "epoch": 0.54, + "grad_norm": 0.2838055565232777, + "learning_rate": 0.00017443890748944946, + "loss": 1.0184, + "step": 5667 + }, + { + "epoch": 0.54, + "grad_norm": 0.2511465662279637, + "learning_rate": 0.00017442834315424416, + "loss": 1.0698, + "step": 5668 + }, + { + "epoch": 0.54, + "grad_norm": 0.27586967822568237, + "learning_rate": 0.0001744177769564228, + "loss": 1.0234, + "step": 5669 + }, + { + "epoch": 0.54, + "grad_norm": 0.23851868493807254, + "learning_rate": 0.00017440720889624978, + "loss": 1.109, + "step": 5670 + }, + { + "epoch": 0.54, + "grad_norm": 0.29280665470416317, + "learning_rate": 0.00017439663897398958, + "loss": 1.1473, + "step": 5671 + }, + { + "epoch": 0.54, + "grad_norm": 0.27293284903881204, + "learning_rate": 0.00017438606718990675, + "loss": 1.0855, + "step": 5672 + }, + { + "epoch": 0.54, + "grad_norm": 0.3102331601705705, + "learning_rate": 0.00017437549354426586, + "loss": 0.979, + "step": 5673 + }, + { + "epoch": 0.54, + "grad_norm": 0.2904424568361712, + "learning_rate": 0.00017436491803733147, + "loss": 1.0758, + "step": 5674 + }, + { + "epoch": 0.54, + "grad_norm": 0.3038807854764857, + "learning_rate": 0.00017435434066936828, + "loss": 1.0813, + "step": 5675 + }, + { + "epoch": 0.54, + "grad_norm": 0.2982170129842694, + "learning_rate": 0.00017434376144064096, + "loss": 0.9269, + "step": 5676 + }, + { + "epoch": 0.54, + "grad_norm": 0.29066232686572985, + "learning_rate": 0.00017433318035141432, + "loss": 1.0761, + "step": 5677 + }, + { + "epoch": 0.54, + "grad_norm": 0.36384380111246045, + "learning_rate": 0.0001743225974019531, + "loss": 1.1277, + "step": 5678 + }, + { + "epoch": 0.54, + "grad_norm": 0.28539850304066966, + "learning_rate": 0.00017431201259252222, + "loss": 1.1838, + "step": 5679 + }, + { + "epoch": 0.54, + "grad_norm": 0.3010319695376003, + "learning_rate": 0.00017430142592338648, + "loss": 0.9309, + "step": 5680 + }, + { + "epoch": 0.54, + "grad_norm": 0.2639994319325743, + "learning_rate": 0.00017429083739481087, + "loss": 1.1049, + "step": 5681 + }, + { + "epoch": 0.54, + "grad_norm": 0.2979159287960645, + "learning_rate": 0.00017428024700706036, + "loss": 1.1221, + "step": 5682 + }, + { + "epoch": 0.54, + "grad_norm": 0.2819099394302753, + "learning_rate": 0.0001742696547604, + "loss": 1.0916, + "step": 5683 + }, + { + "epoch": 0.54, + "grad_norm": 0.260104363656028, + "learning_rate": 0.00017425906065509484, + "loss": 1.0034, + "step": 5684 + }, + { + "epoch": 0.54, + "grad_norm": 0.2970820007503617, + "learning_rate": 0.00017424846469141, + "loss": 1.1024, + "step": 5685 + }, + { + "epoch": 0.54, + "grad_norm": 0.2620883115817067, + "learning_rate": 0.0001742378668696107, + "loss": 1.0562, + "step": 5686 + }, + { + "epoch": 0.54, + "grad_norm": 0.2491078043946645, + "learning_rate": 0.0001742272671899621, + "loss": 1.0625, + "step": 5687 + }, + { + "epoch": 0.54, + "grad_norm": 0.30629114639440114, + "learning_rate": 0.00017421666565272948, + "loss": 1.202, + "step": 5688 + }, + { + "epoch": 0.54, + "grad_norm": 0.30830018039821566, + "learning_rate": 0.0001742060622581782, + "loss": 1.0535, + "step": 5689 + }, + { + "epoch": 0.54, + "grad_norm": 0.2724222943145667, + "learning_rate": 0.00017419545700657354, + "loss": 1.1325, + "step": 5690 + }, + { + "epoch": 0.54, + "grad_norm": 0.23708317781034122, + "learning_rate": 0.00017418484989818096, + "loss": 1.0638, + "step": 5691 + }, + { + "epoch": 0.54, + "grad_norm": 0.26229220094670336, + "learning_rate": 0.00017417424093326588, + "loss": 0.9439, + "step": 5692 + }, + { + "epoch": 0.54, + "grad_norm": 0.25017286913990483, + "learning_rate": 0.0001741636301120938, + "loss": 1.079, + "step": 5693 + }, + { + "epoch": 0.54, + "grad_norm": 0.2860341282853943, + "learning_rate": 0.00017415301743493026, + "loss": 1.109, + "step": 5694 + }, + { + "epoch": 0.54, + "grad_norm": 0.27000375628865514, + "learning_rate": 0.00017414240290204087, + "loss": 1.0366, + "step": 5695 + }, + { + "epoch": 0.54, + "grad_norm": 0.2944772274521784, + "learning_rate": 0.00017413178651369123, + "loss": 1.106, + "step": 5696 + }, + { + "epoch": 0.55, + "grad_norm": 0.29923112387635076, + "learning_rate": 0.00017412116827014707, + "loss": 1.118, + "step": 5697 + }, + { + "epoch": 0.55, + "grad_norm": 0.27744625628639163, + "learning_rate": 0.00017411054817167407, + "loss": 1.0582, + "step": 5698 + }, + { + "epoch": 0.55, + "grad_norm": 0.2872887689318743, + "learning_rate": 0.00017409992621853803, + "loss": 1.135, + "step": 5699 + }, + { + "epoch": 0.55, + "grad_norm": 0.2868590588865866, + "learning_rate": 0.00017408930241100476, + "loss": 1.1218, + "step": 5700 + }, + { + "epoch": 0.55, + "grad_norm": 0.28039756959308726, + "learning_rate": 0.00017407867674934014, + "loss": 1.0089, + "step": 5701 + }, + { + "epoch": 0.55, + "grad_norm": 0.26834033841073884, + "learning_rate": 0.00017406804923381008, + "loss": 1.0931, + "step": 5702 + }, + { + "epoch": 0.55, + "grad_norm": 0.2679055295777244, + "learning_rate": 0.00017405741986468054, + "loss": 1.0362, + "step": 5703 + }, + { + "epoch": 0.55, + "grad_norm": 0.31948114555469886, + "learning_rate": 0.00017404678864221752, + "loss": 1.1499, + "step": 5704 + }, + { + "epoch": 0.55, + "grad_norm": 0.254735913314033, + "learning_rate": 0.00017403615556668708, + "loss": 1.0521, + "step": 5705 + }, + { + "epoch": 0.55, + "grad_norm": 0.2758568173458296, + "learning_rate": 0.00017402552063835533, + "loss": 1.0276, + "step": 5706 + }, + { + "epoch": 0.55, + "grad_norm": 0.2881056253518769, + "learning_rate": 0.0001740148838574884, + "loss": 0.9892, + "step": 5707 + }, + { + "epoch": 0.55, + "grad_norm": 0.28754810292102423, + "learning_rate": 0.00017400424522435247, + "loss": 1.0138, + "step": 5708 + }, + { + "epoch": 0.55, + "grad_norm": 0.26622897762881104, + "learning_rate": 0.0001739936047392138, + "loss": 1.0121, + "step": 5709 + }, + { + "epoch": 0.55, + "grad_norm": 0.3257799896467255, + "learning_rate": 0.00017398296240233866, + "loss": 1.0488, + "step": 5710 + }, + { + "epoch": 0.55, + "grad_norm": 0.300245783416075, + "learning_rate": 0.0001739723182139934, + "loss": 1.199, + "step": 5711 + }, + { + "epoch": 0.55, + "grad_norm": 0.29793504189936215, + "learning_rate": 0.00017396167217444437, + "loss": 1.0326, + "step": 5712 + }, + { + "epoch": 0.55, + "grad_norm": 0.2895788092153529, + "learning_rate": 0.00017395102428395803, + "loss": 1.0282, + "step": 5713 + }, + { + "epoch": 0.55, + "grad_norm": 0.2759129672263007, + "learning_rate": 0.0001739403745428008, + "loss": 1.1353, + "step": 5714 + }, + { + "epoch": 0.55, + "grad_norm": 0.2736823382715656, + "learning_rate": 0.0001739297229512393, + "loss": 1.0918, + "step": 5715 + }, + { + "epoch": 0.55, + "grad_norm": 0.30836802392427065, + "learning_rate": 0.00017391906950953994, + "loss": 1.1101, + "step": 5716 + }, + { + "epoch": 0.55, + "grad_norm": 0.2923956322766849, + "learning_rate": 0.00017390841421796943, + "loss": 1.1005, + "step": 5717 + }, + { + "epoch": 0.55, + "grad_norm": 0.2582193114884755, + "learning_rate": 0.00017389775707679444, + "loss": 1.0708, + "step": 5718 + }, + { + "epoch": 0.55, + "grad_norm": 0.24558151292411884, + "learning_rate": 0.0001738870980862816, + "loss": 1.1303, + "step": 5719 + }, + { + "epoch": 0.55, + "grad_norm": 0.29322536081033934, + "learning_rate": 0.0001738764372466977, + "loss": 1.0927, + "step": 5720 + }, + { + "epoch": 0.55, + "grad_norm": 0.2659755712941869, + "learning_rate": 0.00017386577455830952, + "loss": 1.0032, + "step": 5721 + }, + { + "epoch": 0.55, + "grad_norm": 0.2635454724857395, + "learning_rate": 0.00017385511002138393, + "loss": 1.2808, + "step": 5722 + }, + { + "epoch": 0.55, + "grad_norm": 0.26786955193926654, + "learning_rate": 0.0001738444436361878, + "loss": 1.0475, + "step": 5723 + }, + { + "epoch": 0.55, + "grad_norm": 0.28425119186729847, + "learning_rate": 0.00017383377540298805, + "loss": 1.0817, + "step": 5724 + }, + { + "epoch": 0.55, + "grad_norm": 0.28386565821296494, + "learning_rate": 0.00017382310532205165, + "loss": 0.9009, + "step": 5725 + }, + { + "epoch": 0.55, + "grad_norm": 0.2455076247487801, + "learning_rate": 0.00017381243339364565, + "loss": 0.9972, + "step": 5726 + }, + { + "epoch": 0.55, + "grad_norm": 0.2949244070026697, + "learning_rate": 0.00017380175961803713, + "loss": 1.0042, + "step": 5727 + }, + { + "epoch": 0.55, + "grad_norm": 0.2806527868238312, + "learning_rate": 0.00017379108399549317, + "loss": 1.1932, + "step": 5728 + }, + { + "epoch": 0.55, + "grad_norm": 0.27785708544616866, + "learning_rate": 0.000173780406526281, + "loss": 1.0914, + "step": 5729 + }, + { + "epoch": 0.55, + "grad_norm": 0.28683282908042806, + "learning_rate": 0.00017376972721066776, + "loss": 1.1145, + "step": 5730 + }, + { + "epoch": 0.55, + "grad_norm": 0.2838770112878104, + "learning_rate": 0.00017375904604892073, + "loss": 1.0765, + "step": 5731 + }, + { + "epoch": 0.55, + "grad_norm": 0.2934314888524707, + "learning_rate": 0.0001737483630413072, + "loss": 1.0595, + "step": 5732 + }, + { + "epoch": 0.55, + "grad_norm": 0.25830945410699585, + "learning_rate": 0.00017373767818809456, + "loss": 0.9669, + "step": 5733 + }, + { + "epoch": 0.55, + "grad_norm": 0.2665953859888515, + "learning_rate": 0.00017372699148955018, + "loss": 1.0714, + "step": 5734 + }, + { + "epoch": 0.55, + "grad_norm": 0.2731137251463795, + "learning_rate": 0.0001737163029459415, + "loss": 1.0746, + "step": 5735 + }, + { + "epoch": 0.55, + "grad_norm": 0.25825981988483343, + "learning_rate": 0.00017370561255753602, + "loss": 0.9534, + "step": 5736 + }, + { + "epoch": 0.55, + "grad_norm": 0.30383684481721607, + "learning_rate": 0.00017369492032460123, + "loss": 1.1384, + "step": 5737 + }, + { + "epoch": 0.55, + "grad_norm": 0.31417342120569464, + "learning_rate": 0.00017368422624740478, + "loss": 1.0576, + "step": 5738 + }, + { + "epoch": 0.55, + "grad_norm": 0.2593381194717112, + "learning_rate": 0.00017367353032621426, + "loss": 1.0573, + "step": 5739 + }, + { + "epoch": 0.55, + "grad_norm": 0.2778893532357316, + "learning_rate": 0.00017366283256129732, + "loss": 0.9766, + "step": 5740 + }, + { + "epoch": 0.55, + "grad_norm": 0.3072153615495613, + "learning_rate": 0.0001736521329529217, + "loss": 1.1247, + "step": 5741 + }, + { + "epoch": 0.55, + "grad_norm": 0.28745857193672664, + "learning_rate": 0.00017364143150135517, + "loss": 1.1141, + "step": 5742 + }, + { + "epoch": 0.55, + "grad_norm": 0.2651127640673508, + "learning_rate": 0.00017363072820686552, + "loss": 1.0829, + "step": 5743 + }, + { + "epoch": 0.55, + "grad_norm": 0.2902298639371502, + "learning_rate": 0.00017362002306972065, + "loss": 0.987, + "step": 5744 + }, + { + "epoch": 0.55, + "grad_norm": 0.27655920423756686, + "learning_rate": 0.00017360931609018842, + "loss": 1.0076, + "step": 5745 + }, + { + "epoch": 0.55, + "grad_norm": 0.2567390152989073, + "learning_rate": 0.0001735986072685368, + "loss": 1.0375, + "step": 5746 + }, + { + "epoch": 0.55, + "grad_norm": 0.24515803399806585, + "learning_rate": 0.00017358789660503377, + "loss": 1.0745, + "step": 5747 + }, + { + "epoch": 0.55, + "grad_norm": 0.2575517841295073, + "learning_rate": 0.0001735771840999474, + "loss": 1.0153, + "step": 5748 + }, + { + "epoch": 0.55, + "grad_norm": 0.2764368481508909, + "learning_rate": 0.0001735664697535457, + "loss": 1.1584, + "step": 5749 + }, + { + "epoch": 0.55, + "grad_norm": 0.27351695694822586, + "learning_rate": 0.0001735557535660969, + "loss": 1.0281, + "step": 5750 + }, + { + "epoch": 0.55, + "grad_norm": 0.2667030418236546, + "learning_rate": 0.00017354503553786916, + "loss": 1.1321, + "step": 5751 + }, + { + "epoch": 0.55, + "grad_norm": 0.29279846320754066, + "learning_rate": 0.00017353431566913066, + "loss": 0.9984, + "step": 5752 + }, + { + "epoch": 0.55, + "grad_norm": 0.2415120154785472, + "learning_rate": 0.0001735235939601497, + "loss": 1.0611, + "step": 5753 + }, + { + "epoch": 0.55, + "grad_norm": 0.25955357361068576, + "learning_rate": 0.00017351287041119458, + "loss": 1.1294, + "step": 5754 + }, + { + "epoch": 0.55, + "grad_norm": 0.294206732380647, + "learning_rate": 0.0001735021450225337, + "loss": 1.0812, + "step": 5755 + }, + { + "epoch": 0.55, + "grad_norm": 0.29031247550246536, + "learning_rate": 0.00017349141779443542, + "loss": 1.0576, + "step": 5756 + }, + { + "epoch": 0.55, + "grad_norm": 0.2672750801707316, + "learning_rate": 0.00017348068872716823, + "loss": 1.0851, + "step": 5757 + }, + { + "epoch": 0.55, + "grad_norm": 0.26957335518481657, + "learning_rate": 0.00017346995782100062, + "loss": 1.0858, + "step": 5758 + }, + { + "epoch": 0.55, + "grad_norm": 0.32734612669273955, + "learning_rate": 0.00017345922507620116, + "loss": 1.0656, + "step": 5759 + }, + { + "epoch": 0.55, + "grad_norm": 0.24154451886063427, + "learning_rate": 0.00017344849049303842, + "loss": 0.9896, + "step": 5760 + }, + { + "epoch": 0.55, + "grad_norm": 0.27709567176869415, + "learning_rate": 0.00017343775407178104, + "loss": 1.0805, + "step": 5761 + }, + { + "epoch": 0.55, + "grad_norm": 0.272244078427738, + "learning_rate": 0.0001734270158126977, + "loss": 1.0433, + "step": 5762 + }, + { + "epoch": 0.55, + "grad_norm": 0.2673872164348252, + "learning_rate": 0.00017341627571605716, + "loss": 1.1464, + "step": 5763 + }, + { + "epoch": 0.55, + "grad_norm": 0.27315997652853985, + "learning_rate": 0.00017340553378212816, + "loss": 1.1567, + "step": 5764 + }, + { + "epoch": 0.55, + "grad_norm": 0.23474379585480734, + "learning_rate": 0.00017339479001117955, + "loss": 1.0089, + "step": 5765 + }, + { + "epoch": 0.55, + "grad_norm": 0.27652850547755903, + "learning_rate": 0.00017338404440348022, + "loss": 1.0461, + "step": 5766 + }, + { + "epoch": 0.55, + "grad_norm": 0.31896644764441623, + "learning_rate": 0.00017337329695929902, + "loss": 1.0359, + "step": 5767 + }, + { + "epoch": 0.55, + "grad_norm": 0.274574680140215, + "learning_rate": 0.00017336254767890498, + "loss": 1.0201, + "step": 5768 + }, + { + "epoch": 0.55, + "grad_norm": 0.2630831469195627, + "learning_rate": 0.00017335179656256705, + "loss": 1.0975, + "step": 5769 + }, + { + "epoch": 0.55, + "grad_norm": 0.2793728024955909, + "learning_rate": 0.00017334104361055436, + "loss": 1.1332, + "step": 5770 + }, + { + "epoch": 0.55, + "grad_norm": 0.27408839587852935, + "learning_rate": 0.0001733302888231359, + "loss": 0.939, + "step": 5771 + }, + { + "epoch": 0.55, + "grad_norm": 0.27297436472505615, + "learning_rate": 0.0001733195322005809, + "loss": 1.0168, + "step": 5772 + }, + { + "epoch": 0.55, + "grad_norm": 0.30630256010686685, + "learning_rate": 0.00017330877374315855, + "loss": 1.1169, + "step": 5773 + }, + { + "epoch": 0.55, + "grad_norm": 0.2767879583144832, + "learning_rate": 0.00017329801345113802, + "loss": 1.0233, + "step": 5774 + }, + { + "epoch": 0.55, + "grad_norm": 0.29684472098320214, + "learning_rate": 0.0001732872513247887, + "loss": 1.0463, + "step": 5775 + }, + { + "epoch": 0.55, + "grad_norm": 0.2785233560114013, + "learning_rate": 0.00017327648736437977, + "loss": 1.142, + "step": 5776 + }, + { + "epoch": 0.55, + "grad_norm": 0.3208276011124166, + "learning_rate": 0.00017326572157018078, + "loss": 1.1932, + "step": 5777 + }, + { + "epoch": 0.55, + "grad_norm": 0.22684754778328245, + "learning_rate": 0.000173254953942461, + "loss": 1.0656, + "step": 5778 + }, + { + "epoch": 0.55, + "grad_norm": 0.3374242730404179, + "learning_rate": 0.00017324418448148998, + "loss": 1.049, + "step": 5779 + }, + { + "epoch": 0.55, + "grad_norm": 0.3428578883405557, + "learning_rate": 0.0001732334131875372, + "loss": 1.1952, + "step": 5780 + }, + { + "epoch": 0.55, + "grad_norm": 0.27472370192399703, + "learning_rate": 0.00017322264006087225, + "loss": 1.2441, + "step": 5781 + }, + { + "epoch": 0.55, + "grad_norm": 0.27491429642165643, + "learning_rate": 0.0001732118651017647, + "loss": 1.0994, + "step": 5782 + }, + { + "epoch": 0.55, + "grad_norm": 0.27459654102558345, + "learning_rate": 0.00017320108831048422, + "loss": 1.0496, + "step": 5783 + }, + { + "epoch": 0.55, + "grad_norm": 0.2699347475330186, + "learning_rate": 0.0001731903096873005, + "loss": 1.1214, + "step": 5784 + }, + { + "epoch": 0.55, + "grad_norm": 0.296175698180158, + "learning_rate": 0.00017317952923248328, + "loss": 1.0308, + "step": 5785 + }, + { + "epoch": 0.55, + "grad_norm": 0.29058979499652565, + "learning_rate": 0.00017316874694630236, + "loss": 1.0828, + "step": 5786 + }, + { + "epoch": 0.55, + "grad_norm": 0.2997585209236349, + "learning_rate": 0.00017315796282902753, + "loss": 1.1366, + "step": 5787 + }, + { + "epoch": 0.55, + "grad_norm": 0.27673867013826936, + "learning_rate": 0.00017314717688092873, + "loss": 0.945, + "step": 5788 + }, + { + "epoch": 0.55, + "grad_norm": 0.23598291014298442, + "learning_rate": 0.00017313638910227585, + "loss": 0.9402, + "step": 5789 + }, + { + "epoch": 0.55, + "grad_norm": 0.26553154287144975, + "learning_rate": 0.00017312559949333886, + "loss": 1.1785, + "step": 5790 + }, + { + "epoch": 0.55, + "grad_norm": 0.28452803788674824, + "learning_rate": 0.0001731148080543878, + "loss": 1.124, + "step": 5791 + }, + { + "epoch": 0.55, + "grad_norm": 0.2705626232253567, + "learning_rate": 0.00017310401478569273, + "loss": 1.0191, + "step": 5792 + }, + { + "epoch": 0.55, + "grad_norm": 0.24817380433889646, + "learning_rate": 0.0001730932196875237, + "loss": 1.0482, + "step": 5793 + }, + { + "epoch": 0.55, + "grad_norm": 0.26652308262279306, + "learning_rate": 0.00017308242276015094, + "loss": 1.0738, + "step": 5794 + }, + { + "epoch": 0.55, + "grad_norm": 0.24818290945260654, + "learning_rate": 0.00017307162400384462, + "loss": 1.0175, + "step": 5795 + }, + { + "epoch": 0.55, + "grad_norm": 0.26331611360600954, + "learning_rate": 0.000173060823418875, + "loss": 1.0788, + "step": 5796 + }, + { + "epoch": 0.55, + "grad_norm": 0.25070740236687494, + "learning_rate": 0.00017305002100551233, + "loss": 0.9522, + "step": 5797 + }, + { + "epoch": 0.55, + "grad_norm": 0.27552761246559027, + "learning_rate": 0.000173039216764027, + "loss": 1.1854, + "step": 5798 + }, + { + "epoch": 0.55, + "grad_norm": 0.24607203901766253, + "learning_rate": 0.00017302841069468934, + "loss": 1.0574, + "step": 5799 + }, + { + "epoch": 0.55, + "grad_norm": 0.24789972386779166, + "learning_rate": 0.00017301760279776982, + "loss": 1.0585, + "step": 5800 + }, + { + "epoch": 0.55, + "grad_norm": 0.22933484912929925, + "learning_rate": 0.00017300679307353888, + "loss": 1.0806, + "step": 5801 + }, + { + "epoch": 0.56, + "grad_norm": 0.27649671163848355, + "learning_rate": 0.0001729959815222671, + "loss": 1.1766, + "step": 5802 + }, + { + "epoch": 0.56, + "grad_norm": 0.24226545584694414, + "learning_rate": 0.00017298516814422498, + "loss": 0.9117, + "step": 5803 + }, + { + "epoch": 0.56, + "grad_norm": 0.26242930789541363, + "learning_rate": 0.00017297435293968315, + "loss": 1.1165, + "step": 5804 + }, + { + "epoch": 0.56, + "grad_norm": 0.3120420916073829, + "learning_rate": 0.0001729635359089123, + "loss": 1.0574, + "step": 5805 + }, + { + "epoch": 0.56, + "grad_norm": 0.2701256919492001, + "learning_rate": 0.00017295271705218307, + "loss": 1.1047, + "step": 5806 + }, + { + "epoch": 0.56, + "grad_norm": 0.29645770053433096, + "learning_rate": 0.0001729418963697663, + "loss": 1.1329, + "step": 5807 + }, + { + "epoch": 0.56, + "grad_norm": 0.24632237776935018, + "learning_rate": 0.0001729310738619327, + "loss": 1.1359, + "step": 5808 + }, + { + "epoch": 0.56, + "grad_norm": 0.27163467992709467, + "learning_rate": 0.00017292024952895313, + "loss": 1.1006, + "step": 5809 + }, + { + "epoch": 0.56, + "grad_norm": 0.26959200346069406, + "learning_rate": 0.0001729094233710985, + "loss": 1.0042, + "step": 5810 + }, + { + "epoch": 0.56, + "grad_norm": 0.289959983695105, + "learning_rate": 0.00017289859538863973, + "loss": 1.1085, + "step": 5811 + }, + { + "epoch": 0.56, + "grad_norm": 0.3213875350320641, + "learning_rate": 0.0001728877655818478, + "loss": 1.1062, + "step": 5812 + }, + { + "epoch": 0.56, + "grad_norm": 0.3270439141750809, + "learning_rate": 0.0001728769339509937, + "loss": 1.0258, + "step": 5813 + }, + { + "epoch": 0.56, + "grad_norm": 0.3409072299007263, + "learning_rate": 0.00017286610049634856, + "loss": 0.9644, + "step": 5814 + }, + { + "epoch": 0.56, + "grad_norm": 0.24496972615719473, + "learning_rate": 0.00017285526521818346, + "loss": 1.1259, + "step": 5815 + }, + { + "epoch": 0.56, + "grad_norm": 0.3139089621120804, + "learning_rate": 0.00017284442811676953, + "loss": 1.0404, + "step": 5816 + }, + { + "epoch": 0.56, + "grad_norm": 0.26970603580675145, + "learning_rate": 0.00017283358919237802, + "loss": 1.0835, + "step": 5817 + }, + { + "epoch": 0.56, + "grad_norm": 0.2733732615441018, + "learning_rate": 0.0001728227484452802, + "loss": 1.0744, + "step": 5818 + }, + { + "epoch": 0.56, + "grad_norm": 0.28036462116083893, + "learning_rate": 0.00017281190587574728, + "loss": 1.1692, + "step": 5819 + }, + { + "epoch": 0.56, + "grad_norm": 0.2871660582969163, + "learning_rate": 0.0001728010614840507, + "loss": 1.0495, + "step": 5820 + }, + { + "epoch": 0.56, + "grad_norm": 0.23884476735212962, + "learning_rate": 0.00017279021527046178, + "loss": 1.0443, + "step": 5821 + }, + { + "epoch": 0.56, + "grad_norm": 0.2873207540071599, + "learning_rate": 0.00017277936723525197, + "loss": 0.993, + "step": 5822 + }, + { + "epoch": 0.56, + "grad_norm": 0.30547644132922835, + "learning_rate": 0.00017276851737869274, + "loss": 1.0843, + "step": 5823 + }, + { + "epoch": 0.56, + "grad_norm": 0.3057414154299913, + "learning_rate": 0.00017275766570105567, + "loss": 1.0655, + "step": 5824 + }, + { + "epoch": 0.56, + "grad_norm": 0.2632962822442975, + "learning_rate": 0.00017274681220261226, + "loss": 1.0939, + "step": 5825 + }, + { + "epoch": 0.56, + "grad_norm": 0.30877886648124775, + "learning_rate": 0.00017273595688363416, + "loss": 1.1277, + "step": 5826 + }, + { + "epoch": 0.56, + "grad_norm": 0.26307234053148654, + "learning_rate": 0.00017272509974439304, + "loss": 1.0079, + "step": 5827 + }, + { + "epoch": 0.56, + "grad_norm": 0.2841767029229268, + "learning_rate": 0.00017271424078516055, + "loss": 1.1768, + "step": 5828 + }, + { + "epoch": 0.56, + "grad_norm": 0.2603214619363828, + "learning_rate": 0.00017270338000620856, + "loss": 1.0806, + "step": 5829 + }, + { + "epoch": 0.56, + "grad_norm": 0.2545067452234432, + "learning_rate": 0.00017269251740780874, + "loss": 1.0412, + "step": 5830 + }, + { + "epoch": 0.56, + "grad_norm": 0.27855295154779947, + "learning_rate": 0.000172681652990233, + "loss": 1.2206, + "step": 5831 + }, + { + "epoch": 0.56, + "grad_norm": 0.28261805590758377, + "learning_rate": 0.00017267078675375322, + "loss": 1.0033, + "step": 5832 + }, + { + "epoch": 0.56, + "grad_norm": 0.27667495248962876, + "learning_rate": 0.00017265991869864128, + "loss": 1.0831, + "step": 5833 + }, + { + "epoch": 0.56, + "grad_norm": 0.2714857639489485, + "learning_rate": 0.00017264904882516928, + "loss": 1.109, + "step": 5834 + }, + { + "epoch": 0.56, + "grad_norm": 0.27229604294119086, + "learning_rate": 0.00017263817713360915, + "loss": 1.0874, + "step": 5835 + }, + { + "epoch": 0.56, + "grad_norm": 0.293920707567098, + "learning_rate": 0.00017262730362423297, + "loss": 1.0823, + "step": 5836 + }, + { + "epoch": 0.56, + "grad_norm": 0.2613726158263339, + "learning_rate": 0.00017261642829731287, + "loss": 1.0599, + "step": 5837 + }, + { + "epoch": 0.56, + "grad_norm": 0.2889492784232317, + "learning_rate": 0.00017260555115312104, + "loss": 1.1224, + "step": 5838 + }, + { + "epoch": 0.56, + "grad_norm": 0.25970536207303385, + "learning_rate": 0.00017259467219192968, + "loss": 1.1015, + "step": 5839 + }, + { + "epoch": 0.56, + "grad_norm": 0.28971995596673233, + "learning_rate": 0.00017258379141401098, + "loss": 1.0696, + "step": 5840 + }, + { + "epoch": 0.56, + "grad_norm": 0.30430404198918504, + "learning_rate": 0.00017257290881963732, + "loss": 1.0823, + "step": 5841 + }, + { + "epoch": 0.56, + "grad_norm": 0.2713987221161718, + "learning_rate": 0.00017256202440908095, + "loss": 1.0604, + "step": 5842 + }, + { + "epoch": 0.56, + "grad_norm": 0.3001859725452708, + "learning_rate": 0.00017255113818261437, + "loss": 0.8537, + "step": 5843 + }, + { + "epoch": 0.56, + "grad_norm": 0.24037659758164331, + "learning_rate": 0.00017254025014050995, + "loss": 1.0035, + "step": 5844 + }, + { + "epoch": 0.56, + "grad_norm": 0.29289291269851125, + "learning_rate": 0.00017252936028304015, + "loss": 0.9527, + "step": 5845 + }, + { + "epoch": 0.56, + "grad_norm": 0.28320339512377446, + "learning_rate": 0.00017251846861047755, + "loss": 1.0616, + "step": 5846 + }, + { + "epoch": 0.56, + "grad_norm": 0.2844954523450815, + "learning_rate": 0.0001725075751230947, + "loss": 1.1248, + "step": 5847 + }, + { + "epoch": 0.56, + "grad_norm": 0.25465891657756395, + "learning_rate": 0.0001724966798211642, + "loss": 1.2268, + "step": 5848 + }, + { + "epoch": 0.56, + "grad_norm": 0.318243854381763, + "learning_rate": 0.00017248578270495873, + "loss": 1.1983, + "step": 5849 + }, + { + "epoch": 0.56, + "grad_norm": 0.24392495708256506, + "learning_rate": 0.00017247488377475102, + "loss": 1.1131, + "step": 5850 + }, + { + "epoch": 0.56, + "grad_norm": 0.339924875849168, + "learning_rate": 0.00017246398303081377, + "loss": 1.1255, + "step": 5851 + }, + { + "epoch": 0.56, + "grad_norm": 0.2824682986273372, + "learning_rate": 0.00017245308047341977, + "loss": 1.0928, + "step": 5852 + }, + { + "epoch": 0.56, + "grad_norm": 0.294553844921709, + "learning_rate": 0.00017244217610284194, + "loss": 1.1941, + "step": 5853 + }, + { + "epoch": 0.56, + "grad_norm": 0.3175139266139292, + "learning_rate": 0.0001724312699193531, + "loss": 1.0488, + "step": 5854 + }, + { + "epoch": 0.56, + "grad_norm": 0.26336543686584807, + "learning_rate": 0.0001724203619232262, + "loss": 1.0406, + "step": 5855 + }, + { + "epoch": 0.56, + "grad_norm": 0.28698720470502975, + "learning_rate": 0.00017240945211473426, + "loss": 1.1673, + "step": 5856 + }, + { + "epoch": 0.56, + "grad_norm": 0.2705696042656757, + "learning_rate": 0.0001723985404941503, + "loss": 1.0654, + "step": 5857 + }, + { + "epoch": 0.56, + "grad_norm": 0.2609774023108237, + "learning_rate": 0.0001723876270617473, + "loss": 1.0166, + "step": 5858 + }, + { + "epoch": 0.56, + "grad_norm": 0.2879084621496255, + "learning_rate": 0.0001723767118177985, + "loss": 1.1731, + "step": 5859 + }, + { + "epoch": 0.56, + "grad_norm": 0.25995995206356726, + "learning_rate": 0.00017236579476257694, + "loss": 1.1141, + "step": 5860 + }, + { + "epoch": 0.56, + "grad_norm": 0.2832052274386346, + "learning_rate": 0.00017235487589635593, + "loss": 1.0356, + "step": 5861 + }, + { + "epoch": 0.56, + "grad_norm": 0.3009633175427442, + "learning_rate": 0.00017234395521940866, + "loss": 1.0634, + "step": 5862 + }, + { + "epoch": 0.56, + "grad_norm": 0.28724524890382835, + "learning_rate": 0.00017233303273200842, + "loss": 1.0978, + "step": 5863 + }, + { + "epoch": 0.56, + "grad_norm": 0.30150303153924973, + "learning_rate": 0.0001723221084344286, + "loss": 1.0883, + "step": 5864 + }, + { + "epoch": 0.56, + "grad_norm": 0.2602169228739236, + "learning_rate": 0.00017231118232694255, + "loss": 0.8809, + "step": 5865 + }, + { + "epoch": 0.56, + "grad_norm": 0.2597086205288419, + "learning_rate": 0.00017230025440982373, + "loss": 1.0913, + "step": 5866 + }, + { + "epoch": 0.56, + "grad_norm": 0.2627390421998769, + "learning_rate": 0.0001722893246833456, + "loss": 0.9786, + "step": 5867 + }, + { + "epoch": 0.56, + "grad_norm": 0.29086276852649495, + "learning_rate": 0.0001722783931477817, + "loss": 1.1312, + "step": 5868 + }, + { + "epoch": 0.56, + "grad_norm": 0.2631914398523322, + "learning_rate": 0.00017226745980340556, + "loss": 1.1383, + "step": 5869 + }, + { + "epoch": 0.56, + "grad_norm": 0.293675216189509, + "learning_rate": 0.00017225652465049086, + "loss": 1.037, + "step": 5870 + }, + { + "epoch": 0.56, + "grad_norm": 0.27269400481933986, + "learning_rate": 0.0001722455876893112, + "loss": 1.0594, + "step": 5871 + }, + { + "epoch": 0.56, + "grad_norm": 0.2913003315526418, + "learning_rate": 0.00017223464892014028, + "loss": 1.0343, + "step": 5872 + }, + { + "epoch": 0.56, + "grad_norm": 0.30749338727180847, + "learning_rate": 0.0001722237083432519, + "loss": 1.0609, + "step": 5873 + }, + { + "epoch": 0.56, + "grad_norm": 0.2780051476505012, + "learning_rate": 0.00017221276595891984, + "loss": 1.0403, + "step": 5874 + }, + { + "epoch": 0.56, + "grad_norm": 0.28351257017042697, + "learning_rate": 0.0001722018217674179, + "loss": 1.1222, + "step": 5875 + }, + { + "epoch": 0.56, + "grad_norm": 0.30556461707747595, + "learning_rate": 0.00017219087576902, + "loss": 1.1095, + "step": 5876 + }, + { + "epoch": 0.56, + "grad_norm": 0.2747330702712088, + "learning_rate": 0.00017217992796400005, + "loss": 1.0258, + "step": 5877 + }, + { + "epoch": 0.56, + "grad_norm": 0.3102546058358946, + "learning_rate": 0.00017216897835263209, + "loss": 0.9957, + "step": 5878 + }, + { + "epoch": 0.56, + "grad_norm": 0.3232197983937078, + "learning_rate": 0.00017215802693519003, + "loss": 1.0768, + "step": 5879 + }, + { + "epoch": 0.56, + "grad_norm": 0.269670235437008, + "learning_rate": 0.00017214707371194802, + "loss": 1.1033, + "step": 5880 + }, + { + "epoch": 0.56, + "grad_norm": 0.2633897286188578, + "learning_rate": 0.00017213611868318015, + "loss": 1.0035, + "step": 5881 + }, + { + "epoch": 0.56, + "grad_norm": 0.3042144256612858, + "learning_rate": 0.00017212516184916056, + "loss": 1.0715, + "step": 5882 + }, + { + "epoch": 0.56, + "grad_norm": 0.266078239126639, + "learning_rate": 0.0001721142032101635, + "loss": 1.1544, + "step": 5883 + }, + { + "epoch": 0.56, + "grad_norm": 0.30458594940724576, + "learning_rate": 0.00017210324276646316, + "loss": 1.1251, + "step": 5884 + }, + { + "epoch": 0.56, + "grad_norm": 0.28395232279028726, + "learning_rate": 0.00017209228051833387, + "loss": 1.0925, + "step": 5885 + }, + { + "epoch": 0.56, + "grad_norm": 0.29349557855153724, + "learning_rate": 0.00017208131646604993, + "loss": 1.1322, + "step": 5886 + }, + { + "epoch": 0.56, + "grad_norm": 0.2729717369394539, + "learning_rate": 0.00017207035060988574, + "loss": 1.0009, + "step": 5887 + }, + { + "epoch": 0.56, + "grad_norm": 0.278418729052514, + "learning_rate": 0.00017205938295011575, + "loss": 1.0637, + "step": 5888 + }, + { + "epoch": 0.56, + "grad_norm": 0.2689002291046586, + "learning_rate": 0.00017204841348701438, + "loss": 1.1421, + "step": 5889 + }, + { + "epoch": 0.56, + "grad_norm": 0.2936863871934149, + "learning_rate": 0.00017203744222085623, + "loss": 1.078, + "step": 5890 + }, + { + "epoch": 0.56, + "grad_norm": 0.291267791636599, + "learning_rate": 0.00017202646915191578, + "loss": 1.2519, + "step": 5891 + }, + { + "epoch": 0.56, + "grad_norm": 0.2717676026532944, + "learning_rate": 0.0001720154942804677, + "loss": 1.0844, + "step": 5892 + }, + { + "epoch": 0.56, + "grad_norm": 0.2774147698904939, + "learning_rate": 0.0001720045176067866, + "loss": 1.0487, + "step": 5893 + }, + { + "epoch": 0.56, + "grad_norm": 0.27887254178612914, + "learning_rate": 0.00017199353913114717, + "loss": 1.1334, + "step": 5894 + }, + { + "epoch": 0.56, + "grad_norm": 0.28978059963913083, + "learning_rate": 0.00017198255885382421, + "loss": 1.0868, + "step": 5895 + }, + { + "epoch": 0.56, + "grad_norm": 0.2766397079264888, + "learning_rate": 0.00017197157677509246, + "loss": 1.1068, + "step": 5896 + }, + { + "epoch": 0.56, + "grad_norm": 0.2851293309828098, + "learning_rate": 0.00017196059289522678, + "loss": 1.0717, + "step": 5897 + }, + { + "epoch": 0.56, + "grad_norm": 0.2549062649316722, + "learning_rate": 0.00017194960721450206, + "loss": 1.0199, + "step": 5898 + }, + { + "epoch": 0.56, + "grad_norm": 0.2461949098073852, + "learning_rate": 0.00017193861973319316, + "loss": 1.0461, + "step": 5899 + }, + { + "epoch": 0.56, + "grad_norm": 0.27203801042895437, + "learning_rate": 0.0001719276304515751, + "loss": 1.1691, + "step": 5900 + }, + { + "epoch": 0.56, + "grad_norm": 0.27460790915022437, + "learning_rate": 0.00017191663936992288, + "loss": 1.105, + "step": 5901 + }, + { + "epoch": 0.56, + "grad_norm": 0.29915013883951985, + "learning_rate": 0.0001719056464885116, + "loss": 0.9848, + "step": 5902 + }, + { + "epoch": 0.56, + "grad_norm": 0.29579312716715256, + "learning_rate": 0.00017189465180761628, + "loss": 1.1115, + "step": 5903 + }, + { + "epoch": 0.56, + "grad_norm": 0.30761711337398795, + "learning_rate": 0.00017188365532751213, + "loss": 1.164, + "step": 5904 + }, + { + "epoch": 0.56, + "grad_norm": 0.32410192655697073, + "learning_rate": 0.00017187265704847433, + "loss": 1.089, + "step": 5905 + }, + { + "epoch": 0.57, + "grad_norm": 0.2629249103887719, + "learning_rate": 0.00017186165697077809, + "loss": 0.9588, + "step": 5906 + }, + { + "epoch": 0.57, + "grad_norm": 0.28511065582542094, + "learning_rate": 0.00017185065509469876, + "loss": 1.1679, + "step": 5907 + }, + { + "epoch": 0.57, + "grad_norm": 0.2813371419019583, + "learning_rate": 0.00017183965142051163, + "loss": 1.0888, + "step": 5908 + }, + { + "epoch": 0.57, + "grad_norm": 0.27646089721769807, + "learning_rate": 0.00017182864594849205, + "loss": 1.0513, + "step": 5909 + }, + { + "epoch": 0.57, + "grad_norm": 0.30831778021556755, + "learning_rate": 0.00017181763867891547, + "loss": 1.1152, + "step": 5910 + }, + { + "epoch": 0.57, + "grad_norm": 0.31669929806320846, + "learning_rate": 0.00017180662961205733, + "loss": 1.0554, + "step": 5911 + }, + { + "epoch": 0.57, + "grad_norm": 0.2881244715767111, + "learning_rate": 0.00017179561874819318, + "loss": 1.0702, + "step": 5912 + }, + { + "epoch": 0.57, + "grad_norm": 0.28072579932667696, + "learning_rate": 0.00017178460608759853, + "loss": 0.9758, + "step": 5913 + }, + { + "epoch": 0.57, + "grad_norm": 0.2943649793795249, + "learning_rate": 0.00017177359163054903, + "loss": 1.0715, + "step": 5914 + }, + { + "epoch": 0.57, + "grad_norm": 0.24478814096705148, + "learning_rate": 0.00017176257537732025, + "loss": 1.0606, + "step": 5915 + }, + { + "epoch": 0.57, + "grad_norm": 0.27922088346647145, + "learning_rate": 0.00017175155732818796, + "loss": 1.1677, + "step": 5916 + }, + { + "epoch": 0.57, + "grad_norm": 0.25994214062479126, + "learning_rate": 0.00017174053748342783, + "loss": 0.9742, + "step": 5917 + }, + { + "epoch": 0.57, + "grad_norm": 0.282588772331292, + "learning_rate": 0.00017172951584331565, + "loss": 0.9751, + "step": 5918 + }, + { + "epoch": 0.57, + "grad_norm": 0.2878741945844007, + "learning_rate": 0.0001717184924081273, + "loss": 1.0041, + "step": 5919 + }, + { + "epoch": 0.57, + "grad_norm": 0.3029923936191996, + "learning_rate": 0.00017170746717813854, + "loss": 1.1071, + "step": 5920 + }, + { + "epoch": 0.57, + "grad_norm": 0.27290844629846106, + "learning_rate": 0.0001716964401536254, + "loss": 0.8786, + "step": 5921 + }, + { + "epoch": 0.57, + "grad_norm": 0.31966494850943117, + "learning_rate": 0.00017168541133486377, + "loss": 1.1118, + "step": 5922 + }, + { + "epoch": 0.57, + "grad_norm": 0.2689850435328001, + "learning_rate": 0.00017167438072212968, + "loss": 1.1024, + "step": 5923 + }, + { + "epoch": 0.57, + "grad_norm": 0.27928731290969944, + "learning_rate": 0.00017166334831569916, + "loss": 1.1399, + "step": 5924 + }, + { + "epoch": 0.57, + "grad_norm": 0.29591002661910865, + "learning_rate": 0.00017165231411584827, + "loss": 1.1074, + "step": 5925 + }, + { + "epoch": 0.57, + "grad_norm": 0.3104516805825287, + "learning_rate": 0.00017164127812285324, + "loss": 1.0186, + "step": 5926 + }, + { + "epoch": 0.57, + "grad_norm": 0.3329888234877537, + "learning_rate": 0.00017163024033699017, + "loss": 1.1095, + "step": 5927 + }, + { + "epoch": 0.57, + "grad_norm": 0.27420800800981127, + "learning_rate": 0.00017161920075853534, + "loss": 1.1286, + "step": 5928 + }, + { + "epoch": 0.57, + "grad_norm": 0.2687683336478087, + "learning_rate": 0.000171608159387765, + "loss": 1.1067, + "step": 5929 + }, + { + "epoch": 0.57, + "grad_norm": 0.28848253158286347, + "learning_rate": 0.00017159711622495544, + "loss": 1.1874, + "step": 5930 + }, + { + "epoch": 0.57, + "grad_norm": 0.28239325672690274, + "learning_rate": 0.0001715860712703831, + "loss": 1.1561, + "step": 5931 + }, + { + "epoch": 0.57, + "grad_norm": 0.2918884091102849, + "learning_rate": 0.00017157502452432429, + "loss": 1.0429, + "step": 5932 + }, + { + "epoch": 0.57, + "grad_norm": 0.24790386180695678, + "learning_rate": 0.00017156397598705548, + "loss": 0.9923, + "step": 5933 + }, + { + "epoch": 0.57, + "grad_norm": 0.24570072442193652, + "learning_rate": 0.00017155292565885328, + "loss": 1.0945, + "step": 5934 + }, + { + "epoch": 0.57, + "grad_norm": 0.23125439331824077, + "learning_rate": 0.0001715418735399941, + "loss": 0.9957, + "step": 5935 + }, + { + "epoch": 0.57, + "grad_norm": 0.2554465249970479, + "learning_rate": 0.00017153081963075458, + "loss": 1.1284, + "step": 5936 + }, + { + "epoch": 0.57, + "grad_norm": 0.2794800525821183, + "learning_rate": 0.00017151976393141132, + "loss": 0.9941, + "step": 5937 + }, + { + "epoch": 0.57, + "grad_norm": 0.30723518305938063, + "learning_rate": 0.000171508706442241, + "loss": 1.1475, + "step": 5938 + }, + { + "epoch": 0.57, + "grad_norm": 0.30203130938607553, + "learning_rate": 0.00017149764716352045, + "loss": 1.0853, + "step": 5939 + }, + { + "epoch": 0.57, + "grad_norm": 0.260328670983559, + "learning_rate": 0.00017148658609552627, + "loss": 1.1423, + "step": 5940 + }, + { + "epoch": 0.57, + "grad_norm": 0.293768400385368, + "learning_rate": 0.00017147552323853538, + "loss": 1.0503, + "step": 5941 + }, + { + "epoch": 0.57, + "grad_norm": 0.26640544240980046, + "learning_rate": 0.00017146445859282457, + "loss": 1.0018, + "step": 5942 + }, + { + "epoch": 0.57, + "grad_norm": 0.3159499546692293, + "learning_rate": 0.00017145339215867078, + "loss": 1.0303, + "step": 5943 + }, + { + "epoch": 0.57, + "grad_norm": 0.25980174549043206, + "learning_rate": 0.00017144232393635094, + "loss": 1.1472, + "step": 5944 + }, + { + "epoch": 0.57, + "grad_norm": 0.2727359561908714, + "learning_rate": 0.00017143125392614207, + "loss": 1.1425, + "step": 5945 + }, + { + "epoch": 0.57, + "grad_norm": 0.28270017496530925, + "learning_rate": 0.0001714201821283212, + "loss": 1.1023, + "step": 5946 + }, + { + "epoch": 0.57, + "grad_norm": 0.299277933501389, + "learning_rate": 0.0001714091085431653, + "loss": 1.1593, + "step": 5947 + }, + { + "epoch": 0.57, + "grad_norm": 0.2762476973789334, + "learning_rate": 0.00017139803317095165, + "loss": 1.0942, + "step": 5948 + }, + { + "epoch": 0.57, + "grad_norm": 0.3120883708584473, + "learning_rate": 0.00017138695601195733, + "loss": 1.0574, + "step": 5949 + }, + { + "epoch": 0.57, + "grad_norm": 0.27024241469527543, + "learning_rate": 0.00017137587706645956, + "loss": 1.12, + "step": 5950 + }, + { + "epoch": 0.57, + "grad_norm": 0.2584604551495038, + "learning_rate": 0.00017136479633473562, + "loss": 1.0682, + "step": 5951 + }, + { + "epoch": 0.57, + "grad_norm": 0.2525657024101026, + "learning_rate": 0.0001713537138170628, + "loss": 1.1037, + "step": 5952 + }, + { + "epoch": 0.57, + "grad_norm": 0.28274675021766404, + "learning_rate": 0.00017134262951371842, + "loss": 1.0135, + "step": 5953 + }, + { + "epoch": 0.57, + "grad_norm": 0.2774062534984412, + "learning_rate": 0.00017133154342497995, + "loss": 0.9824, + "step": 5954 + }, + { + "epoch": 0.57, + "grad_norm": 0.2832539855963225, + "learning_rate": 0.00017132045555112474, + "loss": 0.9708, + "step": 5955 + }, + { + "epoch": 0.57, + "grad_norm": 0.24641803171377838, + "learning_rate": 0.0001713093658924303, + "loss": 0.9826, + "step": 5956 + }, + { + "epoch": 0.57, + "grad_norm": 0.29062056115142054, + "learning_rate": 0.0001712982744491742, + "loss": 0.9739, + "step": 5957 + }, + { + "epoch": 0.57, + "grad_norm": 0.2922884188711339, + "learning_rate": 0.00017128718122163395, + "loss": 0.9959, + "step": 5958 + }, + { + "epoch": 0.57, + "grad_norm": 0.26793724543335196, + "learning_rate": 0.00017127608621008718, + "loss": 1.0553, + "step": 5959 + }, + { + "epoch": 0.57, + "grad_norm": 0.276065664300244, + "learning_rate": 0.00017126498941481155, + "loss": 1.1004, + "step": 5960 + }, + { + "epoch": 0.57, + "grad_norm": 0.26445107517178895, + "learning_rate": 0.00017125389083608479, + "loss": 1.0136, + "step": 5961 + }, + { + "epoch": 0.57, + "grad_norm": 0.2724968692637002, + "learning_rate": 0.00017124279047418464, + "loss": 0.9554, + "step": 5962 + }, + { + "epoch": 0.57, + "grad_norm": 0.2638896257757664, + "learning_rate": 0.00017123168832938886, + "loss": 1.0249, + "step": 5963 + }, + { + "epoch": 0.57, + "grad_norm": 0.25113449381499126, + "learning_rate": 0.00017122058440197533, + "loss": 1.0309, + "step": 5964 + }, + { + "epoch": 0.57, + "grad_norm": 0.29069944989919566, + "learning_rate": 0.0001712094786922219, + "loss": 1.0919, + "step": 5965 + }, + { + "epoch": 0.57, + "grad_norm": 0.28312314556833795, + "learning_rate": 0.00017119837120040652, + "loss": 1.0403, + "step": 5966 + }, + { + "epoch": 0.57, + "grad_norm": 0.26755188643974503, + "learning_rate": 0.00017118726192680717, + "loss": 1.0553, + "step": 5967 + }, + { + "epoch": 0.57, + "grad_norm": 0.26486022885030275, + "learning_rate": 0.0001711761508717018, + "loss": 1.0489, + "step": 5968 + }, + { + "epoch": 0.57, + "grad_norm": 0.3020328536278636, + "learning_rate": 0.00017116503803536856, + "loss": 1.1584, + "step": 5969 + }, + { + "epoch": 0.57, + "grad_norm": 0.34335564008695746, + "learning_rate": 0.00017115392341808555, + "loss": 1.1107, + "step": 5970 + }, + { + "epoch": 0.57, + "grad_norm": 0.30126089791181193, + "learning_rate": 0.00017114280702013084, + "loss": 1.0434, + "step": 5971 + }, + { + "epoch": 0.57, + "grad_norm": 0.28087942592247517, + "learning_rate": 0.00017113168884178267, + "loss": 1.0781, + "step": 5972 + }, + { + "epoch": 0.57, + "grad_norm": 0.2728680461126566, + "learning_rate": 0.0001711205688833193, + "loss": 1.0257, + "step": 5973 + }, + { + "epoch": 0.57, + "grad_norm": 0.2985709605948755, + "learning_rate": 0.000171109447145019, + "loss": 1.1728, + "step": 5974 + }, + { + "epoch": 0.57, + "grad_norm": 0.3082359218373612, + "learning_rate": 0.0001710983236271601, + "loss": 1.0554, + "step": 5975 + }, + { + "epoch": 0.57, + "grad_norm": 0.2795284645154645, + "learning_rate": 0.00017108719833002094, + "loss": 1.0948, + "step": 5976 + }, + { + "epoch": 0.57, + "grad_norm": 0.28120362222029427, + "learning_rate": 0.00017107607125387998, + "loss": 0.9671, + "step": 5977 + }, + { + "epoch": 0.57, + "grad_norm": 0.2823788055491324, + "learning_rate": 0.00017106494239901566, + "loss": 1.0073, + "step": 5978 + }, + { + "epoch": 0.57, + "grad_norm": 0.31065181033843386, + "learning_rate": 0.00017105381176570652, + "loss": 1.1014, + "step": 5979 + }, + { + "epoch": 0.57, + "grad_norm": 0.30900428726246143, + "learning_rate": 0.00017104267935423107, + "loss": 1.0998, + "step": 5980 + }, + { + "epoch": 0.57, + "grad_norm": 0.30988350895081024, + "learning_rate": 0.0001710315451648679, + "loss": 1.0667, + "step": 5981 + }, + { + "epoch": 0.57, + "grad_norm": 0.2981798835140071, + "learning_rate": 0.0001710204091978957, + "loss": 1.065, + "step": 5982 + }, + { + "epoch": 0.57, + "grad_norm": 0.2732778988161561, + "learning_rate": 0.0001710092714535931, + "loss": 1.184, + "step": 5983 + }, + { + "epoch": 0.57, + "grad_norm": 0.27164434920682934, + "learning_rate": 0.00017099813193223887, + "loss": 1.0524, + "step": 5984 + }, + { + "epoch": 0.57, + "grad_norm": 0.29410498736707635, + "learning_rate": 0.00017098699063411178, + "loss": 1.0777, + "step": 5985 + }, + { + "epoch": 0.57, + "grad_norm": 0.28163842201092487, + "learning_rate": 0.0001709758475594906, + "loss": 1.0853, + "step": 5986 + }, + { + "epoch": 0.57, + "grad_norm": 0.27782202027455466, + "learning_rate": 0.00017096470270865427, + "loss": 1.0146, + "step": 5987 + }, + { + "epoch": 0.57, + "grad_norm": 0.2768790776720804, + "learning_rate": 0.0001709535560818816, + "loss": 1.1312, + "step": 5988 + }, + { + "epoch": 0.57, + "grad_norm": 0.2824780756717249, + "learning_rate": 0.00017094240767945166, + "loss": 1.059, + "step": 5989 + }, + { + "epoch": 0.57, + "grad_norm": 0.27033728590320183, + "learning_rate": 0.00017093125750164333, + "loss": 1.197, + "step": 5990 + }, + { + "epoch": 0.57, + "grad_norm": 0.3100948675967715, + "learning_rate": 0.00017092010554873574, + "loss": 1.1042, + "step": 5991 + }, + { + "epoch": 0.57, + "grad_norm": 0.28820788597927405, + "learning_rate": 0.0001709089518210079, + "loss": 1.1635, + "step": 5992 + }, + { + "epoch": 0.57, + "grad_norm": 0.28201539629588646, + "learning_rate": 0.00017089779631873904, + "loss": 0.9284, + "step": 5993 + }, + { + "epoch": 0.57, + "grad_norm": 0.27679231851615277, + "learning_rate": 0.0001708866390422082, + "loss": 1.0167, + "step": 5994 + }, + { + "epoch": 0.57, + "grad_norm": 0.27944884514951135, + "learning_rate": 0.0001708754799916947, + "loss": 1.139, + "step": 5995 + }, + { + "epoch": 0.57, + "grad_norm": 0.3002084383540107, + "learning_rate": 0.00017086431916747778, + "loss": 1.0454, + "step": 5996 + }, + { + "epoch": 0.57, + "grad_norm": 0.26816351694504426, + "learning_rate": 0.0001708531565698367, + "loss": 1.0773, + "step": 5997 + }, + { + "epoch": 0.57, + "grad_norm": 0.30990517015210073, + "learning_rate": 0.0001708419921990509, + "loss": 1.1122, + "step": 5998 + }, + { + "epoch": 0.57, + "grad_norm": 0.30574794780484754, + "learning_rate": 0.0001708308260553997, + "loss": 1.1214, + "step": 5999 + }, + { + "epoch": 0.57, + "grad_norm": 0.2690373353619774, + "learning_rate": 0.00017081965813916253, + "loss": 1.062, + "step": 6000 + }, + { + "epoch": 0.57, + "grad_norm": 0.27472519148333513, + "learning_rate": 0.00017080848845061892, + "loss": 1.0179, + "step": 6001 + }, + { + "epoch": 0.57, + "grad_norm": 0.27470990890182256, + "learning_rate": 0.0001707973169900484, + "loss": 1.0747, + "step": 6002 + }, + { + "epoch": 0.57, + "grad_norm": 0.29061144311513604, + "learning_rate": 0.00017078614375773052, + "loss": 1.1307, + "step": 6003 + }, + { + "epoch": 0.57, + "grad_norm": 0.2713055223809409, + "learning_rate": 0.00017077496875394493, + "loss": 1.0008, + "step": 6004 + }, + { + "epoch": 0.57, + "grad_norm": 0.2925514548076972, + "learning_rate": 0.00017076379197897122, + "loss": 1.1317, + "step": 6005 + }, + { + "epoch": 0.57, + "grad_norm": 0.31101769602212137, + "learning_rate": 0.00017075261343308916, + "loss": 1.1037, + "step": 6006 + }, + { + "epoch": 0.57, + "grad_norm": 0.2647833297981797, + "learning_rate": 0.00017074143311657852, + "loss": 1.0076, + "step": 6007 + }, + { + "epoch": 0.57, + "grad_norm": 0.3013454403605899, + "learning_rate": 0.00017073025102971903, + "loss": 1.0792, + "step": 6008 + }, + { + "epoch": 0.57, + "grad_norm": 0.263491300792746, + "learning_rate": 0.00017071906717279053, + "loss": 0.9797, + "step": 6009 + }, + { + "epoch": 0.57, + "grad_norm": 0.2724539339009908, + "learning_rate": 0.00017070788154607293, + "loss": 0.9805, + "step": 6010 + }, + { + "epoch": 0.58, + "grad_norm": 0.31840312141141264, + "learning_rate": 0.00017069669414984618, + "loss": 1.142, + "step": 6011 + }, + { + "epoch": 0.58, + "grad_norm": 0.2983653164774024, + "learning_rate": 0.00017068550498439025, + "loss": 1.103, + "step": 6012 + }, + { + "epoch": 0.58, + "grad_norm": 0.282150969685538, + "learning_rate": 0.00017067431404998507, + "loss": 1.0682, + "step": 6013 + }, + { + "epoch": 0.58, + "grad_norm": 0.2771901829735762, + "learning_rate": 0.00017066312134691083, + "loss": 1.009, + "step": 6014 + }, + { + "epoch": 0.58, + "grad_norm": 0.2933084300533384, + "learning_rate": 0.00017065192687544753, + "loss": 1.102, + "step": 6015 + }, + { + "epoch": 0.58, + "grad_norm": 0.27679497108587947, + "learning_rate": 0.00017064073063587535, + "loss": 1.1393, + "step": 6016 + }, + { + "epoch": 0.58, + "grad_norm": 0.28895991711416996, + "learning_rate": 0.00017062953262847455, + "loss": 1.0694, + "step": 6017 + }, + { + "epoch": 0.58, + "grad_norm": 0.27823308988900375, + "learning_rate": 0.00017061833285352527, + "loss": 1.0905, + "step": 6018 + }, + { + "epoch": 0.58, + "grad_norm": 0.2748970724583933, + "learning_rate": 0.00017060713131130778, + "loss": 1.1278, + "step": 6019 + }, + { + "epoch": 0.58, + "grad_norm": 0.2730226539296915, + "learning_rate": 0.00017059592800210252, + "loss": 1.0858, + "step": 6020 + }, + { + "epoch": 0.58, + "grad_norm": 0.271328807355675, + "learning_rate": 0.00017058472292618977, + "loss": 0.9972, + "step": 6021 + }, + { + "epoch": 0.58, + "grad_norm": 0.27544958968420746, + "learning_rate": 0.00017057351608384995, + "loss": 1.1456, + "step": 6022 + }, + { + "epoch": 0.58, + "grad_norm": 0.2833122301378086, + "learning_rate": 0.00017056230747536355, + "loss": 1.1727, + "step": 6023 + }, + { + "epoch": 0.58, + "grad_norm": 0.2894626470697644, + "learning_rate": 0.00017055109710101108, + "loss": 1.2121, + "step": 6024 + }, + { + "epoch": 0.58, + "grad_norm": 0.3306470002946546, + "learning_rate": 0.00017053988496107305, + "loss": 1.0776, + "step": 6025 + }, + { + "epoch": 0.58, + "grad_norm": 0.27472508074504254, + "learning_rate": 0.00017052867105583005, + "loss": 0.973, + "step": 6026 + }, + { + "epoch": 0.58, + "grad_norm": 0.3021983840922597, + "learning_rate": 0.00017051745538556278, + "loss": 1.0972, + "step": 6027 + }, + { + "epoch": 0.58, + "grad_norm": 0.2970721247856769, + "learning_rate": 0.0001705062379505518, + "loss": 1.1562, + "step": 6028 + }, + { + "epoch": 0.58, + "grad_norm": 0.2830455403468363, + "learning_rate": 0.00017049501875107795, + "loss": 0.9997, + "step": 6029 + }, + { + "epoch": 0.58, + "grad_norm": 0.28710721053351523, + "learning_rate": 0.00017048379778742193, + "loss": 1.1453, + "step": 6030 + }, + { + "epoch": 0.58, + "grad_norm": 0.27478245168556914, + "learning_rate": 0.00017047257505986457, + "loss": 1.0092, + "step": 6031 + }, + { + "epoch": 0.58, + "grad_norm": 0.26896422201399867, + "learning_rate": 0.00017046135056868677, + "loss": 1.0855, + "step": 6032 + }, + { + "epoch": 0.58, + "grad_norm": 0.24707636757742915, + "learning_rate": 0.00017045012431416936, + "loss": 1.0971, + "step": 6033 + }, + { + "epoch": 0.58, + "grad_norm": 0.2493183820340796, + "learning_rate": 0.0001704388962965933, + "loss": 0.9823, + "step": 6034 + }, + { + "epoch": 0.58, + "grad_norm": 0.2805800589880677, + "learning_rate": 0.00017042766651623962, + "loss": 1.0523, + "step": 6035 + }, + { + "epoch": 0.58, + "grad_norm": 0.3038587968507171, + "learning_rate": 0.00017041643497338931, + "loss": 1.0162, + "step": 6036 + }, + { + "epoch": 0.58, + "grad_norm": 0.30638016511222943, + "learning_rate": 0.00017040520166832344, + "loss": 1.0421, + "step": 6037 + }, + { + "epoch": 0.58, + "grad_norm": 0.26320036877135733, + "learning_rate": 0.00017039396660132317, + "loss": 0.946, + "step": 6038 + }, + { + "epoch": 0.58, + "grad_norm": 0.2754763891016706, + "learning_rate": 0.00017038272977266966, + "loss": 1.0652, + "step": 6039 + }, + { + "epoch": 0.58, + "grad_norm": 0.26134384808972805, + "learning_rate": 0.0001703714911826441, + "loss": 1.1064, + "step": 6040 + }, + { + "epoch": 0.58, + "grad_norm": 0.2651686121986517, + "learning_rate": 0.0001703602508315277, + "loss": 1.1166, + "step": 6041 + }, + { + "epoch": 0.58, + "grad_norm": 0.25447003550809477, + "learning_rate": 0.00017034900871960184, + "loss": 1.0199, + "step": 6042 + }, + { + "epoch": 0.58, + "grad_norm": 0.28285744813232555, + "learning_rate": 0.0001703377648471478, + "loss": 1.0086, + "step": 6043 + }, + { + "epoch": 0.58, + "grad_norm": 0.2651737161168531, + "learning_rate": 0.000170326519214447, + "loss": 1.0729, + "step": 6044 + }, + { + "epoch": 0.58, + "grad_norm": 0.2855727067410992, + "learning_rate": 0.00017031527182178092, + "loss": 1.0919, + "step": 6045 + }, + { + "epoch": 0.58, + "grad_norm": 0.26177841865296425, + "learning_rate": 0.0001703040226694309, + "loss": 0.9408, + "step": 6046 + }, + { + "epoch": 0.58, + "grad_norm": 0.23691819299378247, + "learning_rate": 0.00017029277175767854, + "loss": 1.0849, + "step": 6047 + }, + { + "epoch": 0.58, + "grad_norm": 0.2511952002294269, + "learning_rate": 0.0001702815190868054, + "loss": 1.0779, + "step": 6048 + }, + { + "epoch": 0.58, + "grad_norm": 0.29110924767645496, + "learning_rate": 0.00017027026465709307, + "loss": 0.9933, + "step": 6049 + }, + { + "epoch": 0.58, + "grad_norm": 0.2830264573548182, + "learning_rate": 0.00017025900846882321, + "loss": 1.0192, + "step": 6050 + }, + { + "epoch": 0.58, + "grad_norm": 0.28455741758111286, + "learning_rate": 0.00017024775052227752, + "loss": 1.0588, + "step": 6051 + }, + { + "epoch": 0.58, + "grad_norm": 0.28932106749302483, + "learning_rate": 0.0001702364908177377, + "loss": 1.1211, + "step": 6052 + }, + { + "epoch": 0.58, + "grad_norm": 0.3307206565435221, + "learning_rate": 0.00017022522935548554, + "loss": 1.0975, + "step": 6053 + }, + { + "epoch": 0.58, + "grad_norm": 0.25352957645066343, + "learning_rate": 0.0001702139661358029, + "loss": 1.0298, + "step": 6054 + }, + { + "epoch": 0.58, + "grad_norm": 0.2572310440026188, + "learning_rate": 0.00017020270115897164, + "loss": 1.0728, + "step": 6055 + }, + { + "epoch": 0.58, + "grad_norm": 0.3172532579427463, + "learning_rate": 0.00017019143442527365, + "loss": 1.1023, + "step": 6056 + }, + { + "epoch": 0.58, + "grad_norm": 0.282565292417829, + "learning_rate": 0.0001701801659349909, + "loss": 1.0488, + "step": 6057 + }, + { + "epoch": 0.58, + "grad_norm": 0.275135543466168, + "learning_rate": 0.00017016889568840542, + "loss": 1.1721, + "step": 6058 + }, + { + "epoch": 0.58, + "grad_norm": 0.26182321188521385, + "learning_rate": 0.00017015762368579918, + "loss": 1.0598, + "step": 6059 + }, + { + "epoch": 0.58, + "grad_norm": 0.26125752561442744, + "learning_rate": 0.00017014634992745434, + "loss": 1.1684, + "step": 6060 + }, + { + "epoch": 0.58, + "grad_norm": 0.27290128374782624, + "learning_rate": 0.000170135074413653, + "loss": 0.9901, + "step": 6061 + }, + { + "epoch": 0.58, + "grad_norm": 0.2653240038158296, + "learning_rate": 0.00017012379714467736, + "loss": 1.1086, + "step": 6062 + }, + { + "epoch": 0.58, + "grad_norm": 0.3293682851680674, + "learning_rate": 0.0001701125181208096, + "loss": 1.117, + "step": 6063 + }, + { + "epoch": 0.58, + "grad_norm": 0.2329956724588023, + "learning_rate": 0.00017010123734233204, + "loss": 1.0925, + "step": 6064 + }, + { + "epoch": 0.58, + "grad_norm": 0.2755426850023404, + "learning_rate": 0.00017008995480952694, + "loss": 1.1292, + "step": 6065 + }, + { + "epoch": 0.58, + "grad_norm": 0.29650880006845154, + "learning_rate": 0.00017007867052267666, + "loss": 1.0334, + "step": 6066 + }, + { + "epoch": 0.58, + "grad_norm": 0.2803458685525201, + "learning_rate": 0.00017006738448206363, + "loss": 1.0168, + "step": 6067 + }, + { + "epoch": 0.58, + "grad_norm": 0.2916272901141261, + "learning_rate": 0.00017005609668797024, + "loss": 1.0039, + "step": 6068 + }, + { + "epoch": 0.58, + "grad_norm": 0.2691648334575106, + "learning_rate": 0.00017004480714067903, + "loss": 1.0727, + "step": 6069 + }, + { + "epoch": 0.58, + "grad_norm": 0.3270372768288688, + "learning_rate": 0.00017003351584047249, + "loss": 1.1879, + "step": 6070 + }, + { + "epoch": 0.58, + "grad_norm": 0.2782777828019017, + "learning_rate": 0.0001700222227876332, + "loss": 1.0006, + "step": 6071 + }, + { + "epoch": 0.58, + "grad_norm": 0.2661246430524472, + "learning_rate": 0.00017001092798244377, + "loss": 1.0393, + "step": 6072 + }, + { + "epoch": 0.58, + "grad_norm": 0.281715885539005, + "learning_rate": 0.00016999963142518687, + "loss": 1.1322, + "step": 6073 + }, + { + "epoch": 0.58, + "grad_norm": 0.2986783823949029, + "learning_rate": 0.0001699883331161452, + "loss": 0.9984, + "step": 6074 + }, + { + "epoch": 0.58, + "grad_norm": 0.28065012498362923, + "learning_rate": 0.00016997703305560153, + "loss": 1.06, + "step": 6075 + }, + { + "epoch": 0.58, + "grad_norm": 0.3292647162053623, + "learning_rate": 0.00016996573124383862, + "loss": 1.075, + "step": 6076 + }, + { + "epoch": 0.58, + "grad_norm": 0.3073957042018125, + "learning_rate": 0.0001699544276811393, + "loss": 1.1219, + "step": 6077 + }, + { + "epoch": 0.58, + "grad_norm": 0.3076254806022189, + "learning_rate": 0.00016994312236778646, + "loss": 1.1214, + "step": 6078 + }, + { + "epoch": 0.58, + "grad_norm": 0.24563100316558975, + "learning_rate": 0.00016993181530406304, + "loss": 1.0868, + "step": 6079 + }, + { + "epoch": 0.58, + "grad_norm": 0.32781029130768446, + "learning_rate": 0.00016992050649025197, + "loss": 1.0481, + "step": 6080 + }, + { + "epoch": 0.58, + "grad_norm": 0.27854015122577225, + "learning_rate": 0.0001699091959266363, + "loss": 1.0749, + "step": 6081 + }, + { + "epoch": 0.58, + "grad_norm": 0.2844546319680473, + "learning_rate": 0.00016989788361349906, + "loss": 1.0692, + "step": 6082 + }, + { + "epoch": 0.58, + "grad_norm": 0.26692237610439323, + "learning_rate": 0.00016988656955112337, + "loss": 1.1151, + "step": 6083 + }, + { + "epoch": 0.58, + "grad_norm": 0.26269204364410026, + "learning_rate": 0.00016987525373979233, + "loss": 0.9348, + "step": 6084 + }, + { + "epoch": 0.58, + "grad_norm": 0.27306822357867006, + "learning_rate": 0.00016986393617978918, + "loss": 1.2159, + "step": 6085 + }, + { + "epoch": 0.58, + "grad_norm": 0.2827921168255787, + "learning_rate": 0.0001698526168713971, + "loss": 0.9376, + "step": 6086 + }, + { + "epoch": 0.58, + "grad_norm": 0.23947335215371623, + "learning_rate": 0.00016984129581489935, + "loss": 1.0817, + "step": 6087 + }, + { + "epoch": 0.58, + "grad_norm": 0.3343810073022071, + "learning_rate": 0.0001698299730105793, + "loss": 1.1555, + "step": 6088 + }, + { + "epoch": 0.58, + "grad_norm": 0.26515752282559285, + "learning_rate": 0.00016981864845872033, + "loss": 1.1123, + "step": 6089 + }, + { + "epoch": 0.58, + "grad_norm": 0.27681937200219026, + "learning_rate": 0.00016980732215960575, + "loss": 1.0367, + "step": 6090 + }, + { + "epoch": 0.58, + "grad_norm": 0.2519010649566014, + "learning_rate": 0.0001697959941135191, + "loss": 1.0861, + "step": 6091 + }, + { + "epoch": 0.58, + "grad_norm": 0.2704443985177932, + "learning_rate": 0.00016978466432074381, + "loss": 1.0398, + "step": 6092 + }, + { + "epoch": 0.58, + "grad_norm": 0.28884051221369256, + "learning_rate": 0.00016977333278156347, + "loss": 1.0501, + "step": 6093 + }, + { + "epoch": 0.58, + "grad_norm": 0.2749924386950333, + "learning_rate": 0.0001697619994962616, + "loss": 1.0743, + "step": 6094 + }, + { + "epoch": 0.58, + "grad_norm": 0.279690239046547, + "learning_rate": 0.00016975066446512185, + "loss": 1.1504, + "step": 6095 + }, + { + "epoch": 0.58, + "grad_norm": 0.24616630543118756, + "learning_rate": 0.00016973932768842787, + "loss": 1.1121, + "step": 6096 + }, + { + "epoch": 0.58, + "grad_norm": 0.25087990953142, + "learning_rate": 0.00016972798916646336, + "loss": 1.0454, + "step": 6097 + }, + { + "epoch": 0.58, + "grad_norm": 0.2766189199572125, + "learning_rate": 0.00016971664889951215, + "loss": 1.0582, + "step": 6098 + }, + { + "epoch": 0.58, + "grad_norm": 0.28980752883287725, + "learning_rate": 0.00016970530688785798, + "loss": 1.1664, + "step": 6099 + }, + { + "epoch": 0.58, + "grad_norm": 0.334844857943314, + "learning_rate": 0.0001696939631317847, + "loss": 1.1833, + "step": 6100 + }, + { + "epoch": 0.58, + "grad_norm": 0.2923718680275688, + "learning_rate": 0.0001696826176315762, + "loss": 1.063, + "step": 6101 + }, + { + "epoch": 0.58, + "grad_norm": 0.2927999981743822, + "learning_rate": 0.00016967127038751637, + "loss": 1.0568, + "step": 6102 + }, + { + "epoch": 0.58, + "grad_norm": 0.30725008382724595, + "learning_rate": 0.0001696599213998892, + "loss": 1.113, + "step": 6103 + }, + { + "epoch": 0.58, + "grad_norm": 0.2864392247154817, + "learning_rate": 0.00016964857066897876, + "loss": 1.0959, + "step": 6104 + }, + { + "epoch": 0.58, + "grad_norm": 0.29730179190891975, + "learning_rate": 0.00016963721819506904, + "loss": 1.0681, + "step": 6105 + }, + { + "epoch": 0.58, + "grad_norm": 0.2338409227151746, + "learning_rate": 0.00016962586397844417, + "loss": 0.9203, + "step": 6106 + }, + { + "epoch": 0.58, + "grad_norm": 0.30140296809951767, + "learning_rate": 0.0001696145080193883, + "loss": 1.1501, + "step": 6107 + }, + { + "epoch": 0.58, + "grad_norm": 0.29957908772250824, + "learning_rate": 0.00016960315031818563, + "loss": 0.9797, + "step": 6108 + }, + { + "epoch": 0.58, + "grad_norm": 0.2703387299476374, + "learning_rate": 0.00016959179087512038, + "loss": 1.1028, + "step": 6109 + }, + { + "epoch": 0.58, + "grad_norm": 0.24345511227668115, + "learning_rate": 0.0001695804296904768, + "loss": 1.1393, + "step": 6110 + }, + { + "epoch": 0.58, + "grad_norm": 0.28466641760021855, + "learning_rate": 0.00016956906676453927, + "loss": 1.0691, + "step": 6111 + }, + { + "epoch": 0.58, + "grad_norm": 0.26942292513037813, + "learning_rate": 0.00016955770209759206, + "loss": 1.1351, + "step": 6112 + }, + { + "epoch": 0.58, + "grad_norm": 0.2432370590626464, + "learning_rate": 0.0001695463356899197, + "loss": 1.111, + "step": 6113 + }, + { + "epoch": 0.58, + "grad_norm": 0.2768776116799371, + "learning_rate": 0.00016953496754180657, + "loss": 1.0402, + "step": 6114 + }, + { + "epoch": 0.59, + "grad_norm": 0.2782542666733751, + "learning_rate": 0.00016952359765353716, + "loss": 1.016, + "step": 6115 + }, + { + "epoch": 0.59, + "grad_norm": 0.2606113670203369, + "learning_rate": 0.00016951222602539604, + "loss": 1.1145, + "step": 6116 + }, + { + "epoch": 0.59, + "grad_norm": 0.28152618216858916, + "learning_rate": 0.00016950085265766775, + "loss": 1.0144, + "step": 6117 + }, + { + "epoch": 0.59, + "grad_norm": 0.2762557373966403, + "learning_rate": 0.000169489477550637, + "loss": 1.0188, + "step": 6118 + }, + { + "epoch": 0.59, + "grad_norm": 0.27464396847770955, + "learning_rate": 0.00016947810070458836, + "loss": 1.0246, + "step": 6119 + }, + { + "epoch": 0.59, + "grad_norm": 0.2553714383261195, + "learning_rate": 0.00016946672211980656, + "loss": 1.0676, + "step": 6120 + }, + { + "epoch": 0.59, + "grad_norm": 0.2663697635305504, + "learning_rate": 0.00016945534179657642, + "loss": 1.0607, + "step": 6121 + }, + { + "epoch": 0.59, + "grad_norm": 0.2724085363177121, + "learning_rate": 0.00016944395973518273, + "loss": 1.0022, + "step": 6122 + }, + { + "epoch": 0.59, + "grad_norm": 0.2830680768080686, + "learning_rate": 0.00016943257593591025, + "loss": 1.1051, + "step": 6123 + }, + { + "epoch": 0.59, + "grad_norm": 0.28224872460306244, + "learning_rate": 0.00016942119039904392, + "loss": 1.066, + "step": 6124 + }, + { + "epoch": 0.59, + "grad_norm": 0.26377009917280125, + "learning_rate": 0.0001694098031248687, + "loss": 1.0324, + "step": 6125 + }, + { + "epoch": 0.59, + "grad_norm": 0.2718917219979864, + "learning_rate": 0.0001693984141136695, + "loss": 1.0499, + "step": 6126 + }, + { + "epoch": 0.59, + "grad_norm": 0.259809210248362, + "learning_rate": 0.0001693870233657314, + "loss": 1.0218, + "step": 6127 + }, + { + "epoch": 0.59, + "grad_norm": 0.2757737836543942, + "learning_rate": 0.00016937563088133942, + "loss": 1.0728, + "step": 6128 + }, + { + "epoch": 0.59, + "grad_norm": 0.3068744212914902, + "learning_rate": 0.0001693642366607787, + "loss": 1.0689, + "step": 6129 + }, + { + "epoch": 0.59, + "grad_norm": 0.2559474725346445, + "learning_rate": 0.00016935284070433436, + "loss": 0.9259, + "step": 6130 + }, + { + "epoch": 0.59, + "grad_norm": 0.2889397480153887, + "learning_rate": 0.00016934144301229155, + "loss": 1.1747, + "step": 6131 + }, + { + "epoch": 0.59, + "grad_norm": 0.3088905871334589, + "learning_rate": 0.0001693300435849356, + "loss": 1.172, + "step": 6132 + }, + { + "epoch": 0.59, + "grad_norm": 0.30471384991519385, + "learning_rate": 0.00016931864242255171, + "loss": 1.0735, + "step": 6133 + }, + { + "epoch": 0.59, + "grad_norm": 0.5872459177211959, + "learning_rate": 0.00016930723952542523, + "loss": 0.9714, + "step": 6134 + }, + { + "epoch": 0.59, + "grad_norm": 0.26880571869976677, + "learning_rate": 0.0001692958348938415, + "loss": 1.0637, + "step": 6135 + }, + { + "epoch": 0.59, + "grad_norm": 0.2842892019706417, + "learning_rate": 0.000169284428528086, + "loss": 1.0986, + "step": 6136 + }, + { + "epoch": 0.59, + "grad_norm": 0.3139992368614553, + "learning_rate": 0.0001692730204284441, + "loss": 1.0542, + "step": 6137 + }, + { + "epoch": 0.59, + "grad_norm": 0.2939973190730885, + "learning_rate": 0.00016926161059520133, + "loss": 1.018, + "step": 6138 + }, + { + "epoch": 0.59, + "grad_norm": 0.2745235711764705, + "learning_rate": 0.00016925019902864325, + "loss": 1.0936, + "step": 6139 + }, + { + "epoch": 0.59, + "grad_norm": 0.295538764046495, + "learning_rate": 0.0001692387857290554, + "loss": 1.238, + "step": 6140 + }, + { + "epoch": 0.59, + "grad_norm": 0.2918293376304929, + "learning_rate": 0.00016922737069672344, + "loss": 1.0324, + "step": 6141 + }, + { + "epoch": 0.59, + "grad_norm": 0.2661937879474406, + "learning_rate": 0.000169215953931933, + "loss": 1.1461, + "step": 6142 + }, + { + "epoch": 0.59, + "grad_norm": 0.29698439715428343, + "learning_rate": 0.0001692045354349698, + "loss": 1.0072, + "step": 6143 + }, + { + "epoch": 0.59, + "grad_norm": 0.2841385747639344, + "learning_rate": 0.0001691931152061196, + "loss": 1.0169, + "step": 6144 + }, + { + "epoch": 0.59, + "grad_norm": 0.2525826472239577, + "learning_rate": 0.00016918169324566825, + "loss": 1.0821, + "step": 6145 + }, + { + "epoch": 0.59, + "grad_norm": 0.30426659332063666, + "learning_rate": 0.0001691702695539015, + "loss": 1.0161, + "step": 6146 + }, + { + "epoch": 0.59, + "grad_norm": 0.3247256246142582, + "learning_rate": 0.00016915884413110536, + "loss": 1.0768, + "step": 6147 + }, + { + "epoch": 0.59, + "grad_norm": 0.28154171353151247, + "learning_rate": 0.0001691474169775656, + "loss": 1.0462, + "step": 6148 + }, + { + "epoch": 0.59, + "grad_norm": 0.32206989451215157, + "learning_rate": 0.0001691359880935683, + "loss": 1.0335, + "step": 6149 + }, + { + "epoch": 0.59, + "grad_norm": 0.29975094798814433, + "learning_rate": 0.00016912455747939946, + "loss": 1.08, + "step": 6150 + }, + { + "epoch": 0.59, + "grad_norm": 0.2940671992246637, + "learning_rate": 0.0001691131251353451, + "loss": 1.0279, + "step": 6151 + }, + { + "epoch": 0.59, + "grad_norm": 0.30973218098094907, + "learning_rate": 0.00016910169106169138, + "loss": 1.1113, + "step": 6152 + }, + { + "epoch": 0.59, + "grad_norm": 0.2985525692876313, + "learning_rate": 0.0001690902552587244, + "loss": 1.0637, + "step": 6153 + }, + { + "epoch": 0.59, + "grad_norm": 0.2539451576750145, + "learning_rate": 0.00016907881772673032, + "loss": 1.0424, + "step": 6154 + }, + { + "epoch": 0.59, + "grad_norm": 0.2687758329509323, + "learning_rate": 0.00016906737846599548, + "loss": 1.0618, + "step": 6155 + }, + { + "epoch": 0.59, + "grad_norm": 0.27971131955326073, + "learning_rate": 0.00016905593747680602, + "loss": 1.0893, + "step": 6156 + }, + { + "epoch": 0.59, + "grad_norm": 0.2543125383692707, + "learning_rate": 0.00016904449475944837, + "loss": 1.1021, + "step": 6157 + }, + { + "epoch": 0.59, + "grad_norm": 0.2789446404233202, + "learning_rate": 0.00016903305031420885, + "loss": 1.1103, + "step": 6158 + }, + { + "epoch": 0.59, + "grad_norm": 0.28334066222200344, + "learning_rate": 0.00016902160414137383, + "loss": 1.065, + "step": 6159 + }, + { + "epoch": 0.59, + "grad_norm": 0.2358577232682755, + "learning_rate": 0.0001690101562412298, + "loss": 1.021, + "step": 6160 + }, + { + "epoch": 0.59, + "grad_norm": 0.27559840264421875, + "learning_rate": 0.00016899870661406325, + "loss": 1.1527, + "step": 6161 + }, + { + "epoch": 0.59, + "grad_norm": 0.30571401380369834, + "learning_rate": 0.0001689872552601607, + "loss": 1.1118, + "step": 6162 + }, + { + "epoch": 0.59, + "grad_norm": 0.29488378034589946, + "learning_rate": 0.00016897580217980872, + "loss": 1.1922, + "step": 6163 + }, + { + "epoch": 0.59, + "grad_norm": 0.31253654893817673, + "learning_rate": 0.00016896434737329394, + "loss": 0.9932, + "step": 6164 + }, + { + "epoch": 0.59, + "grad_norm": 0.271362147934014, + "learning_rate": 0.00016895289084090304, + "loss": 1.1036, + "step": 6165 + }, + { + "epoch": 0.59, + "grad_norm": 0.2703212529224629, + "learning_rate": 0.0001689414325829227, + "loss": 1.1416, + "step": 6166 + }, + { + "epoch": 0.59, + "grad_norm": 0.2824957796659181, + "learning_rate": 0.0001689299725996397, + "loss": 1.0145, + "step": 6167 + }, + { + "epoch": 0.59, + "grad_norm": 0.2976711651462754, + "learning_rate": 0.00016891851089134079, + "loss": 1.0386, + "step": 6168 + }, + { + "epoch": 0.59, + "grad_norm": 0.26975195101683597, + "learning_rate": 0.00016890704745831282, + "loss": 1.0879, + "step": 6169 + }, + { + "epoch": 0.59, + "grad_norm": 0.2643048862785792, + "learning_rate": 0.00016889558230084273, + "loss": 1.0786, + "step": 6170 + }, + { + "epoch": 0.59, + "grad_norm": 0.2717495195222049, + "learning_rate": 0.00016888411541921735, + "loss": 1.1159, + "step": 6171 + }, + { + "epoch": 0.59, + "grad_norm": 0.2833620530227191, + "learning_rate": 0.0001688726468137237, + "loss": 1.097, + "step": 6172 + }, + { + "epoch": 0.59, + "grad_norm": 0.25257033358758985, + "learning_rate": 0.00016886117648464878, + "loss": 1.0967, + "step": 6173 + }, + { + "epoch": 0.59, + "grad_norm": 0.27530661316906846, + "learning_rate": 0.00016884970443227965, + "loss": 1.0395, + "step": 6174 + }, + { + "epoch": 0.59, + "grad_norm": 0.2527678120212698, + "learning_rate": 0.00016883823065690337, + "loss": 1.0305, + "step": 6175 + }, + { + "epoch": 0.59, + "grad_norm": 0.2574184683768036, + "learning_rate": 0.00016882675515880714, + "loss": 1.0878, + "step": 6176 + }, + { + "epoch": 0.59, + "grad_norm": 0.25761118308699854, + "learning_rate": 0.00016881527793827808, + "loss": 0.9574, + "step": 6177 + }, + { + "epoch": 0.59, + "grad_norm": 0.27107127416113136, + "learning_rate": 0.00016880379899560346, + "loss": 1.1092, + "step": 6178 + }, + { + "epoch": 0.59, + "grad_norm": 0.2842811642803488, + "learning_rate": 0.0001687923183310705, + "loss": 1.1637, + "step": 6179 + }, + { + "epoch": 0.59, + "grad_norm": 0.2728440826452907, + "learning_rate": 0.00016878083594496656, + "loss": 0.9771, + "step": 6180 + }, + { + "epoch": 0.59, + "grad_norm": 0.2886790568351847, + "learning_rate": 0.000168769351837579, + "loss": 0.9462, + "step": 6181 + }, + { + "epoch": 0.59, + "grad_norm": 0.25378061719229456, + "learning_rate": 0.00016875786600919514, + "loss": 1.0195, + "step": 6182 + }, + { + "epoch": 0.59, + "grad_norm": 0.2942815181656749, + "learning_rate": 0.00016874637846010248, + "loss": 1.1533, + "step": 6183 + }, + { + "epoch": 0.59, + "grad_norm": 0.26947359910559543, + "learning_rate": 0.00016873488919058854, + "loss": 0.9764, + "step": 6184 + }, + { + "epoch": 0.59, + "grad_norm": 0.2997412184794112, + "learning_rate": 0.00016872339820094074, + "loss": 1.0704, + "step": 6185 + }, + { + "epoch": 0.59, + "grad_norm": 0.241116336668137, + "learning_rate": 0.00016871190549144673, + "loss": 1.0213, + "step": 6186 + }, + { + "epoch": 0.59, + "grad_norm": 0.29798499768309145, + "learning_rate": 0.00016870041106239412, + "loss": 1.0051, + "step": 6187 + }, + { + "epoch": 0.59, + "grad_norm": 0.28286155546938635, + "learning_rate": 0.00016868891491407054, + "loss": 1.1061, + "step": 6188 + }, + { + "epoch": 0.59, + "grad_norm": 0.24398969837876033, + "learning_rate": 0.00016867741704676368, + "loss": 1.0401, + "step": 6189 + }, + { + "epoch": 0.59, + "grad_norm": 0.28740259957474623, + "learning_rate": 0.0001686659174607613, + "loss": 1.1156, + "step": 6190 + }, + { + "epoch": 0.59, + "grad_norm": 0.2892058554716867, + "learning_rate": 0.0001686544161563512, + "loss": 1.0922, + "step": 6191 + }, + { + "epoch": 0.59, + "grad_norm": 0.31955410366097675, + "learning_rate": 0.00016864291313382115, + "loss": 1.0641, + "step": 6192 + }, + { + "epoch": 0.59, + "grad_norm": 0.2794578951295337, + "learning_rate": 0.00016863140839345908, + "loss": 1.0945, + "step": 6193 + }, + { + "epoch": 0.59, + "grad_norm": 0.30665149624277827, + "learning_rate": 0.00016861990193555292, + "loss": 1.0357, + "step": 6194 + }, + { + "epoch": 0.59, + "grad_norm": 0.27440411091642547, + "learning_rate": 0.00016860839376039053, + "loss": 1.2011, + "step": 6195 + }, + { + "epoch": 0.59, + "grad_norm": 0.2817725535852179, + "learning_rate": 0.00016859688386826, + "loss": 1.0482, + "step": 6196 + }, + { + "epoch": 0.59, + "grad_norm": 0.3011982191792621, + "learning_rate": 0.0001685853722594493, + "loss": 1.0853, + "step": 6197 + }, + { + "epoch": 0.59, + "grad_norm": 0.29959655201744034, + "learning_rate": 0.00016857385893424658, + "loss": 1.0111, + "step": 6198 + }, + { + "epoch": 0.59, + "grad_norm": 0.2802735304530325, + "learning_rate": 0.00016856234389293995, + "loss": 1.1118, + "step": 6199 + }, + { + "epoch": 0.59, + "grad_norm": 0.30399590914216984, + "learning_rate": 0.00016855082713581758, + "loss": 1.0576, + "step": 6200 + }, + { + "epoch": 0.59, + "grad_norm": 0.28089634625081406, + "learning_rate": 0.00016853930866316766, + "loss": 1.1154, + "step": 6201 + }, + { + "epoch": 0.59, + "grad_norm": 0.33064903413133756, + "learning_rate": 0.00016852778847527847, + "loss": 1.0004, + "step": 6202 + }, + { + "epoch": 0.59, + "grad_norm": 0.28719188673845863, + "learning_rate": 0.00016851626657243834, + "loss": 1.0599, + "step": 6203 + }, + { + "epoch": 0.59, + "grad_norm": 0.2562638224677485, + "learning_rate": 0.0001685047429549355, + "loss": 1.0726, + "step": 6204 + }, + { + "epoch": 0.59, + "grad_norm": 0.27769483083836394, + "learning_rate": 0.00016849321762305846, + "loss": 1.0452, + "step": 6205 + }, + { + "epoch": 0.59, + "grad_norm": 0.311376243880295, + "learning_rate": 0.00016848169057709562, + "loss": 1.1139, + "step": 6206 + }, + { + "epoch": 0.59, + "grad_norm": 0.33186169126683296, + "learning_rate": 0.00016847016181733543, + "loss": 1.1708, + "step": 6207 + }, + { + "epoch": 0.59, + "grad_norm": 0.31561407579512624, + "learning_rate": 0.0001684586313440664, + "loss": 1.0678, + "step": 6208 + }, + { + "epoch": 0.59, + "grad_norm": 0.2907024924078061, + "learning_rate": 0.00016844709915757707, + "loss": 1.0885, + "step": 6209 + }, + { + "epoch": 0.59, + "grad_norm": 0.29061404968369564, + "learning_rate": 0.0001684355652581561, + "loss": 1.0829, + "step": 6210 + }, + { + "epoch": 0.59, + "grad_norm": 0.2379708775530534, + "learning_rate": 0.00016842402964609209, + "loss": 1.0688, + "step": 6211 + }, + { + "epoch": 0.59, + "grad_norm": 0.278600564728756, + "learning_rate": 0.00016841249232167372, + "loss": 0.975, + "step": 6212 + }, + { + "epoch": 0.59, + "grad_norm": 0.2995894305662367, + "learning_rate": 0.00016840095328518975, + "loss": 1.0657, + "step": 6213 + }, + { + "epoch": 0.59, + "grad_norm": 0.28752497877593164, + "learning_rate": 0.00016838941253692895, + "loss": 1.1441, + "step": 6214 + }, + { + "epoch": 0.59, + "grad_norm": 0.24322263909794603, + "learning_rate": 0.00016837787007718008, + "loss": 0.9605, + "step": 6215 + }, + { + "epoch": 0.59, + "grad_norm": 0.2706035311294752, + "learning_rate": 0.00016836632590623208, + "loss": 1.0975, + "step": 6216 + }, + { + "epoch": 0.59, + "grad_norm": 0.32174695455051877, + "learning_rate": 0.0001683547800243738, + "loss": 1.0976, + "step": 6217 + }, + { + "epoch": 0.59, + "grad_norm": 0.2886586498798772, + "learning_rate": 0.00016834323243189415, + "loss": 1.1534, + "step": 6218 + }, + { + "epoch": 0.59, + "grad_norm": 0.2855976937236902, + "learning_rate": 0.00016833168312908222, + "loss": 1.1865, + "step": 6219 + }, + { + "epoch": 0.6, + "grad_norm": 0.2667954790754751, + "learning_rate": 0.00016832013211622694, + "loss": 1.2281, + "step": 6220 + }, + { + "epoch": 0.6, + "grad_norm": 0.24933114672059709, + "learning_rate": 0.00016830857939361738, + "loss": 0.9391, + "step": 6221 + }, + { + "epoch": 0.6, + "grad_norm": 0.27072321221050405, + "learning_rate": 0.0001682970249615427, + "loss": 1.0534, + "step": 6222 + }, + { + "epoch": 0.6, + "grad_norm": 0.28347062120739175, + "learning_rate": 0.0001682854688202921, + "loss": 1.0556, + "step": 6223 + }, + { + "epoch": 0.6, + "grad_norm": 0.31128711878834986, + "learning_rate": 0.00016827391097015473, + "loss": 1.0196, + "step": 6224 + }, + { + "epoch": 0.6, + "grad_norm": 0.29466538688649235, + "learning_rate": 0.00016826235141141976, + "loss": 1.1365, + "step": 6225 + }, + { + "epoch": 0.6, + "grad_norm": 0.25144131922524754, + "learning_rate": 0.0001682507901443766, + "loss": 1.01, + "step": 6226 + }, + { + "epoch": 0.6, + "grad_norm": 0.2613185058399168, + "learning_rate": 0.00016823922716931451, + "loss": 1.0745, + "step": 6227 + }, + { + "epoch": 0.6, + "grad_norm": 0.27346829355051, + "learning_rate": 0.00016822766248652288, + "loss": 1.0676, + "step": 6228 + }, + { + "epoch": 0.6, + "grad_norm": 0.28753894574718736, + "learning_rate": 0.0001682160960962911, + "loss": 1.1616, + "step": 6229 + }, + { + "epoch": 0.6, + "grad_norm": 0.27774064585558034, + "learning_rate": 0.00016820452799890865, + "loss": 1.0945, + "step": 6230 + }, + { + "epoch": 0.6, + "grad_norm": 0.29844620149160067, + "learning_rate": 0.00016819295819466503, + "loss": 1.1999, + "step": 6231 + }, + { + "epoch": 0.6, + "grad_norm": 0.23278252939346147, + "learning_rate": 0.00016818138668384976, + "loss": 1.1572, + "step": 6232 + }, + { + "epoch": 0.6, + "grad_norm": 0.2910036401832354, + "learning_rate": 0.00016816981346675242, + "loss": 1.0418, + "step": 6233 + }, + { + "epoch": 0.6, + "grad_norm": 0.2679309567861235, + "learning_rate": 0.00016815823854366267, + "loss": 1.1591, + "step": 6234 + }, + { + "epoch": 0.6, + "grad_norm": 0.28959484717417816, + "learning_rate": 0.0001681466619148702, + "loss": 1.0724, + "step": 6235 + }, + { + "epoch": 0.6, + "grad_norm": 0.24969815430510078, + "learning_rate": 0.00016813508358066466, + "loss": 1.0834, + "step": 6236 + }, + { + "epoch": 0.6, + "grad_norm": 0.2817944152182526, + "learning_rate": 0.00016812350354133583, + "loss": 1.0752, + "step": 6237 + }, + { + "epoch": 0.6, + "grad_norm": 0.3095644924541878, + "learning_rate": 0.00016811192179717353, + "loss": 1.0517, + "step": 6238 + }, + { + "epoch": 0.6, + "grad_norm": 0.2826064816728548, + "learning_rate": 0.00016810033834846754, + "loss": 0.9918, + "step": 6239 + }, + { + "epoch": 0.6, + "grad_norm": 0.2670350115993338, + "learning_rate": 0.0001680887531955078, + "loss": 1.0787, + "step": 6240 + }, + { + "epoch": 0.6, + "grad_norm": 0.2786656526566983, + "learning_rate": 0.00016807716633858425, + "loss": 1.1872, + "step": 6241 + }, + { + "epoch": 0.6, + "grad_norm": 0.24633827043340034, + "learning_rate": 0.00016806557777798676, + "loss": 1.1754, + "step": 6242 + }, + { + "epoch": 0.6, + "grad_norm": 0.2510571689731006, + "learning_rate": 0.00016805398751400548, + "loss": 1.0483, + "step": 6243 + }, + { + "epoch": 0.6, + "grad_norm": 0.25726341692210536, + "learning_rate": 0.00016804239554693036, + "loss": 1.0626, + "step": 6244 + }, + { + "epoch": 0.6, + "grad_norm": 0.2973987113917484, + "learning_rate": 0.00016803080187705152, + "loss": 1.1194, + "step": 6245 + }, + { + "epoch": 0.6, + "grad_norm": 0.24985679408993092, + "learning_rate": 0.00016801920650465912, + "loss": 0.9716, + "step": 6246 + }, + { + "epoch": 0.6, + "grad_norm": 0.2373094805089608, + "learning_rate": 0.00016800760943004334, + "loss": 0.9422, + "step": 6247 + }, + { + "epoch": 0.6, + "grad_norm": 0.30043718466849206, + "learning_rate": 0.0001679960106534944, + "loss": 1.0484, + "step": 6248 + }, + { + "epoch": 0.6, + "grad_norm": 0.286122704125855, + "learning_rate": 0.0001679844101753025, + "loss": 1.1008, + "step": 6249 + }, + { + "epoch": 0.6, + "grad_norm": 0.262959271095953, + "learning_rate": 0.00016797280799575804, + "loss": 1.1192, + "step": 6250 + }, + { + "epoch": 0.6, + "grad_norm": 0.2715165644061917, + "learning_rate": 0.00016796120411515138, + "loss": 1.0921, + "step": 6251 + }, + { + "epoch": 0.6, + "grad_norm": 0.28638836988275007, + "learning_rate": 0.00016794959853377284, + "loss": 1.1773, + "step": 6252 + }, + { + "epoch": 0.6, + "grad_norm": 0.2776266911958128, + "learning_rate": 0.00016793799125191288, + "loss": 1.0659, + "step": 6253 + }, + { + "epoch": 0.6, + "grad_norm": 0.28766018953162986, + "learning_rate": 0.00016792638226986202, + "loss": 1.0234, + "step": 6254 + }, + { + "epoch": 0.6, + "grad_norm": 0.24147276451994776, + "learning_rate": 0.00016791477158791077, + "loss": 1.0404, + "step": 6255 + }, + { + "epoch": 0.6, + "grad_norm": 0.28431529367278485, + "learning_rate": 0.00016790315920634964, + "loss": 1.1169, + "step": 6256 + }, + { + "epoch": 0.6, + "grad_norm": 0.297855580080018, + "learning_rate": 0.00016789154512546927, + "loss": 1.1081, + "step": 6257 + }, + { + "epoch": 0.6, + "grad_norm": 0.26389657019849716, + "learning_rate": 0.00016787992934556032, + "loss": 1.0314, + "step": 6258 + }, + { + "epoch": 0.6, + "grad_norm": 0.2928163841306302, + "learning_rate": 0.0001678683118669135, + "loss": 1.2123, + "step": 6259 + }, + { + "epoch": 0.6, + "grad_norm": 0.2975129278676374, + "learning_rate": 0.00016785669268981949, + "loss": 1.1446, + "step": 6260 + }, + { + "epoch": 0.6, + "grad_norm": 0.2853505641305865, + "learning_rate": 0.00016784507181456912, + "loss": 0.9765, + "step": 6261 + }, + { + "epoch": 0.6, + "grad_norm": 0.29085919322220416, + "learning_rate": 0.0001678334492414532, + "loss": 1.0566, + "step": 6262 + }, + { + "epoch": 0.6, + "grad_norm": 0.2951008224142174, + "learning_rate": 0.00016782182497076257, + "loss": 1.0795, + "step": 6263 + }, + { + "epoch": 0.6, + "grad_norm": 0.2866385453022363, + "learning_rate": 0.00016781019900278813, + "loss": 1.043, + "step": 6264 + }, + { + "epoch": 0.6, + "grad_norm": 0.28440677949059723, + "learning_rate": 0.00016779857133782087, + "loss": 1.0659, + "step": 6265 + }, + { + "epoch": 0.6, + "grad_norm": 0.2513177187771401, + "learning_rate": 0.0001677869419761517, + "loss": 1.0256, + "step": 6266 + }, + { + "epoch": 0.6, + "grad_norm": 0.28595770700712264, + "learning_rate": 0.00016777531091807175, + "loss": 1.1266, + "step": 6267 + }, + { + "epoch": 0.6, + "grad_norm": 0.2514182299644808, + "learning_rate": 0.00016776367816387206, + "loss": 1.118, + "step": 6268 + }, + { + "epoch": 0.6, + "grad_norm": 0.2766744455914722, + "learning_rate": 0.00016775204371384373, + "loss": 1.0138, + "step": 6269 + }, + { + "epoch": 0.6, + "grad_norm": 0.30206625962707667, + "learning_rate": 0.0001677404075682779, + "loss": 1.194, + "step": 6270 + }, + { + "epoch": 0.6, + "grad_norm": 0.2631889376384602, + "learning_rate": 0.0001677287697274658, + "loss": 0.9856, + "step": 6271 + }, + { + "epoch": 0.6, + "grad_norm": 0.25406693794632235, + "learning_rate": 0.0001677171301916987, + "loss": 1.0063, + "step": 6272 + }, + { + "epoch": 0.6, + "grad_norm": 0.28950694169287894, + "learning_rate": 0.00016770548896126783, + "loss": 1.0609, + "step": 6273 + }, + { + "epoch": 0.6, + "grad_norm": 0.2761692174626653, + "learning_rate": 0.00016769384603646455, + "loss": 1.0876, + "step": 6274 + }, + { + "epoch": 0.6, + "grad_norm": 0.27628317931169183, + "learning_rate": 0.00016768220141758023, + "loss": 1.0243, + "step": 6275 + }, + { + "epoch": 0.6, + "grad_norm": 0.2660866756756718, + "learning_rate": 0.0001676705551049063, + "loss": 1.0615, + "step": 6276 + }, + { + "epoch": 0.6, + "grad_norm": 0.24048261615675012, + "learning_rate": 0.0001676589070987342, + "loss": 1.152, + "step": 6277 + }, + { + "epoch": 0.6, + "grad_norm": 0.30207736888758835, + "learning_rate": 0.0001676472573993554, + "loss": 1.1271, + "step": 6278 + }, + { + "epoch": 0.6, + "grad_norm": 0.28419649601292135, + "learning_rate": 0.0001676356060070615, + "loss": 1.0366, + "step": 6279 + }, + { + "epoch": 0.6, + "grad_norm": 0.25939328583829085, + "learning_rate": 0.00016762395292214404, + "loss": 1.1239, + "step": 6280 + }, + { + "epoch": 0.6, + "grad_norm": 0.24732667648488688, + "learning_rate": 0.00016761229814489466, + "loss": 1.0374, + "step": 6281 + }, + { + "epoch": 0.6, + "grad_norm": 0.28033363094981284, + "learning_rate": 0.00016760064167560502, + "loss": 1.1033, + "step": 6282 + }, + { + "epoch": 0.6, + "grad_norm": 0.2837131506670996, + "learning_rate": 0.00016758898351456683, + "loss": 1.1314, + "step": 6283 + }, + { + "epoch": 0.6, + "grad_norm": 0.3341961682280646, + "learning_rate": 0.00016757732366207186, + "loss": 1.1564, + "step": 6284 + }, + { + "epoch": 0.6, + "grad_norm": 0.2789710186742297, + "learning_rate": 0.00016756566211841188, + "loss": 1.0238, + "step": 6285 + }, + { + "epoch": 0.6, + "grad_norm": 0.3018369450049017, + "learning_rate": 0.00016755399888387874, + "loss": 1.0962, + "step": 6286 + }, + { + "epoch": 0.6, + "grad_norm": 0.265390590494802, + "learning_rate": 0.00016754233395876439, + "loss": 1.0181, + "step": 6287 + }, + { + "epoch": 0.6, + "grad_norm": 0.2849229079192597, + "learning_rate": 0.0001675306673433606, + "loss": 1.0362, + "step": 6288 + }, + { + "epoch": 0.6, + "grad_norm": 0.279551424881973, + "learning_rate": 0.00016751899903795947, + "loss": 1.0863, + "step": 6289 + }, + { + "epoch": 0.6, + "grad_norm": 0.30798802423123717, + "learning_rate": 0.00016750732904285292, + "loss": 1.0984, + "step": 6290 + }, + { + "epoch": 0.6, + "grad_norm": 0.28386059483142134, + "learning_rate": 0.00016749565735833306, + "loss": 1.0513, + "step": 6291 + }, + { + "epoch": 0.6, + "grad_norm": 0.2971213923065991, + "learning_rate": 0.0001674839839846919, + "loss": 1.0989, + "step": 6292 + }, + { + "epoch": 0.6, + "grad_norm": 0.279028774197727, + "learning_rate": 0.0001674723089222217, + "loss": 0.9628, + "step": 6293 + }, + { + "epoch": 0.6, + "grad_norm": 0.3005236299374139, + "learning_rate": 0.00016746063217121452, + "loss": 0.9983, + "step": 6294 + }, + { + "epoch": 0.6, + "grad_norm": 0.2556549845006676, + "learning_rate": 0.00016744895373196265, + "loss": 1.0838, + "step": 6295 + }, + { + "epoch": 0.6, + "grad_norm": 0.2754893169400004, + "learning_rate": 0.00016743727360475833, + "loss": 1.0528, + "step": 6296 + }, + { + "epoch": 0.6, + "grad_norm": 0.2525035036290481, + "learning_rate": 0.00016742559178989383, + "loss": 1.1438, + "step": 6297 + }, + { + "epoch": 0.6, + "grad_norm": 0.2994082116871912, + "learning_rate": 0.00016741390828766152, + "loss": 1.0641, + "step": 6298 + }, + { + "epoch": 0.6, + "grad_norm": 0.2915966725527121, + "learning_rate": 0.00016740222309835382, + "loss": 1.1256, + "step": 6299 + }, + { + "epoch": 0.6, + "grad_norm": 0.2976305302047498, + "learning_rate": 0.00016739053622226305, + "loss": 1.0256, + "step": 6300 + }, + { + "epoch": 0.6, + "grad_norm": 0.30003197401374204, + "learning_rate": 0.00016737884765968185, + "loss": 1.1071, + "step": 6301 + }, + { + "epoch": 0.6, + "grad_norm": 0.25781935696696723, + "learning_rate": 0.0001673671574109026, + "loss": 1.0172, + "step": 6302 + }, + { + "epoch": 0.6, + "grad_norm": 0.2854845428390189, + "learning_rate": 0.00016735546547621787, + "loss": 1.1589, + "step": 6303 + }, + { + "epoch": 0.6, + "grad_norm": 0.30166239723579663, + "learning_rate": 0.00016734377185592032, + "loss": 1.1384, + "step": 6304 + }, + { + "epoch": 0.6, + "grad_norm": 0.28301402595792946, + "learning_rate": 0.00016733207655030254, + "loss": 1.0882, + "step": 6305 + }, + { + "epoch": 0.6, + "grad_norm": 0.25169386672382876, + "learning_rate": 0.00016732037955965724, + "loss": 1.1254, + "step": 6306 + }, + { + "epoch": 0.6, + "grad_norm": 0.28028422765627997, + "learning_rate": 0.00016730868088427712, + "loss": 1.0061, + "step": 6307 + }, + { + "epoch": 0.6, + "grad_norm": 0.29537665905438676, + "learning_rate": 0.00016729698052445497, + "loss": 1.0539, + "step": 6308 + }, + { + "epoch": 0.6, + "grad_norm": 0.28211755903503166, + "learning_rate": 0.0001672852784804836, + "loss": 1.0818, + "step": 6309 + }, + { + "epoch": 0.6, + "grad_norm": 0.2685003161595143, + "learning_rate": 0.00016727357475265582, + "loss": 0.9727, + "step": 6310 + }, + { + "epoch": 0.6, + "grad_norm": 0.271237791328004, + "learning_rate": 0.00016726186934126457, + "loss": 1.0693, + "step": 6311 + }, + { + "epoch": 0.6, + "grad_norm": 0.29234966306787097, + "learning_rate": 0.00016725016224660274, + "loss": 1.0985, + "step": 6312 + }, + { + "epoch": 0.6, + "grad_norm": 0.2415128764029434, + "learning_rate": 0.00016723845346896336, + "loss": 1.0949, + "step": 6313 + }, + { + "epoch": 0.6, + "grad_norm": 0.28487373231440377, + "learning_rate": 0.00016722674300863942, + "loss": 1.0031, + "step": 6314 + }, + { + "epoch": 0.6, + "grad_norm": 0.24380319070580195, + "learning_rate": 0.00016721503086592398, + "loss": 0.9791, + "step": 6315 + }, + { + "epoch": 0.6, + "grad_norm": 0.2854973044582919, + "learning_rate": 0.00016720331704111015, + "loss": 1.0771, + "step": 6316 + }, + { + "epoch": 0.6, + "grad_norm": 0.2926702434021319, + "learning_rate": 0.0001671916015344911, + "loss": 1.0796, + "step": 6317 + }, + { + "epoch": 0.6, + "grad_norm": 0.2625594361259241, + "learning_rate": 0.00016717988434636, + "loss": 1.0848, + "step": 6318 + }, + { + "epoch": 0.6, + "grad_norm": 0.272123940034111, + "learning_rate": 0.00016716816547701003, + "loss": 1.0751, + "step": 6319 + }, + { + "epoch": 0.6, + "grad_norm": 0.25026070785323234, + "learning_rate": 0.00016715644492673452, + "loss": 1.0925, + "step": 6320 + }, + { + "epoch": 0.6, + "grad_norm": 0.28111670498694746, + "learning_rate": 0.00016714472269582678, + "loss": 1.1395, + "step": 6321 + }, + { + "epoch": 0.6, + "grad_norm": 0.30210515943962146, + "learning_rate": 0.00016713299878458012, + "loss": 1.117, + "step": 6322 + }, + { + "epoch": 0.6, + "grad_norm": 0.2717616248041951, + "learning_rate": 0.00016712127319328803, + "loss": 1.0637, + "step": 6323 + }, + { + "epoch": 0.61, + "grad_norm": 0.24801900424048062, + "learning_rate": 0.00016710954592224386, + "loss": 1.1057, + "step": 6324 + }, + { + "epoch": 0.61, + "grad_norm": 0.2898349287109403, + "learning_rate": 0.00016709781697174113, + "loss": 1.0904, + "step": 6325 + }, + { + "epoch": 0.61, + "grad_norm": 0.2615926237179176, + "learning_rate": 0.00016708608634207338, + "loss": 1.013, + "step": 6326 + }, + { + "epoch": 0.61, + "grad_norm": 0.25404922506608696, + "learning_rate": 0.00016707435403353412, + "loss": 1.0322, + "step": 6327 + }, + { + "epoch": 0.61, + "grad_norm": 0.2993573106392043, + "learning_rate": 0.000167062620046417, + "loss": 1.1574, + "step": 6328 + }, + { + "epoch": 0.61, + "grad_norm": 0.27922342474327827, + "learning_rate": 0.0001670508843810157, + "loss": 1.0711, + "step": 6329 + }, + { + "epoch": 0.61, + "grad_norm": 0.3011854686091883, + "learning_rate": 0.00016703914703762387, + "loss": 1.054, + "step": 6330 + }, + { + "epoch": 0.61, + "grad_norm": 0.25394776998572083, + "learning_rate": 0.00016702740801653523, + "loss": 1.0363, + "step": 6331 + }, + { + "epoch": 0.61, + "grad_norm": 0.32790248402799294, + "learning_rate": 0.00016701566731804358, + "loss": 1.1621, + "step": 6332 + }, + { + "epoch": 0.61, + "grad_norm": 0.2825346749177906, + "learning_rate": 0.00016700392494244277, + "loss": 0.9924, + "step": 6333 + }, + { + "epoch": 0.61, + "grad_norm": 0.2615222834120971, + "learning_rate": 0.0001669921808900266, + "loss": 1.1129, + "step": 6334 + }, + { + "epoch": 0.61, + "grad_norm": 0.2685877115229659, + "learning_rate": 0.000166980435161089, + "loss": 1.0624, + "step": 6335 + }, + { + "epoch": 0.61, + "grad_norm": 0.2782305195102996, + "learning_rate": 0.00016696868775592394, + "loss": 1.1125, + "step": 6336 + }, + { + "epoch": 0.61, + "grad_norm": 0.26429721749726304, + "learning_rate": 0.00016695693867482535, + "loss": 1.0898, + "step": 6337 + }, + { + "epoch": 0.61, + "grad_norm": 0.3006719307055734, + "learning_rate": 0.0001669451879180873, + "loss": 1.1202, + "step": 6338 + }, + { + "epoch": 0.61, + "grad_norm": 0.2546442290098055, + "learning_rate": 0.00016693343548600386, + "loss": 1.0205, + "step": 6339 + }, + { + "epoch": 0.61, + "grad_norm": 0.3163145075653169, + "learning_rate": 0.00016692168137886912, + "loss": 1.0082, + "step": 6340 + }, + { + "epoch": 0.61, + "grad_norm": 0.26767591471900337, + "learning_rate": 0.00016690992559697726, + "loss": 1.0872, + "step": 6341 + }, + { + "epoch": 0.61, + "grad_norm": 0.3092574347023085, + "learning_rate": 0.00016689816814062245, + "loss": 1.0457, + "step": 6342 + }, + { + "epoch": 0.61, + "grad_norm": 0.2637784366965473, + "learning_rate": 0.00016688640901009894, + "loss": 1.0477, + "step": 6343 + }, + { + "epoch": 0.61, + "grad_norm": 0.30076256747451274, + "learning_rate": 0.000166874648205701, + "loss": 1.1079, + "step": 6344 + }, + { + "epoch": 0.61, + "grad_norm": 0.2923545840530909, + "learning_rate": 0.00016686288572772295, + "loss": 1.2222, + "step": 6345 + }, + { + "epoch": 0.61, + "grad_norm": 0.3153084026273102, + "learning_rate": 0.00016685112157645916, + "loss": 0.9666, + "step": 6346 + }, + { + "epoch": 0.61, + "grad_norm": 0.2963713357046184, + "learning_rate": 0.00016683935575220407, + "loss": 1.0839, + "step": 6347 + }, + { + "epoch": 0.61, + "grad_norm": 0.2737775027067931, + "learning_rate": 0.00016682758825525208, + "loss": 1.1275, + "step": 6348 + }, + { + "epoch": 0.61, + "grad_norm": 0.2800942528032241, + "learning_rate": 0.00016681581908589772, + "loss": 0.9692, + "step": 6349 + }, + { + "epoch": 0.61, + "grad_norm": 0.3109943800000079, + "learning_rate": 0.00016680404824443546, + "loss": 1.0421, + "step": 6350 + }, + { + "epoch": 0.61, + "grad_norm": 0.27462352106623983, + "learning_rate": 0.0001667922757311599, + "loss": 1.1845, + "step": 6351 + }, + { + "epoch": 0.61, + "grad_norm": 0.32025726456616344, + "learning_rate": 0.0001667805015463657, + "loss": 1.0136, + "step": 6352 + }, + { + "epoch": 0.61, + "grad_norm": 0.301660874521527, + "learning_rate": 0.00016676872569034744, + "loss": 1.1865, + "step": 6353 + }, + { + "epoch": 0.61, + "grad_norm": 0.29842518248408173, + "learning_rate": 0.00016675694816339987, + "loss": 1.2503, + "step": 6354 + }, + { + "epoch": 0.61, + "grad_norm": 0.26028217660743824, + "learning_rate": 0.00016674516896581773, + "loss": 0.9257, + "step": 6355 + }, + { + "epoch": 0.61, + "grad_norm": 0.290277459931021, + "learning_rate": 0.00016673338809789577, + "loss": 1.0985, + "step": 6356 + }, + { + "epoch": 0.61, + "grad_norm": 0.27210815585379544, + "learning_rate": 0.00016672160555992885, + "loss": 1.1111, + "step": 6357 + }, + { + "epoch": 0.61, + "grad_norm": 0.27064221588352777, + "learning_rate": 0.0001667098213522118, + "loss": 0.9628, + "step": 6358 + }, + { + "epoch": 0.61, + "grad_norm": 0.31149953692608723, + "learning_rate": 0.00016669803547503958, + "loss": 1.0404, + "step": 6359 + }, + { + "epoch": 0.61, + "grad_norm": 0.2859751349658579, + "learning_rate": 0.0001666862479287071, + "loss": 0.9679, + "step": 6360 + }, + { + "epoch": 0.61, + "grad_norm": 0.29734676588988057, + "learning_rate": 0.0001666744587135093, + "loss": 1.1466, + "step": 6361 + }, + { + "epoch": 0.61, + "grad_norm": 0.2695100397906395, + "learning_rate": 0.00016666266782974133, + "loss": 1.0106, + "step": 6362 + }, + { + "epoch": 0.61, + "grad_norm": 0.33059150738359744, + "learning_rate": 0.00016665087527769815, + "loss": 1.1697, + "step": 6363 + }, + { + "epoch": 0.61, + "grad_norm": 0.2783320081837106, + "learning_rate": 0.00016663908105767495, + "loss": 1.0822, + "step": 6364 + }, + { + "epoch": 0.61, + "grad_norm": 0.31565170424334077, + "learning_rate": 0.00016662728516996688, + "loss": 1.1806, + "step": 6365 + }, + { + "epoch": 0.61, + "grad_norm": 0.2709155225707643, + "learning_rate": 0.0001666154876148691, + "loss": 1.0718, + "step": 6366 + }, + { + "epoch": 0.61, + "grad_norm": 0.2743956028640576, + "learning_rate": 0.00016660368839267693, + "loss": 1.1101, + "step": 6367 + }, + { + "epoch": 0.61, + "grad_norm": 0.3083906678441462, + "learning_rate": 0.00016659188750368554, + "loss": 1.1813, + "step": 6368 + }, + { + "epoch": 0.61, + "grad_norm": 0.27005425077077816, + "learning_rate": 0.00016658008494819032, + "loss": 1.01, + "step": 6369 + }, + { + "epoch": 0.61, + "grad_norm": 0.3190770015107477, + "learning_rate": 0.00016656828072648665, + "loss": 1.0918, + "step": 6370 + }, + { + "epoch": 0.61, + "grad_norm": 0.2786920758144284, + "learning_rate": 0.00016655647483886988, + "loss": 0.9536, + "step": 6371 + }, + { + "epoch": 0.61, + "grad_norm": 0.28799801927513924, + "learning_rate": 0.00016654466728563557, + "loss": 1.1653, + "step": 6372 + }, + { + "epoch": 0.61, + "grad_norm": 0.2740543474851488, + "learning_rate": 0.00016653285806707908, + "loss": 1.0493, + "step": 6373 + }, + { + "epoch": 0.61, + "grad_norm": 0.283012370411115, + "learning_rate": 0.000166521047183496, + "loss": 1.0719, + "step": 6374 + }, + { + "epoch": 0.61, + "grad_norm": 0.2866968650229277, + "learning_rate": 0.00016650923463518196, + "loss": 0.9907, + "step": 6375 + }, + { + "epoch": 0.61, + "grad_norm": 0.2802944367538119, + "learning_rate": 0.00016649742042243248, + "loss": 1.0706, + "step": 6376 + }, + { + "epoch": 0.61, + "grad_norm": 0.27154248970374756, + "learning_rate": 0.00016648560454554328, + "loss": 1.1223, + "step": 6377 + }, + { + "epoch": 0.61, + "grad_norm": 0.2838775500172997, + "learning_rate": 0.00016647378700481005, + "loss": 1.0581, + "step": 6378 + }, + { + "epoch": 0.61, + "grad_norm": 0.2920529073936615, + "learning_rate": 0.00016646196780052848, + "loss": 1.0481, + "step": 6379 + }, + { + "epoch": 0.61, + "grad_norm": 0.2670726827629129, + "learning_rate": 0.00016645014693299442, + "loss": 1.0974, + "step": 6380 + }, + { + "epoch": 0.61, + "grad_norm": 0.24730959097979163, + "learning_rate": 0.00016643832440250367, + "loss": 1.1209, + "step": 6381 + }, + { + "epoch": 0.61, + "grad_norm": 0.2947114263680111, + "learning_rate": 0.00016642650020935214, + "loss": 0.9936, + "step": 6382 + }, + { + "epoch": 0.61, + "grad_norm": 0.3233060078932617, + "learning_rate": 0.00016641467435383564, + "loss": 1.1597, + "step": 6383 + }, + { + "epoch": 0.61, + "grad_norm": 0.26002181675498415, + "learning_rate": 0.00016640284683625017, + "loss": 0.9699, + "step": 6384 + }, + { + "epoch": 0.61, + "grad_norm": 0.26273440181552443, + "learning_rate": 0.0001663910176568918, + "loss": 1.1475, + "step": 6385 + }, + { + "epoch": 0.61, + "grad_norm": 0.24674523967677028, + "learning_rate": 0.00016637918681605639, + "loss": 1.0134, + "step": 6386 + }, + { + "epoch": 0.61, + "grad_norm": 0.2618701917902359, + "learning_rate": 0.00016636735431404019, + "loss": 1.1473, + "step": 6387 + }, + { + "epoch": 0.61, + "grad_norm": 0.28468586915692684, + "learning_rate": 0.00016635552015113918, + "loss": 1.0526, + "step": 6388 + }, + { + "epoch": 0.61, + "grad_norm": 0.3028992395965799, + "learning_rate": 0.0001663436843276496, + "loss": 1.0536, + "step": 6389 + }, + { + "epoch": 0.61, + "grad_norm": 0.2789358949175681, + "learning_rate": 0.00016633184684386763, + "loss": 1.1303, + "step": 6390 + }, + { + "epoch": 0.61, + "grad_norm": 0.29126579487824283, + "learning_rate": 0.00016632000770008947, + "loss": 1.0763, + "step": 6391 + }, + { + "epoch": 0.61, + "grad_norm": 0.27542318323495574, + "learning_rate": 0.0001663081668966115, + "loss": 1.1802, + "step": 6392 + }, + { + "epoch": 0.61, + "grad_norm": 0.30182241732147397, + "learning_rate": 0.00016629632443372993, + "loss": 1.054, + "step": 6393 + }, + { + "epoch": 0.61, + "grad_norm": 0.2819317111614903, + "learning_rate": 0.0001662844803117412, + "loss": 1.0247, + "step": 6394 + }, + { + "epoch": 0.61, + "grad_norm": 0.2813661907175226, + "learning_rate": 0.00016627263453094168, + "loss": 1.153, + "step": 6395 + }, + { + "epoch": 0.61, + "grad_norm": 0.3156855165232155, + "learning_rate": 0.00016626078709162782, + "loss": 1.1216, + "step": 6396 + }, + { + "epoch": 0.61, + "grad_norm": 0.2762036811217373, + "learning_rate": 0.00016624893799409613, + "loss": 1.0578, + "step": 6397 + }, + { + "epoch": 0.61, + "grad_norm": 0.25813174874980066, + "learning_rate": 0.00016623708723864314, + "loss": 1.0028, + "step": 6398 + }, + { + "epoch": 0.61, + "grad_norm": 0.2547111721851036, + "learning_rate": 0.0001662252348255654, + "loss": 1.0322, + "step": 6399 + }, + { + "epoch": 0.61, + "grad_norm": 0.2670666010121307, + "learning_rate": 0.00016621338075515954, + "loss": 1.0951, + "step": 6400 + }, + { + "epoch": 0.61, + "grad_norm": 0.25941920255451006, + "learning_rate": 0.00016620152502772224, + "loss": 1.009, + "step": 6401 + }, + { + "epoch": 0.61, + "grad_norm": 0.32953249149853675, + "learning_rate": 0.00016618966764355016, + "loss": 1.1824, + "step": 6402 + }, + { + "epoch": 0.61, + "grad_norm": 0.3400097911397109, + "learning_rate": 0.00016617780860294002, + "loss": 1.0785, + "step": 6403 + }, + { + "epoch": 0.61, + "grad_norm": 0.2606374235637558, + "learning_rate": 0.00016616594790618865, + "loss": 1.0066, + "step": 6404 + }, + { + "epoch": 0.61, + "grad_norm": 0.28689829208173906, + "learning_rate": 0.00016615408555359284, + "loss": 1.1033, + "step": 6405 + }, + { + "epoch": 0.61, + "grad_norm": 0.275071242493895, + "learning_rate": 0.00016614222154544948, + "loss": 1.0145, + "step": 6406 + }, + { + "epoch": 0.61, + "grad_norm": 0.3036143858185991, + "learning_rate": 0.00016613035588205542, + "loss": 0.976, + "step": 6407 + }, + { + "epoch": 0.61, + "grad_norm": 0.2739250339275323, + "learning_rate": 0.00016611848856370768, + "loss": 0.9842, + "step": 6408 + }, + { + "epoch": 0.61, + "grad_norm": 0.269718907859172, + "learning_rate": 0.0001661066195907032, + "loss": 1.0573, + "step": 6409 + }, + { + "epoch": 0.61, + "grad_norm": 0.27815112294798416, + "learning_rate": 0.000166094748963339, + "loss": 1.1291, + "step": 6410 + }, + { + "epoch": 0.61, + "grad_norm": 0.2810699578192083, + "learning_rate": 0.0001660828766819122, + "loss": 0.9918, + "step": 6411 + }, + { + "epoch": 0.61, + "grad_norm": 0.25506946682445086, + "learning_rate": 0.00016607100274671982, + "loss": 1.1449, + "step": 6412 + }, + { + "epoch": 0.61, + "grad_norm": 0.2890524163750954, + "learning_rate": 0.00016605912715805915, + "loss": 1.124, + "step": 6413 + }, + { + "epoch": 0.61, + "grad_norm": 0.29021337271256664, + "learning_rate": 0.00016604724991622726, + "loss": 1.0194, + "step": 6414 + }, + { + "epoch": 0.61, + "grad_norm": 0.27539899144845825, + "learning_rate": 0.00016603537102152145, + "loss": 1.077, + "step": 6415 + }, + { + "epoch": 0.61, + "grad_norm": 0.29031381020521185, + "learning_rate": 0.00016602349047423895, + "loss": 1.068, + "step": 6416 + }, + { + "epoch": 0.61, + "grad_norm": 0.259955978828572, + "learning_rate": 0.00016601160827467713, + "loss": 0.9664, + "step": 6417 + }, + { + "epoch": 0.61, + "grad_norm": 0.29036829398239994, + "learning_rate": 0.00016599972442313333, + "loss": 1.0306, + "step": 6418 + }, + { + "epoch": 0.61, + "grad_norm": 0.2771649514160956, + "learning_rate": 0.00016598783891990496, + "loss": 1.0726, + "step": 6419 + }, + { + "epoch": 0.61, + "grad_norm": 0.2597924047332126, + "learning_rate": 0.00016597595176528942, + "loss": 1.0015, + "step": 6420 + }, + { + "epoch": 0.61, + "grad_norm": 0.291533841923736, + "learning_rate": 0.00016596406295958421, + "loss": 1.0385, + "step": 6421 + }, + { + "epoch": 0.61, + "grad_norm": 0.27196336627994283, + "learning_rate": 0.0001659521725030869, + "loss": 1.0915, + "step": 6422 + }, + { + "epoch": 0.61, + "grad_norm": 0.3091516771574349, + "learning_rate": 0.00016594028039609504, + "loss": 1.1498, + "step": 6423 + }, + { + "epoch": 0.61, + "grad_norm": 0.269385742381281, + "learning_rate": 0.00016592838663890617, + "loss": 1.1685, + "step": 6424 + }, + { + "epoch": 0.61, + "grad_norm": 0.31387068153422903, + "learning_rate": 0.00016591649123181803, + "loss": 1.0578, + "step": 6425 + }, + { + "epoch": 0.61, + "grad_norm": 0.2715541694948404, + "learning_rate": 0.00016590459417512824, + "loss": 1.0411, + "step": 6426 + }, + { + "epoch": 0.61, + "grad_norm": 0.31668645833456616, + "learning_rate": 0.00016589269546913457, + "loss": 1.045, + "step": 6427 + }, + { + "epoch": 0.61, + "grad_norm": 0.26537674929552535, + "learning_rate": 0.0001658807951141348, + "loss": 0.9741, + "step": 6428 + }, + { + "epoch": 0.62, + "grad_norm": 0.30617980217717927, + "learning_rate": 0.00016586889311042674, + "loss": 1.0323, + "step": 6429 + }, + { + "epoch": 0.62, + "grad_norm": 0.2893643055407271, + "learning_rate": 0.00016585698945830818, + "loss": 0.9841, + "step": 6430 + }, + { + "epoch": 0.62, + "grad_norm": 0.26428036894894746, + "learning_rate": 0.00016584508415807712, + "loss": 1.0233, + "step": 6431 + }, + { + "epoch": 0.62, + "grad_norm": 0.25987613015099514, + "learning_rate": 0.00016583317721003142, + "loss": 0.9113, + "step": 6432 + }, + { + "epoch": 0.62, + "grad_norm": 0.2676269631291612, + "learning_rate": 0.0001658212686144691, + "loss": 1.0322, + "step": 6433 + }, + { + "epoch": 0.62, + "grad_norm": 0.27792570272364814, + "learning_rate": 0.00016580935837168817, + "loss": 1.1305, + "step": 6434 + }, + { + "epoch": 0.62, + "grad_norm": 0.3054138054002042, + "learning_rate": 0.00016579744648198666, + "loss": 0.9909, + "step": 6435 + }, + { + "epoch": 0.62, + "grad_norm": 0.28755696638880873, + "learning_rate": 0.0001657855329456627, + "loss": 1.1987, + "step": 6436 + }, + { + "epoch": 0.62, + "grad_norm": 0.2937260358743029, + "learning_rate": 0.0001657736177630145, + "loss": 1.175, + "step": 6437 + }, + { + "epoch": 0.62, + "grad_norm": 0.25543914091501174, + "learning_rate": 0.00016576170093434008, + "loss": 0.9054, + "step": 6438 + }, + { + "epoch": 0.62, + "grad_norm": 0.2358502828421239, + "learning_rate": 0.00016574978245993783, + "loss": 1.1184, + "step": 6439 + }, + { + "epoch": 0.62, + "grad_norm": 0.32159479122906715, + "learning_rate": 0.00016573786234010593, + "loss": 1.0697, + "step": 6440 + }, + { + "epoch": 0.62, + "grad_norm": 0.29597420637835464, + "learning_rate": 0.0001657259405751427, + "loss": 1.0147, + "step": 6441 + }, + { + "epoch": 0.62, + "grad_norm": 0.27502750185112845, + "learning_rate": 0.0001657140171653465, + "loss": 1.1519, + "step": 6442 + }, + { + "epoch": 0.62, + "grad_norm": 0.28880378692907466, + "learning_rate": 0.00016570209211101578, + "loss": 1.0251, + "step": 6443 + }, + { + "epoch": 0.62, + "grad_norm": 0.29201502579906263, + "learning_rate": 0.00016569016541244884, + "loss": 1.1337, + "step": 6444 + }, + { + "epoch": 0.62, + "grad_norm": 0.2399540511362018, + "learning_rate": 0.00016567823706994426, + "loss": 1.0998, + "step": 6445 + }, + { + "epoch": 0.62, + "grad_norm": 0.3268134405014831, + "learning_rate": 0.00016566630708380052, + "loss": 0.9888, + "step": 6446 + }, + { + "epoch": 0.62, + "grad_norm": 0.3103957574422285, + "learning_rate": 0.00016565437545431618, + "loss": 1.1474, + "step": 6447 + }, + { + "epoch": 0.62, + "grad_norm": 0.2718259809863342, + "learning_rate": 0.0001656424421817898, + "loss": 0.9704, + "step": 6448 + }, + { + "epoch": 0.62, + "grad_norm": 0.27353989936463624, + "learning_rate": 0.00016563050726652007, + "loss": 1.0992, + "step": 6449 + }, + { + "epoch": 0.62, + "grad_norm": 0.2856249195760054, + "learning_rate": 0.00016561857070880565, + "loss": 1.1137, + "step": 6450 + }, + { + "epoch": 0.62, + "grad_norm": 0.273725036502166, + "learning_rate": 0.00016560663250894526, + "loss": 0.9277, + "step": 6451 + }, + { + "epoch": 0.62, + "grad_norm": 0.2763856735936746, + "learning_rate": 0.00016559469266723767, + "loss": 1.0204, + "step": 6452 + }, + { + "epoch": 0.62, + "grad_norm": 0.28085668794591295, + "learning_rate": 0.00016558275118398164, + "loss": 0.9921, + "step": 6453 + }, + { + "epoch": 0.62, + "grad_norm": 0.2735774703202355, + "learning_rate": 0.00016557080805947605, + "loss": 0.9692, + "step": 6454 + }, + { + "epoch": 0.62, + "grad_norm": 0.26616410789129574, + "learning_rate": 0.0001655588632940198, + "loss": 0.9541, + "step": 6455 + }, + { + "epoch": 0.62, + "grad_norm": 0.2890311486765243, + "learning_rate": 0.0001655469168879118, + "loss": 1.0578, + "step": 6456 + }, + { + "epoch": 0.62, + "grad_norm": 0.29609104767351785, + "learning_rate": 0.00016553496884145097, + "loss": 1.0883, + "step": 6457 + }, + { + "epoch": 0.62, + "grad_norm": 0.2608637836134015, + "learning_rate": 0.0001655230191549364, + "loss": 1.0304, + "step": 6458 + }, + { + "epoch": 0.62, + "grad_norm": 0.27959926336448077, + "learning_rate": 0.00016551106782866705, + "loss": 1.0061, + "step": 6459 + }, + { + "epoch": 0.62, + "grad_norm": 0.2771253997843709, + "learning_rate": 0.0001654991148629421, + "loss": 1.1194, + "step": 6460 + }, + { + "epoch": 0.62, + "grad_norm": 0.28449814950864316, + "learning_rate": 0.00016548716025806062, + "loss": 1.088, + "step": 6461 + }, + { + "epoch": 0.62, + "grad_norm": 0.2691654812600947, + "learning_rate": 0.0001654752040143218, + "loss": 1.0968, + "step": 6462 + }, + { + "epoch": 0.62, + "grad_norm": 0.31954023097003986, + "learning_rate": 0.00016546324613202483, + "loss": 0.9794, + "step": 6463 + }, + { + "epoch": 0.62, + "grad_norm": 0.2649303795932421, + "learning_rate": 0.000165451286611469, + "loss": 1.0693, + "step": 6464 + }, + { + "epoch": 0.62, + "grad_norm": 0.2630512812762983, + "learning_rate": 0.0001654393254529536, + "loss": 1.0332, + "step": 6465 + }, + { + "epoch": 0.62, + "grad_norm": 0.2938590102565347, + "learning_rate": 0.00016542736265677795, + "loss": 1.1377, + "step": 6466 + }, + { + "epoch": 0.62, + "grad_norm": 0.29423857954744015, + "learning_rate": 0.0001654153982232414, + "loss": 1.0742, + "step": 6467 + }, + { + "epoch": 0.62, + "grad_norm": 0.2770519185899091, + "learning_rate": 0.00016540343215264342, + "loss": 0.9763, + "step": 6468 + }, + { + "epoch": 0.62, + "grad_norm": 0.27149015096394086, + "learning_rate": 0.00016539146444528345, + "loss": 1.2458, + "step": 6469 + }, + { + "epoch": 0.62, + "grad_norm": 0.2723835734113916, + "learning_rate": 0.00016537949510146097, + "loss": 1.0588, + "step": 6470 + }, + { + "epoch": 0.62, + "grad_norm": 0.2860060696336331, + "learning_rate": 0.00016536752412147555, + "loss": 0.9471, + "step": 6471 + }, + { + "epoch": 0.62, + "grad_norm": 0.256259821327159, + "learning_rate": 0.0001653555515056268, + "loss": 1.0662, + "step": 6472 + }, + { + "epoch": 0.62, + "grad_norm": 0.26590917435909617, + "learning_rate": 0.00016534357725421422, + "loss": 0.9938, + "step": 6473 + }, + { + "epoch": 0.62, + "grad_norm": 0.30036253247255307, + "learning_rate": 0.0001653316013675376, + "loss": 0.9861, + "step": 6474 + }, + { + "epoch": 0.62, + "grad_norm": 0.24447569581162235, + "learning_rate": 0.00016531962384589655, + "loss": 1.0813, + "step": 6475 + }, + { + "epoch": 0.62, + "grad_norm": 0.31129076026873814, + "learning_rate": 0.0001653076446895909, + "loss": 1.0898, + "step": 6476 + }, + { + "epoch": 0.62, + "grad_norm": 0.2755133042122456, + "learning_rate": 0.00016529566389892039, + "loss": 1.0924, + "step": 6477 + }, + { + "epoch": 0.62, + "grad_norm": 0.24784520601298288, + "learning_rate": 0.00016528368147418485, + "loss": 1.2024, + "step": 6478 + }, + { + "epoch": 0.62, + "grad_norm": 0.2869637235166629, + "learning_rate": 0.00016527169741568416, + "loss": 1.0971, + "step": 6479 + }, + { + "epoch": 0.62, + "grad_norm": 0.25871213333736603, + "learning_rate": 0.00016525971172371822, + "loss": 0.993, + "step": 6480 + }, + { + "epoch": 0.62, + "grad_norm": 0.2604087934206153, + "learning_rate": 0.00016524772439858694, + "loss": 1.1067, + "step": 6481 + }, + { + "epoch": 0.62, + "grad_norm": 0.2816319137305283, + "learning_rate": 0.0001652357354405904, + "loss": 1.0467, + "step": 6482 + }, + { + "epoch": 0.62, + "grad_norm": 0.27702751178720125, + "learning_rate": 0.0001652237448500286, + "loss": 1.0449, + "step": 6483 + }, + { + "epoch": 0.62, + "grad_norm": 0.27403004840211326, + "learning_rate": 0.00016521175262720154, + "loss": 1.1382, + "step": 6484 + }, + { + "epoch": 0.62, + "grad_norm": 0.27012660723215115, + "learning_rate": 0.00016519975877240942, + "loss": 1.1069, + "step": 6485 + }, + { + "epoch": 0.62, + "grad_norm": 0.262756289576385, + "learning_rate": 0.00016518776328595234, + "loss": 1.0257, + "step": 6486 + }, + { + "epoch": 0.62, + "grad_norm": 0.2898435765096172, + "learning_rate": 0.0001651757661681305, + "loss": 1.2361, + "step": 6487 + }, + { + "epoch": 0.62, + "grad_norm": 0.2778059074062683, + "learning_rate": 0.0001651637674192442, + "loss": 1.116, + "step": 6488 + }, + { + "epoch": 0.62, + "grad_norm": 0.29747157960661386, + "learning_rate": 0.00016515176703959364, + "loss": 1.0449, + "step": 6489 + }, + { + "epoch": 0.62, + "grad_norm": 0.2758337700493432, + "learning_rate": 0.00016513976502947913, + "loss": 0.9708, + "step": 6490 + }, + { + "epoch": 0.62, + "grad_norm": 0.29558292743334763, + "learning_rate": 0.00016512776138920108, + "loss": 1.0372, + "step": 6491 + }, + { + "epoch": 0.62, + "grad_norm": 0.26433384453275566, + "learning_rate": 0.0001651157561190599, + "loss": 1.1413, + "step": 6492 + }, + { + "epoch": 0.62, + "grad_norm": 0.3164936697696587, + "learning_rate": 0.00016510374921935598, + "loss": 1.1305, + "step": 6493 + }, + { + "epoch": 0.62, + "grad_norm": 0.2680448222970725, + "learning_rate": 0.00016509174069038985, + "loss": 1.2003, + "step": 6494 + }, + { + "epoch": 0.62, + "grad_norm": 0.26701918502526756, + "learning_rate": 0.00016507973053246197, + "loss": 0.9809, + "step": 6495 + }, + { + "epoch": 0.62, + "grad_norm": 0.28698630137463055, + "learning_rate": 0.00016506771874587296, + "loss": 1.1306, + "step": 6496 + }, + { + "epoch": 0.62, + "grad_norm": 0.2861081221048606, + "learning_rate": 0.00016505570533092333, + "loss": 1.2112, + "step": 6497 + }, + { + "epoch": 0.62, + "grad_norm": 0.27462328764365, + "learning_rate": 0.00016504369028791382, + "loss": 1.0667, + "step": 6498 + }, + { + "epoch": 0.62, + "grad_norm": 0.2542616461897349, + "learning_rate": 0.0001650316736171451, + "loss": 1.1647, + "step": 6499 + }, + { + "epoch": 0.62, + "grad_norm": 0.268648077124173, + "learning_rate": 0.00016501965531891786, + "loss": 1.023, + "step": 6500 + }, + { + "epoch": 0.62, + "grad_norm": 0.30932903309172555, + "learning_rate": 0.0001650076353935329, + "loss": 1.0771, + "step": 6501 + }, + { + "epoch": 0.62, + "grad_norm": 0.2711465953776904, + "learning_rate": 0.000164995613841291, + "loss": 1.023, + "step": 6502 + }, + { + "epoch": 0.62, + "grad_norm": 0.2488690322666522, + "learning_rate": 0.000164983590662493, + "loss": 1.1685, + "step": 6503 + }, + { + "epoch": 0.62, + "grad_norm": 0.2447593866202204, + "learning_rate": 0.00016497156585743982, + "loss": 1.0553, + "step": 6504 + }, + { + "epoch": 0.62, + "grad_norm": 0.28996537913588744, + "learning_rate": 0.00016495953942643237, + "loss": 1.0453, + "step": 6505 + }, + { + "epoch": 0.62, + "grad_norm": 0.3100091579089353, + "learning_rate": 0.00016494751136977165, + "loss": 1.0363, + "step": 6506 + }, + { + "epoch": 0.62, + "grad_norm": 0.26438432513334514, + "learning_rate": 0.0001649354816877586, + "loss": 1.0239, + "step": 6507 + }, + { + "epoch": 0.62, + "grad_norm": 0.3136841961736258, + "learning_rate": 0.0001649234503806943, + "loss": 1.1726, + "step": 6508 + }, + { + "epoch": 0.62, + "grad_norm": 0.2793775958515165, + "learning_rate": 0.0001649114174488799, + "loss": 1.0953, + "step": 6509 + }, + { + "epoch": 0.62, + "grad_norm": 0.2991111217534248, + "learning_rate": 0.0001648993828926164, + "loss": 0.9635, + "step": 6510 + }, + { + "epoch": 0.62, + "grad_norm": 0.33123211260741653, + "learning_rate": 0.00016488734671220512, + "loss": 1.09, + "step": 6511 + }, + { + "epoch": 0.62, + "grad_norm": 0.3071684112575216, + "learning_rate": 0.0001648753089079472, + "loss": 1.112, + "step": 6512 + }, + { + "epoch": 0.62, + "grad_norm": 0.2919753257761195, + "learning_rate": 0.0001648632694801439, + "loss": 1.2142, + "step": 6513 + }, + { + "epoch": 0.62, + "grad_norm": 0.3047462341213951, + "learning_rate": 0.00016485122842909653, + "loss": 1.0747, + "step": 6514 + }, + { + "epoch": 0.62, + "grad_norm": 0.2561697275636995, + "learning_rate": 0.00016483918575510638, + "loss": 1.0719, + "step": 6515 + }, + { + "epoch": 0.62, + "grad_norm": 0.29441141071756, + "learning_rate": 0.00016482714145847488, + "loss": 1.1003, + "step": 6516 + }, + { + "epoch": 0.62, + "grad_norm": 0.2507495981732362, + "learning_rate": 0.0001648150955395034, + "loss": 1.0107, + "step": 6517 + }, + { + "epoch": 0.62, + "grad_norm": 0.2745896765591778, + "learning_rate": 0.00016480304799849343, + "loss": 1.0221, + "step": 6518 + }, + { + "epoch": 0.62, + "grad_norm": 0.2956074131649605, + "learning_rate": 0.00016479099883574648, + "loss": 1.0595, + "step": 6519 + }, + { + "epoch": 0.62, + "grad_norm": 0.2755124978473152, + "learning_rate": 0.00016477894805156404, + "loss": 0.9841, + "step": 6520 + }, + { + "epoch": 0.62, + "grad_norm": 0.2653332774638079, + "learning_rate": 0.00016476689564624773, + "loss": 1.1523, + "step": 6521 + }, + { + "epoch": 0.62, + "grad_norm": 0.28336035642857826, + "learning_rate": 0.00016475484162009913, + "loss": 1.0458, + "step": 6522 + }, + { + "epoch": 0.62, + "grad_norm": 0.27133324752026045, + "learning_rate": 0.00016474278597341995, + "loss": 1.0068, + "step": 6523 + }, + { + "epoch": 0.62, + "grad_norm": 0.254789334192467, + "learning_rate": 0.00016473072870651183, + "loss": 1.0911, + "step": 6524 + }, + { + "epoch": 0.62, + "grad_norm": 0.2645683646629639, + "learning_rate": 0.00016471866981967654, + "loss": 1.0735, + "step": 6525 + }, + { + "epoch": 0.62, + "grad_norm": 0.29514733454194775, + "learning_rate": 0.0001647066093132159, + "loss": 1.1258, + "step": 6526 + }, + { + "epoch": 0.62, + "grad_norm": 0.30773726775104837, + "learning_rate": 0.00016469454718743166, + "loss": 1.2085, + "step": 6527 + }, + { + "epoch": 0.62, + "grad_norm": 0.28753719268726, + "learning_rate": 0.00016468248344262575, + "loss": 1.1034, + "step": 6528 + }, + { + "epoch": 0.62, + "grad_norm": 0.28935088461393205, + "learning_rate": 0.00016467041807910002, + "loss": 0.9435, + "step": 6529 + }, + { + "epoch": 0.62, + "grad_norm": 0.2910181115331406, + "learning_rate": 0.00016465835109715643, + "loss": 0.8991, + "step": 6530 + }, + { + "epoch": 0.62, + "grad_norm": 0.26222425373411046, + "learning_rate": 0.00016464628249709699, + "loss": 1.1253, + "step": 6531 + }, + { + "epoch": 0.62, + "grad_norm": 0.24656232690052832, + "learning_rate": 0.00016463421227922367, + "loss": 0.979, + "step": 6532 + }, + { + "epoch": 0.63, + "grad_norm": 0.2997042750535666, + "learning_rate": 0.0001646221404438386, + "loss": 1.0947, + "step": 6533 + }, + { + "epoch": 0.63, + "grad_norm": 0.26285239590686826, + "learning_rate": 0.0001646100669912438, + "loss": 0.9987, + "step": 6534 + }, + { + "epoch": 0.63, + "grad_norm": 0.2810386117270015, + "learning_rate": 0.00016459799192174152, + "loss": 1.08, + "step": 6535 + }, + { + "epoch": 0.63, + "grad_norm": 0.2899490777892802, + "learning_rate": 0.0001645859152356339, + "loss": 1.0636, + "step": 6536 + }, + { + "epoch": 0.63, + "grad_norm": 0.32728289531293414, + "learning_rate": 0.00016457383693322314, + "loss": 1.1029, + "step": 6537 + }, + { + "epoch": 0.63, + "grad_norm": 0.2873444747610267, + "learning_rate": 0.0001645617570148115, + "loss": 1.1222, + "step": 6538 + }, + { + "epoch": 0.63, + "grad_norm": 0.26266152625659334, + "learning_rate": 0.00016454967548070135, + "loss": 1.1339, + "step": 6539 + }, + { + "epoch": 0.63, + "grad_norm": 0.2803672145152364, + "learning_rate": 0.00016453759233119503, + "loss": 1.094, + "step": 6540 + }, + { + "epoch": 0.63, + "grad_norm": 0.27084575822983264, + "learning_rate": 0.00016452550756659482, + "loss": 0.937, + "step": 6541 + }, + { + "epoch": 0.63, + "grad_norm": 0.2701911111109992, + "learning_rate": 0.00016451342118720328, + "loss": 1.029, + "step": 6542 + }, + { + "epoch": 0.63, + "grad_norm": 0.2716711794064477, + "learning_rate": 0.00016450133319332282, + "loss": 0.8668, + "step": 6543 + }, + { + "epoch": 0.63, + "grad_norm": 0.2794432093629184, + "learning_rate": 0.00016448924358525595, + "loss": 1.0109, + "step": 6544 + }, + { + "epoch": 0.63, + "grad_norm": 0.28134829606945316, + "learning_rate": 0.00016447715236330524, + "loss": 1.191, + "step": 6545 + }, + { + "epoch": 0.63, + "grad_norm": 0.22932974730919156, + "learning_rate": 0.0001644650595277733, + "loss": 1.0067, + "step": 6546 + }, + { + "epoch": 0.63, + "grad_norm": 0.2627137036762324, + "learning_rate": 0.00016445296507896267, + "loss": 1.0458, + "step": 6547 + }, + { + "epoch": 0.63, + "grad_norm": 0.31351571153357294, + "learning_rate": 0.00016444086901717614, + "loss": 1.0699, + "step": 6548 + }, + { + "epoch": 0.63, + "grad_norm": 0.3113263872681726, + "learning_rate": 0.00016442877134271633, + "loss": 1.1383, + "step": 6549 + }, + { + "epoch": 0.63, + "grad_norm": 0.24271597078080498, + "learning_rate": 0.00016441667205588603, + "loss": 0.9184, + "step": 6550 + }, + { + "epoch": 0.63, + "grad_norm": 0.27790009137065697, + "learning_rate": 0.00016440457115698802, + "loss": 1.2613, + "step": 6551 + }, + { + "epoch": 0.63, + "grad_norm": 0.27765509036577246, + "learning_rate": 0.0001643924686463252, + "loss": 1.1003, + "step": 6552 + }, + { + "epoch": 0.63, + "grad_norm": 0.29484287527906927, + "learning_rate": 0.00016438036452420032, + "loss": 1.0934, + "step": 6553 + }, + { + "epoch": 0.63, + "grad_norm": 0.29018553943805164, + "learning_rate": 0.00016436825879091635, + "loss": 1.0676, + "step": 6554 + }, + { + "epoch": 0.63, + "grad_norm": 0.28130026896984306, + "learning_rate": 0.00016435615144677629, + "loss": 1.1587, + "step": 6555 + }, + { + "epoch": 0.63, + "grad_norm": 0.2878221364125203, + "learning_rate": 0.00016434404249208306, + "loss": 1.0381, + "step": 6556 + }, + { + "epoch": 0.63, + "grad_norm": 0.2777652669244412, + "learning_rate": 0.00016433193192713974, + "loss": 1.0429, + "step": 6557 + }, + { + "epoch": 0.63, + "grad_norm": 0.26453569833758483, + "learning_rate": 0.00016431981975224938, + "loss": 1.1111, + "step": 6558 + }, + { + "epoch": 0.63, + "grad_norm": 0.28622435501200705, + "learning_rate": 0.00016430770596771512, + "loss": 1.0154, + "step": 6559 + }, + { + "epoch": 0.63, + "grad_norm": 0.2847988417285998, + "learning_rate": 0.00016429559057384011, + "loss": 0.9996, + "step": 6560 + }, + { + "epoch": 0.63, + "grad_norm": 0.26384397601020537, + "learning_rate": 0.00016428347357092755, + "loss": 0.9437, + "step": 6561 + }, + { + "epoch": 0.63, + "grad_norm": 0.2775557302168176, + "learning_rate": 0.00016427135495928062, + "loss": 1.1182, + "step": 6562 + }, + { + "epoch": 0.63, + "grad_norm": 0.30234120956824934, + "learning_rate": 0.00016425923473920267, + "loss": 1.0162, + "step": 6563 + }, + { + "epoch": 0.63, + "grad_norm": 0.2667444994608358, + "learning_rate": 0.000164247112910997, + "loss": 0.9534, + "step": 6564 + }, + { + "epoch": 0.63, + "grad_norm": 0.295387882239055, + "learning_rate": 0.0001642349894749669, + "loss": 1.1292, + "step": 6565 + }, + { + "epoch": 0.63, + "grad_norm": 0.25364552624174114, + "learning_rate": 0.00016422286443141585, + "loss": 1.0281, + "step": 6566 + }, + { + "epoch": 0.63, + "grad_norm": 0.30870163058659145, + "learning_rate": 0.00016421073778064726, + "loss": 0.9925, + "step": 6567 + }, + { + "epoch": 0.63, + "grad_norm": 0.31418016446029906, + "learning_rate": 0.0001641986095229646, + "loss": 1.0183, + "step": 6568 + }, + { + "epoch": 0.63, + "grad_norm": 0.30568890062391635, + "learning_rate": 0.0001641864796586714, + "loss": 1.1813, + "step": 6569 + }, + { + "epoch": 0.63, + "grad_norm": 0.28403262150412845, + "learning_rate": 0.00016417434818807118, + "loss": 1.1134, + "step": 6570 + }, + { + "epoch": 0.63, + "grad_norm": 0.26397330303772965, + "learning_rate": 0.00016416221511146757, + "loss": 1.113, + "step": 6571 + }, + { + "epoch": 0.63, + "grad_norm": 0.2863559947419068, + "learning_rate": 0.0001641500804291642, + "loss": 1.1201, + "step": 6572 + }, + { + "epoch": 0.63, + "grad_norm": 0.24406593687365424, + "learning_rate": 0.00016413794414146476, + "loss": 1.0542, + "step": 6573 + }, + { + "epoch": 0.63, + "grad_norm": 0.27139228105504454, + "learning_rate": 0.00016412580624867299, + "loss": 1.012, + "step": 6574 + }, + { + "epoch": 0.63, + "grad_norm": 0.27171095999953015, + "learning_rate": 0.00016411366675109256, + "loss": 0.9942, + "step": 6575 + }, + { + "epoch": 0.63, + "grad_norm": 0.2692825756718052, + "learning_rate": 0.00016410152564902734, + "loss": 1.0619, + "step": 6576 + }, + { + "epoch": 0.63, + "grad_norm": 0.2694897408553471, + "learning_rate": 0.00016408938294278118, + "loss": 1.1153, + "step": 6577 + }, + { + "epoch": 0.63, + "grad_norm": 0.2946068069084391, + "learning_rate": 0.0001640772386326579, + "loss": 0.9959, + "step": 6578 + }, + { + "epoch": 0.63, + "grad_norm": 0.2732041052214766, + "learning_rate": 0.0001640650927189615, + "loss": 1.1325, + "step": 6579 + }, + { + "epoch": 0.63, + "grad_norm": 0.25828674961643106, + "learning_rate": 0.00016405294520199586, + "loss": 1.0311, + "step": 6580 + }, + { + "epoch": 0.63, + "grad_norm": 0.2528935519865577, + "learning_rate": 0.000164040796082065, + "loss": 1.0198, + "step": 6581 + }, + { + "epoch": 0.63, + "grad_norm": 0.24788820972140796, + "learning_rate": 0.00016402864535947298, + "loss": 1.0777, + "step": 6582 + }, + { + "epoch": 0.63, + "grad_norm": 0.27027895661109375, + "learning_rate": 0.00016401649303452386, + "loss": 0.9911, + "step": 6583 + }, + { + "epoch": 0.63, + "grad_norm": 0.29776619082303163, + "learning_rate": 0.0001640043391075218, + "loss": 0.995, + "step": 6584 + }, + { + "epoch": 0.63, + "grad_norm": 0.25164405759102015, + "learning_rate": 0.0001639921835787709, + "loss": 1.0057, + "step": 6585 + }, + { + "epoch": 0.63, + "grad_norm": 0.28442402581138315, + "learning_rate": 0.00016398002644857538, + "loss": 1.0983, + "step": 6586 + }, + { + "epoch": 0.63, + "grad_norm": 0.27597236003640674, + "learning_rate": 0.00016396786771723953, + "loss": 1.0816, + "step": 6587 + }, + { + "epoch": 0.63, + "grad_norm": 0.3036495741618543, + "learning_rate": 0.00016395570738506754, + "loss": 1.0313, + "step": 6588 + }, + { + "epoch": 0.63, + "grad_norm": 0.2983086346865117, + "learning_rate": 0.0001639435454523638, + "loss": 1.1382, + "step": 6589 + }, + { + "epoch": 0.63, + "grad_norm": 0.24415345911957764, + "learning_rate": 0.00016393138191943266, + "loss": 0.9764, + "step": 6590 + }, + { + "epoch": 0.63, + "grad_norm": 0.26401568236315054, + "learning_rate": 0.0001639192167865785, + "loss": 1.0753, + "step": 6591 + }, + { + "epoch": 0.63, + "grad_norm": 0.30616078316140916, + "learning_rate": 0.00016390705005410577, + "loss": 1.1654, + "step": 6592 + }, + { + "epoch": 0.63, + "grad_norm": 0.268584602967286, + "learning_rate": 0.00016389488172231895, + "loss": 1.1281, + "step": 6593 + }, + { + "epoch": 0.63, + "grad_norm": 0.3042044038945393, + "learning_rate": 0.00016388271179152255, + "loss": 1.1451, + "step": 6594 + }, + { + "epoch": 0.63, + "grad_norm": 0.30865017852012816, + "learning_rate": 0.00016387054026202114, + "loss": 1.0497, + "step": 6595 + }, + { + "epoch": 0.63, + "grad_norm": 0.2981587654841934, + "learning_rate": 0.00016385836713411932, + "loss": 1.0117, + "step": 6596 + }, + { + "epoch": 0.63, + "grad_norm": 0.30634610666151335, + "learning_rate": 0.00016384619240812173, + "loss": 1.0624, + "step": 6597 + }, + { + "epoch": 0.63, + "grad_norm": 0.31330552508402865, + "learning_rate": 0.00016383401608433305, + "loss": 1.0021, + "step": 6598 + }, + { + "epoch": 0.63, + "grad_norm": 0.2746478862766659, + "learning_rate": 0.00016382183816305798, + "loss": 1.092, + "step": 6599 + }, + { + "epoch": 0.63, + "grad_norm": 0.31666878049663144, + "learning_rate": 0.00016380965864460135, + "loss": 1.1224, + "step": 6600 + }, + { + "epoch": 0.63, + "grad_norm": 0.2805806328462776, + "learning_rate": 0.00016379747752926787, + "loss": 1.0326, + "step": 6601 + }, + { + "epoch": 0.63, + "grad_norm": 0.27859538132326334, + "learning_rate": 0.00016378529481736242, + "loss": 1.0901, + "step": 6602 + }, + { + "epoch": 0.63, + "grad_norm": 0.27971190411521635, + "learning_rate": 0.00016377311050918989, + "loss": 1.0349, + "step": 6603 + }, + { + "epoch": 0.63, + "grad_norm": 0.35928083318694953, + "learning_rate": 0.0001637609246050552, + "loss": 1.0325, + "step": 6604 + }, + { + "epoch": 0.63, + "grad_norm": 0.2791809789319585, + "learning_rate": 0.00016374873710526327, + "loss": 1.101, + "step": 6605 + }, + { + "epoch": 0.63, + "grad_norm": 0.25414723944354833, + "learning_rate": 0.00016373654801011913, + "loss": 0.9845, + "step": 6606 + }, + { + "epoch": 0.63, + "grad_norm": 0.29501863717394433, + "learning_rate": 0.00016372435731992784, + "loss": 1.0297, + "step": 6607 + }, + { + "epoch": 0.63, + "grad_norm": 0.27739025744682017, + "learning_rate": 0.00016371216503499443, + "loss": 1.1649, + "step": 6608 + }, + { + "epoch": 0.63, + "grad_norm": 0.27992773939480853, + "learning_rate": 0.0001636999711556241, + "loss": 1.1623, + "step": 6609 + }, + { + "epoch": 0.63, + "grad_norm": 0.256055609384181, + "learning_rate": 0.00016368777568212192, + "loss": 1.0349, + "step": 6610 + }, + { + "epoch": 0.63, + "grad_norm": 0.27352283442582775, + "learning_rate": 0.00016367557861479316, + "loss": 1.081, + "step": 6611 + }, + { + "epoch": 0.63, + "grad_norm": 0.2878898287968566, + "learning_rate": 0.00016366337995394296, + "loss": 1.0915, + "step": 6612 + }, + { + "epoch": 0.63, + "grad_norm": 0.29458815084779255, + "learning_rate": 0.0001636511796998767, + "loss": 1.0093, + "step": 6613 + }, + { + "epoch": 0.63, + "grad_norm": 0.2833302474424508, + "learning_rate": 0.0001636389778528997, + "loss": 1.1375, + "step": 6614 + }, + { + "epoch": 0.63, + "grad_norm": 0.2827351352279611, + "learning_rate": 0.00016362677441331727, + "loss": 1.133, + "step": 6615 + }, + { + "epoch": 0.63, + "grad_norm": 0.2776267400325992, + "learning_rate": 0.0001636145693814348, + "loss": 1.082, + "step": 6616 + }, + { + "epoch": 0.63, + "grad_norm": 0.29210767304951707, + "learning_rate": 0.00016360236275755777, + "loss": 1.0961, + "step": 6617 + }, + { + "epoch": 0.63, + "grad_norm": 0.3111465892344082, + "learning_rate": 0.00016359015454199161, + "loss": 1.0517, + "step": 6618 + }, + { + "epoch": 0.63, + "grad_norm": 0.30230280360893014, + "learning_rate": 0.0001635779447350419, + "loss": 1.0257, + "step": 6619 + }, + { + "epoch": 0.63, + "grad_norm": 0.2640103618422073, + "learning_rate": 0.00016356573333701414, + "loss": 0.9745, + "step": 6620 + }, + { + "epoch": 0.63, + "grad_norm": 0.27301899027846943, + "learning_rate": 0.00016355352034821396, + "loss": 0.974, + "step": 6621 + }, + { + "epoch": 0.63, + "grad_norm": 0.2738570363795273, + "learning_rate": 0.00016354130576894698, + "loss": 0.995, + "step": 6622 + }, + { + "epoch": 0.63, + "grad_norm": 0.29413026629702177, + "learning_rate": 0.00016352908959951892, + "loss": 1.16, + "step": 6623 + }, + { + "epoch": 0.63, + "grad_norm": 0.30276135434523793, + "learning_rate": 0.00016351687184023547, + "loss": 1.0801, + "step": 6624 + }, + { + "epoch": 0.63, + "grad_norm": 0.26212091458025155, + "learning_rate": 0.00016350465249140235, + "loss": 1.1354, + "step": 6625 + }, + { + "epoch": 0.63, + "grad_norm": 0.295200448114549, + "learning_rate": 0.0001634924315533254, + "loss": 1.1294, + "step": 6626 + }, + { + "epoch": 0.63, + "grad_norm": 0.2719331912386029, + "learning_rate": 0.00016348020902631047, + "loss": 1.0, + "step": 6627 + }, + { + "epoch": 0.63, + "grad_norm": 0.2801548078187952, + "learning_rate": 0.0001634679849106634, + "loss": 1.0472, + "step": 6628 + }, + { + "epoch": 0.63, + "grad_norm": 0.28890264685867756, + "learning_rate": 0.0001634557592066901, + "loss": 1.159, + "step": 6629 + }, + { + "epoch": 0.63, + "grad_norm": 0.2972148378840613, + "learning_rate": 0.00016344353191469657, + "loss": 1.0649, + "step": 6630 + }, + { + "epoch": 0.63, + "grad_norm": 0.30972438758134824, + "learning_rate": 0.00016343130303498877, + "loss": 1.1313, + "step": 6631 + }, + { + "epoch": 0.63, + "grad_norm": 0.2787430037836057, + "learning_rate": 0.00016341907256787273, + "loss": 1.0461, + "step": 6632 + }, + { + "epoch": 0.63, + "grad_norm": 0.2509205560032734, + "learning_rate": 0.00016340684051365458, + "loss": 1.1465, + "step": 6633 + }, + { + "epoch": 0.63, + "grad_norm": 0.3371469012042158, + "learning_rate": 0.00016339460687264039, + "loss": 0.9917, + "step": 6634 + }, + { + "epoch": 0.63, + "grad_norm": 0.28031780574967835, + "learning_rate": 0.0001633823716451363, + "loss": 1.0859, + "step": 6635 + }, + { + "epoch": 0.63, + "grad_norm": 0.32014706377626473, + "learning_rate": 0.00016337013483144853, + "loss": 1.0125, + "step": 6636 + }, + { + "epoch": 0.63, + "grad_norm": 0.28048230557063836, + "learning_rate": 0.00016335789643188333, + "loss": 1.119, + "step": 6637 + }, + { + "epoch": 0.64, + "grad_norm": 0.27484347493922334, + "learning_rate": 0.0001633456564467469, + "loss": 1.001, + "step": 6638 + }, + { + "epoch": 0.64, + "grad_norm": 0.26589912392685283, + "learning_rate": 0.00016333341487634567, + "loss": 1.2356, + "step": 6639 + }, + { + "epoch": 0.64, + "grad_norm": 0.3261860350977622, + "learning_rate": 0.0001633211717209859, + "loss": 1.1768, + "step": 6640 + }, + { + "epoch": 0.64, + "grad_norm": 0.30705986319442513, + "learning_rate": 0.000163308926980974, + "loss": 1.072, + "step": 6641 + }, + { + "epoch": 0.64, + "grad_norm": 0.3098762264842706, + "learning_rate": 0.00016329668065661644, + "loss": 1.088, + "step": 6642 + }, + { + "epoch": 0.64, + "grad_norm": 0.24610152808075564, + "learning_rate": 0.00016328443274821964, + "loss": 1.0679, + "step": 6643 + }, + { + "epoch": 0.64, + "grad_norm": 0.293387945611742, + "learning_rate": 0.00016327218325609018, + "loss": 1.0764, + "step": 6644 + }, + { + "epoch": 0.64, + "grad_norm": 0.24880390773011782, + "learning_rate": 0.0001632599321805345, + "loss": 1.1687, + "step": 6645 + }, + { + "epoch": 0.64, + "grad_norm": 0.2871610507138743, + "learning_rate": 0.00016324767952185932, + "loss": 1.0554, + "step": 6646 + }, + { + "epoch": 0.64, + "grad_norm": 0.33306363001968703, + "learning_rate": 0.00016323542528037116, + "loss": 1.066, + "step": 6647 + }, + { + "epoch": 0.64, + "grad_norm": 0.2601315042428321, + "learning_rate": 0.0001632231694563768, + "loss": 1.062, + "step": 6648 + }, + { + "epoch": 0.64, + "grad_norm": 0.3122750901555906, + "learning_rate": 0.00016321091205018283, + "loss": 0.988, + "step": 6649 + }, + { + "epoch": 0.64, + "grad_norm": 0.2530794057553161, + "learning_rate": 0.0001631986530620961, + "loss": 0.9536, + "step": 6650 + }, + { + "epoch": 0.64, + "grad_norm": 0.25248623075991256, + "learning_rate": 0.00016318639249242336, + "loss": 1.0528, + "step": 6651 + }, + { + "epoch": 0.64, + "grad_norm": 0.25241303157165856, + "learning_rate": 0.00016317413034147143, + "loss": 0.9887, + "step": 6652 + }, + { + "epoch": 0.64, + "grad_norm": 0.25151161516493253, + "learning_rate": 0.00016316186660954716, + "loss": 1.0605, + "step": 6653 + }, + { + "epoch": 0.64, + "grad_norm": 0.2537064242578609, + "learning_rate": 0.0001631496012969575, + "loss": 1.0014, + "step": 6654 + }, + { + "epoch": 0.64, + "grad_norm": 0.26974830206999884, + "learning_rate": 0.00016313733440400941, + "loss": 1.1359, + "step": 6655 + }, + { + "epoch": 0.64, + "grad_norm": 0.28423801348421174, + "learning_rate": 0.0001631250659310098, + "loss": 1.0911, + "step": 6656 + }, + { + "epoch": 0.64, + "grad_norm": 0.3067572434414906, + "learning_rate": 0.00016311279587826575, + "loss": 1.1511, + "step": 6657 + }, + { + "epoch": 0.64, + "grad_norm": 0.30349651329317273, + "learning_rate": 0.00016310052424608435, + "loss": 1.0717, + "step": 6658 + }, + { + "epoch": 0.64, + "grad_norm": 0.31106706237441717, + "learning_rate": 0.00016308825103477262, + "loss": 1.0138, + "step": 6659 + }, + { + "epoch": 0.64, + "grad_norm": 0.30512529577510444, + "learning_rate": 0.0001630759762446378, + "loss": 1.1175, + "step": 6660 + }, + { + "epoch": 0.64, + "grad_norm": 0.3156400341146579, + "learning_rate": 0.00016306369987598705, + "loss": 1.0594, + "step": 6661 + }, + { + "epoch": 0.64, + "grad_norm": 0.3024624555405739, + "learning_rate": 0.00016305142192912754, + "loss": 1.1203, + "step": 6662 + }, + { + "epoch": 0.64, + "grad_norm": 0.24676397866245015, + "learning_rate": 0.00016303914240436656, + "loss": 1.0936, + "step": 6663 + }, + { + "epoch": 0.64, + "grad_norm": 0.2585771707617533, + "learning_rate": 0.00016302686130201144, + "loss": 1.1232, + "step": 6664 + }, + { + "epoch": 0.64, + "grad_norm": 0.2726696313152824, + "learning_rate": 0.00016301457862236954, + "loss": 1.0913, + "step": 6665 + }, + { + "epoch": 0.64, + "grad_norm": 0.2760746274104927, + "learning_rate": 0.00016300229436574815, + "loss": 0.9746, + "step": 6666 + }, + { + "epoch": 0.64, + "grad_norm": 0.2755395263384695, + "learning_rate": 0.00016299000853245475, + "loss": 1.0183, + "step": 6667 + }, + { + "epoch": 0.64, + "grad_norm": 0.27852069729003387, + "learning_rate": 0.00016297772112279683, + "loss": 1.0574, + "step": 6668 + }, + { + "epoch": 0.64, + "grad_norm": 0.27979974439621436, + "learning_rate": 0.00016296543213708184, + "loss": 0.9499, + "step": 6669 + }, + { + "epoch": 0.64, + "grad_norm": 0.2960060531441859, + "learning_rate": 0.00016295314157561736, + "loss": 1.0549, + "step": 6670 + }, + { + "epoch": 0.64, + "grad_norm": 0.2660000695711491, + "learning_rate": 0.00016294084943871092, + "loss": 1.1072, + "step": 6671 + }, + { + "epoch": 0.64, + "grad_norm": 0.2564269859230904, + "learning_rate": 0.0001629285557266702, + "loss": 1.1333, + "step": 6672 + }, + { + "epoch": 0.64, + "grad_norm": 0.27504533401306863, + "learning_rate": 0.00016291626043980282, + "loss": 1.0034, + "step": 6673 + }, + { + "epoch": 0.64, + "grad_norm": 0.3099324729512832, + "learning_rate": 0.00016290396357841646, + "loss": 1.0459, + "step": 6674 + }, + { + "epoch": 0.64, + "grad_norm": 0.2844820883790574, + "learning_rate": 0.00016289166514281888, + "loss": 0.9184, + "step": 6675 + }, + { + "epoch": 0.64, + "grad_norm": 0.27114882440422344, + "learning_rate": 0.00016287936513331787, + "loss": 1.1727, + "step": 6676 + }, + { + "epoch": 0.64, + "grad_norm": 0.2720012225716546, + "learning_rate": 0.00016286706355022118, + "loss": 1.0654, + "step": 6677 + }, + { + "epoch": 0.64, + "grad_norm": 0.2916372120440236, + "learning_rate": 0.00016285476039383675, + "loss": 1.0744, + "step": 6678 + }, + { + "epoch": 0.64, + "grad_norm": 0.27052138249686447, + "learning_rate": 0.00016284245566447245, + "loss": 1.1014, + "step": 6679 + }, + { + "epoch": 0.64, + "grad_norm": 0.2652236414824706, + "learning_rate": 0.0001628301493624362, + "loss": 1.1672, + "step": 6680 + }, + { + "epoch": 0.64, + "grad_norm": 0.27950873661043685, + "learning_rate": 0.00016281784148803596, + "loss": 1.0014, + "step": 6681 + }, + { + "epoch": 0.64, + "grad_norm": 0.2827281504818738, + "learning_rate": 0.0001628055320415798, + "loss": 0.9676, + "step": 6682 + }, + { + "epoch": 0.64, + "grad_norm": 0.2609854109436044, + "learning_rate": 0.00016279322102337565, + "loss": 0.8978, + "step": 6683 + }, + { + "epoch": 0.64, + "grad_norm": 0.24726688155868715, + "learning_rate": 0.00016278090843373173, + "loss": 1.0843, + "step": 6684 + }, + { + "epoch": 0.64, + "grad_norm": 0.3193189601858733, + "learning_rate": 0.00016276859427295613, + "loss": 1.028, + "step": 6685 + }, + { + "epoch": 0.64, + "grad_norm": 0.2845484123594936, + "learning_rate": 0.00016275627854135698, + "loss": 0.9256, + "step": 6686 + }, + { + "epoch": 0.64, + "grad_norm": 0.29028230142099054, + "learning_rate": 0.00016274396123924252, + "loss": 1.0812, + "step": 6687 + }, + { + "epoch": 0.64, + "grad_norm": 0.32773446539660916, + "learning_rate": 0.000162731642366921, + "loss": 1.0608, + "step": 6688 + }, + { + "epoch": 0.64, + "grad_norm": 0.29790785005575593, + "learning_rate": 0.00016271932192470074, + "loss": 1.0306, + "step": 6689 + }, + { + "epoch": 0.64, + "grad_norm": 0.27883966783277836, + "learning_rate": 0.00016270699991289, + "loss": 1.0423, + "step": 6690 + }, + { + "epoch": 0.64, + "grad_norm": 0.28349188815486853, + "learning_rate": 0.0001626946763317972, + "loss": 1.0897, + "step": 6691 + }, + { + "epoch": 0.64, + "grad_norm": 0.3054195899080486, + "learning_rate": 0.00016268235118173068, + "loss": 1.0126, + "step": 6692 + }, + { + "epoch": 0.64, + "grad_norm": 0.297957213586038, + "learning_rate": 0.00016267002446299891, + "loss": 1.0832, + "step": 6693 + }, + { + "epoch": 0.64, + "grad_norm": 0.2786420902687923, + "learning_rate": 0.00016265769617591046, + "loss": 1.1095, + "step": 6694 + }, + { + "epoch": 0.64, + "grad_norm": 0.28426733069435445, + "learning_rate": 0.00016264536632077376, + "loss": 1.0612, + "step": 6695 + }, + { + "epoch": 0.64, + "grad_norm": 0.27433860519495945, + "learning_rate": 0.0001626330348978974, + "loss": 1.0349, + "step": 6696 + }, + { + "epoch": 0.64, + "grad_norm": 0.317775496735832, + "learning_rate": 0.00016262070190758995, + "loss": 1.1712, + "step": 6697 + }, + { + "epoch": 0.64, + "grad_norm": 0.2568267188069798, + "learning_rate": 0.00016260836735016012, + "loss": 0.9826, + "step": 6698 + }, + { + "epoch": 0.64, + "grad_norm": 0.2852886758776049, + "learning_rate": 0.00016259603122591653, + "loss": 1.0588, + "step": 6699 + }, + { + "epoch": 0.64, + "grad_norm": 0.28264369508139864, + "learning_rate": 0.0001625836935351679, + "loss": 1.139, + "step": 6700 + }, + { + "epoch": 0.64, + "grad_norm": 0.3102222867034462, + "learning_rate": 0.00016257135427822302, + "loss": 1.0697, + "step": 6701 + }, + { + "epoch": 0.64, + "grad_norm": 0.268465641442644, + "learning_rate": 0.00016255901345539072, + "loss": 1.0887, + "step": 6702 + }, + { + "epoch": 0.64, + "grad_norm": 0.30026142767107994, + "learning_rate": 0.00016254667106697972, + "loss": 1.1509, + "step": 6703 + }, + { + "epoch": 0.64, + "grad_norm": 0.2706314630840685, + "learning_rate": 0.000162534327113299, + "loss": 1.0657, + "step": 6704 + }, + { + "epoch": 0.64, + "grad_norm": 0.26773703311582037, + "learning_rate": 0.00016252198159465744, + "loss": 1.1287, + "step": 6705 + }, + { + "epoch": 0.64, + "grad_norm": 0.2957605324588231, + "learning_rate": 0.000162509634511364, + "loss": 1.2115, + "step": 6706 + }, + { + "epoch": 0.64, + "grad_norm": 0.29017697603422365, + "learning_rate": 0.00016249728586372765, + "loss": 1.0255, + "step": 6707 + }, + { + "epoch": 0.64, + "grad_norm": 0.27496495480231525, + "learning_rate": 0.0001624849356520575, + "loss": 1.0261, + "step": 6708 + }, + { + "epoch": 0.64, + "grad_norm": 0.26386368126415394, + "learning_rate": 0.0001624725838766625, + "loss": 1.1855, + "step": 6709 + }, + { + "epoch": 0.64, + "grad_norm": 0.27191856479446785, + "learning_rate": 0.00016246023053785184, + "loss": 1.033, + "step": 6710 + }, + { + "epoch": 0.64, + "grad_norm": 0.26761649181678265, + "learning_rate": 0.0001624478756359347, + "loss": 1.137, + "step": 6711 + }, + { + "epoch": 0.64, + "grad_norm": 0.26810717439455456, + "learning_rate": 0.00016243551917122017, + "loss": 1.169, + "step": 6712 + }, + { + "epoch": 0.64, + "grad_norm": 0.23716095057095146, + "learning_rate": 0.00016242316114401754, + "loss": 0.9461, + "step": 6713 + }, + { + "epoch": 0.64, + "grad_norm": 0.2820129032334972, + "learning_rate": 0.00016241080155463613, + "loss": 1.028, + "step": 6714 + }, + { + "epoch": 0.64, + "grad_norm": 0.29187354158124584, + "learning_rate": 0.00016239844040338513, + "loss": 1.1075, + "step": 6715 + }, + { + "epoch": 0.64, + "grad_norm": 0.2707882691645488, + "learning_rate": 0.00016238607769057396, + "loss": 1.1043, + "step": 6716 + }, + { + "epoch": 0.64, + "grad_norm": 0.27825828988627205, + "learning_rate": 0.00016237371341651198, + "loss": 0.9317, + "step": 6717 + }, + { + "epoch": 0.64, + "grad_norm": 0.24456292837088042, + "learning_rate": 0.00016236134758150863, + "loss": 1.115, + "step": 6718 + }, + { + "epoch": 0.64, + "grad_norm": 0.30642644995469454, + "learning_rate": 0.00016234898018587337, + "loss": 1.0855, + "step": 6719 + }, + { + "epoch": 0.64, + "grad_norm": 0.2536012594797928, + "learning_rate": 0.00016233661122991568, + "loss": 1.0347, + "step": 6720 + }, + { + "epoch": 0.64, + "grad_norm": 0.28201291743191076, + "learning_rate": 0.00016232424071394513, + "loss": 0.923, + "step": 6721 + }, + { + "epoch": 0.64, + "grad_norm": 0.26500412097642506, + "learning_rate": 0.00016231186863827128, + "loss": 1.1197, + "step": 6722 + }, + { + "epoch": 0.64, + "grad_norm": 0.28336108524806514, + "learning_rate": 0.00016229949500320376, + "loss": 0.9663, + "step": 6723 + }, + { + "epoch": 0.64, + "grad_norm": 0.30535153183981684, + "learning_rate": 0.00016228711980905222, + "loss": 1.0546, + "step": 6724 + }, + { + "epoch": 0.64, + "grad_norm": 0.2738980528128915, + "learning_rate": 0.00016227474305612635, + "loss": 1.1652, + "step": 6725 + }, + { + "epoch": 0.64, + "grad_norm": 0.29696172073189514, + "learning_rate": 0.00016226236474473592, + "loss": 1.1307, + "step": 6726 + }, + { + "epoch": 0.64, + "grad_norm": 0.2575129900484908, + "learning_rate": 0.00016224998487519065, + "loss": 1.0553, + "step": 6727 + }, + { + "epoch": 0.64, + "grad_norm": 0.25508951971135796, + "learning_rate": 0.0001622376034478004, + "loss": 0.9931, + "step": 6728 + }, + { + "epoch": 0.64, + "grad_norm": 0.3026293198793625, + "learning_rate": 0.00016222522046287506, + "loss": 1.0979, + "step": 6729 + }, + { + "epoch": 0.64, + "grad_norm": 0.2598437183575833, + "learning_rate": 0.00016221283592072442, + "loss": 1.0128, + "step": 6730 + }, + { + "epoch": 0.64, + "grad_norm": 0.3163844838276975, + "learning_rate": 0.00016220044982165845, + "loss": 1.1538, + "step": 6731 + }, + { + "epoch": 0.64, + "grad_norm": 0.2362891477357362, + "learning_rate": 0.00016218806216598713, + "loss": 1.0506, + "step": 6732 + }, + { + "epoch": 0.64, + "grad_norm": 0.26662591447108, + "learning_rate": 0.00016217567295402052, + "loss": 0.962, + "step": 6733 + }, + { + "epoch": 0.64, + "grad_norm": 0.29725496528185136, + "learning_rate": 0.00016216328218606856, + "loss": 1.0977, + "step": 6734 + }, + { + "epoch": 0.64, + "grad_norm": 0.2571941755837014, + "learning_rate": 0.00016215088986244145, + "loss": 1.0301, + "step": 6735 + }, + { + "epoch": 0.64, + "grad_norm": 0.24442116258690824, + "learning_rate": 0.00016213849598344923, + "loss": 1.0068, + "step": 6736 + }, + { + "epoch": 0.64, + "grad_norm": 0.2747700753479199, + "learning_rate": 0.0001621261005494021, + "loss": 0.984, + "step": 6737 + }, + { + "epoch": 0.64, + "grad_norm": 0.2947494451070546, + "learning_rate": 0.00016211370356061024, + "loss": 0.9723, + "step": 6738 + }, + { + "epoch": 0.64, + "grad_norm": 0.28377559231615324, + "learning_rate": 0.00016210130501738393, + "loss": 1.1093, + "step": 6739 + }, + { + "epoch": 0.64, + "grad_norm": 0.281938530095994, + "learning_rate": 0.00016208890492003345, + "loss": 1.0411, + "step": 6740 + }, + { + "epoch": 0.64, + "grad_norm": 0.25782869247051643, + "learning_rate": 0.00016207650326886908, + "loss": 1.0448, + "step": 6741 + }, + { + "epoch": 0.65, + "grad_norm": 0.27863659241136324, + "learning_rate": 0.0001620641000642012, + "loss": 1.0766, + "step": 6742 + }, + { + "epoch": 0.65, + "grad_norm": 0.3034063935548644, + "learning_rate": 0.00016205169530634022, + "loss": 1.0668, + "step": 6743 + }, + { + "epoch": 0.65, + "grad_norm": 0.29522301302575077, + "learning_rate": 0.00016203928899559655, + "loss": 1.0437, + "step": 6744 + }, + { + "epoch": 0.65, + "grad_norm": 0.2407728356207695, + "learning_rate": 0.0001620268811322807, + "loss": 1.1312, + "step": 6745 + }, + { + "epoch": 0.65, + "grad_norm": 0.26324055091027326, + "learning_rate": 0.0001620144717167032, + "loss": 1.0093, + "step": 6746 + }, + { + "epoch": 0.65, + "grad_norm": 0.2804316768014678, + "learning_rate": 0.0001620020607491745, + "loss": 1.0229, + "step": 6747 + }, + { + "epoch": 0.65, + "grad_norm": 0.2753277098551558, + "learning_rate": 0.00016198964823000531, + "loss": 1.0634, + "step": 6748 + }, + { + "epoch": 0.65, + "grad_norm": 0.2800366285769209, + "learning_rate": 0.00016197723415950618, + "loss": 1.0365, + "step": 6749 + }, + { + "epoch": 0.65, + "grad_norm": 0.302146197809392, + "learning_rate": 0.00016196481853798783, + "loss": 0.9998, + "step": 6750 + }, + { + "epoch": 0.65, + "grad_norm": 0.3568635660133519, + "learning_rate": 0.00016195240136576098, + "loss": 1.1658, + "step": 6751 + }, + { + "epoch": 0.65, + "grad_norm": 0.2716917931898554, + "learning_rate": 0.00016193998264313632, + "loss": 1.0812, + "step": 6752 + }, + { + "epoch": 0.65, + "grad_norm": 0.27290621018790484, + "learning_rate": 0.0001619275623704247, + "loss": 1.094, + "step": 6753 + }, + { + "epoch": 0.65, + "grad_norm": 0.2682620960621536, + "learning_rate": 0.00016191514054793687, + "loss": 1.0381, + "step": 6754 + }, + { + "epoch": 0.65, + "grad_norm": 0.2845594233710903, + "learning_rate": 0.00016190271717598376, + "loss": 1.1278, + "step": 6755 + }, + { + "epoch": 0.65, + "grad_norm": 0.27355020118683776, + "learning_rate": 0.0001618902922548762, + "loss": 0.9993, + "step": 6756 + }, + { + "epoch": 0.65, + "grad_norm": 0.287166856164538, + "learning_rate": 0.00016187786578492527, + "loss": 1.1079, + "step": 6757 + }, + { + "epoch": 0.65, + "grad_norm": 0.31367583134610894, + "learning_rate": 0.00016186543776644177, + "loss": 1.0392, + "step": 6758 + }, + { + "epoch": 0.65, + "grad_norm": 0.26476803361212475, + "learning_rate": 0.00016185300819973687, + "loss": 1.0339, + "step": 6759 + }, + { + "epoch": 0.65, + "grad_norm": 0.2869935686558382, + "learning_rate": 0.00016184057708512156, + "loss": 1.0353, + "step": 6760 + }, + { + "epoch": 0.65, + "grad_norm": 0.2607335209317506, + "learning_rate": 0.0001618281444229069, + "loss": 1.0328, + "step": 6761 + }, + { + "epoch": 0.65, + "grad_norm": 0.2819258447169869, + "learning_rate": 0.0001618157102134041, + "loss": 1.0694, + "step": 6762 + }, + { + "epoch": 0.65, + "grad_norm": 0.2486602423602287, + "learning_rate": 0.0001618032744569243, + "loss": 1.0398, + "step": 6763 + }, + { + "epoch": 0.65, + "grad_norm": 0.2964468238045624, + "learning_rate": 0.0001617908371537787, + "loss": 1.1674, + "step": 6764 + }, + { + "epoch": 0.65, + "grad_norm": 0.28968276628283657, + "learning_rate": 0.00016177839830427862, + "loss": 1.0923, + "step": 6765 + }, + { + "epoch": 0.65, + "grad_norm": 0.30259410481968824, + "learning_rate": 0.00016176595790873526, + "loss": 1.1047, + "step": 6766 + }, + { + "epoch": 0.65, + "grad_norm": 0.299883854383291, + "learning_rate": 0.00016175351596745997, + "loss": 1.0987, + "step": 6767 + }, + { + "epoch": 0.65, + "grad_norm": 0.28742268599750015, + "learning_rate": 0.00016174107248076414, + "loss": 1.1337, + "step": 6768 + }, + { + "epoch": 0.65, + "grad_norm": 0.27078457130344585, + "learning_rate": 0.00016172862744895917, + "loss": 1.1273, + "step": 6769 + }, + { + "epoch": 0.65, + "grad_norm": 0.28329813420365574, + "learning_rate": 0.00016171618087235652, + "loss": 1.042, + "step": 6770 + }, + { + "epoch": 0.65, + "grad_norm": 0.2835411240856413, + "learning_rate": 0.00016170373275126761, + "loss": 1.1055, + "step": 6771 + }, + { + "epoch": 0.65, + "grad_norm": 0.2697682905187557, + "learning_rate": 0.00016169128308600404, + "loss": 1.1592, + "step": 6772 + }, + { + "epoch": 0.65, + "grad_norm": 0.2606901412284456, + "learning_rate": 0.00016167883187687737, + "loss": 1.0796, + "step": 6773 + }, + { + "epoch": 0.65, + "grad_norm": 0.2718079210220933, + "learning_rate": 0.0001616663791241991, + "loss": 0.9178, + "step": 6774 + }, + { + "epoch": 0.65, + "grad_norm": 0.2992698573782557, + "learning_rate": 0.00016165392482828098, + "loss": 0.9155, + "step": 6775 + }, + { + "epoch": 0.65, + "grad_norm": 0.2820273589437464, + "learning_rate": 0.00016164146898943463, + "loss": 1.1096, + "step": 6776 + }, + { + "epoch": 0.65, + "grad_norm": 0.27964729949513284, + "learning_rate": 0.00016162901160797182, + "loss": 1.0301, + "step": 6777 + }, + { + "epoch": 0.65, + "grad_norm": 0.30035527749172686, + "learning_rate": 0.0001616165526842042, + "loss": 1.0112, + "step": 6778 + }, + { + "epoch": 0.65, + "grad_norm": 0.26367756968433076, + "learning_rate": 0.0001616040922184437, + "loss": 1.062, + "step": 6779 + }, + { + "epoch": 0.65, + "grad_norm": 0.30454178782431157, + "learning_rate": 0.000161591630211002, + "loss": 1.0908, + "step": 6780 + }, + { + "epoch": 0.65, + "grad_norm": 0.2736709005007848, + "learning_rate": 0.0001615791666621911, + "loss": 1.0642, + "step": 6781 + }, + { + "epoch": 0.65, + "grad_norm": 0.28955239699459784, + "learning_rate": 0.00016156670157232278, + "loss": 1.038, + "step": 6782 + }, + { + "epoch": 0.65, + "grad_norm": 0.3055984799372455, + "learning_rate": 0.00016155423494170913, + "loss": 1.083, + "step": 6783 + }, + { + "epoch": 0.65, + "grad_norm": 0.28914856789930005, + "learning_rate": 0.00016154176677066204, + "loss": 1.0251, + "step": 6784 + }, + { + "epoch": 0.65, + "grad_norm": 0.2937920207039406, + "learning_rate": 0.00016152929705949356, + "loss": 1.0395, + "step": 6785 + }, + { + "epoch": 0.65, + "grad_norm": 0.26681380130500015, + "learning_rate": 0.00016151682580851576, + "loss": 1.1342, + "step": 6786 + }, + { + "epoch": 0.65, + "grad_norm": 0.31513605238395026, + "learning_rate": 0.00016150435301804072, + "loss": 1.1459, + "step": 6787 + }, + { + "epoch": 0.65, + "grad_norm": 0.2962395662354257, + "learning_rate": 0.0001614918786883806, + "loss": 0.9866, + "step": 6788 + }, + { + "epoch": 0.65, + "grad_norm": 0.27715689822063977, + "learning_rate": 0.00016147940281984754, + "loss": 1.0869, + "step": 6789 + }, + { + "epoch": 0.65, + "grad_norm": 0.261319217896763, + "learning_rate": 0.00016146692541275383, + "loss": 0.9879, + "step": 6790 + }, + { + "epoch": 0.65, + "grad_norm": 0.29020817288316764, + "learning_rate": 0.00016145444646741166, + "loss": 1.1018, + "step": 6791 + }, + { + "epoch": 0.65, + "grad_norm": 0.2564090714223762, + "learning_rate": 0.00016144196598413336, + "loss": 1.0812, + "step": 6792 + }, + { + "epoch": 0.65, + "grad_norm": 0.252997662859385, + "learning_rate": 0.00016142948396323124, + "loss": 1.0051, + "step": 6793 + }, + { + "epoch": 0.65, + "grad_norm": 0.27979442491585477, + "learning_rate": 0.00016141700040501767, + "loss": 1.0166, + "step": 6794 + }, + { + "epoch": 0.65, + "grad_norm": 0.2723199327047408, + "learning_rate": 0.00016140451530980503, + "loss": 1.0498, + "step": 6795 + }, + { + "epoch": 0.65, + "grad_norm": 0.24997068906233713, + "learning_rate": 0.00016139202867790586, + "loss": 1.0254, + "step": 6796 + }, + { + "epoch": 0.65, + "grad_norm": 0.25606449185707403, + "learning_rate": 0.00016137954050963256, + "loss": 1.0375, + "step": 6797 + }, + { + "epoch": 0.65, + "grad_norm": 0.27072931875871203, + "learning_rate": 0.0001613670508052977, + "loss": 1.1278, + "step": 6798 + }, + { + "epoch": 0.65, + "grad_norm": 0.2861760154233243, + "learning_rate": 0.00016135455956521383, + "loss": 1.0276, + "step": 6799 + }, + { + "epoch": 0.65, + "grad_norm": 0.281890088050873, + "learning_rate": 0.00016134206678969351, + "loss": 1.151, + "step": 6800 + }, + { + "epoch": 0.65, + "grad_norm": 0.2561578106187753, + "learning_rate": 0.00016132957247904948, + "loss": 1.0579, + "step": 6801 + }, + { + "epoch": 0.65, + "grad_norm": 0.296476813043644, + "learning_rate": 0.0001613170766335943, + "loss": 1.0815, + "step": 6802 + }, + { + "epoch": 0.65, + "grad_norm": 0.24047219584398974, + "learning_rate": 0.00016130457925364074, + "loss": 1.0674, + "step": 6803 + }, + { + "epoch": 0.65, + "grad_norm": 0.28537999657913954, + "learning_rate": 0.00016129208033950157, + "loss": 1.1703, + "step": 6804 + }, + { + "epoch": 0.65, + "grad_norm": 0.3071076656903495, + "learning_rate": 0.00016127957989148958, + "loss": 1.1097, + "step": 6805 + }, + { + "epoch": 0.65, + "grad_norm": 0.2912482911905167, + "learning_rate": 0.00016126707790991757, + "loss": 1.0033, + "step": 6806 + }, + { + "epoch": 0.65, + "grad_norm": 0.2873587122615141, + "learning_rate": 0.00016125457439509843, + "loss": 1.1402, + "step": 6807 + }, + { + "epoch": 0.65, + "grad_norm": 0.2822651267978888, + "learning_rate": 0.00016124206934734509, + "loss": 1.0684, + "step": 6808 + }, + { + "epoch": 0.65, + "grad_norm": 0.30569024152993257, + "learning_rate": 0.0001612295627669705, + "loss": 1.0356, + "step": 6809 + }, + { + "epoch": 0.65, + "grad_norm": 0.2552189424981213, + "learning_rate": 0.00016121705465428756, + "loss": 1.1015, + "step": 6810 + }, + { + "epoch": 0.65, + "grad_norm": 0.2780062595426105, + "learning_rate": 0.0001612045450096094, + "loss": 1.0252, + "step": 6811 + }, + { + "epoch": 0.65, + "grad_norm": 0.2958626948496943, + "learning_rate": 0.000161192033833249, + "loss": 1.047, + "step": 6812 + }, + { + "epoch": 0.65, + "grad_norm": 0.2547715653655437, + "learning_rate": 0.0001611795211255195, + "loss": 0.9724, + "step": 6813 + }, + { + "epoch": 0.65, + "grad_norm": 0.26184844172876026, + "learning_rate": 0.00016116700688673406, + "loss": 0.9989, + "step": 6814 + }, + { + "epoch": 0.65, + "grad_norm": 0.2903838433558316, + "learning_rate": 0.0001611544911172058, + "loss": 1.0365, + "step": 6815 + }, + { + "epoch": 0.65, + "grad_norm": 0.2787578244449706, + "learning_rate": 0.00016114197381724798, + "loss": 1.1122, + "step": 6816 + }, + { + "epoch": 0.65, + "grad_norm": 0.2890262361588652, + "learning_rate": 0.00016112945498717384, + "loss": 0.9743, + "step": 6817 + }, + { + "epoch": 0.65, + "grad_norm": 0.25957088901852887, + "learning_rate": 0.00016111693462729666, + "loss": 1.0411, + "step": 6818 + }, + { + "epoch": 0.65, + "grad_norm": 0.29141337706844284, + "learning_rate": 0.0001611044127379298, + "loss": 1.0738, + "step": 6819 + }, + { + "epoch": 0.65, + "grad_norm": 0.27868532305453414, + "learning_rate": 0.00016109188931938658, + "loss": 1.0309, + "step": 6820 + }, + { + "epoch": 0.65, + "grad_norm": 0.2865127928357969, + "learning_rate": 0.00016107936437198048, + "loss": 1.0432, + "step": 6821 + }, + { + "epoch": 0.65, + "grad_norm": 0.28901415788858237, + "learning_rate": 0.00016106683789602485, + "loss": 1.0913, + "step": 6822 + }, + { + "epoch": 0.65, + "grad_norm": 0.31087668233226007, + "learning_rate": 0.00016105430989183324, + "loss": 1.0044, + "step": 6823 + }, + { + "epoch": 0.65, + "grad_norm": 0.2907385138727252, + "learning_rate": 0.0001610417803597192, + "loss": 0.9812, + "step": 6824 + }, + { + "epoch": 0.65, + "grad_norm": 0.2955311073850929, + "learning_rate": 0.00016102924929999618, + "loss": 0.9718, + "step": 6825 + }, + { + "epoch": 0.65, + "grad_norm": 0.25938433521785725, + "learning_rate": 0.00016101671671297786, + "loss": 1.0653, + "step": 6826 + }, + { + "epoch": 0.65, + "grad_norm": 0.29835875083842295, + "learning_rate": 0.00016100418259897787, + "loss": 1.1136, + "step": 6827 + }, + { + "epoch": 0.65, + "grad_norm": 0.28475427911325607, + "learning_rate": 0.00016099164695830987, + "loss": 1.0817, + "step": 6828 + }, + { + "epoch": 0.65, + "grad_norm": 0.29712133149339015, + "learning_rate": 0.00016097910979128756, + "loss": 1.2519, + "step": 6829 + }, + { + "epoch": 0.65, + "grad_norm": 0.30942764161840036, + "learning_rate": 0.00016096657109822472, + "loss": 1.153, + "step": 6830 + }, + { + "epoch": 0.65, + "grad_norm": 0.24877924990251216, + "learning_rate": 0.0001609540308794351, + "loss": 1.0787, + "step": 6831 + }, + { + "epoch": 0.65, + "grad_norm": 0.2491230221886881, + "learning_rate": 0.00016094148913523254, + "loss": 1.0693, + "step": 6832 + }, + { + "epoch": 0.65, + "grad_norm": 0.33875565455907103, + "learning_rate": 0.00016092894586593098, + "loss": 1.0437, + "step": 6833 + }, + { + "epoch": 0.65, + "grad_norm": 0.25683830659317614, + "learning_rate": 0.00016091640107184418, + "loss": 1.0761, + "step": 6834 + }, + { + "epoch": 0.65, + "grad_norm": 0.28029893510677445, + "learning_rate": 0.00016090385475328616, + "loss": 1.0066, + "step": 6835 + }, + { + "epoch": 0.65, + "grad_norm": 0.2638298088456665, + "learning_rate": 0.00016089130691057096, + "loss": 1.0488, + "step": 6836 + }, + { + "epoch": 0.65, + "grad_norm": 0.2732308385667981, + "learning_rate": 0.0001608787575440125, + "loss": 1.0224, + "step": 6837 + }, + { + "epoch": 0.65, + "grad_norm": 0.2652051822786465, + "learning_rate": 0.0001608662066539249, + "loss": 1.1503, + "step": 6838 + }, + { + "epoch": 0.65, + "grad_norm": 0.3030626041580578, + "learning_rate": 0.00016085365424062218, + "loss": 1.0871, + "step": 6839 + }, + { + "epoch": 0.65, + "grad_norm": 0.30008149751746244, + "learning_rate": 0.00016084110030441853, + "loss": 1.0668, + "step": 6840 + }, + { + "epoch": 0.65, + "grad_norm": 0.2888340632480941, + "learning_rate": 0.00016082854484562813, + "loss": 1.0374, + "step": 6841 + }, + { + "epoch": 0.65, + "grad_norm": 0.2862699272392478, + "learning_rate": 0.00016081598786456516, + "loss": 1.1416, + "step": 6842 + }, + { + "epoch": 0.65, + "grad_norm": 0.2717722410816119, + "learning_rate": 0.00016080342936154388, + "loss": 1.0422, + "step": 6843 + }, + { + "epoch": 0.65, + "grad_norm": 0.2726242149316908, + "learning_rate": 0.00016079086933687854, + "loss": 1.0457, + "step": 6844 + }, + { + "epoch": 0.65, + "grad_norm": 0.2531509697020992, + "learning_rate": 0.0001607783077908835, + "loss": 1.0804, + "step": 6845 + }, + { + "epoch": 0.65, + "grad_norm": 0.2574041172829116, + "learning_rate": 0.0001607657447238731, + "loss": 1.0583, + "step": 6846 + }, + { + "epoch": 0.66, + "grad_norm": 0.28649411980651573, + "learning_rate": 0.00016075318013616174, + "loss": 1.0341, + "step": 6847 + }, + { + "epoch": 0.66, + "grad_norm": 0.32154960312770714, + "learning_rate": 0.0001607406140280639, + "loss": 1.0304, + "step": 6848 + }, + { + "epoch": 0.66, + "grad_norm": 0.27521017806337644, + "learning_rate": 0.000160728046399894, + "loss": 1.0798, + "step": 6849 + }, + { + "epoch": 0.66, + "grad_norm": 0.27096865352974614, + "learning_rate": 0.00016071547725196657, + "loss": 1.1019, + "step": 6850 + }, + { + "epoch": 0.66, + "grad_norm": 0.2827906410490978, + "learning_rate": 0.0001607029065845962, + "loss": 1.0481, + "step": 6851 + }, + { + "epoch": 0.66, + "grad_norm": 0.28755989114991476, + "learning_rate": 0.00016069033439809738, + "loss": 1.221, + "step": 6852 + }, + { + "epoch": 0.66, + "grad_norm": 0.2849156830204197, + "learning_rate": 0.00016067776069278485, + "loss": 1.0805, + "step": 6853 + }, + { + "epoch": 0.66, + "grad_norm": 0.2738501723030775, + "learning_rate": 0.0001606651854689732, + "loss": 1.0744, + "step": 6854 + }, + { + "epoch": 0.66, + "grad_norm": 0.30408775836991736, + "learning_rate": 0.00016065260872697717, + "loss": 1.0593, + "step": 6855 + }, + { + "epoch": 0.66, + "grad_norm": 0.2742033230168456, + "learning_rate": 0.00016064003046711148, + "loss": 1.1384, + "step": 6856 + }, + { + "epoch": 0.66, + "grad_norm": 0.2623388811156543, + "learning_rate": 0.00016062745068969088, + "loss": 1.0076, + "step": 6857 + }, + { + "epoch": 0.66, + "grad_norm": 0.2830978029989236, + "learning_rate": 0.00016061486939503028, + "loss": 1.0956, + "step": 6858 + }, + { + "epoch": 0.66, + "grad_norm": 0.2825549299962127, + "learning_rate": 0.00016060228658344445, + "loss": 0.9938, + "step": 6859 + }, + { + "epoch": 0.66, + "grad_norm": 0.2787473530770543, + "learning_rate": 0.00016058970225524833, + "loss": 1.108, + "step": 6860 + }, + { + "epoch": 0.66, + "grad_norm": 0.2766509594000378, + "learning_rate": 0.00016057711641075684, + "loss": 1.0104, + "step": 6861 + }, + { + "epoch": 0.66, + "grad_norm": 0.2835424799524376, + "learning_rate": 0.00016056452905028492, + "loss": 1.0538, + "step": 6862 + }, + { + "epoch": 0.66, + "grad_norm": 0.2718902303042608, + "learning_rate": 0.0001605519401741476, + "loss": 1.1156, + "step": 6863 + }, + { + "epoch": 0.66, + "grad_norm": 0.31396026069305516, + "learning_rate": 0.0001605393497826599, + "loss": 1.0302, + "step": 6864 + }, + { + "epoch": 0.66, + "grad_norm": 0.3105439907019263, + "learning_rate": 0.00016052675787613696, + "loss": 1.0684, + "step": 6865 + }, + { + "epoch": 0.66, + "grad_norm": 0.2728613957688195, + "learning_rate": 0.00016051416445489385, + "loss": 1.0522, + "step": 6866 + }, + { + "epoch": 0.66, + "grad_norm": 0.26407320682849367, + "learning_rate": 0.00016050156951924574, + "loss": 0.9828, + "step": 6867 + }, + { + "epoch": 0.66, + "grad_norm": 0.286389653868074, + "learning_rate": 0.00016048897306950784, + "loss": 1.0011, + "step": 6868 + }, + { + "epoch": 0.66, + "grad_norm": 0.20462171644432642, + "learning_rate": 0.00016047637510599534, + "loss": 0.9521, + "step": 6869 + }, + { + "epoch": 0.66, + "grad_norm": 0.28123560210233106, + "learning_rate": 0.00016046377562902356, + "loss": 0.9788, + "step": 6870 + }, + { + "epoch": 0.66, + "grad_norm": 0.30001191282665973, + "learning_rate": 0.0001604511746389078, + "loss": 1.0776, + "step": 6871 + }, + { + "epoch": 0.66, + "grad_norm": 0.24622530091185105, + "learning_rate": 0.00016043857213596344, + "loss": 1.0529, + "step": 6872 + }, + { + "epoch": 0.66, + "grad_norm": 0.27492099325556013, + "learning_rate": 0.00016042596812050576, + "loss": 1.0378, + "step": 6873 + }, + { + "epoch": 0.66, + "grad_norm": 0.2918899294834224, + "learning_rate": 0.00016041336259285031, + "loss": 1.0596, + "step": 6874 + }, + { + "epoch": 0.66, + "grad_norm": 0.2623677600554193, + "learning_rate": 0.00016040075555331246, + "loss": 1.1632, + "step": 6875 + }, + { + "epoch": 0.66, + "grad_norm": 0.26087552461867863, + "learning_rate": 0.00016038814700220777, + "loss": 1.0231, + "step": 6876 + }, + { + "epoch": 0.66, + "grad_norm": 0.27308513799182704, + "learning_rate": 0.00016037553693985172, + "loss": 1.0093, + "step": 6877 + }, + { + "epoch": 0.66, + "grad_norm": 0.3112434726283997, + "learning_rate": 0.00016036292536655993, + "loss": 1.1635, + "step": 6878 + }, + { + "epoch": 0.66, + "grad_norm": 0.2884405944631099, + "learning_rate": 0.00016035031228264798, + "loss": 1.1504, + "step": 6879 + }, + { + "epoch": 0.66, + "grad_norm": 0.25052783602254847, + "learning_rate": 0.00016033769768843153, + "loss": 1.0535, + "step": 6880 + }, + { + "epoch": 0.66, + "grad_norm": 0.2952430681334669, + "learning_rate": 0.00016032508158422633, + "loss": 1.1563, + "step": 6881 + }, + { + "epoch": 0.66, + "grad_norm": 0.2775844498309543, + "learning_rate": 0.00016031246397034797, + "loss": 1.063, + "step": 6882 + }, + { + "epoch": 0.66, + "grad_norm": 0.2641737504846404, + "learning_rate": 0.00016029984484711233, + "loss": 0.9881, + "step": 6883 + }, + { + "epoch": 0.66, + "grad_norm": 0.25954131255382207, + "learning_rate": 0.0001602872242148352, + "loss": 0.9942, + "step": 6884 + }, + { + "epoch": 0.66, + "grad_norm": 0.235184004978054, + "learning_rate": 0.00016027460207383238, + "loss": 1.0592, + "step": 6885 + }, + { + "epoch": 0.66, + "grad_norm": 0.3208319103526141, + "learning_rate": 0.00016026197842441975, + "loss": 0.9071, + "step": 6886 + }, + { + "epoch": 0.66, + "grad_norm": 0.38310981570001407, + "learning_rate": 0.00016024935326691323, + "loss": 1.0084, + "step": 6887 + }, + { + "epoch": 0.66, + "grad_norm": 0.2736848488804497, + "learning_rate": 0.00016023672660162881, + "loss": 1.1093, + "step": 6888 + }, + { + "epoch": 0.66, + "grad_norm": 0.30255083039667124, + "learning_rate": 0.00016022409842888244, + "loss": 1.0453, + "step": 6889 + }, + { + "epoch": 0.66, + "grad_norm": 0.29616770728129344, + "learning_rate": 0.00016021146874899015, + "loss": 1.1753, + "step": 6890 + }, + { + "epoch": 0.66, + "grad_norm": 0.26891944823488517, + "learning_rate": 0.000160198837562268, + "loss": 0.998, + "step": 6891 + }, + { + "epoch": 0.66, + "grad_norm": 0.3069851887194386, + "learning_rate": 0.00016018620486903213, + "loss": 0.9517, + "step": 6892 + }, + { + "epoch": 0.66, + "grad_norm": 0.2878707069041881, + "learning_rate": 0.00016017357066959863, + "loss": 1.0137, + "step": 6893 + }, + { + "epoch": 0.66, + "grad_norm": 0.2804636505710241, + "learning_rate": 0.0001601609349642837, + "loss": 1.0739, + "step": 6894 + }, + { + "epoch": 0.66, + "grad_norm": 0.272015497269986, + "learning_rate": 0.00016014829775340362, + "loss": 1.1176, + "step": 6895 + }, + { + "epoch": 0.66, + "grad_norm": 0.24507472885852935, + "learning_rate": 0.00016013565903727454, + "loss": 1.1288, + "step": 6896 + }, + { + "epoch": 0.66, + "grad_norm": 0.30186275659636463, + "learning_rate": 0.00016012301881621283, + "loss": 1.0384, + "step": 6897 + }, + { + "epoch": 0.66, + "grad_norm": 0.3163308512937789, + "learning_rate": 0.00016011037709053478, + "loss": 1.0186, + "step": 6898 + }, + { + "epoch": 0.66, + "grad_norm": 0.30810804171123124, + "learning_rate": 0.00016009773386055676, + "loss": 1.1093, + "step": 6899 + }, + { + "epoch": 0.66, + "grad_norm": 0.2816814074951005, + "learning_rate": 0.00016008508912659518, + "loss": 1.0513, + "step": 6900 + }, + { + "epoch": 0.66, + "grad_norm": 0.30201609837419807, + "learning_rate": 0.00016007244288896645, + "loss": 0.9661, + "step": 6901 + }, + { + "epoch": 0.66, + "grad_norm": 0.28829302388205935, + "learning_rate": 0.00016005979514798713, + "loss": 1.0704, + "step": 6902 + }, + { + "epoch": 0.66, + "grad_norm": 0.26515326393964955, + "learning_rate": 0.00016004714590397366, + "loss": 1.1436, + "step": 6903 + }, + { + "epoch": 0.66, + "grad_norm": 0.2971122785850026, + "learning_rate": 0.00016003449515724263, + "loss": 1.1088, + "step": 6904 + }, + { + "epoch": 0.66, + "grad_norm": 0.3097599416745419, + "learning_rate": 0.00016002184290811065, + "loss": 1.0972, + "step": 6905 + }, + { + "epoch": 0.66, + "grad_norm": 0.2522652174175764, + "learning_rate": 0.00016000918915689432, + "loss": 1.0747, + "step": 6906 + }, + { + "epoch": 0.66, + "grad_norm": 0.3182119441827633, + "learning_rate": 0.0001599965339039103, + "loss": 1.027, + "step": 6907 + }, + { + "epoch": 0.66, + "grad_norm": 0.2783524829429487, + "learning_rate": 0.0001599838771494753, + "loss": 1.0525, + "step": 6908 + }, + { + "epoch": 0.66, + "grad_norm": 0.26729480846780146, + "learning_rate": 0.0001599712188939061, + "loss": 1.02, + "step": 6909 + }, + { + "epoch": 0.66, + "grad_norm": 0.27005772910227627, + "learning_rate": 0.00015995855913751946, + "loss": 1.1131, + "step": 6910 + }, + { + "epoch": 0.66, + "grad_norm": 0.23819686657027173, + "learning_rate": 0.00015994589788063222, + "loss": 1.0546, + "step": 6911 + }, + { + "epoch": 0.66, + "grad_norm": 0.2937203478865386, + "learning_rate": 0.00015993323512356118, + "loss": 1.1432, + "step": 6912 + }, + { + "epoch": 0.66, + "grad_norm": 0.2788834474966195, + "learning_rate": 0.00015992057086662323, + "loss": 1.2207, + "step": 6913 + }, + { + "epoch": 0.66, + "grad_norm": 0.26680578605709754, + "learning_rate": 0.0001599079051101354, + "loss": 1.0555, + "step": 6914 + }, + { + "epoch": 0.66, + "grad_norm": 0.2829489557203681, + "learning_rate": 0.00015989523785441456, + "loss": 0.9449, + "step": 6915 + }, + { + "epoch": 0.66, + "grad_norm": 0.29363875973797865, + "learning_rate": 0.00015988256909977777, + "loss": 0.9878, + "step": 6916 + }, + { + "epoch": 0.66, + "grad_norm": 0.2543061502185059, + "learning_rate": 0.00015986989884654202, + "loss": 1.1064, + "step": 6917 + }, + { + "epoch": 0.66, + "grad_norm": 0.26512820084084004, + "learning_rate": 0.00015985722709502444, + "loss": 1.036, + "step": 6918 + }, + { + "epoch": 0.66, + "grad_norm": 0.28652859367622907, + "learning_rate": 0.00015984455384554215, + "loss": 1.1668, + "step": 6919 + }, + { + "epoch": 0.66, + "grad_norm": 0.31723374235235535, + "learning_rate": 0.00015983187909841226, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 0.66, + "grad_norm": 0.29129953499102984, + "learning_rate": 0.00015981920285395202, + "loss": 1.058, + "step": 6921 + }, + { + "epoch": 0.66, + "grad_norm": 0.29993610209681537, + "learning_rate": 0.0001598065251124786, + "loss": 1.04, + "step": 6922 + }, + { + "epoch": 0.66, + "grad_norm": 0.2947422459230847, + "learning_rate": 0.00015979384587430935, + "loss": 1.0852, + "step": 6923 + }, + { + "epoch": 0.66, + "grad_norm": 0.2714231587486264, + "learning_rate": 0.00015978116513976152, + "loss": 0.9586, + "step": 6924 + }, + { + "epoch": 0.66, + "grad_norm": 0.31441338998821106, + "learning_rate": 0.00015976848290915244, + "loss": 1.0261, + "step": 6925 + }, + { + "epoch": 0.66, + "grad_norm": 0.3020176096145915, + "learning_rate": 0.0001597557991827995, + "loss": 1.0744, + "step": 6926 + }, + { + "epoch": 0.66, + "grad_norm": 0.2941393454482125, + "learning_rate": 0.00015974311396102015, + "loss": 1.0624, + "step": 6927 + }, + { + "epoch": 0.66, + "grad_norm": 0.29719508910446507, + "learning_rate": 0.00015973042724413183, + "loss": 1.18, + "step": 6928 + }, + { + "epoch": 0.66, + "grad_norm": 0.28110919244257476, + "learning_rate": 0.00015971773903245202, + "loss": 0.9661, + "step": 6929 + }, + { + "epoch": 0.66, + "grad_norm": 0.2803124705765725, + "learning_rate": 0.00015970504932629823, + "loss": 1.0396, + "step": 6930 + }, + { + "epoch": 0.66, + "grad_norm": 0.29208189180807126, + "learning_rate": 0.0001596923581259881, + "loss": 1.1863, + "step": 6931 + }, + { + "epoch": 0.66, + "grad_norm": 0.29689595292251414, + "learning_rate": 0.0001596796654318392, + "loss": 1.0811, + "step": 6932 + }, + { + "epoch": 0.66, + "grad_norm": 0.3077207175340659, + "learning_rate": 0.00015966697124416914, + "loss": 1.0538, + "step": 6933 + }, + { + "epoch": 0.66, + "grad_norm": 0.30686176921995806, + "learning_rate": 0.0001596542755632956, + "loss": 1.0504, + "step": 6934 + }, + { + "epoch": 0.66, + "grad_norm": 0.24712629310974943, + "learning_rate": 0.00015964157838953638, + "loss": 1.0084, + "step": 6935 + }, + { + "epoch": 0.66, + "grad_norm": 0.26002780535214853, + "learning_rate": 0.00015962887972320914, + "loss": 1.0972, + "step": 6936 + }, + { + "epoch": 0.66, + "grad_norm": 0.29963814827197655, + "learning_rate": 0.00015961617956463173, + "loss": 1.1617, + "step": 6937 + }, + { + "epoch": 0.66, + "grad_norm": 0.29022102343364803, + "learning_rate": 0.00015960347791412196, + "loss": 0.9665, + "step": 6938 + }, + { + "epoch": 0.66, + "grad_norm": 0.30836910286066316, + "learning_rate": 0.00015959077477199765, + "loss": 1.0885, + "step": 6939 + }, + { + "epoch": 0.66, + "grad_norm": 0.2836035210572582, + "learning_rate": 0.0001595780701385768, + "loss": 1.0539, + "step": 6940 + }, + { + "epoch": 0.66, + "grad_norm": 0.30435356184547685, + "learning_rate": 0.0001595653640141773, + "loss": 0.976, + "step": 6941 + }, + { + "epoch": 0.66, + "grad_norm": 0.26960506796384626, + "learning_rate": 0.00015955265639911711, + "loss": 1.1245, + "step": 6942 + }, + { + "epoch": 0.66, + "grad_norm": 0.36290217128036273, + "learning_rate": 0.00015953994729371427, + "loss": 1.0396, + "step": 6943 + }, + { + "epoch": 0.66, + "grad_norm": 0.32401610966187167, + "learning_rate": 0.00015952723669828683, + "loss": 1.0519, + "step": 6944 + }, + { + "epoch": 0.66, + "grad_norm": 0.27984623740696507, + "learning_rate": 0.00015951452461315292, + "loss": 1.0498, + "step": 6945 + }, + { + "epoch": 0.66, + "grad_norm": 0.24996113180352433, + "learning_rate": 0.00015950181103863056, + "loss": 1.0642, + "step": 6946 + }, + { + "epoch": 0.66, + "grad_norm": 0.2587685668722977, + "learning_rate": 0.000159489095975038, + "loss": 1.1062, + "step": 6947 + }, + { + "epoch": 0.66, + "grad_norm": 0.32763762154630816, + "learning_rate": 0.00015947637942269343, + "loss": 1.1287, + "step": 6948 + }, + { + "epoch": 0.66, + "grad_norm": 0.2585215704752277, + "learning_rate": 0.0001594636613819151, + "loss": 1.0892, + "step": 6949 + }, + { + "epoch": 0.66, + "grad_norm": 0.2885043831452831, + "learning_rate": 0.0001594509418530213, + "loss": 1.1353, + "step": 6950 + }, + { + "epoch": 0.67, + "grad_norm": 0.26515145631234566, + "learning_rate": 0.00015943822083633026, + "loss": 0.9613, + "step": 6951 + }, + { + "epoch": 0.67, + "grad_norm": 0.2832070363192226, + "learning_rate": 0.00015942549833216043, + "loss": 1.047, + "step": 6952 + }, + { + "epoch": 0.67, + "grad_norm": 0.2851606867131872, + "learning_rate": 0.00015941277434083014, + "loss": 1.1776, + "step": 6953 + }, + { + "epoch": 0.67, + "grad_norm": 0.26970800321487326, + "learning_rate": 0.00015940004886265781, + "loss": 1.0471, + "step": 6954 + }, + { + "epoch": 0.67, + "grad_norm": 0.30024373875561594, + "learning_rate": 0.00015938732189796196, + "loss": 1.1125, + "step": 6955 + }, + { + "epoch": 0.67, + "grad_norm": 0.29870880904188896, + "learning_rate": 0.00015937459344706105, + "loss": 1.106, + "step": 6956 + }, + { + "epoch": 0.67, + "grad_norm": 0.2990216369851441, + "learning_rate": 0.0001593618635102736, + "loss": 1.0811, + "step": 6957 + }, + { + "epoch": 0.67, + "grad_norm": 0.2848511215046961, + "learning_rate": 0.00015934913208791825, + "loss": 1.0252, + "step": 6958 + }, + { + "epoch": 0.67, + "grad_norm": 0.26962263498375855, + "learning_rate": 0.00015933639918031353, + "loss": 1.1282, + "step": 6959 + }, + { + "epoch": 0.67, + "grad_norm": 0.3226088976764872, + "learning_rate": 0.00015932366478777816, + "loss": 1.1156, + "step": 6960 + }, + { + "epoch": 0.67, + "grad_norm": 0.2775279024141733, + "learning_rate": 0.00015931092891063078, + "loss": 1.0649, + "step": 6961 + }, + { + "epoch": 0.67, + "grad_norm": 0.29817813856951386, + "learning_rate": 0.0001592981915491901, + "loss": 1.0693, + "step": 6962 + }, + { + "epoch": 0.67, + "grad_norm": 0.2685645767583313, + "learning_rate": 0.00015928545270377494, + "loss": 1.019, + "step": 6963 + }, + { + "epoch": 0.67, + "grad_norm": 0.2950719704962928, + "learning_rate": 0.00015927271237470408, + "loss": 1.0845, + "step": 6964 + }, + { + "epoch": 0.67, + "grad_norm": 0.2834261349957536, + "learning_rate": 0.0001592599705622963, + "loss": 1.153, + "step": 6965 + }, + { + "epoch": 0.67, + "grad_norm": 0.2703350081406132, + "learning_rate": 0.00015924722726687058, + "loss": 1.1079, + "step": 6966 + }, + { + "epoch": 0.67, + "grad_norm": 0.2573244560178056, + "learning_rate": 0.0001592344824887457, + "loss": 1.0111, + "step": 6967 + }, + { + "epoch": 0.67, + "grad_norm": 0.24815316601701018, + "learning_rate": 0.0001592217362282407, + "loss": 1.0481, + "step": 6968 + }, + { + "epoch": 0.67, + "grad_norm": 0.30891957392030694, + "learning_rate": 0.0001592089884856745, + "loss": 1.0926, + "step": 6969 + }, + { + "epoch": 0.67, + "grad_norm": 0.30606523617707265, + "learning_rate": 0.00015919623926136618, + "loss": 1.099, + "step": 6970 + }, + { + "epoch": 0.67, + "grad_norm": 0.28728857946398434, + "learning_rate": 0.00015918348855563477, + "loss": 1.1097, + "step": 6971 + }, + { + "epoch": 0.67, + "grad_norm": 0.26905395320578945, + "learning_rate": 0.00015917073636879936, + "loss": 1.0316, + "step": 6972 + }, + { + "epoch": 0.67, + "grad_norm": 0.2791301663055386, + "learning_rate": 0.00015915798270117905, + "loss": 1.078, + "step": 6973 + }, + { + "epoch": 0.67, + "grad_norm": 0.27771895204228286, + "learning_rate": 0.0001591452275530931, + "loss": 0.9038, + "step": 6974 + }, + { + "epoch": 0.67, + "grad_norm": 0.27933418779286445, + "learning_rate": 0.0001591324709248606, + "loss": 1.0519, + "step": 6975 + }, + { + "epoch": 0.67, + "grad_norm": 0.2816750044645727, + "learning_rate": 0.00015911971281680088, + "loss": 0.9544, + "step": 6976 + }, + { + "epoch": 0.67, + "grad_norm": 0.285922508579133, + "learning_rate": 0.0001591069532292332, + "loss": 1.0725, + "step": 6977 + }, + { + "epoch": 0.67, + "grad_norm": 0.33215001802712657, + "learning_rate": 0.00015909419216247688, + "loss": 1.0101, + "step": 6978 + }, + { + "epoch": 0.67, + "grad_norm": 0.23393732531173045, + "learning_rate": 0.00015908142961685125, + "loss": 1.0705, + "step": 6979 + }, + { + "epoch": 0.67, + "grad_norm": 0.257596551482316, + "learning_rate": 0.0001590686655926757, + "loss": 1.1013, + "step": 6980 + }, + { + "epoch": 0.67, + "grad_norm": 0.3135249829190419, + "learning_rate": 0.00015905590009026967, + "loss": 1.0304, + "step": 6981 + }, + { + "epoch": 0.67, + "grad_norm": 0.2856466968173445, + "learning_rate": 0.00015904313310995263, + "loss": 0.9811, + "step": 6982 + }, + { + "epoch": 0.67, + "grad_norm": 0.2829987935572402, + "learning_rate": 0.00015903036465204407, + "loss": 1.1061, + "step": 6983 + }, + { + "epoch": 0.67, + "grad_norm": 0.2780452591589329, + "learning_rate": 0.00015901759471686358, + "loss": 1.1667, + "step": 6984 + }, + { + "epoch": 0.67, + "grad_norm": 0.27814002322278586, + "learning_rate": 0.00015900482330473062, + "loss": 1.1414, + "step": 6985 + }, + { + "epoch": 0.67, + "grad_norm": 0.2860266253901477, + "learning_rate": 0.0001589920504159649, + "loss": 0.9647, + "step": 6986 + }, + { + "epoch": 0.67, + "grad_norm": 0.2760940301311592, + "learning_rate": 0.0001589792760508861, + "loss": 1.0185, + "step": 6987 + }, + { + "epoch": 0.67, + "grad_norm": 0.2706769903514577, + "learning_rate": 0.00015896650020981378, + "loss": 1.2201, + "step": 6988 + }, + { + "epoch": 0.67, + "grad_norm": 0.2622304954412599, + "learning_rate": 0.00015895372289306776, + "loss": 1.0844, + "step": 6989 + }, + { + "epoch": 0.67, + "grad_norm": 0.30180292321521496, + "learning_rate": 0.00015894094410096775, + "loss": 0.9252, + "step": 6990 + }, + { + "epoch": 0.67, + "grad_norm": 0.280964051345383, + "learning_rate": 0.0001589281638338336, + "loss": 1.0206, + "step": 6991 + }, + { + "epoch": 0.67, + "grad_norm": 0.2722800997770079, + "learning_rate": 0.0001589153820919851, + "loss": 0.9916, + "step": 6992 + }, + { + "epoch": 0.67, + "grad_norm": 0.27884975208948837, + "learning_rate": 0.00015890259887574215, + "loss": 0.9978, + "step": 6993 + }, + { + "epoch": 0.67, + "grad_norm": 0.24598496860742308, + "learning_rate": 0.00015888981418542462, + "loss": 1.0904, + "step": 6994 + }, + { + "epoch": 0.67, + "grad_norm": 0.2961053801063102, + "learning_rate": 0.00015887702802135252, + "loss": 0.9764, + "step": 6995 + }, + { + "epoch": 0.67, + "grad_norm": 0.26389220997829704, + "learning_rate": 0.00015886424038384577, + "loss": 1.0735, + "step": 6996 + }, + { + "epoch": 0.67, + "grad_norm": 0.3058819655409103, + "learning_rate": 0.00015885145127322438, + "loss": 1.1001, + "step": 6997 + }, + { + "epoch": 0.67, + "grad_norm": 0.279716453620408, + "learning_rate": 0.00015883866068980846, + "loss": 1.1435, + "step": 6998 + }, + { + "epoch": 0.67, + "grad_norm": 0.2938126610887859, + "learning_rate": 0.00015882586863391807, + "loss": 1.0411, + "step": 6999 + }, + { + "epoch": 0.67, + "grad_norm": 0.24676335418681838, + "learning_rate": 0.00015881307510587337, + "loss": 0.9959, + "step": 7000 + }, + { + "epoch": 0.67, + "grad_norm": 0.2653940896900372, + "learning_rate": 0.0001588002801059945, + "loss": 1.0835, + "step": 7001 + }, + { + "epoch": 0.67, + "grad_norm": 0.3109595788932631, + "learning_rate": 0.00015878748363460163, + "loss": 1.0282, + "step": 7002 + }, + { + "epoch": 0.67, + "grad_norm": 0.30081836243424015, + "learning_rate": 0.00015877468569201506, + "loss": 0.9415, + "step": 7003 + }, + { + "epoch": 0.67, + "grad_norm": 0.31448590635210893, + "learning_rate": 0.00015876188627855507, + "loss": 1.074, + "step": 7004 + }, + { + "epoch": 0.67, + "grad_norm": 0.3017891643305516, + "learning_rate": 0.00015874908539454188, + "loss": 1.0724, + "step": 7005 + }, + { + "epoch": 0.67, + "grad_norm": 0.3277818013492165, + "learning_rate": 0.00015873628304029596, + "loss": 1.1018, + "step": 7006 + }, + { + "epoch": 0.67, + "grad_norm": 0.2802892779964618, + "learning_rate": 0.00015872347921613763, + "loss": 1.0991, + "step": 7007 + }, + { + "epoch": 0.67, + "grad_norm": 0.28604449999798276, + "learning_rate": 0.0001587106739223873, + "loss": 0.9591, + "step": 7008 + }, + { + "epoch": 0.67, + "grad_norm": 0.2974499942407421, + "learning_rate": 0.0001586978671593655, + "loss": 1.0818, + "step": 7009 + }, + { + "epoch": 0.67, + "grad_norm": 0.26346530008044805, + "learning_rate": 0.00015868505892739266, + "loss": 1.0774, + "step": 7010 + }, + { + "epoch": 0.67, + "grad_norm": 0.30282010918915797, + "learning_rate": 0.00015867224922678933, + "loss": 1.0961, + "step": 7011 + }, + { + "epoch": 0.67, + "grad_norm": 0.31016719288564637, + "learning_rate": 0.0001586594380578761, + "loss": 1.1236, + "step": 7012 + }, + { + "epoch": 0.67, + "grad_norm": 0.28052033504795476, + "learning_rate": 0.00015864662542097358, + "loss": 1.0947, + "step": 7013 + }, + { + "epoch": 0.67, + "grad_norm": 0.23638403426307897, + "learning_rate": 0.00015863381131640236, + "loss": 1.0231, + "step": 7014 + }, + { + "epoch": 0.67, + "grad_norm": 0.291740553126471, + "learning_rate": 0.00015862099574448317, + "loss": 0.9802, + "step": 7015 + }, + { + "epoch": 0.67, + "grad_norm": 0.3029857623248768, + "learning_rate": 0.00015860817870553677, + "loss": 1.12, + "step": 7016 + }, + { + "epoch": 0.67, + "grad_norm": 0.2630855945498341, + "learning_rate": 0.00015859536019988384, + "loss": 0.9507, + "step": 7017 + }, + { + "epoch": 0.67, + "grad_norm": 0.2758349794123403, + "learning_rate": 0.00015858254022784515, + "loss": 1.1074, + "step": 7018 + }, + { + "epoch": 0.67, + "grad_norm": 0.27661938873325614, + "learning_rate": 0.00015856971878974163, + "loss": 1.0374, + "step": 7019 + }, + { + "epoch": 0.67, + "grad_norm": 0.2714412044951736, + "learning_rate": 0.00015855689588589405, + "loss": 1.052, + "step": 7020 + }, + { + "epoch": 0.67, + "grad_norm": 0.2677737703288316, + "learning_rate": 0.00015854407151662337, + "loss": 1.0147, + "step": 7021 + }, + { + "epoch": 0.67, + "grad_norm": 0.2927994644038343, + "learning_rate": 0.0001585312456822505, + "loss": 1.0889, + "step": 7022 + }, + { + "epoch": 0.67, + "grad_norm": 0.2790949416690381, + "learning_rate": 0.0001585184183830964, + "loss": 1.1484, + "step": 7023 + }, + { + "epoch": 0.67, + "grad_norm": 0.28034621779286634, + "learning_rate": 0.00015850558961948217, + "loss": 1.0267, + "step": 7024 + }, + { + "epoch": 0.67, + "grad_norm": 0.2745504280573277, + "learning_rate": 0.00015849275939172874, + "loss": 1.1027, + "step": 7025 + }, + { + "epoch": 0.67, + "grad_norm": 0.26978925236279233, + "learning_rate": 0.00015847992770015725, + "loss": 0.9182, + "step": 7026 + }, + { + "epoch": 0.67, + "grad_norm": 0.29839939566941587, + "learning_rate": 0.00015846709454508883, + "loss": 1.0897, + "step": 7027 + }, + { + "epoch": 0.67, + "grad_norm": 0.26196842338250126, + "learning_rate": 0.0001584542599268446, + "loss": 1.0644, + "step": 7028 + }, + { + "epoch": 0.67, + "grad_norm": 0.28373187399028676, + "learning_rate": 0.0001584414238457458, + "loss": 1.0633, + "step": 7029 + }, + { + "epoch": 0.67, + "grad_norm": 0.2537841362135921, + "learning_rate": 0.0001584285863021136, + "loss": 1.0892, + "step": 7030 + }, + { + "epoch": 0.67, + "grad_norm": 0.27473559874173337, + "learning_rate": 0.00015841574729626935, + "loss": 0.9814, + "step": 7031 + }, + { + "epoch": 0.67, + "grad_norm": 0.30313556481972587, + "learning_rate": 0.00015840290682853428, + "loss": 1.016, + "step": 7032 + }, + { + "epoch": 0.67, + "grad_norm": 0.28512884288704177, + "learning_rate": 0.0001583900648992298, + "loss": 0.9909, + "step": 7033 + }, + { + "epoch": 0.67, + "grad_norm": 0.2551073361295876, + "learning_rate": 0.00015837722150867722, + "loss": 0.9779, + "step": 7034 + }, + { + "epoch": 0.67, + "grad_norm": 0.2939908816315301, + "learning_rate": 0.00015836437665719798, + "loss": 1.1285, + "step": 7035 + }, + { + "epoch": 0.67, + "grad_norm": 0.2725649426346149, + "learning_rate": 0.00015835153034511357, + "loss": 1.1472, + "step": 7036 + }, + { + "epoch": 0.67, + "grad_norm": 0.2718133362148875, + "learning_rate": 0.0001583386825727454, + "loss": 1.0443, + "step": 7037 + }, + { + "epoch": 0.67, + "grad_norm": 0.27419111607826285, + "learning_rate": 0.00015832583334041505, + "loss": 1.0012, + "step": 7038 + }, + { + "epoch": 0.67, + "grad_norm": 0.2849089224699415, + "learning_rate": 0.00015831298264844406, + "loss": 1.0507, + "step": 7039 + }, + { + "epoch": 0.67, + "grad_norm": 0.3068910572836713, + "learning_rate": 0.00015830013049715404, + "loss": 1.0838, + "step": 7040 + }, + { + "epoch": 0.67, + "grad_norm": 0.2779576724478555, + "learning_rate": 0.0001582872768868666, + "loss": 1.0766, + "step": 7041 + }, + { + "epoch": 0.67, + "grad_norm": 0.3302291612230447, + "learning_rate": 0.00015827442181790344, + "loss": 0.9843, + "step": 7042 + }, + { + "epoch": 0.67, + "grad_norm": 0.25384666865589006, + "learning_rate": 0.00015826156529058624, + "loss": 1.0086, + "step": 7043 + }, + { + "epoch": 0.67, + "grad_norm": 0.27280371588140856, + "learning_rate": 0.00015824870730523675, + "loss": 1.0686, + "step": 7044 + }, + { + "epoch": 0.67, + "grad_norm": 0.3030164504363727, + "learning_rate": 0.00015823584786217677, + "loss": 1.0226, + "step": 7045 + }, + { + "epoch": 0.67, + "grad_norm": 0.26909739332574156, + "learning_rate": 0.00015822298696172805, + "loss": 0.99, + "step": 7046 + }, + { + "epoch": 0.67, + "grad_norm": 0.2877874749400455, + "learning_rate": 0.00015821012460421255, + "loss": 0.9923, + "step": 7047 + }, + { + "epoch": 0.67, + "grad_norm": 0.2877538039040546, + "learning_rate": 0.00015819726078995208, + "loss": 1.0115, + "step": 7048 + }, + { + "epoch": 0.67, + "grad_norm": 0.30848201813052933, + "learning_rate": 0.00015818439551926856, + "loss": 1.0591, + "step": 7049 + }, + { + "epoch": 0.67, + "grad_norm": 0.29140546183062116, + "learning_rate": 0.000158171528792484, + "loss": 1.0429, + "step": 7050 + }, + { + "epoch": 0.67, + "grad_norm": 0.26994340789160964, + "learning_rate": 0.00015815866060992035, + "loss": 1.011, + "step": 7051 + }, + { + "epoch": 0.67, + "grad_norm": 0.2792969580723623, + "learning_rate": 0.00015814579097189966, + "loss": 1.0891, + "step": 7052 + }, + { + "epoch": 0.67, + "grad_norm": 0.28250949172509243, + "learning_rate": 0.00015813291987874407, + "loss": 1.1065, + "step": 7053 + }, + { + "epoch": 0.67, + "grad_norm": 0.2978217599721609, + "learning_rate": 0.00015812004733077554, + "loss": 0.9388, + "step": 7054 + }, + { + "epoch": 0.67, + "grad_norm": 0.2908978622536936, + "learning_rate": 0.00015810717332831635, + "loss": 1.0988, + "step": 7055 + }, + { + "epoch": 0.68, + "grad_norm": 0.2799056103902689, + "learning_rate": 0.0001580942978716886, + "loss": 1.1076, + "step": 7056 + }, + { + "epoch": 0.68, + "grad_norm": 0.30269500535419785, + "learning_rate": 0.00015808142096121456, + "loss": 1.1737, + "step": 7057 + }, + { + "epoch": 0.68, + "grad_norm": 0.28238103258320846, + "learning_rate": 0.00015806854259721646, + "loss": 1.0724, + "step": 7058 + }, + { + "epoch": 0.68, + "grad_norm": 0.2985043169531798, + "learning_rate": 0.00015805566278001657, + "loss": 0.9103, + "step": 7059 + }, + { + "epoch": 0.68, + "grad_norm": 0.293719688850402, + "learning_rate": 0.00015804278150993722, + "loss": 1.0877, + "step": 7060 + }, + { + "epoch": 0.68, + "grad_norm": 0.27699549920215416, + "learning_rate": 0.00015802989878730084, + "loss": 1.1226, + "step": 7061 + }, + { + "epoch": 0.68, + "grad_norm": 0.26487298015420774, + "learning_rate": 0.00015801701461242974, + "loss": 1.1144, + "step": 7062 + }, + { + "epoch": 0.68, + "grad_norm": 0.26324552701590875, + "learning_rate": 0.00015800412898564636, + "loss": 1.173, + "step": 7063 + }, + { + "epoch": 0.68, + "grad_norm": 0.27384596081191975, + "learning_rate": 0.00015799124190727322, + "loss": 0.9868, + "step": 7064 + }, + { + "epoch": 0.68, + "grad_norm": 0.25963681695290847, + "learning_rate": 0.00015797835337763282, + "loss": 0.9288, + "step": 7065 + }, + { + "epoch": 0.68, + "grad_norm": 0.29013003407961624, + "learning_rate": 0.00015796546339704766, + "loss": 1.0005, + "step": 7066 + }, + { + "epoch": 0.68, + "grad_norm": 0.3020156469079804, + "learning_rate": 0.00015795257196584038, + "loss": 0.9843, + "step": 7067 + }, + { + "epoch": 0.68, + "grad_norm": 0.2895277107493972, + "learning_rate": 0.00015793967908433353, + "loss": 1.105, + "step": 7068 + }, + { + "epoch": 0.68, + "grad_norm": 0.2755240824809856, + "learning_rate": 0.0001579267847528498, + "loss": 1.082, + "step": 7069 + }, + { + "epoch": 0.68, + "grad_norm": 0.30036604934270644, + "learning_rate": 0.00015791388897171192, + "loss": 1.2018, + "step": 7070 + }, + { + "epoch": 0.68, + "grad_norm": 0.30614155747981436, + "learning_rate": 0.00015790099174124253, + "loss": 1.0759, + "step": 7071 + }, + { + "epoch": 0.68, + "grad_norm": 0.2941340821458282, + "learning_rate": 0.00015788809306176447, + "loss": 1.0597, + "step": 7072 + }, + { + "epoch": 0.68, + "grad_norm": 0.30856171230251395, + "learning_rate": 0.00015787519293360044, + "loss": 1.1318, + "step": 7073 + }, + { + "epoch": 0.68, + "grad_norm": 0.29580057657767084, + "learning_rate": 0.00015786229135707338, + "loss": 1.0522, + "step": 7074 + }, + { + "epoch": 0.68, + "grad_norm": 0.274616244018645, + "learning_rate": 0.0001578493883325061, + "loss": 1.0299, + "step": 7075 + }, + { + "epoch": 0.68, + "grad_norm": 0.3114772800268081, + "learning_rate": 0.00015783648386022151, + "loss": 1.161, + "step": 7076 + }, + { + "epoch": 0.68, + "grad_norm": 0.2802238017688198, + "learning_rate": 0.0001578235779405426, + "loss": 1.0197, + "step": 7077 + }, + { + "epoch": 0.68, + "grad_norm": 0.2944810135727739, + "learning_rate": 0.00015781067057379228, + "loss": 1.1013, + "step": 7078 + }, + { + "epoch": 0.68, + "grad_norm": 0.24998357909349614, + "learning_rate": 0.00015779776176029356, + "loss": 1.0692, + "step": 7079 + }, + { + "epoch": 0.68, + "grad_norm": 0.2945811953423457, + "learning_rate": 0.0001577848515003696, + "loss": 1.068, + "step": 7080 + }, + { + "epoch": 0.68, + "grad_norm": 0.25806411996635104, + "learning_rate": 0.00015777193979434338, + "loss": 1.0476, + "step": 7081 + }, + { + "epoch": 0.68, + "grad_norm": 0.2598704969021484, + "learning_rate": 0.00015775902664253808, + "loss": 1.0567, + "step": 7082 + }, + { + "epoch": 0.68, + "grad_norm": 0.28645413042100387, + "learning_rate": 0.00015774611204527681, + "loss": 1.0646, + "step": 7083 + }, + { + "epoch": 0.68, + "grad_norm": 0.25410792181961595, + "learning_rate": 0.0001577331960028828, + "loss": 0.9907, + "step": 7084 + }, + { + "epoch": 0.68, + "grad_norm": 0.26092421697879675, + "learning_rate": 0.00015772027851567927, + "loss": 0.9608, + "step": 7085 + }, + { + "epoch": 0.68, + "grad_norm": 0.28943346794498837, + "learning_rate": 0.00015770735958398952, + "loss": 1.0073, + "step": 7086 + }, + { + "epoch": 0.68, + "grad_norm": 0.26566974144868033, + "learning_rate": 0.0001576944392081368, + "loss": 1.109, + "step": 7087 + }, + { + "epoch": 0.68, + "grad_norm": 0.28616557026410305, + "learning_rate": 0.0001576815173884445, + "loss": 1.1788, + "step": 7088 + }, + { + "epoch": 0.68, + "grad_norm": 0.3140389757089799, + "learning_rate": 0.00015766859412523596, + "loss": 1.1028, + "step": 7089 + }, + { + "epoch": 0.68, + "grad_norm": 0.2903534504256985, + "learning_rate": 0.0001576556694188346, + "loss": 1.0848, + "step": 7090 + }, + { + "epoch": 0.68, + "grad_norm": 0.2739660674757886, + "learning_rate": 0.00015764274326956392, + "loss": 1.088, + "step": 7091 + }, + { + "epoch": 0.68, + "grad_norm": 0.26183889146766914, + "learning_rate": 0.00015762981567774733, + "loss": 1.045, + "step": 7092 + }, + { + "epoch": 0.68, + "grad_norm": 0.2773165562228698, + "learning_rate": 0.0001576168866437084, + "loss": 1.0625, + "step": 7093 + }, + { + "epoch": 0.68, + "grad_norm": 0.28250207452060827, + "learning_rate": 0.00015760395616777064, + "loss": 1.0738, + "step": 7094 + }, + { + "epoch": 0.68, + "grad_norm": 0.2664810800693953, + "learning_rate": 0.0001575910242502577, + "loss": 1.1199, + "step": 7095 + }, + { + "epoch": 0.68, + "grad_norm": 0.29450171685022813, + "learning_rate": 0.00015757809089149319, + "loss": 1.0634, + "step": 7096 + }, + { + "epoch": 0.68, + "grad_norm": 0.28549635512987454, + "learning_rate": 0.00015756515609180073, + "loss": 1.0337, + "step": 7097 + }, + { + "epoch": 0.68, + "grad_norm": 0.28429692726911365, + "learning_rate": 0.00015755221985150412, + "loss": 1.1475, + "step": 7098 + }, + { + "epoch": 0.68, + "grad_norm": 0.30253389968364547, + "learning_rate": 0.000157539282170927, + "loss": 1.047, + "step": 7099 + }, + { + "epoch": 0.68, + "grad_norm": 0.3098958322121182, + "learning_rate": 0.00015752634305039317, + "loss": 0.965, + "step": 7100 + }, + { + "epoch": 0.68, + "grad_norm": 0.25143769789194087, + "learning_rate": 0.0001575134024902265, + "loss": 0.9581, + "step": 7101 + }, + { + "epoch": 0.68, + "grad_norm": 0.3016367636629631, + "learning_rate": 0.00015750046049075076, + "loss": 1.0633, + "step": 7102 + }, + { + "epoch": 0.68, + "grad_norm": 0.3271227303031079, + "learning_rate": 0.00015748751705228984, + "loss": 0.9946, + "step": 7103 + }, + { + "epoch": 0.68, + "grad_norm": 0.25230564082743323, + "learning_rate": 0.00015747457217516768, + "loss": 0.998, + "step": 7104 + }, + { + "epoch": 0.68, + "grad_norm": 0.29985887178796994, + "learning_rate": 0.00015746162585970826, + "loss": 1.0747, + "step": 7105 + }, + { + "epoch": 0.68, + "grad_norm": 0.3268684025188319, + "learning_rate": 0.00015744867810623553, + "loss": 1.2323, + "step": 7106 + }, + { + "epoch": 0.68, + "grad_norm": 0.24907462724647375, + "learning_rate": 0.0001574357289150735, + "loss": 1.0212, + "step": 7107 + }, + { + "epoch": 0.68, + "grad_norm": 0.2670820715697481, + "learning_rate": 0.0001574227782865463, + "loss": 0.9549, + "step": 7108 + }, + { + "epoch": 0.68, + "grad_norm": 0.2872956326475909, + "learning_rate": 0.00015740982622097793, + "loss": 1.0685, + "step": 7109 + }, + { + "epoch": 0.68, + "grad_norm": 0.3131384318834513, + "learning_rate": 0.00015739687271869258, + "loss": 1.0596, + "step": 7110 + }, + { + "epoch": 0.68, + "grad_norm": 0.2658074059936171, + "learning_rate": 0.00015738391778001446, + "loss": 1.0484, + "step": 7111 + }, + { + "epoch": 0.68, + "grad_norm": 0.2628890032193891, + "learning_rate": 0.00015737096140526773, + "loss": 0.9861, + "step": 7112 + }, + { + "epoch": 0.68, + "grad_norm": 0.3111781620355991, + "learning_rate": 0.0001573580035947766, + "loss": 1.0708, + "step": 7113 + }, + { + "epoch": 0.68, + "grad_norm": 0.28054369956222347, + "learning_rate": 0.0001573450443488654, + "loss": 1.0379, + "step": 7114 + }, + { + "epoch": 0.68, + "grad_norm": 0.2972730553838039, + "learning_rate": 0.00015733208366785847, + "loss": 1.1547, + "step": 7115 + }, + { + "epoch": 0.68, + "grad_norm": 0.2731099842689578, + "learning_rate": 0.00015731912155208004, + "loss": 1.0216, + "step": 7116 + }, + { + "epoch": 0.68, + "grad_norm": 0.25449973903583906, + "learning_rate": 0.0001573061580018546, + "loss": 1.0567, + "step": 7117 + }, + { + "epoch": 0.68, + "grad_norm": 0.3130389985554061, + "learning_rate": 0.00015729319301750655, + "loss": 1.0886, + "step": 7118 + }, + { + "epoch": 0.68, + "grad_norm": 0.2764607660175635, + "learning_rate": 0.00015728022659936033, + "loss": 0.9724, + "step": 7119 + }, + { + "epoch": 0.68, + "grad_norm": 0.24734793912425435, + "learning_rate": 0.0001572672587477404, + "loss": 1.0651, + "step": 7120 + }, + { + "epoch": 0.68, + "grad_norm": 0.28747093554729464, + "learning_rate": 0.00015725428946297137, + "loss": 1.1001, + "step": 7121 + }, + { + "epoch": 0.68, + "grad_norm": 0.270729761427449, + "learning_rate": 0.0001572413187453778, + "loss": 1.0384, + "step": 7122 + }, + { + "epoch": 0.68, + "grad_norm": 0.2928842971220595, + "learning_rate": 0.00015722834659528422, + "loss": 1.1656, + "step": 7123 + }, + { + "epoch": 0.68, + "grad_norm": 0.2793916142108935, + "learning_rate": 0.00015721537301301527, + "loss": 0.9688, + "step": 7124 + }, + { + "epoch": 0.68, + "grad_norm": 0.29909818823600665, + "learning_rate": 0.0001572023979988957, + "loss": 0.9318, + "step": 7125 + }, + { + "epoch": 0.68, + "grad_norm": 0.2696167713839793, + "learning_rate": 0.00015718942155325011, + "loss": 1.1889, + "step": 7126 + }, + { + "epoch": 0.68, + "grad_norm": 0.29698260893246986, + "learning_rate": 0.00015717644367640334, + "loss": 1.1648, + "step": 7127 + }, + { + "epoch": 0.68, + "grad_norm": 0.27565722448673946, + "learning_rate": 0.00015716346436868016, + "loss": 1.0083, + "step": 7128 + }, + { + "epoch": 0.68, + "grad_norm": 0.27200588461690844, + "learning_rate": 0.00015715048363040533, + "loss": 1.0808, + "step": 7129 + }, + { + "epoch": 0.68, + "grad_norm": 0.274483688022852, + "learning_rate": 0.00015713750146190372, + "loss": 1.0244, + "step": 7130 + }, + { + "epoch": 0.68, + "grad_norm": 0.2603147750459217, + "learning_rate": 0.00015712451786350023, + "loss": 1.0541, + "step": 7131 + }, + { + "epoch": 0.68, + "grad_norm": 0.2611050046236202, + "learning_rate": 0.0001571115328355198, + "loss": 0.9513, + "step": 7132 + }, + { + "epoch": 0.68, + "grad_norm": 0.2611354213480838, + "learning_rate": 0.00015709854637828733, + "loss": 1.0073, + "step": 7133 + }, + { + "epoch": 0.68, + "grad_norm": 0.29780693358025156, + "learning_rate": 0.0001570855584921279, + "loss": 1.0521, + "step": 7134 + }, + { + "epoch": 0.68, + "grad_norm": 0.298538656430755, + "learning_rate": 0.00015707256917736647, + "loss": 1.2377, + "step": 7135 + }, + { + "epoch": 0.68, + "grad_norm": 0.2748841653644941, + "learning_rate": 0.0001570595784343281, + "loss": 0.9403, + "step": 7136 + }, + { + "epoch": 0.68, + "grad_norm": 0.2746937829833261, + "learning_rate": 0.00015704658626333794, + "loss": 1.1224, + "step": 7137 + }, + { + "epoch": 0.68, + "grad_norm": 0.2842293587504884, + "learning_rate": 0.00015703359266472112, + "loss": 1.1599, + "step": 7138 + }, + { + "epoch": 0.68, + "grad_norm": 0.2670841998105975, + "learning_rate": 0.0001570205976388028, + "loss": 1.138, + "step": 7139 + }, + { + "epoch": 0.68, + "grad_norm": 0.27756874671553816, + "learning_rate": 0.00015700760118590815, + "loss": 0.9721, + "step": 7140 + }, + { + "epoch": 0.68, + "grad_norm": 0.28756494584263737, + "learning_rate": 0.00015699460330636248, + "loss": 1.1181, + "step": 7141 + }, + { + "epoch": 0.68, + "grad_norm": 0.32898360339581917, + "learning_rate": 0.00015698160400049105, + "loss": 1.219, + "step": 7142 + }, + { + "epoch": 0.68, + "grad_norm": 0.2803241831359587, + "learning_rate": 0.00015696860326861917, + "loss": 1.1124, + "step": 7143 + }, + { + "epoch": 0.68, + "grad_norm": 0.27535423833378253, + "learning_rate": 0.00015695560111107218, + "loss": 1.1046, + "step": 7144 + }, + { + "epoch": 0.68, + "grad_norm": 0.2606122473280347, + "learning_rate": 0.0001569425975281755, + "loss": 1.158, + "step": 7145 + }, + { + "epoch": 0.68, + "grad_norm": 0.26610041445452237, + "learning_rate": 0.00015692959252025447, + "loss": 1.1423, + "step": 7146 + }, + { + "epoch": 0.68, + "grad_norm": 0.2953405796549938, + "learning_rate": 0.00015691658608763467, + "loss": 1.1065, + "step": 7147 + }, + { + "epoch": 0.68, + "grad_norm": 0.2809208524967904, + "learning_rate": 0.00015690357823064147, + "loss": 1.0091, + "step": 7148 + }, + { + "epoch": 0.68, + "grad_norm": 0.298379105314993, + "learning_rate": 0.00015689056894960054, + "loss": 0.9916, + "step": 7149 + }, + { + "epoch": 0.68, + "grad_norm": 0.29683307271349985, + "learning_rate": 0.00015687755824483733, + "loss": 1.1125, + "step": 7150 + }, + { + "epoch": 0.68, + "grad_norm": 0.29899393435333316, + "learning_rate": 0.00015686454611667745, + "loss": 1.0796, + "step": 7151 + }, + { + "epoch": 0.68, + "grad_norm": 0.31478792374668824, + "learning_rate": 0.00015685153256544658, + "loss": 1.0593, + "step": 7152 + }, + { + "epoch": 0.68, + "grad_norm": 0.2930746130040074, + "learning_rate": 0.0001568385175914704, + "loss": 1.0662, + "step": 7153 + }, + { + "epoch": 0.68, + "grad_norm": 0.2882046106628799, + "learning_rate": 0.00015682550119507457, + "loss": 1.0464, + "step": 7154 + }, + { + "epoch": 0.68, + "grad_norm": 0.24774454066142032, + "learning_rate": 0.0001568124833765849, + "loss": 1.0727, + "step": 7155 + }, + { + "epoch": 0.68, + "grad_norm": 0.28400220165986323, + "learning_rate": 0.0001567994641363271, + "loss": 1.2001, + "step": 7156 + }, + { + "epoch": 0.68, + "grad_norm": 0.2847089193485156, + "learning_rate": 0.000156786443474627, + "loss": 1.0203, + "step": 7157 + }, + { + "epoch": 0.68, + "grad_norm": 0.25521289971018285, + "learning_rate": 0.0001567734213918105, + "loss": 1.0875, + "step": 7158 + }, + { + "epoch": 0.68, + "grad_norm": 0.26777210499225285, + "learning_rate": 0.0001567603978882034, + "loss": 1.0413, + "step": 7159 + }, + { + "epoch": 0.69, + "grad_norm": 0.29958229941874526, + "learning_rate": 0.00015674737296413171, + "loss": 1.1779, + "step": 7160 + }, + { + "epoch": 0.69, + "grad_norm": 0.2973923709791303, + "learning_rate": 0.00015673434661992133, + "loss": 1.0605, + "step": 7161 + }, + { + "epoch": 0.69, + "grad_norm": 0.29890629646161765, + "learning_rate": 0.00015672131885589827, + "loss": 1.1378, + "step": 7162 + }, + { + "epoch": 0.69, + "grad_norm": 0.27465315968911586, + "learning_rate": 0.00015670828967238857, + "loss": 1.0176, + "step": 7163 + }, + { + "epoch": 0.69, + "grad_norm": 0.27330309587431983, + "learning_rate": 0.00015669525906971825, + "loss": 1.099, + "step": 7164 + }, + { + "epoch": 0.69, + "grad_norm": 0.2830955541275837, + "learning_rate": 0.00015668222704821346, + "loss": 1.0174, + "step": 7165 + }, + { + "epoch": 0.69, + "grad_norm": 0.2918518106052513, + "learning_rate": 0.00015666919360820034, + "loss": 1.0828, + "step": 7166 + }, + { + "epoch": 0.69, + "grad_norm": 0.2938118596159364, + "learning_rate": 0.000156656158750005, + "loss": 1.1368, + "step": 7167 + }, + { + "epoch": 0.69, + "grad_norm": 0.33564117245367897, + "learning_rate": 0.0001566431224739537, + "loss": 1.1594, + "step": 7168 + }, + { + "epoch": 0.69, + "grad_norm": 0.2791063027357139, + "learning_rate": 0.00015663008478037263, + "loss": 1.0643, + "step": 7169 + }, + { + "epoch": 0.69, + "grad_norm": 0.2836589857893266, + "learning_rate": 0.00015661704566958816, + "loss": 1.0865, + "step": 7170 + }, + { + "epoch": 0.69, + "grad_norm": 0.3268578411481459, + "learning_rate": 0.00015660400514192648, + "loss": 1.1146, + "step": 7171 + }, + { + "epoch": 0.69, + "grad_norm": 0.26432781847496883, + "learning_rate": 0.00015659096319771401, + "loss": 1.2087, + "step": 7172 + }, + { + "epoch": 0.69, + "grad_norm": 0.2947543226709198, + "learning_rate": 0.00015657791983727715, + "loss": 1.0841, + "step": 7173 + }, + { + "epoch": 0.69, + "grad_norm": 0.3064445919955032, + "learning_rate": 0.00015656487506094226, + "loss": 1.1338, + "step": 7174 + }, + { + "epoch": 0.69, + "grad_norm": 0.2768287567420236, + "learning_rate": 0.00015655182886903582, + "loss": 1.1217, + "step": 7175 + }, + { + "epoch": 0.69, + "grad_norm": 0.3115480635288813, + "learning_rate": 0.00015653878126188433, + "loss": 1.1119, + "step": 7176 + }, + { + "epoch": 0.69, + "grad_norm": 0.26658700110300837, + "learning_rate": 0.00015652573223981432, + "loss": 1.1532, + "step": 7177 + }, + { + "epoch": 0.69, + "grad_norm": 0.30729315346172287, + "learning_rate": 0.0001565126818031523, + "loss": 1.0824, + "step": 7178 + }, + { + "epoch": 0.69, + "grad_norm": 0.27545018818322364, + "learning_rate": 0.00015649962995222493, + "loss": 1.0008, + "step": 7179 + }, + { + "epoch": 0.69, + "grad_norm": 0.2840294196959938, + "learning_rate": 0.0001564865766873588, + "loss": 0.9888, + "step": 7180 + }, + { + "epoch": 0.69, + "grad_norm": 0.2704386331778757, + "learning_rate": 0.00015647352200888056, + "loss": 1.0845, + "step": 7181 + }, + { + "epoch": 0.69, + "grad_norm": 0.309378454139959, + "learning_rate": 0.00015646046591711698, + "loss": 1.1094, + "step": 7182 + }, + { + "epoch": 0.69, + "grad_norm": 0.33236253311844427, + "learning_rate": 0.00015644740841239477, + "loss": 1.0607, + "step": 7183 + }, + { + "epoch": 0.69, + "grad_norm": 0.2654938165964374, + "learning_rate": 0.00015643434949504066, + "loss": 1.0977, + "step": 7184 + }, + { + "epoch": 0.69, + "grad_norm": 0.25891560294027866, + "learning_rate": 0.00015642128916538148, + "loss": 1.0862, + "step": 7185 + }, + { + "epoch": 0.69, + "grad_norm": 0.30191735715591245, + "learning_rate": 0.00015640822742374411, + "loss": 1.0986, + "step": 7186 + }, + { + "epoch": 0.69, + "grad_norm": 0.3016522569398602, + "learning_rate": 0.00015639516427045538, + "loss": 1.1071, + "step": 7187 + }, + { + "epoch": 0.69, + "grad_norm": 0.3747262023726705, + "learning_rate": 0.00015638209970584218, + "loss": 0.9347, + "step": 7188 + }, + { + "epoch": 0.69, + "grad_norm": 0.26969894694899865, + "learning_rate": 0.0001563690337302316, + "loss": 0.9472, + "step": 7189 + }, + { + "epoch": 0.69, + "grad_norm": 0.27707566766666775, + "learning_rate": 0.00015635596634395045, + "loss": 1.0062, + "step": 7190 + }, + { + "epoch": 0.69, + "grad_norm": 0.299605953934922, + "learning_rate": 0.00015634289754732584, + "loss": 1.0789, + "step": 7191 + }, + { + "epoch": 0.69, + "grad_norm": 0.2756403847856143, + "learning_rate": 0.00015632982734068479, + "loss": 1.0478, + "step": 7192 + }, + { + "epoch": 0.69, + "grad_norm": 0.28164477492401824, + "learning_rate": 0.00015631675572435442, + "loss": 1.0234, + "step": 7193 + }, + { + "epoch": 0.69, + "grad_norm": 0.3139943484530815, + "learning_rate": 0.00015630368269866187, + "loss": 1.0499, + "step": 7194 + }, + { + "epoch": 0.69, + "grad_norm": 0.30405542664327845, + "learning_rate": 0.0001562906082639342, + "loss": 1.023, + "step": 7195 + }, + { + "epoch": 0.69, + "grad_norm": 0.2874490807251814, + "learning_rate": 0.00015627753242049877, + "loss": 0.9521, + "step": 7196 + }, + { + "epoch": 0.69, + "grad_norm": 0.2643259838229851, + "learning_rate": 0.0001562644551686827, + "loss": 1.0435, + "step": 7197 + }, + { + "epoch": 0.69, + "grad_norm": 0.2865014648758085, + "learning_rate": 0.00015625137650881324, + "loss": 1.0175, + "step": 7198 + }, + { + "epoch": 0.69, + "grad_norm": 0.30454373875693874, + "learning_rate": 0.00015623829644121777, + "loss": 1.134, + "step": 7199 + }, + { + "epoch": 0.69, + "grad_norm": 0.31859855773654133, + "learning_rate": 0.00015622521496622355, + "loss": 1.0659, + "step": 7200 + }, + { + "epoch": 0.69, + "grad_norm": 0.2858177196834928, + "learning_rate": 0.00015621213208415804, + "loss": 0.9598, + "step": 7201 + }, + { + "epoch": 0.69, + "grad_norm": 0.26598227866462754, + "learning_rate": 0.00015619904779534856, + "loss": 0.9907, + "step": 7202 + }, + { + "epoch": 0.69, + "grad_norm": 0.2951400583438705, + "learning_rate": 0.00015618596210012256, + "loss": 1.0765, + "step": 7203 + }, + { + "epoch": 0.69, + "grad_norm": 0.3114702561846883, + "learning_rate": 0.00015617287499880762, + "loss": 1.1041, + "step": 7204 + }, + { + "epoch": 0.69, + "grad_norm": 0.278499185049389, + "learning_rate": 0.00015615978649173112, + "loss": 0.9533, + "step": 7205 + }, + { + "epoch": 0.69, + "grad_norm": 0.28555299537963824, + "learning_rate": 0.0001561466965792207, + "loss": 1.0967, + "step": 7206 + }, + { + "epoch": 0.69, + "grad_norm": 0.2636715267212951, + "learning_rate": 0.00015613360526160392, + "loss": 1.086, + "step": 7207 + }, + { + "epoch": 0.69, + "grad_norm": 0.29854739591999657, + "learning_rate": 0.00015612051253920836, + "loss": 1.1259, + "step": 7208 + }, + { + "epoch": 0.69, + "grad_norm": 0.29584977312779975, + "learning_rate": 0.00015610741841236173, + "loss": 1.1717, + "step": 7209 + }, + { + "epoch": 0.69, + "grad_norm": 0.2853540873126679, + "learning_rate": 0.00015609432288139167, + "loss": 1.0673, + "step": 7210 + }, + { + "epoch": 0.69, + "grad_norm": 0.26104277026079287, + "learning_rate": 0.00015608122594662596, + "loss": 1.0772, + "step": 7211 + }, + { + "epoch": 0.69, + "grad_norm": 0.3189693042004204, + "learning_rate": 0.00015606812760839226, + "loss": 1.0342, + "step": 7212 + }, + { + "epoch": 0.69, + "grad_norm": 0.27816239495102424, + "learning_rate": 0.00015605502786701848, + "loss": 1.0825, + "step": 7213 + }, + { + "epoch": 0.69, + "grad_norm": 0.2635774551759029, + "learning_rate": 0.0001560419267228324, + "loss": 1.0783, + "step": 7214 + }, + { + "epoch": 0.69, + "grad_norm": 0.29195022738920845, + "learning_rate": 0.00015602882417616184, + "loss": 1.1758, + "step": 7215 + }, + { + "epoch": 0.69, + "grad_norm": 0.3144273867404004, + "learning_rate": 0.0001560157202273348, + "loss": 1.0973, + "step": 7216 + }, + { + "epoch": 0.69, + "grad_norm": 0.3065507511047062, + "learning_rate": 0.00015600261487667912, + "loss": 1.1111, + "step": 7217 + }, + { + "epoch": 0.69, + "grad_norm": 0.3131853451412242, + "learning_rate": 0.0001559895081245228, + "loss": 1.2187, + "step": 7218 + }, + { + "epoch": 0.69, + "grad_norm": 0.26515742013497673, + "learning_rate": 0.00015597639997119389, + "loss": 1.0605, + "step": 7219 + }, + { + "epoch": 0.69, + "grad_norm": 0.3129582152206669, + "learning_rate": 0.00015596329041702036, + "loss": 1.0686, + "step": 7220 + }, + { + "epoch": 0.69, + "grad_norm": 0.34295960089795735, + "learning_rate": 0.00015595017946233033, + "loss": 1.0771, + "step": 7221 + }, + { + "epoch": 0.69, + "grad_norm": 0.2721307338351633, + "learning_rate": 0.00015593706710745187, + "loss": 1.0284, + "step": 7222 + }, + { + "epoch": 0.69, + "grad_norm": 0.304506700589439, + "learning_rate": 0.00015592395335271316, + "loss": 1.055, + "step": 7223 + }, + { + "epoch": 0.69, + "grad_norm": 0.31053881654022203, + "learning_rate": 0.0001559108381984424, + "loss": 1.1429, + "step": 7224 + }, + { + "epoch": 0.69, + "grad_norm": 0.30691636576138964, + "learning_rate": 0.00015589772164496774, + "loss": 1.1403, + "step": 7225 + }, + { + "epoch": 0.69, + "grad_norm": 0.2833916505818309, + "learning_rate": 0.00015588460369261748, + "loss": 0.9685, + "step": 7226 + }, + { + "epoch": 0.69, + "grad_norm": 0.28649595640599806, + "learning_rate": 0.0001558714843417199, + "loss": 0.9536, + "step": 7227 + }, + { + "epoch": 0.69, + "grad_norm": 0.27284801442293516, + "learning_rate": 0.0001558583635926033, + "loss": 1.1062, + "step": 7228 + }, + { + "epoch": 0.69, + "grad_norm": 0.27007347326614567, + "learning_rate": 0.00015584524144559604, + "loss": 1.1615, + "step": 7229 + }, + { + "epoch": 0.69, + "grad_norm": 0.3039328987168462, + "learning_rate": 0.00015583211790102652, + "loss": 1.1376, + "step": 7230 + }, + { + "epoch": 0.69, + "grad_norm": 0.26912448480042633, + "learning_rate": 0.00015581899295922318, + "loss": 1.0096, + "step": 7231 + }, + { + "epoch": 0.69, + "grad_norm": 0.2653339147614556, + "learning_rate": 0.00015580586662051444, + "loss": 1.0514, + "step": 7232 + }, + { + "epoch": 0.69, + "grad_norm": 0.26697292715319065, + "learning_rate": 0.0001557927388852288, + "loss": 0.9873, + "step": 7233 + }, + { + "epoch": 0.69, + "grad_norm": 0.2937309164974668, + "learning_rate": 0.00015577960975369484, + "loss": 1.1235, + "step": 7234 + }, + { + "epoch": 0.69, + "grad_norm": 0.2781751496250474, + "learning_rate": 0.00015576647922624105, + "loss": 1.0094, + "step": 7235 + }, + { + "epoch": 0.69, + "grad_norm": 0.28016398794516684, + "learning_rate": 0.00015575334730319611, + "loss": 1.1144, + "step": 7236 + }, + { + "epoch": 0.69, + "grad_norm": 0.2756048290515205, + "learning_rate": 0.00015574021398488862, + "loss": 1.0359, + "step": 7237 + }, + { + "epoch": 0.69, + "grad_norm": 0.24131864496426217, + "learning_rate": 0.0001557270792716472, + "loss": 1.0668, + "step": 7238 + }, + { + "epoch": 0.69, + "grad_norm": 0.2840317470056723, + "learning_rate": 0.00015571394316380062, + "loss": 1.1596, + "step": 7239 + }, + { + "epoch": 0.69, + "grad_norm": 0.27376264865951977, + "learning_rate": 0.0001557008056616776, + "loss": 0.9976, + "step": 7240 + }, + { + "epoch": 0.69, + "grad_norm": 0.3019067450221086, + "learning_rate": 0.0001556876667656069, + "loss": 1.0813, + "step": 7241 + }, + { + "epoch": 0.69, + "grad_norm": 0.3162835708665788, + "learning_rate": 0.00015567452647591732, + "loss": 1.1084, + "step": 7242 + }, + { + "epoch": 0.69, + "grad_norm": 0.27896797749608704, + "learning_rate": 0.00015566138479293775, + "loss": 1.0595, + "step": 7243 + }, + { + "epoch": 0.69, + "grad_norm": 0.2992448181482971, + "learning_rate": 0.00015564824171699707, + "loss": 0.9487, + "step": 7244 + }, + { + "epoch": 0.69, + "grad_norm": 0.27553640224406994, + "learning_rate": 0.00015563509724842413, + "loss": 1.0404, + "step": 7245 + }, + { + "epoch": 0.69, + "grad_norm": 0.27552371209875953, + "learning_rate": 0.00015562195138754792, + "loss": 1.1528, + "step": 7246 + }, + { + "epoch": 0.69, + "grad_norm": 0.29426928783634326, + "learning_rate": 0.00015560880413469742, + "loss": 1.0984, + "step": 7247 + }, + { + "epoch": 0.69, + "grad_norm": 0.27853895487179603, + "learning_rate": 0.00015559565549020169, + "loss": 1.1804, + "step": 7248 + }, + { + "epoch": 0.69, + "grad_norm": 0.3136870635528011, + "learning_rate": 0.00015558250545438972, + "loss": 1.0698, + "step": 7249 + }, + { + "epoch": 0.69, + "grad_norm": 0.2624215588147093, + "learning_rate": 0.0001555693540275906, + "loss": 1.1105, + "step": 7250 + }, + { + "epoch": 0.69, + "grad_norm": 0.23782920442038563, + "learning_rate": 0.0001555562012101335, + "loss": 1.1211, + "step": 7251 + }, + { + "epoch": 0.69, + "grad_norm": 0.2695440813802512, + "learning_rate": 0.00015554304700234747, + "loss": 1.0031, + "step": 7252 + }, + { + "epoch": 0.69, + "grad_norm": 0.30147711904941943, + "learning_rate": 0.00015552989140456185, + "loss": 1.1955, + "step": 7253 + }, + { + "epoch": 0.69, + "grad_norm": 0.2430503125575205, + "learning_rate": 0.0001555167344171058, + "loss": 0.983, + "step": 7254 + }, + { + "epoch": 0.69, + "grad_norm": 0.29554732708566633, + "learning_rate": 0.00015550357604030856, + "loss": 1.0728, + "step": 7255 + }, + { + "epoch": 0.69, + "grad_norm": 0.33084597515765934, + "learning_rate": 0.00015549041627449945, + "loss": 1.0754, + "step": 7256 + }, + { + "epoch": 0.69, + "grad_norm": 0.35533675482762406, + "learning_rate": 0.0001554772551200078, + "loss": 1.1181, + "step": 7257 + }, + { + "epoch": 0.69, + "grad_norm": 0.27809709182399867, + "learning_rate": 0.00015546409257716296, + "loss": 1.1706, + "step": 7258 + }, + { + "epoch": 0.69, + "grad_norm": 0.2644018070633185, + "learning_rate": 0.00015545092864629437, + "loss": 1.1147, + "step": 7259 + }, + { + "epoch": 0.69, + "grad_norm": 0.2749285252651721, + "learning_rate": 0.00015543776332773142, + "loss": 1.1475, + "step": 7260 + }, + { + "epoch": 0.69, + "grad_norm": 0.26759109696003713, + "learning_rate": 0.00015542459662180362, + "loss": 1.1167, + "step": 7261 + }, + { + "epoch": 0.69, + "grad_norm": 0.2744664795704848, + "learning_rate": 0.00015541142852884044, + "loss": 0.9624, + "step": 7262 + }, + { + "epoch": 0.69, + "grad_norm": 0.2656603725314074, + "learning_rate": 0.00015539825904917147, + "loss": 1.0158, + "step": 7263 + }, + { + "epoch": 0.69, + "grad_norm": 0.27736864447841364, + "learning_rate": 0.0001553850881831262, + "loss": 1.0009, + "step": 7264 + }, + { + "epoch": 0.7, + "grad_norm": 0.23778024632317593, + "learning_rate": 0.00015537191593103432, + "loss": 1.1653, + "step": 7265 + }, + { + "epoch": 0.7, + "grad_norm": 0.30730004897313506, + "learning_rate": 0.00015535874229322545, + "loss": 1.0293, + "step": 7266 + }, + { + "epoch": 0.7, + "grad_norm": 0.26576264865967103, + "learning_rate": 0.00015534556727002925, + "loss": 1.1149, + "step": 7267 + }, + { + "epoch": 0.7, + "grad_norm": 0.2910068427246134, + "learning_rate": 0.00015533239086177548, + "loss": 1.0587, + "step": 7268 + }, + { + "epoch": 0.7, + "grad_norm": 0.2719761437844997, + "learning_rate": 0.0001553192130687938, + "loss": 1.0378, + "step": 7269 + }, + { + "epoch": 0.7, + "grad_norm": 0.2578029237737743, + "learning_rate": 0.00015530603389141408, + "loss": 1.0591, + "step": 7270 + }, + { + "epoch": 0.7, + "grad_norm": 0.2799149293630919, + "learning_rate": 0.0001552928533299661, + "loss": 1.0374, + "step": 7271 + }, + { + "epoch": 0.7, + "grad_norm": 0.260847576690139, + "learning_rate": 0.00015527967138477967, + "loss": 0.9975, + "step": 7272 + }, + { + "epoch": 0.7, + "grad_norm": 0.2907795091020474, + "learning_rate": 0.00015526648805618478, + "loss": 1.1593, + "step": 7273 + }, + { + "epoch": 0.7, + "grad_norm": 0.22310418172043128, + "learning_rate": 0.00015525330334451127, + "loss": 0.9877, + "step": 7274 + }, + { + "epoch": 0.7, + "grad_norm": 0.29425332767908924, + "learning_rate": 0.00015524011725008912, + "loss": 1.0992, + "step": 7275 + }, + { + "epoch": 0.7, + "grad_norm": 0.3105156950201626, + "learning_rate": 0.0001552269297732483, + "loss": 1.0496, + "step": 7276 + }, + { + "epoch": 0.7, + "grad_norm": 0.2558739957093287, + "learning_rate": 0.00015521374091431888, + "loss": 1.0964, + "step": 7277 + }, + { + "epoch": 0.7, + "grad_norm": 0.28594690539220524, + "learning_rate": 0.00015520055067363089, + "loss": 1.0916, + "step": 7278 + }, + { + "epoch": 0.7, + "grad_norm": 0.3061482044460731, + "learning_rate": 0.00015518735905151442, + "loss": 1.0903, + "step": 7279 + }, + { + "epoch": 0.7, + "grad_norm": 0.3191580668312093, + "learning_rate": 0.00015517416604829962, + "loss": 0.9281, + "step": 7280 + }, + { + "epoch": 0.7, + "grad_norm": 0.26708235106774547, + "learning_rate": 0.00015516097166431663, + "loss": 1.1284, + "step": 7281 + }, + { + "epoch": 0.7, + "grad_norm": 0.2741503886756114, + "learning_rate": 0.00015514777589989564, + "loss": 0.9834, + "step": 7282 + }, + { + "epoch": 0.7, + "grad_norm": 0.2721226495347994, + "learning_rate": 0.00015513457875536692, + "loss": 1.0924, + "step": 7283 + }, + { + "epoch": 0.7, + "grad_norm": 0.2903941706772716, + "learning_rate": 0.0001551213802310607, + "loss": 1.1032, + "step": 7284 + }, + { + "epoch": 0.7, + "grad_norm": 0.26719034153148796, + "learning_rate": 0.0001551081803273073, + "loss": 1.0721, + "step": 7285 + }, + { + "epoch": 0.7, + "grad_norm": 0.2410538608083278, + "learning_rate": 0.00015509497904443706, + "loss": 1.0446, + "step": 7286 + }, + { + "epoch": 0.7, + "grad_norm": 0.288052793290722, + "learning_rate": 0.00015508177638278036, + "loss": 1.0968, + "step": 7287 + }, + { + "epoch": 0.7, + "grad_norm": 0.28114598468656504, + "learning_rate": 0.00015506857234266755, + "loss": 1.2161, + "step": 7288 + }, + { + "epoch": 0.7, + "grad_norm": 0.30363151390349236, + "learning_rate": 0.00015505536692442915, + "loss": 1.1299, + "step": 7289 + }, + { + "epoch": 0.7, + "grad_norm": 0.3430101339177829, + "learning_rate": 0.00015504216012839555, + "loss": 1.057, + "step": 7290 + }, + { + "epoch": 0.7, + "grad_norm": 0.31124798109609725, + "learning_rate": 0.00015502895195489735, + "loss": 1.0329, + "step": 7291 + }, + { + "epoch": 0.7, + "grad_norm": 0.31977891525097923, + "learning_rate": 0.000155015742404265, + "loss": 1.0195, + "step": 7292 + }, + { + "epoch": 0.7, + "grad_norm": 0.2605016731210418, + "learning_rate": 0.00015500253147682913, + "loss": 1.1187, + "step": 7293 + }, + { + "epoch": 0.7, + "grad_norm": 0.25377938560950036, + "learning_rate": 0.00015498931917292037, + "loss": 0.9918, + "step": 7294 + }, + { + "epoch": 0.7, + "grad_norm": 0.3046468563219072, + "learning_rate": 0.0001549761054928693, + "loss": 1.112, + "step": 7295 + }, + { + "epoch": 0.7, + "grad_norm": 0.34132422940513657, + "learning_rate": 0.00015496289043700665, + "loss": 1.0857, + "step": 7296 + }, + { + "epoch": 0.7, + "grad_norm": 0.25593975051636847, + "learning_rate": 0.00015494967400566311, + "loss": 1.0094, + "step": 7297 + }, + { + "epoch": 0.7, + "grad_norm": 0.30660036228331966, + "learning_rate": 0.00015493645619916947, + "loss": 1.0384, + "step": 7298 + }, + { + "epoch": 0.7, + "grad_norm": 0.28934490627014536, + "learning_rate": 0.0001549232370178565, + "loss": 1.1638, + "step": 7299 + }, + { + "epoch": 0.7, + "grad_norm": 0.3082067568322258, + "learning_rate": 0.00015491001646205496, + "loss": 1.1484, + "step": 7300 + }, + { + "epoch": 0.7, + "grad_norm": 0.24608283441694623, + "learning_rate": 0.00015489679453209578, + "loss": 1.0935, + "step": 7301 + }, + { + "epoch": 0.7, + "grad_norm": 0.2878652096204575, + "learning_rate": 0.0001548835712283098, + "loss": 1.0195, + "step": 7302 + }, + { + "epoch": 0.7, + "grad_norm": 0.2445700870996487, + "learning_rate": 0.00015487034655102796, + "loss": 1.0676, + "step": 7303 + }, + { + "epoch": 0.7, + "grad_norm": 0.2792976608554407, + "learning_rate": 0.00015485712050058125, + "loss": 1.0282, + "step": 7304 + }, + { + "epoch": 0.7, + "grad_norm": 0.2792852923073424, + "learning_rate": 0.00015484389307730056, + "loss": 1.0393, + "step": 7305 + }, + { + "epoch": 0.7, + "grad_norm": 0.2921531944021332, + "learning_rate": 0.00015483066428151703, + "loss": 1.1671, + "step": 7306 + }, + { + "epoch": 0.7, + "grad_norm": 0.2840087290455877, + "learning_rate": 0.00015481743411356163, + "loss": 1.0849, + "step": 7307 + }, + { + "epoch": 0.7, + "grad_norm": 0.3040563023087563, + "learning_rate": 0.0001548042025737655, + "loss": 1.1268, + "step": 7308 + }, + { + "epoch": 0.7, + "grad_norm": 0.29355520065474283, + "learning_rate": 0.00015479096966245978, + "loss": 1.03, + "step": 7309 + }, + { + "epoch": 0.7, + "grad_norm": 0.2760436996975324, + "learning_rate": 0.00015477773537997557, + "loss": 1.1871, + "step": 7310 + }, + { + "epoch": 0.7, + "grad_norm": 0.27687371076005574, + "learning_rate": 0.00015476449972664412, + "loss": 1.1466, + "step": 7311 + }, + { + "epoch": 0.7, + "grad_norm": 0.2845099189305587, + "learning_rate": 0.00015475126270279667, + "loss": 1.0636, + "step": 7312 + }, + { + "epoch": 0.7, + "grad_norm": 0.2505523226102105, + "learning_rate": 0.00015473802430876444, + "loss": 1.0668, + "step": 7313 + }, + { + "epoch": 0.7, + "grad_norm": 0.32837255120480047, + "learning_rate": 0.00015472478454487876, + "loss": 1.1224, + "step": 7314 + }, + { + "epoch": 0.7, + "grad_norm": 0.26481048965460596, + "learning_rate": 0.00015471154341147094, + "loss": 1.0428, + "step": 7315 + }, + { + "epoch": 0.7, + "grad_norm": 0.2690353020587212, + "learning_rate": 0.00015469830090887235, + "loss": 1.0432, + "step": 7316 + }, + { + "epoch": 0.7, + "grad_norm": 0.27626600703824794, + "learning_rate": 0.00015468505703741442, + "loss": 1.0638, + "step": 7317 + }, + { + "epoch": 0.7, + "grad_norm": 0.3065923845948702, + "learning_rate": 0.00015467181179742857, + "loss": 0.914, + "step": 7318 + }, + { + "epoch": 0.7, + "grad_norm": 0.2829656174177215, + "learning_rate": 0.0001546585651892463, + "loss": 1.0586, + "step": 7319 + }, + { + "epoch": 0.7, + "grad_norm": 0.277648311025837, + "learning_rate": 0.00015464531721319903, + "loss": 0.9368, + "step": 7320 + }, + { + "epoch": 0.7, + "grad_norm": 0.26414058914433497, + "learning_rate": 0.00015463206786961838, + "loss": 0.9696, + "step": 7321 + }, + { + "epoch": 0.7, + "grad_norm": 0.3161051076868171, + "learning_rate": 0.0001546188171588359, + "loss": 1.0229, + "step": 7322 + }, + { + "epoch": 0.7, + "grad_norm": 0.2799019895774647, + "learning_rate": 0.0001546055650811832, + "loss": 1.1345, + "step": 7323 + }, + { + "epoch": 0.7, + "grad_norm": 0.305865812690483, + "learning_rate": 0.0001545923116369919, + "loss": 1.0023, + "step": 7324 + }, + { + "epoch": 0.7, + "grad_norm": 0.30382443041697793, + "learning_rate": 0.00015457905682659368, + "loss": 1.0488, + "step": 7325 + }, + { + "epoch": 0.7, + "grad_norm": 0.31386464628323874, + "learning_rate": 0.0001545658006503203, + "loss": 1.0361, + "step": 7326 + }, + { + "epoch": 0.7, + "grad_norm": 0.27019240058841637, + "learning_rate": 0.00015455254310850345, + "loss": 1.0565, + "step": 7327 + }, + { + "epoch": 0.7, + "grad_norm": 0.266426547020745, + "learning_rate": 0.0001545392842014749, + "loss": 0.9752, + "step": 7328 + }, + { + "epoch": 0.7, + "grad_norm": 0.2995784354581383, + "learning_rate": 0.0001545260239295665, + "loss": 1.0005, + "step": 7329 + }, + { + "epoch": 0.7, + "grad_norm": 0.317973794140023, + "learning_rate": 0.0001545127622931101, + "loss": 1.0538, + "step": 7330 + }, + { + "epoch": 0.7, + "grad_norm": 0.2947298100738568, + "learning_rate": 0.00015449949929243755, + "loss": 1.0116, + "step": 7331 + }, + { + "epoch": 0.7, + "grad_norm": 0.3348856644428043, + "learning_rate": 0.00015448623492788076, + "loss": 0.9802, + "step": 7332 + }, + { + "epoch": 0.7, + "grad_norm": 0.2989455629929353, + "learning_rate": 0.00015447296919977172, + "loss": 1.0376, + "step": 7333 + }, + { + "epoch": 0.7, + "grad_norm": 0.3011061423556309, + "learning_rate": 0.0001544597021084424, + "loss": 1.0908, + "step": 7334 + }, + { + "epoch": 0.7, + "grad_norm": 0.3049819395201172, + "learning_rate": 0.00015444643365422478, + "loss": 1.0768, + "step": 7335 + }, + { + "epoch": 0.7, + "grad_norm": 0.26542548997246496, + "learning_rate": 0.00015443316383745095, + "loss": 1.024, + "step": 7336 + }, + { + "epoch": 0.7, + "grad_norm": 0.2797529721086807, + "learning_rate": 0.00015441989265845297, + "loss": 1.072, + "step": 7337 + }, + { + "epoch": 0.7, + "grad_norm": 0.2538780155773843, + "learning_rate": 0.000154406620117563, + "loss": 1.038, + "step": 7338 + }, + { + "epoch": 0.7, + "grad_norm": 0.2831186717367567, + "learning_rate": 0.00015439334621511318, + "loss": 1.0638, + "step": 7339 + }, + { + "epoch": 0.7, + "grad_norm": 0.28787268851064585, + "learning_rate": 0.00015438007095143567, + "loss": 1.0426, + "step": 7340 + }, + { + "epoch": 0.7, + "grad_norm": 0.2948120261226757, + "learning_rate": 0.0001543667943268627, + "loss": 1.1221, + "step": 7341 + }, + { + "epoch": 0.7, + "grad_norm": 0.32242230412435835, + "learning_rate": 0.00015435351634172654, + "loss": 1.0958, + "step": 7342 + }, + { + "epoch": 0.7, + "grad_norm": 0.2654273360662479, + "learning_rate": 0.00015434023699635948, + "loss": 1.0071, + "step": 7343 + }, + { + "epoch": 0.7, + "grad_norm": 0.27327197799382885, + "learning_rate": 0.00015432695629109385, + "loss": 1.0576, + "step": 7344 + }, + { + "epoch": 0.7, + "grad_norm": 0.2754942416375848, + "learning_rate": 0.00015431367422626195, + "loss": 1.1498, + "step": 7345 + }, + { + "epoch": 0.7, + "grad_norm": 0.27801354524717076, + "learning_rate": 0.00015430039080219625, + "loss": 1.1084, + "step": 7346 + }, + { + "epoch": 0.7, + "grad_norm": 0.2914103943454639, + "learning_rate": 0.00015428710601922914, + "loss": 1.0507, + "step": 7347 + }, + { + "epoch": 0.7, + "grad_norm": 0.2638150945585758, + "learning_rate": 0.00015427381987769307, + "loss": 1.0324, + "step": 7348 + }, + { + "epoch": 0.7, + "grad_norm": 0.2526962446018732, + "learning_rate": 0.0001542605323779206, + "loss": 0.9352, + "step": 7349 + }, + { + "epoch": 0.7, + "grad_norm": 0.2595273949493344, + "learning_rate": 0.00015424724352024413, + "loss": 1.0355, + "step": 7350 + }, + { + "epoch": 0.7, + "grad_norm": 0.28782174706723024, + "learning_rate": 0.00015423395330499632, + "loss": 1.1083, + "step": 7351 + }, + { + "epoch": 0.7, + "grad_norm": 0.3115649702709986, + "learning_rate": 0.00015422066173250974, + "loss": 1.0422, + "step": 7352 + }, + { + "epoch": 0.7, + "grad_norm": 0.29111738038165, + "learning_rate": 0.00015420736880311707, + "loss": 1.0315, + "step": 7353 + }, + { + "epoch": 0.7, + "grad_norm": 0.28032086546612484, + "learning_rate": 0.00015419407451715088, + "loss": 1.1888, + "step": 7354 + }, + { + "epoch": 0.7, + "grad_norm": 0.2716417113526186, + "learning_rate": 0.00015418077887494394, + "loss": 1.0331, + "step": 7355 + }, + { + "epoch": 0.7, + "grad_norm": 0.28116455658959255, + "learning_rate": 0.00015416748187682897, + "loss": 1.0433, + "step": 7356 + }, + { + "epoch": 0.7, + "grad_norm": 0.3165740582098866, + "learning_rate": 0.00015415418352313868, + "loss": 1.0488, + "step": 7357 + }, + { + "epoch": 0.7, + "grad_norm": 0.24167939460825494, + "learning_rate": 0.00015414088381420594, + "loss": 1.1047, + "step": 7358 + }, + { + "epoch": 0.7, + "grad_norm": 0.2362202925878594, + "learning_rate": 0.00015412758275036356, + "loss": 1.111, + "step": 7359 + }, + { + "epoch": 0.7, + "grad_norm": 0.30927705441472164, + "learning_rate": 0.0001541142803319444, + "loss": 0.9948, + "step": 7360 + }, + { + "epoch": 0.7, + "grad_norm": 0.26694430612905373, + "learning_rate": 0.00015410097655928136, + "loss": 1.2141, + "step": 7361 + }, + { + "epoch": 0.7, + "grad_norm": 0.28735202961238854, + "learning_rate": 0.00015408767143270738, + "loss": 1.0482, + "step": 7362 + }, + { + "epoch": 0.7, + "grad_norm": 0.3028944111263906, + "learning_rate": 0.00015407436495255543, + "loss": 0.9829, + "step": 7363 + }, + { + "epoch": 0.7, + "grad_norm": 0.3036916238157443, + "learning_rate": 0.0001540610571191585, + "loss": 1.1743, + "step": 7364 + }, + { + "epoch": 0.7, + "grad_norm": 0.2897890002077795, + "learning_rate": 0.00015404774793284967, + "loss": 1.0033, + "step": 7365 + }, + { + "epoch": 0.7, + "grad_norm": 0.29386071816379244, + "learning_rate": 0.00015403443739396195, + "loss": 1.1666, + "step": 7366 + }, + { + "epoch": 0.7, + "grad_norm": 0.32062513986248775, + "learning_rate": 0.0001540211255028285, + "loss": 1.141, + "step": 7367 + }, + { + "epoch": 0.7, + "grad_norm": 0.3180897552174907, + "learning_rate": 0.00015400781225978242, + "loss": 1.0144, + "step": 7368 + }, + { + "epoch": 0.7, + "grad_norm": 0.27245479542574785, + "learning_rate": 0.00015399449766515688, + "loss": 1.0604, + "step": 7369 + }, + { + "epoch": 0.71, + "grad_norm": 0.2915213084795467, + "learning_rate": 0.00015398118171928516, + "loss": 1.1338, + "step": 7370 + }, + { + "epoch": 0.71, + "grad_norm": 0.23168432157865712, + "learning_rate": 0.00015396786442250035, + "loss": 1.0496, + "step": 7371 + }, + { + "epoch": 0.71, + "grad_norm": 0.2782432262911835, + "learning_rate": 0.0001539545457751359, + "loss": 1.0979, + "step": 7372 + }, + { + "epoch": 0.71, + "grad_norm": 0.27977300422557033, + "learning_rate": 0.000153941225777525, + "loss": 1.0744, + "step": 7373 + }, + { + "epoch": 0.71, + "grad_norm": 0.2545144680006588, + "learning_rate": 0.000153927904430001, + "loss": 1.0237, + "step": 7374 + }, + { + "epoch": 0.71, + "grad_norm": 0.23952050585666343, + "learning_rate": 0.00015391458173289734, + "loss": 1.1329, + "step": 7375 + }, + { + "epoch": 0.71, + "grad_norm": 0.27650814084254066, + "learning_rate": 0.00015390125768654738, + "loss": 1.128, + "step": 7376 + }, + { + "epoch": 0.71, + "grad_norm": 0.2957362698116198, + "learning_rate": 0.00015388793229128455, + "loss": 1.1255, + "step": 7377 + }, + { + "epoch": 0.71, + "grad_norm": 0.24485467093779958, + "learning_rate": 0.00015387460554744235, + "loss": 1.0197, + "step": 7378 + }, + { + "epoch": 0.71, + "grad_norm": 0.25214439017148976, + "learning_rate": 0.0001538612774553543, + "loss": 1.0601, + "step": 7379 + }, + { + "epoch": 0.71, + "grad_norm": 0.282880344840166, + "learning_rate": 0.00015384794801535394, + "loss": 1.1426, + "step": 7380 + }, + { + "epoch": 0.71, + "grad_norm": 0.3152140814752052, + "learning_rate": 0.00015383461722777482, + "loss": 1.0964, + "step": 7381 + }, + { + "epoch": 0.71, + "grad_norm": 0.24688971723773373, + "learning_rate": 0.00015382128509295058, + "loss": 1.0926, + "step": 7382 + }, + { + "epoch": 0.71, + "grad_norm": 0.2959651478445274, + "learning_rate": 0.00015380795161121485, + "loss": 1.0349, + "step": 7383 + }, + { + "epoch": 0.71, + "grad_norm": 0.29305053218157, + "learning_rate": 0.0001537946167829013, + "loss": 1.1347, + "step": 7384 + }, + { + "epoch": 0.71, + "grad_norm": 0.2717846083673245, + "learning_rate": 0.00015378128060834366, + "loss": 1.0096, + "step": 7385 + }, + { + "epoch": 0.71, + "grad_norm": 0.2620308073344996, + "learning_rate": 0.0001537679430878757, + "loss": 1.0911, + "step": 7386 + }, + { + "epoch": 0.71, + "grad_norm": 0.2531071813077481, + "learning_rate": 0.00015375460422183116, + "loss": 0.9925, + "step": 7387 + }, + { + "epoch": 0.71, + "grad_norm": 0.2797391994034549, + "learning_rate": 0.00015374126401054383, + "loss": 1.1076, + "step": 7388 + }, + { + "epoch": 0.71, + "grad_norm": 0.2727192530520494, + "learning_rate": 0.00015372792245434765, + "loss": 1.0334, + "step": 7389 + }, + { + "epoch": 0.71, + "grad_norm": 0.29997950673640783, + "learning_rate": 0.00015371457955357643, + "loss": 0.9707, + "step": 7390 + }, + { + "epoch": 0.71, + "grad_norm": 0.2816972910580794, + "learning_rate": 0.00015370123530856407, + "loss": 1.1254, + "step": 7391 + }, + { + "epoch": 0.71, + "grad_norm": 0.2959827743331956, + "learning_rate": 0.00015368788971964454, + "loss": 1.0022, + "step": 7392 + }, + { + "epoch": 0.71, + "grad_norm": 0.3027989628861647, + "learning_rate": 0.0001536745427871519, + "loss": 1.0555, + "step": 7393 + }, + { + "epoch": 0.71, + "grad_norm": 0.27628323794583914, + "learning_rate": 0.00015366119451142002, + "loss": 0.985, + "step": 7394 + }, + { + "epoch": 0.71, + "grad_norm": 0.3098642640517758, + "learning_rate": 0.00015364784489278304, + "loss": 1.0772, + "step": 7395 + }, + { + "epoch": 0.71, + "grad_norm": 0.3045327679601191, + "learning_rate": 0.00015363449393157504, + "loss": 1.1383, + "step": 7396 + }, + { + "epoch": 0.71, + "grad_norm": 0.27647845171066426, + "learning_rate": 0.00015362114162813012, + "loss": 1.0514, + "step": 7397 + }, + { + "epoch": 0.71, + "grad_norm": 0.27708090315989137, + "learning_rate": 0.00015360778798278243, + "loss": 0.9376, + "step": 7398 + }, + { + "epoch": 0.71, + "grad_norm": 0.2903239285477044, + "learning_rate": 0.00015359443299586614, + "loss": 1.1016, + "step": 7399 + }, + { + "epoch": 0.71, + "grad_norm": 0.3010839768456833, + "learning_rate": 0.0001535810766677155, + "loss": 1.0842, + "step": 7400 + }, + { + "epoch": 0.71, + "grad_norm": 0.276746949232895, + "learning_rate": 0.0001535677189986647, + "loss": 1.2018, + "step": 7401 + }, + { + "epoch": 0.71, + "grad_norm": 0.29483934180185845, + "learning_rate": 0.0001535543599890481, + "loss": 1.0932, + "step": 7402 + }, + { + "epoch": 0.71, + "grad_norm": 0.2944209969280942, + "learning_rate": 0.0001535409996392, + "loss": 1.1482, + "step": 7403 + }, + { + "epoch": 0.71, + "grad_norm": 0.31126699379727446, + "learning_rate": 0.0001535276379494547, + "loss": 1.1388, + "step": 7404 + }, + { + "epoch": 0.71, + "grad_norm": 0.2843244483748962, + "learning_rate": 0.00015351427492014662, + "loss": 1.122, + "step": 7405 + }, + { + "epoch": 0.71, + "grad_norm": 0.2940291993707008, + "learning_rate": 0.00015350091055161023, + "loss": 0.8989, + "step": 7406 + }, + { + "epoch": 0.71, + "grad_norm": 0.3351477452765215, + "learning_rate": 0.0001534875448441799, + "loss": 1.1108, + "step": 7407 + }, + { + "epoch": 0.71, + "grad_norm": 0.2906887381085749, + "learning_rate": 0.0001534741777981901, + "loss": 0.9912, + "step": 7408 + }, + { + "epoch": 0.71, + "grad_norm": 0.3122468085486279, + "learning_rate": 0.00015346080941397544, + "loss": 1.113, + "step": 7409 + }, + { + "epoch": 0.71, + "grad_norm": 0.28278667430577964, + "learning_rate": 0.00015344743969187042, + "loss": 1.0403, + "step": 7410 + }, + { + "epoch": 0.71, + "grad_norm": 0.27569491315509187, + "learning_rate": 0.00015343406863220962, + "loss": 1.121, + "step": 7411 + }, + { + "epoch": 0.71, + "grad_norm": 0.2804750598763221, + "learning_rate": 0.0001534206962353277, + "loss": 1.0365, + "step": 7412 + }, + { + "epoch": 0.71, + "grad_norm": 0.2524239751697441, + "learning_rate": 0.00015340732250155927, + "loss": 0.9917, + "step": 7413 + }, + { + "epoch": 0.71, + "grad_norm": 0.29005569119447944, + "learning_rate": 0.000153393947431239, + "loss": 1.1511, + "step": 7414 + }, + { + "epoch": 0.71, + "grad_norm": 0.2749251352441248, + "learning_rate": 0.00015338057102470164, + "loss": 1.0658, + "step": 7415 + }, + { + "epoch": 0.71, + "grad_norm": 0.2838189917053858, + "learning_rate": 0.000153367193282282, + "loss": 1.0391, + "step": 7416 + }, + { + "epoch": 0.71, + "grad_norm": 0.3244226146833391, + "learning_rate": 0.00015335381420431476, + "loss": 1.1094, + "step": 7417 + }, + { + "epoch": 0.71, + "grad_norm": 0.2574324902880777, + "learning_rate": 0.0001533404337911348, + "loss": 1.1391, + "step": 7418 + }, + { + "epoch": 0.71, + "grad_norm": 0.30913323555851724, + "learning_rate": 0.00015332705204307696, + "loss": 1.0444, + "step": 7419 + }, + { + "epoch": 0.71, + "grad_norm": 0.2902578699002574, + "learning_rate": 0.00015331366896047613, + "loss": 0.9731, + "step": 7420 + }, + { + "epoch": 0.71, + "grad_norm": 0.2736910311823876, + "learning_rate": 0.00015330028454366723, + "loss": 1.096, + "step": 7421 + }, + { + "epoch": 0.71, + "grad_norm": 0.2595210721180384, + "learning_rate": 0.0001532868987929852, + "loss": 1.0444, + "step": 7422 + }, + { + "epoch": 0.71, + "grad_norm": 0.2691043276914828, + "learning_rate": 0.00015327351170876504, + "loss": 0.9763, + "step": 7423 + }, + { + "epoch": 0.71, + "grad_norm": 0.2625617634998602, + "learning_rate": 0.00015326012329134177, + "loss": 1.0655, + "step": 7424 + }, + { + "epoch": 0.71, + "grad_norm": 0.3040586098635329, + "learning_rate": 0.00015324673354105044, + "loss": 1.0892, + "step": 7425 + }, + { + "epoch": 0.71, + "grad_norm": 0.2688279407074845, + "learning_rate": 0.00015323334245822613, + "loss": 1.0881, + "step": 7426 + }, + { + "epoch": 0.71, + "grad_norm": 0.2772663570425649, + "learning_rate": 0.00015321995004320398, + "loss": 1.0775, + "step": 7427 + }, + { + "epoch": 0.71, + "grad_norm": 0.28558771196591126, + "learning_rate": 0.00015320655629631915, + "loss": 1.1682, + "step": 7428 + }, + { + "epoch": 0.71, + "grad_norm": 0.3068876838609169, + "learning_rate": 0.00015319316121790676, + "loss": 1.1325, + "step": 7429 + }, + { + "epoch": 0.71, + "grad_norm": 0.25376782398833597, + "learning_rate": 0.00015317976480830214, + "loss": 1.051, + "step": 7430 + }, + { + "epoch": 0.71, + "grad_norm": 0.26689013628589564, + "learning_rate": 0.00015316636706784047, + "loss": 1.0535, + "step": 7431 + }, + { + "epoch": 0.71, + "grad_norm": 0.27913817719785716, + "learning_rate": 0.00015315296799685703, + "loss": 1.0799, + "step": 7432 + }, + { + "epoch": 0.71, + "grad_norm": 0.27105117018470604, + "learning_rate": 0.00015313956759568717, + "loss": 0.9799, + "step": 7433 + }, + { + "epoch": 0.71, + "grad_norm": 0.2650201668901937, + "learning_rate": 0.00015312616586466625, + "loss": 1.0213, + "step": 7434 + }, + { + "epoch": 0.71, + "grad_norm": 0.2597414213724003, + "learning_rate": 0.0001531127628041296, + "loss": 1.0696, + "step": 7435 + }, + { + "epoch": 0.71, + "grad_norm": 0.2720574465290946, + "learning_rate": 0.0001530993584144127, + "loss": 1.0064, + "step": 7436 + }, + { + "epoch": 0.71, + "grad_norm": 0.2926790787938213, + "learning_rate": 0.000153085952695851, + "loss": 1.0582, + "step": 7437 + }, + { + "epoch": 0.71, + "grad_norm": 0.30451030733173934, + "learning_rate": 0.00015307254564877996, + "loss": 1.0364, + "step": 7438 + }, + { + "epoch": 0.71, + "grad_norm": 0.2969242563944365, + "learning_rate": 0.00015305913727353508, + "loss": 0.9929, + "step": 7439 + }, + { + "epoch": 0.71, + "grad_norm": 0.2843406055001775, + "learning_rate": 0.000153045727570452, + "loss": 1.0706, + "step": 7440 + }, + { + "epoch": 0.71, + "grad_norm": 0.27079473763903633, + "learning_rate": 0.0001530323165398662, + "loss": 1.13, + "step": 7441 + }, + { + "epoch": 0.71, + "grad_norm": 0.31673668159080653, + "learning_rate": 0.00015301890418211337, + "loss": 1.033, + "step": 7442 + }, + { + "epoch": 0.71, + "grad_norm": 0.25852536375912855, + "learning_rate": 0.00015300549049752915, + "loss": 0.9558, + "step": 7443 + }, + { + "epoch": 0.71, + "grad_norm": 0.2610535794849494, + "learning_rate": 0.00015299207548644922, + "loss": 0.9075, + "step": 7444 + }, + { + "epoch": 0.71, + "grad_norm": 0.3144932740991477, + "learning_rate": 0.0001529786591492093, + "loss": 1.088, + "step": 7445 + }, + { + "epoch": 0.71, + "grad_norm": 0.3002199992017584, + "learning_rate": 0.0001529652414861451, + "loss": 1.0292, + "step": 7446 + }, + { + "epoch": 0.71, + "grad_norm": 0.29137989165257083, + "learning_rate": 0.00015295182249759246, + "loss": 1.1373, + "step": 7447 + }, + { + "epoch": 0.71, + "grad_norm": 0.3000740319743772, + "learning_rate": 0.0001529384021838872, + "loss": 1.0349, + "step": 7448 + }, + { + "epoch": 0.71, + "grad_norm": 0.37952107317617517, + "learning_rate": 0.00015292498054536515, + "loss": 1.027, + "step": 7449 + }, + { + "epoch": 0.71, + "grad_norm": 0.2658313512672852, + "learning_rate": 0.00015291155758236219, + "loss": 1.0365, + "step": 7450 + }, + { + "epoch": 0.71, + "grad_norm": 0.30711739440319363, + "learning_rate": 0.00015289813329521427, + "loss": 1.0623, + "step": 7451 + }, + { + "epoch": 0.71, + "grad_norm": 0.26947346016409285, + "learning_rate": 0.0001528847076842573, + "loss": 1.0008, + "step": 7452 + }, + { + "epoch": 0.71, + "grad_norm": 0.294244616915276, + "learning_rate": 0.00015287128074982728, + "loss": 1.1527, + "step": 7453 + }, + { + "epoch": 0.71, + "grad_norm": 0.26434224087661645, + "learning_rate": 0.00015285785249226025, + "loss": 1.0076, + "step": 7454 + }, + { + "epoch": 0.71, + "grad_norm": 0.2638878195945205, + "learning_rate": 0.00015284442291189224, + "loss": 1.0129, + "step": 7455 + }, + { + "epoch": 0.71, + "grad_norm": 0.291822498691914, + "learning_rate": 0.0001528309920090593, + "loss": 1.0994, + "step": 7456 + }, + { + "epoch": 0.71, + "grad_norm": 0.24079989611181216, + "learning_rate": 0.00015281755978409763, + "loss": 0.9589, + "step": 7457 + }, + { + "epoch": 0.71, + "grad_norm": 0.26439658524128423, + "learning_rate": 0.00015280412623734331, + "loss": 1.0177, + "step": 7458 + }, + { + "epoch": 0.71, + "grad_norm": 0.28359751564704094, + "learning_rate": 0.00015279069136913252, + "loss": 1.0652, + "step": 7459 + }, + { + "epoch": 0.71, + "grad_norm": 0.27983466260304535, + "learning_rate": 0.00015277725517980152, + "loss": 1.0485, + "step": 7460 + }, + { + "epoch": 0.71, + "grad_norm": 0.29708736109513595, + "learning_rate": 0.00015276381766968656, + "loss": 1.074, + "step": 7461 + }, + { + "epoch": 0.71, + "grad_norm": 0.29289233835975614, + "learning_rate": 0.0001527503788391239, + "loss": 1.1547, + "step": 7462 + }, + { + "epoch": 0.71, + "grad_norm": 0.2505395055508232, + "learning_rate": 0.00015273693868844983, + "loss": 0.9737, + "step": 7463 + }, + { + "epoch": 0.71, + "grad_norm": 0.32901669963923147, + "learning_rate": 0.00015272349721800075, + "loss": 1.0536, + "step": 7464 + }, + { + "epoch": 0.71, + "grad_norm": 0.256290518603839, + "learning_rate": 0.000152710054428113, + "loss": 1.1606, + "step": 7465 + }, + { + "epoch": 0.71, + "grad_norm": 0.26769516189713743, + "learning_rate": 0.000152696610319123, + "loss": 0.9284, + "step": 7466 + }, + { + "epoch": 0.71, + "grad_norm": 0.2644771395655536, + "learning_rate": 0.00015268316489136722, + "loss": 1.1177, + "step": 7467 + }, + { + "epoch": 0.71, + "grad_norm": 0.2806512107610077, + "learning_rate": 0.00015266971814518213, + "loss": 0.9686, + "step": 7468 + }, + { + "epoch": 0.71, + "grad_norm": 0.2845865397349829, + "learning_rate": 0.00015265627008090424, + "loss": 1.0744, + "step": 7469 + }, + { + "epoch": 0.71, + "grad_norm": 0.26504898270571114, + "learning_rate": 0.00015264282069887012, + "loss": 1.1322, + "step": 7470 + }, + { + "epoch": 0.71, + "grad_norm": 0.2916082135751575, + "learning_rate": 0.0001526293699994163, + "loss": 1.103, + "step": 7471 + }, + { + "epoch": 0.71, + "grad_norm": 0.31144997066611424, + "learning_rate": 0.00015261591798287945, + "loss": 1.0014, + "step": 7472 + }, + { + "epoch": 0.71, + "grad_norm": 0.2574195705247452, + "learning_rate": 0.00015260246464959614, + "loss": 1.1275, + "step": 7473 + }, + { + "epoch": 0.72, + "grad_norm": 0.31598031433107676, + "learning_rate": 0.00015258900999990313, + "loss": 0.9769, + "step": 7474 + }, + { + "epoch": 0.72, + "grad_norm": 0.2733626438470682, + "learning_rate": 0.00015257555403413707, + "loss": 1.1188, + "step": 7475 + }, + { + "epoch": 0.72, + "grad_norm": 0.27826283583830214, + "learning_rate": 0.00015256209675263473, + "loss": 0.9995, + "step": 7476 + }, + { + "epoch": 0.72, + "grad_norm": 0.3089275459108109, + "learning_rate": 0.0001525486381557329, + "loss": 1.0316, + "step": 7477 + }, + { + "epoch": 0.72, + "grad_norm": 0.2928951184842569, + "learning_rate": 0.00015253517824376838, + "loss": 1.0514, + "step": 7478 + }, + { + "epoch": 0.72, + "grad_norm": 0.26365816516744844, + "learning_rate": 0.00015252171701707798, + "loss": 1.0454, + "step": 7479 + }, + { + "epoch": 0.72, + "grad_norm": 0.28372734957336343, + "learning_rate": 0.00015250825447599863, + "loss": 1.0397, + "step": 7480 + }, + { + "epoch": 0.72, + "grad_norm": 0.24476440536696728, + "learning_rate": 0.0001524947906208672, + "loss": 1.1185, + "step": 7481 + }, + { + "epoch": 0.72, + "grad_norm": 0.29553606635449614, + "learning_rate": 0.00015248132545202066, + "loss": 1.0536, + "step": 7482 + }, + { + "epoch": 0.72, + "grad_norm": 0.28192161411982414, + "learning_rate": 0.00015246785896979592, + "loss": 1.0789, + "step": 7483 + }, + { + "epoch": 0.72, + "grad_norm": 0.28836399016991415, + "learning_rate": 0.00015245439117453005, + "loss": 1.1376, + "step": 7484 + }, + { + "epoch": 0.72, + "grad_norm": 0.285252586401288, + "learning_rate": 0.00015244092206656012, + "loss": 1.0715, + "step": 7485 + }, + { + "epoch": 0.72, + "grad_norm": 0.34180059296617976, + "learning_rate": 0.0001524274516462231, + "loss": 1.0974, + "step": 7486 + }, + { + "epoch": 0.72, + "grad_norm": 0.28411030591653186, + "learning_rate": 0.0001524139799138562, + "loss": 1.1046, + "step": 7487 + }, + { + "epoch": 0.72, + "grad_norm": 0.2688829534104327, + "learning_rate": 0.00015240050686979648, + "loss": 1.0534, + "step": 7488 + }, + { + "epoch": 0.72, + "grad_norm": 0.3171051572047796, + "learning_rate": 0.00015238703251438116, + "loss": 0.9673, + "step": 7489 + }, + { + "epoch": 0.72, + "grad_norm": 0.2715311978491321, + "learning_rate": 0.00015237355684794742, + "loss": 1.0009, + "step": 7490 + }, + { + "epoch": 0.72, + "grad_norm": 0.3107037826927023, + "learning_rate": 0.0001523600798708325, + "loss": 1.1782, + "step": 7491 + }, + { + "epoch": 0.72, + "grad_norm": 0.3004283097955692, + "learning_rate": 0.00015234660158337367, + "loss": 1.1222, + "step": 7492 + }, + { + "epoch": 0.72, + "grad_norm": 0.34256065220175824, + "learning_rate": 0.00015233312198590824, + "loss": 1.1572, + "step": 7493 + }, + { + "epoch": 0.72, + "grad_norm": 0.30264641686373545, + "learning_rate": 0.00015231964107877355, + "loss": 0.9944, + "step": 7494 + }, + { + "epoch": 0.72, + "grad_norm": 0.29115852098449335, + "learning_rate": 0.00015230615886230696, + "loss": 1.0331, + "step": 7495 + }, + { + "epoch": 0.72, + "grad_norm": 0.2663234013844971, + "learning_rate": 0.00015229267533684588, + "loss": 0.9686, + "step": 7496 + }, + { + "epoch": 0.72, + "grad_norm": 0.2706122042940165, + "learning_rate": 0.00015227919050272775, + "loss": 1.029, + "step": 7497 + }, + { + "epoch": 0.72, + "grad_norm": 0.28192878540642374, + "learning_rate": 0.00015226570436028996, + "loss": 1.0831, + "step": 7498 + }, + { + "epoch": 0.72, + "grad_norm": 0.3136140113978119, + "learning_rate": 0.00015225221690987013, + "loss": 1.1534, + "step": 7499 + }, + { + "epoch": 0.72, + "grad_norm": 0.29044795123500067, + "learning_rate": 0.0001522387281518057, + "loss": 1.0986, + "step": 7500 + }, + { + "epoch": 0.72, + "grad_norm": 0.25709365283504254, + "learning_rate": 0.00015222523808643428, + "loss": 1.1544, + "step": 7501 + }, + { + "epoch": 0.72, + "grad_norm": 0.27642043583466885, + "learning_rate": 0.00015221174671409347, + "loss": 1.0419, + "step": 7502 + }, + { + "epoch": 0.72, + "grad_norm": 0.28235849443833444, + "learning_rate": 0.00015219825403512086, + "loss": 1.1054, + "step": 7503 + }, + { + "epoch": 0.72, + "grad_norm": 0.3347988942349422, + "learning_rate": 0.00015218476004985414, + "loss": 1.1444, + "step": 7504 + }, + { + "epoch": 0.72, + "grad_norm": 0.28011799381802555, + "learning_rate": 0.000152171264758631, + "loss": 1.0586, + "step": 7505 + }, + { + "epoch": 0.72, + "grad_norm": 0.29631516225770976, + "learning_rate": 0.00015215776816178918, + "loss": 1.066, + "step": 7506 + }, + { + "epoch": 0.72, + "grad_norm": 0.312587038609111, + "learning_rate": 0.00015214427025966642, + "loss": 1.06, + "step": 7507 + }, + { + "epoch": 0.72, + "grad_norm": 0.26267971626273046, + "learning_rate": 0.00015213077105260053, + "loss": 0.99, + "step": 7508 + }, + { + "epoch": 0.72, + "grad_norm": 0.29362317699509005, + "learning_rate": 0.00015211727054092932, + "loss": 1.0374, + "step": 7509 + }, + { + "epoch": 0.72, + "grad_norm": 0.2743031690741454, + "learning_rate": 0.00015210376872499068, + "loss": 0.9676, + "step": 7510 + }, + { + "epoch": 0.72, + "grad_norm": 0.3008169494678444, + "learning_rate": 0.0001520902656051225, + "loss": 1.0962, + "step": 7511 + }, + { + "epoch": 0.72, + "grad_norm": 0.2510179331269226, + "learning_rate": 0.00015207676118166266, + "loss": 1.0657, + "step": 7512 + }, + { + "epoch": 0.72, + "grad_norm": 0.3006570781756079, + "learning_rate": 0.00015206325545494913, + "loss": 1.1252, + "step": 7513 + }, + { + "epoch": 0.72, + "grad_norm": 0.3074556798522242, + "learning_rate": 0.00015204974842531995, + "loss": 1.1561, + "step": 7514 + }, + { + "epoch": 0.72, + "grad_norm": 0.3251122148768213, + "learning_rate": 0.00015203624009311307, + "loss": 1.1283, + "step": 7515 + }, + { + "epoch": 0.72, + "grad_norm": 0.2595576229638888, + "learning_rate": 0.0001520227304586666, + "loss": 0.9711, + "step": 7516 + }, + { + "epoch": 0.72, + "grad_norm": 0.29928452954008067, + "learning_rate": 0.00015200921952231858, + "loss": 1.0824, + "step": 7517 + }, + { + "epoch": 0.72, + "grad_norm": 0.3139867616135075, + "learning_rate": 0.0001519957072844072, + "loss": 1.1125, + "step": 7518 + }, + { + "epoch": 0.72, + "grad_norm": 0.29374530619599887, + "learning_rate": 0.00015198219374527053, + "loss": 1.1183, + "step": 7519 + }, + { + "epoch": 0.72, + "grad_norm": 0.2952280509086445, + "learning_rate": 0.00015196867890524676, + "loss": 1.0937, + "step": 7520 + }, + { + "epoch": 0.72, + "grad_norm": 0.2826852773745996, + "learning_rate": 0.00015195516276467422, + "loss": 1.1857, + "step": 7521 + }, + { + "epoch": 0.72, + "grad_norm": 0.2803858395978367, + "learning_rate": 0.000151941645323891, + "loss": 0.9876, + "step": 7522 + }, + { + "epoch": 0.72, + "grad_norm": 0.27577711685224354, + "learning_rate": 0.00015192812658323552, + "loss": 1.0477, + "step": 7523 + }, + { + "epoch": 0.72, + "grad_norm": 0.29937814744106744, + "learning_rate": 0.00015191460654304602, + "loss": 1.0781, + "step": 7524 + }, + { + "epoch": 0.72, + "grad_norm": 0.344475738203272, + "learning_rate": 0.00015190108520366085, + "loss": 1.0677, + "step": 7525 + }, + { + "epoch": 0.72, + "grad_norm": 0.269619028961468, + "learning_rate": 0.00015188756256541842, + "loss": 1.1207, + "step": 7526 + }, + { + "epoch": 0.72, + "grad_norm": 0.2856659171795633, + "learning_rate": 0.0001518740386286571, + "loss": 0.9971, + "step": 7527 + }, + { + "epoch": 0.72, + "grad_norm": 0.2690448461790446, + "learning_rate": 0.0001518605133937154, + "loss": 1.0434, + "step": 7528 + }, + { + "epoch": 0.72, + "grad_norm": 0.3153664787777255, + "learning_rate": 0.00015184698686093173, + "loss": 1.1609, + "step": 7529 + }, + { + "epoch": 0.72, + "grad_norm": 0.31002810501031186, + "learning_rate": 0.00015183345903064467, + "loss": 1.0097, + "step": 7530 + }, + { + "epoch": 0.72, + "grad_norm": 0.2764530793093042, + "learning_rate": 0.00015181992990319265, + "loss": 1.0459, + "step": 7531 + }, + { + "epoch": 0.72, + "grad_norm": 0.2581950187662785, + "learning_rate": 0.00015180639947891437, + "loss": 1.0612, + "step": 7532 + }, + { + "epoch": 0.72, + "grad_norm": 0.3228690135774238, + "learning_rate": 0.0001517928677581484, + "loss": 1.1375, + "step": 7533 + }, + { + "epoch": 0.72, + "grad_norm": 0.2830701920026826, + "learning_rate": 0.0001517793347412333, + "loss": 0.9702, + "step": 7534 + }, + { + "epoch": 0.72, + "grad_norm": 0.30970817365242975, + "learning_rate": 0.00015176580042850787, + "loss": 1.0011, + "step": 7535 + }, + { + "epoch": 0.72, + "grad_norm": 0.2879799004430592, + "learning_rate": 0.00015175226482031073, + "loss": 1.0469, + "step": 7536 + }, + { + "epoch": 0.72, + "grad_norm": 0.2721982692551152, + "learning_rate": 0.0001517387279169806, + "loss": 1.0661, + "step": 7537 + }, + { + "epoch": 0.72, + "grad_norm": 0.30110301529460076, + "learning_rate": 0.00015172518971885634, + "loss": 1.0512, + "step": 7538 + }, + { + "epoch": 0.72, + "grad_norm": 0.26509470901065213, + "learning_rate": 0.00015171165022627667, + "loss": 1.0675, + "step": 7539 + }, + { + "epoch": 0.72, + "grad_norm": 0.29294036548567454, + "learning_rate": 0.00015169810943958044, + "loss": 1.12, + "step": 7540 + }, + { + "epoch": 0.72, + "grad_norm": 0.2756337326490096, + "learning_rate": 0.00015168456735910657, + "loss": 1.0387, + "step": 7541 + }, + { + "epoch": 0.72, + "grad_norm": 0.2974971175746808, + "learning_rate": 0.0001516710239851939, + "loss": 1.1836, + "step": 7542 + }, + { + "epoch": 0.72, + "grad_norm": 0.31089357056437184, + "learning_rate": 0.0001516574793181814, + "loss": 1.1023, + "step": 7543 + }, + { + "epoch": 0.72, + "grad_norm": 0.30077365644294696, + "learning_rate": 0.00015164393335840798, + "loss": 0.9271, + "step": 7544 + }, + { + "epoch": 0.72, + "grad_norm": 0.2565124797415133, + "learning_rate": 0.00015163038610621269, + "loss": 1.0294, + "step": 7545 + }, + { + "epoch": 0.72, + "grad_norm": 0.2826401399220724, + "learning_rate": 0.00015161683756193456, + "loss": 1.0563, + "step": 7546 + }, + { + "epoch": 0.72, + "grad_norm": 0.30362864947281726, + "learning_rate": 0.00015160328772591256, + "loss": 1.1748, + "step": 7547 + }, + { + "epoch": 0.72, + "grad_norm": 0.29370524398637515, + "learning_rate": 0.00015158973659848592, + "loss": 1.0426, + "step": 7548 + }, + { + "epoch": 0.72, + "grad_norm": 0.2684686193459749, + "learning_rate": 0.00015157618417999366, + "loss": 1.0464, + "step": 7549 + }, + { + "epoch": 0.72, + "grad_norm": 0.2761867607331919, + "learning_rate": 0.000151562630470775, + "loss": 1.0953, + "step": 7550 + }, + { + "epoch": 0.72, + "grad_norm": 0.25846363686227103, + "learning_rate": 0.0001515490754711691, + "loss": 1.061, + "step": 7551 + }, + { + "epoch": 0.72, + "grad_norm": 0.2642419825902769, + "learning_rate": 0.0001515355191815152, + "loss": 1.1347, + "step": 7552 + }, + { + "epoch": 0.72, + "grad_norm": 0.2559409653328977, + "learning_rate": 0.00015152196160215253, + "loss": 1.0638, + "step": 7553 + }, + { + "epoch": 0.72, + "grad_norm": 0.28318731458349305, + "learning_rate": 0.00015150840273342038, + "loss": 1.0592, + "step": 7554 + }, + { + "epoch": 0.72, + "grad_norm": 0.2850162725399112, + "learning_rate": 0.00015149484257565813, + "loss": 1.1141, + "step": 7555 + }, + { + "epoch": 0.72, + "grad_norm": 0.26118830684653604, + "learning_rate": 0.000151481281129205, + "loss": 1.0173, + "step": 7556 + }, + { + "epoch": 0.72, + "grad_norm": 0.27293247877399707, + "learning_rate": 0.0001514677183944005, + "loss": 1.0846, + "step": 7557 + }, + { + "epoch": 0.72, + "grad_norm": 0.304675314149257, + "learning_rate": 0.00015145415437158401, + "loss": 1.1428, + "step": 7558 + }, + { + "epoch": 0.72, + "grad_norm": 0.2972819357482946, + "learning_rate": 0.00015144058906109496, + "loss": 1.0655, + "step": 7559 + }, + { + "epoch": 0.72, + "grad_norm": 0.28299905657676133, + "learning_rate": 0.00015142702246327287, + "loss": 1.0705, + "step": 7560 + }, + { + "epoch": 0.72, + "grad_norm": 0.29542732633039054, + "learning_rate": 0.00015141345457845716, + "loss": 1.0193, + "step": 7561 + }, + { + "epoch": 0.72, + "grad_norm": 0.2663372553374203, + "learning_rate": 0.00015139988540698748, + "loss": 0.9831, + "step": 7562 + }, + { + "epoch": 0.72, + "grad_norm": 0.25393991962427676, + "learning_rate": 0.00015138631494920337, + "loss": 1.1387, + "step": 7563 + }, + { + "epoch": 0.72, + "grad_norm": 0.27744832277420794, + "learning_rate": 0.00015137274320544438, + "loss": 1.1088, + "step": 7564 + }, + { + "epoch": 0.72, + "grad_norm": 0.27311375703469265, + "learning_rate": 0.00015135917017605025, + "loss": 1.092, + "step": 7565 + }, + { + "epoch": 0.72, + "grad_norm": 0.3264586447066704, + "learning_rate": 0.0001513455958613606, + "loss": 1.1544, + "step": 7566 + }, + { + "epoch": 0.72, + "grad_norm": 0.29926443610547104, + "learning_rate": 0.00015133202026171514, + "loss": 1.0584, + "step": 7567 + }, + { + "epoch": 0.72, + "grad_norm": 0.2945733521922124, + "learning_rate": 0.00015131844337745362, + "loss": 0.9397, + "step": 7568 + }, + { + "epoch": 0.72, + "grad_norm": 0.2730547592785977, + "learning_rate": 0.00015130486520891582, + "loss": 0.9991, + "step": 7569 + }, + { + "epoch": 0.72, + "grad_norm": 0.30119301432734397, + "learning_rate": 0.00015129128575644147, + "loss": 1.0843, + "step": 7570 + }, + { + "epoch": 0.72, + "grad_norm": 0.30645684857055866, + "learning_rate": 0.00015127770502037052, + "loss": 1.0142, + "step": 7571 + }, + { + "epoch": 0.72, + "grad_norm": 0.2821703954502372, + "learning_rate": 0.00015126412300104272, + "loss": 0.982, + "step": 7572 + }, + { + "epoch": 0.72, + "grad_norm": 0.27920328503608616, + "learning_rate": 0.00015125053969879807, + "loss": 1.0294, + "step": 7573 + }, + { + "epoch": 0.72, + "grad_norm": 0.2385258107376846, + "learning_rate": 0.0001512369551139764, + "loss": 1.0406, + "step": 7574 + }, + { + "epoch": 0.72, + "grad_norm": 0.2972641315163676, + "learning_rate": 0.0001512233692469178, + "loss": 1.1835, + "step": 7575 + }, + { + "epoch": 0.72, + "grad_norm": 0.2602247995915003, + "learning_rate": 0.00015120978209796213, + "loss": 1.0921, + "step": 7576 + }, + { + "epoch": 0.72, + "grad_norm": 0.27839769616677684, + "learning_rate": 0.00015119619366744952, + "loss": 1.099, + "step": 7577 + }, + { + "epoch": 0.72, + "grad_norm": 0.316376234085856, + "learning_rate": 0.00015118260395571994, + "loss": 1.0387, + "step": 7578 + }, + { + "epoch": 0.73, + "grad_norm": 0.31660368584142895, + "learning_rate": 0.00015116901296311356, + "loss": 1.1306, + "step": 7579 + }, + { + "epoch": 0.73, + "grad_norm": 0.260900438740011, + "learning_rate": 0.00015115542068997047, + "loss": 0.9071, + "step": 7580 + }, + { + "epoch": 0.73, + "grad_norm": 0.2624881211285536, + "learning_rate": 0.00015114182713663082, + "loss": 1.1411, + "step": 7581 + }, + { + "epoch": 0.73, + "grad_norm": 0.257486571814327, + "learning_rate": 0.00015112823230343484, + "loss": 1.0307, + "step": 7582 + }, + { + "epoch": 0.73, + "grad_norm": 0.2823321215311315, + "learning_rate": 0.00015111463619072265, + "loss": 1.1471, + "step": 7583 + }, + { + "epoch": 0.73, + "grad_norm": 0.30117635588768243, + "learning_rate": 0.00015110103879883462, + "loss": 1.1452, + "step": 7584 + }, + { + "epoch": 0.73, + "grad_norm": 0.3022579959192287, + "learning_rate": 0.00015108744012811096, + "loss": 1.0584, + "step": 7585 + }, + { + "epoch": 0.73, + "grad_norm": 0.29740329090227624, + "learning_rate": 0.000151073840178892, + "loss": 1.0415, + "step": 7586 + }, + { + "epoch": 0.73, + "grad_norm": 0.29372205328515927, + "learning_rate": 0.0001510602389515181, + "loss": 1.1383, + "step": 7587 + }, + { + "epoch": 0.73, + "grad_norm": 0.2920566881167881, + "learning_rate": 0.00015104663644632962, + "loss": 1.0683, + "step": 7588 + }, + { + "epoch": 0.73, + "grad_norm": 0.22058431176486398, + "learning_rate": 0.000151033032663667, + "loss": 1.0229, + "step": 7589 + }, + { + "epoch": 0.73, + "grad_norm": 0.25213010351059206, + "learning_rate": 0.00015101942760387065, + "loss": 1.1396, + "step": 7590 + }, + { + "epoch": 0.73, + "grad_norm": 0.25918222437239363, + "learning_rate": 0.00015100582126728105, + "loss": 1.0704, + "step": 7591 + }, + { + "epoch": 0.73, + "grad_norm": 0.2961632170304152, + "learning_rate": 0.00015099221365423872, + "loss": 1.1117, + "step": 7592 + }, + { + "epoch": 0.73, + "grad_norm": 0.28187090698557027, + "learning_rate": 0.0001509786047650842, + "loss": 0.9727, + "step": 7593 + }, + { + "epoch": 0.73, + "grad_norm": 0.2892636190812079, + "learning_rate": 0.00015096499460015805, + "loss": 1.013, + "step": 7594 + }, + { + "epoch": 0.73, + "grad_norm": 0.28236820071622526, + "learning_rate": 0.0001509513831598009, + "loss": 0.9727, + "step": 7595 + }, + { + "epoch": 0.73, + "grad_norm": 0.2722476913875187, + "learning_rate": 0.00015093777044435333, + "loss": 1.0157, + "step": 7596 + }, + { + "epoch": 0.73, + "grad_norm": 0.2652646040859842, + "learning_rate": 0.00015092415645415606, + "loss": 1.0091, + "step": 7597 + }, + { + "epoch": 0.73, + "grad_norm": 0.28143692367142764, + "learning_rate": 0.00015091054118954978, + "loss": 1.0371, + "step": 7598 + }, + { + "epoch": 0.73, + "grad_norm": 0.2767178043247685, + "learning_rate": 0.0001508969246508752, + "loss": 1.0008, + "step": 7599 + }, + { + "epoch": 0.73, + "grad_norm": 0.25719222763437416, + "learning_rate": 0.0001508833068384731, + "loss": 1.0056, + "step": 7600 + }, + { + "epoch": 0.73, + "grad_norm": 0.30194394764309046, + "learning_rate": 0.00015086968775268427, + "loss": 1.0588, + "step": 7601 + }, + { + "epoch": 0.73, + "grad_norm": 0.2775361029431778, + "learning_rate": 0.00015085606739384953, + "loss": 1.1844, + "step": 7602 + }, + { + "epoch": 0.73, + "grad_norm": 0.2529941931845389, + "learning_rate": 0.00015084244576230976, + "loss": 0.9901, + "step": 7603 + }, + { + "epoch": 0.73, + "grad_norm": 0.3165846646756356, + "learning_rate": 0.00015082882285840578, + "loss": 0.997, + "step": 7604 + }, + { + "epoch": 0.73, + "grad_norm": 0.2782657643449699, + "learning_rate": 0.0001508151986824786, + "loss": 1.0908, + "step": 7605 + }, + { + "epoch": 0.73, + "grad_norm": 0.2781877735281842, + "learning_rate": 0.00015080157323486915, + "loss": 1.1314, + "step": 7606 + }, + { + "epoch": 0.73, + "grad_norm": 0.281042713599222, + "learning_rate": 0.00015078794651591837, + "loss": 1.0458, + "step": 7607 + }, + { + "epoch": 0.73, + "grad_norm": 0.27849574824675116, + "learning_rate": 0.0001507743185259673, + "loss": 1.1229, + "step": 7608 + }, + { + "epoch": 0.73, + "grad_norm": 0.2759377287295315, + "learning_rate": 0.00015076068926535706, + "loss": 1.044, + "step": 7609 + }, + { + "epoch": 0.73, + "grad_norm": 0.2821493820751014, + "learning_rate": 0.00015074705873442863, + "loss": 1.0843, + "step": 7610 + }, + { + "epoch": 0.73, + "grad_norm": 0.2607548413248963, + "learning_rate": 0.00015073342693352316, + "loss": 1.0291, + "step": 7611 + }, + { + "epoch": 0.73, + "grad_norm": 0.3107914480290541, + "learning_rate": 0.0001507197938629818, + "loss": 1.1632, + "step": 7612 + }, + { + "epoch": 0.73, + "grad_norm": 0.2644427769663581, + "learning_rate": 0.0001507061595231457, + "loss": 1.112, + "step": 7613 + }, + { + "epoch": 0.73, + "grad_norm": 0.29569519438471115, + "learning_rate": 0.00015069252391435614, + "loss": 0.8825, + "step": 7614 + }, + { + "epoch": 0.73, + "grad_norm": 0.2725562040988613, + "learning_rate": 0.00015067888703695426, + "loss": 0.975, + "step": 7615 + }, + { + "epoch": 0.73, + "grad_norm": 0.27141691407540564, + "learning_rate": 0.00015066524889128139, + "loss": 1.1129, + "step": 7616 + }, + { + "epoch": 0.73, + "grad_norm": 0.286170558294994, + "learning_rate": 0.00015065160947767887, + "loss": 1.1549, + "step": 7617 + }, + { + "epoch": 0.73, + "grad_norm": 0.28827191966294335, + "learning_rate": 0.00015063796879648793, + "loss": 1.0636, + "step": 7618 + }, + { + "epoch": 0.73, + "grad_norm": 0.34763534936988866, + "learning_rate": 0.00015062432684805, + "loss": 1.0821, + "step": 7619 + }, + { + "epoch": 0.73, + "grad_norm": 0.2928541057535552, + "learning_rate": 0.00015061068363270654, + "loss": 0.9746, + "step": 7620 + }, + { + "epoch": 0.73, + "grad_norm": 0.22676940791019204, + "learning_rate": 0.00015059703915079888, + "loss": 0.9351, + "step": 7621 + }, + { + "epoch": 0.73, + "grad_norm": 0.2932641803272752, + "learning_rate": 0.0001505833934026685, + "loss": 1.169, + "step": 7622 + }, + { + "epoch": 0.73, + "grad_norm": 0.2545708248714656, + "learning_rate": 0.0001505697463886569, + "loss": 1.0129, + "step": 7623 + }, + { + "epoch": 0.73, + "grad_norm": 0.31554332098638554, + "learning_rate": 0.00015055609810910565, + "loss": 1.0664, + "step": 7624 + }, + { + "epoch": 0.73, + "grad_norm": 0.3159256467723614, + "learning_rate": 0.00015054244856435624, + "loss": 1.1755, + "step": 7625 + }, + { + "epoch": 0.73, + "grad_norm": 0.25618170107349153, + "learning_rate": 0.0001505287977547503, + "loss": 1.0721, + "step": 7626 + }, + { + "epoch": 0.73, + "grad_norm": 0.3162579358667636, + "learning_rate": 0.00015051514568062947, + "loss": 0.958, + "step": 7627 + }, + { + "epoch": 0.73, + "grad_norm": 0.25894438378797824, + "learning_rate": 0.00015050149234233532, + "loss": 1.0019, + "step": 7628 + }, + { + "epoch": 0.73, + "grad_norm": 0.2852760760049263, + "learning_rate": 0.00015048783774020962, + "loss": 1.1271, + "step": 7629 + }, + { + "epoch": 0.73, + "grad_norm": 0.3912607456759231, + "learning_rate": 0.00015047418187459405, + "loss": 1.1038, + "step": 7630 + }, + { + "epoch": 0.73, + "grad_norm": 0.322140496874572, + "learning_rate": 0.00015046052474583033, + "loss": 1.0758, + "step": 7631 + }, + { + "epoch": 0.73, + "grad_norm": 0.3229270016075276, + "learning_rate": 0.0001504468663542603, + "loss": 1.0946, + "step": 7632 + }, + { + "epoch": 0.73, + "grad_norm": 0.3081581987259254, + "learning_rate": 0.0001504332067002257, + "loss": 1.0036, + "step": 7633 + }, + { + "epoch": 0.73, + "grad_norm": 0.29552024618919837, + "learning_rate": 0.00015041954578406844, + "loss": 1.0949, + "step": 7634 + }, + { + "epoch": 0.73, + "grad_norm": 0.2963238428604876, + "learning_rate": 0.00015040588360613034, + "loss": 1.0493, + "step": 7635 + }, + { + "epoch": 0.73, + "grad_norm": 0.29301025689881727, + "learning_rate": 0.00015039222016675332, + "loss": 1.0601, + "step": 7636 + }, + { + "epoch": 0.73, + "grad_norm": 0.31243960952554717, + "learning_rate": 0.0001503785554662793, + "loss": 1.0141, + "step": 7637 + }, + { + "epoch": 0.73, + "grad_norm": 0.2810933699464373, + "learning_rate": 0.00015036488950505032, + "loss": 0.9789, + "step": 7638 + }, + { + "epoch": 0.73, + "grad_norm": 0.2800782903010916, + "learning_rate": 0.0001503512222834083, + "loss": 1.0996, + "step": 7639 + }, + { + "epoch": 0.73, + "grad_norm": 0.2737931904759353, + "learning_rate": 0.0001503375538016953, + "loss": 1.0249, + "step": 7640 + }, + { + "epoch": 0.73, + "grad_norm": 0.28069227541654734, + "learning_rate": 0.0001503238840602534, + "loss": 0.9117, + "step": 7641 + }, + { + "epoch": 0.73, + "grad_norm": 0.26949283739174246, + "learning_rate": 0.00015031021305942464, + "loss": 1.0084, + "step": 7642 + }, + { + "epoch": 0.73, + "grad_norm": 0.31475584607752194, + "learning_rate": 0.00015029654079955118, + "loss": 1.0414, + "step": 7643 + }, + { + "epoch": 0.73, + "grad_norm": 0.28698683483189275, + "learning_rate": 0.0001502828672809752, + "loss": 1.0434, + "step": 7644 + }, + { + "epoch": 0.73, + "grad_norm": 0.25256982279011375, + "learning_rate": 0.00015026919250403883, + "loss": 1.0292, + "step": 7645 + }, + { + "epoch": 0.73, + "grad_norm": 0.29577865157695005, + "learning_rate": 0.00015025551646908437, + "loss": 1.1659, + "step": 7646 + }, + { + "epoch": 0.73, + "grad_norm": 0.2728622137260916, + "learning_rate": 0.000150241839176454, + "loss": 1.0527, + "step": 7647 + }, + { + "epoch": 0.73, + "grad_norm": 0.30338258907392546, + "learning_rate": 0.00015022816062649005, + "loss": 1.1148, + "step": 7648 + }, + { + "epoch": 0.73, + "grad_norm": 0.26931970248244586, + "learning_rate": 0.0001502144808195348, + "loss": 1.1357, + "step": 7649 + }, + { + "epoch": 0.73, + "grad_norm": 0.2742596946281973, + "learning_rate": 0.0001502007997559306, + "loss": 1.0646, + "step": 7650 + }, + { + "epoch": 0.73, + "grad_norm": 0.3011532902374543, + "learning_rate": 0.00015018711743601984, + "loss": 1.1351, + "step": 7651 + }, + { + "epoch": 0.73, + "grad_norm": 0.2821557355247745, + "learning_rate": 0.00015017343386014494, + "loss": 1.0087, + "step": 7652 + }, + { + "epoch": 0.73, + "grad_norm": 0.31425399583262464, + "learning_rate": 0.0001501597490286483, + "loss": 1.0384, + "step": 7653 + }, + { + "epoch": 0.73, + "grad_norm": 0.2931669718113375, + "learning_rate": 0.0001501460629418724, + "loss": 1.0466, + "step": 7654 + }, + { + "epoch": 0.73, + "grad_norm": 0.2875430271869409, + "learning_rate": 0.0001501323756001598, + "loss": 1.0878, + "step": 7655 + }, + { + "epoch": 0.73, + "grad_norm": 0.2597750556495404, + "learning_rate": 0.00015011868700385298, + "loss": 1.0548, + "step": 7656 + }, + { + "epoch": 0.73, + "grad_norm": 0.27705706713516576, + "learning_rate": 0.00015010499715329453, + "loss": 0.984, + "step": 7657 + }, + { + "epoch": 0.73, + "grad_norm": 0.26538202477264977, + "learning_rate": 0.00015009130604882702, + "loss": 1.0417, + "step": 7658 + }, + { + "epoch": 0.73, + "grad_norm": 0.28174584601024333, + "learning_rate": 0.0001500776136907931, + "loss": 1.0257, + "step": 7659 + }, + { + "epoch": 0.73, + "grad_norm": 0.25036859344556583, + "learning_rate": 0.00015006392007953543, + "loss": 1.1059, + "step": 7660 + }, + { + "epoch": 0.73, + "grad_norm": 0.27900752760462394, + "learning_rate": 0.00015005022521539672, + "loss": 1.1089, + "step": 7661 + }, + { + "epoch": 0.73, + "grad_norm": 0.31107819751826155, + "learning_rate": 0.0001500365290987196, + "loss": 1.0433, + "step": 7662 + }, + { + "epoch": 0.73, + "grad_norm": 0.2843202029978909, + "learning_rate": 0.00015002283172984695, + "loss": 1.0845, + "step": 7663 + }, + { + "epoch": 0.73, + "grad_norm": 0.2589541502663174, + "learning_rate": 0.00015000913310912148, + "loss": 0.9485, + "step": 7664 + }, + { + "epoch": 0.73, + "grad_norm": 0.300671018081089, + "learning_rate": 0.00014999543323688603, + "loss": 1.0407, + "step": 7665 + }, + { + "epoch": 0.73, + "grad_norm": 0.27922119836694304, + "learning_rate": 0.00014998173211348343, + "loss": 0.9969, + "step": 7666 + }, + { + "epoch": 0.73, + "grad_norm": 0.29425343441337976, + "learning_rate": 0.0001499680297392566, + "loss": 0.9274, + "step": 7667 + }, + { + "epoch": 0.73, + "grad_norm": 0.2941232322325698, + "learning_rate": 0.0001499543261145484, + "loss": 1.0088, + "step": 7668 + }, + { + "epoch": 0.73, + "grad_norm": 0.2914706729504272, + "learning_rate": 0.0001499406212397018, + "loss": 1.0058, + "step": 7669 + }, + { + "epoch": 0.73, + "grad_norm": 0.2953576562328182, + "learning_rate": 0.00014992691511505975, + "loss": 1.0051, + "step": 7670 + }, + { + "epoch": 0.73, + "grad_norm": 0.34306316772779066, + "learning_rate": 0.0001499132077409653, + "loss": 1.153, + "step": 7671 + }, + { + "epoch": 0.73, + "grad_norm": 0.2679983439498134, + "learning_rate": 0.0001498994991177614, + "loss": 1.0104, + "step": 7672 + }, + { + "epoch": 0.73, + "grad_norm": 0.2594539874622118, + "learning_rate": 0.00014988578924579122, + "loss": 1.1645, + "step": 7673 + }, + { + "epoch": 0.73, + "grad_norm": 0.3048213376270825, + "learning_rate": 0.0001498720781253978, + "loss": 1.098, + "step": 7674 + }, + { + "epoch": 0.73, + "grad_norm": 0.3077629403566935, + "learning_rate": 0.0001498583657569243, + "loss": 1.1484, + "step": 7675 + }, + { + "epoch": 0.73, + "grad_norm": 0.31902620346830557, + "learning_rate": 0.00014984465214071385, + "loss": 1.009, + "step": 7676 + }, + { + "epoch": 0.73, + "grad_norm": 0.29403015567312174, + "learning_rate": 0.00014983093727710965, + "loss": 0.9773, + "step": 7677 + }, + { + "epoch": 0.73, + "grad_norm": 0.2548359871414429, + "learning_rate": 0.00014981722116645495, + "loss": 1.0751, + "step": 7678 + }, + { + "epoch": 0.73, + "grad_norm": 0.2787506686083023, + "learning_rate": 0.00014980350380909294, + "loss": 1.1147, + "step": 7679 + }, + { + "epoch": 0.73, + "grad_norm": 0.2992261122500776, + "learning_rate": 0.00014978978520536698, + "loss": 1.0858, + "step": 7680 + }, + { + "epoch": 0.73, + "grad_norm": 0.3011621462376174, + "learning_rate": 0.00014977606535562034, + "loss": 1.046, + "step": 7681 + }, + { + "epoch": 0.73, + "grad_norm": 0.2986573455838004, + "learning_rate": 0.0001497623442601964, + "loss": 1.0208, + "step": 7682 + }, + { + "epoch": 0.74, + "grad_norm": 0.28656498019974624, + "learning_rate": 0.00014974862191943848, + "loss": 1.1759, + "step": 7683 + }, + { + "epoch": 0.74, + "grad_norm": 0.2762485785593129, + "learning_rate": 0.00014973489833369004, + "loss": 1.1052, + "step": 7684 + }, + { + "epoch": 0.74, + "grad_norm": 0.3083477893434321, + "learning_rate": 0.00014972117350329456, + "loss": 0.9617, + "step": 7685 + }, + { + "epoch": 0.74, + "grad_norm": 0.3003280251753417, + "learning_rate": 0.0001497074474285954, + "loss": 1.029, + "step": 7686 + }, + { + "epoch": 0.74, + "grad_norm": 0.2572025859628947, + "learning_rate": 0.00014969372010993618, + "loss": 1.2292, + "step": 7687 + }, + { + "epoch": 0.74, + "grad_norm": 0.24791565205884394, + "learning_rate": 0.00014967999154766036, + "loss": 1.1202, + "step": 7688 + }, + { + "epoch": 0.74, + "grad_norm": 0.2834724391112066, + "learning_rate": 0.00014966626174211153, + "loss": 1.1773, + "step": 7689 + }, + { + "epoch": 0.74, + "grad_norm": 0.2793645427994029, + "learning_rate": 0.0001496525306936333, + "loss": 1.0002, + "step": 7690 + }, + { + "epoch": 0.74, + "grad_norm": 0.2742995002805119, + "learning_rate": 0.00014963879840256927, + "loss": 1.1219, + "step": 7691 + }, + { + "epoch": 0.74, + "grad_norm": 0.31255435123133335, + "learning_rate": 0.0001496250648692631, + "loss": 1.0062, + "step": 7692 + }, + { + "epoch": 0.74, + "grad_norm": 0.3095875184013366, + "learning_rate": 0.00014961133009405852, + "loss": 1.0721, + "step": 7693 + }, + { + "epoch": 0.74, + "grad_norm": 0.2804828390216687, + "learning_rate": 0.00014959759407729922, + "loss": 0.9511, + "step": 7694 + }, + { + "epoch": 0.74, + "grad_norm": 0.28459275473311924, + "learning_rate": 0.00014958385681932893, + "loss": 0.8945, + "step": 7695 + }, + { + "epoch": 0.74, + "grad_norm": 0.30693979635734253, + "learning_rate": 0.00014957011832049147, + "loss": 1.0574, + "step": 7696 + }, + { + "epoch": 0.74, + "grad_norm": 0.2661866180961575, + "learning_rate": 0.00014955637858113065, + "loss": 1.1286, + "step": 7697 + }, + { + "epoch": 0.74, + "grad_norm": 0.2748632910585887, + "learning_rate": 0.00014954263760159033, + "loss": 1.0264, + "step": 7698 + }, + { + "epoch": 0.74, + "grad_norm": 0.2639660707256238, + "learning_rate": 0.00014952889538221434, + "loss": 0.9609, + "step": 7699 + }, + { + "epoch": 0.74, + "grad_norm": 0.28950568503410234, + "learning_rate": 0.00014951515192334665, + "loss": 1.0243, + "step": 7700 + }, + { + "epoch": 0.74, + "grad_norm": 0.2741181044596097, + "learning_rate": 0.00014950140722533114, + "loss": 1.1703, + "step": 7701 + }, + { + "epoch": 0.74, + "grad_norm": 0.2939600698983923, + "learning_rate": 0.0001494876612885118, + "loss": 1.0576, + "step": 7702 + }, + { + "epoch": 0.74, + "grad_norm": 0.27769959394242444, + "learning_rate": 0.00014947391411323263, + "loss": 1.032, + "step": 7703 + }, + { + "epoch": 0.74, + "grad_norm": 0.27126944244428325, + "learning_rate": 0.0001494601656998377, + "loss": 0.9935, + "step": 7704 + }, + { + "epoch": 0.74, + "grad_norm": 0.2842820549012656, + "learning_rate": 0.000149446416048671, + "loss": 1.1503, + "step": 7705 + }, + { + "epoch": 0.74, + "grad_norm": 0.2669012700932571, + "learning_rate": 0.0001494326651600767, + "loss": 1.0303, + "step": 7706 + }, + { + "epoch": 0.74, + "grad_norm": 0.30368700424850303, + "learning_rate": 0.00014941891303439886, + "loss": 1.0606, + "step": 7707 + }, + { + "epoch": 0.74, + "grad_norm": 0.30976284634068807, + "learning_rate": 0.0001494051596719817, + "loss": 1.0383, + "step": 7708 + }, + { + "epoch": 0.74, + "grad_norm": 0.3052315022073979, + "learning_rate": 0.00014939140507316934, + "loss": 1.0763, + "step": 7709 + }, + { + "epoch": 0.74, + "grad_norm": 0.28582065805745044, + "learning_rate": 0.00014937764923830598, + "loss": 1.2036, + "step": 7710 + }, + { + "epoch": 0.74, + "grad_norm": 0.2783946932750405, + "learning_rate": 0.000149363892167736, + "loss": 1.0478, + "step": 7711 + }, + { + "epoch": 0.74, + "grad_norm": 0.2863324153725781, + "learning_rate": 0.00014935013386180353, + "loss": 1.0988, + "step": 7712 + }, + { + "epoch": 0.74, + "grad_norm": 0.2785613479438333, + "learning_rate": 0.00014933637432085295, + "loss": 1.0189, + "step": 7713 + }, + { + "epoch": 0.74, + "grad_norm": 0.28499263197797736, + "learning_rate": 0.00014932261354522864, + "loss": 1.1063, + "step": 7714 + }, + { + "epoch": 0.74, + "grad_norm": 0.30255835930843594, + "learning_rate": 0.00014930885153527492, + "loss": 1.0056, + "step": 7715 + }, + { + "epoch": 0.74, + "grad_norm": 0.23887877046224554, + "learning_rate": 0.00014929508829133616, + "loss": 1.0982, + "step": 7716 + }, + { + "epoch": 0.74, + "grad_norm": 0.25941492372004377, + "learning_rate": 0.0001492813238137569, + "loss": 1.0775, + "step": 7717 + }, + { + "epoch": 0.74, + "grad_norm": 0.2840455310981083, + "learning_rate": 0.00014926755810288147, + "loss": 1.0016, + "step": 7718 + }, + { + "epoch": 0.74, + "grad_norm": 0.3159294197735588, + "learning_rate": 0.00014925379115905444, + "loss": 1.1289, + "step": 7719 + }, + { + "epoch": 0.74, + "grad_norm": 0.3001807062258611, + "learning_rate": 0.00014924002298262034, + "loss": 0.9867, + "step": 7720 + }, + { + "epoch": 0.74, + "grad_norm": 0.2783216610430766, + "learning_rate": 0.00014922625357392376, + "loss": 1.0793, + "step": 7721 + }, + { + "epoch": 0.74, + "grad_norm": 0.28020041552297037, + "learning_rate": 0.00014921248293330922, + "loss": 1.0532, + "step": 7722 + }, + { + "epoch": 0.74, + "grad_norm": 0.3424716991146364, + "learning_rate": 0.00014919871106112135, + "loss": 1.0768, + "step": 7723 + }, + { + "epoch": 0.74, + "grad_norm": 0.2779795066958149, + "learning_rate": 0.00014918493795770482, + "loss": 0.9554, + "step": 7724 + }, + { + "epoch": 0.74, + "grad_norm": 0.3181269730793016, + "learning_rate": 0.00014917116362340435, + "loss": 1.0211, + "step": 7725 + }, + { + "epoch": 0.74, + "grad_norm": 0.28616822041259976, + "learning_rate": 0.00014915738805856458, + "loss": 1.1136, + "step": 7726 + }, + { + "epoch": 0.74, + "grad_norm": 0.25638910226945644, + "learning_rate": 0.00014914361126353026, + "loss": 0.9941, + "step": 7727 + }, + { + "epoch": 0.74, + "grad_norm": 0.25181470798369615, + "learning_rate": 0.0001491298332386462, + "loss": 1.0635, + "step": 7728 + }, + { + "epoch": 0.74, + "grad_norm": 0.279163837257328, + "learning_rate": 0.0001491160539842572, + "loss": 1.0245, + "step": 7729 + }, + { + "epoch": 0.74, + "grad_norm": 0.28555043885157716, + "learning_rate": 0.00014910227350070805, + "loss": 1.1297, + "step": 7730 + }, + { + "epoch": 0.74, + "grad_norm": 0.28507597908988613, + "learning_rate": 0.00014908849178834366, + "loss": 1.1125, + "step": 7731 + }, + { + "epoch": 0.74, + "grad_norm": 0.28046175174288296, + "learning_rate": 0.00014907470884750892, + "loss": 1.1712, + "step": 7732 + }, + { + "epoch": 0.74, + "grad_norm": 0.2841570962434119, + "learning_rate": 0.00014906092467854875, + "loss": 1.061, + "step": 7733 + }, + { + "epoch": 0.74, + "grad_norm": 0.25589378852673406, + "learning_rate": 0.00014904713928180806, + "loss": 1.0623, + "step": 7734 + }, + { + "epoch": 0.74, + "grad_norm": 0.29195274168710955, + "learning_rate": 0.00014903335265763193, + "loss": 1.008, + "step": 7735 + }, + { + "epoch": 0.74, + "grad_norm": 0.2742605176860971, + "learning_rate": 0.00014901956480636535, + "loss": 1.153, + "step": 7736 + }, + { + "epoch": 0.74, + "grad_norm": 0.28128186302274166, + "learning_rate": 0.0001490057757283533, + "loss": 1.0106, + "step": 7737 + }, + { + "epoch": 0.74, + "grad_norm": 0.2829056481616711, + "learning_rate": 0.00014899198542394094, + "loss": 1.1351, + "step": 7738 + }, + { + "epoch": 0.74, + "grad_norm": 0.2715236434676326, + "learning_rate": 0.00014897819389347335, + "loss": 1.0404, + "step": 7739 + }, + { + "epoch": 0.74, + "grad_norm": 0.2911015197712587, + "learning_rate": 0.00014896440113729568, + "loss": 1.2098, + "step": 7740 + }, + { + "epoch": 0.74, + "grad_norm": 0.2752066845807117, + "learning_rate": 0.0001489506071557531, + "loss": 1.0378, + "step": 7741 + }, + { + "epoch": 0.74, + "grad_norm": 0.33207199501826795, + "learning_rate": 0.00014893681194919084, + "loss": 0.967, + "step": 7742 + }, + { + "epoch": 0.74, + "grad_norm": 0.2838659301448611, + "learning_rate": 0.00014892301551795408, + "loss": 1.0683, + "step": 7743 + }, + { + "epoch": 0.74, + "grad_norm": 0.29348859811094435, + "learning_rate": 0.0001489092178623881, + "loss": 1.1144, + "step": 7744 + }, + { + "epoch": 0.74, + "grad_norm": 0.2682746013538995, + "learning_rate": 0.00014889541898283821, + "loss": 1.1194, + "step": 7745 + }, + { + "epoch": 0.74, + "grad_norm": 0.30171726380766734, + "learning_rate": 0.00014888161887964974, + "loss": 1.023, + "step": 7746 + }, + { + "epoch": 0.74, + "grad_norm": 0.25296517645030825, + "learning_rate": 0.000148867817553168, + "loss": 1.1727, + "step": 7747 + }, + { + "epoch": 0.74, + "grad_norm": 0.31768749485918707, + "learning_rate": 0.00014885401500373845, + "loss": 1.0391, + "step": 7748 + }, + { + "epoch": 0.74, + "grad_norm": 0.2438411570540729, + "learning_rate": 0.0001488402112317065, + "loss": 1.0145, + "step": 7749 + }, + { + "epoch": 0.74, + "grad_norm": 0.3034976664176374, + "learning_rate": 0.0001488264062374175, + "loss": 1.1215, + "step": 7750 + }, + { + "epoch": 0.74, + "grad_norm": 0.265851976673612, + "learning_rate": 0.00014881260002121705, + "loss": 0.9724, + "step": 7751 + }, + { + "epoch": 0.74, + "grad_norm": 0.29764121681730704, + "learning_rate": 0.00014879879258345057, + "loss": 1.0749, + "step": 7752 + }, + { + "epoch": 0.74, + "grad_norm": 0.2797392427717897, + "learning_rate": 0.00014878498392446366, + "loss": 0.8735, + "step": 7753 + }, + { + "epoch": 0.74, + "grad_norm": 0.24366220050786846, + "learning_rate": 0.00014877117404460185, + "loss": 1.0055, + "step": 7754 + }, + { + "epoch": 0.74, + "grad_norm": 0.28588196020457907, + "learning_rate": 0.00014875736294421078, + "loss": 1.16, + "step": 7755 + }, + { + "epoch": 0.74, + "grad_norm": 0.30525823027606785, + "learning_rate": 0.00014874355062363605, + "loss": 1.1435, + "step": 7756 + }, + { + "epoch": 0.74, + "grad_norm": 0.248479759938403, + "learning_rate": 0.00014872973708322332, + "loss": 1.0148, + "step": 7757 + }, + { + "epoch": 0.74, + "grad_norm": 0.2705904252639747, + "learning_rate": 0.00014871592232331833, + "loss": 1.1533, + "step": 7758 + }, + { + "epoch": 0.74, + "grad_norm": 0.2624337205195208, + "learning_rate": 0.0001487021063442667, + "loss": 0.9806, + "step": 7759 + }, + { + "epoch": 0.74, + "grad_norm": 0.2526960220148543, + "learning_rate": 0.00014868828914641431, + "loss": 0.9377, + "step": 7760 + }, + { + "epoch": 0.74, + "grad_norm": 0.2974913829902583, + "learning_rate": 0.00014867447073010686, + "loss": 1.0434, + "step": 7761 + }, + { + "epoch": 0.74, + "grad_norm": 0.22591683924408296, + "learning_rate": 0.0001486606510956902, + "loss": 1.145, + "step": 7762 + }, + { + "epoch": 0.74, + "grad_norm": 0.26106572494311425, + "learning_rate": 0.00014864683024351017, + "loss": 1.0687, + "step": 7763 + }, + { + "epoch": 0.74, + "grad_norm": 0.2540501633396988, + "learning_rate": 0.00014863300817391262, + "loss": 1.0217, + "step": 7764 + }, + { + "epoch": 0.74, + "grad_norm": 0.2801665117949245, + "learning_rate": 0.0001486191848872435, + "loss": 1.1218, + "step": 7765 + }, + { + "epoch": 0.74, + "grad_norm": 0.3187353776069642, + "learning_rate": 0.0001486053603838487, + "loss": 1.0898, + "step": 7766 + }, + { + "epoch": 0.74, + "grad_norm": 0.25763576552152534, + "learning_rate": 0.0001485915346640742, + "loss": 1.0585, + "step": 7767 + }, + { + "epoch": 0.74, + "grad_norm": 0.2876319448148584, + "learning_rate": 0.00014857770772826602, + "loss": 1.0169, + "step": 7768 + }, + { + "epoch": 0.74, + "grad_norm": 0.25997267525577705, + "learning_rate": 0.0001485638795767702, + "loss": 1.0577, + "step": 7769 + }, + { + "epoch": 0.74, + "grad_norm": 0.2870532110656952, + "learning_rate": 0.00014855005020993276, + "loss": 1.1589, + "step": 7770 + }, + { + "epoch": 0.74, + "grad_norm": 0.28744735564229734, + "learning_rate": 0.00014853621962809975, + "loss": 1.0786, + "step": 7771 + }, + { + "epoch": 0.74, + "grad_norm": 0.26983452322850415, + "learning_rate": 0.0001485223878316174, + "loss": 1.2123, + "step": 7772 + }, + { + "epoch": 0.74, + "grad_norm": 0.33765627577778073, + "learning_rate": 0.00014850855482083177, + "loss": 1.1044, + "step": 7773 + }, + { + "epoch": 0.74, + "grad_norm": 0.272206284878232, + "learning_rate": 0.00014849472059608906, + "loss": 1.0212, + "step": 7774 + }, + { + "epoch": 0.74, + "grad_norm": 0.29277937755205086, + "learning_rate": 0.00014848088515773553, + "loss": 1.1695, + "step": 7775 + }, + { + "epoch": 0.74, + "grad_norm": 0.25356023491550367, + "learning_rate": 0.00014846704850611736, + "loss": 1.1737, + "step": 7776 + }, + { + "epoch": 0.74, + "grad_norm": 0.3022516410233581, + "learning_rate": 0.00014845321064158083, + "loss": 1.0756, + "step": 7777 + }, + { + "epoch": 0.74, + "grad_norm": 0.2530511912100064, + "learning_rate": 0.00014843937156447227, + "loss": 1.0036, + "step": 7778 + }, + { + "epoch": 0.74, + "grad_norm": 0.3022412340923616, + "learning_rate": 0.000148425531275138, + "loss": 0.9716, + "step": 7779 + }, + { + "epoch": 0.74, + "grad_norm": 0.2678799188134652, + "learning_rate": 0.00014841168977392432, + "loss": 1.1076, + "step": 7780 + }, + { + "epoch": 0.74, + "grad_norm": 0.2963039241793785, + "learning_rate": 0.00014839784706117775, + "loss": 1.0797, + "step": 7781 + }, + { + "epoch": 0.74, + "grad_norm": 0.27673978607030375, + "learning_rate": 0.00014838400313724458, + "loss": 1.1354, + "step": 7782 + }, + { + "epoch": 0.74, + "grad_norm": 0.2599969777043114, + "learning_rate": 0.00014837015800247137, + "loss": 1.0195, + "step": 7783 + }, + { + "epoch": 0.74, + "grad_norm": 0.28872142479078317, + "learning_rate": 0.0001483563116572045, + "loss": 0.9883, + "step": 7784 + }, + { + "epoch": 0.74, + "grad_norm": 0.2728653682651342, + "learning_rate": 0.0001483424641017906, + "loss": 1.1095, + "step": 7785 + }, + { + "epoch": 0.74, + "grad_norm": 0.3069290092163076, + "learning_rate": 0.00014832861533657613, + "loss": 1.093, + "step": 7786 + }, + { + "epoch": 0.74, + "grad_norm": 0.25616829490396015, + "learning_rate": 0.0001483147653619077, + "loss": 1.1398, + "step": 7787 + }, + { + "epoch": 0.75, + "grad_norm": 0.33842983259832143, + "learning_rate": 0.00014830091417813188, + "loss": 1.0526, + "step": 7788 + }, + { + "epoch": 0.75, + "grad_norm": 0.29499869905887005, + "learning_rate": 0.00014828706178559534, + "loss": 1.1995, + "step": 7789 + }, + { + "epoch": 0.75, + "grad_norm": 0.2867836759486996, + "learning_rate": 0.00014827320818464474, + "loss": 1.1038, + "step": 7790 + }, + { + "epoch": 0.75, + "grad_norm": 0.27597630873723183, + "learning_rate": 0.00014825935337562673, + "loss": 1.1934, + "step": 7791 + }, + { + "epoch": 0.75, + "grad_norm": 0.2566617689778122, + "learning_rate": 0.0001482454973588881, + "loss": 1.0383, + "step": 7792 + }, + { + "epoch": 0.75, + "grad_norm": 0.28408572266180726, + "learning_rate": 0.0001482316401347756, + "loss": 0.9326, + "step": 7793 + }, + { + "epoch": 0.75, + "grad_norm": 0.2745042716972083, + "learning_rate": 0.00014821778170363595, + "loss": 1.0336, + "step": 7794 + }, + { + "epoch": 0.75, + "grad_norm": 0.3032701457073397, + "learning_rate": 0.00014820392206581602, + "loss": 1.0234, + "step": 7795 + }, + { + "epoch": 0.75, + "grad_norm": 0.2751280009313665, + "learning_rate": 0.00014819006122166267, + "loss": 1.0432, + "step": 7796 + }, + { + "epoch": 0.75, + "grad_norm": 0.3125193076347104, + "learning_rate": 0.00014817619917152275, + "loss": 1.1172, + "step": 7797 + }, + { + "epoch": 0.75, + "grad_norm": 0.26116307629794683, + "learning_rate": 0.00014816233591574313, + "loss": 1.0669, + "step": 7798 + }, + { + "epoch": 0.75, + "grad_norm": 0.28349617158355267, + "learning_rate": 0.0001481484714546708, + "loss": 1.1477, + "step": 7799 + }, + { + "epoch": 0.75, + "grad_norm": 0.34282406819344863, + "learning_rate": 0.00014813460578865274, + "loss": 1.009, + "step": 7800 + }, + { + "epoch": 0.75, + "grad_norm": 0.28578123221740864, + "learning_rate": 0.00014812073891803587, + "loss": 1.156, + "step": 7801 + }, + { + "epoch": 0.75, + "grad_norm": 0.2980354742362817, + "learning_rate": 0.0001481068708431673, + "loss": 1.0768, + "step": 7802 + }, + { + "epoch": 0.75, + "grad_norm": 0.3220759599695297, + "learning_rate": 0.00014809300156439406, + "loss": 1.0764, + "step": 7803 + }, + { + "epoch": 0.75, + "grad_norm": 0.2640835765034414, + "learning_rate": 0.00014807913108206322, + "loss": 1.0768, + "step": 7804 + }, + { + "epoch": 0.75, + "grad_norm": 0.27220178093073455, + "learning_rate": 0.00014806525939652188, + "loss": 1.1449, + "step": 7805 + }, + { + "epoch": 0.75, + "grad_norm": 0.27363838878553143, + "learning_rate": 0.00014805138650811724, + "loss": 1.0526, + "step": 7806 + }, + { + "epoch": 0.75, + "grad_norm": 0.2883264760303332, + "learning_rate": 0.0001480375124171965, + "loss": 1.0072, + "step": 7807 + }, + { + "epoch": 0.75, + "grad_norm": 0.28230583793684294, + "learning_rate": 0.00014802363712410673, + "loss": 1.043, + "step": 7808 + }, + { + "epoch": 0.75, + "grad_norm": 0.2615628239719908, + "learning_rate": 0.00014800976062919532, + "loss": 1.0759, + "step": 7809 + }, + { + "epoch": 0.75, + "grad_norm": 0.29229275325827664, + "learning_rate": 0.00014799588293280946, + "loss": 1.0116, + "step": 7810 + }, + { + "epoch": 0.75, + "grad_norm": 0.2738661601965768, + "learning_rate": 0.00014798200403529646, + "loss": 1.1214, + "step": 7811 + }, + { + "epoch": 0.75, + "grad_norm": 0.2743653194914421, + "learning_rate": 0.00014796812393700368, + "loss": 1.0277, + "step": 7812 + }, + { + "epoch": 0.75, + "grad_norm": 0.27052008019631224, + "learning_rate": 0.00014795424263827842, + "loss": 1.0564, + "step": 7813 + }, + { + "epoch": 0.75, + "grad_norm": 0.30231813442957706, + "learning_rate": 0.00014794036013946813, + "loss": 1.1308, + "step": 7814 + }, + { + "epoch": 0.75, + "grad_norm": 0.29045069706562976, + "learning_rate": 0.00014792647644092016, + "loss": 1.0502, + "step": 7815 + }, + { + "epoch": 0.75, + "grad_norm": 0.2663099880383512, + "learning_rate": 0.000147912591542982, + "loss": 1.1342, + "step": 7816 + }, + { + "epoch": 0.75, + "grad_norm": 0.2671935344264389, + "learning_rate": 0.00014789870544600116, + "loss": 1.0337, + "step": 7817 + }, + { + "epoch": 0.75, + "grad_norm": 0.2376028787105682, + "learning_rate": 0.00014788481815032509, + "loss": 1.0951, + "step": 7818 + }, + { + "epoch": 0.75, + "grad_norm": 0.2873919541494794, + "learning_rate": 0.00014787092965630135, + "loss": 1.0885, + "step": 7819 + }, + { + "epoch": 0.75, + "grad_norm": 0.28326748757574954, + "learning_rate": 0.00014785703996427754, + "loss": 0.9765, + "step": 7820 + }, + { + "epoch": 0.75, + "grad_norm": 0.26333898401252714, + "learning_rate": 0.0001478431490746012, + "loss": 0.9915, + "step": 7821 + }, + { + "epoch": 0.75, + "grad_norm": 0.2638238903309792, + "learning_rate": 0.00014782925698761997, + "loss": 1.0908, + "step": 7822 + }, + { + "epoch": 0.75, + "grad_norm": 0.30274071491597526, + "learning_rate": 0.00014781536370368157, + "loss": 1.0742, + "step": 7823 + }, + { + "epoch": 0.75, + "grad_norm": 0.2610374416233597, + "learning_rate": 0.0001478014692231336, + "loss": 1.1589, + "step": 7824 + }, + { + "epoch": 0.75, + "grad_norm": 0.312177588979833, + "learning_rate": 0.00014778757354632382, + "loss": 1.023, + "step": 7825 + }, + { + "epoch": 0.75, + "grad_norm": 0.2817902834771327, + "learning_rate": 0.00014777367667360002, + "loss": 1.0925, + "step": 7826 + }, + { + "epoch": 0.75, + "grad_norm": 0.24540218148082688, + "learning_rate": 0.00014775977860530988, + "loss": 0.9525, + "step": 7827 + }, + { + "epoch": 0.75, + "grad_norm": 0.292375111038645, + "learning_rate": 0.0001477458793418013, + "loss": 1.0295, + "step": 7828 + }, + { + "epoch": 0.75, + "grad_norm": 0.25821078004181164, + "learning_rate": 0.0001477319788834221, + "loss": 1.0538, + "step": 7829 + }, + { + "epoch": 0.75, + "grad_norm": 0.26371249547752684, + "learning_rate": 0.00014771807723052013, + "loss": 1.0396, + "step": 7830 + }, + { + "epoch": 0.75, + "grad_norm": 0.24548178512781826, + "learning_rate": 0.00014770417438344325, + "loss": 1.1311, + "step": 7831 + }, + { + "epoch": 0.75, + "grad_norm": 0.28303419109838207, + "learning_rate": 0.00014769027034253944, + "loss": 1.0892, + "step": 7832 + }, + { + "epoch": 0.75, + "grad_norm": 0.27687609304875627, + "learning_rate": 0.00014767636510815667, + "loss": 1.0869, + "step": 7833 + }, + { + "epoch": 0.75, + "grad_norm": 0.28746479888055076, + "learning_rate": 0.00014766245868064285, + "loss": 1.1869, + "step": 7834 + }, + { + "epoch": 0.75, + "grad_norm": 0.2879143036391422, + "learning_rate": 0.00014764855106034607, + "loss": 1.0875, + "step": 7835 + }, + { + "epoch": 0.75, + "grad_norm": 0.2653922351709626, + "learning_rate": 0.00014763464224761436, + "loss": 1.1238, + "step": 7836 + }, + { + "epoch": 0.75, + "grad_norm": 0.2748579199210656, + "learning_rate": 0.00014762073224279578, + "loss": 1.0366, + "step": 7837 + }, + { + "epoch": 0.75, + "grad_norm": 0.29359776941173216, + "learning_rate": 0.00014760682104623845, + "loss": 0.9695, + "step": 7838 + }, + { + "epoch": 0.75, + "grad_norm": 0.2671398097358774, + "learning_rate": 0.00014759290865829053, + "loss": 1.0089, + "step": 7839 + }, + { + "epoch": 0.75, + "eval_loss": 1.1271681785583496, + "eval_runtime": 4227.9142, + "eval_samples_per_second": 19.778, + "eval_steps_per_second": 2.472, + "step": 7839 + }, + { + "epoch": 0.75, + "grad_norm": 0.28108668366507505, + "learning_rate": 0.00014757899507930012, + "loss": 1.1518, + "step": 7840 + }, + { + "epoch": 0.75, + "grad_norm": 0.2889156834394568, + "learning_rate": 0.00014756508030961543, + "loss": 1.057, + "step": 7841 + }, + { + "epoch": 0.75, + "grad_norm": 0.23620442034659647, + "learning_rate": 0.00014755116434958477, + "loss": 1.0139, + "step": 7842 + }, + { + "epoch": 0.75, + "grad_norm": 0.26963408436529646, + "learning_rate": 0.00014753724719955634, + "loss": 1.2173, + "step": 7843 + }, + { + "epoch": 0.75, + "grad_norm": 0.2930307296639112, + "learning_rate": 0.0001475233288598784, + "loss": 1.1191, + "step": 7844 + }, + { + "epoch": 0.75, + "grad_norm": 0.26447900586343337, + "learning_rate": 0.00014750940933089927, + "loss": 1.0972, + "step": 7845 + }, + { + "epoch": 0.75, + "grad_norm": 0.28229996008223884, + "learning_rate": 0.00014749548861296734, + "loss": 1.1229, + "step": 7846 + }, + { + "epoch": 0.75, + "grad_norm": 0.2670260276637567, + "learning_rate": 0.00014748156670643097, + "loss": 1.2654, + "step": 7847 + }, + { + "epoch": 0.75, + "grad_norm": 0.29233732430571957, + "learning_rate": 0.00014746764361163854, + "loss": 1.1009, + "step": 7848 + }, + { + "epoch": 0.75, + "grad_norm": 0.2872352622058822, + "learning_rate": 0.00014745371932893848, + "loss": 1.0123, + "step": 7849 + }, + { + "epoch": 0.75, + "grad_norm": 0.26320292513113624, + "learning_rate": 0.00014743979385867928, + "loss": 1.056, + "step": 7850 + }, + { + "epoch": 0.75, + "grad_norm": 0.29353850314037555, + "learning_rate": 0.00014742586720120943, + "loss": 1.0711, + "step": 7851 + }, + { + "epoch": 0.75, + "grad_norm": 0.24643873017384735, + "learning_rate": 0.00014741193935687743, + "loss": 0.942, + "step": 7852 + }, + { + "epoch": 0.75, + "grad_norm": 0.30839731486986416, + "learning_rate": 0.00014739801032603186, + "loss": 0.932, + "step": 7853 + }, + { + "epoch": 0.75, + "grad_norm": 0.25565523985069327, + "learning_rate": 0.0001473840801090213, + "loss": 1.0941, + "step": 7854 + }, + { + "epoch": 0.75, + "grad_norm": 0.26831609426984876, + "learning_rate": 0.00014737014870619438, + "loss": 1.079, + "step": 7855 + }, + { + "epoch": 0.75, + "grad_norm": 0.2782662024037316, + "learning_rate": 0.00014735621611789963, + "loss": 1.085, + "step": 7856 + }, + { + "epoch": 0.75, + "grad_norm": 0.29723517036293495, + "learning_rate": 0.0001473422823444859, + "loss": 1.0157, + "step": 7857 + }, + { + "epoch": 0.75, + "grad_norm": 0.26674243695427563, + "learning_rate": 0.00014732834738630178, + "loss": 1.0748, + "step": 7858 + }, + { + "epoch": 0.75, + "grad_norm": 0.3178208105289837, + "learning_rate": 0.00014731441124369598, + "loss": 1.047, + "step": 7859 + }, + { + "epoch": 0.75, + "grad_norm": 0.3797982501806393, + "learning_rate": 0.00014730047391701737, + "loss": 0.9933, + "step": 7860 + }, + { + "epoch": 0.75, + "grad_norm": 0.2901368038846893, + "learning_rate": 0.00014728653540661463, + "loss": 1.0821, + "step": 7861 + }, + { + "epoch": 0.75, + "grad_norm": 0.2955915921047405, + "learning_rate": 0.0001472725957128366, + "loss": 0.9773, + "step": 7862 + }, + { + "epoch": 0.75, + "grad_norm": 0.29647306511427657, + "learning_rate": 0.00014725865483603218, + "loss": 1.1752, + "step": 7863 + }, + { + "epoch": 0.75, + "grad_norm": 0.2918367246052348, + "learning_rate": 0.00014724471277655023, + "loss": 0.9763, + "step": 7864 + }, + { + "epoch": 0.75, + "grad_norm": 0.2815720894283948, + "learning_rate": 0.00014723076953473965, + "loss": 1.098, + "step": 7865 + }, + { + "epoch": 0.75, + "grad_norm": 0.2930097766676262, + "learning_rate": 0.00014721682511094935, + "loss": 1.0655, + "step": 7866 + }, + { + "epoch": 0.75, + "grad_norm": 0.29679633835988606, + "learning_rate": 0.00014720287950552836, + "loss": 1.103, + "step": 7867 + }, + { + "epoch": 0.75, + "grad_norm": 0.29540666426295414, + "learning_rate": 0.00014718893271882562, + "loss": 1.1377, + "step": 7868 + }, + { + "epoch": 0.75, + "grad_norm": 0.28866732431657105, + "learning_rate": 0.0001471749847511902, + "loss": 1.1552, + "step": 7869 + }, + { + "epoch": 0.75, + "grad_norm": 0.2897250322336478, + "learning_rate": 0.00014716103560297116, + "loss": 1.1178, + "step": 7870 + }, + { + "epoch": 0.75, + "grad_norm": 0.2871031890874088, + "learning_rate": 0.00014714708527451752, + "loss": 1.0608, + "step": 7871 + }, + { + "epoch": 0.75, + "grad_norm": 0.2660012537881913, + "learning_rate": 0.00014713313376617845, + "loss": 0.9785, + "step": 7872 + }, + { + "epoch": 0.75, + "grad_norm": 0.30288071506941316, + "learning_rate": 0.00014711918107830312, + "loss": 1.0374, + "step": 7873 + }, + { + "epoch": 0.75, + "grad_norm": 0.2623082689214895, + "learning_rate": 0.00014710522721124062, + "loss": 1.1882, + "step": 7874 + }, + { + "epoch": 0.75, + "grad_norm": 0.29218912698161004, + "learning_rate": 0.00014709127216534025, + "loss": 1.1121, + "step": 7875 + }, + { + "epoch": 0.75, + "grad_norm": 0.2926792107209431, + "learning_rate": 0.00014707731594095118, + "loss": 1.1558, + "step": 7876 + }, + { + "epoch": 0.75, + "grad_norm": 0.3036786630685544, + "learning_rate": 0.00014706335853842268, + "loss": 1.1285, + "step": 7877 + }, + { + "epoch": 0.75, + "grad_norm": 0.3070257247499537, + "learning_rate": 0.00014704939995810408, + "loss": 1.1566, + "step": 7878 + }, + { + "epoch": 0.75, + "grad_norm": 0.2753834785951813, + "learning_rate": 0.00014703544020034463, + "loss": 1.1336, + "step": 7879 + }, + { + "epoch": 0.75, + "grad_norm": 0.30835133105536705, + "learning_rate": 0.00014702147926549377, + "loss": 1.0733, + "step": 7880 + }, + { + "epoch": 0.75, + "grad_norm": 0.30129377981823663, + "learning_rate": 0.00014700751715390084, + "loss": 1.1398, + "step": 7881 + }, + { + "epoch": 0.75, + "grad_norm": 0.26568110306618115, + "learning_rate": 0.00014699355386591523, + "loss": 1.0773, + "step": 7882 + }, + { + "epoch": 0.75, + "grad_norm": 0.28052397758769976, + "learning_rate": 0.00014697958940188642, + "loss": 1.0731, + "step": 7883 + }, + { + "epoch": 0.75, + "grad_norm": 0.2696382620489903, + "learning_rate": 0.00014696562376216381, + "loss": 0.9844, + "step": 7884 + }, + { + "epoch": 0.75, + "grad_norm": 0.2532216857709228, + "learning_rate": 0.000146951656947097, + "loss": 1.0863, + "step": 7885 + }, + { + "epoch": 0.75, + "grad_norm": 0.2622266542497967, + "learning_rate": 0.00014693768895703544, + "loss": 1.0305, + "step": 7886 + }, + { + "epoch": 0.75, + "grad_norm": 0.2800195946028661, + "learning_rate": 0.00014692371979232872, + "loss": 1.1075, + "step": 7887 + }, + { + "epoch": 0.75, + "grad_norm": 0.2953080493463318, + "learning_rate": 0.00014690974945332644, + "loss": 0.9948, + "step": 7888 + }, + { + "epoch": 0.75, + "grad_norm": 0.2886266231002391, + "learning_rate": 0.00014689577794037818, + "loss": 0.9764, + "step": 7889 + }, + { + "epoch": 0.75, + "grad_norm": 0.29706661719229843, + "learning_rate": 0.0001468818052538336, + "loss": 1.055, + "step": 7890 + }, + { + "epoch": 0.75, + "grad_norm": 0.28179682965161795, + "learning_rate": 0.0001468678313940424, + "loss": 1.0903, + "step": 7891 + }, + { + "epoch": 0.76, + "grad_norm": 0.3030242128434006, + "learning_rate": 0.00014685385636135426, + "loss": 1.1486, + "step": 7892 + }, + { + "epoch": 0.76, + "grad_norm": 0.2647061302202202, + "learning_rate": 0.00014683988015611892, + "loss": 1.0548, + "step": 7893 + }, + { + "epoch": 0.76, + "grad_norm": 0.300733623893973, + "learning_rate": 0.00014682590277868612, + "loss": 1.1367, + "step": 7894 + }, + { + "epoch": 0.76, + "grad_norm": 0.2995793418222508, + "learning_rate": 0.0001468119242294057, + "loss": 1.0878, + "step": 7895 + }, + { + "epoch": 0.76, + "grad_norm": 0.28708772163438406, + "learning_rate": 0.00014679794450862745, + "loss": 0.9396, + "step": 7896 + }, + { + "epoch": 0.76, + "grad_norm": 0.311063842263335, + "learning_rate": 0.0001467839636167012, + "loss": 1.1202, + "step": 7897 + }, + { + "epoch": 0.76, + "grad_norm": 0.2986192805747378, + "learning_rate": 0.0001467699815539769, + "loss": 1.0607, + "step": 7898 + }, + { + "epoch": 0.76, + "grad_norm": 0.258096740209968, + "learning_rate": 0.0001467559983208044, + "loss": 0.9521, + "step": 7899 + }, + { + "epoch": 0.76, + "grad_norm": 0.27283666156818637, + "learning_rate": 0.00014674201391753363, + "loss": 1.1342, + "step": 7900 + }, + { + "epoch": 0.76, + "grad_norm": 0.28773671837545195, + "learning_rate": 0.0001467280283445146, + "loss": 1.1128, + "step": 7901 + }, + { + "epoch": 0.76, + "grad_norm": 0.307556564712858, + "learning_rate": 0.00014671404160209733, + "loss": 0.9963, + "step": 7902 + }, + { + "epoch": 0.76, + "grad_norm": 0.2684719138920676, + "learning_rate": 0.0001467000536906318, + "loss": 1.0201, + "step": 7903 + }, + { + "epoch": 0.76, + "grad_norm": 0.26240667501651854, + "learning_rate": 0.00014668606461046806, + "loss": 1.0589, + "step": 7904 + }, + { + "epoch": 0.76, + "grad_norm": 0.27922442657332547, + "learning_rate": 0.0001466720743619562, + "loss": 1.0683, + "step": 7905 + }, + { + "epoch": 0.76, + "grad_norm": 0.2540084140712392, + "learning_rate": 0.00014665808294544633, + "loss": 0.9643, + "step": 7906 + }, + { + "epoch": 0.76, + "grad_norm": 0.31142085724795016, + "learning_rate": 0.00014664409036128866, + "loss": 1.045, + "step": 7907 + }, + { + "epoch": 0.76, + "grad_norm": 0.3206848269424768, + "learning_rate": 0.00014663009660983328, + "loss": 1.1147, + "step": 7908 + }, + { + "epoch": 0.76, + "grad_norm": 0.33340917807727916, + "learning_rate": 0.00014661610169143044, + "loss": 1.0744, + "step": 7909 + }, + { + "epoch": 0.76, + "grad_norm": 0.32071991522507715, + "learning_rate": 0.00014660210560643036, + "loss": 1.1297, + "step": 7910 + }, + { + "epoch": 0.76, + "grad_norm": 0.2761333180916678, + "learning_rate": 0.00014658810835518332, + "loss": 1.0477, + "step": 7911 + }, + { + "epoch": 0.76, + "grad_norm": 0.2859125633527157, + "learning_rate": 0.00014657410993803956, + "loss": 1.0781, + "step": 7912 + }, + { + "epoch": 0.76, + "grad_norm": 0.3274797315976737, + "learning_rate": 0.00014656011035534943, + "loss": 1.0144, + "step": 7913 + }, + { + "epoch": 0.76, + "grad_norm": 0.28598470593343445, + "learning_rate": 0.00014654610960746327, + "loss": 1.0887, + "step": 7914 + }, + { + "epoch": 0.76, + "grad_norm": 0.28926204149947815, + "learning_rate": 0.00014653210769473147, + "loss": 1.0627, + "step": 7915 + }, + { + "epoch": 0.76, + "grad_norm": 0.26324720206303615, + "learning_rate": 0.00014651810461750446, + "loss": 1.1053, + "step": 7916 + }, + { + "epoch": 0.76, + "grad_norm": 0.30675528347879105, + "learning_rate": 0.0001465041003761326, + "loss": 1.1054, + "step": 7917 + }, + { + "epoch": 0.76, + "grad_norm": 0.26066528416246476, + "learning_rate": 0.0001464900949709664, + "loss": 1.0405, + "step": 7918 + }, + { + "epoch": 0.76, + "grad_norm": 0.29329946245835375, + "learning_rate": 0.0001464760884023564, + "loss": 1.0615, + "step": 7919 + }, + { + "epoch": 0.76, + "grad_norm": 0.2679473490528041, + "learning_rate": 0.00014646208067065305, + "loss": 1.0988, + "step": 7920 + }, + { + "epoch": 0.76, + "grad_norm": 0.29866219772813785, + "learning_rate": 0.00014644807177620694, + "loss": 1.1193, + "step": 7921 + }, + { + "epoch": 0.76, + "grad_norm": 0.3004083603649936, + "learning_rate": 0.00014643406171936863, + "loss": 1.0458, + "step": 7922 + }, + { + "epoch": 0.76, + "grad_norm": 0.3255922513505334, + "learning_rate": 0.00014642005050048877, + "loss": 1.1641, + "step": 7923 + }, + { + "epoch": 0.76, + "grad_norm": 0.27331269973566386, + "learning_rate": 0.00014640603811991794, + "loss": 1.0904, + "step": 7924 + }, + { + "epoch": 0.76, + "grad_norm": 0.3021895089941293, + "learning_rate": 0.00014639202457800688, + "loss": 1.0426, + "step": 7925 + }, + { + "epoch": 0.76, + "grad_norm": 0.31651111587188924, + "learning_rate": 0.0001463780098751062, + "loss": 0.9561, + "step": 7926 + }, + { + "epoch": 0.76, + "grad_norm": 0.27245193343641233, + "learning_rate": 0.00014636399401156668, + "loss": 0.9084, + "step": 7927 + }, + { + "epoch": 0.76, + "grad_norm": 0.28171140443893944, + "learning_rate": 0.0001463499769877391, + "loss": 1.0122, + "step": 7928 + }, + { + "epoch": 0.76, + "grad_norm": 0.2722172920609029, + "learning_rate": 0.00014633595880397422, + "loss": 0.9773, + "step": 7929 + }, + { + "epoch": 0.76, + "grad_norm": 0.314701712266059, + "learning_rate": 0.00014632193946062283, + "loss": 1.0967, + "step": 7930 + }, + { + "epoch": 0.76, + "grad_norm": 0.2766120012080896, + "learning_rate": 0.0001463079189580358, + "loss": 1.1545, + "step": 7931 + }, + { + "epoch": 0.76, + "grad_norm": 0.26393567149311037, + "learning_rate": 0.00014629389729656399, + "loss": 0.8912, + "step": 7932 + }, + { + "epoch": 0.76, + "grad_norm": 0.30280952487557516, + "learning_rate": 0.0001462798744765583, + "loss": 1.1161, + "step": 7933 + }, + { + "epoch": 0.76, + "grad_norm": 0.31616202197131665, + "learning_rate": 0.0001462658504983697, + "loss": 1.179, + "step": 7934 + }, + { + "epoch": 0.76, + "grad_norm": 0.26565495442579756, + "learning_rate": 0.0001462518253623491, + "loss": 1.0568, + "step": 7935 + }, + { + "epoch": 0.76, + "grad_norm": 0.318280638548035, + "learning_rate": 0.00014623779906884748, + "loss": 1.1108, + "step": 7936 + }, + { + "epoch": 0.76, + "grad_norm": 0.2586299065330331, + "learning_rate": 0.00014622377161821587, + "loss": 1.148, + "step": 7937 + }, + { + "epoch": 0.76, + "grad_norm": 0.2715217059364236, + "learning_rate": 0.00014620974301080537, + "loss": 1.0559, + "step": 7938 + }, + { + "epoch": 0.76, + "grad_norm": 0.24985474931051407, + "learning_rate": 0.00014619571324696697, + "loss": 1.1047, + "step": 7939 + }, + { + "epoch": 0.76, + "grad_norm": 0.2961624550424931, + "learning_rate": 0.00014618168232705182, + "loss": 0.9659, + "step": 7940 + }, + { + "epoch": 0.76, + "grad_norm": 0.2966955141929237, + "learning_rate": 0.00014616765025141106, + "loss": 1.0266, + "step": 7941 + }, + { + "epoch": 0.76, + "grad_norm": 0.2986341505874412, + "learning_rate": 0.00014615361702039582, + "loss": 1.0165, + "step": 7942 + }, + { + "epoch": 0.76, + "grad_norm": 0.291306108234052, + "learning_rate": 0.00014613958263435734, + "loss": 1.0463, + "step": 7943 + }, + { + "epoch": 0.76, + "grad_norm": 0.24770026210630766, + "learning_rate": 0.00014612554709364677, + "loss": 1.0362, + "step": 7944 + }, + { + "epoch": 0.76, + "grad_norm": 0.30727250672800005, + "learning_rate": 0.00014611151039861542, + "loss": 1.0903, + "step": 7945 + }, + { + "epoch": 0.76, + "grad_norm": 0.30656468148580795, + "learning_rate": 0.00014609747254961452, + "loss": 1.0681, + "step": 7946 + }, + { + "epoch": 0.76, + "grad_norm": 0.28351015720243533, + "learning_rate": 0.0001460834335469954, + "loss": 1.166, + "step": 7947 + }, + { + "epoch": 0.76, + "grad_norm": 0.28579851887764157, + "learning_rate": 0.0001460693933911094, + "loss": 1.0082, + "step": 7948 + }, + { + "epoch": 0.76, + "grad_norm": 0.26715615685634764, + "learning_rate": 0.00014605535208230789, + "loss": 1.1272, + "step": 7949 + }, + { + "epoch": 0.76, + "grad_norm": 0.2908957018166822, + "learning_rate": 0.0001460413096209422, + "loss": 1.0386, + "step": 7950 + }, + { + "epoch": 0.76, + "grad_norm": 0.30421751423231963, + "learning_rate": 0.00014602726600736388, + "loss": 1.1368, + "step": 7951 + }, + { + "epoch": 0.76, + "grad_norm": 0.24654599232461855, + "learning_rate": 0.00014601322124192426, + "loss": 1.0704, + "step": 7952 + }, + { + "epoch": 0.76, + "grad_norm": 0.2858412396022753, + "learning_rate": 0.00014599917532497487, + "loss": 0.9629, + "step": 7953 + }, + { + "epoch": 0.76, + "grad_norm": 0.31986567150353395, + "learning_rate": 0.00014598512825686718, + "loss": 1.1515, + "step": 7954 + }, + { + "epoch": 0.76, + "grad_norm": 0.26767724316265096, + "learning_rate": 0.0001459710800379528, + "loss": 1.0711, + "step": 7955 + }, + { + "epoch": 0.76, + "grad_norm": 0.2683093058976096, + "learning_rate": 0.0001459570306685832, + "loss": 0.9834, + "step": 7956 + }, + { + "epoch": 0.76, + "grad_norm": 0.26418718399717656, + "learning_rate": 0.00014594298014911005, + "loss": 1.1891, + "step": 7957 + }, + { + "epoch": 0.76, + "grad_norm": 0.2997490177491894, + "learning_rate": 0.00014592892847988494, + "loss": 1.1303, + "step": 7958 + }, + { + "epoch": 0.76, + "grad_norm": 0.3092712454899324, + "learning_rate": 0.00014591487566125957, + "loss": 1.0605, + "step": 7959 + }, + { + "epoch": 0.76, + "grad_norm": 0.270510283984863, + "learning_rate": 0.00014590082169358554, + "loss": 1.0886, + "step": 7960 + }, + { + "epoch": 0.76, + "grad_norm": 0.24910690948882463, + "learning_rate": 0.0001458867665772146, + "loss": 1.0897, + "step": 7961 + }, + { + "epoch": 0.76, + "grad_norm": 0.29126461074740784, + "learning_rate": 0.0001458727103124985, + "loss": 1.1572, + "step": 7962 + }, + { + "epoch": 0.76, + "grad_norm": 0.32863806873359025, + "learning_rate": 0.000145858652899789, + "loss": 1.0036, + "step": 7963 + }, + { + "epoch": 0.76, + "grad_norm": 0.27115363531745906, + "learning_rate": 0.00014584459433943786, + "loss": 1.0609, + "step": 7964 + }, + { + "epoch": 0.76, + "grad_norm": 0.26632334307016087, + "learning_rate": 0.00014583053463179695, + "loss": 0.929, + "step": 7965 + }, + { + "epoch": 0.76, + "grad_norm": 0.30896648642067515, + "learning_rate": 0.00014581647377721812, + "loss": 1.1302, + "step": 7966 + }, + { + "epoch": 0.76, + "grad_norm": 0.31271610841373415, + "learning_rate": 0.00014580241177605322, + "loss": 1.0555, + "step": 7967 + }, + { + "epoch": 0.76, + "grad_norm": 0.2862084572430668, + "learning_rate": 0.0001457883486286542, + "loss": 1.1462, + "step": 7968 + }, + { + "epoch": 0.76, + "grad_norm": 0.2722638712262504, + "learning_rate": 0.00014577428433537297, + "loss": 1.0671, + "step": 7969 + }, + { + "epoch": 0.76, + "grad_norm": 0.29776545230446366, + "learning_rate": 0.0001457602188965615, + "loss": 1.0892, + "step": 7970 + }, + { + "epoch": 0.76, + "grad_norm": 0.28339621303738605, + "learning_rate": 0.00014574615231257177, + "loss": 1.0042, + "step": 7971 + }, + { + "epoch": 0.76, + "grad_norm": 0.2872649900886519, + "learning_rate": 0.00014573208458375586, + "loss": 1.0962, + "step": 7972 + }, + { + "epoch": 0.76, + "grad_norm": 0.29073436106492967, + "learning_rate": 0.0001457180157104658, + "loss": 1.096, + "step": 7973 + }, + { + "epoch": 0.76, + "grad_norm": 0.2782075864070279, + "learning_rate": 0.00014570394569305366, + "loss": 1.0278, + "step": 7974 + }, + { + "epoch": 0.76, + "grad_norm": 0.31350190862970956, + "learning_rate": 0.00014568987453187154, + "loss": 1.091, + "step": 7975 + }, + { + "epoch": 0.76, + "grad_norm": 0.2652268268602932, + "learning_rate": 0.0001456758022272716, + "loss": 1.128, + "step": 7976 + }, + { + "epoch": 0.76, + "grad_norm": 0.2810839334912306, + "learning_rate": 0.00014566172877960603, + "loss": 1.0408, + "step": 7977 + }, + { + "epoch": 0.76, + "grad_norm": 0.25013935049819563, + "learning_rate": 0.00014564765418922696, + "loss": 1.1124, + "step": 7978 + }, + { + "epoch": 0.76, + "grad_norm": 0.23668034177376465, + "learning_rate": 0.00014563357845648667, + "loss": 1.028, + "step": 7979 + }, + { + "epoch": 0.76, + "grad_norm": 0.2736819717949704, + "learning_rate": 0.0001456195015817374, + "loss": 0.9969, + "step": 7980 + }, + { + "epoch": 0.76, + "grad_norm": 0.2771456339595817, + "learning_rate": 0.00014560542356533142, + "loss": 1.1049, + "step": 7981 + }, + { + "epoch": 0.76, + "grad_norm": 0.294782138767455, + "learning_rate": 0.00014559134440762108, + "loss": 1.13, + "step": 7982 + }, + { + "epoch": 0.76, + "grad_norm": 0.26345886128180046, + "learning_rate": 0.0001455772641089587, + "loss": 1.1155, + "step": 7983 + }, + { + "epoch": 0.76, + "grad_norm": 0.2829069470535079, + "learning_rate": 0.00014556318266969656, + "loss": 1.0918, + "step": 7984 + }, + { + "epoch": 0.76, + "grad_norm": 0.28542191746861517, + "learning_rate": 0.00014554910009018722, + "loss": 1.0391, + "step": 7985 + }, + { + "epoch": 0.76, + "grad_norm": 0.2538797891395937, + "learning_rate": 0.000145535016370783, + "loss": 0.9585, + "step": 7986 + }, + { + "epoch": 0.76, + "grad_norm": 0.26884615169189024, + "learning_rate": 0.0001455209315118364, + "loss": 1.0022, + "step": 7987 + }, + { + "epoch": 0.76, + "grad_norm": 0.2716483059032336, + "learning_rate": 0.00014550684551369985, + "loss": 1.0702, + "step": 7988 + }, + { + "epoch": 0.76, + "grad_norm": 0.27843349775471815, + "learning_rate": 0.00014549275837672586, + "loss": 1.1401, + "step": 7989 + }, + { + "epoch": 0.76, + "grad_norm": 0.2518481347111056, + "learning_rate": 0.00014547867010126706, + "loss": 1.0111, + "step": 7990 + }, + { + "epoch": 0.76, + "grad_norm": 0.3216392004906807, + "learning_rate": 0.00014546458068767594, + "loss": 1.0472, + "step": 7991 + }, + { + "epoch": 0.76, + "grad_norm": 0.28909463374066535, + "learning_rate": 0.00014545049013630512, + "loss": 1.0201, + "step": 7992 + }, + { + "epoch": 0.76, + "grad_norm": 0.31788442875623607, + "learning_rate": 0.0001454363984475072, + "loss": 1.0955, + "step": 7993 + }, + { + "epoch": 0.76, + "grad_norm": 0.2758373823123496, + "learning_rate": 0.00014542230562163488, + "loss": 1.0873, + "step": 7994 + }, + { + "epoch": 0.76, + "grad_norm": 0.2818308726562138, + "learning_rate": 0.0001454082116590408, + "loss": 1.2114, + "step": 7995 + }, + { + "epoch": 0.76, + "grad_norm": 0.2641986472531651, + "learning_rate": 0.0001453941165600777, + "loss": 1.1287, + "step": 7996 + }, + { + "epoch": 0.77, + "grad_norm": 0.27941463601912914, + "learning_rate": 0.0001453800203250983, + "loss": 1.0574, + "step": 7997 + }, + { + "epoch": 0.77, + "grad_norm": 0.2756789148686793, + "learning_rate": 0.00014536592295445532, + "loss": 0.9556, + "step": 7998 + }, + { + "epoch": 0.77, + "grad_norm": 0.27533489457097576, + "learning_rate": 0.00014535182444850165, + "loss": 1.1021, + "step": 7999 + }, + { + "epoch": 0.77, + "grad_norm": 0.2964512342749194, + "learning_rate": 0.00014533772480759008, + "loss": 1.0421, + "step": 8000 + }, + { + "epoch": 0.77, + "grad_norm": 0.26212904131408093, + "learning_rate": 0.00014532362403207346, + "loss": 1.0015, + "step": 8001 + }, + { + "epoch": 0.77, + "grad_norm": 0.2949190595576002, + "learning_rate": 0.00014530952212230463, + "loss": 1.0511, + "step": 8002 + }, + { + "epoch": 0.77, + "grad_norm": 0.3066540690434103, + "learning_rate": 0.00014529541907863655, + "loss": 1.0643, + "step": 8003 + }, + { + "epoch": 0.77, + "grad_norm": 0.27584654577393586, + "learning_rate": 0.00014528131490142217, + "loss": 0.9773, + "step": 8004 + }, + { + "epoch": 0.77, + "grad_norm": 0.27774031420621953, + "learning_rate": 0.00014526720959101436, + "loss": 0.9925, + "step": 8005 + }, + { + "epoch": 0.77, + "grad_norm": 0.2512278740693556, + "learning_rate": 0.00014525310314776623, + "loss": 1.0008, + "step": 8006 + }, + { + "epoch": 0.77, + "grad_norm": 0.2630439424936692, + "learning_rate": 0.00014523899557203075, + "loss": 1.1098, + "step": 8007 + }, + { + "epoch": 0.77, + "grad_norm": 0.27426353058067837, + "learning_rate": 0.00014522488686416097, + "loss": 1.0728, + "step": 8008 + }, + { + "epoch": 0.77, + "grad_norm": 0.2763756735231871, + "learning_rate": 0.00014521077702450995, + "loss": 1.0639, + "step": 8009 + }, + { + "epoch": 0.77, + "grad_norm": 0.3325734526066199, + "learning_rate": 0.00014519666605343083, + "loss": 1.0027, + "step": 8010 + }, + { + "epoch": 0.77, + "grad_norm": 0.26469084339952254, + "learning_rate": 0.00014518255395127677, + "loss": 1.0773, + "step": 8011 + }, + { + "epoch": 0.77, + "grad_norm": 0.2755841427968526, + "learning_rate": 0.00014516844071840086, + "loss": 1.118, + "step": 8012 + }, + { + "epoch": 0.77, + "grad_norm": 0.2970488079219033, + "learning_rate": 0.00014515432635515635, + "loss": 1.0362, + "step": 8013 + }, + { + "epoch": 0.77, + "grad_norm": 0.2954323242845108, + "learning_rate": 0.00014514021086189645, + "loss": 1.0746, + "step": 8014 + }, + { + "epoch": 0.77, + "grad_norm": 0.2617764506782068, + "learning_rate": 0.00014512609423897438, + "loss": 1.0385, + "step": 8015 + }, + { + "epoch": 0.77, + "grad_norm": 0.30127357515303343, + "learning_rate": 0.00014511197648674348, + "loss": 1.0966, + "step": 8016 + }, + { + "epoch": 0.77, + "grad_norm": 0.3004406925080267, + "learning_rate": 0.00014509785760555697, + "loss": 1.0441, + "step": 8017 + }, + { + "epoch": 0.77, + "grad_norm": 0.25228201013427193, + "learning_rate": 0.00014508373759576824, + "loss": 1.0692, + "step": 8018 + }, + { + "epoch": 0.77, + "grad_norm": 0.3052520210235085, + "learning_rate": 0.00014506961645773068, + "loss": 1.1513, + "step": 8019 + }, + { + "epoch": 0.77, + "grad_norm": 0.2709522809700183, + "learning_rate": 0.00014505549419179765, + "loss": 0.991, + "step": 8020 + }, + { + "epoch": 0.77, + "grad_norm": 0.26525464351023736, + "learning_rate": 0.00014504137079832252, + "loss": 0.8964, + "step": 8021 + }, + { + "epoch": 0.77, + "grad_norm": 0.2877907446668608, + "learning_rate": 0.00014502724627765877, + "loss": 1.0461, + "step": 8022 + }, + { + "epoch": 0.77, + "grad_norm": 0.29764029690496574, + "learning_rate": 0.00014501312063015993, + "loss": 1.0284, + "step": 8023 + }, + { + "epoch": 0.77, + "grad_norm": 0.2883370401645907, + "learning_rate": 0.00014499899385617943, + "loss": 1.0268, + "step": 8024 + }, + { + "epoch": 0.77, + "grad_norm": 0.2563877310771201, + "learning_rate": 0.0001449848659560708, + "loss": 1.1029, + "step": 8025 + }, + { + "epoch": 0.77, + "grad_norm": 0.29710595080293883, + "learning_rate": 0.00014497073693018768, + "loss": 1.1315, + "step": 8026 + }, + { + "epoch": 0.77, + "grad_norm": 0.2559529397106755, + "learning_rate": 0.00014495660677888358, + "loss": 1.009, + "step": 8027 + }, + { + "epoch": 0.77, + "grad_norm": 0.29014013639604846, + "learning_rate": 0.00014494247550251213, + "loss": 1.0681, + "step": 8028 + }, + { + "epoch": 0.77, + "grad_norm": 0.2875485426750669, + "learning_rate": 0.00014492834310142702, + "loss": 1.1195, + "step": 8029 + }, + { + "epoch": 0.77, + "grad_norm": 0.286722769874282, + "learning_rate": 0.00014491420957598184, + "loss": 1.0529, + "step": 8030 + }, + { + "epoch": 0.77, + "grad_norm": 0.2674676429533566, + "learning_rate": 0.0001449000749265304, + "loss": 1.0475, + "step": 8031 + }, + { + "epoch": 0.77, + "grad_norm": 0.29840922095847167, + "learning_rate": 0.00014488593915342628, + "loss": 1.123, + "step": 8032 + }, + { + "epoch": 0.77, + "grad_norm": 0.29901175240287103, + "learning_rate": 0.0001448718022570234, + "loss": 1.148, + "step": 8033 + }, + { + "epoch": 0.77, + "grad_norm": 0.2915382990881953, + "learning_rate": 0.00014485766423767544, + "loss": 1.0401, + "step": 8034 + }, + { + "epoch": 0.77, + "grad_norm": 0.26281760733012527, + "learning_rate": 0.00014484352509573626, + "loss": 1.1292, + "step": 8035 + }, + { + "epoch": 0.77, + "grad_norm": 0.2756692483739733, + "learning_rate": 0.00014482938483155965, + "loss": 1.0544, + "step": 8036 + }, + { + "epoch": 0.77, + "grad_norm": 0.28042375304828776, + "learning_rate": 0.00014481524344549953, + "loss": 1.1094, + "step": 8037 + }, + { + "epoch": 0.77, + "grad_norm": 0.2666048364114282, + "learning_rate": 0.00014480110093790976, + "loss": 1.0748, + "step": 8038 + }, + { + "epoch": 0.77, + "grad_norm": 0.5049686356926107, + "learning_rate": 0.0001447869573091443, + "loss": 1.0392, + "step": 8039 + }, + { + "epoch": 0.77, + "grad_norm": 0.2898604153674101, + "learning_rate": 0.0001447728125595571, + "loss": 1.0527, + "step": 8040 + }, + { + "epoch": 0.77, + "grad_norm": 0.2938423642503213, + "learning_rate": 0.0001447586666895021, + "loss": 1.0593, + "step": 8041 + }, + { + "epoch": 0.77, + "grad_norm": 0.3199560544400549, + "learning_rate": 0.00014474451969933333, + "loss": 1.1253, + "step": 8042 + }, + { + "epoch": 0.77, + "grad_norm": 0.279058551647216, + "learning_rate": 0.00014473037158940484, + "loss": 1.0904, + "step": 8043 + }, + { + "epoch": 0.77, + "grad_norm": 0.3317633487011477, + "learning_rate": 0.0001447162223600707, + "loss": 1.0621, + "step": 8044 + }, + { + "epoch": 0.77, + "grad_norm": 0.31113174621821804, + "learning_rate": 0.00014470207201168497, + "loss": 1.1192, + "step": 8045 + }, + { + "epoch": 0.77, + "grad_norm": 0.2732527847810805, + "learning_rate": 0.00014468792054460184, + "loss": 1.0248, + "step": 8046 + }, + { + "epoch": 0.77, + "grad_norm": 0.29153439327487507, + "learning_rate": 0.00014467376795917537, + "loss": 1.124, + "step": 8047 + }, + { + "epoch": 0.77, + "grad_norm": 0.2860523862518081, + "learning_rate": 0.0001446596142557598, + "loss": 1.0621, + "step": 8048 + }, + { + "epoch": 0.77, + "grad_norm": 0.2862949081963216, + "learning_rate": 0.00014464545943470932, + "loss": 1.0017, + "step": 8049 + }, + { + "epoch": 0.77, + "grad_norm": 0.2968576640237807, + "learning_rate": 0.00014463130349637814, + "loss": 1.0521, + "step": 8050 + }, + { + "epoch": 0.77, + "grad_norm": 0.3203158707213885, + "learning_rate": 0.00014461714644112053, + "loss": 1.0283, + "step": 8051 + }, + { + "epoch": 0.77, + "grad_norm": 0.23962634495433535, + "learning_rate": 0.0001446029882692908, + "loss": 1.0052, + "step": 8052 + }, + { + "epoch": 0.77, + "grad_norm": 0.29338455713698386, + "learning_rate": 0.0001445888289812433, + "loss": 1.1304, + "step": 8053 + }, + { + "epoch": 0.77, + "grad_norm": 0.2650099607677807, + "learning_rate": 0.0001445746685773323, + "loss": 0.9916, + "step": 8054 + }, + { + "epoch": 0.77, + "grad_norm": 0.26198071838847975, + "learning_rate": 0.00014456050705791216, + "loss": 1.0875, + "step": 8055 + }, + { + "epoch": 0.77, + "grad_norm": 0.292914913997817, + "learning_rate": 0.00014454634442333738, + "loss": 1.0666, + "step": 8056 + }, + { + "epoch": 0.77, + "grad_norm": 0.3015285693686079, + "learning_rate": 0.00014453218067396231, + "loss": 1.1363, + "step": 8057 + }, + { + "epoch": 0.77, + "grad_norm": 0.30434995117078845, + "learning_rate": 0.00014451801581014147, + "loss": 1.0074, + "step": 8058 + }, + { + "epoch": 0.77, + "grad_norm": 0.30572536430411557, + "learning_rate": 0.00014450384983222926, + "loss": 1.0941, + "step": 8059 + }, + { + "epoch": 0.77, + "grad_norm": 0.2665325169928879, + "learning_rate": 0.00014448968274058025, + "loss": 1.0584, + "step": 8060 + }, + { + "epoch": 0.77, + "grad_norm": 0.291238529425458, + "learning_rate": 0.000144475514535549, + "loss": 0.9758, + "step": 8061 + }, + { + "epoch": 0.77, + "grad_norm": 0.320923065004721, + "learning_rate": 0.00014446134521749, + "loss": 0.9887, + "step": 8062 + }, + { + "epoch": 0.77, + "grad_norm": 0.2695062501972563, + "learning_rate": 0.00014444717478675792, + "loss": 1.063, + "step": 8063 + }, + { + "epoch": 0.77, + "grad_norm": 0.29036294842452276, + "learning_rate": 0.00014443300324370738, + "loss": 1.0825, + "step": 8064 + }, + { + "epoch": 0.77, + "grad_norm": 0.2667481516709462, + "learning_rate": 0.00014441883058869298, + "loss": 1.0139, + "step": 8065 + }, + { + "epoch": 0.77, + "grad_norm": 0.31330268446835163, + "learning_rate": 0.00014440465682206944, + "loss": 1.1519, + "step": 8066 + }, + { + "epoch": 0.77, + "grad_norm": 0.29426667401104506, + "learning_rate": 0.00014439048194419141, + "loss": 1.001, + "step": 8067 + }, + { + "epoch": 0.77, + "grad_norm": 0.30892190035705414, + "learning_rate": 0.00014437630595541374, + "loss": 1.0714, + "step": 8068 + }, + { + "epoch": 0.77, + "grad_norm": 0.28625371937125405, + "learning_rate": 0.00014436212885609106, + "loss": 1.1328, + "step": 8069 + }, + { + "epoch": 0.77, + "grad_norm": 0.2904730209899161, + "learning_rate": 0.00014434795064657827, + "loss": 0.9807, + "step": 8070 + }, + { + "epoch": 0.77, + "grad_norm": 0.29946573442663216, + "learning_rate": 0.0001443337713272301, + "loss": 1.108, + "step": 8071 + }, + { + "epoch": 0.77, + "grad_norm": 0.25138650851267935, + "learning_rate": 0.0001443195908984015, + "loss": 0.9354, + "step": 8072 + }, + { + "epoch": 0.77, + "grad_norm": 0.3046129695319591, + "learning_rate": 0.00014430540936044724, + "loss": 1.0464, + "step": 8073 + }, + { + "epoch": 0.77, + "grad_norm": 0.2815772869608019, + "learning_rate": 0.0001442912267137223, + "loss": 1.1689, + "step": 8074 + }, + { + "epoch": 0.77, + "grad_norm": 0.28788534180895897, + "learning_rate": 0.00014427704295858154, + "loss": 1.1042, + "step": 8075 + }, + { + "epoch": 0.77, + "grad_norm": 0.28620999022686416, + "learning_rate": 0.00014426285809537997, + "loss": 1.0644, + "step": 8076 + }, + { + "epoch": 0.77, + "grad_norm": 0.2718948107560663, + "learning_rate": 0.00014424867212447254, + "loss": 1.0555, + "step": 8077 + }, + { + "epoch": 0.77, + "grad_norm": 0.2745313908908246, + "learning_rate": 0.0001442344850462143, + "loss": 0.953, + "step": 8078 + }, + { + "epoch": 0.77, + "grad_norm": 0.2782122491457099, + "learning_rate": 0.0001442202968609603, + "loss": 1.098, + "step": 8079 + }, + { + "epoch": 0.77, + "grad_norm": 0.27832697424885927, + "learning_rate": 0.00014420610756906552, + "loss": 0.9896, + "step": 8080 + }, + { + "epoch": 0.77, + "grad_norm": 0.2850630781424915, + "learning_rate": 0.00014419191717088517, + "loss": 1.0579, + "step": 8081 + }, + { + "epoch": 0.77, + "grad_norm": 0.30698153794733524, + "learning_rate": 0.00014417772566677428, + "loss": 0.9522, + "step": 8082 + }, + { + "epoch": 0.77, + "grad_norm": 0.2412252387912062, + "learning_rate": 0.00014416353305708802, + "loss": 0.963, + "step": 8083 + }, + { + "epoch": 0.77, + "grad_norm": 0.2760933371029509, + "learning_rate": 0.00014414933934218165, + "loss": 1.039, + "step": 8084 + }, + { + "epoch": 0.77, + "grad_norm": 0.27478977307311525, + "learning_rate": 0.0001441351445224103, + "loss": 1.0191, + "step": 8085 + }, + { + "epoch": 0.77, + "grad_norm": 0.27460908719231536, + "learning_rate": 0.0001441209485981292, + "loss": 0.942, + "step": 8086 + }, + { + "epoch": 0.77, + "grad_norm": 0.300097647062556, + "learning_rate": 0.00014410675156969362, + "loss": 1.1083, + "step": 8087 + }, + { + "epoch": 0.77, + "grad_norm": 0.301434940986813, + "learning_rate": 0.0001440925534374589, + "loss": 1.1201, + "step": 8088 + }, + { + "epoch": 0.77, + "grad_norm": 0.34304082819084863, + "learning_rate": 0.00014407835420178028, + "loss": 1.2234, + "step": 8089 + }, + { + "epoch": 0.77, + "grad_norm": 0.30266233132954434, + "learning_rate": 0.00014406415386301319, + "loss": 1.0731, + "step": 8090 + }, + { + "epoch": 0.77, + "grad_norm": 0.29052074003899026, + "learning_rate": 0.00014404995242151293, + "loss": 1.0351, + "step": 8091 + }, + { + "epoch": 0.77, + "grad_norm": 0.31236829856827036, + "learning_rate": 0.00014403574987763493, + "loss": 1.1238, + "step": 8092 + }, + { + "epoch": 0.77, + "grad_norm": 0.27979550559379457, + "learning_rate": 0.0001440215462317346, + "loss": 1.142, + "step": 8093 + }, + { + "epoch": 0.77, + "grad_norm": 0.303167287374647, + "learning_rate": 0.00014400734148416742, + "loss": 1.1546, + "step": 8094 + }, + { + "epoch": 0.77, + "grad_norm": 0.2722206268311316, + "learning_rate": 0.00014399313563528886, + "loss": 1.0722, + "step": 8095 + }, + { + "epoch": 0.77, + "grad_norm": 0.29950103549697416, + "learning_rate": 0.00014397892868545442, + "loss": 1.0869, + "step": 8096 + }, + { + "epoch": 0.77, + "grad_norm": 0.29083600055293385, + "learning_rate": 0.00014396472063501968, + "loss": 1.0576, + "step": 8097 + }, + { + "epoch": 0.77, + "grad_norm": 0.2883189452608079, + "learning_rate": 0.00014395051148434015, + "loss": 0.9811, + "step": 8098 + }, + { + "epoch": 0.77, + "grad_norm": 0.2614201885833928, + "learning_rate": 0.0001439363012337715, + "loss": 1.0721, + "step": 8099 + }, + { + "epoch": 0.77, + "grad_norm": 0.27854811920048617, + "learning_rate": 0.00014392208988366921, + "loss": 1.0133, + "step": 8100 + }, + { + "epoch": 0.78, + "grad_norm": 0.33867360910508465, + "learning_rate": 0.00014390787743438907, + "loss": 1.0265, + "step": 8101 + }, + { + "epoch": 0.78, + "grad_norm": 0.27479933367603354, + "learning_rate": 0.0001438936638862867, + "loss": 0.9831, + "step": 8102 + }, + { + "epoch": 0.78, + "grad_norm": 0.3270439622691649, + "learning_rate": 0.00014387944923971782, + "loss": 1.1035, + "step": 8103 + }, + { + "epoch": 0.78, + "grad_norm": 0.29562739459578125, + "learning_rate": 0.0001438652334950381, + "loss": 1.0352, + "step": 8104 + }, + { + "epoch": 0.78, + "grad_norm": 0.2771019668036566, + "learning_rate": 0.00014385101665260338, + "loss": 1.0598, + "step": 8105 + }, + { + "epoch": 0.78, + "grad_norm": 0.26504488308797247, + "learning_rate": 0.0001438367987127694, + "loss": 1.1604, + "step": 8106 + }, + { + "epoch": 0.78, + "grad_norm": 0.3218630057212102, + "learning_rate": 0.000143822579675892, + "loss": 1.0142, + "step": 8107 + }, + { + "epoch": 0.78, + "grad_norm": 0.28024189474691924, + "learning_rate": 0.00014380835954232697, + "loss": 1.0721, + "step": 8108 + }, + { + "epoch": 0.78, + "grad_norm": 0.2852976599263691, + "learning_rate": 0.00014379413831243026, + "loss": 0.9966, + "step": 8109 + }, + { + "epoch": 0.78, + "grad_norm": 0.2904612714774512, + "learning_rate": 0.00014377991598655765, + "loss": 1.0675, + "step": 8110 + }, + { + "epoch": 0.78, + "grad_norm": 0.26588060857502965, + "learning_rate": 0.00014376569256506516, + "loss": 1.0215, + "step": 8111 + }, + { + "epoch": 0.78, + "grad_norm": 0.28963536427741476, + "learning_rate": 0.0001437514680483087, + "loss": 0.9806, + "step": 8112 + }, + { + "epoch": 0.78, + "grad_norm": 0.2669044448329335, + "learning_rate": 0.00014373724243664423, + "loss": 1.1047, + "step": 8113 + }, + { + "epoch": 0.78, + "grad_norm": 0.2904302678513009, + "learning_rate": 0.00014372301573042782, + "loss": 1.0147, + "step": 8114 + }, + { + "epoch": 0.78, + "grad_norm": 0.25491810409780064, + "learning_rate": 0.00014370878793001546, + "loss": 1.0453, + "step": 8115 + }, + { + "epoch": 0.78, + "grad_norm": 0.2592085806502541, + "learning_rate": 0.0001436945590357632, + "loss": 1.0276, + "step": 8116 + }, + { + "epoch": 0.78, + "grad_norm": 0.2821215453803441, + "learning_rate": 0.00014368032904802714, + "loss": 1.0488, + "step": 8117 + }, + { + "epoch": 0.78, + "grad_norm": 0.2615363952942239, + "learning_rate": 0.00014366609796716338, + "loss": 1.1121, + "step": 8118 + }, + { + "epoch": 0.78, + "grad_norm": 0.31736889976384736, + "learning_rate": 0.0001436518657935281, + "loss": 1.1233, + "step": 8119 + }, + { + "epoch": 0.78, + "grad_norm": 0.2598336195194818, + "learning_rate": 0.00014363763252747745, + "loss": 1.0805, + "step": 8120 + }, + { + "epoch": 0.78, + "grad_norm": 0.3284573374828336, + "learning_rate": 0.0001436233981693676, + "loss": 1.0575, + "step": 8121 + }, + { + "epoch": 0.78, + "grad_norm": 0.2908417184290244, + "learning_rate": 0.00014360916271955482, + "loss": 1.0383, + "step": 8122 + }, + { + "epoch": 0.78, + "grad_norm": 0.28781407930130587, + "learning_rate": 0.0001435949261783953, + "loss": 0.9912, + "step": 8123 + }, + { + "epoch": 0.78, + "grad_norm": 0.2929647743747517, + "learning_rate": 0.0001435806885462454, + "loss": 1.0553, + "step": 8124 + }, + { + "epoch": 0.78, + "grad_norm": 0.2753194722230229, + "learning_rate": 0.00014356644982346133, + "loss": 0.9769, + "step": 8125 + }, + { + "epoch": 0.78, + "grad_norm": 0.3013577274487869, + "learning_rate": 0.0001435522100103995, + "loss": 1.0244, + "step": 8126 + }, + { + "epoch": 0.78, + "grad_norm": 0.2901508132919651, + "learning_rate": 0.00014353796910741623, + "loss": 1.011, + "step": 8127 + }, + { + "epoch": 0.78, + "grad_norm": 0.29254447956246066, + "learning_rate": 0.0001435237271148679, + "loss": 1.0236, + "step": 8128 + }, + { + "epoch": 0.78, + "grad_norm": 0.3042957195012631, + "learning_rate": 0.000143509484033111, + "loss": 1.0665, + "step": 8129 + }, + { + "epoch": 0.78, + "grad_norm": 0.29428251771956543, + "learning_rate": 0.0001434952398625019, + "loss": 1.065, + "step": 8130 + }, + { + "epoch": 0.78, + "grad_norm": 0.2734033579364972, + "learning_rate": 0.00014348099460339707, + "loss": 1.001, + "step": 8131 + }, + { + "epoch": 0.78, + "grad_norm": 0.3303462730760466, + "learning_rate": 0.00014346674825615303, + "loss": 1.0916, + "step": 8132 + }, + { + "epoch": 0.78, + "grad_norm": 0.27668942643869243, + "learning_rate": 0.0001434525008211263, + "loss": 1.0658, + "step": 8133 + }, + { + "epoch": 0.78, + "grad_norm": 0.2953192209869138, + "learning_rate": 0.00014343825229867343, + "loss": 1.0055, + "step": 8134 + }, + { + "epoch": 0.78, + "grad_norm": 0.30590123544431624, + "learning_rate": 0.00014342400268915097, + "loss": 1.1254, + "step": 8135 + }, + { + "epoch": 0.78, + "grad_norm": 0.289254633390817, + "learning_rate": 0.00014340975199291558, + "loss": 0.8961, + "step": 8136 + }, + { + "epoch": 0.78, + "grad_norm": 0.2554256044992401, + "learning_rate": 0.00014339550021032384, + "loss": 1.0181, + "step": 8137 + }, + { + "epoch": 0.78, + "grad_norm": 0.3141083587137128, + "learning_rate": 0.00014338124734173245, + "loss": 1.044, + "step": 8138 + }, + { + "epoch": 0.78, + "grad_norm": 0.2569367079798127, + "learning_rate": 0.0001433669933874981, + "loss": 1.1267, + "step": 8139 + }, + { + "epoch": 0.78, + "grad_norm": 0.2678458091756298, + "learning_rate": 0.00014335273834797745, + "loss": 1.1156, + "step": 8140 + }, + { + "epoch": 0.78, + "grad_norm": 0.29169647370051666, + "learning_rate": 0.0001433384822235273, + "loss": 1.0323, + "step": 8141 + }, + { + "epoch": 0.78, + "grad_norm": 0.2742299999761089, + "learning_rate": 0.0001433242250145044, + "loss": 1.0005, + "step": 8142 + }, + { + "epoch": 0.78, + "grad_norm": 0.2685403344557798, + "learning_rate": 0.00014330996672126553, + "loss": 0.9437, + "step": 8143 + }, + { + "epoch": 0.78, + "grad_norm": 0.29515305992400265, + "learning_rate": 0.0001432957073441675, + "loss": 1.1712, + "step": 8144 + }, + { + "epoch": 0.78, + "grad_norm": 0.2932051822985775, + "learning_rate": 0.00014328144688356722, + "loss": 1.083, + "step": 8145 + }, + { + "epoch": 0.78, + "grad_norm": 0.28221973458585115, + "learning_rate": 0.00014326718533982154, + "loss": 1.0657, + "step": 8146 + }, + { + "epoch": 0.78, + "grad_norm": 0.2528985840125888, + "learning_rate": 0.00014325292271328733, + "loss": 1.1343, + "step": 8147 + }, + { + "epoch": 0.78, + "grad_norm": 0.3099604595171793, + "learning_rate": 0.00014323865900432153, + "loss": 1.1354, + "step": 8148 + }, + { + "epoch": 0.78, + "grad_norm": 0.28724423689538947, + "learning_rate": 0.00014322439421328114, + "loss": 1.0258, + "step": 8149 + }, + { + "epoch": 0.78, + "grad_norm": 0.29908396514945557, + "learning_rate": 0.0001432101283405231, + "loss": 1.0992, + "step": 8150 + }, + { + "epoch": 0.78, + "grad_norm": 0.25176928414336436, + "learning_rate": 0.00014319586138640447, + "loss": 1.127, + "step": 8151 + }, + { + "epoch": 0.78, + "grad_norm": 0.27010420300606885, + "learning_rate": 0.00014318159335128226, + "loss": 1.112, + "step": 8152 + }, + { + "epoch": 0.78, + "grad_norm": 0.27285928105597573, + "learning_rate": 0.0001431673242355135, + "loss": 1.0926, + "step": 8153 + }, + { + "epoch": 0.78, + "grad_norm": 0.2606896768983877, + "learning_rate": 0.00014315305403945534, + "loss": 1.0197, + "step": 8154 + }, + { + "epoch": 0.78, + "grad_norm": 0.2828962978680743, + "learning_rate": 0.0001431387827634649, + "loss": 1.1644, + "step": 8155 + }, + { + "epoch": 0.78, + "grad_norm": 0.30392229600605375, + "learning_rate": 0.00014312451040789928, + "loss": 1.0046, + "step": 8156 + }, + { + "epoch": 0.78, + "grad_norm": 0.22992598628886599, + "learning_rate": 0.0001431102369731157, + "loss": 1.1334, + "step": 8157 + }, + { + "epoch": 0.78, + "grad_norm": 0.27602772065447234, + "learning_rate": 0.00014309596245947134, + "loss": 1.1262, + "step": 8158 + }, + { + "epoch": 0.78, + "grad_norm": 0.25737443548054917, + "learning_rate": 0.00014308168686732344, + "loss": 0.9536, + "step": 8159 + }, + { + "epoch": 0.78, + "grad_norm": 0.2674977275846256, + "learning_rate": 0.00014306741019702926, + "loss": 1.1088, + "step": 8160 + }, + { + "epoch": 0.78, + "grad_norm": 0.26488508791506327, + "learning_rate": 0.00014305313244894604, + "loss": 1.0541, + "step": 8161 + }, + { + "epoch": 0.78, + "grad_norm": 0.2689193524609303, + "learning_rate": 0.00014303885362343115, + "loss": 1.1577, + "step": 8162 + }, + { + "epoch": 0.78, + "grad_norm": 0.2879200289332181, + "learning_rate": 0.00014302457372084192, + "loss": 1.1757, + "step": 8163 + }, + { + "epoch": 0.78, + "grad_norm": 0.2867128935586349, + "learning_rate": 0.00014301029274153563, + "loss": 1.0072, + "step": 8164 + }, + { + "epoch": 0.78, + "grad_norm": 0.24341160678709342, + "learning_rate": 0.00014299601068586978, + "loss": 1.0225, + "step": 8165 + }, + { + "epoch": 0.78, + "grad_norm": 0.2762861135172704, + "learning_rate": 0.00014298172755420173, + "loss": 1.1304, + "step": 8166 + }, + { + "epoch": 0.78, + "grad_norm": 0.2769763284115676, + "learning_rate": 0.00014296744334688893, + "loss": 1.0157, + "step": 8167 + }, + { + "epoch": 0.78, + "grad_norm": 0.26734305962867105, + "learning_rate": 0.0001429531580642889, + "loss": 1.1552, + "step": 8168 + }, + { + "epoch": 0.78, + "grad_norm": 0.26480907501831297, + "learning_rate": 0.00014293887170675903, + "loss": 0.9893, + "step": 8169 + }, + { + "epoch": 0.78, + "grad_norm": 0.29281043302440196, + "learning_rate": 0.00014292458427465695, + "loss": 0.9863, + "step": 8170 + }, + { + "epoch": 0.78, + "grad_norm": 0.2888301828367874, + "learning_rate": 0.00014291029576834013, + "loss": 1.1172, + "step": 8171 + }, + { + "epoch": 0.78, + "grad_norm": 0.3039150196032684, + "learning_rate": 0.00014289600618816627, + "loss": 1.0797, + "step": 8172 + }, + { + "epoch": 0.78, + "grad_norm": 0.2841118966636821, + "learning_rate": 0.0001428817155344928, + "loss": 1.1855, + "step": 8173 + }, + { + "epoch": 0.78, + "grad_norm": 0.28079473707368974, + "learning_rate": 0.0001428674238076775, + "loss": 1.0272, + "step": 8174 + }, + { + "epoch": 0.78, + "grad_norm": 0.2862456351557848, + "learning_rate": 0.00014285313100807797, + "loss": 1.0838, + "step": 8175 + }, + { + "epoch": 0.78, + "grad_norm": 0.2602397115203145, + "learning_rate": 0.00014283883713605192, + "loss": 1.083, + "step": 8176 + }, + { + "epoch": 0.78, + "grad_norm": 0.30104111034073594, + "learning_rate": 0.00014282454219195702, + "loss": 1.0324, + "step": 8177 + }, + { + "epoch": 0.78, + "grad_norm": 0.24143983810247652, + "learning_rate": 0.00014281024617615105, + "loss": 1.0873, + "step": 8178 + }, + { + "epoch": 0.78, + "grad_norm": 0.2932603381628741, + "learning_rate": 0.00014279594908899175, + "loss": 1.017, + "step": 8179 + }, + { + "epoch": 0.78, + "grad_norm": 0.3006594349278346, + "learning_rate": 0.00014278165093083696, + "loss": 1.0753, + "step": 8180 + }, + { + "epoch": 0.78, + "grad_norm": 0.28022927773599904, + "learning_rate": 0.00014276735170204444, + "loss": 1.0561, + "step": 8181 + }, + { + "epoch": 0.78, + "grad_norm": 0.27844733340976957, + "learning_rate": 0.0001427530514029721, + "loss": 1.1647, + "step": 8182 + }, + { + "epoch": 0.78, + "grad_norm": 0.2571022498864135, + "learning_rate": 0.00014273875003397774, + "loss": 0.92, + "step": 8183 + }, + { + "epoch": 0.78, + "grad_norm": 0.2692587322659136, + "learning_rate": 0.0001427244475954193, + "loss": 1.0409, + "step": 8184 + }, + { + "epoch": 0.78, + "grad_norm": 0.2868029667790984, + "learning_rate": 0.00014271014408765472, + "loss": 0.9207, + "step": 8185 + }, + { + "epoch": 0.78, + "grad_norm": 0.3122721251692393, + "learning_rate": 0.00014269583951104196, + "loss": 1.0165, + "step": 8186 + }, + { + "epoch": 0.78, + "grad_norm": 0.240316991720147, + "learning_rate": 0.00014268153386593898, + "loss": 1.1213, + "step": 8187 + }, + { + "epoch": 0.78, + "grad_norm": 0.283970934006317, + "learning_rate": 0.00014266722715270376, + "loss": 1.1421, + "step": 8188 + }, + { + "epoch": 0.78, + "grad_norm": 0.2848531576221525, + "learning_rate": 0.0001426529193716944, + "loss": 1.0823, + "step": 8189 + }, + { + "epoch": 0.78, + "grad_norm": 0.2683429644781905, + "learning_rate": 0.0001426386105232689, + "loss": 1.0785, + "step": 8190 + }, + { + "epoch": 0.78, + "grad_norm": 0.2862688618200794, + "learning_rate": 0.00014262430060778538, + "loss": 1.0695, + "step": 8191 + }, + { + "epoch": 0.78, + "grad_norm": 0.26946047893455116, + "learning_rate": 0.00014260998962560195, + "loss": 0.9804, + "step": 8192 + }, + { + "epoch": 0.78, + "grad_norm": 0.3044969174847609, + "learning_rate": 0.00014259567757707675, + "loss": 1.0052, + "step": 8193 + }, + { + "epoch": 0.78, + "grad_norm": 0.27698543650468876, + "learning_rate": 0.00014258136446256795, + "loss": 0.9854, + "step": 8194 + }, + { + "epoch": 0.78, + "grad_norm": 0.2971561209977439, + "learning_rate": 0.00014256705028243375, + "loss": 1.0358, + "step": 8195 + }, + { + "epoch": 0.78, + "grad_norm": 0.2770540731397757, + "learning_rate": 0.00014255273503703238, + "loss": 1.0042, + "step": 8196 + }, + { + "epoch": 0.78, + "grad_norm": 0.28561457756539305, + "learning_rate": 0.00014253841872672202, + "loss": 1.1068, + "step": 8197 + }, + { + "epoch": 0.78, + "grad_norm": 0.2813535778333693, + "learning_rate": 0.00014252410135186103, + "loss": 1.1568, + "step": 8198 + }, + { + "epoch": 0.78, + "grad_norm": 0.3051058643154024, + "learning_rate": 0.00014250978291280766, + "loss": 1.0966, + "step": 8199 + }, + { + "epoch": 0.78, + "grad_norm": 0.2995687199626439, + "learning_rate": 0.00014249546340992027, + "loss": 0.9566, + "step": 8200 + }, + { + "epoch": 0.78, + "grad_norm": 0.25038359737911414, + "learning_rate": 0.0001424811428435572, + "loss": 0.9642, + "step": 8201 + }, + { + "epoch": 0.78, + "grad_norm": 0.2755441297303153, + "learning_rate": 0.00014246682121407686, + "loss": 0.9964, + "step": 8202 + }, + { + "epoch": 0.78, + "grad_norm": 0.28771744037000346, + "learning_rate": 0.0001424524985218376, + "loss": 1.1178, + "step": 8203 + }, + { + "epoch": 0.78, + "grad_norm": 0.26096812303528166, + "learning_rate": 0.00014243817476719789, + "loss": 1.0602, + "step": 8204 + }, + { + "epoch": 0.78, + "grad_norm": 0.2910822777089926, + "learning_rate": 0.00014242384995051617, + "loss": 1.1028, + "step": 8205 + }, + { + "epoch": 0.79, + "grad_norm": 0.2946214010693757, + "learning_rate": 0.000142409524072151, + "loss": 1.0582, + "step": 8206 + }, + { + "epoch": 0.79, + "grad_norm": 0.2782362063554336, + "learning_rate": 0.00014239519713246077, + "loss": 1.0472, + "step": 8207 + }, + { + "epoch": 0.79, + "grad_norm": 0.3096824457906856, + "learning_rate": 0.00014238086913180407, + "loss": 1.1764, + "step": 8208 + }, + { + "epoch": 0.79, + "grad_norm": 0.2705085127084864, + "learning_rate": 0.00014236654007053956, + "loss": 1.0761, + "step": 8209 + }, + { + "epoch": 0.79, + "grad_norm": 0.2746196439259208, + "learning_rate": 0.00014235220994902572, + "loss": 1.0842, + "step": 8210 + }, + { + "epoch": 0.79, + "grad_norm": 0.2818643359206753, + "learning_rate": 0.0001423378787676212, + "loss": 1.0669, + "step": 8211 + }, + { + "epoch": 0.79, + "grad_norm": 0.3016521365181387, + "learning_rate": 0.0001423235465266847, + "loss": 1.1674, + "step": 8212 + }, + { + "epoch": 0.79, + "grad_norm": 0.3097236944936014, + "learning_rate": 0.0001423092132265748, + "loss": 1.019, + "step": 8213 + }, + { + "epoch": 0.79, + "grad_norm": 0.25229115162540394, + "learning_rate": 0.00014229487886765026, + "loss": 1.1445, + "step": 8214 + }, + { + "epoch": 0.79, + "grad_norm": 0.27446316421143896, + "learning_rate": 0.0001422805434502698, + "loss": 1.1201, + "step": 8215 + }, + { + "epoch": 0.79, + "grad_norm": 0.265891783959781, + "learning_rate": 0.00014226620697479217, + "loss": 0.9653, + "step": 8216 + }, + { + "epoch": 0.79, + "grad_norm": 0.2858901663372903, + "learning_rate": 0.00014225186944157614, + "loss": 1.1012, + "step": 8217 + }, + { + "epoch": 0.79, + "grad_norm": 0.24368459446009416, + "learning_rate": 0.00014223753085098052, + "loss": 1.0438, + "step": 8218 + }, + { + "epoch": 0.79, + "grad_norm": 0.2658297205018217, + "learning_rate": 0.00014222319120336415, + "loss": 1.0923, + "step": 8219 + }, + { + "epoch": 0.79, + "grad_norm": 0.27643416390290804, + "learning_rate": 0.0001422088504990859, + "loss": 1.0201, + "step": 8220 + }, + { + "epoch": 0.79, + "grad_norm": 0.27516078932172305, + "learning_rate": 0.00014219450873850464, + "loss": 0.9544, + "step": 8221 + }, + { + "epoch": 0.79, + "grad_norm": 0.27601315150275746, + "learning_rate": 0.00014218016592197925, + "loss": 1.0343, + "step": 8222 + }, + { + "epoch": 0.79, + "grad_norm": 0.24192057612707965, + "learning_rate": 0.00014216582204986872, + "loss": 1.0806, + "step": 8223 + }, + { + "epoch": 0.79, + "grad_norm": 0.28077409693408506, + "learning_rate": 0.000142151477122532, + "loss": 0.8763, + "step": 8224 + }, + { + "epoch": 0.79, + "grad_norm": 0.27896630435292175, + "learning_rate": 0.00014213713114032803, + "loss": 1.0493, + "step": 8225 + }, + { + "epoch": 0.79, + "grad_norm": 0.2737335624375359, + "learning_rate": 0.0001421227841036159, + "loss": 1.0675, + "step": 8226 + }, + { + "epoch": 0.79, + "grad_norm": 0.3407111785009769, + "learning_rate": 0.00014210843601275466, + "loss": 1.0628, + "step": 8227 + }, + { + "epoch": 0.79, + "grad_norm": 0.2617914284389135, + "learning_rate": 0.00014209408686810329, + "loss": 1.0987, + "step": 8228 + }, + { + "epoch": 0.79, + "grad_norm": 0.3013100303523883, + "learning_rate": 0.00014207973667002097, + "loss": 0.9312, + "step": 8229 + }, + { + "epoch": 0.79, + "grad_norm": 0.2846920379517571, + "learning_rate": 0.00014206538541886677, + "loss": 1.0237, + "step": 8230 + }, + { + "epoch": 0.79, + "grad_norm": 0.30283490816190023, + "learning_rate": 0.0001420510331149999, + "loss": 1.0535, + "step": 8231 + }, + { + "epoch": 0.79, + "grad_norm": 0.24872498659564674, + "learning_rate": 0.00014203667975877946, + "loss": 1.057, + "step": 8232 + }, + { + "epoch": 0.79, + "grad_norm": 0.27630382799540015, + "learning_rate": 0.00014202232535056472, + "loss": 1.0121, + "step": 8233 + }, + { + "epoch": 0.79, + "grad_norm": 0.28477175943154037, + "learning_rate": 0.00014200796989071487, + "loss": 1.045, + "step": 8234 + }, + { + "epoch": 0.79, + "grad_norm": 0.2719512123825706, + "learning_rate": 0.00014199361337958915, + "loss": 1.075, + "step": 8235 + }, + { + "epoch": 0.79, + "grad_norm": 0.27304851623487275, + "learning_rate": 0.0001419792558175469, + "loss": 0.9973, + "step": 8236 + }, + { + "epoch": 0.79, + "grad_norm": 0.2749777621769799, + "learning_rate": 0.0001419648972049474, + "loss": 1.0225, + "step": 8237 + }, + { + "epoch": 0.79, + "grad_norm": 0.26176184932542573, + "learning_rate": 0.0001419505375421499, + "loss": 1.061, + "step": 8238 + }, + { + "epoch": 0.79, + "grad_norm": 0.2825657955231182, + "learning_rate": 0.0001419361768295139, + "loss": 1.039, + "step": 8239 + }, + { + "epoch": 0.79, + "grad_norm": 0.26238648793597996, + "learning_rate": 0.00014192181506739868, + "loss": 0.9806, + "step": 8240 + }, + { + "epoch": 0.79, + "grad_norm": 0.2596777187775208, + "learning_rate": 0.0001419074522561637, + "loss": 1.0234, + "step": 8241 + }, + { + "epoch": 0.79, + "grad_norm": 0.2883448002693972, + "learning_rate": 0.0001418930883961684, + "loss": 1.113, + "step": 8242 + }, + { + "epoch": 0.79, + "grad_norm": 0.31700421775940696, + "learning_rate": 0.00014187872348777223, + "loss": 0.9246, + "step": 8243 + }, + { + "epoch": 0.79, + "grad_norm": 0.2502687124845023, + "learning_rate": 0.00014186435753133468, + "loss": 0.9632, + "step": 8244 + }, + { + "epoch": 0.79, + "grad_norm": 0.2721584066704403, + "learning_rate": 0.00014184999052721528, + "loss": 1.0489, + "step": 8245 + }, + { + "epoch": 0.79, + "grad_norm": 0.2672093711111685, + "learning_rate": 0.00014183562247577358, + "loss": 1.029, + "step": 8246 + }, + { + "epoch": 0.79, + "grad_norm": 0.31595242015080555, + "learning_rate": 0.00014182125337736912, + "loss": 1.0484, + "step": 8247 + }, + { + "epoch": 0.79, + "grad_norm": 0.217568494553999, + "learning_rate": 0.0001418068832323615, + "loss": 0.8719, + "step": 8248 + }, + { + "epoch": 0.79, + "grad_norm": 0.27618501018822034, + "learning_rate": 0.00014179251204111037, + "loss": 0.9553, + "step": 8249 + }, + { + "epoch": 0.79, + "grad_norm": 0.28614850226175115, + "learning_rate": 0.00014177813980397535, + "loss": 1.0244, + "step": 8250 + }, + { + "epoch": 0.79, + "grad_norm": 0.2804392960583254, + "learning_rate": 0.00014176376652131614, + "loss": 1.117, + "step": 8251 + }, + { + "epoch": 0.79, + "grad_norm": 0.296161246928998, + "learning_rate": 0.0001417493921934924, + "loss": 0.9899, + "step": 8252 + }, + { + "epoch": 0.79, + "grad_norm": 0.27500753507752407, + "learning_rate": 0.00014173501682086389, + "loss": 1.0332, + "step": 8253 + }, + { + "epoch": 0.79, + "grad_norm": 0.27240297524975027, + "learning_rate": 0.00014172064040379037, + "loss": 1.1169, + "step": 8254 + }, + { + "epoch": 0.79, + "grad_norm": 0.2862065599891984, + "learning_rate": 0.00014170626294263158, + "loss": 0.9608, + "step": 8255 + }, + { + "epoch": 0.79, + "grad_norm": 0.29049647009040475, + "learning_rate": 0.00014169188443774737, + "loss": 1.0737, + "step": 8256 + }, + { + "epoch": 0.79, + "grad_norm": 0.30650214779789686, + "learning_rate": 0.00014167750488949753, + "loss": 1.1447, + "step": 8257 + }, + { + "epoch": 0.79, + "grad_norm": 0.25682892320136824, + "learning_rate": 0.00014166312429824196, + "loss": 1.1008, + "step": 8258 + }, + { + "epoch": 0.79, + "grad_norm": 0.289522043516876, + "learning_rate": 0.0001416487426643405, + "loss": 0.9996, + "step": 8259 + }, + { + "epoch": 0.79, + "grad_norm": 0.2612733947909529, + "learning_rate": 0.00014163435998815308, + "loss": 1.0454, + "step": 8260 + }, + { + "epoch": 0.79, + "grad_norm": 0.3017746734519574, + "learning_rate": 0.00014161997627003964, + "loss": 1.1057, + "step": 8261 + }, + { + "epoch": 0.79, + "grad_norm": 0.3026134541065002, + "learning_rate": 0.0001416055915103601, + "loss": 1.0563, + "step": 8262 + }, + { + "epoch": 0.79, + "grad_norm": 0.2921549638360822, + "learning_rate": 0.00014159120570947454, + "loss": 1.2412, + "step": 8263 + }, + { + "epoch": 0.79, + "grad_norm": 0.27462991805196874, + "learning_rate": 0.00014157681886774293, + "loss": 1.1579, + "step": 8264 + }, + { + "epoch": 0.79, + "grad_norm": 0.30353840216135913, + "learning_rate": 0.0001415624309855253, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 0.79, + "grad_norm": 0.2628785239134127, + "learning_rate": 0.00014154804206318165, + "loss": 1.0783, + "step": 8266 + }, + { + "epoch": 0.79, + "grad_norm": 0.279824355560124, + "learning_rate": 0.00014153365210107217, + "loss": 0.992, + "step": 8267 + }, + { + "epoch": 0.79, + "grad_norm": 0.25845741171089465, + "learning_rate": 0.00014151926109955696, + "loss": 0.9018, + "step": 8268 + }, + { + "epoch": 0.79, + "grad_norm": 0.2949235009130459, + "learning_rate": 0.0001415048690589961, + "loss": 1.1514, + "step": 8269 + }, + { + "epoch": 0.79, + "grad_norm": 0.3091426382260449, + "learning_rate": 0.00014149047597974984, + "loss": 1.128, + "step": 8270 + }, + { + "epoch": 0.79, + "grad_norm": 0.261254800590016, + "learning_rate": 0.00014147608186217836, + "loss": 1.0248, + "step": 8271 + }, + { + "epoch": 0.79, + "grad_norm": 0.28155062611196985, + "learning_rate": 0.0001414616867066418, + "loss": 1.1088, + "step": 8272 + }, + { + "epoch": 0.79, + "grad_norm": 0.2802398085480191, + "learning_rate": 0.00014144729051350055, + "loss": 1.0411, + "step": 8273 + }, + { + "epoch": 0.79, + "grad_norm": 0.28950599861246645, + "learning_rate": 0.00014143289328311478, + "loss": 1.1909, + "step": 8274 + }, + { + "epoch": 0.79, + "grad_norm": 0.3156838267805892, + "learning_rate": 0.0001414184950158448, + "loss": 1.0001, + "step": 8275 + }, + { + "epoch": 0.79, + "grad_norm": 0.2761437699110209, + "learning_rate": 0.00014140409571205095, + "loss": 1.0747, + "step": 8276 + }, + { + "epoch": 0.79, + "grad_norm": 0.2556738607507545, + "learning_rate": 0.00014138969537209358, + "loss": 1.1727, + "step": 8277 + }, + { + "epoch": 0.79, + "grad_norm": 0.3116455068455913, + "learning_rate": 0.0001413752939963331, + "loss": 1.0714, + "step": 8278 + }, + { + "epoch": 0.79, + "grad_norm": 0.3023777310208979, + "learning_rate": 0.00014136089158512985, + "loss": 1.1541, + "step": 8279 + }, + { + "epoch": 0.79, + "grad_norm": 0.29881211105271765, + "learning_rate": 0.00014134648813884433, + "loss": 1.1441, + "step": 8280 + }, + { + "epoch": 0.79, + "grad_norm": 0.2564362293165476, + "learning_rate": 0.00014133208365783693, + "loss": 1.2231, + "step": 8281 + }, + { + "epoch": 0.79, + "grad_norm": 0.25459170299471473, + "learning_rate": 0.00014131767814246817, + "loss": 1.0638, + "step": 8282 + }, + { + "epoch": 0.79, + "grad_norm": 0.3164293509155231, + "learning_rate": 0.00014130327159309853, + "loss": 1.0238, + "step": 8283 + }, + { + "epoch": 0.79, + "grad_norm": 0.3161096823663635, + "learning_rate": 0.0001412888640100886, + "loss": 1.0997, + "step": 8284 + }, + { + "epoch": 0.79, + "grad_norm": 0.2926302601160675, + "learning_rate": 0.00014127445539379886, + "loss": 1.0269, + "step": 8285 + }, + { + "epoch": 0.79, + "grad_norm": 0.3026119265810598, + "learning_rate": 0.00014126004574458996, + "loss": 1.0535, + "step": 8286 + }, + { + "epoch": 0.79, + "grad_norm": 0.31239765394033814, + "learning_rate": 0.00014124563506282247, + "loss": 1.1042, + "step": 8287 + }, + { + "epoch": 0.79, + "grad_norm": 0.2730026191662197, + "learning_rate": 0.00014123122334885706, + "loss": 1.0406, + "step": 8288 + }, + { + "epoch": 0.79, + "grad_norm": 0.2802587764341756, + "learning_rate": 0.00014121681060305435, + "loss": 1.0393, + "step": 8289 + }, + { + "epoch": 0.79, + "grad_norm": 0.2965134782106928, + "learning_rate": 0.00014120239682577506, + "loss": 1.1487, + "step": 8290 + }, + { + "epoch": 0.79, + "grad_norm": 0.3604661013234594, + "learning_rate": 0.0001411879820173799, + "loss": 1.0004, + "step": 8291 + }, + { + "epoch": 0.79, + "grad_norm": 0.2754245502665909, + "learning_rate": 0.0001411735661782296, + "loss": 0.9554, + "step": 8292 + }, + { + "epoch": 0.79, + "grad_norm": 0.24638138967620207, + "learning_rate": 0.00014115914930868493, + "loss": 1.1353, + "step": 8293 + }, + { + "epoch": 0.79, + "grad_norm": 0.30600504343829077, + "learning_rate": 0.00014114473140910668, + "loss": 1.1201, + "step": 8294 + }, + { + "epoch": 0.79, + "grad_norm": 0.3035845944676315, + "learning_rate": 0.00014113031247985566, + "loss": 1.0861, + "step": 8295 + }, + { + "epoch": 0.79, + "grad_norm": 0.3384934195188291, + "learning_rate": 0.00014111589252129272, + "loss": 1.1501, + "step": 8296 + }, + { + "epoch": 0.79, + "grad_norm": 0.2716845796991304, + "learning_rate": 0.00014110147153377874, + "loss": 1.0534, + "step": 8297 + }, + { + "epoch": 0.79, + "grad_norm": 0.3062067246424864, + "learning_rate": 0.0001410870495176746, + "loss": 1.1802, + "step": 8298 + }, + { + "epoch": 0.79, + "grad_norm": 0.2673331008196946, + "learning_rate": 0.0001410726264733412, + "loss": 1.0404, + "step": 8299 + }, + { + "epoch": 0.79, + "grad_norm": 0.2860719333532579, + "learning_rate": 0.00014105820240113955, + "loss": 1.1296, + "step": 8300 + }, + { + "epoch": 0.79, + "grad_norm": 0.2924049271387296, + "learning_rate": 0.0001410437773014306, + "loss": 1.0039, + "step": 8301 + }, + { + "epoch": 0.79, + "grad_norm": 0.32493379544572254, + "learning_rate": 0.00014102935117457524, + "loss": 0.9896, + "step": 8302 + }, + { + "epoch": 0.79, + "grad_norm": 0.308689353469358, + "learning_rate": 0.00014101492402093463, + "loss": 0.9233, + "step": 8303 + }, + { + "epoch": 0.79, + "grad_norm": 0.2713635705097109, + "learning_rate": 0.00014100049584086979, + "loss": 1.0322, + "step": 8304 + }, + { + "epoch": 0.79, + "grad_norm": 0.2853711362692117, + "learning_rate": 0.00014098606663474176, + "loss": 1.0401, + "step": 8305 + }, + { + "epoch": 0.79, + "grad_norm": 0.25822823305650644, + "learning_rate": 0.00014097163640291164, + "loss": 1.0283, + "step": 8306 + }, + { + "epoch": 0.79, + "grad_norm": 0.3343555252521715, + "learning_rate": 0.00014095720514574058, + "loss": 1.0707, + "step": 8307 + }, + { + "epoch": 0.79, + "grad_norm": 0.28266257518096666, + "learning_rate": 0.00014094277286358972, + "loss": 1.1636, + "step": 8308 + }, + { + "epoch": 0.79, + "grad_norm": 0.2649055365462347, + "learning_rate": 0.00014092833955682026, + "loss": 1.0662, + "step": 8309 + }, + { + "epoch": 0.8, + "grad_norm": 0.31075468177448395, + "learning_rate": 0.00014091390522579333, + "loss": 1.0101, + "step": 8310 + }, + { + "epoch": 0.8, + "grad_norm": 0.3011177221612655, + "learning_rate": 0.00014089946987087023, + "loss": 1.136, + "step": 8311 + }, + { + "epoch": 0.8, + "grad_norm": 0.29851607856732926, + "learning_rate": 0.00014088503349241223, + "loss": 1.1139, + "step": 8312 + }, + { + "epoch": 0.8, + "grad_norm": 0.29855074002538673, + "learning_rate": 0.00014087059609078052, + "loss": 1.0843, + "step": 8313 + }, + { + "epoch": 0.8, + "grad_norm": 0.2768067900609332, + "learning_rate": 0.0001408561576663365, + "loss": 1.0972, + "step": 8314 + }, + { + "epoch": 0.8, + "grad_norm": 0.2819211049236916, + "learning_rate": 0.00014084171821944144, + "loss": 1.1288, + "step": 8315 + }, + { + "epoch": 0.8, + "grad_norm": 0.28684815717935336, + "learning_rate": 0.00014082727775045667, + "loss": 1.1531, + "step": 8316 + }, + { + "epoch": 0.8, + "grad_norm": 0.29080240408238495, + "learning_rate": 0.00014081283625974367, + "loss": 1.1591, + "step": 8317 + }, + { + "epoch": 0.8, + "grad_norm": 0.29429553590146723, + "learning_rate": 0.0001407983937476638, + "loss": 1.1297, + "step": 8318 + }, + { + "epoch": 0.8, + "grad_norm": 0.30787470171045134, + "learning_rate": 0.00014078395021457845, + "loss": 1.2024, + "step": 8319 + }, + { + "epoch": 0.8, + "grad_norm": 0.27975097914183483, + "learning_rate": 0.0001407695056608491, + "loss": 0.9402, + "step": 8320 + }, + { + "epoch": 0.8, + "grad_norm": 0.2832395257132092, + "learning_rate": 0.0001407550600868373, + "loss": 1.0379, + "step": 8321 + }, + { + "epoch": 0.8, + "grad_norm": 0.29437871608032573, + "learning_rate": 0.00014074061349290447, + "loss": 1.0362, + "step": 8322 + }, + { + "epoch": 0.8, + "grad_norm": 0.269143724602035, + "learning_rate": 0.00014072616587941218, + "loss": 1.0537, + "step": 8323 + }, + { + "epoch": 0.8, + "grad_norm": 0.29710110944603785, + "learning_rate": 0.00014071171724672202, + "loss": 1.0264, + "step": 8324 + }, + { + "epoch": 0.8, + "grad_norm": 0.30415387019725554, + "learning_rate": 0.00014069726759519553, + "loss": 1.231, + "step": 8325 + }, + { + "epoch": 0.8, + "grad_norm": 0.29030887320948506, + "learning_rate": 0.00014068281692519434, + "loss": 1.1106, + "step": 8326 + }, + { + "epoch": 0.8, + "grad_norm": 0.32054128469538806, + "learning_rate": 0.0001406683652370801, + "loss": 1.1057, + "step": 8327 + }, + { + "epoch": 0.8, + "grad_norm": 0.30650933574016137, + "learning_rate": 0.00014065391253121446, + "loss": 1.1166, + "step": 8328 + }, + { + "epoch": 0.8, + "grad_norm": 0.2694980594720075, + "learning_rate": 0.0001406394588079591, + "loss": 0.9278, + "step": 8329 + }, + { + "epoch": 0.8, + "grad_norm": 0.2696507606928678, + "learning_rate": 0.00014062500406767574, + "loss": 1.079, + "step": 8330 + }, + { + "epoch": 0.8, + "grad_norm": 0.2759447283420335, + "learning_rate": 0.00014061054831072614, + "loss": 0.9918, + "step": 8331 + }, + { + "epoch": 0.8, + "grad_norm": 0.2738734236935922, + "learning_rate": 0.00014059609153747204, + "loss": 1.1079, + "step": 8332 + }, + { + "epoch": 0.8, + "grad_norm": 0.28515443322196976, + "learning_rate": 0.00014058163374827521, + "loss": 1.0569, + "step": 8333 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080422030491291, + "learning_rate": 0.0001405671749434975, + "loss": 1.0172, + "step": 8334 + }, + { + "epoch": 0.8, + "grad_norm": 0.6582786045483043, + "learning_rate": 0.00014055271512350079, + "loss": 1.458, + "step": 8335 + }, + { + "epoch": 0.8, + "grad_norm": 0.29640316228647345, + "learning_rate": 0.00014053825428864686, + "loss": 0.9889, + "step": 8336 + }, + { + "epoch": 0.8, + "grad_norm": 0.3224808939799605, + "learning_rate": 0.00014052379243929762, + "loss": 1.143, + "step": 8337 + }, + { + "epoch": 0.8, + "grad_norm": 0.26780388495165136, + "learning_rate": 0.00014050932957581505, + "loss": 1.1065, + "step": 8338 + }, + { + "epoch": 0.8, + "grad_norm": 0.3007238046192213, + "learning_rate": 0.000140494865698561, + "loss": 1.2191, + "step": 8339 + }, + { + "epoch": 0.8, + "grad_norm": 0.3150280632023674, + "learning_rate": 0.00014048040080789752, + "loss": 1.1275, + "step": 8340 + }, + { + "epoch": 0.8, + "grad_norm": 0.25501610087568716, + "learning_rate": 0.00014046593490418656, + "loss": 1.0839, + "step": 8341 + }, + { + "epoch": 0.8, + "grad_norm": 0.3241245476281749, + "learning_rate": 0.00014045146798779014, + "loss": 1.0121, + "step": 8342 + }, + { + "epoch": 0.8, + "grad_norm": 0.28717103288948476, + "learning_rate": 0.00014043700005907033, + "loss": 1.0748, + "step": 8343 + }, + { + "epoch": 0.8, + "grad_norm": 0.2949240771446552, + "learning_rate": 0.00014042253111838917, + "loss": 1.1718, + "step": 8344 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080849599666515, + "learning_rate": 0.00014040806116610873, + "loss": 1.1352, + "step": 8345 + }, + { + "epoch": 0.8, + "grad_norm": 0.2757839282426602, + "learning_rate": 0.0001403935902025912, + "loss": 1.0233, + "step": 8346 + }, + { + "epoch": 0.8, + "grad_norm": 0.2993473446158299, + "learning_rate": 0.00014037911822819868, + "loss": 1.1544, + "step": 8347 + }, + { + "epoch": 0.8, + "grad_norm": 0.292778124826498, + "learning_rate": 0.00014036464524329337, + "loss": 1.1381, + "step": 8348 + }, + { + "epoch": 0.8, + "grad_norm": 0.2958157472586977, + "learning_rate": 0.00014035017124823743, + "loss": 1.0634, + "step": 8349 + }, + { + "epoch": 0.8, + "grad_norm": 0.2973614768918341, + "learning_rate": 0.00014033569624339308, + "loss": 1.0993, + "step": 8350 + }, + { + "epoch": 0.8, + "grad_norm": 0.2877291411238009, + "learning_rate": 0.0001403212202291226, + "loss": 1.1405, + "step": 8351 + }, + { + "epoch": 0.8, + "grad_norm": 0.3161483762081105, + "learning_rate": 0.00014030674320578823, + "loss": 1.0157, + "step": 8352 + }, + { + "epoch": 0.8, + "grad_norm": 0.3097883664079197, + "learning_rate": 0.0001402922651737523, + "loss": 1.1105, + "step": 8353 + }, + { + "epoch": 0.8, + "grad_norm": 0.3052718311315742, + "learning_rate": 0.00014027778613337708, + "loss": 1.1271, + "step": 8354 + }, + { + "epoch": 0.8, + "grad_norm": 0.24254225363550241, + "learning_rate": 0.00014026330608502496, + "loss": 0.9638, + "step": 8355 + }, + { + "epoch": 0.8, + "grad_norm": 0.28453519098553043, + "learning_rate": 0.00014024882502905833, + "loss": 1.095, + "step": 8356 + }, + { + "epoch": 0.8, + "grad_norm": 0.3148338032717982, + "learning_rate": 0.0001402343429658395, + "loss": 1.1137, + "step": 8357 + }, + { + "epoch": 0.8, + "grad_norm": 0.3117328371653278, + "learning_rate": 0.000140219859895731, + "loss": 1.1287, + "step": 8358 + }, + { + "epoch": 0.8, + "grad_norm": 0.26695229050684094, + "learning_rate": 0.00014020537581909524, + "loss": 1.0255, + "step": 8359 + }, + { + "epoch": 0.8, + "grad_norm": 0.28843280197149884, + "learning_rate": 0.00014019089073629464, + "loss": 1.0416, + "step": 8360 + }, + { + "epoch": 0.8, + "grad_norm": 0.3067274794496299, + "learning_rate": 0.00014017640464769176, + "loss": 1.0825, + "step": 8361 + }, + { + "epoch": 0.8, + "grad_norm": 0.2739855692621317, + "learning_rate": 0.00014016191755364908, + "loss": 1.0509, + "step": 8362 + }, + { + "epoch": 0.8, + "grad_norm": 0.2395995459480104, + "learning_rate": 0.0001401474294545292, + "loss": 1.0843, + "step": 8363 + }, + { + "epoch": 0.8, + "grad_norm": 0.2759339406448203, + "learning_rate": 0.00014013294035069467, + "loss": 1.1089, + "step": 8364 + }, + { + "epoch": 0.8, + "grad_norm": 0.3012501447545405, + "learning_rate": 0.00014011845024250805, + "loss": 1.0218, + "step": 8365 + }, + { + "epoch": 0.8, + "grad_norm": 0.2743545412783308, + "learning_rate": 0.00014010395913033202, + "loss": 0.9881, + "step": 8366 + }, + { + "epoch": 0.8, + "grad_norm": 0.22999177193777853, + "learning_rate": 0.00014008946701452921, + "loss": 1.0122, + "step": 8367 + }, + { + "epoch": 0.8, + "grad_norm": 0.3145753551754261, + "learning_rate": 0.00014007497389546228, + "loss": 1.1592, + "step": 8368 + }, + { + "epoch": 0.8, + "grad_norm": 0.29504190170329436, + "learning_rate": 0.000140060479773494, + "loss": 1.0783, + "step": 8369 + }, + { + "epoch": 0.8, + "grad_norm": 0.27650998107466573, + "learning_rate": 0.00014004598464898698, + "loss": 1.1227, + "step": 8370 + }, + { + "epoch": 0.8, + "grad_norm": 0.2594677164964031, + "learning_rate": 0.00014003148852230403, + "loss": 1.0184, + "step": 8371 + }, + { + "epoch": 0.8, + "grad_norm": 0.26498120130511177, + "learning_rate": 0.00014001699139380792, + "loss": 1.016, + "step": 8372 + }, + { + "epoch": 0.8, + "grad_norm": 0.33720870260470887, + "learning_rate": 0.00014000249326386147, + "loss": 0.9954, + "step": 8373 + }, + { + "epoch": 0.8, + "grad_norm": 0.27003696605507693, + "learning_rate": 0.0001399879941328275, + "loss": 1.0899, + "step": 8374 + }, + { + "epoch": 0.8, + "grad_norm": 0.25731301945171275, + "learning_rate": 0.0001399734940010688, + "loss": 1.1751, + "step": 8375 + }, + { + "epoch": 0.8, + "grad_norm": 0.288234443384105, + "learning_rate": 0.0001399589928689483, + "loss": 1.1105, + "step": 8376 + }, + { + "epoch": 0.8, + "grad_norm": 0.3086646943980646, + "learning_rate": 0.0001399444907368289, + "loss": 1.0816, + "step": 8377 + }, + { + "epoch": 0.8, + "grad_norm": 0.2857761154797601, + "learning_rate": 0.00013992998760507352, + "loss": 1.1379, + "step": 8378 + }, + { + "epoch": 0.8, + "grad_norm": 0.2848709741740318, + "learning_rate": 0.00013991548347404512, + "loss": 1.1337, + "step": 8379 + }, + { + "epoch": 0.8, + "grad_norm": 0.26144857764795576, + "learning_rate": 0.00013990097834410664, + "loss": 0.9858, + "step": 8380 + }, + { + "epoch": 0.8, + "grad_norm": 0.28140647243062356, + "learning_rate": 0.0001398864722156211, + "loss": 1.1319, + "step": 8381 + }, + { + "epoch": 0.8, + "grad_norm": 0.2757644481768791, + "learning_rate": 0.00013987196508895153, + "loss": 0.9251, + "step": 8382 + }, + { + "epoch": 0.8, + "grad_norm": 0.29461811148669015, + "learning_rate": 0.00013985745696446097, + "loss": 1.1373, + "step": 8383 + }, + { + "epoch": 0.8, + "grad_norm": 0.3141106450117521, + "learning_rate": 0.0001398429478425125, + "loss": 1.0046, + "step": 8384 + }, + { + "epoch": 0.8, + "grad_norm": 0.2932601643453963, + "learning_rate": 0.00013982843772346922, + "loss": 1.0836, + "step": 8385 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080641745939993, + "learning_rate": 0.00013981392660769424, + "loss": 1.1587, + "step": 8386 + }, + { + "epoch": 0.8, + "grad_norm": 0.3161172120816325, + "learning_rate": 0.00013979941449555075, + "loss": 1.0562, + "step": 8387 + }, + { + "epoch": 0.8, + "grad_norm": 0.2919139460605944, + "learning_rate": 0.00013978490138740187, + "loss": 1.1742, + "step": 8388 + }, + { + "epoch": 0.8, + "grad_norm": 0.27923451724228243, + "learning_rate": 0.00013977038728361086, + "loss": 1.1083, + "step": 8389 + }, + { + "epoch": 0.8, + "grad_norm": 0.31621277440252016, + "learning_rate": 0.0001397558721845409, + "loss": 1.0143, + "step": 8390 + }, + { + "epoch": 0.8, + "grad_norm": 0.3053969306091787, + "learning_rate": 0.00013974135609055527, + "loss": 1.0729, + "step": 8391 + }, + { + "epoch": 0.8, + "grad_norm": 0.2825597454845708, + "learning_rate": 0.00013972683900201723, + "loss": 1.0619, + "step": 8392 + }, + { + "epoch": 0.8, + "grad_norm": 0.29068615722749824, + "learning_rate": 0.00013971232091929006, + "loss": 1.065, + "step": 8393 + }, + { + "epoch": 0.8, + "grad_norm": 0.2853464043964099, + "learning_rate": 0.00013969780184273705, + "loss": 0.9789, + "step": 8394 + }, + { + "epoch": 0.8, + "grad_norm": 0.2849572721200672, + "learning_rate": 0.0001396832817727217, + "loss": 1.0263, + "step": 8395 + }, + { + "epoch": 0.8, + "grad_norm": 0.28211914596567356, + "learning_rate": 0.00013966876070960722, + "loss": 1.1037, + "step": 8396 + }, + { + "epoch": 0.8, + "grad_norm": 0.2995697358994056, + "learning_rate": 0.00013965423865375712, + "loss": 1.1653, + "step": 8397 + }, + { + "epoch": 0.8, + "grad_norm": 0.29293561141447033, + "learning_rate": 0.0001396397156055347, + "loss": 1.094, + "step": 8398 + }, + { + "epoch": 0.8, + "grad_norm": 0.25956145112228035, + "learning_rate": 0.00013962519156530354, + "loss": 1.0856, + "step": 8399 + }, + { + "epoch": 0.8, + "grad_norm": 0.27947255917814445, + "learning_rate": 0.00013961066653342706, + "loss": 1.0245, + "step": 8400 + }, + { + "epoch": 0.8, + "grad_norm": 0.2745380656704247, + "learning_rate": 0.00013959614051026873, + "loss": 1.0483, + "step": 8401 + }, + { + "epoch": 0.8, + "grad_norm": 0.2980202904500265, + "learning_rate": 0.0001395816134961921, + "loss": 1.0607, + "step": 8402 + }, + { + "epoch": 0.8, + "grad_norm": 0.25864322411654844, + "learning_rate": 0.00013956708549156072, + "loss": 0.9817, + "step": 8403 + }, + { + "epoch": 0.8, + "grad_norm": 0.27059445234233453, + "learning_rate": 0.00013955255649673816, + "loss": 1.0721, + "step": 8404 + }, + { + "epoch": 0.8, + "grad_norm": 0.30462552256042574, + "learning_rate": 0.00013953802651208802, + "loss": 1.0382, + "step": 8405 + }, + { + "epoch": 0.8, + "grad_norm": 0.2863563756673883, + "learning_rate": 0.0001395234955379739, + "loss": 1.1493, + "step": 8406 + }, + { + "epoch": 0.8, + "grad_norm": 0.2862298189990451, + "learning_rate": 0.0001395089635747595, + "loss": 1.1067, + "step": 8407 + }, + { + "epoch": 0.8, + "grad_norm": 0.2993560987033485, + "learning_rate": 0.00013949443062280842, + "loss": 1.081, + "step": 8408 + }, + { + "epoch": 0.8, + "grad_norm": 0.2717526863604508, + "learning_rate": 0.00013947989668248442, + "loss": 1.0246, + "step": 8409 + }, + { + "epoch": 0.8, + "grad_norm": 0.2843109420913706, + "learning_rate": 0.00013946536175415118, + "loss": 1.1101, + "step": 8410 + }, + { + "epoch": 0.8, + "grad_norm": 0.31237587843637793, + "learning_rate": 0.00013945082583817245, + "loss": 1.0994, + "step": 8411 + }, + { + "epoch": 0.8, + "grad_norm": 0.2655591048449623, + "learning_rate": 0.00013943628893491202, + "loss": 1.1926, + "step": 8412 + }, + { + "epoch": 0.8, + "grad_norm": 0.2923686496830888, + "learning_rate": 0.0001394217510447337, + "loss": 1.1175, + "step": 8413 + }, + { + "epoch": 0.8, + "grad_norm": 0.26432275264898286, + "learning_rate": 0.00013940721216800127, + "loss": 0.9868, + "step": 8414 + }, + { + "epoch": 0.81, + "grad_norm": 0.2767290301083631, + "learning_rate": 0.00013939267230507856, + "loss": 0.9424, + "step": 8415 + }, + { + "epoch": 0.81, + "grad_norm": 0.2865371880691285, + "learning_rate": 0.0001393781314563295, + "loss": 1.1268, + "step": 8416 + }, + { + "epoch": 0.81, + "grad_norm": 0.27359931565493517, + "learning_rate": 0.00013936358962211794, + "loss": 0.894, + "step": 8417 + }, + { + "epoch": 0.81, + "grad_norm": 0.2682490335376522, + "learning_rate": 0.00013934904680280781, + "loss": 1.0634, + "step": 8418 + }, + { + "epoch": 0.81, + "grad_norm": 0.323666262645315, + "learning_rate": 0.00013933450299876305, + "loss": 1.0572, + "step": 8419 + }, + { + "epoch": 0.81, + "grad_norm": 0.313542678895405, + "learning_rate": 0.00013931995821034766, + "loss": 1.0228, + "step": 8420 + }, + { + "epoch": 0.81, + "grad_norm": 0.2741537767451146, + "learning_rate": 0.00013930541243792555, + "loss": 1.1194, + "step": 8421 + }, + { + "epoch": 0.81, + "grad_norm": 0.2727440924938454, + "learning_rate": 0.00013929086568186083, + "loss": 1.1567, + "step": 8422 + }, + { + "epoch": 0.81, + "grad_norm": 0.27596495736712096, + "learning_rate": 0.0001392763179425175, + "loss": 1.0803, + "step": 8423 + }, + { + "epoch": 0.81, + "grad_norm": 0.2789242022899579, + "learning_rate": 0.00013926176922025963, + "loss": 0.912, + "step": 8424 + }, + { + "epoch": 0.81, + "grad_norm": 0.28533265323613294, + "learning_rate": 0.00013924721951545128, + "loss": 1.1343, + "step": 8425 + }, + { + "epoch": 0.81, + "grad_norm": 0.29008984332342763, + "learning_rate": 0.00013923266882845666, + "loss": 1.0476, + "step": 8426 + }, + { + "epoch": 0.81, + "grad_norm": 0.24250004605487083, + "learning_rate": 0.00013921811715963977, + "loss": 1.062, + "step": 8427 + }, + { + "epoch": 0.81, + "grad_norm": 0.29532956323803095, + "learning_rate": 0.0001392035645093649, + "loss": 1.0561, + "step": 8428 + }, + { + "epoch": 0.81, + "grad_norm": 0.3001458878270363, + "learning_rate": 0.00013918901087799616, + "loss": 1.081, + "step": 8429 + }, + { + "epoch": 0.81, + "grad_norm": 0.3037811128687649, + "learning_rate": 0.0001391744562658978, + "loss": 1.098, + "step": 8430 + }, + { + "epoch": 0.81, + "grad_norm": 0.3304936166647709, + "learning_rate": 0.00013915990067343408, + "loss": 0.9485, + "step": 8431 + }, + { + "epoch": 0.81, + "grad_norm": 0.2831506029753717, + "learning_rate": 0.0001391453441009692, + "loss": 1.0727, + "step": 8432 + }, + { + "epoch": 0.81, + "grad_norm": 0.2528874867012647, + "learning_rate": 0.0001391307865488675, + "loss": 1.046, + "step": 8433 + }, + { + "epoch": 0.81, + "grad_norm": 0.2926620284475519, + "learning_rate": 0.00013911622801749326, + "loss": 1.066, + "step": 8434 + }, + { + "epoch": 0.81, + "grad_norm": 0.28658384259986724, + "learning_rate": 0.00013910166850721086, + "loss": 1.1164, + "step": 8435 + }, + { + "epoch": 0.81, + "grad_norm": 0.2611948977130402, + "learning_rate": 0.0001390871080183846, + "loss": 1.1024, + "step": 8436 + }, + { + "epoch": 0.81, + "grad_norm": 0.29623740606511145, + "learning_rate": 0.0001390725465513789, + "loss": 0.9396, + "step": 8437 + }, + { + "epoch": 0.81, + "grad_norm": 0.2759634164236242, + "learning_rate": 0.00013905798410655817, + "loss": 0.991, + "step": 8438 + }, + { + "epoch": 0.81, + "grad_norm": 0.2559555603571331, + "learning_rate": 0.00013904342068428688, + "loss": 0.9583, + "step": 8439 + }, + { + "epoch": 0.81, + "grad_norm": 0.2834397140569843, + "learning_rate": 0.00013902885628492938, + "loss": 1.1322, + "step": 8440 + }, + { + "epoch": 0.81, + "grad_norm": 0.2959073999001046, + "learning_rate": 0.00013901429090885028, + "loss": 1.0687, + "step": 8441 + }, + { + "epoch": 0.81, + "grad_norm": 0.25399362804027736, + "learning_rate": 0.000138999724556414, + "loss": 1.0374, + "step": 8442 + }, + { + "epoch": 0.81, + "grad_norm": 0.29065321813906453, + "learning_rate": 0.00013898515722798513, + "loss": 1.0508, + "step": 8443 + }, + { + "epoch": 0.81, + "grad_norm": 0.37210670978861016, + "learning_rate": 0.00013897058892392818, + "loss": 0.9855, + "step": 8444 + }, + { + "epoch": 0.81, + "grad_norm": 0.2800545837279327, + "learning_rate": 0.00013895601964460775, + "loss": 1.1169, + "step": 8445 + }, + { + "epoch": 0.81, + "grad_norm": 0.2926130011031746, + "learning_rate": 0.00013894144939038844, + "loss": 1.0492, + "step": 8446 + }, + { + "epoch": 0.81, + "grad_norm": 0.31446837274856276, + "learning_rate": 0.00013892687816163487, + "loss": 1.077, + "step": 8447 + }, + { + "epoch": 0.81, + "grad_norm": 0.2894693179667743, + "learning_rate": 0.00013891230595871175, + "loss": 1.1005, + "step": 8448 + }, + { + "epoch": 0.81, + "grad_norm": 0.2658040957365846, + "learning_rate": 0.0001388977327819837, + "loss": 1.129, + "step": 8449 + }, + { + "epoch": 0.81, + "grad_norm": 0.30570591750106335, + "learning_rate": 0.0001388831586318154, + "loss": 1.1457, + "step": 8450 + }, + { + "epoch": 0.81, + "grad_norm": 0.30282216470657447, + "learning_rate": 0.00013886858350857167, + "loss": 1.0694, + "step": 8451 + }, + { + "epoch": 0.81, + "grad_norm": 0.3179740476883664, + "learning_rate": 0.00013885400741261717, + "loss": 1.1539, + "step": 8452 + }, + { + "epoch": 0.81, + "grad_norm": 0.2714947725647423, + "learning_rate": 0.00013883943034431677, + "loss": 1.0887, + "step": 8453 + }, + { + "epoch": 0.81, + "grad_norm": 0.2617135621115103, + "learning_rate": 0.0001388248523040352, + "loss": 1.1484, + "step": 8454 + }, + { + "epoch": 0.81, + "grad_norm": 0.2589470456450907, + "learning_rate": 0.00013881027329213727, + "loss": 0.976, + "step": 8455 + }, + { + "epoch": 0.81, + "grad_norm": 0.2971405117388152, + "learning_rate": 0.0001387956933089879, + "loss": 1.1232, + "step": 8456 + }, + { + "epoch": 0.81, + "grad_norm": 0.2638200620165654, + "learning_rate": 0.0001387811123549519, + "loss": 1.1529, + "step": 8457 + }, + { + "epoch": 0.81, + "grad_norm": 0.2792026970736888, + "learning_rate": 0.00013876653043039418, + "loss": 1.1957, + "step": 8458 + }, + { + "epoch": 0.81, + "grad_norm": 0.27823742191201595, + "learning_rate": 0.0001387519475356797, + "loss": 1.0414, + "step": 8459 + }, + { + "epoch": 0.81, + "grad_norm": 0.25238962999634595, + "learning_rate": 0.00013873736367117336, + "loss": 1.0228, + "step": 8460 + }, + { + "epoch": 0.81, + "grad_norm": 0.28008622103734476, + "learning_rate": 0.00013872277883724015, + "loss": 1.023, + "step": 8461 + }, + { + "epoch": 0.81, + "grad_norm": 0.2777949786844138, + "learning_rate": 0.00013870819303424506, + "loss": 1.1302, + "step": 8462 + }, + { + "epoch": 0.81, + "grad_norm": 0.24701542829888543, + "learning_rate": 0.0001386936062625531, + "loss": 1.1174, + "step": 8463 + }, + { + "epoch": 0.81, + "grad_norm": 0.29033108955425646, + "learning_rate": 0.00013867901852252935, + "loss": 1.0872, + "step": 8464 + }, + { + "epoch": 0.81, + "grad_norm": 0.2747150319565986, + "learning_rate": 0.00013866442981453887, + "loss": 0.95, + "step": 8465 + }, + { + "epoch": 0.81, + "grad_norm": 0.28133751977616195, + "learning_rate": 0.00013864984013894669, + "loss": 1.04, + "step": 8466 + }, + { + "epoch": 0.81, + "grad_norm": 0.2603016003322199, + "learning_rate": 0.00013863524949611798, + "loss": 1.0926, + "step": 8467 + }, + { + "epoch": 0.81, + "grad_norm": 0.28873463989684345, + "learning_rate": 0.00013862065788641787, + "loss": 1.0727, + "step": 8468 + }, + { + "epoch": 0.81, + "grad_norm": 0.3043522634022851, + "learning_rate": 0.00013860606531021155, + "loss": 1.017, + "step": 8469 + }, + { + "epoch": 0.81, + "grad_norm": 0.258266209093827, + "learning_rate": 0.00013859147176786417, + "loss": 1.0572, + "step": 8470 + }, + { + "epoch": 0.81, + "grad_norm": 0.2960901666392521, + "learning_rate": 0.00013857687725974093, + "loss": 0.9861, + "step": 8471 + }, + { + "epoch": 0.81, + "grad_norm": 0.29910616519981614, + "learning_rate": 0.00013856228178620709, + "loss": 1.1596, + "step": 8472 + }, + { + "epoch": 0.81, + "grad_norm": 0.2990576464485771, + "learning_rate": 0.00013854768534762795, + "loss": 1.0265, + "step": 8473 + }, + { + "epoch": 0.81, + "grad_norm": 0.31441877420729514, + "learning_rate": 0.00013853308794436876, + "loss": 1.0613, + "step": 8474 + }, + { + "epoch": 0.81, + "grad_norm": 0.2634200444546087, + "learning_rate": 0.0001385184895767948, + "loss": 1.004, + "step": 8475 + }, + { + "epoch": 0.81, + "grad_norm": 0.2556240322403397, + "learning_rate": 0.0001385038902452714, + "loss": 1.1238, + "step": 8476 + }, + { + "epoch": 0.81, + "grad_norm": 0.27888482542250587, + "learning_rate": 0.00013848928995016403, + "loss": 1.1326, + "step": 8477 + }, + { + "epoch": 0.81, + "grad_norm": 0.27815317220422986, + "learning_rate": 0.00013847468869183796, + "loss": 1.123, + "step": 8478 + }, + { + "epoch": 0.81, + "grad_norm": 0.2831745331072525, + "learning_rate": 0.00013846008647065857, + "loss": 1.0664, + "step": 8479 + }, + { + "epoch": 0.81, + "grad_norm": 0.3041401975450099, + "learning_rate": 0.0001384454832869914, + "loss": 1.0162, + "step": 8480 + }, + { + "epoch": 0.81, + "grad_norm": 0.27417387963872447, + "learning_rate": 0.00013843087914120185, + "loss": 1.1522, + "step": 8481 + }, + { + "epoch": 0.81, + "grad_norm": 0.2551238865778478, + "learning_rate": 0.00013841627403365537, + "loss": 1.1159, + "step": 8482 + }, + { + "epoch": 0.81, + "grad_norm": 0.2772835560718097, + "learning_rate": 0.0001384016679647175, + "loss": 1.0014, + "step": 8483 + }, + { + "epoch": 0.81, + "grad_norm": 0.266588619864142, + "learning_rate": 0.00013838706093475379, + "loss": 1.0242, + "step": 8484 + }, + { + "epoch": 0.81, + "grad_norm": 0.2950429695678513, + "learning_rate": 0.0001383724529441297, + "loss": 1.0789, + "step": 8485 + }, + { + "epoch": 0.81, + "grad_norm": 0.26853491046599237, + "learning_rate": 0.00013835784399321088, + "loss": 1.1386, + "step": 8486 + }, + { + "epoch": 0.81, + "grad_norm": 0.27338834723695976, + "learning_rate": 0.0001383432340823629, + "loss": 0.9977, + "step": 8487 + }, + { + "epoch": 0.81, + "grad_norm": 0.2865977979325858, + "learning_rate": 0.00013832862321195143, + "loss": 1.0583, + "step": 8488 + }, + { + "epoch": 0.81, + "grad_norm": 0.28068233052657926, + "learning_rate": 0.000138314011382342, + "loss": 1.0611, + "step": 8489 + }, + { + "epoch": 0.81, + "grad_norm": 0.3053917578678752, + "learning_rate": 0.0001382993985939004, + "loss": 1.1556, + "step": 8490 + }, + { + "epoch": 0.81, + "grad_norm": 0.26177227751546944, + "learning_rate": 0.00013828478484699227, + "loss": 1.1853, + "step": 8491 + }, + { + "epoch": 0.81, + "grad_norm": 0.28197772368176605, + "learning_rate": 0.00013827017014198336, + "loss": 1.108, + "step": 8492 + }, + { + "epoch": 0.81, + "grad_norm": 0.25534797811164694, + "learning_rate": 0.00013825555447923935, + "loss": 1.1122, + "step": 8493 + }, + { + "epoch": 0.81, + "grad_norm": 0.2985449459023056, + "learning_rate": 0.00013824093785912609, + "loss": 1.0912, + "step": 8494 + }, + { + "epoch": 0.81, + "grad_norm": 0.3047078671768318, + "learning_rate": 0.0001382263202820093, + "loss": 0.909, + "step": 8495 + }, + { + "epoch": 0.81, + "grad_norm": 0.2566715818782515, + "learning_rate": 0.0001382117017482548, + "loss": 1.0322, + "step": 8496 + }, + { + "epoch": 0.81, + "grad_norm": 0.2612952596331514, + "learning_rate": 0.0001381970822582285, + "loss": 0.9924, + "step": 8497 + }, + { + "epoch": 0.81, + "grad_norm": 0.2933790624492483, + "learning_rate": 0.00013818246181229618, + "loss": 1.0607, + "step": 8498 + }, + { + "epoch": 0.81, + "grad_norm": 0.3051960049265422, + "learning_rate": 0.00013816784041082374, + "loss": 1.1308, + "step": 8499 + }, + { + "epoch": 0.81, + "grad_norm": 0.2790257819761861, + "learning_rate": 0.0001381532180541772, + "loss": 1.0386, + "step": 8500 + }, + { + "epoch": 0.81, + "grad_norm": 0.3118122919886825, + "learning_rate": 0.0001381385947427223, + "loss": 1.1372, + "step": 8501 + }, + { + "epoch": 0.81, + "grad_norm": 0.29570017068411963, + "learning_rate": 0.00013812397047682513, + "loss": 1.0147, + "step": 8502 + }, + { + "epoch": 0.81, + "grad_norm": 0.31683426314572966, + "learning_rate": 0.00013810934525685165, + "loss": 1.0712, + "step": 8503 + }, + { + "epoch": 0.81, + "grad_norm": 0.2909795486157777, + "learning_rate": 0.00013809471908316783, + "loss": 1.2234, + "step": 8504 + }, + { + "epoch": 0.81, + "grad_norm": 0.2955103667345504, + "learning_rate": 0.00013808009195613973, + "loss": 1.0783, + "step": 8505 + }, + { + "epoch": 0.81, + "grad_norm": 0.2867522171152345, + "learning_rate": 0.0001380654638761334, + "loss": 1.0277, + "step": 8506 + }, + { + "epoch": 0.81, + "grad_norm": 0.3681601782152508, + "learning_rate": 0.0001380508348435149, + "loss": 1.1069, + "step": 8507 + }, + { + "epoch": 0.81, + "grad_norm": 0.2932701192842754, + "learning_rate": 0.00013803620485865035, + "loss": 1.0882, + "step": 8508 + }, + { + "epoch": 0.81, + "grad_norm": 0.31028006826949395, + "learning_rate": 0.0001380215739219059, + "loss": 0.9705, + "step": 8509 + }, + { + "epoch": 0.81, + "grad_norm": 0.311953807926566, + "learning_rate": 0.00013800694203364763, + "loss": 1.1564, + "step": 8510 + }, + { + "epoch": 0.81, + "grad_norm": 0.2959800715326205, + "learning_rate": 0.00013799230919424175, + "loss": 1.0579, + "step": 8511 + }, + { + "epoch": 0.81, + "grad_norm": 0.2605263218657084, + "learning_rate": 0.00013797767540405447, + "loss": 1.0171, + "step": 8512 + }, + { + "epoch": 0.81, + "grad_norm": 0.30387836906175536, + "learning_rate": 0.00013796304066345197, + "loss": 0.9672, + "step": 8513 + }, + { + "epoch": 0.81, + "grad_norm": 0.299921065982018, + "learning_rate": 0.00013794840497280056, + "loss": 1.1199, + "step": 8514 + }, + { + "epoch": 0.81, + "grad_norm": 0.32487111245703615, + "learning_rate": 0.00013793376833246644, + "loss": 1.2367, + "step": 8515 + }, + { + "epoch": 0.81, + "grad_norm": 0.2847819157265213, + "learning_rate": 0.00013791913074281595, + "loss": 1.1113, + "step": 8516 + }, + { + "epoch": 0.81, + "grad_norm": 0.2868712720370698, + "learning_rate": 0.00013790449220421535, + "loss": 1.1692, + "step": 8517 + }, + { + "epoch": 0.81, + "grad_norm": 0.2678298209756187, + "learning_rate": 0.00013788985271703105, + "loss": 1.1474, + "step": 8518 + }, + { + "epoch": 0.82, + "grad_norm": 0.29099670328302274, + "learning_rate": 0.00013787521228162934, + "loss": 1.0656, + "step": 8519 + }, + { + "epoch": 0.82, + "grad_norm": 0.30474414530355776, + "learning_rate": 0.00013786057089837663, + "loss": 1.0057, + "step": 8520 + }, + { + "epoch": 0.82, + "grad_norm": 0.28018691357325703, + "learning_rate": 0.00013784592856763936, + "loss": 1.0637, + "step": 8521 + }, + { + "epoch": 0.82, + "grad_norm": 0.30281968491489586, + "learning_rate": 0.00013783128528978395, + "loss": 0.9859, + "step": 8522 + }, + { + "epoch": 0.82, + "grad_norm": 0.2833193148012386, + "learning_rate": 0.00013781664106517685, + "loss": 1.1878, + "step": 8523 + }, + { + "epoch": 0.82, + "grad_norm": 0.27855400568860456, + "learning_rate": 0.00013780199589418453, + "loss": 1.0736, + "step": 8524 + }, + { + "epoch": 0.82, + "grad_norm": 0.3023424425522343, + "learning_rate": 0.00013778734977717348, + "loss": 0.9647, + "step": 8525 + }, + { + "epoch": 0.82, + "grad_norm": 0.24163910511953707, + "learning_rate": 0.00013777270271451031, + "loss": 1.0617, + "step": 8526 + }, + { + "epoch": 0.82, + "grad_norm": 0.2643004336760188, + "learning_rate": 0.00013775805470656147, + "loss": 1.0083, + "step": 8527 + }, + { + "epoch": 0.82, + "grad_norm": 0.30690050558969756, + "learning_rate": 0.00013774340575369357, + "loss": 1.1863, + "step": 8528 + }, + { + "epoch": 0.82, + "grad_norm": 0.2803369681980211, + "learning_rate": 0.00013772875585627326, + "loss": 0.9811, + "step": 8529 + }, + { + "epoch": 0.82, + "grad_norm": 0.27882776591624236, + "learning_rate": 0.00013771410501466712, + "loss": 1.0438, + "step": 8530 + }, + { + "epoch": 0.82, + "grad_norm": 0.2866694179672929, + "learning_rate": 0.00013769945322924179, + "loss": 1.0089, + "step": 8531 + }, + { + "epoch": 0.82, + "grad_norm": 0.3193049906912792, + "learning_rate": 0.00013768480050036392, + "loss": 0.9945, + "step": 8532 + }, + { + "epoch": 0.82, + "grad_norm": 0.3134610280410675, + "learning_rate": 0.00013767014682840027, + "loss": 1.0041, + "step": 8533 + }, + { + "epoch": 0.82, + "grad_norm": 0.25858959641085166, + "learning_rate": 0.0001376554922137175, + "loss": 1.0951, + "step": 8534 + }, + { + "epoch": 0.82, + "grad_norm": 0.27495664936656555, + "learning_rate": 0.00013764083665668237, + "loss": 1.0078, + "step": 8535 + }, + { + "epoch": 0.82, + "grad_norm": 0.26221912952359344, + "learning_rate": 0.00013762618015766167, + "loss": 1.0953, + "step": 8536 + }, + { + "epoch": 0.82, + "grad_norm": 0.2969797589840666, + "learning_rate": 0.00013761152271702214, + "loss": 1.104, + "step": 8537 + }, + { + "epoch": 0.82, + "grad_norm": 0.3003478528891654, + "learning_rate": 0.00013759686433513062, + "loss": 1.0608, + "step": 8538 + }, + { + "epoch": 0.82, + "grad_norm": 0.35023226379753625, + "learning_rate": 0.00013758220501235396, + "loss": 1.0195, + "step": 8539 + }, + { + "epoch": 0.82, + "grad_norm": 0.2598816030866285, + "learning_rate": 0.000137567544749059, + "loss": 0.8741, + "step": 8540 + }, + { + "epoch": 0.82, + "grad_norm": 0.29603444421799885, + "learning_rate": 0.0001375528835456126, + "loss": 1.124, + "step": 8541 + }, + { + "epoch": 0.82, + "grad_norm": 0.24876455442000533, + "learning_rate": 0.0001375382214023817, + "loss": 0.9333, + "step": 8542 + }, + { + "epoch": 0.82, + "grad_norm": 0.29348410647341444, + "learning_rate": 0.00013752355831973324, + "loss": 0.9545, + "step": 8543 + }, + { + "epoch": 0.82, + "grad_norm": 0.31014489060350814, + "learning_rate": 0.00013750889429803412, + "loss": 1.0946, + "step": 8544 + }, + { + "epoch": 0.82, + "grad_norm": 0.2901149550164192, + "learning_rate": 0.00013749422933765135, + "loss": 1.1371, + "step": 8545 + }, + { + "epoch": 0.82, + "grad_norm": 0.2796054693890663, + "learning_rate": 0.00013747956343895194, + "loss": 1.0809, + "step": 8546 + }, + { + "epoch": 0.82, + "grad_norm": 0.2809110179945892, + "learning_rate": 0.00013746489660230288, + "loss": 1.0668, + "step": 8547 + }, + { + "epoch": 0.82, + "grad_norm": 0.29179871521386425, + "learning_rate": 0.00013745022882807127, + "loss": 1.1174, + "step": 8548 + }, + { + "epoch": 0.82, + "grad_norm": 0.310921629295036, + "learning_rate": 0.00013743556011662413, + "loss": 1.0533, + "step": 8549 + }, + { + "epoch": 0.82, + "grad_norm": 0.2538334627557585, + "learning_rate": 0.00013742089046832855, + "loss": 0.994, + "step": 8550 + }, + { + "epoch": 0.82, + "grad_norm": 0.2919456298279374, + "learning_rate": 0.00013740621988355168, + "loss": 1.0132, + "step": 8551 + }, + { + "epoch": 0.82, + "grad_norm": 0.2532952808062169, + "learning_rate": 0.00013739154836266064, + "loss": 0.9537, + "step": 8552 + }, + { + "epoch": 0.82, + "grad_norm": 0.31354688802504277, + "learning_rate": 0.0001373768759060226, + "loss": 1.1491, + "step": 8553 + }, + { + "epoch": 0.82, + "grad_norm": 0.2485754538763557, + "learning_rate": 0.00013736220251400478, + "loss": 1.1273, + "step": 8554 + }, + { + "epoch": 0.82, + "grad_norm": 0.29503224281382395, + "learning_rate": 0.00013734752818697434, + "loss": 1.0555, + "step": 8555 + }, + { + "epoch": 0.82, + "grad_norm": 0.2671248855695662, + "learning_rate": 0.00013733285292529855, + "loss": 1.001, + "step": 8556 + }, + { + "epoch": 0.82, + "grad_norm": 0.3018392538404123, + "learning_rate": 0.00013731817672934463, + "loss": 1.1354, + "step": 8557 + }, + { + "epoch": 0.82, + "grad_norm": 0.297525745763971, + "learning_rate": 0.0001373034995994799, + "loss": 1.1386, + "step": 8558 + }, + { + "epoch": 0.82, + "grad_norm": 0.3031989211002846, + "learning_rate": 0.00013728882153607165, + "loss": 1.0058, + "step": 8559 + }, + { + "epoch": 0.82, + "grad_norm": 0.2925383632398921, + "learning_rate": 0.00013727414253948719, + "loss": 0.9204, + "step": 8560 + }, + { + "epoch": 0.82, + "grad_norm": 0.28581280375824064, + "learning_rate": 0.0001372594626100939, + "loss": 1.1115, + "step": 8561 + }, + { + "epoch": 0.82, + "grad_norm": 0.2849917863325391, + "learning_rate": 0.00013724478174825916, + "loss": 1.0351, + "step": 8562 + }, + { + "epoch": 0.82, + "grad_norm": 0.2867357667558398, + "learning_rate": 0.0001372300999543503, + "loss": 0.9713, + "step": 8563 + }, + { + "epoch": 0.82, + "grad_norm": 0.2912838079543734, + "learning_rate": 0.00013721541722873484, + "loss": 1.0435, + "step": 8564 + }, + { + "epoch": 0.82, + "grad_norm": 0.3121331293675006, + "learning_rate": 0.00013720073357178017, + "loss": 1.1627, + "step": 8565 + }, + { + "epoch": 0.82, + "grad_norm": 0.29971607543863277, + "learning_rate": 0.00013718604898385375, + "loss": 1.1364, + "step": 8566 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740375080807219, + "learning_rate": 0.00013717136346532306, + "loss": 1.0376, + "step": 8567 + }, + { + "epoch": 0.82, + "grad_norm": 0.26055014984233066, + "learning_rate": 0.00013715667701655565, + "loss": 0.9892, + "step": 8568 + }, + { + "epoch": 0.82, + "grad_norm": 0.28985982137880956, + "learning_rate": 0.00013714198963791908, + "loss": 1.0258, + "step": 8569 + }, + { + "epoch": 0.82, + "grad_norm": 0.24153544179134467, + "learning_rate": 0.00013712730132978083, + "loss": 1.0979, + "step": 8570 + }, + { + "epoch": 0.82, + "grad_norm": 0.2819940528233919, + "learning_rate": 0.00013711261209250857, + "loss": 1.0627, + "step": 8571 + }, + { + "epoch": 0.82, + "grad_norm": 0.3014061072273147, + "learning_rate": 0.00013709792192646985, + "loss": 1.0036, + "step": 8572 + }, + { + "epoch": 0.82, + "grad_norm": 0.2943687582185628, + "learning_rate": 0.00013708323083203228, + "loss": 1.1475, + "step": 8573 + }, + { + "epoch": 0.82, + "grad_norm": 0.29418992771044433, + "learning_rate": 0.0001370685388095636, + "loss": 1.0402, + "step": 8574 + }, + { + "epoch": 0.82, + "grad_norm": 0.3135438299333758, + "learning_rate": 0.00013705384585943145, + "loss": 1.1125, + "step": 8575 + }, + { + "epoch": 0.82, + "grad_norm": 0.28733629193402527, + "learning_rate": 0.00013703915198200347, + "loss": 1.0161, + "step": 8576 + }, + { + "epoch": 0.82, + "grad_norm": 0.29180260421642634, + "learning_rate": 0.00013702445717764746, + "loss": 1.0476, + "step": 8577 + }, + { + "epoch": 0.82, + "grad_norm": 0.2980601220397487, + "learning_rate": 0.00013700976144673116, + "loss": 1.0602, + "step": 8578 + }, + { + "epoch": 0.82, + "grad_norm": 0.3066488522260643, + "learning_rate": 0.00013699506478962231, + "loss": 1.0332, + "step": 8579 + }, + { + "epoch": 0.82, + "grad_norm": 0.3353162200492158, + "learning_rate": 0.00013698036720668873, + "loss": 0.9495, + "step": 8580 + }, + { + "epoch": 0.82, + "grad_norm": 0.2816337655551374, + "learning_rate": 0.00013696566869829816, + "loss": 1.1274, + "step": 8581 + }, + { + "epoch": 0.82, + "grad_norm": 0.3100139223925439, + "learning_rate": 0.00013695096926481855, + "loss": 1.0753, + "step": 8582 + }, + { + "epoch": 0.82, + "grad_norm": 0.2746129531995423, + "learning_rate": 0.0001369362689066177, + "loss": 1.0354, + "step": 8583 + }, + { + "epoch": 0.82, + "grad_norm": 0.2697359432512603, + "learning_rate": 0.00013692156762406347, + "loss": 0.9328, + "step": 8584 + }, + { + "epoch": 0.82, + "grad_norm": 0.290636331567875, + "learning_rate": 0.00013690686541752384, + "loss": 1.0889, + "step": 8585 + }, + { + "epoch": 0.82, + "grad_norm": 0.28720885198550766, + "learning_rate": 0.0001368921622873667, + "loss": 1.1372, + "step": 8586 + }, + { + "epoch": 0.82, + "grad_norm": 0.30775748112991347, + "learning_rate": 0.00013687745823396007, + "loss": 1.1094, + "step": 8587 + }, + { + "epoch": 0.82, + "grad_norm": 0.2624465157124863, + "learning_rate": 0.0001368627532576718, + "loss": 0.9509, + "step": 8588 + }, + { + "epoch": 0.82, + "grad_norm": 0.26983758242587186, + "learning_rate": 0.00013684804735887, + "loss": 1.0852, + "step": 8589 + }, + { + "epoch": 0.82, + "grad_norm": 0.3019798930382428, + "learning_rate": 0.00013683334053792262, + "loss": 1.0498, + "step": 8590 + }, + { + "epoch": 0.82, + "grad_norm": 0.28333579655941465, + "learning_rate": 0.00013681863279519776, + "loss": 1.0397, + "step": 8591 + }, + { + "epoch": 0.82, + "grad_norm": 0.27677050640426154, + "learning_rate": 0.0001368039241310635, + "loss": 1.0302, + "step": 8592 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740272283421442, + "learning_rate": 0.00013678921454588787, + "loss": 0.9894, + "step": 8593 + }, + { + "epoch": 0.82, + "grad_norm": 0.30385602594318123, + "learning_rate": 0.00013677450404003905, + "loss": 0.9967, + "step": 8594 + }, + { + "epoch": 0.82, + "grad_norm": 0.2543360042755725, + "learning_rate": 0.0001367597926138851, + "loss": 1.1073, + "step": 8595 + }, + { + "epoch": 0.82, + "grad_norm": 0.30243436611460256, + "learning_rate": 0.0001367450802677943, + "loss": 1.0471, + "step": 8596 + }, + { + "epoch": 0.82, + "grad_norm": 0.298278417963026, + "learning_rate": 0.00013673036700213476, + "loss": 1.108, + "step": 8597 + }, + { + "epoch": 0.82, + "grad_norm": 0.344466314776565, + "learning_rate": 0.0001367156528172747, + "loss": 1.0931, + "step": 8598 + }, + { + "epoch": 0.82, + "grad_norm": 0.2597226647292173, + "learning_rate": 0.00013670093771358234, + "loss": 1.0871, + "step": 8599 + }, + { + "epoch": 0.82, + "grad_norm": 0.24557911295793564, + "learning_rate": 0.00013668622169142597, + "loss": 1.0619, + "step": 8600 + }, + { + "epoch": 0.82, + "grad_norm": 0.2747878102964318, + "learning_rate": 0.00013667150475117382, + "loss": 1.1528, + "step": 8601 + }, + { + "epoch": 0.82, + "grad_norm": 0.2853209445830605, + "learning_rate": 0.00013665678689319424, + "loss": 1.0169, + "step": 8602 + }, + { + "epoch": 0.82, + "grad_norm": 0.29466027746832646, + "learning_rate": 0.00013664206811785554, + "loss": 1.0538, + "step": 8603 + }, + { + "epoch": 0.82, + "grad_norm": 0.28940310201790015, + "learning_rate": 0.000136627348425526, + "loss": 0.9515, + "step": 8604 + }, + { + "epoch": 0.82, + "grad_norm": 0.28401605740381525, + "learning_rate": 0.0001366126278165741, + "loss": 1.1516, + "step": 8605 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740954715279189, + "learning_rate": 0.00013659790629136817, + "loss": 1.0099, + "step": 8606 + }, + { + "epoch": 0.82, + "grad_norm": 0.2874894186741719, + "learning_rate": 0.00013658318385027665, + "loss": 0.9993, + "step": 8607 + }, + { + "epoch": 0.82, + "grad_norm": 0.2673309894002083, + "learning_rate": 0.0001365684604936679, + "loss": 1.1178, + "step": 8608 + }, + { + "epoch": 0.82, + "grad_norm": 0.28060184644025465, + "learning_rate": 0.0001365537362219105, + "loss": 1.1111, + "step": 8609 + }, + { + "epoch": 0.82, + "grad_norm": 0.262926201140649, + "learning_rate": 0.00013653901103537287, + "loss": 1.0677, + "step": 8610 + }, + { + "epoch": 0.82, + "grad_norm": 0.2603806384626445, + "learning_rate": 0.0001365242849344235, + "loss": 1.0661, + "step": 8611 + }, + { + "epoch": 0.82, + "grad_norm": 0.27821013084562063, + "learning_rate": 0.00013650955791943097, + "loss": 1.1453, + "step": 8612 + }, + { + "epoch": 0.82, + "grad_norm": 0.3025962932247498, + "learning_rate": 0.0001364948299907638, + "loss": 1.2105, + "step": 8613 + }, + { + "epoch": 0.82, + "grad_norm": 0.27742930612541283, + "learning_rate": 0.00013648010114879056, + "loss": 0.9934, + "step": 8614 + }, + { + "epoch": 0.82, + "grad_norm": 0.2436905652104897, + "learning_rate": 0.0001364653713938799, + "loss": 0.98, + "step": 8615 + }, + { + "epoch": 0.82, + "grad_norm": 0.308431748553756, + "learning_rate": 0.00013645064072640036, + "loss": 1.1859, + "step": 8616 + }, + { + "epoch": 0.82, + "grad_norm": 0.32785207627829216, + "learning_rate": 0.00013643590914672065, + "loss": 1.0128, + "step": 8617 + }, + { + "epoch": 0.82, + "grad_norm": 0.3089734401683715, + "learning_rate": 0.00013642117665520938, + "loss": 1.1553, + "step": 8618 + }, + { + "epoch": 0.82, + "grad_norm": 0.3077673526631775, + "learning_rate": 0.0001364064432522353, + "loss": 1.0379, + "step": 8619 + }, + { + "epoch": 0.82, + "grad_norm": 0.29158179240095367, + "learning_rate": 0.00013639170893816713, + "loss": 1.0954, + "step": 8620 + }, + { + "epoch": 0.82, + "grad_norm": 0.2899047445443715, + "learning_rate": 0.00013637697371337353, + "loss": 1.0493, + "step": 8621 + }, + { + "epoch": 0.82, + "grad_norm": 0.2409967915205593, + "learning_rate": 0.0001363622375782233, + "loss": 1.0478, + "step": 8622 + }, + { + "epoch": 0.82, + "grad_norm": 0.294664521205138, + "learning_rate": 0.00013634750053308524, + "loss": 1.1147, + "step": 8623 + }, + { + "epoch": 0.83, + "grad_norm": 0.2691622747456586, + "learning_rate": 0.00013633276257832814, + "loss": 1.0245, + "step": 8624 + }, + { + "epoch": 0.83, + "grad_norm": 0.2662427740875133, + "learning_rate": 0.0001363180237143208, + "loss": 1.1204, + "step": 8625 + }, + { + "epoch": 0.83, + "grad_norm": 0.315565873248549, + "learning_rate": 0.00013630328394143213, + "loss": 1.0977, + "step": 8626 + }, + { + "epoch": 0.83, + "grad_norm": 0.2958160614252357, + "learning_rate": 0.00013628854326003093, + "loss": 1.0883, + "step": 8627 + }, + { + "epoch": 0.83, + "grad_norm": 0.27828748808814535, + "learning_rate": 0.00013627380167048614, + "loss": 0.9, + "step": 8628 + }, + { + "epoch": 0.83, + "grad_norm": 0.2605213690147047, + "learning_rate": 0.00013625905917316665, + "loss": 1.1766, + "step": 8629 + }, + { + "epoch": 0.83, + "grad_norm": 0.28580811147831237, + "learning_rate": 0.00013624431576844144, + "loss": 1.142, + "step": 8630 + }, + { + "epoch": 0.83, + "grad_norm": 0.2645180239308723, + "learning_rate": 0.00013622957145667945, + "loss": 1.0222, + "step": 8631 + }, + { + "epoch": 0.83, + "grad_norm": 0.27332832418491226, + "learning_rate": 0.00013621482623824965, + "loss": 1.1453, + "step": 8632 + }, + { + "epoch": 0.83, + "grad_norm": 0.24290270100131645, + "learning_rate": 0.00013620008011352105, + "loss": 1.0981, + "step": 8633 + }, + { + "epoch": 0.83, + "grad_norm": 0.2738275452675957, + "learning_rate": 0.0001361853330828627, + "loss": 1.087, + "step": 8634 + }, + { + "epoch": 0.83, + "grad_norm": 0.2973345148003589, + "learning_rate": 0.00013617058514664367, + "loss": 1.0528, + "step": 8635 + }, + { + "epoch": 0.83, + "grad_norm": 0.30243238138049405, + "learning_rate": 0.000136155836305233, + "loss": 1.1979, + "step": 8636 + }, + { + "epoch": 0.83, + "grad_norm": 0.2973554678344532, + "learning_rate": 0.0001361410865589998, + "loss": 1.1496, + "step": 8637 + }, + { + "epoch": 0.83, + "grad_norm": 0.27907427949892327, + "learning_rate": 0.00013612633590831319, + "loss": 1.1112, + "step": 8638 + }, + { + "epoch": 0.83, + "grad_norm": 0.2546823283144357, + "learning_rate": 0.00013611158435354232, + "loss": 0.9379, + "step": 8639 + }, + { + "epoch": 0.83, + "grad_norm": 0.24491672704420625, + "learning_rate": 0.0001360968318950564, + "loss": 1.1017, + "step": 8640 + }, + { + "epoch": 0.83, + "grad_norm": 0.2397150988377803, + "learning_rate": 0.00013608207853322454, + "loss": 0.9593, + "step": 8641 + }, + { + "epoch": 0.83, + "grad_norm": 0.27419817711093586, + "learning_rate": 0.00013606732426841596, + "loss": 0.937, + "step": 8642 + }, + { + "epoch": 0.83, + "grad_norm": 0.3159034039720681, + "learning_rate": 0.00013605256910099997, + "loss": 1.1914, + "step": 8643 + }, + { + "epoch": 0.83, + "grad_norm": 0.27134589534535863, + "learning_rate": 0.00013603781303134576, + "loss": 1.1164, + "step": 8644 + }, + { + "epoch": 0.83, + "grad_norm": 0.3025553917420271, + "learning_rate": 0.00013602305605982262, + "loss": 1.0479, + "step": 8645 + }, + { + "epoch": 0.83, + "grad_norm": 0.27129927828178074, + "learning_rate": 0.0001360082981867999, + "loss": 1.1052, + "step": 8646 + }, + { + "epoch": 0.83, + "grad_norm": 0.31517001904029224, + "learning_rate": 0.00013599353941264684, + "loss": 1.0203, + "step": 8647 + }, + { + "epoch": 0.83, + "grad_norm": 0.2851479531322408, + "learning_rate": 0.0001359787797377329, + "loss": 1.0592, + "step": 8648 + }, + { + "epoch": 0.83, + "grad_norm": 0.2856016662841958, + "learning_rate": 0.00013596401916242732, + "loss": 1.0729, + "step": 8649 + }, + { + "epoch": 0.83, + "grad_norm": 0.2847359988614896, + "learning_rate": 0.00013594925768709959, + "loss": 1.0519, + "step": 8650 + }, + { + "epoch": 0.83, + "grad_norm": 0.2656376058979404, + "learning_rate": 0.00013593449531211908, + "loss": 1.0247, + "step": 8651 + }, + { + "epoch": 0.83, + "grad_norm": 0.2591314203645854, + "learning_rate": 0.00013591973203785524, + "loss": 1.099, + "step": 8652 + }, + { + "epoch": 0.83, + "grad_norm": 0.28779350087950495, + "learning_rate": 0.00013590496786467754, + "loss": 1.054, + "step": 8653 + }, + { + "epoch": 0.83, + "grad_norm": 0.2546447105719811, + "learning_rate": 0.00013589020279295544, + "loss": 1.0369, + "step": 8654 + }, + { + "epoch": 0.83, + "grad_norm": 0.33112794737534357, + "learning_rate": 0.00013587543682305847, + "loss": 1.0239, + "step": 8655 + }, + { + "epoch": 0.83, + "grad_norm": 0.261490090551106, + "learning_rate": 0.00013586066995535616, + "loss": 1.0201, + "step": 8656 + }, + { + "epoch": 0.83, + "grad_norm": 0.2880468769072062, + "learning_rate": 0.000135845902190218, + "loss": 1.1193, + "step": 8657 + }, + { + "epoch": 0.83, + "grad_norm": 0.288557868149757, + "learning_rate": 0.00013583113352801367, + "loss": 1.1048, + "step": 8658 + }, + { + "epoch": 0.83, + "grad_norm": 0.29517378742374495, + "learning_rate": 0.00013581636396911266, + "loss": 1.0905, + "step": 8659 + }, + { + "epoch": 0.83, + "grad_norm": 0.2617708019113313, + "learning_rate": 0.00013580159351388464, + "loss": 1.0682, + "step": 8660 + }, + { + "epoch": 0.83, + "grad_norm": 0.31436033052025997, + "learning_rate": 0.00013578682216269927, + "loss": 0.9827, + "step": 8661 + }, + { + "epoch": 0.83, + "grad_norm": 0.2710558291831824, + "learning_rate": 0.00013577204991592617, + "loss": 1.0678, + "step": 8662 + }, + { + "epoch": 0.83, + "grad_norm": 0.30589524951652153, + "learning_rate": 0.000135757276773935, + "loss": 1.0166, + "step": 8663 + }, + { + "epoch": 0.83, + "grad_norm": 0.31535012911315774, + "learning_rate": 0.00013574250273709555, + "loss": 1.0809, + "step": 8664 + }, + { + "epoch": 0.83, + "grad_norm": 0.34357963327522195, + "learning_rate": 0.0001357277278057775, + "loss": 1.0025, + "step": 8665 + }, + { + "epoch": 0.83, + "grad_norm": 0.29336089111450286, + "learning_rate": 0.0001357129519803506, + "loss": 1.14, + "step": 8666 + }, + { + "epoch": 0.83, + "grad_norm": 0.2988503565133178, + "learning_rate": 0.00013569817526118465, + "loss": 0.9728, + "step": 8667 + }, + { + "epoch": 0.83, + "grad_norm": 0.2586925578598481, + "learning_rate": 0.0001356833976486494, + "loss": 1.0163, + "step": 8668 + }, + { + "epoch": 0.83, + "grad_norm": 0.2937963873608329, + "learning_rate": 0.0001356686191431147, + "loss": 1.0555, + "step": 8669 + }, + { + "epoch": 0.83, + "grad_norm": 0.29571540658867496, + "learning_rate": 0.0001356538397449504, + "loss": 1.0379, + "step": 8670 + }, + { + "epoch": 0.83, + "grad_norm": 0.27868553879152896, + "learning_rate": 0.00013563905945452638, + "loss": 1.0352, + "step": 8671 + }, + { + "epoch": 0.83, + "grad_norm": 0.29105897745078474, + "learning_rate": 0.00013562427827221244, + "loss": 0.993, + "step": 8672 + }, + { + "epoch": 0.83, + "grad_norm": 0.24758183366141243, + "learning_rate": 0.0001356094961983786, + "loss": 0.8968, + "step": 8673 + }, + { + "epoch": 0.83, + "grad_norm": 0.2605888157367459, + "learning_rate": 0.0001355947132333947, + "loss": 1.0564, + "step": 8674 + }, + { + "epoch": 0.83, + "grad_norm": 0.2924270653891825, + "learning_rate": 0.00013557992937763077, + "loss": 1.0911, + "step": 8675 + }, + { + "epoch": 0.83, + "grad_norm": 0.3153612885581894, + "learning_rate": 0.00013556514463145672, + "loss": 1.0308, + "step": 8676 + }, + { + "epoch": 0.83, + "grad_norm": 0.27974012653020464, + "learning_rate": 0.00013555035899524257, + "loss": 1.0977, + "step": 8677 + }, + { + "epoch": 0.83, + "grad_norm": 0.2686146240839227, + "learning_rate": 0.00013553557246935834, + "loss": 0.9855, + "step": 8678 + }, + { + "epoch": 0.83, + "grad_norm": 0.2809894346574622, + "learning_rate": 0.00013552078505417412, + "loss": 1.1707, + "step": 8679 + }, + { + "epoch": 0.83, + "grad_norm": 0.2869178669204025, + "learning_rate": 0.00013550599675005986, + "loss": 1.1491, + "step": 8680 + }, + { + "epoch": 0.83, + "grad_norm": 0.2955879869986699, + "learning_rate": 0.00013549120755738576, + "loss": 1.0608, + "step": 8681 + }, + { + "epoch": 0.83, + "grad_norm": 0.2875101038157737, + "learning_rate": 0.00013547641747652187, + "loss": 1.0307, + "step": 8682 + }, + { + "epoch": 0.83, + "grad_norm": 0.28175365232286614, + "learning_rate": 0.00013546162650783836, + "loss": 1.0323, + "step": 8683 + }, + { + "epoch": 0.83, + "grad_norm": 0.27831603189171494, + "learning_rate": 0.00013544683465170537, + "loss": 1.057, + "step": 8684 + }, + { + "epoch": 0.83, + "grad_norm": 0.3073334793522284, + "learning_rate": 0.00013543204190849303, + "loss": 1.0045, + "step": 8685 + }, + { + "epoch": 0.83, + "grad_norm": 0.29215172333875483, + "learning_rate": 0.00013541724827857157, + "loss": 1.1192, + "step": 8686 + }, + { + "epoch": 0.83, + "grad_norm": 0.2771543039799703, + "learning_rate": 0.00013540245376231122, + "loss": 1.1609, + "step": 8687 + }, + { + "epoch": 0.83, + "grad_norm": 0.29777594479305014, + "learning_rate": 0.00013538765836008224, + "loss": 0.9681, + "step": 8688 + }, + { + "epoch": 0.83, + "grad_norm": 0.2545360895510685, + "learning_rate": 0.00013537286207225484, + "loss": 1.0139, + "step": 8689 + }, + { + "epoch": 0.83, + "grad_norm": 0.26647845539942555, + "learning_rate": 0.00013535806489919935, + "loss": 1.1347, + "step": 8690 + }, + { + "epoch": 0.83, + "grad_norm": 0.29722399863436927, + "learning_rate": 0.00013534326684128605, + "loss": 1.1584, + "step": 8691 + }, + { + "epoch": 0.83, + "grad_norm": 0.30891387675873, + "learning_rate": 0.00013532846789888532, + "loss": 1.0211, + "step": 8692 + }, + { + "epoch": 0.83, + "grad_norm": 0.31589225263631776, + "learning_rate": 0.00013531366807236742, + "loss": 1.0062, + "step": 8693 + }, + { + "epoch": 0.83, + "grad_norm": 0.2972943846537337, + "learning_rate": 0.00013529886736210285, + "loss": 1.0088, + "step": 8694 + }, + { + "epoch": 0.83, + "grad_norm": 0.28937720201774464, + "learning_rate": 0.00013528406576846189, + "loss": 0.9853, + "step": 8695 + }, + { + "epoch": 0.83, + "grad_norm": 0.24765821705508315, + "learning_rate": 0.000135269263291815, + "loss": 0.9191, + "step": 8696 + }, + { + "epoch": 0.83, + "grad_norm": 0.27414169282442363, + "learning_rate": 0.00013525445993253267, + "loss": 1.0309, + "step": 8697 + }, + { + "epoch": 0.83, + "grad_norm": 0.2688265567951683, + "learning_rate": 0.0001352396556909853, + "loss": 1.0866, + "step": 8698 + }, + { + "epoch": 0.83, + "grad_norm": 0.26180708297498495, + "learning_rate": 0.0001352248505675434, + "loss": 1.0702, + "step": 8699 + }, + { + "epoch": 0.83, + "grad_norm": 0.296931683455755, + "learning_rate": 0.00013521004456257748, + "loss": 1.0738, + "step": 8700 + }, + { + "epoch": 0.83, + "grad_norm": 0.2518973661220569, + "learning_rate": 0.0001351952376764581, + "loss": 1.0174, + "step": 8701 + }, + { + "epoch": 0.83, + "grad_norm": 0.30852287173282533, + "learning_rate": 0.00013518042990955575, + "loss": 1.0339, + "step": 8702 + }, + { + "epoch": 0.83, + "grad_norm": 0.28709043823004965, + "learning_rate": 0.000135165621262241, + "loss": 1.127, + "step": 8703 + }, + { + "epoch": 0.83, + "grad_norm": 0.2402225336023348, + "learning_rate": 0.00013515081173488453, + "loss": 1.0512, + "step": 8704 + }, + { + "epoch": 0.83, + "grad_norm": 0.3009799139239725, + "learning_rate": 0.00013513600132785688, + "loss": 0.995, + "step": 8705 + }, + { + "epoch": 0.83, + "grad_norm": 0.31841563271518275, + "learning_rate": 0.0001351211900415287, + "loss": 1.1342, + "step": 8706 + }, + { + "epoch": 0.83, + "grad_norm": 0.2726227896301979, + "learning_rate": 0.00013510637787627068, + "loss": 1.1005, + "step": 8707 + }, + { + "epoch": 0.83, + "grad_norm": 0.30718818182896745, + "learning_rate": 0.0001350915648324535, + "loss": 0.9629, + "step": 8708 + }, + { + "epoch": 0.83, + "grad_norm": 0.24754498668980748, + "learning_rate": 0.00013507675091044787, + "loss": 1.1344, + "step": 8709 + }, + { + "epoch": 0.83, + "grad_norm": 0.3341325246272737, + "learning_rate": 0.00013506193611062444, + "loss": 1.0433, + "step": 8710 + }, + { + "epoch": 0.83, + "grad_norm": 0.2831390747623922, + "learning_rate": 0.0001350471204333541, + "loss": 0.9654, + "step": 8711 + }, + { + "epoch": 0.83, + "grad_norm": 0.27958419761160536, + "learning_rate": 0.0001350323038790075, + "loss": 1.0065, + "step": 8712 + }, + { + "epoch": 0.83, + "grad_norm": 0.3219792061230287, + "learning_rate": 0.00013501748644795548, + "loss": 1.008, + "step": 8713 + }, + { + "epoch": 0.83, + "grad_norm": 0.27010290849276297, + "learning_rate": 0.00013500266814056886, + "loss": 0.9119, + "step": 8714 + }, + { + "epoch": 0.83, + "grad_norm": 0.3035143869003971, + "learning_rate": 0.0001349878489572185, + "loss": 1.0691, + "step": 8715 + }, + { + "epoch": 0.83, + "grad_norm": 0.27992327647557635, + "learning_rate": 0.0001349730288982752, + "loss": 1.067, + "step": 8716 + }, + { + "epoch": 0.83, + "grad_norm": 0.31109391121037, + "learning_rate": 0.00013495820796410987, + "loss": 1.0007, + "step": 8717 + }, + { + "epoch": 0.83, + "grad_norm": 0.30367401228401525, + "learning_rate": 0.00013494338615509344, + "loss": 1.1086, + "step": 8718 + }, + { + "epoch": 0.83, + "grad_norm": 0.27392132119871376, + "learning_rate": 0.00013492856347159678, + "loss": 0.9861, + "step": 8719 + }, + { + "epoch": 0.83, + "grad_norm": 0.318345859783819, + "learning_rate": 0.00013491373991399088, + "loss": 1.042, + "step": 8720 + }, + { + "epoch": 0.83, + "grad_norm": 0.3354485387598985, + "learning_rate": 0.0001348989154826467, + "loss": 1.1203, + "step": 8721 + }, + { + "epoch": 0.83, + "grad_norm": 0.3027876660339266, + "learning_rate": 0.0001348840901779352, + "loss": 1.1517, + "step": 8722 + }, + { + "epoch": 0.83, + "grad_norm": 0.3025089370772015, + "learning_rate": 0.00013486926400022744, + "loss": 1.2937, + "step": 8723 + }, + { + "epoch": 0.83, + "grad_norm": 0.27699234873722095, + "learning_rate": 0.00013485443694989443, + "loss": 1.0503, + "step": 8724 + }, + { + "epoch": 0.83, + "grad_norm": 0.30429620705081767, + "learning_rate": 0.00013483960902730725, + "loss": 1.0699, + "step": 8725 + }, + { + "epoch": 0.83, + "grad_norm": 0.3040291893430194, + "learning_rate": 0.00013482478023283694, + "loss": 1.0117, + "step": 8726 + }, + { + "epoch": 0.83, + "grad_norm": 0.28916049582699416, + "learning_rate": 0.00013480995056685462, + "loss": 1.0908, + "step": 8727 + }, + { + "epoch": 0.84, + "grad_norm": 0.24217865268899172, + "learning_rate": 0.00013479512002973143, + "loss": 1.056, + "step": 8728 + }, + { + "epoch": 0.84, + "grad_norm": 0.27032893449561624, + "learning_rate": 0.00013478028862183846, + "loss": 1.0668, + "step": 8729 + }, + { + "epoch": 0.84, + "grad_norm": 0.27542193970843054, + "learning_rate": 0.00013476545634354692, + "loss": 0.9492, + "step": 8730 + }, + { + "epoch": 0.84, + "grad_norm": 0.23543044415563721, + "learning_rate": 0.000134750623195228, + "loss": 1.0316, + "step": 8731 + }, + { + "epoch": 0.84, + "grad_norm": 0.3223797172938787, + "learning_rate": 0.0001347357891772529, + "loss": 1.1393, + "step": 8732 + }, + { + "epoch": 0.84, + "grad_norm": 0.2998222335002957, + "learning_rate": 0.0001347209542899928, + "loss": 1.1096, + "step": 8733 + }, + { + "epoch": 0.84, + "grad_norm": 0.2845913693982625, + "learning_rate": 0.00013470611853381905, + "loss": 1.0304, + "step": 8734 + }, + { + "epoch": 0.84, + "grad_norm": 0.312734748694851, + "learning_rate": 0.00013469128190910285, + "loss": 1.0816, + "step": 8735 + }, + { + "epoch": 0.84, + "grad_norm": 0.2784778983582345, + "learning_rate": 0.00013467644441621552, + "loss": 0.9477, + "step": 8736 + }, + { + "epoch": 0.84, + "grad_norm": 0.294746691747119, + "learning_rate": 0.00013466160605552836, + "loss": 1.1006, + "step": 8737 + }, + { + "epoch": 0.84, + "grad_norm": 0.2729555754078699, + "learning_rate": 0.00013464676682741275, + "loss": 1.165, + "step": 8738 + }, + { + "epoch": 0.84, + "grad_norm": 0.27914582732602733, + "learning_rate": 0.00013463192673223998, + "loss": 1.0035, + "step": 8739 + }, + { + "epoch": 0.84, + "grad_norm": 0.2759299004442835, + "learning_rate": 0.00013461708577038154, + "loss": 1.1369, + "step": 8740 + }, + { + "epoch": 0.84, + "grad_norm": 0.25997588040826375, + "learning_rate": 0.00013460224394220871, + "loss": 1.0008, + "step": 8741 + }, + { + "epoch": 0.84, + "grad_norm": 0.3006336454437912, + "learning_rate": 0.00013458740124809302, + "loss": 1.0918, + "step": 8742 + }, + { + "epoch": 0.84, + "grad_norm": 0.2865486476111894, + "learning_rate": 0.00013457255768840586, + "loss": 1.0318, + "step": 8743 + }, + { + "epoch": 0.84, + "grad_norm": 0.2610230590960148, + "learning_rate": 0.00013455771326351874, + "loss": 1.0608, + "step": 8744 + }, + { + "epoch": 0.84, + "grad_norm": 0.2747856141015647, + "learning_rate": 0.0001345428679738031, + "loss": 0.9236, + "step": 8745 + }, + { + "epoch": 0.84, + "grad_norm": 0.250840631667433, + "learning_rate": 0.0001345280218196305, + "loss": 1.076, + "step": 8746 + }, + { + "epoch": 0.84, + "grad_norm": 0.3427774684297268, + "learning_rate": 0.0001345131748013724, + "loss": 1.0931, + "step": 8747 + }, + { + "epoch": 0.84, + "grad_norm": 0.30101078325665653, + "learning_rate": 0.0001344983269194005, + "loss": 0.9994, + "step": 8748 + }, + { + "epoch": 0.84, + "grad_norm": 0.285861675215507, + "learning_rate": 0.00013448347817408623, + "loss": 1.0467, + "step": 8749 + }, + { + "epoch": 0.84, + "grad_norm": 0.28454224361070146, + "learning_rate": 0.00013446862856580127, + "loss": 1.0008, + "step": 8750 + }, + { + "epoch": 0.84, + "grad_norm": 0.33159442196308175, + "learning_rate": 0.0001344537780949172, + "loss": 1.206, + "step": 8751 + }, + { + "epoch": 0.84, + "grad_norm": 0.28094929027693405, + "learning_rate": 0.0001344389267618057, + "loss": 1.182, + "step": 8752 + }, + { + "epoch": 0.84, + "grad_norm": 0.2904531787454453, + "learning_rate": 0.0001344240745668384, + "loss": 0.982, + "step": 8753 + }, + { + "epoch": 0.84, + "grad_norm": 0.3124634413609586, + "learning_rate": 0.00013440922151038698, + "loss": 1.0623, + "step": 8754 + }, + { + "epoch": 0.84, + "grad_norm": 0.28809714187038504, + "learning_rate": 0.0001343943675928232, + "loss": 1.1198, + "step": 8755 + }, + { + "epoch": 0.84, + "grad_norm": 0.2873805808293868, + "learning_rate": 0.00013437951281451875, + "loss": 1.0509, + "step": 8756 + }, + { + "epoch": 0.84, + "grad_norm": 0.3130611893567796, + "learning_rate": 0.00013436465717584533, + "loss": 1.0459, + "step": 8757 + }, + { + "epoch": 0.84, + "grad_norm": 0.3002619940368619, + "learning_rate": 0.00013434980067717484, + "loss": 0.9741, + "step": 8758 + }, + { + "epoch": 0.84, + "grad_norm": 0.29994300371561994, + "learning_rate": 0.00013433494331887896, + "loss": 1.0306, + "step": 8759 + }, + { + "epoch": 0.84, + "grad_norm": 0.2709309546499614, + "learning_rate": 0.00013432008510132955, + "loss": 1.0472, + "step": 8760 + }, + { + "epoch": 0.84, + "grad_norm": 0.30275123120474723, + "learning_rate": 0.00013430522602489846, + "loss": 1.0934, + "step": 8761 + }, + { + "epoch": 0.84, + "grad_norm": 0.27267018063077797, + "learning_rate": 0.0001342903660899575, + "loss": 1.0372, + "step": 8762 + }, + { + "epoch": 0.84, + "grad_norm": 0.3179594835836974, + "learning_rate": 0.0001342755052968786, + "loss": 1.0563, + "step": 8763 + }, + { + "epoch": 0.84, + "grad_norm": 0.31426468902967697, + "learning_rate": 0.0001342606436460336, + "loss": 1.1932, + "step": 8764 + }, + { + "epoch": 0.84, + "grad_norm": 0.2715991486632539, + "learning_rate": 0.00013424578113779452, + "loss": 1.0489, + "step": 8765 + }, + { + "epoch": 0.84, + "grad_norm": 0.31900892380705087, + "learning_rate": 0.00013423091777253323, + "loss": 1.109, + "step": 8766 + }, + { + "epoch": 0.84, + "grad_norm": 0.2661697001151372, + "learning_rate": 0.0001342160535506217, + "loss": 1.0726, + "step": 8767 + }, + { + "epoch": 0.84, + "grad_norm": 0.29001722553751685, + "learning_rate": 0.00013420118847243191, + "loss": 1.0504, + "step": 8768 + }, + { + "epoch": 0.84, + "grad_norm": 0.26543562475469634, + "learning_rate": 0.0001341863225383359, + "loss": 1.0057, + "step": 8769 + }, + { + "epoch": 0.84, + "grad_norm": 0.30664308972274495, + "learning_rate": 0.0001341714557487057, + "loss": 1.0982, + "step": 8770 + }, + { + "epoch": 0.84, + "grad_norm": 0.29541897062886224, + "learning_rate": 0.0001341565881039133, + "loss": 1.0859, + "step": 8771 + }, + { + "epoch": 0.84, + "grad_norm": 0.280865811627392, + "learning_rate": 0.00013414171960433085, + "loss": 1.119, + "step": 8772 + }, + { + "epoch": 0.84, + "grad_norm": 0.24611125528299096, + "learning_rate": 0.00013412685025033038, + "loss": 1.1245, + "step": 8773 + }, + { + "epoch": 0.84, + "grad_norm": 0.31141735928913117, + "learning_rate": 0.00013411198004228405, + "loss": 1.0752, + "step": 8774 + }, + { + "epoch": 0.84, + "grad_norm": 0.23851136770012923, + "learning_rate": 0.000134097108980564, + "loss": 1.0644, + "step": 8775 + }, + { + "epoch": 0.84, + "grad_norm": 0.2371811186751139, + "learning_rate": 0.00013408223706554235, + "loss": 1.0813, + "step": 8776 + }, + { + "epoch": 0.84, + "grad_norm": 0.2987397006710603, + "learning_rate": 0.0001340673642975913, + "loss": 1.0765, + "step": 8777 + }, + { + "epoch": 0.84, + "grad_norm": 0.26831922729943924, + "learning_rate": 0.00013405249067708304, + "loss": 1.0317, + "step": 8778 + }, + { + "epoch": 0.84, + "grad_norm": 0.2899859345363733, + "learning_rate": 0.00013403761620438983, + "loss": 1.0402, + "step": 8779 + }, + { + "epoch": 0.84, + "grad_norm": 0.2878652206230178, + "learning_rate": 0.00013402274087988384, + "loss": 0.9933, + "step": 8780 + }, + { + "epoch": 0.84, + "grad_norm": 0.31702536400845066, + "learning_rate": 0.0001340078647039374, + "loss": 1.0253, + "step": 8781 + }, + { + "epoch": 0.84, + "grad_norm": 0.30001278045384056, + "learning_rate": 0.00013399298767692277, + "loss": 1.123, + "step": 8782 + }, + { + "epoch": 0.84, + "grad_norm": 0.2624717683492848, + "learning_rate": 0.00013397810979921227, + "loss": 1.021, + "step": 8783 + }, + { + "epoch": 0.84, + "grad_norm": 0.316080617053837, + "learning_rate": 0.0001339632310711782, + "loss": 1.133, + "step": 8784 + }, + { + "epoch": 0.84, + "grad_norm": 0.27598469027777045, + "learning_rate": 0.00013394835149319292, + "loss": 0.8538, + "step": 8785 + }, + { + "epoch": 0.84, + "grad_norm": 0.29532824662066726, + "learning_rate": 0.00013393347106562884, + "loss": 1.0311, + "step": 8786 + }, + { + "epoch": 0.84, + "grad_norm": 0.3293468526276591, + "learning_rate": 0.00013391858978885828, + "loss": 1.0673, + "step": 8787 + }, + { + "epoch": 0.84, + "grad_norm": 0.2643309414987189, + "learning_rate": 0.00013390370766325373, + "loss": 1.1561, + "step": 8788 + }, + { + "epoch": 0.84, + "grad_norm": 0.2763364535801901, + "learning_rate": 0.00013388882468918758, + "loss": 1.0627, + "step": 8789 + }, + { + "epoch": 0.84, + "grad_norm": 0.2862222208420642, + "learning_rate": 0.0001338739408670323, + "loss": 1.0606, + "step": 8790 + }, + { + "epoch": 0.84, + "grad_norm": 0.2927406980062334, + "learning_rate": 0.00013385905619716032, + "loss": 1.0987, + "step": 8791 + }, + { + "epoch": 0.84, + "grad_norm": 0.27073771357355814, + "learning_rate": 0.00013384417067994423, + "loss": 1.0864, + "step": 8792 + }, + { + "epoch": 0.84, + "grad_norm": 0.26681668946322573, + "learning_rate": 0.00013382928431575648, + "loss": 0.9944, + "step": 8793 + }, + { + "epoch": 0.84, + "grad_norm": 0.2872926853091687, + "learning_rate": 0.00013381439710496962, + "loss": 1.1929, + "step": 8794 + }, + { + "epoch": 0.84, + "grad_norm": 0.29473601888234835, + "learning_rate": 0.00013379950904795625, + "loss": 1.0424, + "step": 8795 + }, + { + "epoch": 0.84, + "grad_norm": 0.2767351991760329, + "learning_rate": 0.0001337846201450889, + "loss": 0.976, + "step": 8796 + }, + { + "epoch": 0.84, + "grad_norm": 0.31278846280951567, + "learning_rate": 0.00013376973039674019, + "loss": 1.0786, + "step": 8797 + }, + { + "epoch": 0.84, + "grad_norm": 0.29106202302244005, + "learning_rate": 0.00013375483980328275, + "loss": 1.0206, + "step": 8798 + }, + { + "epoch": 0.84, + "grad_norm": 0.3062557626049918, + "learning_rate": 0.00013373994836508925, + "loss": 1.0934, + "step": 8799 + }, + { + "epoch": 0.84, + "grad_norm": 0.2930477172648966, + "learning_rate": 0.00013372505608253235, + "loss": 1.0262, + "step": 8800 + }, + { + "epoch": 0.84, + "grad_norm": 0.34634712769514453, + "learning_rate": 0.0001337101629559847, + "loss": 0.9546, + "step": 8801 + }, + { + "epoch": 0.84, + "grad_norm": 0.2772018365006181, + "learning_rate": 0.00013369526898581902, + "loss": 1.0512, + "step": 8802 + }, + { + "epoch": 0.84, + "grad_norm": 0.2709787690909469, + "learning_rate": 0.00013368037417240807, + "loss": 0.994, + "step": 8803 + }, + { + "epoch": 0.84, + "grad_norm": 0.2814766172965834, + "learning_rate": 0.0001336654785161246, + "loss": 1.0845, + "step": 8804 + }, + { + "epoch": 0.84, + "grad_norm": 0.2853983711193067, + "learning_rate": 0.00013365058201734135, + "loss": 0.9669, + "step": 8805 + }, + { + "epoch": 0.84, + "grad_norm": 0.3256691908202577, + "learning_rate": 0.00013363568467643117, + "loss": 1.0751, + "step": 8806 + }, + { + "epoch": 0.84, + "grad_norm": 0.2878851657217033, + "learning_rate": 0.00013362078649376683, + "loss": 1.045, + "step": 8807 + }, + { + "epoch": 0.84, + "grad_norm": 0.30585377114209605, + "learning_rate": 0.00013360588746972118, + "loss": 1.0328, + "step": 8808 + }, + { + "epoch": 0.84, + "grad_norm": 0.2793018113302156, + "learning_rate": 0.00013359098760466707, + "loss": 1.123, + "step": 8809 + }, + { + "epoch": 0.84, + "grad_norm": 0.25457785398487776, + "learning_rate": 0.0001335760868989774, + "loss": 1.1305, + "step": 8810 + }, + { + "epoch": 0.84, + "grad_norm": 0.26931250840260823, + "learning_rate": 0.00013356118535302503, + "loss": 1.1142, + "step": 8811 + }, + { + "epoch": 0.84, + "grad_norm": 0.2779912568855338, + "learning_rate": 0.00013354628296718293, + "loss": 1.0774, + "step": 8812 + }, + { + "epoch": 0.84, + "grad_norm": 0.30698926954918604, + "learning_rate": 0.000133531379741824, + "loss": 1.1206, + "step": 8813 + }, + { + "epoch": 0.84, + "grad_norm": 0.28571250644527174, + "learning_rate": 0.0001335164756773212, + "loss": 1.1259, + "step": 8814 + }, + { + "epoch": 0.84, + "grad_norm": 0.30864009381335133, + "learning_rate": 0.00013350157077404755, + "loss": 0.9197, + "step": 8815 + }, + { + "epoch": 0.84, + "grad_norm": 0.2714746153198062, + "learning_rate": 0.00013348666503237603, + "loss": 0.9842, + "step": 8816 + }, + { + "epoch": 0.84, + "grad_norm": 0.28173495612303234, + "learning_rate": 0.0001334717584526797, + "loss": 0.9923, + "step": 8817 + }, + { + "epoch": 0.84, + "grad_norm": 0.3216688395468582, + "learning_rate": 0.00013345685103533154, + "loss": 1.0964, + "step": 8818 + }, + { + "epoch": 0.84, + "grad_norm": 0.2823984993319558, + "learning_rate": 0.00013344194278070467, + "loss": 1.0409, + "step": 8819 + }, + { + "epoch": 0.84, + "grad_norm": 0.257746108679655, + "learning_rate": 0.00013342703368917217, + "loss": 1.1301, + "step": 8820 + }, + { + "epoch": 0.84, + "grad_norm": 0.3051717861148401, + "learning_rate": 0.00013341212376110715, + "loss": 1.0672, + "step": 8821 + }, + { + "epoch": 0.84, + "grad_norm": 0.27852339454292335, + "learning_rate": 0.00013339721299688272, + "loss": 0.9949, + "step": 8822 + }, + { + "epoch": 0.84, + "grad_norm": 0.30905296931037346, + "learning_rate": 0.00013338230139687206, + "loss": 1.0062, + "step": 8823 + }, + { + "epoch": 0.84, + "grad_norm": 0.24655148826104478, + "learning_rate": 0.0001333673889614483, + "loss": 1.001, + "step": 8824 + }, + { + "epoch": 0.84, + "grad_norm": 0.30269701257489295, + "learning_rate": 0.00013335247569098467, + "loss": 1.0285, + "step": 8825 + }, + { + "epoch": 0.84, + "grad_norm": 0.29912297649204367, + "learning_rate": 0.00013333756158585437, + "loss": 1.1004, + "step": 8826 + }, + { + "epoch": 0.84, + "grad_norm": 0.29035499574876744, + "learning_rate": 0.00013332264664643067, + "loss": 1.053, + "step": 8827 + }, + { + "epoch": 0.84, + "grad_norm": 0.2869996333200353, + "learning_rate": 0.00013330773087308676, + "loss": 1.0357, + "step": 8828 + }, + { + "epoch": 0.84, + "grad_norm": 0.29232380763052546, + "learning_rate": 0.00013329281426619597, + "loss": 1.0288, + "step": 8829 + }, + { + "epoch": 0.84, + "grad_norm": 0.2495577755639189, + "learning_rate": 0.0001332778968261316, + "loss": 1.0611, + "step": 8830 + }, + { + "epoch": 0.84, + "grad_norm": 0.2979321424581524, + "learning_rate": 0.0001332629785532669, + "loss": 0.9719, + "step": 8831 + }, + { + "epoch": 0.84, + "grad_norm": 0.2821090849292814, + "learning_rate": 0.0001332480594479753, + "loss": 1.1135, + "step": 8832 + }, + { + "epoch": 0.85, + "grad_norm": 0.3016300483432541, + "learning_rate": 0.0001332331395106301, + "loss": 1.1395, + "step": 8833 + }, + { + "epoch": 0.85, + "grad_norm": 0.3017945019892449, + "learning_rate": 0.00013321821874160472, + "loss": 0.9148, + "step": 8834 + }, + { + "epoch": 0.85, + "grad_norm": 0.25675693104083414, + "learning_rate": 0.00013320329714127248, + "loss": 1.0208, + "step": 8835 + }, + { + "epoch": 0.85, + "grad_norm": 0.30879551976248903, + "learning_rate": 0.0001331883747100069, + "loss": 1.1158, + "step": 8836 + }, + { + "epoch": 0.85, + "grad_norm": 0.2910089252086533, + "learning_rate": 0.0001331734514481814, + "loss": 1.0531, + "step": 8837 + }, + { + "epoch": 0.85, + "grad_norm": 0.2800038334110076, + "learning_rate": 0.0001331585273561694, + "loss": 1.0722, + "step": 8838 + }, + { + "epoch": 0.85, + "grad_norm": 0.29386882904498585, + "learning_rate": 0.00013314360243434442, + "loss": 1.0914, + "step": 8839 + }, + { + "epoch": 0.85, + "grad_norm": 0.32822389242666516, + "learning_rate": 0.00013312867668307998, + "loss": 1.0664, + "step": 8840 + }, + { + "epoch": 0.85, + "grad_norm": 0.28150462602342563, + "learning_rate": 0.00013311375010274958, + "loss": 1.0505, + "step": 8841 + }, + { + "epoch": 0.85, + "grad_norm": 0.28190761168490724, + "learning_rate": 0.00013309882269372676, + "loss": 1.0243, + "step": 8842 + }, + { + "epoch": 0.85, + "grad_norm": 0.324588959170944, + "learning_rate": 0.00013308389445638508, + "loss": 1.1538, + "step": 8843 + }, + { + "epoch": 0.85, + "grad_norm": 0.313116228461298, + "learning_rate": 0.0001330689653910982, + "loss": 1.0932, + "step": 8844 + }, + { + "epoch": 0.85, + "grad_norm": 0.3028011762250344, + "learning_rate": 0.00013305403549823962, + "loss": 1.0032, + "step": 8845 + }, + { + "epoch": 0.85, + "grad_norm": 0.2679913864423184, + "learning_rate": 0.00013303910477818306, + "loss": 1.0489, + "step": 8846 + }, + { + "epoch": 0.85, + "grad_norm": 0.23253134929888603, + "learning_rate": 0.00013302417323130214, + "loss": 1.0339, + "step": 8847 + }, + { + "epoch": 0.85, + "grad_norm": 0.32940341276625706, + "learning_rate": 0.00013300924085797052, + "loss": 1.0542, + "step": 8848 + }, + { + "epoch": 0.85, + "grad_norm": 0.32063577062079657, + "learning_rate": 0.0001329943076585619, + "loss": 1.029, + "step": 8849 + }, + { + "epoch": 0.85, + "grad_norm": 0.30995947199370877, + "learning_rate": 0.00013297937363345, + "loss": 1.0355, + "step": 8850 + }, + { + "epoch": 0.85, + "grad_norm": 0.25561792258910343, + "learning_rate": 0.00013296443878300858, + "loss": 1.0827, + "step": 8851 + }, + { + "epoch": 0.85, + "grad_norm": 0.26801538052648793, + "learning_rate": 0.0001329495031076113, + "loss": 1.0586, + "step": 8852 + }, + { + "epoch": 0.85, + "grad_norm": 0.2728699949209647, + "learning_rate": 0.00013293456660763204, + "loss": 1.033, + "step": 8853 + }, + { + "epoch": 0.85, + "grad_norm": 0.2807693199525417, + "learning_rate": 0.00013291962928344456, + "loss": 1.0447, + "step": 8854 + }, + { + "epoch": 0.85, + "grad_norm": 0.30144064538223736, + "learning_rate": 0.00013290469113542264, + "loss": 1.0855, + "step": 8855 + }, + { + "epoch": 0.85, + "grad_norm": 0.2871386350695731, + "learning_rate": 0.00013288975216394015, + "loss": 1.045, + "step": 8856 + }, + { + "epoch": 0.85, + "grad_norm": 0.2783823690348202, + "learning_rate": 0.00013287481236937094, + "loss": 1.0184, + "step": 8857 + }, + { + "epoch": 0.85, + "grad_norm": 0.28082328661100076, + "learning_rate": 0.0001328598717520889, + "loss": 1.11, + "step": 8858 + }, + { + "epoch": 0.85, + "grad_norm": 0.2537605984201099, + "learning_rate": 0.00013284493031246792, + "loss": 1.0461, + "step": 8859 + }, + { + "epoch": 0.85, + "grad_norm": 0.29693120122561123, + "learning_rate": 0.00013282998805088191, + "loss": 0.9376, + "step": 8860 + }, + { + "epoch": 0.85, + "grad_norm": 0.2736943955631886, + "learning_rate": 0.0001328150449677048, + "loss": 1.1058, + "step": 8861 + }, + { + "epoch": 0.85, + "grad_norm": 0.3420821362503677, + "learning_rate": 0.00013280010106331058, + "loss": 1.1671, + "step": 8862 + }, + { + "epoch": 0.85, + "grad_norm": 0.2989694017306944, + "learning_rate": 0.00013278515633807322, + "loss": 1.1161, + "step": 8863 + }, + { + "epoch": 0.85, + "grad_norm": 0.24965561624077762, + "learning_rate": 0.00013277021079236673, + "loss": 0.9428, + "step": 8864 + }, + { + "epoch": 0.85, + "grad_norm": 0.27560232204493745, + "learning_rate": 0.0001327552644265651, + "loss": 1.0965, + "step": 8865 + }, + { + "epoch": 0.85, + "grad_norm": 0.31916956119622164, + "learning_rate": 0.0001327403172410424, + "loss": 1.1617, + "step": 8866 + }, + { + "epoch": 0.85, + "grad_norm": 0.26752857028568255, + "learning_rate": 0.00013272536923617266, + "loss": 0.9989, + "step": 8867 + }, + { + "epoch": 0.85, + "grad_norm": 0.2883358139977724, + "learning_rate": 0.00013271042041233003, + "loss": 0.9731, + "step": 8868 + }, + { + "epoch": 0.85, + "grad_norm": 0.33884977680481115, + "learning_rate": 0.00013269547076988854, + "loss": 1.1386, + "step": 8869 + }, + { + "epoch": 0.85, + "grad_norm": 0.32546200445254864, + "learning_rate": 0.00013268052030922237, + "loss": 0.9609, + "step": 8870 + }, + { + "epoch": 0.85, + "grad_norm": 0.2826007467613128, + "learning_rate": 0.00013266556903070563, + "loss": 1.0233, + "step": 8871 + }, + { + "epoch": 0.85, + "grad_norm": 0.2778261537338506, + "learning_rate": 0.00013265061693471246, + "loss": 1.0323, + "step": 8872 + }, + { + "epoch": 0.85, + "grad_norm": 0.26913753452272493, + "learning_rate": 0.00013263566402161713, + "loss": 0.9943, + "step": 8873 + }, + { + "epoch": 0.85, + "grad_norm": 0.26448807283032155, + "learning_rate": 0.0001326207102917938, + "loss": 1.0572, + "step": 8874 + }, + { + "epoch": 0.85, + "grad_norm": 0.3046847456760344, + "learning_rate": 0.00013260575574561666, + "loss": 1.0784, + "step": 8875 + }, + { + "epoch": 0.85, + "grad_norm": 0.297455531468288, + "learning_rate": 0.00013259080038345998, + "loss": 1.1246, + "step": 8876 + }, + { + "epoch": 0.85, + "grad_norm": 0.31150901502574146, + "learning_rate": 0.0001325758442056981, + "loss": 1.1541, + "step": 8877 + }, + { + "epoch": 0.85, + "grad_norm": 0.32499878866127807, + "learning_rate": 0.00013256088721270518, + "loss": 1.1098, + "step": 8878 + }, + { + "epoch": 0.85, + "grad_norm": 0.30353184520322335, + "learning_rate": 0.00013254592940485562, + "loss": 1.1114, + "step": 8879 + }, + { + "epoch": 0.85, + "grad_norm": 0.2765014106506231, + "learning_rate": 0.00013253097078252374, + "loss": 1.0607, + "step": 8880 + }, + { + "epoch": 0.85, + "grad_norm": 0.3232413136845846, + "learning_rate": 0.00013251601134608385, + "loss": 1.0267, + "step": 8881 + }, + { + "epoch": 0.85, + "grad_norm": 0.2576596041228671, + "learning_rate": 0.00013250105109591034, + "loss": 1.1395, + "step": 8882 + }, + { + "epoch": 0.85, + "grad_norm": 0.299087343065221, + "learning_rate": 0.00013248609003237762, + "loss": 1.0855, + "step": 8883 + }, + { + "epoch": 0.85, + "grad_norm": 0.31373524675219927, + "learning_rate": 0.00013247112815586008, + "loss": 1.0168, + "step": 8884 + }, + { + "epoch": 0.85, + "grad_norm": 0.29307123636308, + "learning_rate": 0.00013245616546673212, + "loss": 1.1391, + "step": 8885 + }, + { + "epoch": 0.85, + "grad_norm": 0.2892716141501793, + "learning_rate": 0.00013244120196536825, + "loss": 1.0368, + "step": 8886 + }, + { + "epoch": 0.85, + "grad_norm": 0.30030230235650646, + "learning_rate": 0.0001324262376521429, + "loss": 0.9953, + "step": 8887 + }, + { + "epoch": 0.85, + "grad_norm": 0.2958420718954474, + "learning_rate": 0.00013241127252743056, + "loss": 1.0635, + "step": 8888 + }, + { + "epoch": 0.85, + "grad_norm": 0.30426355904923175, + "learning_rate": 0.00013239630659160577, + "loss": 0.9941, + "step": 8889 + }, + { + "epoch": 0.85, + "grad_norm": 0.26204874988846405, + "learning_rate": 0.00013238133984504305, + "loss": 1.0014, + "step": 8890 + }, + { + "epoch": 0.85, + "grad_norm": 0.2808476806284864, + "learning_rate": 0.00013236637228811695, + "loss": 0.9811, + "step": 8891 + }, + { + "epoch": 0.85, + "grad_norm": 0.2613165358148853, + "learning_rate": 0.00013235140392120202, + "loss": 1.0464, + "step": 8892 + }, + { + "epoch": 0.85, + "grad_norm": 0.2614569034666025, + "learning_rate": 0.0001323364347446729, + "loss": 1.0346, + "step": 8893 + }, + { + "epoch": 0.85, + "grad_norm": 0.3048725191683244, + "learning_rate": 0.00013232146475890415, + "loss": 1.1072, + "step": 8894 + }, + { + "epoch": 0.85, + "grad_norm": 0.3051855377395641, + "learning_rate": 0.00013230649396427048, + "loss": 1.2126, + "step": 8895 + }, + { + "epoch": 0.85, + "grad_norm": 0.2885176846905764, + "learning_rate": 0.00013229152236114646, + "loss": 1.1485, + "step": 8896 + }, + { + "epoch": 0.85, + "grad_norm": 0.27063874711622693, + "learning_rate": 0.0001322765499499068, + "loss": 1.0701, + "step": 8897 + }, + { + "epoch": 0.85, + "grad_norm": 0.2650234430261914, + "learning_rate": 0.0001322615767309262, + "loss": 0.9788, + "step": 8898 + }, + { + "epoch": 0.85, + "grad_norm": 0.3332394548518117, + "learning_rate": 0.00013224660270457937, + "loss": 1.0501, + "step": 8899 + }, + { + "epoch": 0.85, + "grad_norm": 0.2622983793530935, + "learning_rate": 0.00013223162787124104, + "loss": 1.0524, + "step": 8900 + }, + { + "epoch": 0.85, + "grad_norm": 0.28082192393839644, + "learning_rate": 0.00013221665223128593, + "loss": 1.2141, + "step": 8901 + }, + { + "epoch": 0.85, + "grad_norm": 0.30805730414546934, + "learning_rate": 0.00013220167578508892, + "loss": 1.073, + "step": 8902 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984108714398881, + "learning_rate": 0.00013218669853302467, + "loss": 1.075, + "step": 8903 + }, + { + "epoch": 0.85, + "grad_norm": 0.24970867197130167, + "learning_rate": 0.0001321717204754681, + "loss": 0.9129, + "step": 8904 + }, + { + "epoch": 0.85, + "grad_norm": 0.2976975973384336, + "learning_rate": 0.00013215674161279402, + "loss": 1.0718, + "step": 8905 + }, + { + "epoch": 0.85, + "grad_norm": 0.26473164124738907, + "learning_rate": 0.00013214176194537722, + "loss": 1.1786, + "step": 8906 + }, + { + "epoch": 0.85, + "grad_norm": 0.28433983890808473, + "learning_rate": 0.00013212678147359267, + "loss": 1.1131, + "step": 8907 + }, + { + "epoch": 0.85, + "grad_norm": 0.268486990998593, + "learning_rate": 0.00013211180019781518, + "loss": 1.0515, + "step": 8908 + }, + { + "epoch": 0.85, + "grad_norm": 0.2737650744067927, + "learning_rate": 0.00013209681811841972, + "loss": 1.0995, + "step": 8909 + }, + { + "epoch": 0.85, + "grad_norm": 0.30007514688621423, + "learning_rate": 0.00013208183523578124, + "loss": 1.0624, + "step": 8910 + }, + { + "epoch": 0.85, + "grad_norm": 0.26052045873934276, + "learning_rate": 0.00013206685155027465, + "loss": 1.1553, + "step": 8911 + }, + { + "epoch": 0.85, + "grad_norm": 0.27092778505487736, + "learning_rate": 0.00013205186706227498, + "loss": 1.1842, + "step": 8912 + }, + { + "epoch": 0.85, + "grad_norm": 0.3258021626325885, + "learning_rate": 0.00013203688177215714, + "loss": 1.081, + "step": 8913 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984371253963151, + "learning_rate": 0.00013202189568029625, + "loss": 1.0585, + "step": 8914 + }, + { + "epoch": 0.85, + "grad_norm": 0.28356401437541234, + "learning_rate": 0.00013200690878706724, + "loss": 0.9656, + "step": 8915 + }, + { + "epoch": 0.85, + "grad_norm": 0.27533653430669, + "learning_rate": 0.00013199192109284526, + "loss": 1.0106, + "step": 8916 + }, + { + "epoch": 0.85, + "grad_norm": 0.28571972152442254, + "learning_rate": 0.00013197693259800534, + "loss": 0.9763, + "step": 8917 + }, + { + "epoch": 0.85, + "grad_norm": 0.2560488227643934, + "learning_rate": 0.0001319619433029226, + "loss": 1.182, + "step": 8918 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984853514513387, + "learning_rate": 0.00013194695320797214, + "loss": 1.1465, + "step": 8919 + }, + { + "epoch": 0.85, + "grad_norm": 0.25687016849701755, + "learning_rate": 0.00013193196231352905, + "loss": 0.9482, + "step": 8920 + }, + { + "epoch": 0.85, + "grad_norm": 0.31648422763460987, + "learning_rate": 0.00013191697061996858, + "loss": 1.0639, + "step": 8921 + }, + { + "epoch": 0.85, + "grad_norm": 0.2499110549279051, + "learning_rate": 0.00013190197812766588, + "loss": 1.0542, + "step": 8922 + }, + { + "epoch": 0.85, + "grad_norm": 0.26107365347955813, + "learning_rate": 0.00013188698483699608, + "loss": 1.0763, + "step": 8923 + }, + { + "epoch": 0.85, + "grad_norm": 0.2890340873975123, + "learning_rate": 0.00013187199074833449, + "loss": 1.1517, + "step": 8924 + }, + { + "epoch": 0.85, + "grad_norm": 0.26683546682272963, + "learning_rate": 0.00013185699586205628, + "loss": 1.1019, + "step": 8925 + }, + { + "epoch": 0.85, + "grad_norm": 0.2751530786283772, + "learning_rate": 0.0001318420001785367, + "loss": 1.0552, + "step": 8926 + }, + { + "epoch": 0.85, + "grad_norm": 0.2805940758160267, + "learning_rate": 0.00013182700369815108, + "loss": 1.046, + "step": 8927 + }, + { + "epoch": 0.85, + "grad_norm": 0.2581306634911059, + "learning_rate": 0.00013181200642127468, + "loss": 0.9718, + "step": 8928 + }, + { + "epoch": 0.85, + "grad_norm": 0.2699851519716227, + "learning_rate": 0.00013179700834828282, + "loss": 1.1284, + "step": 8929 + }, + { + "epoch": 0.85, + "grad_norm": 0.2735704940152805, + "learning_rate": 0.00013178200947955087, + "loss": 0.9873, + "step": 8930 + }, + { + "epoch": 0.85, + "grad_norm": 0.29262357118516796, + "learning_rate": 0.00013176700981545414, + "loss": 1.052, + "step": 8931 + }, + { + "epoch": 0.85, + "grad_norm": 0.3021541436530508, + "learning_rate": 0.00013175200935636804, + "loss": 0.9968, + "step": 8932 + }, + { + "epoch": 0.85, + "grad_norm": 0.2994688515381643, + "learning_rate": 0.0001317370081026679, + "loss": 1.0423, + "step": 8933 + }, + { + "epoch": 0.85, + "grad_norm": 0.21622128251751344, + "learning_rate": 0.00013172200605472925, + "loss": 1.0596, + "step": 8934 + }, + { + "epoch": 0.85, + "grad_norm": 0.27357520538102, + "learning_rate": 0.00013170700321292746, + "loss": 1.0868, + "step": 8935 + }, + { + "epoch": 0.85, + "grad_norm": 0.32314314647789527, + "learning_rate": 0.00013169199957763797, + "loss": 1.1866, + "step": 8936 + }, + { + "epoch": 0.86, + "grad_norm": 0.27729229797866944, + "learning_rate": 0.00013167699514923624, + "loss": 1.0354, + "step": 8937 + }, + { + "epoch": 0.86, + "grad_norm": 0.2689304675066156, + "learning_rate": 0.00013166198992809784, + "loss": 1.118, + "step": 8938 + }, + { + "epoch": 0.86, + "grad_norm": 0.281878748265055, + "learning_rate": 0.00013164698391459823, + "loss": 1.0886, + "step": 8939 + }, + { + "epoch": 0.86, + "grad_norm": 0.31972214001347354, + "learning_rate": 0.00013163197710911294, + "loss": 1.0638, + "step": 8940 + }, + { + "epoch": 0.86, + "grad_norm": 0.30752537186921586, + "learning_rate": 0.00013161696951201755, + "loss": 1.0808, + "step": 8941 + }, + { + "epoch": 0.86, + "grad_norm": 0.3169449014565689, + "learning_rate": 0.00013160196112368765, + "loss": 1.0815, + "step": 8942 + }, + { + "epoch": 0.86, + "grad_norm": 0.279353465633034, + "learning_rate": 0.00013158695194449878, + "loss": 0.9856, + "step": 8943 + }, + { + "epoch": 0.86, + "grad_norm": 0.2506893330731972, + "learning_rate": 0.00013157194197482662, + "loss": 1.0427, + "step": 8944 + }, + { + "epoch": 0.86, + "grad_norm": 0.28379173348711845, + "learning_rate": 0.00013155693121504676, + "loss": 1.077, + "step": 8945 + }, + { + "epoch": 0.86, + "grad_norm": 0.35906069380401406, + "learning_rate": 0.00013154191966553488, + "loss": 1.0853, + "step": 8946 + }, + { + "epoch": 0.86, + "grad_norm": 0.2682913120576577, + "learning_rate": 0.0001315269073266666, + "loss": 1.011, + "step": 8947 + }, + { + "epoch": 0.86, + "grad_norm": 0.2686741445506597, + "learning_rate": 0.00013151189419881767, + "loss": 1.0058, + "step": 8948 + }, + { + "epoch": 0.86, + "grad_norm": 0.30034831308239196, + "learning_rate": 0.00013149688028236378, + "loss": 0.999, + "step": 8949 + }, + { + "epoch": 0.86, + "grad_norm": 0.2785651237874469, + "learning_rate": 0.00013148186557768065, + "loss": 1.0743, + "step": 8950 + }, + { + "epoch": 0.86, + "grad_norm": 0.26665224113450825, + "learning_rate": 0.00013146685008514405, + "loss": 1.1498, + "step": 8951 + }, + { + "epoch": 0.86, + "grad_norm": 0.2767238879256942, + "learning_rate": 0.00013145183380512977, + "loss": 0.9933, + "step": 8952 + }, + { + "epoch": 0.86, + "grad_norm": 0.27676417086874466, + "learning_rate": 0.0001314368167380136, + "loss": 1.048, + "step": 8953 + }, + { + "epoch": 0.86, + "grad_norm": 0.26331762630608485, + "learning_rate": 0.00013142179888417127, + "loss": 1.0725, + "step": 8954 + }, + { + "epoch": 0.86, + "grad_norm": 0.2744742895646326, + "learning_rate": 0.00013140678024397876, + "loss": 1.0698, + "step": 8955 + }, + { + "epoch": 0.86, + "grad_norm": 0.3284594696993571, + "learning_rate": 0.00013139176081781176, + "loss": 1.1246, + "step": 8956 + }, + { + "epoch": 0.86, + "grad_norm": 0.25822096562028224, + "learning_rate": 0.00013137674060604627, + "loss": 0.9168, + "step": 8957 + }, + { + "epoch": 0.86, + "grad_norm": 0.26107725204700727, + "learning_rate": 0.0001313617196090581, + "loss": 1.2021, + "step": 8958 + }, + { + "epoch": 0.86, + "grad_norm": 0.3179530192263277, + "learning_rate": 0.0001313466978272232, + "loss": 1.0834, + "step": 8959 + }, + { + "epoch": 0.86, + "grad_norm": 0.2754800386668831, + "learning_rate": 0.00013133167526091746, + "loss": 1.0735, + "step": 8960 + }, + { + "epoch": 0.86, + "grad_norm": 0.28969505946971996, + "learning_rate": 0.00013131665191051686, + "loss": 1.1824, + "step": 8961 + }, + { + "epoch": 0.86, + "grad_norm": 0.3050269917318578, + "learning_rate": 0.0001313016277763974, + "loss": 1.1192, + "step": 8962 + }, + { + "epoch": 0.86, + "grad_norm": 0.28997353889525096, + "learning_rate": 0.00013128660285893502, + "loss": 1.0988, + "step": 8963 + }, + { + "epoch": 0.86, + "grad_norm": 0.2639957610030003, + "learning_rate": 0.00013127157715850572, + "loss": 1.095, + "step": 8964 + }, + { + "epoch": 0.86, + "grad_norm": 0.25243571206061105, + "learning_rate": 0.00013125655067548555, + "loss": 1.0326, + "step": 8965 + }, + { + "epoch": 0.86, + "grad_norm": 0.27769176135258244, + "learning_rate": 0.00013124152341025057, + "loss": 1.0556, + "step": 8966 + }, + { + "epoch": 0.86, + "grad_norm": 0.31119684825896593, + "learning_rate": 0.00013122649536317682, + "loss": 1.0474, + "step": 8967 + }, + { + "epoch": 0.86, + "grad_norm": 0.2899032223624125, + "learning_rate": 0.0001312114665346404, + "loss": 1.1379, + "step": 8968 + }, + { + "epoch": 0.86, + "grad_norm": 0.26474621772474893, + "learning_rate": 0.00013119643692501742, + "loss": 0.9737, + "step": 8969 + }, + { + "epoch": 0.86, + "grad_norm": 0.296070787125954, + "learning_rate": 0.000131181406534684, + "loss": 1.1303, + "step": 8970 + }, + { + "epoch": 0.86, + "grad_norm": 0.28196083416898055, + "learning_rate": 0.00013116637536401626, + "loss": 0.9212, + "step": 8971 + }, + { + "epoch": 0.86, + "grad_norm": 0.33777731349783074, + "learning_rate": 0.00013115134341339042, + "loss": 1.1114, + "step": 8972 + }, + { + "epoch": 0.86, + "grad_norm": 0.2980471209168762, + "learning_rate": 0.00013113631068318262, + "loss": 1.0419, + "step": 8973 + }, + { + "epoch": 0.86, + "grad_norm": 0.34067752191686806, + "learning_rate": 0.00013112127717376906, + "loss": 1.0489, + "step": 8974 + }, + { + "epoch": 0.86, + "grad_norm": 0.2857921528011418, + "learning_rate": 0.000131106242885526, + "loss": 1.095, + "step": 8975 + }, + { + "epoch": 0.86, + "grad_norm": 0.30595650571474364, + "learning_rate": 0.0001310912078188297, + "loss": 1.0579, + "step": 8976 + }, + { + "epoch": 0.86, + "grad_norm": 0.3113974291276129, + "learning_rate": 0.00013107617197405632, + "loss": 1.1008, + "step": 8977 + }, + { + "epoch": 0.86, + "grad_norm": 0.2835220329748002, + "learning_rate": 0.00013106113535158223, + "loss": 1.1758, + "step": 8978 + }, + { + "epoch": 0.86, + "grad_norm": 0.30573600648132765, + "learning_rate": 0.00013104609795178373, + "loss": 1.0587, + "step": 8979 + }, + { + "epoch": 0.86, + "grad_norm": 0.2724899362196218, + "learning_rate": 0.00013103105977503712, + "loss": 1.0202, + "step": 8980 + }, + { + "epoch": 0.86, + "grad_norm": 0.28699936186440506, + "learning_rate": 0.0001310160208217187, + "loss": 1.1057, + "step": 8981 + }, + { + "epoch": 0.86, + "grad_norm": 0.26272510332229243, + "learning_rate": 0.00013100098109220486, + "loss": 1.0776, + "step": 8982 + }, + { + "epoch": 0.86, + "grad_norm": 0.27023197555661166, + "learning_rate": 0.00013098594058687203, + "loss": 1.1075, + "step": 8983 + }, + { + "epoch": 0.86, + "grad_norm": 0.27849525197693725, + "learning_rate": 0.00013097089930609653, + "loss": 1.0556, + "step": 8984 + }, + { + "epoch": 0.86, + "grad_norm": 0.2813628673155407, + "learning_rate": 0.00013095585725025481, + "loss": 0.9954, + "step": 8985 + }, + { + "epoch": 0.86, + "grad_norm": 0.23165959095448188, + "learning_rate": 0.00013094081441972333, + "loss": 1.0472, + "step": 8986 + }, + { + "epoch": 0.86, + "grad_norm": 0.2865879802186648, + "learning_rate": 0.0001309257708148785, + "loss": 1.0757, + "step": 8987 + }, + { + "epoch": 0.86, + "grad_norm": 0.28003705635045895, + "learning_rate": 0.00013091072643609683, + "loss": 1.027, + "step": 8988 + }, + { + "epoch": 0.86, + "grad_norm": 0.2518027494418857, + "learning_rate": 0.0001308956812837548, + "loss": 1.074, + "step": 8989 + }, + { + "epoch": 0.86, + "grad_norm": 0.2578937586135656, + "learning_rate": 0.0001308806353582289, + "loss": 1.0831, + "step": 8990 + }, + { + "epoch": 0.86, + "grad_norm": 0.27260063912350424, + "learning_rate": 0.00013086558865989576, + "loss": 1.0183, + "step": 8991 + }, + { + "epoch": 0.86, + "grad_norm": 0.29385408837160987, + "learning_rate": 0.0001308505411891318, + "loss": 1.0944, + "step": 8992 + }, + { + "epoch": 0.86, + "grad_norm": 0.293490265358121, + "learning_rate": 0.0001308354929463137, + "loss": 1.0724, + "step": 8993 + }, + { + "epoch": 0.86, + "grad_norm": 0.2822276360093321, + "learning_rate": 0.00013082044393181798, + "loss": 1.0708, + "step": 8994 + }, + { + "epoch": 0.86, + "grad_norm": 0.31600027722892315, + "learning_rate": 0.0001308053941460213, + "loss": 1.036, + "step": 8995 + }, + { + "epoch": 0.86, + "grad_norm": 0.31974027288364265, + "learning_rate": 0.00013079034358930028, + "loss": 1.1244, + "step": 8996 + }, + { + "epoch": 0.86, + "grad_norm": 0.2636215501400869, + "learning_rate": 0.00013077529226203155, + "loss": 0.9505, + "step": 8997 + }, + { + "epoch": 0.86, + "grad_norm": 0.2732300022362931, + "learning_rate": 0.00013076024016459177, + "loss": 1.0561, + "step": 8998 + }, + { + "epoch": 0.86, + "grad_norm": 0.33093910848931535, + "learning_rate": 0.0001307451872973577, + "loss": 1.0408, + "step": 8999 + }, + { + "epoch": 0.86, + "grad_norm": 0.2880821403188908, + "learning_rate": 0.00013073013366070595, + "loss": 1.052, + "step": 9000 + }, + { + "epoch": 0.86, + "grad_norm": 0.3061067194549959, + "learning_rate": 0.0001307150792550133, + "loss": 1.0279, + "step": 9001 + }, + { + "epoch": 0.86, + "grad_norm": 0.2723856651427697, + "learning_rate": 0.0001307000240806565, + "loss": 1.0259, + "step": 9002 + }, + { + "epoch": 0.86, + "grad_norm": 0.2997832873455384, + "learning_rate": 0.0001306849681380123, + "loss": 1.0171, + "step": 9003 + }, + { + "epoch": 0.86, + "grad_norm": 0.268324862068108, + "learning_rate": 0.00013066991142745746, + "loss": 1.0015, + "step": 9004 + }, + { + "epoch": 0.86, + "grad_norm": 0.3313758049625676, + "learning_rate": 0.00013065485394936886, + "loss": 1.0488, + "step": 9005 + }, + { + "epoch": 0.86, + "grad_norm": 0.2840867112984875, + "learning_rate": 0.00013063979570412324, + "loss": 1.0547, + "step": 9006 + }, + { + "epoch": 0.86, + "grad_norm": 0.2859672508903575, + "learning_rate": 0.0001306247366920975, + "loss": 1.0588, + "step": 9007 + }, + { + "epoch": 0.86, + "grad_norm": 0.29147422535829376, + "learning_rate": 0.00013060967691366844, + "loss": 1.126, + "step": 9008 + }, + { + "epoch": 0.86, + "grad_norm": 0.2736559351409344, + "learning_rate": 0.00013059461636921298, + "loss": 0.9615, + "step": 9009 + }, + { + "epoch": 0.86, + "grad_norm": 0.2772933874203004, + "learning_rate": 0.00013057955505910805, + "loss": 1.085, + "step": 9010 + }, + { + "epoch": 0.86, + "grad_norm": 0.29810412332818575, + "learning_rate": 0.00013056449298373053, + "loss": 1.0038, + "step": 9011 + }, + { + "epoch": 0.86, + "grad_norm": 0.2844185615999324, + "learning_rate": 0.00013054943014345732, + "loss": 1.1034, + "step": 9012 + }, + { + "epoch": 0.86, + "grad_norm": 0.3198834730415027, + "learning_rate": 0.0001305343665386655, + "loss": 1.0527, + "step": 9013 + }, + { + "epoch": 0.86, + "grad_norm": 0.29395624791864494, + "learning_rate": 0.00013051930216973192, + "loss": 1.1132, + "step": 9014 + }, + { + "epoch": 0.86, + "grad_norm": 0.28969238208023507, + "learning_rate": 0.0001305042370370336, + "loss": 0.8879, + "step": 9015 + }, + { + "epoch": 0.86, + "grad_norm": 0.2718247193165485, + "learning_rate": 0.0001304891711409476, + "loss": 0.9514, + "step": 9016 + }, + { + "epoch": 0.86, + "grad_norm": 0.2736779348162901, + "learning_rate": 0.00013047410448185096, + "loss": 1.0625, + "step": 9017 + }, + { + "epoch": 0.86, + "grad_norm": 0.3190676200490636, + "learning_rate": 0.00013045903706012066, + "loss": 1.1119, + "step": 9018 + }, + { + "epoch": 0.86, + "grad_norm": 0.2645211188163359, + "learning_rate": 0.00013044396887613383, + "loss": 1.0451, + "step": 9019 + }, + { + "epoch": 0.86, + "grad_norm": 0.27409629322747, + "learning_rate": 0.00013042889993026757, + "loss": 1.0542, + "step": 9020 + }, + { + "epoch": 0.86, + "grad_norm": 0.3057021252154314, + "learning_rate": 0.00013041383022289893, + "loss": 0.9845, + "step": 9021 + }, + { + "epoch": 0.86, + "grad_norm": 0.31255114430184533, + "learning_rate": 0.00013039875975440508, + "loss": 0.923, + "step": 9022 + }, + { + "epoch": 0.86, + "grad_norm": 0.28633949394067004, + "learning_rate": 0.00013038368852516318, + "loss": 1.0501, + "step": 9023 + }, + { + "epoch": 0.86, + "grad_norm": 0.2635390208946298, + "learning_rate": 0.00013036861653555038, + "loss": 1.0946, + "step": 9024 + }, + { + "epoch": 0.86, + "grad_norm": 0.2938192444403693, + "learning_rate": 0.00013035354378594384, + "loss": 1.0054, + "step": 9025 + }, + { + "epoch": 0.86, + "grad_norm": 0.30536276310624666, + "learning_rate": 0.0001303384702767208, + "loss": 1.0497, + "step": 9026 + }, + { + "epoch": 0.86, + "grad_norm": 0.2790718788746024, + "learning_rate": 0.0001303233960082585, + "loss": 1.0417, + "step": 9027 + }, + { + "epoch": 0.86, + "grad_norm": 0.2816516990804407, + "learning_rate": 0.00013030832098093412, + "loss": 0.9402, + "step": 9028 + }, + { + "epoch": 0.86, + "grad_norm": 0.31401103333965236, + "learning_rate": 0.00013029324519512497, + "loss": 0.9892, + "step": 9029 + }, + { + "epoch": 0.86, + "grad_norm": 0.26148363698940397, + "learning_rate": 0.00013027816865120834, + "loss": 1.0242, + "step": 9030 + }, + { + "epoch": 0.86, + "grad_norm": 0.2998596222624002, + "learning_rate": 0.0001302630913495615, + "loss": 1.0302, + "step": 9031 + }, + { + "epoch": 0.86, + "grad_norm": 0.29176595806240635, + "learning_rate": 0.00013024801329056178, + "loss": 1.2279, + "step": 9032 + }, + { + "epoch": 0.86, + "grad_norm": 0.3344098434780136, + "learning_rate": 0.00013023293447458648, + "loss": 1.0633, + "step": 9033 + }, + { + "epoch": 0.86, + "grad_norm": 0.34296449020035136, + "learning_rate": 0.00013021785490201305, + "loss": 1.2539, + "step": 9034 + }, + { + "epoch": 0.86, + "grad_norm": 0.2564122283684667, + "learning_rate": 0.00013020277457321877, + "loss": 1.0903, + "step": 9035 + }, + { + "epoch": 0.86, + "grad_norm": 0.2917999377183397, + "learning_rate": 0.00013018769348858107, + "loss": 0.9977, + "step": 9036 + }, + { + "epoch": 0.86, + "grad_norm": 0.31170385260658784, + "learning_rate": 0.00013017261164847743, + "loss": 0.8905, + "step": 9037 + }, + { + "epoch": 0.86, + "grad_norm": 0.30015787859342813, + "learning_rate": 0.00013015752905328514, + "loss": 1.0676, + "step": 9038 + }, + { + "epoch": 0.86, + "grad_norm": 0.43157315991514567, + "learning_rate": 0.00013014244570338178, + "loss": 1.1419, + "step": 9039 + }, + { + "epoch": 0.86, + "grad_norm": 0.25061895805404616, + "learning_rate": 0.0001301273615991448, + "loss": 0.9809, + "step": 9040 + }, + { + "epoch": 0.86, + "grad_norm": 0.2678433671294428, + "learning_rate": 0.00013011227674095162, + "loss": 0.951, + "step": 9041 + }, + { + "epoch": 0.87, + "grad_norm": 0.2886930597012415, + "learning_rate": 0.00013009719112917978, + "loss": 1.0616, + "step": 9042 + }, + { + "epoch": 0.87, + "grad_norm": 0.25785608296938134, + "learning_rate": 0.00013008210476420684, + "loss": 1.0351, + "step": 9043 + }, + { + "epoch": 0.87, + "grad_norm": 0.25783062009926794, + "learning_rate": 0.0001300670176464103, + "loss": 1.0019, + "step": 9044 + }, + { + "epoch": 0.87, + "grad_norm": 0.30369123418177635, + "learning_rate": 0.00013005192977616777, + "loss": 0.8641, + "step": 9045 + }, + { + "epoch": 0.87, + "grad_norm": 0.3225428130185902, + "learning_rate": 0.0001300368411538568, + "loss": 1.1576, + "step": 9046 + }, + { + "epoch": 0.87, + "grad_norm": 0.2967976819197218, + "learning_rate": 0.00013002175177985502, + "loss": 1.1703, + "step": 9047 + }, + { + "epoch": 0.87, + "grad_norm": 0.3014813200032349, + "learning_rate": 0.00013000666165454, + "loss": 1.1789, + "step": 9048 + }, + { + "epoch": 0.87, + "grad_norm": 0.2501779735414716, + "learning_rate": 0.00012999157077828944, + "loss": 1.105, + "step": 9049 + }, + { + "epoch": 0.87, + "grad_norm": 0.32187566601028167, + "learning_rate": 0.000129976479151481, + "loss": 1.0835, + "step": 9050 + }, + { + "epoch": 0.87, + "grad_norm": 0.27746438481780444, + "learning_rate": 0.0001299613867744923, + "loss": 1.0821, + "step": 9051 + }, + { + "epoch": 0.87, + "grad_norm": 0.28339489212125185, + "learning_rate": 0.00012994629364770102, + "loss": 1.0247, + "step": 9052 + }, + { + "epoch": 0.87, + "grad_norm": 0.27520392235897523, + "learning_rate": 0.00012993119977148499, + "loss": 0.9562, + "step": 9053 + }, + { + "epoch": 0.87, + "grad_norm": 0.2575142187774334, + "learning_rate": 0.0001299161051462218, + "loss": 1.0923, + "step": 9054 + }, + { + "epoch": 0.87, + "grad_norm": 0.2934030821339123, + "learning_rate": 0.00012990100977228934, + "loss": 1.0186, + "step": 9055 + }, + { + "epoch": 0.87, + "grad_norm": 0.2825842909756287, + "learning_rate": 0.0001298859136500653, + "loss": 1.0244, + "step": 9056 + }, + { + "epoch": 0.87, + "grad_norm": 0.2732801101975911, + "learning_rate": 0.0001298708167799275, + "loss": 1.1346, + "step": 9057 + }, + { + "epoch": 0.87, + "grad_norm": 0.2835565308919408, + "learning_rate": 0.0001298557191622537, + "loss": 0.9529, + "step": 9058 + }, + { + "epoch": 0.87, + "grad_norm": 0.32615560594023263, + "learning_rate": 0.00012984062079742181, + "loss": 1.1499, + "step": 9059 + }, + { + "epoch": 0.87, + "grad_norm": 0.2869135232591449, + "learning_rate": 0.00012982552168580962, + "loss": 0.9194, + "step": 9060 + }, + { + "epoch": 0.87, + "grad_norm": 0.3134787513394852, + "learning_rate": 0.000129810421827795, + "loss": 1.1952, + "step": 9061 + }, + { + "epoch": 0.87, + "grad_norm": 0.32346472417562094, + "learning_rate": 0.0001297953212237558, + "loss": 1.023, + "step": 9062 + }, + { + "epoch": 0.87, + "grad_norm": 0.3367904265131591, + "learning_rate": 0.00012978021987407004, + "loss": 1.1234, + "step": 9063 + }, + { + "epoch": 0.87, + "grad_norm": 0.30432907150859745, + "learning_rate": 0.0001297651177791155, + "loss": 0.9762, + "step": 9064 + }, + { + "epoch": 0.87, + "grad_norm": 0.3161449071160964, + "learning_rate": 0.00012975001493927018, + "loss": 0.9887, + "step": 9065 + }, + { + "epoch": 0.87, + "grad_norm": 0.24592648677051562, + "learning_rate": 0.00012973491135491206, + "loss": 1.0069, + "step": 9066 + }, + { + "epoch": 0.87, + "grad_norm": 0.26653722611873165, + "learning_rate": 0.00012971980702641912, + "loss": 1.0546, + "step": 9067 + }, + { + "epoch": 0.87, + "grad_norm": 0.2862693690630182, + "learning_rate": 0.00012970470195416931, + "loss": 1.1177, + "step": 9068 + }, + { + "epoch": 0.87, + "grad_norm": 0.29138542499171116, + "learning_rate": 0.00012968959613854063, + "loss": 1.1238, + "step": 9069 + }, + { + "epoch": 0.87, + "grad_norm": 0.2859847406201628, + "learning_rate": 0.0001296744895799112, + "loss": 1.047, + "step": 9070 + }, + { + "epoch": 0.87, + "grad_norm": 0.2807399557511384, + "learning_rate": 0.000129659382278659, + "loss": 0.9589, + "step": 9071 + }, + { + "epoch": 0.87, + "grad_norm": 0.28450650670607563, + "learning_rate": 0.0001296442742351621, + "loss": 0.9999, + "step": 9072 + }, + { + "epoch": 0.87, + "grad_norm": 0.31676772977788115, + "learning_rate": 0.0001296291654497986, + "loss": 0.978, + "step": 9073 + }, + { + "epoch": 0.87, + "grad_norm": 0.2784851835452743, + "learning_rate": 0.00012961405592294665, + "loss": 1.0332, + "step": 9074 + }, + { + "epoch": 0.87, + "grad_norm": 0.26832152402985765, + "learning_rate": 0.0001295989456549843, + "loss": 1.0811, + "step": 9075 + }, + { + "epoch": 0.87, + "grad_norm": 0.29162865744847255, + "learning_rate": 0.00012958383464628975, + "loss": 1.1361, + "step": 9076 + }, + { + "epoch": 0.87, + "grad_norm": 0.26672809187008667, + "learning_rate": 0.00012956872289724116, + "loss": 1.0796, + "step": 9077 + }, + { + "epoch": 0.87, + "grad_norm": 0.27076549176531384, + "learning_rate": 0.0001295536104082167, + "loss": 1.1287, + "step": 9078 + }, + { + "epoch": 0.87, + "grad_norm": 0.28696821632322866, + "learning_rate": 0.00012953849717959454, + "loss": 1.1795, + "step": 9079 + }, + { + "epoch": 0.87, + "grad_norm": 0.2955303786908336, + "learning_rate": 0.00012952338321175293, + "loss": 1.0168, + "step": 9080 + }, + { + "epoch": 0.87, + "grad_norm": 0.3039172006339761, + "learning_rate": 0.00012950826850507011, + "loss": 0.9734, + "step": 9081 + }, + { + "epoch": 0.87, + "grad_norm": 0.2680076109464228, + "learning_rate": 0.00012949315305992433, + "loss": 1.1325, + "step": 9082 + }, + { + "epoch": 0.87, + "grad_norm": 0.3197355029722075, + "learning_rate": 0.00012947803687669385, + "loss": 1.1474, + "step": 9083 + }, + { + "epoch": 0.87, + "grad_norm": 0.260643212258335, + "learning_rate": 0.00012946291995575697, + "loss": 1.0691, + "step": 9084 + }, + { + "epoch": 0.87, + "grad_norm": 0.28094689856026667, + "learning_rate": 0.00012944780229749201, + "loss": 1.1027, + "step": 9085 + }, + { + "epoch": 0.87, + "grad_norm": 0.3354262462770161, + "learning_rate": 0.00012943268390227727, + "loss": 0.9943, + "step": 9086 + }, + { + "epoch": 0.87, + "grad_norm": 0.2863437127555651, + "learning_rate": 0.00012941756477049114, + "loss": 1.101, + "step": 9087 + }, + { + "epoch": 0.87, + "grad_norm": 0.24300912831153612, + "learning_rate": 0.00012940244490251197, + "loss": 0.9077, + "step": 9088 + }, + { + "epoch": 0.87, + "grad_norm": 0.259039073094172, + "learning_rate": 0.0001293873242987181, + "loss": 1.117, + "step": 9089 + }, + { + "epoch": 0.87, + "grad_norm": 0.2512219349760124, + "learning_rate": 0.000129372202959488, + "loss": 0.9785, + "step": 9090 + }, + { + "epoch": 0.87, + "grad_norm": 0.27273837493945713, + "learning_rate": 0.00012935708088520007, + "loss": 0.9931, + "step": 9091 + }, + { + "epoch": 0.87, + "grad_norm": 0.27530538774617, + "learning_rate": 0.0001293419580762327, + "loss": 1.0153, + "step": 9092 + }, + { + "epoch": 0.87, + "grad_norm": 0.2926747917554771, + "learning_rate": 0.0001293268345329644, + "loss": 1.0461, + "step": 9093 + }, + { + "epoch": 0.87, + "grad_norm": 0.2500212462256167, + "learning_rate": 0.00012931171025577366, + "loss": 1.0736, + "step": 9094 + }, + { + "epoch": 0.87, + "grad_norm": 0.29111956874370015, + "learning_rate": 0.00012929658524503894, + "loss": 1.0124, + "step": 9095 + }, + { + "epoch": 0.87, + "grad_norm": 0.263782277974996, + "learning_rate": 0.00012928145950113877, + "loss": 0.9819, + "step": 9096 + }, + { + "epoch": 0.87, + "grad_norm": 0.2905200592972542, + "learning_rate": 0.00012926633302445164, + "loss": 1.0002, + "step": 9097 + }, + { + "epoch": 0.87, + "grad_norm": 0.29496184216102744, + "learning_rate": 0.00012925120581535614, + "loss": 1.0787, + "step": 9098 + }, + { + "epoch": 0.87, + "grad_norm": 0.3117510201468771, + "learning_rate": 0.00012923607787423085, + "loss": 1.0186, + "step": 9099 + }, + { + "epoch": 0.87, + "grad_norm": 0.32624449591492205, + "learning_rate": 0.00012922094920145432, + "loss": 1.1453, + "step": 9100 + }, + { + "epoch": 0.87, + "grad_norm": 0.33313379774620716, + "learning_rate": 0.0001292058197974052, + "loss": 1.1103, + "step": 9101 + }, + { + "epoch": 0.87, + "grad_norm": 0.30476232541112913, + "learning_rate": 0.0001291906896624621, + "loss": 1.0197, + "step": 9102 + }, + { + "epoch": 0.87, + "grad_norm": 0.2966527557371042, + "learning_rate": 0.00012917555879700358, + "loss": 1.0609, + "step": 9103 + }, + { + "epoch": 0.87, + "grad_norm": 0.3050234189493314, + "learning_rate": 0.0001291604272014084, + "loss": 1.098, + "step": 9104 + }, + { + "epoch": 0.87, + "grad_norm": 0.27720296715762255, + "learning_rate": 0.0001291452948760552, + "loss": 1.0512, + "step": 9105 + }, + { + "epoch": 0.87, + "grad_norm": 0.3042006172159329, + "learning_rate": 0.00012913016182132268, + "loss": 0.9907, + "step": 9106 + }, + { + "epoch": 0.87, + "grad_norm": 0.3068804882948529, + "learning_rate": 0.00012911502803758954, + "loss": 1.1273, + "step": 9107 + }, + { + "epoch": 0.87, + "grad_norm": 0.25437915672597605, + "learning_rate": 0.00012909989352523455, + "loss": 1.0053, + "step": 9108 + }, + { + "epoch": 0.87, + "grad_norm": 0.2588485696040596, + "learning_rate": 0.00012908475828463643, + "loss": 1.069, + "step": 9109 + }, + { + "epoch": 0.87, + "grad_norm": 0.288489014040999, + "learning_rate": 0.00012906962231617396, + "loss": 1.1162, + "step": 9110 + }, + { + "epoch": 0.87, + "grad_norm": 0.2902499883183751, + "learning_rate": 0.00012905448562022592, + "loss": 0.959, + "step": 9111 + }, + { + "epoch": 0.87, + "grad_norm": 0.2885389820207888, + "learning_rate": 0.00012903934819717108, + "loss": 1.0781, + "step": 9112 + }, + { + "epoch": 0.87, + "grad_norm": 0.2434151493288888, + "learning_rate": 0.00012902421004738833, + "loss": 1.1026, + "step": 9113 + }, + { + "epoch": 0.87, + "grad_norm": 0.27724613447577373, + "learning_rate": 0.0001290090711712565, + "loss": 0.9479, + "step": 9114 + }, + { + "epoch": 0.87, + "grad_norm": 0.2638723940275514, + "learning_rate": 0.00012899393156915438, + "loss": 1.0193, + "step": 9115 + }, + { + "epoch": 0.87, + "grad_norm": 0.3146091758887233, + "learning_rate": 0.00012897879124146094, + "loss": 1.0415, + "step": 9116 + }, + { + "epoch": 0.87, + "grad_norm": 0.2781231516386979, + "learning_rate": 0.00012896365018855502, + "loss": 1.1148, + "step": 9117 + }, + { + "epoch": 0.87, + "grad_norm": 0.2814021450123451, + "learning_rate": 0.00012894850841081555, + "loss": 0.9947, + "step": 9118 + }, + { + "epoch": 0.87, + "grad_norm": 0.2951572978095963, + "learning_rate": 0.0001289333659086215, + "loss": 1.0666, + "step": 9119 + }, + { + "epoch": 0.87, + "grad_norm": 0.2789547266879171, + "learning_rate": 0.00012891822268235175, + "loss": 1.1119, + "step": 9120 + }, + { + "epoch": 0.87, + "grad_norm": 0.2944178727397588, + "learning_rate": 0.0001289030787323853, + "loss": 1.0622, + "step": 9121 + }, + { + "epoch": 0.87, + "grad_norm": 0.3002264804462691, + "learning_rate": 0.00012888793405910117, + "loss": 1.1301, + "step": 9122 + }, + { + "epoch": 0.87, + "grad_norm": 0.29596892828518445, + "learning_rate": 0.0001288727886628783, + "loss": 0.9955, + "step": 9123 + }, + { + "epoch": 0.87, + "grad_norm": 0.28581897006530516, + "learning_rate": 0.00012885764254409577, + "loss": 1.0991, + "step": 9124 + }, + { + "epoch": 0.87, + "grad_norm": 0.2848802047710548, + "learning_rate": 0.0001288424957031326, + "loss": 1.1973, + "step": 9125 + }, + { + "epoch": 0.87, + "grad_norm": 0.30273121467305963, + "learning_rate": 0.00012882734814036783, + "loss": 1.1433, + "step": 9126 + }, + { + "epoch": 0.87, + "grad_norm": 0.26901216004525036, + "learning_rate": 0.00012881219985618058, + "loss": 0.9651, + "step": 9127 + }, + { + "epoch": 0.87, + "grad_norm": 0.29357748445834536, + "learning_rate": 0.0001287970508509499, + "loss": 1.0533, + "step": 9128 + }, + { + "epoch": 0.87, + "grad_norm": 0.28302245682622934, + "learning_rate": 0.00012878190112505496, + "loss": 1.1715, + "step": 9129 + }, + { + "epoch": 0.87, + "grad_norm": 0.32220788805164724, + "learning_rate": 0.0001287667506788748, + "loss": 1.037, + "step": 9130 + }, + { + "epoch": 0.87, + "grad_norm": 0.2838656933293558, + "learning_rate": 0.00012875159951278867, + "loss": 0.9909, + "step": 9131 + }, + { + "epoch": 0.87, + "grad_norm": 0.31812680763347256, + "learning_rate": 0.0001287364476271757, + "loss": 1.0599, + "step": 9132 + }, + { + "epoch": 0.87, + "grad_norm": 0.2729154895834484, + "learning_rate": 0.00012872129502241502, + "loss": 1.0977, + "step": 9133 + }, + { + "epoch": 0.87, + "grad_norm": 0.3125996237172212, + "learning_rate": 0.0001287061416988859, + "loss": 1.0514, + "step": 9134 + }, + { + "epoch": 0.87, + "grad_norm": 0.2807631918763277, + "learning_rate": 0.00012869098765696757, + "loss": 1.051, + "step": 9135 + }, + { + "epoch": 0.87, + "grad_norm": 0.31219828383217524, + "learning_rate": 0.0001286758328970392, + "loss": 1.0464, + "step": 9136 + }, + { + "epoch": 0.87, + "grad_norm": 0.2639358482520139, + "learning_rate": 0.0001286606774194801, + "loss": 1.0578, + "step": 9137 + }, + { + "epoch": 0.87, + "grad_norm": 0.2974661876294612, + "learning_rate": 0.00012864552122466956, + "loss": 0.9595, + "step": 9138 + }, + { + "epoch": 0.87, + "grad_norm": 0.2926598180262982, + "learning_rate": 0.00012863036431298684, + "loss": 1.1708, + "step": 9139 + }, + { + "epoch": 0.87, + "grad_norm": 0.25929024999451794, + "learning_rate": 0.00012861520668481122, + "loss": 0.9799, + "step": 9140 + }, + { + "epoch": 0.87, + "grad_norm": 0.29654960563184474, + "learning_rate": 0.0001286000483405221, + "loss": 1.1523, + "step": 9141 + }, + { + "epoch": 0.87, + "grad_norm": 0.27691848301002864, + "learning_rate": 0.00012858488928049882, + "loss": 1.0334, + "step": 9142 + }, + { + "epoch": 0.87, + "grad_norm": 0.29295271269701484, + "learning_rate": 0.00012856972950512068, + "loss": 1.0546, + "step": 9143 + }, + { + "epoch": 0.87, + "grad_norm": 0.2803146245701744, + "learning_rate": 0.00012855456901476712, + "loss": 0.9898, + "step": 9144 + }, + { + "epoch": 0.87, + "grad_norm": 0.2863989751594063, + "learning_rate": 0.00012853940780981751, + "loss": 1.1504, + "step": 9145 + }, + { + "epoch": 0.88, + "grad_norm": 0.2739972569019825, + "learning_rate": 0.00012852424589065132, + "loss": 0.923, + "step": 9146 + }, + { + "epoch": 0.88, + "grad_norm": 0.25052615701520137, + "learning_rate": 0.0001285090832576479, + "loss": 1.0061, + "step": 9147 + }, + { + "epoch": 0.88, + "grad_norm": 0.3030142194521633, + "learning_rate": 0.00012849391991118683, + "loss": 1.0694, + "step": 9148 + }, + { + "epoch": 0.88, + "grad_norm": 0.2714696456433213, + "learning_rate": 0.00012847875585164745, + "loss": 1.0009, + "step": 9149 + }, + { + "epoch": 0.88, + "grad_norm": 0.28875446135458116, + "learning_rate": 0.00012846359107940931, + "loss": 1.1649, + "step": 9150 + }, + { + "epoch": 0.88, + "grad_norm": 0.3150764287272924, + "learning_rate": 0.00012844842559485192, + "loss": 0.9091, + "step": 9151 + }, + { + "epoch": 0.88, + "grad_norm": 0.27701993372597217, + "learning_rate": 0.00012843325939835483, + "loss": 1.083, + "step": 9152 + }, + { + "epoch": 0.88, + "grad_norm": 0.29338798312190123, + "learning_rate": 0.00012841809249029747, + "loss": 0.9318, + "step": 9153 + }, + { + "epoch": 0.88, + "grad_norm": 0.30076117068870556, + "learning_rate": 0.00012840292487105955, + "loss": 1.0577, + "step": 9154 + }, + { + "epoch": 0.88, + "grad_norm": 0.26916632631906984, + "learning_rate": 0.0001283877565410206, + "loss": 1.0901, + "step": 9155 + }, + { + "epoch": 0.88, + "grad_norm": 0.2863676566010998, + "learning_rate": 0.00012837258750056016, + "loss": 1.0916, + "step": 9156 + }, + { + "epoch": 0.88, + "grad_norm": 0.311513288826919, + "learning_rate": 0.0001283574177500579, + "loss": 0.9895, + "step": 9157 + }, + { + "epoch": 0.88, + "grad_norm": 0.2788747044437777, + "learning_rate": 0.00012834224728989344, + "loss": 1.0058, + "step": 9158 + }, + { + "epoch": 0.88, + "grad_norm": 0.31246123148504296, + "learning_rate": 0.00012832707612044642, + "loss": 1.1208, + "step": 9159 + }, + { + "epoch": 0.88, + "grad_norm": 0.3153487374039508, + "learning_rate": 0.00012831190424209655, + "loss": 1.0619, + "step": 9160 + }, + { + "epoch": 0.88, + "grad_norm": 0.2979832645966896, + "learning_rate": 0.00012829673165522343, + "loss": 1.0788, + "step": 9161 + }, + { + "epoch": 0.88, + "grad_norm": 0.27761648021394475, + "learning_rate": 0.00012828155836020687, + "loss": 1.0676, + "step": 9162 + }, + { + "epoch": 0.88, + "grad_norm": 0.2966190620374098, + "learning_rate": 0.00012826638435742654, + "loss": 1.127, + "step": 9163 + }, + { + "epoch": 0.88, + "grad_norm": 0.2677449239886606, + "learning_rate": 0.0001282512096472621, + "loss": 1.1182, + "step": 9164 + }, + { + "epoch": 0.88, + "grad_norm": 0.2955895529683899, + "learning_rate": 0.00012823603423009347, + "loss": 0.8931, + "step": 9165 + }, + { + "epoch": 0.88, + "grad_norm": 0.2864122311531748, + "learning_rate": 0.0001282208581063003, + "loss": 1.1078, + "step": 9166 + }, + { + "epoch": 0.88, + "grad_norm": 0.2901266851305628, + "learning_rate": 0.00012820568127626242, + "loss": 1.0239, + "step": 9167 + }, + { + "epoch": 0.88, + "grad_norm": 0.3049079996092775, + "learning_rate": 0.00012819050374035962, + "loss": 1.0681, + "step": 9168 + }, + { + "epoch": 0.88, + "grad_norm": 0.27371968074107056, + "learning_rate": 0.0001281753254989718, + "loss": 1.0393, + "step": 9169 + }, + { + "epoch": 0.88, + "grad_norm": 0.31873095311089905, + "learning_rate": 0.0001281601465524787, + "loss": 1.0495, + "step": 9170 + }, + { + "epoch": 0.88, + "grad_norm": 0.33430466740169595, + "learning_rate": 0.00012814496690126027, + "loss": 1.0828, + "step": 9171 + }, + { + "epoch": 0.88, + "grad_norm": 0.2793091890823801, + "learning_rate": 0.00012812978654569635, + "loss": 1.0, + "step": 9172 + }, + { + "epoch": 0.88, + "grad_norm": 0.28127672160759526, + "learning_rate": 0.00012811460548616682, + "loss": 1.1068, + "step": 9173 + }, + { + "epoch": 0.88, + "grad_norm": 0.2822256164444625, + "learning_rate": 0.00012809942372305164, + "loss": 1.1039, + "step": 9174 + }, + { + "epoch": 0.88, + "grad_norm": 0.2989368506283751, + "learning_rate": 0.0001280842412567307, + "loss": 1.0661, + "step": 9175 + }, + { + "epoch": 0.88, + "grad_norm": 0.2787834185347091, + "learning_rate": 0.000128069058087584, + "loss": 1.0754, + "step": 9176 + }, + { + "epoch": 0.88, + "grad_norm": 0.26459335862211264, + "learning_rate": 0.00012805387421599144, + "loss": 1.0921, + "step": 9177 + }, + { + "epoch": 0.88, + "grad_norm": 0.2717879411221796, + "learning_rate": 0.0001280386896423331, + "loss": 1.0256, + "step": 9178 + }, + { + "epoch": 0.88, + "grad_norm": 0.3392781499730835, + "learning_rate": 0.00012802350436698888, + "loss": 0.963, + "step": 9179 + }, + { + "epoch": 0.88, + "grad_norm": 0.3008890143060967, + "learning_rate": 0.0001280083183903389, + "loss": 1.0453, + "step": 9180 + }, + { + "epoch": 0.88, + "grad_norm": 0.2859398503777074, + "learning_rate": 0.00012799313171276308, + "loss": 1.095, + "step": 9181 + }, + { + "epoch": 0.88, + "grad_norm": 0.269640842666525, + "learning_rate": 0.0001279779443346416, + "loss": 1.0724, + "step": 9182 + }, + { + "epoch": 0.88, + "grad_norm": 0.28038845119609257, + "learning_rate": 0.0001279627562563545, + "loss": 1.0738, + "step": 9183 + }, + { + "epoch": 0.88, + "grad_norm": 0.2887167863295914, + "learning_rate": 0.00012794756747828179, + "loss": 1.0641, + "step": 9184 + }, + { + "epoch": 0.88, + "grad_norm": 0.29441549775626563, + "learning_rate": 0.00012793237800080365, + "loss": 1.1956, + "step": 9185 + }, + { + "epoch": 0.88, + "grad_norm": 0.2614449210833528, + "learning_rate": 0.00012791718782430024, + "loss": 0.9941, + "step": 9186 + }, + { + "epoch": 0.88, + "grad_norm": 0.27898220786899924, + "learning_rate": 0.00012790199694915163, + "loss": 1.0125, + "step": 9187 + }, + { + "epoch": 0.88, + "grad_norm": 0.2623293441843174, + "learning_rate": 0.000127886805375738, + "loss": 1.018, + "step": 9188 + }, + { + "epoch": 0.88, + "grad_norm": 0.24474445860858335, + "learning_rate": 0.00012787161310443958, + "loss": 0.9925, + "step": 9189 + }, + { + "epoch": 0.88, + "grad_norm": 0.26691886103805196, + "learning_rate": 0.0001278564201356365, + "loss": 1.0262, + "step": 9190 + }, + { + "epoch": 0.88, + "grad_norm": 0.2996480479285695, + "learning_rate": 0.000127841226469709, + "loss": 1.1404, + "step": 9191 + }, + { + "epoch": 0.88, + "grad_norm": 0.2831005610084299, + "learning_rate": 0.0001278260321070373, + "loss": 1.0199, + "step": 9192 + }, + { + "epoch": 0.88, + "grad_norm": 0.3016403069097424, + "learning_rate": 0.00012781083704800167, + "loss": 1.1395, + "step": 9193 + }, + { + "epoch": 0.88, + "grad_norm": 0.258599684442815, + "learning_rate": 0.00012779564129298233, + "loss": 1.0072, + "step": 9194 + }, + { + "epoch": 0.88, + "grad_norm": 0.28981820294114957, + "learning_rate": 0.00012778044484235964, + "loss": 1.0872, + "step": 9195 + }, + { + "epoch": 0.88, + "grad_norm": 0.2944671368276984, + "learning_rate": 0.0001277652476965139, + "loss": 0.9736, + "step": 9196 + }, + { + "epoch": 0.88, + "grad_norm": 0.32250052614896707, + "learning_rate": 0.0001277500498558253, + "loss": 1.0861, + "step": 9197 + }, + { + "epoch": 0.88, + "grad_norm": 0.33413812354501804, + "learning_rate": 0.00012773485132067428, + "loss": 0.9882, + "step": 9198 + }, + { + "epoch": 0.88, + "grad_norm": 0.2396696634982347, + "learning_rate": 0.00012771965209144122, + "loss": 0.999, + "step": 9199 + }, + { + "epoch": 0.88, + "grad_norm": 0.2871461793248707, + "learning_rate": 0.00012770445216850638, + "loss": 0.9985, + "step": 9200 + }, + { + "epoch": 0.88, + "grad_norm": 0.27681646471608834, + "learning_rate": 0.00012768925155225025, + "loss": 0.9664, + "step": 9201 + }, + { + "epoch": 0.88, + "grad_norm": 0.27957278139129027, + "learning_rate": 0.00012767405024305322, + "loss": 1.0305, + "step": 9202 + }, + { + "epoch": 0.88, + "grad_norm": 0.28753086690595353, + "learning_rate": 0.00012765884824129565, + "loss": 1.0237, + "step": 9203 + }, + { + "epoch": 0.88, + "grad_norm": 0.3111440689789316, + "learning_rate": 0.000127643645547358, + "loss": 1.2017, + "step": 9204 + }, + { + "epoch": 0.88, + "grad_norm": 0.2921443717707693, + "learning_rate": 0.0001276284421616208, + "loss": 1.029, + "step": 9205 + }, + { + "epoch": 0.88, + "grad_norm": 0.30261904265294104, + "learning_rate": 0.00012761323808446447, + "loss": 1.0758, + "step": 9206 + }, + { + "epoch": 0.88, + "grad_norm": 0.25772792131049205, + "learning_rate": 0.00012759803331626948, + "loss": 1.0139, + "step": 9207 + }, + { + "epoch": 0.88, + "grad_norm": 0.3046831790519165, + "learning_rate": 0.00012758282785741638, + "loss": 1.0825, + "step": 9208 + }, + { + "epoch": 0.88, + "grad_norm": 0.3183296731093666, + "learning_rate": 0.00012756762170828566, + "loss": 0.9899, + "step": 9209 + }, + { + "epoch": 0.88, + "grad_norm": 0.2817939566711674, + "learning_rate": 0.0001275524148692579, + "loss": 0.9752, + "step": 9210 + }, + { + "epoch": 0.88, + "grad_norm": 0.2665373598422484, + "learning_rate": 0.0001275372073407136, + "loss": 1.0575, + "step": 9211 + }, + { + "epoch": 0.88, + "grad_norm": 0.2725700632449814, + "learning_rate": 0.00012752199912303345, + "loss": 1.0651, + "step": 9212 + }, + { + "epoch": 0.88, + "grad_norm": 0.29400308946907394, + "learning_rate": 0.00012750679021659794, + "loss": 1.0696, + "step": 9213 + }, + { + "epoch": 0.88, + "grad_norm": 0.297641959611062, + "learning_rate": 0.00012749158062178769, + "loss": 1.0051, + "step": 9214 + }, + { + "epoch": 0.88, + "grad_norm": 0.24521868429916097, + "learning_rate": 0.0001274763703389834, + "loss": 0.9303, + "step": 9215 + }, + { + "epoch": 0.88, + "grad_norm": 0.29297299579392244, + "learning_rate": 0.00012746115936856564, + "loss": 1.1431, + "step": 9216 + }, + { + "epoch": 0.88, + "grad_norm": 0.26851527665671227, + "learning_rate": 0.00012744594771091513, + "loss": 1.0198, + "step": 9217 + }, + { + "epoch": 0.88, + "grad_norm": 0.2819204613169668, + "learning_rate": 0.0001274307353664125, + "loss": 1.1456, + "step": 9218 + }, + { + "epoch": 0.88, + "grad_norm": 0.3038986733258005, + "learning_rate": 0.00012741552233543852, + "loss": 0.9887, + "step": 9219 + }, + { + "epoch": 0.88, + "grad_norm": 0.3047117203847732, + "learning_rate": 0.0001274003086183738, + "loss": 1.069, + "step": 9220 + }, + { + "epoch": 0.88, + "grad_norm": 0.2920407974764749, + "learning_rate": 0.0001273850942155992, + "loss": 1.113, + "step": 9221 + }, + { + "epoch": 0.88, + "grad_norm": 0.28997161723843756, + "learning_rate": 0.0001273698791274954, + "loss": 0.9928, + "step": 9222 + }, + { + "epoch": 0.88, + "grad_norm": 0.2601062102739676, + "learning_rate": 0.00012735466335444314, + "loss": 1.0277, + "step": 9223 + }, + { + "epoch": 0.88, + "grad_norm": 0.24196462571085023, + "learning_rate": 0.00012733944689682325, + "loss": 1.0118, + "step": 9224 + }, + { + "epoch": 0.88, + "grad_norm": 0.27692067285133026, + "learning_rate": 0.00012732422975501653, + "loss": 1.1095, + "step": 9225 + }, + { + "epoch": 0.88, + "grad_norm": 0.25760809183221073, + "learning_rate": 0.0001273090119294038, + "loss": 0.9548, + "step": 9226 + }, + { + "epoch": 0.88, + "grad_norm": 0.25859993613149374, + "learning_rate": 0.00012729379342036587, + "loss": 0.9637, + "step": 9227 + }, + { + "epoch": 0.88, + "grad_norm": 0.2540462826516634, + "learning_rate": 0.00012727857422828359, + "loss": 1.1307, + "step": 9228 + }, + { + "epoch": 0.88, + "grad_norm": 0.2921227707514754, + "learning_rate": 0.00012726335435353785, + "loss": 1.1008, + "step": 9229 + }, + { + "epoch": 0.88, + "grad_norm": 0.2984915373574552, + "learning_rate": 0.00012724813379650954, + "loss": 1.0848, + "step": 9230 + }, + { + "epoch": 0.88, + "grad_norm": 0.3095681979770164, + "learning_rate": 0.00012723291255757957, + "loss": 1.1033, + "step": 9231 + }, + { + "epoch": 0.88, + "grad_norm": 0.28318769682527445, + "learning_rate": 0.00012721769063712884, + "loss": 1.0971, + "step": 9232 + }, + { + "epoch": 0.88, + "grad_norm": 0.27525707454107234, + "learning_rate": 0.00012720246803553828, + "loss": 1.0534, + "step": 9233 + }, + { + "epoch": 0.88, + "grad_norm": 0.28825651516813144, + "learning_rate": 0.0001271872447531889, + "loss": 1.1621, + "step": 9234 + }, + { + "epoch": 0.88, + "grad_norm": 0.3010094105143729, + "learning_rate": 0.0001271720207904616, + "loss": 1.072, + "step": 9235 + }, + { + "epoch": 0.88, + "grad_norm": 0.24956098318442146, + "learning_rate": 0.00012715679614773738, + "loss": 1.2222, + "step": 9236 + }, + { + "epoch": 0.88, + "grad_norm": 0.2764944338239069, + "learning_rate": 0.00012714157082539733, + "loss": 1.0995, + "step": 9237 + }, + { + "epoch": 0.88, + "grad_norm": 0.27634606116433363, + "learning_rate": 0.00012712634482382238, + "loss": 1.036, + "step": 9238 + }, + { + "epoch": 0.88, + "grad_norm": 0.2794870761616325, + "learning_rate": 0.0001271111181433936, + "loss": 1.1254, + "step": 9239 + }, + { + "epoch": 0.88, + "grad_norm": 0.2601345404562058, + "learning_rate": 0.00012709589078449204, + "loss": 1.1053, + "step": 9240 + }, + { + "epoch": 0.88, + "grad_norm": 0.2622935878916725, + "learning_rate": 0.0001270806627474988, + "loss": 1.0455, + "step": 9241 + }, + { + "epoch": 0.88, + "grad_norm": 0.29883490193917805, + "learning_rate": 0.00012706543403279497, + "loss": 0.9762, + "step": 9242 + }, + { + "epoch": 0.88, + "grad_norm": 0.22869752618561462, + "learning_rate": 0.0001270502046407616, + "loss": 0.9916, + "step": 9243 + }, + { + "epoch": 0.88, + "grad_norm": 0.29444348606081083, + "learning_rate": 0.00012703497457177988, + "loss": 0.9847, + "step": 9244 + }, + { + "epoch": 0.88, + "grad_norm": 0.2938071041973925, + "learning_rate": 0.00012701974382623094, + "loss": 1.0545, + "step": 9245 + }, + { + "epoch": 0.88, + "grad_norm": 0.26894685253793144, + "learning_rate": 0.00012700451240449593, + "loss": 1.0102, + "step": 9246 + }, + { + "epoch": 0.88, + "grad_norm": 0.3295306409371883, + "learning_rate": 0.00012698928030695602, + "loss": 1.1135, + "step": 9247 + }, + { + "epoch": 0.88, + "grad_norm": 0.29519953492147355, + "learning_rate": 0.0001269740475339924, + "loss": 1.1716, + "step": 9248 + }, + { + "epoch": 0.88, + "grad_norm": 0.31644779195394845, + "learning_rate": 0.0001269588140859863, + "loss": 1.0338, + "step": 9249 + }, + { + "epoch": 0.88, + "grad_norm": 0.28862661343334506, + "learning_rate": 0.00012694357996331893, + "loss": 0.9805, + "step": 9250 + }, + { + "epoch": 0.89, + "grad_norm": 0.29109092097690287, + "learning_rate": 0.00012692834516637156, + "loss": 1.0948, + "step": 9251 + }, + { + "epoch": 0.89, + "grad_norm": 0.2755208136905377, + "learning_rate": 0.00012691310969552538, + "loss": 1.0859, + "step": 9252 + }, + { + "epoch": 0.89, + "grad_norm": 0.2947036722853116, + "learning_rate": 0.00012689787355116177, + "loss": 1.0358, + "step": 9253 + }, + { + "epoch": 0.89, + "grad_norm": 0.2871135631256799, + "learning_rate": 0.00012688263673366195, + "loss": 1.0249, + "step": 9254 + }, + { + "epoch": 0.89, + "grad_norm": 0.2562542263019815, + "learning_rate": 0.0001268673992434072, + "loss": 1.0182, + "step": 9255 + }, + { + "epoch": 0.89, + "grad_norm": 0.31659182199731395, + "learning_rate": 0.00012685216108077895, + "loss": 1.0291, + "step": 9256 + }, + { + "epoch": 0.89, + "grad_norm": 0.2933670124167318, + "learning_rate": 0.0001268369222461585, + "loss": 1.0631, + "step": 9257 + }, + { + "epoch": 0.89, + "grad_norm": 0.2657225784590937, + "learning_rate": 0.0001268216827399272, + "loss": 1.119, + "step": 9258 + }, + { + "epoch": 0.89, + "grad_norm": 0.2797518168945411, + "learning_rate": 0.00012680644256246642, + "loss": 0.9506, + "step": 9259 + }, + { + "epoch": 0.89, + "grad_norm": 0.3020240632022724, + "learning_rate": 0.00012679120171415757, + "loss": 0.9763, + "step": 9260 + }, + { + "epoch": 0.89, + "grad_norm": 0.2637706589429481, + "learning_rate": 0.00012677596019538206, + "loss": 1.0194, + "step": 9261 + }, + { + "epoch": 0.89, + "grad_norm": 0.31418144937011505, + "learning_rate": 0.0001267607180065213, + "loss": 1.1327, + "step": 9262 + }, + { + "epoch": 0.89, + "grad_norm": 0.2952874649931668, + "learning_rate": 0.00012674547514795675, + "loss": 1.0817, + "step": 9263 + }, + { + "epoch": 0.89, + "grad_norm": 0.3150374662852804, + "learning_rate": 0.00012673023162006989, + "loss": 1.0925, + "step": 9264 + }, + { + "epoch": 0.89, + "grad_norm": 0.3198349522638215, + "learning_rate": 0.0001267149874232422, + "loss": 1.1409, + "step": 9265 + }, + { + "epoch": 0.89, + "grad_norm": 0.25854203157035266, + "learning_rate": 0.00012669974255785516, + "loss": 1.138, + "step": 9266 + }, + { + "epoch": 0.89, + "grad_norm": 0.2608990015480636, + "learning_rate": 0.00012668449702429028, + "loss": 1.0271, + "step": 9267 + }, + { + "epoch": 0.89, + "grad_norm": 0.29909886728393076, + "learning_rate": 0.0001266692508229291, + "loss": 1.0969, + "step": 9268 + }, + { + "epoch": 0.89, + "grad_norm": 0.23909051250886207, + "learning_rate": 0.0001266540039541531, + "loss": 0.9744, + "step": 9269 + }, + { + "epoch": 0.89, + "grad_norm": 0.28087485327219586, + "learning_rate": 0.00012663875641834394, + "loss": 1.1004, + "step": 9270 + }, + { + "epoch": 0.89, + "grad_norm": 0.2552216763566098, + "learning_rate": 0.0001266235082158832, + "loss": 0.9802, + "step": 9271 + }, + { + "epoch": 0.89, + "grad_norm": 0.28763301871999136, + "learning_rate": 0.00012660825934715235, + "loss": 1.0887, + "step": 9272 + }, + { + "epoch": 0.89, + "grad_norm": 0.29630933468752285, + "learning_rate": 0.00012659300981253315, + "loss": 1.1228, + "step": 9273 + }, + { + "epoch": 0.89, + "grad_norm": 0.29194868948582664, + "learning_rate": 0.00012657775961240713, + "loss": 1.0174, + "step": 9274 + }, + { + "epoch": 0.89, + "grad_norm": 0.2617941239283223, + "learning_rate": 0.000126562508747156, + "loss": 1.0721, + "step": 9275 + }, + { + "epoch": 0.89, + "grad_norm": 0.28279751520459484, + "learning_rate": 0.00012654725721716138, + "loss": 1.0261, + "step": 9276 + }, + { + "epoch": 0.89, + "grad_norm": 0.30103442627581245, + "learning_rate": 0.00012653200502280498, + "loss": 1.1181, + "step": 9277 + }, + { + "epoch": 0.89, + "grad_norm": 0.27808862986365707, + "learning_rate": 0.00012651675216446848, + "loss": 1.0745, + "step": 9278 + }, + { + "epoch": 0.89, + "grad_norm": 0.29656831264442357, + "learning_rate": 0.00012650149864253357, + "loss": 1.1474, + "step": 9279 + }, + { + "epoch": 0.89, + "grad_norm": 0.27503893982251604, + "learning_rate": 0.000126486244457382, + "loss": 1.0173, + "step": 9280 + }, + { + "epoch": 0.89, + "grad_norm": 0.2657056986153241, + "learning_rate": 0.00012647098960939554, + "loss": 0.9806, + "step": 9281 + }, + { + "epoch": 0.89, + "grad_norm": 0.25653477829913407, + "learning_rate": 0.0001264557340989559, + "loss": 1.0358, + "step": 9282 + }, + { + "epoch": 0.89, + "grad_norm": 0.29619037014025956, + "learning_rate": 0.0001264404779264449, + "loss": 1.1536, + "step": 9283 + }, + { + "epoch": 0.89, + "grad_norm": 0.2822157801841811, + "learning_rate": 0.00012642522109224434, + "loss": 1.1011, + "step": 9284 + }, + { + "epoch": 0.89, + "grad_norm": 0.31455283414554064, + "learning_rate": 0.000126409963596736, + "loss": 0.9813, + "step": 9285 + }, + { + "epoch": 0.89, + "grad_norm": 0.2775822826731524, + "learning_rate": 0.0001263947054403017, + "loss": 0.951, + "step": 9286 + }, + { + "epoch": 0.89, + "grad_norm": 0.2575236283104343, + "learning_rate": 0.00012637944662332332, + "loss": 0.9371, + "step": 9287 + }, + { + "epoch": 0.89, + "grad_norm": 0.24149231079738087, + "learning_rate": 0.00012636418714618273, + "loss": 0.9715, + "step": 9288 + }, + { + "epoch": 0.89, + "grad_norm": 0.2831753975004831, + "learning_rate": 0.00012634892700926178, + "loss": 1.0039, + "step": 9289 + }, + { + "epoch": 0.89, + "grad_norm": 0.29422901876914576, + "learning_rate": 0.00012633366621294238, + "loss": 1.1093, + "step": 9290 + }, + { + "epoch": 0.89, + "grad_norm": 0.27392177332876777, + "learning_rate": 0.00012631840475760644, + "loss": 1.1547, + "step": 9291 + }, + { + "epoch": 0.89, + "grad_norm": 0.27799764214008904, + "learning_rate": 0.00012630314264363584, + "loss": 1.1274, + "step": 9292 + }, + { + "epoch": 0.89, + "grad_norm": 0.31072988212098307, + "learning_rate": 0.0001262878798714126, + "loss": 1.0469, + "step": 9293 + }, + { + "epoch": 0.89, + "grad_norm": 0.31492709765160437, + "learning_rate": 0.00012627261644131862, + "loss": 0.9578, + "step": 9294 + }, + { + "epoch": 0.89, + "grad_norm": 0.2528035347256275, + "learning_rate": 0.00012625735235373593, + "loss": 1.0668, + "step": 9295 + }, + { + "epoch": 0.89, + "grad_norm": 0.3155214475191695, + "learning_rate": 0.00012624208760904647, + "loss": 0.9916, + "step": 9296 + }, + { + "epoch": 0.89, + "grad_norm": 0.302087235196872, + "learning_rate": 0.00012622682220763228, + "loss": 1.0884, + "step": 9297 + }, + { + "epoch": 0.89, + "grad_norm": 0.2990095102194339, + "learning_rate": 0.00012621155614987538, + "loss": 1.0101, + "step": 9298 + }, + { + "epoch": 0.89, + "grad_norm": 0.26706525825653776, + "learning_rate": 0.00012619628943615782, + "loss": 1.0278, + "step": 9299 + }, + { + "epoch": 0.89, + "grad_norm": 0.27756280255006816, + "learning_rate": 0.00012618102206686166, + "loss": 1.0099, + "step": 9300 + }, + { + "epoch": 0.89, + "grad_norm": 0.30586196916112135, + "learning_rate": 0.00012616575404236899, + "loss": 1.0915, + "step": 9301 + }, + { + "epoch": 0.89, + "grad_norm": 0.25272225815414157, + "learning_rate": 0.0001261504853630618, + "loss": 1.0799, + "step": 9302 + }, + { + "epoch": 0.89, + "grad_norm": 0.3258028723785073, + "learning_rate": 0.00012613521602932237, + "loss": 0.9907, + "step": 9303 + }, + { + "epoch": 0.89, + "grad_norm": 0.3105133570818196, + "learning_rate": 0.00012611994604153269, + "loss": 1.0494, + "step": 9304 + }, + { + "epoch": 0.89, + "grad_norm": 0.2853288951774398, + "learning_rate": 0.00012610467540007494, + "loss": 1.0367, + "step": 9305 + }, + { + "epoch": 0.89, + "grad_norm": 0.30136724902716244, + "learning_rate": 0.00012608940410533127, + "loss": 1.0905, + "step": 9306 + }, + { + "epoch": 0.89, + "grad_norm": 0.27766552230271924, + "learning_rate": 0.00012607413215768388, + "loss": 1.0067, + "step": 9307 + }, + { + "epoch": 0.89, + "grad_norm": 0.2839527667018618, + "learning_rate": 0.00012605885955751497, + "loss": 0.9767, + "step": 9308 + }, + { + "epoch": 0.89, + "grad_norm": 0.30687156353062944, + "learning_rate": 0.0001260435863052067, + "loss": 1.0725, + "step": 9309 + }, + { + "epoch": 0.89, + "grad_norm": 0.3156040190026411, + "learning_rate": 0.0001260283124011413, + "loss": 1.0457, + "step": 9310 + }, + { + "epoch": 0.89, + "grad_norm": 0.3104920835342392, + "learning_rate": 0.00012601303784570106, + "loss": 1.1438, + "step": 9311 + }, + { + "epoch": 0.89, + "grad_norm": 0.27017582473335783, + "learning_rate": 0.0001259977626392682, + "loss": 1.052, + "step": 9312 + }, + { + "epoch": 0.89, + "grad_norm": 0.27144499750698986, + "learning_rate": 0.00012598248678222498, + "loss": 1.0559, + "step": 9313 + }, + { + "epoch": 0.89, + "grad_norm": 0.2675880100657242, + "learning_rate": 0.0001259672102749537, + "loss": 0.9763, + "step": 9314 + }, + { + "epoch": 0.89, + "grad_norm": 0.2955864081602817, + "learning_rate": 0.00012595193311783665, + "loss": 0.9867, + "step": 9315 + }, + { + "epoch": 0.89, + "grad_norm": 0.2813400142046753, + "learning_rate": 0.00012593665531125615, + "loss": 1.0049, + "step": 9316 + }, + { + "epoch": 0.89, + "grad_norm": 0.2831320746009813, + "learning_rate": 0.00012592137685559458, + "loss": 1.029, + "step": 9317 + }, + { + "epoch": 0.89, + "grad_norm": 0.29831547442685147, + "learning_rate": 0.00012590609775123426, + "loss": 1.116, + "step": 9318 + }, + { + "epoch": 0.89, + "grad_norm": 0.2936254006184292, + "learning_rate": 0.00012589081799855756, + "loss": 1.0433, + "step": 9319 + }, + { + "epoch": 0.89, + "grad_norm": 0.30202822877591445, + "learning_rate": 0.00012587553759794683, + "loss": 1.067, + "step": 9320 + }, + { + "epoch": 0.89, + "grad_norm": 0.31153441957615685, + "learning_rate": 0.00012586025654978458, + "loss": 1.0185, + "step": 9321 + }, + { + "epoch": 0.89, + "grad_norm": 0.31241827225447677, + "learning_rate": 0.0001258449748544531, + "loss": 1.0975, + "step": 9322 + }, + { + "epoch": 0.89, + "grad_norm": 0.27730753920433476, + "learning_rate": 0.0001258296925123349, + "loss": 1.069, + "step": 9323 + }, + { + "epoch": 0.89, + "grad_norm": 0.2897016648466612, + "learning_rate": 0.00012581440952381243, + "loss": 1.067, + "step": 9324 + }, + { + "epoch": 0.89, + "grad_norm": 0.24652987673338808, + "learning_rate": 0.0001257991258892681, + "loss": 1.1338, + "step": 9325 + }, + { + "epoch": 0.89, + "grad_norm": 0.3071238167163373, + "learning_rate": 0.00012578384160908445, + "loss": 1.1402, + "step": 9326 + }, + { + "epoch": 0.89, + "grad_norm": 0.29112654720110537, + "learning_rate": 0.00012576855668364396, + "loss": 1.0772, + "step": 9327 + }, + { + "epoch": 0.89, + "grad_norm": 0.28108564003717257, + "learning_rate": 0.00012575327111332912, + "loss": 1.0895, + "step": 9328 + }, + { + "epoch": 0.89, + "grad_norm": 0.2697345251089792, + "learning_rate": 0.00012573798489852253, + "loss": 1.1332, + "step": 9329 + }, + { + "epoch": 0.89, + "grad_norm": 0.304121790273392, + "learning_rate": 0.00012572269803960665, + "loss": 1.0902, + "step": 9330 + }, + { + "epoch": 0.89, + "grad_norm": 0.28853126394768475, + "learning_rate": 0.00012570741053696412, + "loss": 1.0721, + "step": 9331 + }, + { + "epoch": 0.89, + "grad_norm": 0.2851724014266135, + "learning_rate": 0.0001256921223909775, + "loss": 1.1567, + "step": 9332 + }, + { + "epoch": 0.89, + "grad_norm": 0.28817110546138114, + "learning_rate": 0.0001256768336020293, + "loss": 1.1432, + "step": 9333 + }, + { + "epoch": 0.89, + "grad_norm": 0.2674528738842632, + "learning_rate": 0.00012566154417050225, + "loss": 1.1189, + "step": 9334 + }, + { + "epoch": 0.89, + "grad_norm": 0.27322750722188155, + "learning_rate": 0.00012564625409677895, + "loss": 1.0305, + "step": 9335 + }, + { + "epoch": 0.89, + "grad_norm": 0.29977545197889227, + "learning_rate": 0.000125630963381242, + "loss": 1.1117, + "step": 9336 + }, + { + "epoch": 0.89, + "grad_norm": 0.29111294479338623, + "learning_rate": 0.00012561567202427407, + "loss": 0.9916, + "step": 9337 + }, + { + "epoch": 0.89, + "grad_norm": 0.3036268811633583, + "learning_rate": 0.00012560038002625788, + "loss": 1.0984, + "step": 9338 + }, + { + "epoch": 0.89, + "grad_norm": 0.2549172822007168, + "learning_rate": 0.00012558508738757604, + "loss": 1.1815, + "step": 9339 + }, + { + "epoch": 0.89, + "grad_norm": 0.2675936118833171, + "learning_rate": 0.00012556979410861135, + "loss": 0.962, + "step": 9340 + }, + { + "epoch": 0.89, + "grad_norm": 0.2773919289873, + "learning_rate": 0.00012555450018974647, + "loss": 1.1229, + "step": 9341 + }, + { + "epoch": 0.89, + "grad_norm": 0.2813050855847403, + "learning_rate": 0.00012553920563136418, + "loss": 1.1681, + "step": 9342 + }, + { + "epoch": 0.89, + "grad_norm": 0.2798799129707208, + "learning_rate": 0.00012552391043384718, + "loss": 1.1568, + "step": 9343 + }, + { + "epoch": 0.89, + "grad_norm": 0.30424213764742, + "learning_rate": 0.00012550861459757835, + "loss": 1.1593, + "step": 9344 + }, + { + "epoch": 0.89, + "grad_norm": 0.32261170593030764, + "learning_rate": 0.00012549331812294033, + "loss": 1.0733, + "step": 9345 + }, + { + "epoch": 0.89, + "grad_norm": 0.28695317695752437, + "learning_rate": 0.00012547802101031604, + "loss": 1.0534, + "step": 9346 + }, + { + "epoch": 0.89, + "grad_norm": 0.297270830182022, + "learning_rate": 0.00012546272326008828, + "loss": 0.9348, + "step": 9347 + }, + { + "epoch": 0.89, + "grad_norm": 0.32798136328441224, + "learning_rate": 0.00012544742487263983, + "loss": 1.2133, + "step": 9348 + }, + { + "epoch": 0.89, + "grad_norm": 0.276438841709617, + "learning_rate": 0.00012543212584835363, + "loss": 0.9188, + "step": 9349 + }, + { + "epoch": 0.89, + "grad_norm": 0.30713210860480056, + "learning_rate": 0.00012541682618761243, + "loss": 0.936, + "step": 9350 + }, + { + "epoch": 0.89, + "grad_norm": 0.3058264322272426, + "learning_rate": 0.00012540152589079922, + "loss": 1.0953, + "step": 9351 + }, + { + "epoch": 0.89, + "grad_norm": 0.3086810532233868, + "learning_rate": 0.00012538622495829687, + "loss": 1.0574, + "step": 9352 + }, + { + "epoch": 0.89, + "grad_norm": 0.26492015598732366, + "learning_rate": 0.00012537092339048829, + "loss": 1.0913, + "step": 9353 + }, + { + "epoch": 0.89, + "grad_norm": 0.2613494790949844, + "learning_rate": 0.00012535562118775638, + "loss": 1.0103, + "step": 9354 + }, + { + "epoch": 0.9, + "grad_norm": 0.2745257568238265, + "learning_rate": 0.00012534031835048412, + "loss": 1.016, + "step": 9355 + }, + { + "epoch": 0.9, + "grad_norm": 0.32547504865315513, + "learning_rate": 0.00012532501487905447, + "loss": 1.0429, + "step": 9356 + }, + { + "epoch": 0.9, + "grad_norm": 0.25947151074133556, + "learning_rate": 0.0001253097107738504, + "loss": 1.0684, + "step": 9357 + }, + { + "epoch": 0.9, + "grad_norm": 0.2749564028335704, + "learning_rate": 0.00012529440603525495, + "loss": 1.0026, + "step": 9358 + }, + { + "epoch": 0.9, + "grad_norm": 0.26444889751740525, + "learning_rate": 0.00012527910066365108, + "loss": 1.1251, + "step": 9359 + }, + { + "epoch": 0.9, + "grad_norm": 0.24760688189734706, + "learning_rate": 0.00012526379465942179, + "loss": 1.0365, + "step": 9360 + }, + { + "epoch": 0.9, + "grad_norm": 0.2952587878180295, + "learning_rate": 0.00012524848802295018, + "loss": 1.1582, + "step": 9361 + }, + { + "epoch": 0.9, + "grad_norm": 0.27540696444772433, + "learning_rate": 0.0001252331807546193, + "loss": 1.1061, + "step": 9362 + }, + { + "epoch": 0.9, + "grad_norm": 0.27595222755388815, + "learning_rate": 0.00012521787285481222, + "loss": 1.0053, + "step": 9363 + }, + { + "epoch": 0.9, + "grad_norm": 0.3078665527994498, + "learning_rate": 0.00012520256432391197, + "loss": 1.0609, + "step": 9364 + }, + { + "epoch": 0.9, + "grad_norm": 0.30912162354934847, + "learning_rate": 0.00012518725516230176, + "loss": 1.092, + "step": 9365 + }, + { + "epoch": 0.9, + "grad_norm": 0.24850758690847607, + "learning_rate": 0.00012517194537036463, + "loss": 0.9636, + "step": 9366 + }, + { + "epoch": 0.9, + "grad_norm": 0.25462201249484734, + "learning_rate": 0.00012515663494848378, + "loss": 1.0612, + "step": 9367 + }, + { + "epoch": 0.9, + "grad_norm": 0.30057529749822187, + "learning_rate": 0.0001251413238970423, + "loss": 1.1008, + "step": 9368 + }, + { + "epoch": 0.9, + "grad_norm": 0.28606400293322526, + "learning_rate": 0.00012512601221642338, + "loss": 0.9646, + "step": 9369 + }, + { + "epoch": 0.9, + "grad_norm": 0.2718761895079578, + "learning_rate": 0.00012511069990701022, + "loss": 1.1127, + "step": 9370 + }, + { + "epoch": 0.9, + "grad_norm": 0.26749842342835667, + "learning_rate": 0.00012509538696918606, + "loss": 1.0016, + "step": 9371 + }, + { + "epoch": 0.9, + "grad_norm": 0.24422845567032508, + "learning_rate": 0.00012508007340333402, + "loss": 1.0075, + "step": 9372 + }, + { + "epoch": 0.9, + "grad_norm": 0.26485906439128654, + "learning_rate": 0.00012506475920983742, + "loss": 1.0334, + "step": 9373 + }, + { + "epoch": 0.9, + "grad_norm": 0.23407388212556063, + "learning_rate": 0.00012504944438907945, + "loss": 0.9974, + "step": 9374 + }, + { + "epoch": 0.9, + "grad_norm": 0.2948257650153299, + "learning_rate": 0.00012503412894144337, + "loss": 1.0004, + "step": 9375 + }, + { + "epoch": 0.9, + "grad_norm": 0.2624520239691215, + "learning_rate": 0.0001250188128673125, + "loss": 1.113, + "step": 9376 + }, + { + "epoch": 0.9, + "grad_norm": 0.28609464800481466, + "learning_rate": 0.00012500349616707013, + "loss": 0.9897, + "step": 9377 + }, + { + "epoch": 0.9, + "grad_norm": 0.2647060271102632, + "learning_rate": 0.0001249881788410995, + "loss": 1.1142, + "step": 9378 + }, + { + "epoch": 0.9, + "grad_norm": 0.3319227336941671, + "learning_rate": 0.00012497286088978407, + "loss": 1.1371, + "step": 9379 + }, + { + "epoch": 0.9, + "grad_norm": 0.290890609063919, + "learning_rate": 0.00012495754231350704, + "loss": 0.9454, + "step": 9380 + }, + { + "epoch": 0.9, + "grad_norm": 0.275011083994787, + "learning_rate": 0.00012494222311265185, + "loss": 1.0498, + "step": 9381 + }, + { + "epoch": 0.9, + "grad_norm": 0.29328964264372026, + "learning_rate": 0.00012492690328760184, + "loss": 1.2401, + "step": 9382 + }, + { + "epoch": 0.9, + "grad_norm": 0.2849134201992309, + "learning_rate": 0.00012491158283874042, + "loss": 1.0384, + "step": 9383 + }, + { + "epoch": 0.9, + "grad_norm": 0.2644889309340255, + "learning_rate": 0.00012489626176645098, + "loss": 1.0729, + "step": 9384 + }, + { + "epoch": 0.9, + "grad_norm": 0.2671846138345653, + "learning_rate": 0.00012488094007111694, + "loss": 1.0643, + "step": 9385 + }, + { + "epoch": 0.9, + "grad_norm": 0.28342755899615574, + "learning_rate": 0.00012486561775312176, + "loss": 1.0652, + "step": 9386 + }, + { + "epoch": 0.9, + "grad_norm": 0.2708718366533827, + "learning_rate": 0.00012485029481284883, + "loss": 1.0131, + "step": 9387 + }, + { + "epoch": 0.9, + "grad_norm": 0.2893275903976366, + "learning_rate": 0.00012483497125068168, + "loss": 1.1254, + "step": 9388 + }, + { + "epoch": 0.9, + "grad_norm": 0.29651203987564034, + "learning_rate": 0.00012481964706700374, + "loss": 1.0964, + "step": 9389 + }, + { + "epoch": 0.9, + "grad_norm": 0.2668243860412548, + "learning_rate": 0.00012480432226219857, + "loss": 1.0646, + "step": 9390 + }, + { + "epoch": 0.9, + "grad_norm": 0.2607022670832829, + "learning_rate": 0.0001247889968366496, + "loss": 1.094, + "step": 9391 + }, + { + "epoch": 0.9, + "grad_norm": 0.3032946090686901, + "learning_rate": 0.00012477367079074045, + "loss": 1.0649, + "step": 9392 + }, + { + "epoch": 0.9, + "grad_norm": 0.31332018726986793, + "learning_rate": 0.0001247583441248546, + "loss": 1.0273, + "step": 9393 + }, + { + "epoch": 0.9, + "grad_norm": 0.25149055452189184, + "learning_rate": 0.00012474301683937562, + "loss": 1.0565, + "step": 9394 + }, + { + "epoch": 0.9, + "grad_norm": 0.31857711281294315, + "learning_rate": 0.00012472768893468712, + "loss": 1.1011, + "step": 9395 + }, + { + "epoch": 0.9, + "grad_norm": 0.27584446547396924, + "learning_rate": 0.00012471236041117263, + "loss": 1.1056, + "step": 9396 + }, + { + "epoch": 0.9, + "grad_norm": 0.2710593279658963, + "learning_rate": 0.00012469703126921582, + "loss": 1.0088, + "step": 9397 + }, + { + "epoch": 0.9, + "grad_norm": 0.2924314964891264, + "learning_rate": 0.00012468170150920028, + "loss": 1.0443, + "step": 9398 + }, + { + "epoch": 0.9, + "grad_norm": 0.3100262243859713, + "learning_rate": 0.00012466637113150964, + "loss": 1.068, + "step": 9399 + }, + { + "epoch": 0.9, + "grad_norm": 0.3203008665877059, + "learning_rate": 0.00012465104013652755, + "loss": 1.0662, + "step": 9400 + }, + { + "epoch": 0.9, + "grad_norm": 0.28159144815769815, + "learning_rate": 0.00012463570852463767, + "loss": 1.0416, + "step": 9401 + }, + { + "epoch": 0.9, + "grad_norm": 0.2670293148269375, + "learning_rate": 0.00012462037629622374, + "loss": 1.0158, + "step": 9402 + }, + { + "epoch": 0.9, + "grad_norm": 0.28251780180802666, + "learning_rate": 0.00012460504345166942, + "loss": 1.0844, + "step": 9403 + }, + { + "epoch": 0.9, + "grad_norm": 0.2762339006289285, + "learning_rate": 0.00012458970999135839, + "loss": 1.0557, + "step": 9404 + }, + { + "epoch": 0.9, + "grad_norm": 0.26743346835091364, + "learning_rate": 0.00012457437591567442, + "loss": 1.0952, + "step": 9405 + }, + { + "epoch": 0.9, + "grad_norm": 0.28037210231320464, + "learning_rate": 0.00012455904122500128, + "loss": 0.9932, + "step": 9406 + }, + { + "epoch": 0.9, + "grad_norm": 0.2629358216186376, + "learning_rate": 0.00012454370591972268, + "loss": 0.9753, + "step": 9407 + }, + { + "epoch": 0.9, + "grad_norm": 0.2749296239864377, + "learning_rate": 0.0001245283700002224, + "loss": 1.0294, + "step": 9408 + }, + { + "epoch": 0.9, + "grad_norm": 0.3005194418484203, + "learning_rate": 0.00012451303346688424, + "loss": 0.9774, + "step": 9409 + }, + { + "epoch": 0.9, + "grad_norm": 0.2994528395818742, + "learning_rate": 0.00012449769632009205, + "loss": 1.0247, + "step": 9410 + }, + { + "epoch": 0.9, + "grad_norm": 0.2491825962382407, + "learning_rate": 0.00012448235856022958, + "loss": 1.0416, + "step": 9411 + }, + { + "epoch": 0.9, + "grad_norm": 0.24946941218248858, + "learning_rate": 0.0001244670201876807, + "loss": 1.0701, + "step": 9412 + }, + { + "epoch": 0.9, + "grad_norm": 0.30337869321931366, + "learning_rate": 0.0001244516812028293, + "loss": 1.1049, + "step": 9413 + }, + { + "epoch": 0.9, + "grad_norm": 0.30488539436890616, + "learning_rate": 0.00012443634160605918, + "loss": 1.0473, + "step": 9414 + }, + { + "epoch": 0.9, + "grad_norm": 0.307492284380008, + "learning_rate": 0.00012442100139775425, + "loss": 1.0344, + "step": 9415 + }, + { + "epoch": 0.9, + "grad_norm": 0.3091356412029321, + "learning_rate": 0.00012440566057829843, + "loss": 1.0037, + "step": 9416 + }, + { + "epoch": 0.9, + "grad_norm": 0.27659203912070673, + "learning_rate": 0.0001243903191480756, + "loss": 0.9788, + "step": 9417 + }, + { + "epoch": 0.9, + "grad_norm": 0.295213821134966, + "learning_rate": 0.00012437497710746974, + "loss": 1.0793, + "step": 9418 + }, + { + "epoch": 0.9, + "grad_norm": 0.2921351405650045, + "learning_rate": 0.00012435963445686472, + "loss": 1.0692, + "step": 9419 + }, + { + "epoch": 0.9, + "grad_norm": 0.32103907136316706, + "learning_rate": 0.00012434429119664457, + "loss": 1.1415, + "step": 9420 + }, + { + "epoch": 0.9, + "grad_norm": 0.30821488009357795, + "learning_rate": 0.0001243289473271932, + "loss": 1.1574, + "step": 9421 + }, + { + "epoch": 0.9, + "grad_norm": 0.3020517741494569, + "learning_rate": 0.00012431360284889464, + "loss": 1.1252, + "step": 9422 + }, + { + "epoch": 0.9, + "grad_norm": 0.30111749627739215, + "learning_rate": 0.0001242982577621329, + "loss": 1.0815, + "step": 9423 + }, + { + "epoch": 0.9, + "grad_norm": 0.2654478388360173, + "learning_rate": 0.000124282912067292, + "loss": 1.0287, + "step": 9424 + }, + { + "epoch": 0.9, + "grad_norm": 0.2845065279316126, + "learning_rate": 0.00012426756576475593, + "loss": 1.0703, + "step": 9425 + }, + { + "epoch": 0.9, + "grad_norm": 0.24604429019373228, + "learning_rate": 0.00012425221885490882, + "loss": 0.982, + "step": 9426 + }, + { + "epoch": 0.9, + "grad_norm": 0.3204878522592504, + "learning_rate": 0.00012423687133813466, + "loss": 0.9874, + "step": 9427 + }, + { + "epoch": 0.9, + "grad_norm": 0.2698848880583595, + "learning_rate": 0.00012422152321481754, + "loss": 1.1232, + "step": 9428 + }, + { + "epoch": 0.9, + "grad_norm": 0.2956334280898767, + "learning_rate": 0.00012420617448534162, + "loss": 1.1245, + "step": 9429 + }, + { + "epoch": 0.9, + "grad_norm": 0.28215348025828507, + "learning_rate": 0.00012419082515009093, + "loss": 1.0598, + "step": 9430 + }, + { + "epoch": 0.9, + "grad_norm": 0.29800328339448867, + "learning_rate": 0.00012417547520944967, + "loss": 0.9549, + "step": 9431 + }, + { + "epoch": 0.9, + "grad_norm": 0.2758307830968502, + "learning_rate": 0.00012416012466380194, + "loss": 1.0299, + "step": 9432 + }, + { + "epoch": 0.9, + "grad_norm": 0.27789615338120255, + "learning_rate": 0.00012414477351353192, + "loss": 0.955, + "step": 9433 + }, + { + "epoch": 0.9, + "grad_norm": 0.28216151632355363, + "learning_rate": 0.00012412942175902376, + "loss": 1.0228, + "step": 9434 + }, + { + "epoch": 0.9, + "grad_norm": 0.3646066899027749, + "learning_rate": 0.00012411406940066163, + "loss": 1.2245, + "step": 9435 + }, + { + "epoch": 0.9, + "grad_norm": 0.28097256754194205, + "learning_rate": 0.0001240987164388298, + "loss": 1.0332, + "step": 9436 + }, + { + "epoch": 0.9, + "grad_norm": 0.25589371187045934, + "learning_rate": 0.00012408336287391243, + "loss": 1.0571, + "step": 9437 + }, + { + "epoch": 0.9, + "grad_norm": 0.2924108280699943, + "learning_rate": 0.00012406800870629373, + "loss": 1.0727, + "step": 9438 + }, + { + "epoch": 0.9, + "grad_norm": 0.26309668552925775, + "learning_rate": 0.00012405265393635804, + "loss": 0.9953, + "step": 9439 + }, + { + "epoch": 0.9, + "grad_norm": 0.3055036695706396, + "learning_rate": 0.00012403729856448956, + "loss": 1.1705, + "step": 9440 + }, + { + "epoch": 0.9, + "grad_norm": 0.26404553824001786, + "learning_rate": 0.00012402194259107256, + "loss": 0.8035, + "step": 9441 + }, + { + "epoch": 0.9, + "grad_norm": 0.263456979982817, + "learning_rate": 0.00012400658601649135, + "loss": 1.0515, + "step": 9442 + }, + { + "epoch": 0.9, + "grad_norm": 0.2697295571066528, + "learning_rate": 0.00012399122884113024, + "loss": 1.0783, + "step": 9443 + }, + { + "epoch": 0.9, + "grad_norm": 0.3032996814397787, + "learning_rate": 0.00012397587106537355, + "loss": 1.107, + "step": 9444 + }, + { + "epoch": 0.9, + "grad_norm": 0.30209747282548055, + "learning_rate": 0.0001239605126896056, + "loss": 1.0748, + "step": 9445 + }, + { + "epoch": 0.9, + "grad_norm": 0.2541798522215149, + "learning_rate": 0.0001239451537142108, + "loss": 0.9973, + "step": 9446 + }, + { + "epoch": 0.9, + "grad_norm": 0.26055885474176194, + "learning_rate": 0.0001239297941395735, + "loss": 0.9793, + "step": 9447 + }, + { + "epoch": 0.9, + "grad_norm": 0.2937502415391966, + "learning_rate": 0.00012391443396607798, + "loss": 1.0501, + "step": 9448 + }, + { + "epoch": 0.9, + "grad_norm": 0.2484096340317221, + "learning_rate": 0.00012389907319410877, + "loss": 0.9737, + "step": 9449 + }, + { + "epoch": 0.9, + "grad_norm": 0.34708042996433486, + "learning_rate": 0.00012388371182405023, + "loss": 1.0665, + "step": 9450 + }, + { + "epoch": 0.9, + "grad_norm": 0.2870344403909073, + "learning_rate": 0.0001238683498562868, + "loss": 1.0346, + "step": 9451 + }, + { + "epoch": 0.9, + "grad_norm": 0.32368230775180656, + "learning_rate": 0.00012385298729120287, + "loss": 0.9915, + "step": 9452 + }, + { + "epoch": 0.9, + "grad_norm": 0.2902156260509134, + "learning_rate": 0.00012383762412918297, + "loss": 1.0838, + "step": 9453 + }, + { + "epoch": 0.9, + "grad_norm": 0.26483538433540343, + "learning_rate": 0.00012382226037061157, + "loss": 1.0152, + "step": 9454 + }, + { + "epoch": 0.9, + "grad_norm": 0.31506688541873007, + "learning_rate": 0.0001238068960158731, + "loss": 1.0498, + "step": 9455 + }, + { + "epoch": 0.9, + "grad_norm": 0.31606368328175294, + "learning_rate": 0.00012379153106535212, + "loss": 1.0568, + "step": 9456 + }, + { + "epoch": 0.9, + "grad_norm": 0.26749250980306544, + "learning_rate": 0.00012377616551943312, + "loss": 1.1441, + "step": 9457 + }, + { + "epoch": 0.9, + "grad_norm": 0.2805411852480236, + "learning_rate": 0.0001237607993785006, + "loss": 1.0273, + "step": 9458 + }, + { + "epoch": 0.9, + "grad_norm": 0.29590637384614576, + "learning_rate": 0.0001237454326429392, + "loss": 1.2318, + "step": 9459 + }, + { + "epoch": 0.91, + "grad_norm": 0.277157897803464, + "learning_rate": 0.00012373006531313338, + "loss": 1.0606, + "step": 9460 + }, + { + "epoch": 0.91, + "grad_norm": 0.2820422335962146, + "learning_rate": 0.0001237146973894678, + "loss": 1.0724, + "step": 9461 + }, + { + "epoch": 0.91, + "grad_norm": 0.28970884694413745, + "learning_rate": 0.00012369932887232695, + "loss": 1.0501, + "step": 9462 + }, + { + "epoch": 0.91, + "grad_norm": 0.26821460214493453, + "learning_rate": 0.00012368395976209554, + "loss": 1.1002, + "step": 9463 + }, + { + "epoch": 0.91, + "grad_norm": 0.3020120162788039, + "learning_rate": 0.00012366859005915817, + "loss": 1.2129, + "step": 9464 + }, + { + "epoch": 0.91, + "grad_norm": 0.25290616792649784, + "learning_rate": 0.00012365321976389942, + "loss": 0.9608, + "step": 9465 + }, + { + "epoch": 0.91, + "grad_norm": 0.3118979180099709, + "learning_rate": 0.000123637848876704, + "loss": 1.0709, + "step": 9466 + }, + { + "epoch": 0.91, + "grad_norm": 0.31154007441669707, + "learning_rate": 0.00012362247739795658, + "loss": 1.0235, + "step": 9467 + }, + { + "epoch": 0.91, + "grad_norm": 0.28995095468089827, + "learning_rate": 0.00012360710532804178, + "loss": 1.201, + "step": 9468 + }, + { + "epoch": 0.91, + "grad_norm": 0.3110913941186855, + "learning_rate": 0.00012359173266734435, + "loss": 1.1567, + "step": 9469 + }, + { + "epoch": 0.91, + "grad_norm": 0.27514587325982237, + "learning_rate": 0.00012357635941624898, + "loss": 1.0559, + "step": 9470 + }, + { + "epoch": 0.91, + "grad_norm": 0.30517981996469423, + "learning_rate": 0.00012356098557514037, + "loss": 1.0068, + "step": 9471 + }, + { + "epoch": 0.91, + "grad_norm": 0.2725388181887218, + "learning_rate": 0.00012354561114440334, + "loss": 0.9899, + "step": 9472 + }, + { + "epoch": 0.91, + "grad_norm": 0.297899718417481, + "learning_rate": 0.00012353023612442254, + "loss": 1.0983, + "step": 9473 + }, + { + "epoch": 0.91, + "grad_norm": 0.2848396496021322, + "learning_rate": 0.00012351486051558283, + "loss": 1.0539, + "step": 9474 + }, + { + "epoch": 0.91, + "grad_norm": 0.2852186312128984, + "learning_rate": 0.00012349948431826895, + "loss": 1.0044, + "step": 9475 + }, + { + "epoch": 0.91, + "grad_norm": 0.29012020789960563, + "learning_rate": 0.0001234841075328657, + "loss": 0.9673, + "step": 9476 + }, + { + "epoch": 0.91, + "grad_norm": 0.24672860443444639, + "learning_rate": 0.0001234687301597579, + "loss": 1.0161, + "step": 9477 + }, + { + "epoch": 0.91, + "grad_norm": 0.2726877358325203, + "learning_rate": 0.0001234533521993304, + "loss": 1.0621, + "step": 9478 + }, + { + "epoch": 0.91, + "grad_norm": 0.28948332424230283, + "learning_rate": 0.00012343797365196797, + "loss": 0.9913, + "step": 9479 + }, + { + "epoch": 0.91, + "grad_norm": 0.26969250387705407, + "learning_rate": 0.00012342259451805557, + "loss": 1.1202, + "step": 9480 + }, + { + "epoch": 0.91, + "grad_norm": 0.3258916039185605, + "learning_rate": 0.000123407214797978, + "loss": 1.0168, + "step": 9481 + }, + { + "epoch": 0.91, + "grad_norm": 0.25973605245424924, + "learning_rate": 0.00012339183449212017, + "loss": 1.0874, + "step": 9482 + }, + { + "epoch": 0.91, + "grad_norm": 0.2676443845035316, + "learning_rate": 0.00012337645360086698, + "loss": 0.9667, + "step": 9483 + }, + { + "epoch": 0.91, + "grad_norm": 0.30203903279304206, + "learning_rate": 0.00012336107212460338, + "loss": 1.0537, + "step": 9484 + }, + { + "epoch": 0.91, + "grad_norm": 0.27476191579196646, + "learning_rate": 0.00012334569006371422, + "loss": 1.0224, + "step": 9485 + }, + { + "epoch": 0.91, + "grad_norm": 0.26328678406410266, + "learning_rate": 0.0001233303074185845, + "loss": 1.1169, + "step": 9486 + }, + { + "epoch": 0.91, + "grad_norm": 0.29307038009665376, + "learning_rate": 0.0001233149241895992, + "loss": 1.0547, + "step": 9487 + }, + { + "epoch": 0.91, + "grad_norm": 0.3029822127610837, + "learning_rate": 0.00012329954037714326, + "loss": 1.0455, + "step": 9488 + }, + { + "epoch": 0.91, + "grad_norm": 0.31316815984313306, + "learning_rate": 0.00012328415598160167, + "loss": 1.0461, + "step": 9489 + }, + { + "epoch": 0.91, + "grad_norm": 0.26416179108140025, + "learning_rate": 0.00012326877100335946, + "loss": 0.9973, + "step": 9490 + }, + { + "epoch": 0.91, + "grad_norm": 0.27917647838809806, + "learning_rate": 0.0001232533854428016, + "loss": 1.1014, + "step": 9491 + }, + { + "epoch": 0.91, + "grad_norm": 0.2880924083809222, + "learning_rate": 0.00012323799930031318, + "loss": 1.1198, + "step": 9492 + }, + { + "epoch": 0.91, + "grad_norm": 0.2738883771003005, + "learning_rate": 0.0001232226125762792, + "loss": 0.9753, + "step": 9493 + }, + { + "epoch": 0.91, + "grad_norm": 0.28505026809880435, + "learning_rate": 0.00012320722527108476, + "loss": 1.0131, + "step": 9494 + }, + { + "epoch": 0.91, + "grad_norm": 0.2700388433926024, + "learning_rate": 0.00012319183738511495, + "loss": 1.0627, + "step": 9495 + }, + { + "epoch": 0.91, + "grad_norm": 0.21724216250627967, + "learning_rate": 0.0001231764489187548, + "loss": 0.969, + "step": 9496 + }, + { + "epoch": 0.91, + "grad_norm": 0.30567642004976847, + "learning_rate": 0.00012316105987238946, + "loss": 1.1282, + "step": 9497 + }, + { + "epoch": 0.91, + "grad_norm": 0.2960952448251748, + "learning_rate": 0.00012314567024640405, + "loss": 1.0719, + "step": 9498 + }, + { + "epoch": 0.91, + "grad_norm": 0.2679330934652599, + "learning_rate": 0.00012313028004118368, + "loss": 1.0671, + "step": 9499 + }, + { + "epoch": 0.91, + "grad_norm": 0.30977923520733813, + "learning_rate": 0.00012311488925711352, + "loss": 1.0649, + "step": 9500 + }, + { + "epoch": 0.91, + "grad_norm": 0.2877059133094227, + "learning_rate": 0.00012309949789457872, + "loss": 0.99, + "step": 9501 + }, + { + "epoch": 0.91, + "grad_norm": 0.2747597630168646, + "learning_rate": 0.0001230841059539645, + "loss": 1.0708, + "step": 9502 + }, + { + "epoch": 0.91, + "grad_norm": 0.2849182513539573, + "learning_rate": 0.00012306871343565598, + "loss": 1.1457, + "step": 9503 + }, + { + "epoch": 0.91, + "grad_norm": 0.2816422560556322, + "learning_rate": 0.00012305332034003843, + "loss": 1.0653, + "step": 9504 + }, + { + "epoch": 0.91, + "grad_norm": 0.2449196404919519, + "learning_rate": 0.00012303792666749704, + "loss": 1.0025, + "step": 9505 + }, + { + "epoch": 0.91, + "grad_norm": 0.28277588561340145, + "learning_rate": 0.00012302253241841705, + "loss": 1.0981, + "step": 9506 + }, + { + "epoch": 0.91, + "grad_norm": 0.28881985968739604, + "learning_rate": 0.00012300713759318374, + "loss": 0.8564, + "step": 9507 + }, + { + "epoch": 0.91, + "grad_norm": 0.2658935819955603, + "learning_rate": 0.00012299174219218236, + "loss": 0.9685, + "step": 9508 + }, + { + "epoch": 0.91, + "grad_norm": 0.33536541922122687, + "learning_rate": 0.00012297634621579815, + "loss": 1.0644, + "step": 9509 + }, + { + "epoch": 0.91, + "grad_norm": 0.2883666884244152, + "learning_rate": 0.00012296094966441644, + "loss": 1.0163, + "step": 9510 + }, + { + "epoch": 0.91, + "grad_norm": 0.2874022639714324, + "learning_rate": 0.00012294555253842258, + "loss": 1.0547, + "step": 9511 + }, + { + "epoch": 0.91, + "grad_norm": 0.28615282013866633, + "learning_rate": 0.0001229301548382018, + "loss": 1.0267, + "step": 9512 + }, + { + "epoch": 0.91, + "grad_norm": 0.31559288068892855, + "learning_rate": 0.0001229147565641395, + "loss": 1.0427, + "step": 9513 + }, + { + "epoch": 0.91, + "grad_norm": 0.29067613727477026, + "learning_rate": 0.000122899357716621, + "loss": 1.0771, + "step": 9514 + }, + { + "epoch": 0.91, + "grad_norm": 0.3091379944841265, + "learning_rate": 0.00012288395829603168, + "loss": 1.1228, + "step": 9515 + }, + { + "epoch": 0.91, + "grad_norm": 0.34288634284416575, + "learning_rate": 0.0001228685583027569, + "loss": 1.07, + "step": 9516 + }, + { + "epoch": 0.91, + "grad_norm": 0.25806670007511917, + "learning_rate": 0.00012285315773718215, + "loss": 1.177, + "step": 9517 + }, + { + "epoch": 0.91, + "grad_norm": 0.28617035042389005, + "learning_rate": 0.00012283775659969272, + "loss": 0.9902, + "step": 9518 + }, + { + "epoch": 0.91, + "grad_norm": 0.2699590057863018, + "learning_rate": 0.00012282235489067406, + "loss": 1.0114, + "step": 9519 + }, + { + "epoch": 0.91, + "grad_norm": 0.2668076542427271, + "learning_rate": 0.00012280695261051168, + "loss": 1.0479, + "step": 9520 + }, + { + "epoch": 0.91, + "grad_norm": 0.27069952546890097, + "learning_rate": 0.00012279154975959093, + "loss": 0.9784, + "step": 9521 + }, + { + "epoch": 0.91, + "grad_norm": 0.26601560597027246, + "learning_rate": 0.00012277614633829736, + "loss": 0.9275, + "step": 9522 + }, + { + "epoch": 0.91, + "grad_norm": 0.34915490477463507, + "learning_rate": 0.00012276074234701637, + "loss": 1.0747, + "step": 9523 + }, + { + "epoch": 0.91, + "grad_norm": 0.2963938117016832, + "learning_rate": 0.00012274533778613354, + "loss": 0.968, + "step": 9524 + }, + { + "epoch": 0.91, + "grad_norm": 0.28627744365026025, + "learning_rate": 0.00012272993265603432, + "loss": 1.0285, + "step": 9525 + }, + { + "epoch": 0.91, + "grad_norm": 0.2543047213053924, + "learning_rate": 0.00012271452695710423, + "loss": 0.9785, + "step": 9526 + }, + { + "epoch": 0.91, + "grad_norm": 0.30369638655565206, + "learning_rate": 0.00012269912068972887, + "loss": 1.0088, + "step": 9527 + }, + { + "epoch": 0.91, + "grad_norm": 0.30048051465973585, + "learning_rate": 0.00012268371385429371, + "loss": 1.0032, + "step": 9528 + }, + { + "epoch": 0.91, + "grad_norm": 0.24363694434494604, + "learning_rate": 0.0001226683064511844, + "loss": 1.083, + "step": 9529 + }, + { + "epoch": 0.91, + "grad_norm": 0.27300145834174794, + "learning_rate": 0.0001226528984807864, + "loss": 1.056, + "step": 9530 + }, + { + "epoch": 0.91, + "grad_norm": 0.2840057926179831, + "learning_rate": 0.00012263748994348543, + "loss": 1.0906, + "step": 9531 + }, + { + "epoch": 0.91, + "grad_norm": 0.2867767195608796, + "learning_rate": 0.00012262208083966707, + "loss": 1.0343, + "step": 9532 + }, + { + "epoch": 0.91, + "grad_norm": 0.30097203291437813, + "learning_rate": 0.00012260667116971687, + "loss": 1.1535, + "step": 9533 + }, + { + "epoch": 0.91, + "grad_norm": 0.26057797605093236, + "learning_rate": 0.0001225912609340205, + "loss": 0.9956, + "step": 9534 + }, + { + "epoch": 0.91, + "grad_norm": 0.2722485625219177, + "learning_rate": 0.00012257585013296368, + "loss": 0.9609, + "step": 9535 + }, + { + "epoch": 0.91, + "grad_norm": 0.31674933662782234, + "learning_rate": 0.00012256043876693199, + "loss": 1.1484, + "step": 9536 + }, + { + "epoch": 0.91, + "grad_norm": 0.24732657574619976, + "learning_rate": 0.00012254502683631114, + "loss": 1.11, + "step": 9537 + }, + { + "epoch": 0.91, + "grad_norm": 0.302444859564973, + "learning_rate": 0.00012252961434148685, + "loss": 1.1601, + "step": 9538 + }, + { + "epoch": 0.91, + "grad_norm": 0.3039631031521198, + "learning_rate": 0.0001225142012828448, + "loss": 1.1319, + "step": 9539 + }, + { + "epoch": 0.91, + "grad_norm": 0.2833615843410249, + "learning_rate": 0.0001224987876607707, + "loss": 1.1211, + "step": 9540 + }, + { + "epoch": 0.91, + "grad_norm": 0.24087751384697798, + "learning_rate": 0.0001224833734756503, + "loss": 0.9828, + "step": 9541 + }, + { + "epoch": 0.91, + "grad_norm": 0.3022341870078711, + "learning_rate": 0.00012246795872786938, + "loss": 1.0778, + "step": 9542 + }, + { + "epoch": 0.91, + "grad_norm": 0.3125008887305215, + "learning_rate": 0.00012245254341781362, + "loss": 0.9715, + "step": 9543 + }, + { + "epoch": 0.91, + "grad_norm": 0.26961102066302006, + "learning_rate": 0.0001224371275458689, + "loss": 1.0219, + "step": 9544 + }, + { + "epoch": 0.91, + "grad_norm": 0.30552221121015993, + "learning_rate": 0.00012242171111242093, + "loss": 0.9941, + "step": 9545 + }, + { + "epoch": 0.91, + "grad_norm": 0.30365825120398526, + "learning_rate": 0.00012240629411785557, + "loss": 0.9531, + "step": 9546 + }, + { + "epoch": 0.91, + "grad_norm": 0.27544518032809634, + "learning_rate": 0.0001223908765625586, + "loss": 1.0634, + "step": 9547 + }, + { + "epoch": 0.91, + "grad_norm": 0.2469345335356712, + "learning_rate": 0.00012237545844691585, + "loss": 1.0724, + "step": 9548 + }, + { + "epoch": 0.91, + "grad_norm": 0.2956105861133632, + "learning_rate": 0.00012236003977131324, + "loss": 1.1523, + "step": 9549 + }, + { + "epoch": 0.91, + "grad_norm": 0.2968897322282517, + "learning_rate": 0.00012234462053613653, + "loss": 1.07, + "step": 9550 + }, + { + "epoch": 0.91, + "grad_norm": 0.26572173065636373, + "learning_rate": 0.0001223292007417717, + "loss": 1.0555, + "step": 9551 + }, + { + "epoch": 0.91, + "grad_norm": 0.24837107366849048, + "learning_rate": 0.00012231378038860455, + "loss": 1.1674, + "step": 9552 + }, + { + "epoch": 0.91, + "grad_norm": 0.27292429243707583, + "learning_rate": 0.00012229835947702103, + "loss": 1.1317, + "step": 9553 + }, + { + "epoch": 0.91, + "grad_norm": 0.31026913454307414, + "learning_rate": 0.00012228293800740705, + "loss": 1.1446, + "step": 9554 + }, + { + "epoch": 0.91, + "grad_norm": 0.3072735487423723, + "learning_rate": 0.00012226751598014854, + "loss": 1.1987, + "step": 9555 + }, + { + "epoch": 0.91, + "grad_norm": 0.24847902065517485, + "learning_rate": 0.00012225209339563145, + "loss": 1.1041, + "step": 9556 + }, + { + "epoch": 0.91, + "grad_norm": 0.2924020829498602, + "learning_rate": 0.00012223667025424172, + "loss": 1.0721, + "step": 9557 + }, + { + "epoch": 0.91, + "grad_norm": 0.2613223633837016, + "learning_rate": 0.00012222124655636538, + "loss": 1.1266, + "step": 9558 + }, + { + "epoch": 0.91, + "grad_norm": 0.2894905328595956, + "learning_rate": 0.00012220582230238839, + "loss": 1.0732, + "step": 9559 + }, + { + "epoch": 0.91, + "grad_norm": 0.28681410216353886, + "learning_rate": 0.00012219039749269668, + "loss": 1.0915, + "step": 9560 + }, + { + "epoch": 0.91, + "grad_norm": 0.2943586385473255, + "learning_rate": 0.00012217497212767636, + "loss": 1.0788, + "step": 9561 + }, + { + "epoch": 0.91, + "grad_norm": 0.285929389887662, + "learning_rate": 0.00012215954620771344, + "loss": 1.1005, + "step": 9562 + }, + { + "epoch": 0.91, + "grad_norm": 0.3149078861825459, + "learning_rate": 0.00012214411973319396, + "loss": 1.0933, + "step": 9563 + }, + { + "epoch": 0.91, + "grad_norm": 0.24280112314190866, + "learning_rate": 0.0001221286927045039, + "loss": 0.954, + "step": 9564 + }, + { + "epoch": 0.92, + "grad_norm": 0.24225580439211833, + "learning_rate": 0.00012211326512202945, + "loss": 0.9918, + "step": 9565 + }, + { + "epoch": 0.92, + "grad_norm": 0.3066920417822219, + "learning_rate": 0.00012209783698615665, + "loss": 1.0828, + "step": 9566 + }, + { + "epoch": 0.92, + "grad_norm": 0.3112335184826233, + "learning_rate": 0.00012208240829727156, + "loss": 1.1426, + "step": 9567 + }, + { + "epoch": 0.92, + "grad_norm": 0.263752238595472, + "learning_rate": 0.00012206697905576034, + "loss": 1.0959, + "step": 9568 + }, + { + "epoch": 0.92, + "grad_norm": 0.3013509779853578, + "learning_rate": 0.0001220515492620091, + "loss": 1.0237, + "step": 9569 + }, + { + "epoch": 0.92, + "grad_norm": 0.29198852460496516, + "learning_rate": 0.00012203611891640398, + "loss": 1.1627, + "step": 9570 + }, + { + "epoch": 0.92, + "grad_norm": 0.26850931222903324, + "learning_rate": 0.00012202068801933112, + "loss": 1.1425, + "step": 9571 + }, + { + "epoch": 0.92, + "grad_norm": 0.30251674022231434, + "learning_rate": 0.00012200525657117673, + "loss": 1.1273, + "step": 9572 + }, + { + "epoch": 0.92, + "grad_norm": 0.3018015700917101, + "learning_rate": 0.00012198982457232698, + "loss": 0.9809, + "step": 9573 + }, + { + "epoch": 0.92, + "grad_norm": 0.273598131245597, + "learning_rate": 0.000121974392023168, + "loss": 1.0571, + "step": 9574 + }, + { + "epoch": 0.92, + "grad_norm": 0.30277174215701025, + "learning_rate": 0.00012195895892408609, + "loss": 0.9031, + "step": 9575 + }, + { + "epoch": 0.92, + "grad_norm": 0.28490196205477425, + "learning_rate": 0.00012194352527546739, + "loss": 1.0131, + "step": 9576 + }, + { + "epoch": 0.92, + "grad_norm": 0.2715100350623435, + "learning_rate": 0.0001219280910776982, + "loss": 1.0771, + "step": 9577 + }, + { + "epoch": 0.92, + "grad_norm": 0.2948896697832583, + "learning_rate": 0.00012191265633116473, + "loss": 1.0838, + "step": 9578 + }, + { + "epoch": 0.92, + "grad_norm": 0.2570476008819864, + "learning_rate": 0.00012189722103625332, + "loss": 1.0897, + "step": 9579 + }, + { + "epoch": 0.92, + "grad_norm": 0.2923182653513327, + "learning_rate": 0.00012188178519335014, + "loss": 1.063, + "step": 9580 + }, + { + "epoch": 0.92, + "grad_norm": 0.24843565354969446, + "learning_rate": 0.00012186634880284155, + "loss": 1.0832, + "step": 9581 + }, + { + "epoch": 0.92, + "grad_norm": 0.30405197780168924, + "learning_rate": 0.00012185091186511383, + "loss": 1.0964, + "step": 9582 + }, + { + "epoch": 0.92, + "grad_norm": 0.28552359904155467, + "learning_rate": 0.00012183547438055334, + "loss": 1.0048, + "step": 9583 + }, + { + "epoch": 0.92, + "grad_norm": 0.25623311580886565, + "learning_rate": 0.00012182003634954635, + "loss": 1.1127, + "step": 9584 + }, + { + "epoch": 0.92, + "grad_norm": 0.28043711484669764, + "learning_rate": 0.00012180459777247924, + "loss": 0.9836, + "step": 9585 + }, + { + "epoch": 0.92, + "grad_norm": 0.29306608514635557, + "learning_rate": 0.00012178915864973839, + "loss": 1.0981, + "step": 9586 + }, + { + "epoch": 0.92, + "grad_norm": 0.2509994097045669, + "learning_rate": 0.00012177371898171011, + "loss": 1.0955, + "step": 9587 + }, + { + "epoch": 0.92, + "grad_norm": 0.25595385108043606, + "learning_rate": 0.00012175827876878085, + "loss": 1.0905, + "step": 9588 + }, + { + "epoch": 0.92, + "grad_norm": 0.29970697558699483, + "learning_rate": 0.00012174283801133701, + "loss": 1.08, + "step": 9589 + }, + { + "epoch": 0.92, + "grad_norm": 0.25132811288192697, + "learning_rate": 0.00012172739670976497, + "loss": 0.9933, + "step": 9590 + }, + { + "epoch": 0.92, + "grad_norm": 0.2795967397200255, + "learning_rate": 0.00012171195486445115, + "loss": 1.0812, + "step": 9591 + }, + { + "epoch": 0.92, + "grad_norm": 0.27861509817589664, + "learning_rate": 0.00012169651247578205, + "loss": 1.0368, + "step": 9592 + }, + { + "epoch": 0.92, + "grad_norm": 0.2916254831937168, + "learning_rate": 0.00012168106954414406, + "loss": 1.0809, + "step": 9593 + }, + { + "epoch": 0.92, + "grad_norm": 0.2922945494777507, + "learning_rate": 0.00012166562606992368, + "loss": 1.1979, + "step": 9594 + }, + { + "epoch": 0.92, + "grad_norm": 0.27637344653802837, + "learning_rate": 0.0001216501820535074, + "loss": 0.957, + "step": 9595 + }, + { + "epoch": 0.92, + "grad_norm": 0.2890101477293797, + "learning_rate": 0.00012163473749528169, + "loss": 1.0743, + "step": 9596 + }, + { + "epoch": 0.92, + "grad_norm": 0.27375555736611096, + "learning_rate": 0.0001216192923956331, + "loss": 1.0294, + "step": 9597 + }, + { + "epoch": 0.92, + "grad_norm": 0.3020964878540027, + "learning_rate": 0.00012160384675494811, + "loss": 1.117, + "step": 9598 + }, + { + "epoch": 0.92, + "grad_norm": 0.31564811381117275, + "learning_rate": 0.00012158840057361332, + "loss": 1.1155, + "step": 9599 + }, + { + "epoch": 0.92, + "grad_norm": 0.30570724996592125, + "learning_rate": 0.00012157295385201522, + "loss": 1.0954, + "step": 9600 + }, + { + "epoch": 0.92, + "grad_norm": 0.2772559483117768, + "learning_rate": 0.00012155750659054035, + "loss": 1.0597, + "step": 9601 + }, + { + "epoch": 0.92, + "grad_norm": 0.27996328612745197, + "learning_rate": 0.00012154205878957539, + "loss": 1.0686, + "step": 9602 + }, + { + "epoch": 0.92, + "grad_norm": 0.2862321218720333, + "learning_rate": 0.00012152661044950684, + "loss": 0.9934, + "step": 9603 + }, + { + "epoch": 0.92, + "grad_norm": 0.2853452333425892, + "learning_rate": 0.00012151116157072132, + "loss": 1.1348, + "step": 9604 + }, + { + "epoch": 0.92, + "grad_norm": 0.2617628600038958, + "learning_rate": 0.00012149571215360547, + "loss": 0.97, + "step": 9605 + }, + { + "epoch": 0.92, + "grad_norm": 0.25696280278663025, + "learning_rate": 0.00012148026219854594, + "loss": 1.0261, + "step": 9606 + }, + { + "epoch": 0.92, + "grad_norm": 0.310031454374293, + "learning_rate": 0.00012146481170592933, + "loss": 1.0158, + "step": 9607 + }, + { + "epoch": 0.92, + "grad_norm": 0.2790355327820539, + "learning_rate": 0.0001214493606761423, + "loss": 1.1757, + "step": 9608 + }, + { + "epoch": 0.92, + "grad_norm": 0.30770923269815564, + "learning_rate": 0.00012143390910957157, + "loss": 1.1518, + "step": 9609 + }, + { + "epoch": 0.92, + "grad_norm": 0.30029358695910546, + "learning_rate": 0.00012141845700660379, + "loss": 1.1968, + "step": 9610 + }, + { + "epoch": 0.92, + "grad_norm": 0.3098726943709629, + "learning_rate": 0.00012140300436762564, + "loss": 1.0352, + "step": 9611 + }, + { + "epoch": 0.92, + "grad_norm": 0.2959419907143475, + "learning_rate": 0.00012138755119302388, + "loss": 1.0186, + "step": 9612 + }, + { + "epoch": 0.92, + "grad_norm": 0.27785698197656766, + "learning_rate": 0.00012137209748318521, + "loss": 1.0375, + "step": 9613 + }, + { + "epoch": 0.92, + "grad_norm": 0.2930400342362595, + "learning_rate": 0.00012135664323849634, + "loss": 0.9418, + "step": 9614 + }, + { + "epoch": 0.92, + "grad_norm": 0.28085631013272555, + "learning_rate": 0.00012134118845934405, + "loss": 1.0465, + "step": 9615 + }, + { + "epoch": 0.92, + "grad_norm": 0.2541256790591151, + "learning_rate": 0.00012132573314611516, + "loss": 1.0293, + "step": 9616 + }, + { + "epoch": 0.92, + "grad_norm": 0.28908641994102546, + "learning_rate": 0.00012131027729919633, + "loss": 1.078, + "step": 9617 + }, + { + "epoch": 0.92, + "grad_norm": 0.2772893250241189, + "learning_rate": 0.00012129482091897446, + "loss": 1.1151, + "step": 9618 + }, + { + "epoch": 0.92, + "grad_norm": 0.327164314875272, + "learning_rate": 0.00012127936400583629, + "loss": 1.0591, + "step": 9619 + }, + { + "epoch": 0.92, + "grad_norm": 0.2886231869799765, + "learning_rate": 0.00012126390656016866, + "loss": 1.0113, + "step": 9620 + }, + { + "epoch": 0.92, + "grad_norm": 0.283680263633664, + "learning_rate": 0.0001212484485823584, + "loss": 1.0249, + "step": 9621 + }, + { + "epoch": 0.92, + "grad_norm": 0.26111321954434336, + "learning_rate": 0.00012123299007279238, + "loss": 1.0342, + "step": 9622 + }, + { + "epoch": 0.92, + "grad_norm": 0.2955195301556354, + "learning_rate": 0.00012121753103185745, + "loss": 0.9461, + "step": 9623 + }, + { + "epoch": 0.92, + "grad_norm": 0.27409190677236495, + "learning_rate": 0.00012120207145994045, + "loss": 1.0637, + "step": 9624 + }, + { + "epoch": 0.92, + "grad_norm": 0.26924614303903166, + "learning_rate": 0.00012118661135742828, + "loss": 0.9253, + "step": 9625 + }, + { + "epoch": 0.92, + "grad_norm": 0.2692477485784918, + "learning_rate": 0.00012117115072470788, + "loss": 1.0827, + "step": 9626 + }, + { + "epoch": 0.92, + "grad_norm": 0.26774970205249055, + "learning_rate": 0.0001211556895621661, + "loss": 1.0528, + "step": 9627 + }, + { + "epoch": 0.92, + "grad_norm": 0.29867507929874776, + "learning_rate": 0.00012114022787018988, + "loss": 1.0436, + "step": 9628 + }, + { + "epoch": 0.92, + "grad_norm": 0.3208070747287111, + "learning_rate": 0.00012112476564916622, + "loss": 1.1365, + "step": 9629 + }, + { + "epoch": 0.92, + "grad_norm": 0.2972966476878782, + "learning_rate": 0.000121109302899482, + "loss": 1.1186, + "step": 9630 + }, + { + "epoch": 0.92, + "grad_norm": 0.30544954734093066, + "learning_rate": 0.00012109383962152416, + "loss": 1.1047, + "step": 9631 + }, + { + "epoch": 0.92, + "grad_norm": 0.2967657674672275, + "learning_rate": 0.00012107837581567977, + "loss": 1.072, + "step": 9632 + }, + { + "epoch": 0.92, + "grad_norm": 0.3033096939499795, + "learning_rate": 0.00012106291148233579, + "loss": 1.0254, + "step": 9633 + }, + { + "epoch": 0.92, + "grad_norm": 0.26626395824315857, + "learning_rate": 0.00012104744662187922, + "loss": 0.9941, + "step": 9634 + }, + { + "epoch": 0.92, + "grad_norm": 0.2895890185705288, + "learning_rate": 0.00012103198123469704, + "loss": 1.0531, + "step": 9635 + }, + { + "epoch": 0.92, + "grad_norm": 0.2957215592529329, + "learning_rate": 0.00012101651532117632, + "loss": 1.0199, + "step": 9636 + }, + { + "epoch": 0.92, + "grad_norm": 0.2966423558454665, + "learning_rate": 0.00012100104888170407, + "loss": 0.9559, + "step": 9637 + }, + { + "epoch": 0.92, + "grad_norm": 0.3146917874326988, + "learning_rate": 0.00012098558191666742, + "loss": 1.1623, + "step": 9638 + }, + { + "epoch": 0.92, + "grad_norm": 0.33430522861396766, + "learning_rate": 0.00012097011442645337, + "loss": 1.0295, + "step": 9639 + }, + { + "epoch": 0.92, + "grad_norm": 0.2725271575959663, + "learning_rate": 0.00012095464641144902, + "loss": 1.0745, + "step": 9640 + }, + { + "epoch": 0.92, + "grad_norm": 0.32836523657529365, + "learning_rate": 0.00012093917787204148, + "loss": 1.0739, + "step": 9641 + }, + { + "epoch": 0.92, + "grad_norm": 0.2905477137452141, + "learning_rate": 0.00012092370880861786, + "loss": 0.9218, + "step": 9642 + }, + { + "epoch": 0.92, + "grad_norm": 0.28312434963987126, + "learning_rate": 0.00012090823922156526, + "loss": 1.1302, + "step": 9643 + }, + { + "epoch": 0.92, + "grad_norm": 0.31119878698668474, + "learning_rate": 0.00012089276911127088, + "loss": 1.1067, + "step": 9644 + }, + { + "epoch": 0.92, + "grad_norm": 0.24999334472319124, + "learning_rate": 0.00012087729847812176, + "loss": 1.0741, + "step": 9645 + }, + { + "epoch": 0.92, + "grad_norm": 0.2543284979471403, + "learning_rate": 0.00012086182732250517, + "loss": 0.9579, + "step": 9646 + }, + { + "epoch": 0.92, + "grad_norm": 0.2594464862081111, + "learning_rate": 0.00012084635564480824, + "loss": 1.0921, + "step": 9647 + }, + { + "epoch": 0.92, + "grad_norm": 0.3158864173446626, + "learning_rate": 0.00012083088344541813, + "loss": 1.1075, + "step": 9648 + }, + { + "epoch": 0.92, + "grad_norm": 0.29277153816070817, + "learning_rate": 0.00012081541072472208, + "loss": 1.0819, + "step": 9649 + }, + { + "epoch": 0.92, + "grad_norm": 0.285551700825859, + "learning_rate": 0.00012079993748310729, + "loss": 1.1289, + "step": 9650 + }, + { + "epoch": 0.92, + "grad_norm": 0.26578837779844633, + "learning_rate": 0.00012078446372096102, + "loss": 1.0786, + "step": 9651 + }, + { + "epoch": 0.92, + "grad_norm": 0.2770368943423453, + "learning_rate": 0.00012076898943867046, + "loss": 1.2504, + "step": 9652 + }, + { + "epoch": 0.92, + "grad_norm": 0.2915791206912121, + "learning_rate": 0.0001207535146366229, + "loss": 1.0691, + "step": 9653 + }, + { + "epoch": 0.92, + "grad_norm": 0.29701485898418156, + "learning_rate": 0.00012073803931520557, + "loss": 1.0343, + "step": 9654 + }, + { + "epoch": 0.92, + "grad_norm": 0.3164931927910207, + "learning_rate": 0.0001207225634748058, + "loss": 1.0555, + "step": 9655 + }, + { + "epoch": 0.92, + "grad_norm": 0.26381700408017583, + "learning_rate": 0.00012070708711581083, + "loss": 1.1291, + "step": 9656 + }, + { + "epoch": 0.92, + "grad_norm": 0.29962327944839096, + "learning_rate": 0.00012069161023860802, + "loss": 1.039, + "step": 9657 + }, + { + "epoch": 0.92, + "grad_norm": 0.3144393601015309, + "learning_rate": 0.00012067613284358461, + "loss": 1.1407, + "step": 9658 + }, + { + "epoch": 0.92, + "grad_norm": 0.29485729590956794, + "learning_rate": 0.00012066065493112803, + "loss": 0.9772, + "step": 9659 + }, + { + "epoch": 0.92, + "grad_norm": 0.3317695176709224, + "learning_rate": 0.00012064517650162555, + "loss": 1.0321, + "step": 9660 + }, + { + "epoch": 0.92, + "grad_norm": 0.2795257128963298, + "learning_rate": 0.00012062969755546456, + "loss": 1.1162, + "step": 9661 + }, + { + "epoch": 0.92, + "grad_norm": 0.3308060051457206, + "learning_rate": 0.00012061421809303241, + "loss": 0.9852, + "step": 9662 + }, + { + "epoch": 0.92, + "grad_norm": 0.284967097310632, + "learning_rate": 0.00012059873811471651, + "loss": 0.9865, + "step": 9663 + }, + { + "epoch": 0.92, + "grad_norm": 0.31468502602583576, + "learning_rate": 0.00012058325762090426, + "loss": 1.0112, + "step": 9664 + }, + { + "epoch": 0.92, + "grad_norm": 0.2760079699322398, + "learning_rate": 0.00012056777661198301, + "loss": 0.9597, + "step": 9665 + }, + { + "epoch": 0.92, + "grad_norm": 0.2911837084304875, + "learning_rate": 0.00012055229508834027, + "loss": 1.0258, + "step": 9666 + }, + { + "epoch": 0.92, + "grad_norm": 0.30981871374584724, + "learning_rate": 0.00012053681305036342, + "loss": 1.1091, + "step": 9667 + }, + { + "epoch": 0.92, + "grad_norm": 0.28199493834041306, + "learning_rate": 0.00012052133049843992, + "loss": 0.9613, + "step": 9668 + }, + { + "epoch": 0.93, + "grad_norm": 0.2833626576757252, + "learning_rate": 0.00012050584743295718, + "loss": 1.0918, + "step": 9669 + }, + { + "epoch": 0.93, + "grad_norm": 0.32179313490464345, + "learning_rate": 0.00012049036385430277, + "loss": 1.0589, + "step": 9670 + }, + { + "epoch": 0.93, + "grad_norm": 0.3529820688530236, + "learning_rate": 0.0001204748797628641, + "loss": 1.0392, + "step": 9671 + }, + { + "epoch": 0.93, + "grad_norm": 0.316182685666834, + "learning_rate": 0.00012045939515902872, + "loss": 1.091, + "step": 9672 + }, + { + "epoch": 0.93, + "grad_norm": 0.3132427307808237, + "learning_rate": 0.00012044391004318409, + "loss": 1.1227, + "step": 9673 + }, + { + "epoch": 0.93, + "grad_norm": 0.2997545510559344, + "learning_rate": 0.0001204284244157178, + "loss": 1.0994, + "step": 9674 + }, + { + "epoch": 0.93, + "grad_norm": 0.2536805673335262, + "learning_rate": 0.00012041293827701729, + "loss": 0.9636, + "step": 9675 + }, + { + "epoch": 0.93, + "grad_norm": 0.2876440010059215, + "learning_rate": 0.00012039745162747022, + "loss": 1.1536, + "step": 9676 + }, + { + "epoch": 0.93, + "grad_norm": 0.2691609443880876, + "learning_rate": 0.0001203819644674641, + "loss": 1.1113, + "step": 9677 + }, + { + "epoch": 0.93, + "grad_norm": 0.25701727543705255, + "learning_rate": 0.00012036647679738649, + "loss": 0.9819, + "step": 9678 + }, + { + "epoch": 0.93, + "grad_norm": 0.29817078458954743, + "learning_rate": 0.00012035098861762502, + "loss": 0.9931, + "step": 9679 + }, + { + "epoch": 0.93, + "grad_norm": 0.2642831901167002, + "learning_rate": 0.00012033549992856726, + "loss": 1.0036, + "step": 9680 + }, + { + "epoch": 0.93, + "grad_norm": 0.29959073213056514, + "learning_rate": 0.00012032001073060082, + "loss": 1.1449, + "step": 9681 + }, + { + "epoch": 0.93, + "grad_norm": 0.32204374587710666, + "learning_rate": 0.00012030452102411333, + "loss": 0.9079, + "step": 9682 + }, + { + "epoch": 0.93, + "grad_norm": 0.30732832828204176, + "learning_rate": 0.00012028903080949248, + "loss": 0.8944, + "step": 9683 + }, + { + "epoch": 0.93, + "grad_norm": 0.27203865429830504, + "learning_rate": 0.00012027354008712588, + "loss": 0.9959, + "step": 9684 + }, + { + "epoch": 0.93, + "grad_norm": 0.31723507213333874, + "learning_rate": 0.0001202580488574012, + "loss": 1.0008, + "step": 9685 + }, + { + "epoch": 0.93, + "grad_norm": 0.2753881383976869, + "learning_rate": 0.00012024255712070607, + "loss": 0.9831, + "step": 9686 + }, + { + "epoch": 0.93, + "grad_norm": 0.2905972632173138, + "learning_rate": 0.00012022706487742827, + "loss": 0.9718, + "step": 9687 + }, + { + "epoch": 0.93, + "grad_norm": 0.3441351333384633, + "learning_rate": 0.00012021157212795544, + "loss": 1.1038, + "step": 9688 + }, + { + "epoch": 0.93, + "grad_norm": 0.30829573394192783, + "learning_rate": 0.00012019607887267532, + "loss": 1.0747, + "step": 9689 + }, + { + "epoch": 0.93, + "grad_norm": 0.3116599969382794, + "learning_rate": 0.00012018058511197563, + "loss": 0.9906, + "step": 9690 + }, + { + "epoch": 0.93, + "grad_norm": 0.32153118671256326, + "learning_rate": 0.00012016509084624413, + "loss": 1.0987, + "step": 9691 + }, + { + "epoch": 0.93, + "grad_norm": 0.30020496331906704, + "learning_rate": 0.00012014959607586853, + "loss": 0.9893, + "step": 9692 + }, + { + "epoch": 0.93, + "grad_norm": 0.2488961913424749, + "learning_rate": 0.00012013410080123666, + "loss": 1.0128, + "step": 9693 + }, + { + "epoch": 0.93, + "grad_norm": 0.2894864987548495, + "learning_rate": 0.00012011860502273625, + "loss": 1.0898, + "step": 9694 + }, + { + "epoch": 0.93, + "grad_norm": 0.339459814596149, + "learning_rate": 0.0001201031087407551, + "loss": 0.972, + "step": 9695 + }, + { + "epoch": 0.93, + "grad_norm": 0.28936642655855416, + "learning_rate": 0.00012008761195568101, + "loss": 1.1054, + "step": 9696 + }, + { + "epoch": 0.93, + "grad_norm": 0.29865336998518743, + "learning_rate": 0.00012007211466790183, + "loss": 1.0518, + "step": 9697 + }, + { + "epoch": 0.93, + "grad_norm": 0.29410810287938477, + "learning_rate": 0.00012005661687780537, + "loss": 1.0348, + "step": 9698 + }, + { + "epoch": 0.93, + "grad_norm": 0.28715006026168827, + "learning_rate": 0.00012004111858577942, + "loss": 1.0277, + "step": 9699 + }, + { + "epoch": 0.93, + "grad_norm": 0.3246035439472215, + "learning_rate": 0.00012002561979221191, + "loss": 1.0978, + "step": 9700 + }, + { + "epoch": 0.93, + "grad_norm": 0.28714503427033067, + "learning_rate": 0.00012001012049749067, + "loss": 1.0528, + "step": 9701 + }, + { + "epoch": 0.93, + "grad_norm": 0.2999877264540504, + "learning_rate": 0.0001199946207020036, + "loss": 1.0535, + "step": 9702 + }, + { + "epoch": 0.93, + "grad_norm": 0.2831842528289797, + "learning_rate": 0.00011997912040613856, + "loss": 1.0861, + "step": 9703 + }, + { + "epoch": 0.93, + "grad_norm": 0.27863916487179596, + "learning_rate": 0.00011996361961028351, + "loss": 1.0172, + "step": 9704 + }, + { + "epoch": 0.93, + "grad_norm": 0.3076428593870626, + "learning_rate": 0.00011994811831482633, + "loss": 1.154, + "step": 9705 + }, + { + "epoch": 0.93, + "grad_norm": 0.3178394621331127, + "learning_rate": 0.00011993261652015493, + "loss": 1.0459, + "step": 9706 + }, + { + "epoch": 0.93, + "grad_norm": 0.2834780610000245, + "learning_rate": 0.00011991711422665728, + "loss": 0.8991, + "step": 9707 + }, + { + "epoch": 0.93, + "grad_norm": 0.28248971237831144, + "learning_rate": 0.00011990161143472134, + "loss": 1.0412, + "step": 9708 + }, + { + "epoch": 0.93, + "grad_norm": 0.37293406629713854, + "learning_rate": 0.00011988610814473504, + "loss": 1.1236, + "step": 9709 + }, + { + "epoch": 0.93, + "grad_norm": 0.29203647947031824, + "learning_rate": 0.00011987060435708643, + "loss": 0.972, + "step": 9710 + }, + { + "epoch": 0.93, + "grad_norm": 0.2784065820475021, + "learning_rate": 0.00011985510007216343, + "loss": 1.0975, + "step": 9711 + }, + { + "epoch": 0.93, + "grad_norm": 0.2967922411536282, + "learning_rate": 0.0001198395952903541, + "loss": 1.0979, + "step": 9712 + }, + { + "epoch": 0.93, + "grad_norm": 0.29200788752529727, + "learning_rate": 0.00011982409001204637, + "loss": 1.0726, + "step": 9713 + }, + { + "epoch": 0.93, + "grad_norm": 0.2692696205517818, + "learning_rate": 0.00011980858423762837, + "loss": 1.0631, + "step": 9714 + }, + { + "epoch": 0.93, + "grad_norm": 0.2965913502544387, + "learning_rate": 0.00011979307796748811, + "loss": 1.1334, + "step": 9715 + }, + { + "epoch": 0.93, + "grad_norm": 0.28309793277521694, + "learning_rate": 0.0001197775712020136, + "loss": 1.1148, + "step": 9716 + }, + { + "epoch": 0.93, + "grad_norm": 0.2842362819659992, + "learning_rate": 0.00011976206394159297, + "loss": 1.089, + "step": 9717 + }, + { + "epoch": 0.93, + "grad_norm": 0.30869020384451756, + "learning_rate": 0.00011974655618661425, + "loss": 1.1634, + "step": 9718 + }, + { + "epoch": 0.93, + "grad_norm": 0.2919947030319265, + "learning_rate": 0.00011973104793746554, + "loss": 0.9976, + "step": 9719 + }, + { + "epoch": 0.93, + "grad_norm": 0.28107687880463733, + "learning_rate": 0.000119715539194535, + "loss": 1.0656, + "step": 9720 + }, + { + "epoch": 0.93, + "grad_norm": 0.2538194528442572, + "learning_rate": 0.00011970002995821069, + "loss": 0.9763, + "step": 9721 + }, + { + "epoch": 0.93, + "grad_norm": 0.3042314896103001, + "learning_rate": 0.0001196845202288807, + "loss": 1.1705, + "step": 9722 + }, + { + "epoch": 0.93, + "grad_norm": 0.2572188222835515, + "learning_rate": 0.00011966901000693325, + "loss": 1.0154, + "step": 9723 + }, + { + "epoch": 0.93, + "grad_norm": 0.28736803000564753, + "learning_rate": 0.00011965349929275646, + "loss": 0.9974, + "step": 9724 + }, + { + "epoch": 0.93, + "grad_norm": 0.31110944588178774, + "learning_rate": 0.00011963798808673852, + "loss": 1.0147, + "step": 9725 + }, + { + "epoch": 0.93, + "grad_norm": 0.2875577964788188, + "learning_rate": 0.00011962247638926755, + "loss": 1.0678, + "step": 9726 + }, + { + "epoch": 0.93, + "grad_norm": 0.2700897281383975, + "learning_rate": 0.00011960696420073181, + "loss": 1.1177, + "step": 9727 + }, + { + "epoch": 0.93, + "grad_norm": 0.2806117435648341, + "learning_rate": 0.00011959145152151947, + "loss": 1.012, + "step": 9728 + }, + { + "epoch": 0.93, + "grad_norm": 0.31209826283604314, + "learning_rate": 0.00011957593835201875, + "loss": 0.9591, + "step": 9729 + }, + { + "epoch": 0.93, + "grad_norm": 0.2750239320324656, + "learning_rate": 0.00011956042469261781, + "loss": 1.1095, + "step": 9730 + }, + { + "epoch": 0.93, + "grad_norm": 0.28264832045438565, + "learning_rate": 0.000119544910543705, + "loss": 1.037, + "step": 9731 + }, + { + "epoch": 0.93, + "grad_norm": 0.3300651441181935, + "learning_rate": 0.00011952939590566852, + "loss": 1.0304, + "step": 9732 + }, + { + "epoch": 0.93, + "grad_norm": 0.2859521337438915, + "learning_rate": 0.0001195138807788966, + "loss": 0.969, + "step": 9733 + }, + { + "epoch": 0.93, + "grad_norm": 0.2828356666065811, + "learning_rate": 0.00011949836516377759, + "loss": 1.004, + "step": 9734 + }, + { + "epoch": 0.93, + "grad_norm": 0.2696647965723336, + "learning_rate": 0.00011948284906069974, + "loss": 1.089, + "step": 9735 + }, + { + "epoch": 0.93, + "grad_norm": 0.2708560032776781, + "learning_rate": 0.00011946733247005131, + "loss": 0.9534, + "step": 9736 + }, + { + "epoch": 0.93, + "grad_norm": 0.3142091199851108, + "learning_rate": 0.00011945181539222065, + "loss": 1.1159, + "step": 9737 + }, + { + "epoch": 0.93, + "grad_norm": 0.2580939496791897, + "learning_rate": 0.00011943629782759611, + "loss": 0.9785, + "step": 9738 + }, + { + "epoch": 0.93, + "grad_norm": 0.29871072295327306, + "learning_rate": 0.00011942077977656601, + "loss": 1.072, + "step": 9739 + }, + { + "epoch": 0.93, + "grad_norm": 0.28134297529949587, + "learning_rate": 0.00011940526123951865, + "loss": 1.0097, + "step": 9740 + }, + { + "epoch": 0.93, + "grad_norm": 0.27030245416409626, + "learning_rate": 0.00011938974221684248, + "loss": 1.0362, + "step": 9741 + }, + { + "epoch": 0.93, + "grad_norm": 0.27891127006754407, + "learning_rate": 0.00011937422270892578, + "loss": 1.1503, + "step": 9742 + }, + { + "epoch": 0.93, + "grad_norm": 0.2527819203001018, + "learning_rate": 0.00011935870271615701, + "loss": 1.0682, + "step": 9743 + }, + { + "epoch": 0.93, + "grad_norm": 0.25930692949225115, + "learning_rate": 0.00011934318223892451, + "loss": 1.062, + "step": 9744 + }, + { + "epoch": 0.93, + "grad_norm": 0.2875000837745984, + "learning_rate": 0.00011932766127761675, + "loss": 0.9477, + "step": 9745 + }, + { + "epoch": 0.93, + "grad_norm": 0.325246715534001, + "learning_rate": 0.00011931213983262211, + "loss": 1.0165, + "step": 9746 + }, + { + "epoch": 0.93, + "grad_norm": 0.25662864976415845, + "learning_rate": 0.00011929661790432903, + "loss": 1.0786, + "step": 9747 + }, + { + "epoch": 0.93, + "grad_norm": 0.260785274906786, + "learning_rate": 0.00011928109549312596, + "loss": 1.0819, + "step": 9748 + }, + { + "epoch": 0.93, + "grad_norm": 0.24971660647552188, + "learning_rate": 0.00011926557259940137, + "loss": 1.1172, + "step": 9749 + }, + { + "epoch": 0.93, + "grad_norm": 0.2634144133215548, + "learning_rate": 0.00011925004922354368, + "loss": 1.0457, + "step": 9750 + }, + { + "epoch": 0.93, + "grad_norm": 0.28481353084008365, + "learning_rate": 0.00011923452536594144, + "loss": 1.0667, + "step": 9751 + }, + { + "epoch": 0.93, + "grad_norm": 0.2480420939966222, + "learning_rate": 0.00011921900102698312, + "loss": 1.0387, + "step": 9752 + }, + { + "epoch": 0.93, + "grad_norm": 0.290303015413711, + "learning_rate": 0.00011920347620705719, + "loss": 0.9624, + "step": 9753 + }, + { + "epoch": 0.93, + "grad_norm": 0.30582174621304614, + "learning_rate": 0.00011918795090655221, + "loss": 0.982, + "step": 9754 + }, + { + "epoch": 0.93, + "grad_norm": 0.2655028218340249, + "learning_rate": 0.00011917242512585674, + "loss": 1.0536, + "step": 9755 + }, + { + "epoch": 0.93, + "grad_norm": 0.27477226526624493, + "learning_rate": 0.00011915689886535923, + "loss": 0.9856, + "step": 9756 + }, + { + "epoch": 0.93, + "grad_norm": 0.28682754344560973, + "learning_rate": 0.00011914137212544831, + "loss": 1.1707, + "step": 9757 + }, + { + "epoch": 0.93, + "grad_norm": 0.26153050639845216, + "learning_rate": 0.00011912584490651253, + "loss": 1.0497, + "step": 9758 + }, + { + "epoch": 0.93, + "grad_norm": 0.28229038067976947, + "learning_rate": 0.00011911031720894046, + "loss": 0.9965, + "step": 9759 + }, + { + "epoch": 0.93, + "grad_norm": 0.28006029083616607, + "learning_rate": 0.00011909478903312066, + "loss": 1.0668, + "step": 9760 + }, + { + "epoch": 0.93, + "grad_norm": 0.27740453645729457, + "learning_rate": 0.00011907926037944179, + "loss": 1.0366, + "step": 9761 + }, + { + "epoch": 0.93, + "grad_norm": 0.2384257527120478, + "learning_rate": 0.00011906373124829244, + "loss": 1.0827, + "step": 9762 + }, + { + "epoch": 0.93, + "grad_norm": 0.293678354692542, + "learning_rate": 0.0001190482016400612, + "loss": 1.0645, + "step": 9763 + }, + { + "epoch": 0.93, + "grad_norm": 0.2938241245866292, + "learning_rate": 0.00011903267155513677, + "loss": 1.055, + "step": 9764 + }, + { + "epoch": 0.93, + "grad_norm": 0.26615653569909886, + "learning_rate": 0.00011901714099390777, + "loss": 1.0972, + "step": 9765 + }, + { + "epoch": 0.93, + "grad_norm": 0.28171980966420157, + "learning_rate": 0.00011900160995676288, + "loss": 0.9422, + "step": 9766 + }, + { + "epoch": 0.93, + "grad_norm": 0.26740119824230185, + "learning_rate": 0.00011898607844409073, + "loss": 1.0326, + "step": 9767 + }, + { + "epoch": 0.93, + "grad_norm": 0.27023112187042453, + "learning_rate": 0.00011897054645628005, + "loss": 0.9777, + "step": 9768 + }, + { + "epoch": 0.93, + "grad_norm": 0.3200885822581613, + "learning_rate": 0.00011895501399371953, + "loss": 1.0587, + "step": 9769 + }, + { + "epoch": 0.93, + "grad_norm": 0.31831634758810756, + "learning_rate": 0.00011893948105679787, + "loss": 1.0764, + "step": 9770 + }, + { + "epoch": 0.93, + "grad_norm": 0.3081776019474058, + "learning_rate": 0.00011892394764590378, + "loss": 1.0104, + "step": 9771 + }, + { + "epoch": 0.93, + "grad_norm": 0.28957521753453463, + "learning_rate": 0.00011890841376142603, + "loss": 1.0592, + "step": 9772 + }, + { + "epoch": 0.93, + "grad_norm": 0.28207411325497395, + "learning_rate": 0.00011889287940375334, + "loss": 1.104, + "step": 9773 + }, + { + "epoch": 0.94, + "grad_norm": 0.36180324747623, + "learning_rate": 0.00011887734457327443, + "loss": 1.0525, + "step": 9774 + }, + { + "epoch": 0.94, + "grad_norm": 0.2910045433010645, + "learning_rate": 0.00011886180927037815, + "loss": 1.087, + "step": 9775 + }, + { + "epoch": 0.94, + "grad_norm": 0.28543697105123744, + "learning_rate": 0.00011884627349545323, + "loss": 1.0952, + "step": 9776 + }, + { + "epoch": 0.94, + "grad_norm": 0.2849455865427251, + "learning_rate": 0.00011883073724888844, + "loss": 1.0036, + "step": 9777 + }, + { + "epoch": 0.94, + "grad_norm": 0.3445968201656461, + "learning_rate": 0.00011881520053107267, + "loss": 1.0498, + "step": 9778 + }, + { + "epoch": 0.94, + "grad_norm": 0.2506321633298875, + "learning_rate": 0.00011879966334239466, + "loss": 1.1624, + "step": 9779 + }, + { + "epoch": 0.94, + "grad_norm": 0.29162103318787663, + "learning_rate": 0.00011878412568324322, + "loss": 1.0233, + "step": 9780 + }, + { + "epoch": 0.94, + "grad_norm": 0.29500101419963837, + "learning_rate": 0.00011876858755400728, + "loss": 1.0922, + "step": 9781 + }, + { + "epoch": 0.94, + "grad_norm": 0.3009884592875106, + "learning_rate": 0.00011875304895507562, + "loss": 1.1165, + "step": 9782 + }, + { + "epoch": 0.94, + "grad_norm": 0.27565694136640945, + "learning_rate": 0.00011873750988683712, + "loss": 1.0688, + "step": 9783 + }, + { + "epoch": 0.94, + "grad_norm": 0.3273050375332514, + "learning_rate": 0.00011872197034968067, + "loss": 1.1118, + "step": 9784 + }, + { + "epoch": 0.94, + "grad_norm": 0.29960996186748606, + "learning_rate": 0.00011870643034399514, + "loss": 1.2125, + "step": 9785 + }, + { + "epoch": 0.94, + "grad_norm": 0.2964779408152491, + "learning_rate": 0.00011869088987016943, + "loss": 1.1393, + "step": 9786 + }, + { + "epoch": 0.94, + "grad_norm": 0.2858838476728597, + "learning_rate": 0.00011867534892859244, + "loss": 1.0267, + "step": 9787 + }, + { + "epoch": 0.94, + "grad_norm": 0.2852488900654838, + "learning_rate": 0.00011865980751965313, + "loss": 1.1121, + "step": 9788 + }, + { + "epoch": 0.94, + "grad_norm": 0.29119397522264384, + "learning_rate": 0.00011864426564374043, + "loss": 1.0825, + "step": 9789 + }, + { + "epoch": 0.94, + "grad_norm": 0.2594508280784169, + "learning_rate": 0.00011862872330124324, + "loss": 1.154, + "step": 9790 + }, + { + "epoch": 0.94, + "grad_norm": 0.2975677876895368, + "learning_rate": 0.00011861318049255052, + "loss": 1.0173, + "step": 9791 + }, + { + "epoch": 0.94, + "grad_norm": 0.2648318009564742, + "learning_rate": 0.00011859763721805128, + "loss": 1.0818, + "step": 9792 + }, + { + "epoch": 0.94, + "grad_norm": 0.2475611229418435, + "learning_rate": 0.00011858209347813449, + "loss": 0.9974, + "step": 9793 + }, + { + "epoch": 0.94, + "grad_norm": 0.24368772705519215, + "learning_rate": 0.00011856654927318914, + "loss": 1.0076, + "step": 9794 + }, + { + "epoch": 0.94, + "grad_norm": 0.2589222900696365, + "learning_rate": 0.0001185510046036042, + "loss": 1.0072, + "step": 9795 + }, + { + "epoch": 0.94, + "grad_norm": 0.2518421468766372, + "learning_rate": 0.00011853545946976874, + "loss": 1.1606, + "step": 9796 + }, + { + "epoch": 0.94, + "grad_norm": 0.295292430219631, + "learning_rate": 0.00011851991387207171, + "loss": 0.98, + "step": 9797 + }, + { + "epoch": 0.94, + "grad_norm": 0.29245516327817817, + "learning_rate": 0.00011850436781090223, + "loss": 1.0599, + "step": 9798 + }, + { + "epoch": 0.94, + "grad_norm": 0.2914565208305866, + "learning_rate": 0.00011848882128664933, + "loss": 0.9911, + "step": 9799 + }, + { + "epoch": 0.94, + "grad_norm": 0.3078689389856212, + "learning_rate": 0.00011847327429970203, + "loss": 0.9396, + "step": 9800 + }, + { + "epoch": 0.94, + "grad_norm": 0.3127304428982239, + "learning_rate": 0.00011845772685044945, + "loss": 1.0645, + "step": 9801 + }, + { + "epoch": 0.94, + "grad_norm": 0.2873883647576085, + "learning_rate": 0.00011844217893928064, + "loss": 0.9597, + "step": 9802 + }, + { + "epoch": 0.94, + "grad_norm": 0.29524076369164753, + "learning_rate": 0.00011842663056658471, + "loss": 1.054, + "step": 9803 + }, + { + "epoch": 0.94, + "grad_norm": 0.30216797407479606, + "learning_rate": 0.00011841108173275078, + "loss": 1.0366, + "step": 9804 + }, + { + "epoch": 0.94, + "grad_norm": 0.2683352824632881, + "learning_rate": 0.00011839553243816794, + "loss": 1.1457, + "step": 9805 + }, + { + "epoch": 0.94, + "grad_norm": 0.3088671649045264, + "learning_rate": 0.00011837998268322535, + "loss": 0.9811, + "step": 9806 + }, + { + "epoch": 0.94, + "grad_norm": 0.3127798071067639, + "learning_rate": 0.00011836443246831215, + "loss": 1.0054, + "step": 9807 + }, + { + "epoch": 0.94, + "grad_norm": 0.2958330910482311, + "learning_rate": 0.00011834888179381746, + "loss": 0.9717, + "step": 9808 + }, + { + "epoch": 0.94, + "grad_norm": 0.24632414148720827, + "learning_rate": 0.00011833333066013051, + "loss": 1.0858, + "step": 9809 + }, + { + "epoch": 0.94, + "grad_norm": 0.2723921866158821, + "learning_rate": 0.00011831777906764044, + "loss": 1.0244, + "step": 9810 + }, + { + "epoch": 0.94, + "grad_norm": 0.2975337869324926, + "learning_rate": 0.00011830222701673639, + "loss": 1.0093, + "step": 9811 + }, + { + "epoch": 0.94, + "grad_norm": 0.28650604212889696, + "learning_rate": 0.00011828667450780764, + "loss": 1.069, + "step": 9812 + }, + { + "epoch": 0.94, + "grad_norm": 0.27777382822296665, + "learning_rate": 0.00011827112154124338, + "loss": 1.1207, + "step": 9813 + }, + { + "epoch": 0.94, + "grad_norm": 0.2870567816741802, + "learning_rate": 0.00011825556811743279, + "loss": 1.1142, + "step": 9814 + }, + { + "epoch": 0.94, + "grad_norm": 0.30386027251939157, + "learning_rate": 0.00011824001423676513, + "loss": 1.1055, + "step": 9815 + }, + { + "epoch": 0.94, + "grad_norm": 0.2705336512672914, + "learning_rate": 0.00011822445989962969, + "loss": 1.1277, + "step": 9816 + }, + { + "epoch": 0.94, + "grad_norm": 0.29969456459847577, + "learning_rate": 0.00011820890510641567, + "loss": 1.0124, + "step": 9817 + }, + { + "epoch": 0.94, + "grad_norm": 0.30298340488413317, + "learning_rate": 0.00011819334985751233, + "loss": 1.0302, + "step": 9818 + }, + { + "epoch": 0.94, + "grad_norm": 0.27778928676842507, + "learning_rate": 0.00011817779415330901, + "loss": 1.0885, + "step": 9819 + }, + { + "epoch": 0.94, + "grad_norm": 0.300820511952422, + "learning_rate": 0.00011816223799419497, + "loss": 1.0825, + "step": 9820 + }, + { + "epoch": 0.94, + "grad_norm": 0.3444116599675863, + "learning_rate": 0.00011814668138055947, + "loss": 1.098, + "step": 9821 + }, + { + "epoch": 0.94, + "grad_norm": 0.27136608383916344, + "learning_rate": 0.0001181311243127919, + "loss": 1.1083, + "step": 9822 + }, + { + "epoch": 0.94, + "grad_norm": 0.29143092481382066, + "learning_rate": 0.00011811556679128153, + "loss": 1.0637, + "step": 9823 + }, + { + "epoch": 0.94, + "grad_norm": 0.25973448401962235, + "learning_rate": 0.00011810000881641771, + "loss": 1.0963, + "step": 9824 + }, + { + "epoch": 0.94, + "grad_norm": 0.2774775599714097, + "learning_rate": 0.00011808445038858982, + "loss": 1.0984, + "step": 9825 + }, + { + "epoch": 0.94, + "grad_norm": 0.263532587397457, + "learning_rate": 0.00011806889150818716, + "loss": 0.9681, + "step": 9826 + }, + { + "epoch": 0.94, + "grad_norm": 0.3141288079300534, + "learning_rate": 0.00011805333217559918, + "loss": 1.0527, + "step": 9827 + }, + { + "epoch": 0.94, + "grad_norm": 0.31147150343587526, + "learning_rate": 0.00011803777239121516, + "loss": 1.0198, + "step": 9828 + }, + { + "epoch": 0.94, + "grad_norm": 0.2732581866158368, + "learning_rate": 0.00011802221215542459, + "loss": 1.0903, + "step": 9829 + }, + { + "epoch": 0.94, + "grad_norm": 0.28505766654611703, + "learning_rate": 0.00011800665146861683, + "loss": 1.0171, + "step": 9830 + }, + { + "epoch": 0.94, + "grad_norm": 0.28825531298858104, + "learning_rate": 0.00011799109033118127, + "loss": 1.1042, + "step": 9831 + }, + { + "epoch": 0.94, + "grad_norm": 0.28681592008334156, + "learning_rate": 0.00011797552874350739, + "loss": 1.0476, + "step": 9832 + }, + { + "epoch": 0.94, + "grad_norm": 0.3123668239526817, + "learning_rate": 0.00011795996670598462, + "loss": 1.0743, + "step": 9833 + }, + { + "epoch": 0.94, + "grad_norm": 0.2418704079745117, + "learning_rate": 0.0001179444042190024, + "loss": 1.0915, + "step": 9834 + }, + { + "epoch": 0.94, + "grad_norm": 0.3233001919313393, + "learning_rate": 0.00011792884128295014, + "loss": 1.1112, + "step": 9835 + }, + { + "epoch": 0.94, + "grad_norm": 0.23543282622618317, + "learning_rate": 0.0001179132778982174, + "loss": 0.9408, + "step": 9836 + }, + { + "epoch": 0.94, + "grad_norm": 0.27953787864369156, + "learning_rate": 0.00011789771406519361, + "loss": 1.0951, + "step": 9837 + }, + { + "epoch": 0.94, + "grad_norm": 0.26888014225579154, + "learning_rate": 0.00011788214978426827, + "loss": 1.0421, + "step": 9838 + }, + { + "epoch": 0.94, + "grad_norm": 0.28757566533635587, + "learning_rate": 0.0001178665850558309, + "loss": 1.1287, + "step": 9839 + }, + { + "epoch": 0.94, + "grad_norm": 0.3053049659715436, + "learning_rate": 0.00011785101988027103, + "loss": 1.0709, + "step": 9840 + }, + { + "epoch": 0.94, + "grad_norm": 0.2693832593847435, + "learning_rate": 0.00011783545425797813, + "loss": 1.178, + "step": 9841 + }, + { + "epoch": 0.94, + "grad_norm": 0.29150202296705147, + "learning_rate": 0.0001178198881893418, + "loss": 1.1276, + "step": 9842 + }, + { + "epoch": 0.94, + "grad_norm": 0.28026413298185754, + "learning_rate": 0.00011780432167475157, + "loss": 1.1026, + "step": 9843 + }, + { + "epoch": 0.94, + "grad_norm": 0.2883237105659363, + "learning_rate": 0.00011778875471459703, + "loss": 1.029, + "step": 9844 + }, + { + "epoch": 0.94, + "grad_norm": 0.2882413609766784, + "learning_rate": 0.00011777318730926768, + "loss": 1.1348, + "step": 9845 + }, + { + "epoch": 0.94, + "grad_norm": 0.3024147348726725, + "learning_rate": 0.00011775761945915315, + "loss": 1.1326, + "step": 9846 + }, + { + "epoch": 0.94, + "grad_norm": 0.27230068738152696, + "learning_rate": 0.00011774205116464304, + "loss": 1.1274, + "step": 9847 + }, + { + "epoch": 0.94, + "grad_norm": 0.2671829447086581, + "learning_rate": 0.00011772648242612694, + "loss": 0.999, + "step": 9848 + }, + { + "epoch": 0.94, + "grad_norm": 0.27183496557565345, + "learning_rate": 0.00011771091324399447, + "loss": 0.9818, + "step": 9849 + }, + { + "epoch": 0.94, + "grad_norm": 0.30565783107867467, + "learning_rate": 0.0001176953436186353, + "loss": 1.0334, + "step": 9850 + }, + { + "epoch": 0.94, + "grad_norm": 0.2672952617830373, + "learning_rate": 0.00011767977355043902, + "loss": 1.014, + "step": 9851 + }, + { + "epoch": 0.94, + "grad_norm": 0.25977834030458224, + "learning_rate": 0.00011766420303979528, + "loss": 1.0847, + "step": 9852 + }, + { + "epoch": 0.94, + "grad_norm": 0.29226617360753704, + "learning_rate": 0.00011764863208709378, + "loss": 1.1198, + "step": 9853 + }, + { + "epoch": 0.94, + "grad_norm": 0.2710445438667288, + "learning_rate": 0.00011763306069272415, + "loss": 1.1502, + "step": 9854 + }, + { + "epoch": 0.94, + "grad_norm": 0.2683767291753003, + "learning_rate": 0.00011761748885707611, + "loss": 1.0218, + "step": 9855 + }, + { + "epoch": 0.94, + "grad_norm": 0.30996151508397074, + "learning_rate": 0.00011760191658053933, + "loss": 1.0196, + "step": 9856 + }, + { + "epoch": 0.94, + "grad_norm": 0.2836262926909511, + "learning_rate": 0.00011758634386350353, + "loss": 0.9859, + "step": 9857 + }, + { + "epoch": 0.94, + "grad_norm": 0.2835130303902405, + "learning_rate": 0.00011757077070635842, + "loss": 0.9518, + "step": 9858 + }, + { + "epoch": 0.94, + "grad_norm": 0.29179997921864015, + "learning_rate": 0.00011755519710949375, + "loss": 1.0851, + "step": 9859 + }, + { + "epoch": 0.94, + "grad_norm": 0.2580784420137259, + "learning_rate": 0.0001175396230732992, + "loss": 1.0792, + "step": 9860 + }, + { + "epoch": 0.94, + "grad_norm": 0.2980701381253143, + "learning_rate": 0.00011752404859816459, + "loss": 1.1523, + "step": 9861 + }, + { + "epoch": 0.94, + "grad_norm": 0.30650598237537346, + "learning_rate": 0.00011750847368447963, + "loss": 1.0509, + "step": 9862 + }, + { + "epoch": 0.94, + "grad_norm": 0.2829225311103543, + "learning_rate": 0.00011749289833263413, + "loss": 1.0698, + "step": 9863 + }, + { + "epoch": 0.94, + "grad_norm": 0.3150147271464283, + "learning_rate": 0.00011747732254301786, + "loss": 1.0618, + "step": 9864 + }, + { + "epoch": 0.94, + "grad_norm": 0.26617589803804353, + "learning_rate": 0.00011746174631602059, + "loss": 0.9886, + "step": 9865 + }, + { + "epoch": 0.94, + "grad_norm": 0.28821178558985444, + "learning_rate": 0.00011744616965203214, + "loss": 0.9826, + "step": 9866 + }, + { + "epoch": 0.94, + "grad_norm": 0.264781657352801, + "learning_rate": 0.00011743059255144233, + "loss": 0.8746, + "step": 9867 + }, + { + "epoch": 0.94, + "grad_norm": 0.29886418377952234, + "learning_rate": 0.000117415015014641, + "loss": 1.0563, + "step": 9868 + }, + { + "epoch": 0.94, + "grad_norm": 0.3002368289446441, + "learning_rate": 0.00011739943704201796, + "loss": 1.0341, + "step": 9869 + }, + { + "epoch": 0.94, + "grad_norm": 0.28512524718120663, + "learning_rate": 0.00011738385863396311, + "loss": 0.9591, + "step": 9870 + }, + { + "epoch": 0.94, + "grad_norm": 0.2900458256329998, + "learning_rate": 0.00011736827979086625, + "loss": 1.0893, + "step": 9871 + }, + { + "epoch": 0.94, + "grad_norm": 0.28440813574018564, + "learning_rate": 0.00011735270051311724, + "loss": 1.052, + "step": 9872 + }, + { + "epoch": 0.94, + "grad_norm": 0.2469565537916454, + "learning_rate": 0.00011733712080110603, + "loss": 0.857, + "step": 9873 + }, + { + "epoch": 0.94, + "grad_norm": 0.2501421742546667, + "learning_rate": 0.00011732154065522247, + "loss": 1.1517, + "step": 9874 + }, + { + "epoch": 0.94, + "grad_norm": 0.2943711294105304, + "learning_rate": 0.00011730596007585646, + "loss": 1.0131, + "step": 9875 + }, + { + "epoch": 0.94, + "grad_norm": 0.2759019579633036, + "learning_rate": 0.00011729037906339795, + "loss": 1.0827, + "step": 9876 + }, + { + "epoch": 0.94, + "grad_norm": 0.2734166501679395, + "learning_rate": 0.00011727479761823683, + "loss": 1.017, + "step": 9877 + }, + { + "epoch": 0.95, + "grad_norm": 0.2945759953073775, + "learning_rate": 0.00011725921574076305, + "loss": 1.048, + "step": 9878 + }, + { + "epoch": 0.95, + "grad_norm": 0.2847423179857022, + "learning_rate": 0.00011724363343136651, + "loss": 0.9387, + "step": 9879 + }, + { + "epoch": 0.95, + "grad_norm": 0.2731998133242859, + "learning_rate": 0.00011722805069043724, + "loss": 1.1269, + "step": 9880 + }, + { + "epoch": 0.95, + "grad_norm": 0.31097755473044353, + "learning_rate": 0.00011721246751836514, + "loss": 1.1183, + "step": 9881 + }, + { + "epoch": 0.95, + "grad_norm": 0.24077282326706342, + "learning_rate": 0.00011719688391554024, + "loss": 1.1315, + "step": 9882 + }, + { + "epoch": 0.95, + "grad_norm": 0.28290762006495423, + "learning_rate": 0.00011718129988235251, + "loss": 1.0301, + "step": 9883 + }, + { + "epoch": 0.95, + "grad_norm": 0.28799605220582924, + "learning_rate": 0.00011716571541919197, + "loss": 1.1275, + "step": 9884 + }, + { + "epoch": 0.95, + "grad_norm": 0.2506151300260542, + "learning_rate": 0.00011715013052644859, + "loss": 1.0488, + "step": 9885 + }, + { + "epoch": 0.95, + "grad_norm": 0.3003785975679057, + "learning_rate": 0.00011713454520451243, + "loss": 1.1445, + "step": 9886 + }, + { + "epoch": 0.95, + "grad_norm": 0.2871681991986448, + "learning_rate": 0.00011711895945377351, + "loss": 1.1341, + "step": 9887 + }, + { + "epoch": 0.95, + "grad_norm": 0.263469877916956, + "learning_rate": 0.00011710337327462186, + "loss": 1.0684, + "step": 9888 + }, + { + "epoch": 0.95, + "grad_norm": 0.25031317343613296, + "learning_rate": 0.00011708778666744756, + "loss": 1.0692, + "step": 9889 + }, + { + "epoch": 0.95, + "grad_norm": 0.27978745157205426, + "learning_rate": 0.00011707219963264063, + "loss": 0.9031, + "step": 9890 + }, + { + "epoch": 0.95, + "grad_norm": 0.2771417917801048, + "learning_rate": 0.00011705661217059121, + "loss": 1.0672, + "step": 9891 + }, + { + "epoch": 0.95, + "grad_norm": 0.27046659772554993, + "learning_rate": 0.00011704102428168931, + "loss": 1.1261, + "step": 9892 + }, + { + "epoch": 0.95, + "grad_norm": 0.2627203026681411, + "learning_rate": 0.00011702543596632512, + "loss": 1.0424, + "step": 9893 + }, + { + "epoch": 0.95, + "grad_norm": 0.24842638162917163, + "learning_rate": 0.00011700984722488865, + "loss": 1.0639, + "step": 9894 + }, + { + "epoch": 0.95, + "grad_norm": 0.297596295779575, + "learning_rate": 0.00011699425805777008, + "loss": 1.1003, + "step": 9895 + }, + { + "epoch": 0.95, + "grad_norm": 0.27759923733902636, + "learning_rate": 0.00011697866846535953, + "loss": 1.1192, + "step": 9896 + }, + { + "epoch": 0.95, + "grad_norm": 0.30285981266810275, + "learning_rate": 0.00011696307844804713, + "loss": 1.1918, + "step": 9897 + }, + { + "epoch": 0.95, + "grad_norm": 0.31450365765502614, + "learning_rate": 0.00011694748800622301, + "loss": 1.0818, + "step": 9898 + }, + { + "epoch": 0.95, + "grad_norm": 0.3141057225598429, + "learning_rate": 0.00011693189714027737, + "loss": 0.9055, + "step": 9899 + }, + { + "epoch": 0.95, + "grad_norm": 0.2718320654630044, + "learning_rate": 0.00011691630585060036, + "loss": 1.182, + "step": 9900 + }, + { + "epoch": 0.95, + "grad_norm": 0.2642413261096164, + "learning_rate": 0.00011690071413758217, + "loss": 1.0622, + "step": 9901 + }, + { + "epoch": 0.95, + "grad_norm": 0.2781872909054998, + "learning_rate": 0.00011688512200161297, + "loss": 1.0917, + "step": 9902 + }, + { + "epoch": 0.95, + "grad_norm": 0.2933665399467418, + "learning_rate": 0.00011686952944308298, + "loss": 1.0172, + "step": 9903 + }, + { + "epoch": 0.95, + "grad_norm": 0.2986798343224596, + "learning_rate": 0.00011685393646238243, + "loss": 1.0515, + "step": 9904 + }, + { + "epoch": 0.95, + "grad_norm": 0.27008347335950034, + "learning_rate": 0.00011683834305990154, + "loss": 1.0519, + "step": 9905 + }, + { + "epoch": 0.95, + "grad_norm": 0.27146126457125797, + "learning_rate": 0.00011682274923603049, + "loss": 1.0363, + "step": 9906 + }, + { + "epoch": 0.95, + "grad_norm": 0.24730678500742276, + "learning_rate": 0.00011680715499115959, + "loss": 1.0349, + "step": 9907 + }, + { + "epoch": 0.95, + "grad_norm": 0.274923895199804, + "learning_rate": 0.00011679156032567911, + "loss": 1.0628, + "step": 9908 + }, + { + "epoch": 0.95, + "grad_norm": 0.28568564007828456, + "learning_rate": 0.00011677596523997922, + "loss": 1.0989, + "step": 9909 + }, + { + "epoch": 0.95, + "grad_norm": 0.26786733880651786, + "learning_rate": 0.00011676036973445028, + "loss": 0.9306, + "step": 9910 + }, + { + "epoch": 0.95, + "grad_norm": 0.30754840140433903, + "learning_rate": 0.00011674477380948255, + "loss": 1.0696, + "step": 9911 + }, + { + "epoch": 0.95, + "grad_norm": 0.25852789773425133, + "learning_rate": 0.00011672917746546634, + "loss": 0.8838, + "step": 9912 + }, + { + "epoch": 0.95, + "grad_norm": 0.2558247705235392, + "learning_rate": 0.00011671358070279193, + "loss": 0.99, + "step": 9913 + }, + { + "epoch": 0.95, + "grad_norm": 0.29138329913400457, + "learning_rate": 0.00011669798352184968, + "loss": 0.9954, + "step": 9914 + }, + { + "epoch": 0.95, + "grad_norm": 0.30260586220267804, + "learning_rate": 0.0001166823859230299, + "loss": 1.0312, + "step": 9915 + }, + { + "epoch": 0.95, + "grad_norm": 0.2827165548781267, + "learning_rate": 0.0001166667879067229, + "loss": 1.131, + "step": 9916 + }, + { + "epoch": 0.95, + "grad_norm": 0.29392517958616593, + "learning_rate": 0.0001166511894733191, + "loss": 1.0896, + "step": 9917 + }, + { + "epoch": 0.95, + "grad_norm": 0.3260991950351522, + "learning_rate": 0.00011663559062320878, + "loss": 1.0871, + "step": 9918 + }, + { + "epoch": 0.95, + "grad_norm": 0.2561723519059954, + "learning_rate": 0.00011661999135678237, + "loss": 1.1483, + "step": 9919 + }, + { + "epoch": 0.95, + "grad_norm": 0.309722846552505, + "learning_rate": 0.00011660439167443022, + "loss": 1.0889, + "step": 9920 + }, + { + "epoch": 0.95, + "grad_norm": 0.25190002272893053, + "learning_rate": 0.00011658879157654276, + "loss": 1.0626, + "step": 9921 + }, + { + "epoch": 0.95, + "grad_norm": 0.3259479008437906, + "learning_rate": 0.00011657319106351035, + "loss": 1.0441, + "step": 9922 + }, + { + "epoch": 0.95, + "grad_norm": 0.26095427143336863, + "learning_rate": 0.0001165575901357234, + "loss": 1.108, + "step": 9923 + }, + { + "epoch": 0.95, + "grad_norm": 0.3334009293399356, + "learning_rate": 0.00011654198879357236, + "loss": 0.9941, + "step": 9924 + }, + { + "epoch": 0.95, + "grad_norm": 0.2479075214464415, + "learning_rate": 0.00011652638703744769, + "loss": 0.9997, + "step": 9925 + }, + { + "epoch": 0.95, + "grad_norm": 0.2699529303379684, + "learning_rate": 0.00011651078486773974, + "loss": 1.0008, + "step": 9926 + }, + { + "epoch": 0.95, + "grad_norm": 0.2793875562565832, + "learning_rate": 0.00011649518228483907, + "loss": 1.1141, + "step": 9927 + }, + { + "epoch": 0.95, + "grad_norm": 0.30640720124070897, + "learning_rate": 0.00011647957928913606, + "loss": 1.088, + "step": 9928 + }, + { + "epoch": 0.95, + "grad_norm": 0.29450119862900453, + "learning_rate": 0.00011646397588102123, + "loss": 1.0208, + "step": 9929 + }, + { + "epoch": 0.95, + "grad_norm": 0.2986882197988017, + "learning_rate": 0.00011644837206088508, + "loss": 1.1405, + "step": 9930 + }, + { + "epoch": 0.95, + "grad_norm": 0.28289114316471997, + "learning_rate": 0.00011643276782911805, + "loss": 1.0624, + "step": 9931 + }, + { + "epoch": 0.95, + "grad_norm": 0.25290685088339804, + "learning_rate": 0.0001164171631861107, + "loss": 0.9819, + "step": 9932 + }, + { + "epoch": 0.95, + "grad_norm": 0.29613853648276806, + "learning_rate": 0.00011640155813225348, + "loss": 0.9768, + "step": 9933 + }, + { + "epoch": 0.95, + "grad_norm": 0.30470164680746314, + "learning_rate": 0.00011638595266793701, + "loss": 1.0121, + "step": 9934 + }, + { + "epoch": 0.95, + "grad_norm": 0.26617140467629113, + "learning_rate": 0.00011637034679355176, + "loss": 1.0911, + "step": 9935 + }, + { + "epoch": 0.95, + "grad_norm": 0.29756347865505667, + "learning_rate": 0.00011635474050948829, + "loss": 1.0299, + "step": 9936 + }, + { + "epoch": 0.95, + "grad_norm": 0.2496000722355499, + "learning_rate": 0.00011633913381613717, + "loss": 0.9958, + "step": 9937 + }, + { + "epoch": 0.95, + "grad_norm": 0.28395690278817887, + "learning_rate": 0.00011632352671388898, + "loss": 1.0131, + "step": 9938 + }, + { + "epoch": 0.95, + "grad_norm": 0.2922841640955166, + "learning_rate": 0.00011630791920313425, + "loss": 1.0615, + "step": 9939 + }, + { + "epoch": 0.95, + "grad_norm": 0.3033319129166104, + "learning_rate": 0.00011629231128426356, + "loss": 1.1185, + "step": 9940 + }, + { + "epoch": 0.95, + "grad_norm": 0.2696093697556044, + "learning_rate": 0.00011627670295766759, + "loss": 0.9108, + "step": 9941 + }, + { + "epoch": 0.95, + "grad_norm": 0.275903786765533, + "learning_rate": 0.00011626109422373688, + "loss": 0.9584, + "step": 9942 + }, + { + "epoch": 0.95, + "grad_norm": 0.2593587234240088, + "learning_rate": 0.00011624548508286206, + "loss": 1.0215, + "step": 9943 + }, + { + "epoch": 0.95, + "grad_norm": 0.25466792159457646, + "learning_rate": 0.00011622987553543376, + "loss": 1.0309, + "step": 9944 + }, + { + "epoch": 0.95, + "grad_norm": 0.28160232777966615, + "learning_rate": 0.00011621426558184265, + "loss": 1.036, + "step": 9945 + }, + { + "epoch": 0.95, + "grad_norm": 0.2665942842206432, + "learning_rate": 0.00011619865522247933, + "loss": 1.0767, + "step": 9946 + }, + { + "epoch": 0.95, + "grad_norm": 0.29079511088912224, + "learning_rate": 0.00011618304445773451, + "loss": 1.0276, + "step": 9947 + }, + { + "epoch": 0.95, + "grad_norm": 0.2803873259698023, + "learning_rate": 0.00011616743328799881, + "loss": 1.0198, + "step": 9948 + }, + { + "epoch": 0.95, + "grad_norm": 0.2555819038764869, + "learning_rate": 0.00011615182171366297, + "loss": 1.0222, + "step": 9949 + }, + { + "epoch": 0.95, + "grad_norm": 0.31734739800334083, + "learning_rate": 0.00011613620973511758, + "loss": 1.1547, + "step": 9950 + }, + { + "epoch": 0.95, + "grad_norm": 0.28917459887517855, + "learning_rate": 0.00011612059735275342, + "loss": 1.1102, + "step": 9951 + }, + { + "epoch": 0.95, + "grad_norm": 0.24841835752111638, + "learning_rate": 0.00011610498456696119, + "loss": 1.1191, + "step": 9952 + }, + { + "epoch": 0.95, + "grad_norm": 0.29654024864058454, + "learning_rate": 0.00011608937137813161, + "loss": 0.9583, + "step": 9953 + }, + { + "epoch": 0.95, + "grad_norm": 0.3147918475594126, + "learning_rate": 0.00011607375778665536, + "loss": 1.1377, + "step": 9954 + }, + { + "epoch": 0.95, + "grad_norm": 0.2772820949372226, + "learning_rate": 0.00011605814379292325, + "loss": 1.0474, + "step": 9955 + }, + { + "epoch": 0.95, + "grad_norm": 0.30109169932300395, + "learning_rate": 0.00011604252939732601, + "loss": 1.0939, + "step": 9956 + }, + { + "epoch": 0.95, + "grad_norm": 0.2868717018893174, + "learning_rate": 0.00011602691460025437, + "loss": 0.9908, + "step": 9957 + }, + { + "epoch": 0.95, + "grad_norm": 0.2675493227671909, + "learning_rate": 0.00011601129940209911, + "loss": 1.0693, + "step": 9958 + }, + { + "epoch": 0.95, + "grad_norm": 0.31375505859098296, + "learning_rate": 0.00011599568380325106, + "loss": 1.1187, + "step": 9959 + }, + { + "epoch": 0.95, + "grad_norm": 0.295148232115398, + "learning_rate": 0.00011598006780410091, + "loss": 1.0427, + "step": 9960 + }, + { + "epoch": 0.95, + "grad_norm": 0.28936388619727343, + "learning_rate": 0.00011596445140503957, + "loss": 0.9698, + "step": 9961 + }, + { + "epoch": 0.95, + "grad_norm": 0.28216120171352943, + "learning_rate": 0.0001159488346064578, + "loss": 1.0421, + "step": 9962 + }, + { + "epoch": 0.95, + "grad_norm": 0.28268688737805364, + "learning_rate": 0.00011593321740874639, + "loss": 1.0307, + "step": 9963 + }, + { + "epoch": 0.95, + "grad_norm": 0.27612823655418073, + "learning_rate": 0.00011591759981229622, + "loss": 0.9663, + "step": 9964 + }, + { + "epoch": 0.95, + "grad_norm": 0.2740428689355272, + "learning_rate": 0.00011590198181749811, + "loss": 1.0961, + "step": 9965 + }, + { + "epoch": 0.95, + "grad_norm": 0.2499724504263894, + "learning_rate": 0.0001158863634247429, + "loss": 1.0651, + "step": 9966 + }, + { + "epoch": 0.95, + "grad_norm": 0.2757709129720417, + "learning_rate": 0.00011587074463442147, + "loss": 1.0643, + "step": 9967 + }, + { + "epoch": 0.95, + "grad_norm": 0.2985207172650015, + "learning_rate": 0.00011585512544692467, + "loss": 1.1078, + "step": 9968 + }, + { + "epoch": 0.95, + "grad_norm": 0.3045737397007342, + "learning_rate": 0.00011583950586264343, + "loss": 1.1594, + "step": 9969 + }, + { + "epoch": 0.95, + "grad_norm": 0.27637988509985273, + "learning_rate": 0.00011582388588196855, + "loss": 1.0231, + "step": 9970 + }, + { + "epoch": 0.95, + "grad_norm": 0.2821414694181937, + "learning_rate": 0.000115808265505291, + "loss": 1.1147, + "step": 9971 + }, + { + "epoch": 0.95, + "grad_norm": 0.29155806162009856, + "learning_rate": 0.00011579264473300167, + "loss": 1.0431, + "step": 9972 + }, + { + "epoch": 0.95, + "grad_norm": 0.27636257246936596, + "learning_rate": 0.00011577702356549149, + "loss": 1.0565, + "step": 9973 + }, + { + "epoch": 0.95, + "grad_norm": 0.31102867461907796, + "learning_rate": 0.00011576140200315135, + "loss": 1.1162, + "step": 9974 + }, + { + "epoch": 0.95, + "grad_norm": 0.28686875083828667, + "learning_rate": 0.00011574578004637226, + "loss": 0.9938, + "step": 9975 + }, + { + "epoch": 0.95, + "grad_norm": 0.2829922774553789, + "learning_rate": 0.00011573015769554512, + "loss": 1.043, + "step": 9976 + }, + { + "epoch": 0.95, + "grad_norm": 0.30378521566991484, + "learning_rate": 0.00011571453495106086, + "loss": 1.125, + "step": 9977 + }, + { + "epoch": 0.95, + "grad_norm": 0.2882399502995974, + "learning_rate": 0.00011569891181331054, + "loss": 1.0316, + "step": 9978 + }, + { + "epoch": 0.95, + "grad_norm": 0.2546929646781789, + "learning_rate": 0.00011568328828268506, + "loss": 1.0347, + "step": 9979 + }, + { + "epoch": 0.95, + "grad_norm": 0.27885757954603946, + "learning_rate": 0.00011566766435957541, + "loss": 1.0919, + "step": 9980 + }, + { + "epoch": 0.95, + "grad_norm": 0.26559169992251663, + "learning_rate": 0.00011565204004437267, + "loss": 1.0969, + "step": 9981 + }, + { + "epoch": 0.95, + "grad_norm": 0.33065813751668216, + "learning_rate": 0.00011563641533746774, + "loss": 1.0411, + "step": 9982 + }, + { + "epoch": 0.96, + "grad_norm": 0.33089614226385505, + "learning_rate": 0.00011562079023925172, + "loss": 1.0129, + "step": 9983 + }, + { + "epoch": 0.96, + "grad_norm": 0.29760832729104647, + "learning_rate": 0.00011560516475011558, + "loss": 1.0865, + "step": 9984 + }, + { + "epoch": 0.96, + "grad_norm": 0.2982860380062189, + "learning_rate": 0.00011558953887045041, + "loss": 1.0879, + "step": 9985 + }, + { + "epoch": 0.96, + "grad_norm": 0.3089962049085526, + "learning_rate": 0.00011557391260064723, + "loss": 1.062, + "step": 9986 + }, + { + "epoch": 0.96, + "grad_norm": 0.2907036025220388, + "learning_rate": 0.00011555828594109707, + "loss": 1.0863, + "step": 9987 + }, + { + "epoch": 0.96, + "grad_norm": 0.31013815730431993, + "learning_rate": 0.00011554265889219106, + "loss": 1.0049, + "step": 9988 + }, + { + "epoch": 0.96, + "grad_norm": 0.29883777425014585, + "learning_rate": 0.00011552703145432025, + "loss": 1.0613, + "step": 9989 + }, + { + "epoch": 0.96, + "grad_norm": 0.29975907234168436, + "learning_rate": 0.0001155114036278757, + "loss": 1.0005, + "step": 9990 + }, + { + "epoch": 0.96, + "grad_norm": 0.30684771167447394, + "learning_rate": 0.0001154957754132485, + "loss": 1.1158, + "step": 9991 + }, + { + "epoch": 0.96, + "grad_norm": 0.26335079161415853, + "learning_rate": 0.00011548014681082981, + "loss": 1.021, + "step": 9992 + }, + { + "epoch": 0.96, + "grad_norm": 0.27648270125908575, + "learning_rate": 0.00011546451782101071, + "loss": 1.1223, + "step": 9993 + }, + { + "epoch": 0.96, + "grad_norm": 0.3475752040212275, + "learning_rate": 0.00011544888844418233, + "loss": 1.1233, + "step": 9994 + }, + { + "epoch": 0.96, + "grad_norm": 0.30111765274598085, + "learning_rate": 0.0001154332586807358, + "loss": 1.0963, + "step": 9995 + }, + { + "epoch": 0.96, + "grad_norm": 0.2970605582481724, + "learning_rate": 0.0001154176285310623, + "loss": 0.99, + "step": 9996 + }, + { + "epoch": 0.96, + "grad_norm": 0.305235597238056, + "learning_rate": 0.00011540199799555294, + "loss": 1.132, + "step": 9997 + }, + { + "epoch": 0.96, + "grad_norm": 0.2670713296870493, + "learning_rate": 0.00011538636707459889, + "loss": 1.0476, + "step": 9998 + }, + { + "epoch": 0.96, + "grad_norm": 0.2677367396231706, + "learning_rate": 0.00011537073576859136, + "loss": 1.069, + "step": 9999 + }, + { + "epoch": 0.96, + "grad_norm": 0.2525746570305202, + "learning_rate": 0.00011535510407792149, + "loss": 0.9385, + "step": 10000 + }, + { + "epoch": 0.96, + "grad_norm": 0.2758414503350628, + "learning_rate": 0.0001153394720029805, + "loss": 1.0631, + "step": 10001 + }, + { + "epoch": 0.96, + "grad_norm": 0.3047038095467878, + "learning_rate": 0.00011532383954415957, + "loss": 1.0311, + "step": 10002 + }, + { + "epoch": 0.96, + "grad_norm": 0.28111478399862794, + "learning_rate": 0.00011530820670184995, + "loss": 1.0573, + "step": 10003 + }, + { + "epoch": 0.96, + "grad_norm": 0.28036170451809095, + "learning_rate": 0.0001152925734764428, + "loss": 1.1742, + "step": 10004 + }, + { + "epoch": 0.96, + "grad_norm": 0.2963156258676252, + "learning_rate": 0.00011527693986832942, + "loss": 1.016, + "step": 10005 + }, + { + "epoch": 0.96, + "grad_norm": 0.2815524009206448, + "learning_rate": 0.000115261305877901, + "loss": 1.0306, + "step": 10006 + }, + { + "epoch": 0.96, + "grad_norm": 0.29474398852141115, + "learning_rate": 0.00011524567150554881, + "loss": 1.0829, + "step": 10007 + }, + { + "epoch": 0.96, + "grad_norm": 0.29358155011217263, + "learning_rate": 0.00011523003675166411, + "loss": 1.0714, + "step": 10008 + }, + { + "epoch": 0.96, + "grad_norm": 0.2844867269668011, + "learning_rate": 0.00011521440161663819, + "loss": 1.0348, + "step": 10009 + }, + { + "epoch": 0.96, + "grad_norm": 0.255280580277318, + "learning_rate": 0.00011519876610086229, + "loss": 1.1012, + "step": 10010 + }, + { + "epoch": 0.96, + "grad_norm": 0.31413925882198807, + "learning_rate": 0.00011518313020472768, + "loss": 1.0744, + "step": 10011 + }, + { + "epoch": 0.96, + "grad_norm": 0.31994016396189484, + "learning_rate": 0.00011516749392862576, + "loss": 0.9883, + "step": 10012 + }, + { + "epoch": 0.96, + "grad_norm": 0.27390552772942023, + "learning_rate": 0.00011515185727294771, + "loss": 0.9983, + "step": 10013 + }, + { + "epoch": 0.96, + "grad_norm": 0.2853534170678965, + "learning_rate": 0.00011513622023808495, + "loss": 1.0904, + "step": 10014 + }, + { + "epoch": 0.96, + "grad_norm": 0.25465697705481827, + "learning_rate": 0.00011512058282442874, + "loss": 0.922, + "step": 10015 + }, + { + "epoch": 0.96, + "grad_norm": 0.26802643575831, + "learning_rate": 0.00011510494503237046, + "loss": 1.0313, + "step": 10016 + }, + { + "epoch": 0.96, + "grad_norm": 0.3193729552335535, + "learning_rate": 0.00011508930686230146, + "loss": 0.9854, + "step": 10017 + }, + { + "epoch": 0.96, + "grad_norm": 0.2879367749732901, + "learning_rate": 0.00011507366831461302, + "loss": 1.1051, + "step": 10018 + }, + { + "epoch": 0.96, + "grad_norm": 0.30097969407853326, + "learning_rate": 0.0001150580293896966, + "loss": 1.0425, + "step": 10019 + }, + { + "epoch": 0.96, + "grad_norm": 0.30624544517915264, + "learning_rate": 0.0001150423900879435, + "loss": 1.0311, + "step": 10020 + }, + { + "epoch": 0.96, + "grad_norm": 0.2767941099814116, + "learning_rate": 0.00011502675040974516, + "loss": 0.9427, + "step": 10021 + }, + { + "epoch": 0.96, + "grad_norm": 0.28490801229190077, + "learning_rate": 0.00011501111035549295, + "loss": 1.1947, + "step": 10022 + }, + { + "epoch": 0.96, + "grad_norm": 0.2784277522206461, + "learning_rate": 0.00011499546992557826, + "loss": 0.9624, + "step": 10023 + }, + { + "epoch": 0.96, + "grad_norm": 0.2666766109775799, + "learning_rate": 0.00011497982912039249, + "loss": 0.9757, + "step": 10024 + }, + { + "epoch": 0.96, + "grad_norm": 0.32891260560151236, + "learning_rate": 0.00011496418794032711, + "loss": 1.1256, + "step": 10025 + }, + { + "epoch": 0.96, + "grad_norm": 0.3086769971151652, + "learning_rate": 0.0001149485463857735, + "loss": 0.9429, + "step": 10026 + }, + { + "epoch": 0.96, + "grad_norm": 0.25897740279910364, + "learning_rate": 0.00011493290445712315, + "loss": 0.8941, + "step": 10027 + }, + { + "epoch": 0.96, + "grad_norm": 0.27367523231686985, + "learning_rate": 0.00011491726215476746, + "loss": 0.961, + "step": 10028 + }, + { + "epoch": 0.96, + "grad_norm": 0.2902717009917645, + "learning_rate": 0.0001149016194790979, + "loss": 1.0828, + "step": 10029 + }, + { + "epoch": 0.96, + "grad_norm": 0.2767681614076228, + "learning_rate": 0.00011488597643050598, + "loss": 1.1453, + "step": 10030 + }, + { + "epoch": 0.96, + "grad_norm": 0.2857144403267616, + "learning_rate": 0.0001148703330093831, + "loss": 1.1737, + "step": 10031 + }, + { + "epoch": 0.96, + "grad_norm": 0.28150370452068746, + "learning_rate": 0.00011485468921612084, + "loss": 1.1734, + "step": 10032 + }, + { + "epoch": 0.96, + "grad_norm": 0.3290209532225155, + "learning_rate": 0.00011483904505111063, + "loss": 1.1331, + "step": 10033 + }, + { + "epoch": 0.96, + "grad_norm": 0.3138396953516879, + "learning_rate": 0.00011482340051474396, + "loss": 1.0148, + "step": 10034 + }, + { + "epoch": 0.96, + "grad_norm": 0.30110096644908, + "learning_rate": 0.00011480775560741239, + "loss": 1.0134, + "step": 10035 + }, + { + "epoch": 0.96, + "grad_norm": 0.3203655127160484, + "learning_rate": 0.00011479211032950743, + "loss": 1.0475, + "step": 10036 + }, + { + "epoch": 0.96, + "grad_norm": 0.2772142324547969, + "learning_rate": 0.00011477646468142062, + "loss": 1.0685, + "step": 10037 + }, + { + "epoch": 0.96, + "grad_norm": 0.2917042525821615, + "learning_rate": 0.0001147608186635435, + "loss": 1.1421, + "step": 10038 + }, + { + "epoch": 0.96, + "grad_norm": 0.26698256397886955, + "learning_rate": 0.00011474517227626762, + "loss": 1.1066, + "step": 10039 + }, + { + "epoch": 0.96, + "grad_norm": 0.2675194389510324, + "learning_rate": 0.00011472952551998452, + "loss": 0.9672, + "step": 10040 + }, + { + "epoch": 0.96, + "grad_norm": 0.2925787442554594, + "learning_rate": 0.0001147138783950858, + "loss": 1.0619, + "step": 10041 + }, + { + "epoch": 0.96, + "grad_norm": 0.2925652633706256, + "learning_rate": 0.00011469823090196303, + "loss": 1.067, + "step": 10042 + }, + { + "epoch": 0.96, + "grad_norm": 0.26146199790712404, + "learning_rate": 0.00011468258304100779, + "loss": 1.0669, + "step": 10043 + }, + { + "epoch": 0.96, + "grad_norm": 0.29387442287441945, + "learning_rate": 0.00011466693481261168, + "loss": 1.1529, + "step": 10044 + }, + { + "epoch": 0.96, + "grad_norm": 0.30566339364004746, + "learning_rate": 0.0001146512862171663, + "loss": 1.0711, + "step": 10045 + }, + { + "epoch": 0.96, + "grad_norm": 0.2623345882172563, + "learning_rate": 0.00011463563725506328, + "loss": 1.1218, + "step": 10046 + }, + { + "epoch": 0.96, + "grad_norm": 0.3090594432747929, + "learning_rate": 0.00011461998792669426, + "loss": 1.0833, + "step": 10047 + }, + { + "epoch": 0.96, + "grad_norm": 0.3168446002514091, + "learning_rate": 0.0001146043382324508, + "loss": 1.0041, + "step": 10048 + }, + { + "epoch": 0.96, + "grad_norm": 0.26118984316659555, + "learning_rate": 0.00011458868817272465, + "loss": 1.075, + "step": 10049 + }, + { + "epoch": 0.96, + "grad_norm": 0.3084194238337441, + "learning_rate": 0.0001145730377479074, + "loss": 0.9865, + "step": 10050 + }, + { + "epoch": 0.96, + "grad_norm": 0.2916812390686762, + "learning_rate": 0.00011455738695839071, + "loss": 1.1019, + "step": 10051 + }, + { + "epoch": 0.96, + "grad_norm": 0.2853685064498807, + "learning_rate": 0.00011454173580456627, + "loss": 1.1053, + "step": 10052 + }, + { + "epoch": 0.96, + "grad_norm": 0.28381729559022867, + "learning_rate": 0.00011452608428682574, + "loss": 1.0872, + "step": 10053 + }, + { + "epoch": 0.96, + "grad_norm": 0.35555285136726095, + "learning_rate": 0.0001145104324055608, + "loss": 1.0321, + "step": 10054 + }, + { + "epoch": 0.96, + "grad_norm": 0.29725262158547283, + "learning_rate": 0.00011449478016116322, + "loss": 1.0607, + "step": 10055 + }, + { + "epoch": 0.96, + "grad_norm": 0.32988647361187395, + "learning_rate": 0.00011447912755402463, + "loss": 1.0194, + "step": 10056 + }, + { + "epoch": 0.96, + "grad_norm": 0.2986055841309832, + "learning_rate": 0.00011446347458453677, + "loss": 1.0673, + "step": 10057 + }, + { + "epoch": 0.96, + "grad_norm": 0.2696942404700563, + "learning_rate": 0.00011444782125309137, + "loss": 1.0442, + "step": 10058 + }, + { + "epoch": 0.96, + "grad_norm": 0.3064172789572965, + "learning_rate": 0.00011443216756008017, + "loss": 0.9705, + "step": 10059 + }, + { + "epoch": 0.96, + "grad_norm": 0.29986340116379595, + "learning_rate": 0.00011441651350589493, + "loss": 1.042, + "step": 10060 + }, + { + "epoch": 0.96, + "grad_norm": 0.29196996307249695, + "learning_rate": 0.00011440085909092735, + "loss": 1.1514, + "step": 10061 + }, + { + "epoch": 0.96, + "grad_norm": 0.26305531388271913, + "learning_rate": 0.00011438520431556923, + "loss": 1.0125, + "step": 10062 + }, + { + "epoch": 0.96, + "grad_norm": 0.28626327759234815, + "learning_rate": 0.00011436954918021232, + "loss": 1.1407, + "step": 10063 + }, + { + "epoch": 0.96, + "grad_norm": 0.27422258071461836, + "learning_rate": 0.00011435389368524842, + "loss": 1.055, + "step": 10064 + }, + { + "epoch": 0.96, + "grad_norm": 0.29139299897756993, + "learning_rate": 0.0001143382378310693, + "loss": 1.015, + "step": 10065 + }, + { + "epoch": 0.96, + "grad_norm": 0.2590303991609533, + "learning_rate": 0.0001143225816180668, + "loss": 0.9926, + "step": 10066 + }, + { + "epoch": 0.96, + "grad_norm": 0.28910079655703913, + "learning_rate": 0.00011430692504663265, + "loss": 0.9727, + "step": 10067 + }, + { + "epoch": 0.96, + "grad_norm": 0.3312654664415843, + "learning_rate": 0.00011429126811715872, + "loss": 1.101, + "step": 10068 + }, + { + "epoch": 0.96, + "grad_norm": 0.30000869026615357, + "learning_rate": 0.00011427561083003683, + "loss": 1.0738, + "step": 10069 + }, + { + "epoch": 0.96, + "grad_norm": 0.28159858071588617, + "learning_rate": 0.00011425995318565883, + "loss": 1.0995, + "step": 10070 + }, + { + "epoch": 0.96, + "grad_norm": 0.3123705444016819, + "learning_rate": 0.00011424429518441653, + "loss": 0.9907, + "step": 10071 + }, + { + "epoch": 0.96, + "grad_norm": 0.25560829274875024, + "learning_rate": 0.00011422863682670176, + "loss": 1.0387, + "step": 10072 + }, + { + "epoch": 0.96, + "grad_norm": 0.3047604879614061, + "learning_rate": 0.00011421297811290643, + "loss": 1.0803, + "step": 10073 + }, + { + "epoch": 0.96, + "grad_norm": 0.27464921456414265, + "learning_rate": 0.0001141973190434224, + "loss": 1.1457, + "step": 10074 + }, + { + "epoch": 0.96, + "grad_norm": 0.3015661166736289, + "learning_rate": 0.00011418165961864151, + "loss": 0.9435, + "step": 10075 + }, + { + "epoch": 0.96, + "grad_norm": 0.3288482467287445, + "learning_rate": 0.0001141659998389557, + "loss": 1.0268, + "step": 10076 + }, + { + "epoch": 0.96, + "grad_norm": 0.2880879802413768, + "learning_rate": 0.00011415033970475682, + "loss": 1.1155, + "step": 10077 + }, + { + "epoch": 0.96, + "grad_norm": 0.3046710592105044, + "learning_rate": 0.00011413467921643681, + "loss": 1.0854, + "step": 10078 + }, + { + "epoch": 0.96, + "grad_norm": 0.23189332771051346, + "learning_rate": 0.00011411901837438757, + "loss": 0.8959, + "step": 10079 + }, + { + "epoch": 0.96, + "grad_norm": 0.2860860545736034, + "learning_rate": 0.00011410335717900102, + "loss": 1.004, + "step": 10080 + }, + { + "epoch": 0.96, + "grad_norm": 0.29819011560467884, + "learning_rate": 0.0001140876956306691, + "loss": 1.0271, + "step": 10081 + }, + { + "epoch": 0.96, + "grad_norm": 0.2961548882133075, + "learning_rate": 0.00011407203372978372, + "loss": 1.0964, + "step": 10082 + }, + { + "epoch": 0.96, + "grad_norm": 0.27643172524881576, + "learning_rate": 0.00011405637147673688, + "loss": 1.0848, + "step": 10083 + }, + { + "epoch": 0.96, + "grad_norm": 0.26601937387043634, + "learning_rate": 0.00011404070887192051, + "loss": 1.0771, + "step": 10084 + }, + { + "epoch": 0.96, + "grad_norm": 0.29423767837815973, + "learning_rate": 0.00011402504591572656, + "loss": 1.1087, + "step": 10085 + }, + { + "epoch": 0.96, + "grad_norm": 0.31957243246751704, + "learning_rate": 0.00011400938260854703, + "loss": 1.1154, + "step": 10086 + }, + { + "epoch": 0.97, + "grad_norm": 0.34871076842626053, + "learning_rate": 0.00011399371895077389, + "loss": 1.0691, + "step": 10087 + }, + { + "epoch": 0.97, + "grad_norm": 0.30378790346074774, + "learning_rate": 0.00011397805494279916, + "loss": 1.096, + "step": 10088 + }, + { + "epoch": 0.97, + "grad_norm": 0.25772589878682645, + "learning_rate": 0.00011396239058501476, + "loss": 1.0342, + "step": 10089 + }, + { + "epoch": 0.97, + "grad_norm": 0.2823080995120186, + "learning_rate": 0.00011394672587781284, + "loss": 1.1017, + "step": 10090 + }, + { + "epoch": 0.97, + "grad_norm": 0.312629120636968, + "learning_rate": 0.0001139310608215853, + "loss": 1.0579, + "step": 10091 + }, + { + "epoch": 0.97, + "grad_norm": 0.3054424461560633, + "learning_rate": 0.00011391539541672418, + "loss": 1.0553, + "step": 10092 + }, + { + "epoch": 0.97, + "grad_norm": 0.28281060096224714, + "learning_rate": 0.00011389972966362159, + "loss": 1.0612, + "step": 10093 + }, + { + "epoch": 0.97, + "grad_norm": 0.27166297516714194, + "learning_rate": 0.00011388406356266951, + "loss": 1.0524, + "step": 10094 + }, + { + "epoch": 0.97, + "grad_norm": 0.31743027731547635, + "learning_rate": 0.00011386839711426003, + "loss": 1.0024, + "step": 10095 + }, + { + "epoch": 0.97, + "grad_norm": 0.32534625159202174, + "learning_rate": 0.00011385273031878516, + "loss": 1.0885, + "step": 10096 + }, + { + "epoch": 0.97, + "grad_norm": 0.30049574798069184, + "learning_rate": 0.00011383706317663705, + "loss": 1.0135, + "step": 10097 + }, + { + "epoch": 0.97, + "grad_norm": 0.3266379229812977, + "learning_rate": 0.00011382139568820771, + "loss": 1.0434, + "step": 10098 + }, + { + "epoch": 0.97, + "grad_norm": 0.3039958500677989, + "learning_rate": 0.00011380572785388923, + "loss": 1.1338, + "step": 10099 + }, + { + "epoch": 0.97, + "grad_norm": 0.2820873927102382, + "learning_rate": 0.0001137900596740738, + "loss": 0.9901, + "step": 10100 + }, + { + "epoch": 0.97, + "grad_norm": 0.25895906852181655, + "learning_rate": 0.00011377439114915343, + "loss": 1.0197, + "step": 10101 + }, + { + "epoch": 0.97, + "grad_norm": 0.2736675525144609, + "learning_rate": 0.00011375872227952024, + "loss": 1.0666, + "step": 10102 + }, + { + "epoch": 0.97, + "grad_norm": 0.27812994654877704, + "learning_rate": 0.00011374305306556641, + "loss": 0.9667, + "step": 10103 + }, + { + "epoch": 0.97, + "grad_norm": 0.24944483327099637, + "learning_rate": 0.00011372738350768404, + "loss": 1.1377, + "step": 10104 + }, + { + "epoch": 0.97, + "grad_norm": 0.3052298238193706, + "learning_rate": 0.00011371171360626528, + "loss": 1.0432, + "step": 10105 + }, + { + "epoch": 0.97, + "grad_norm": 0.27306031709563633, + "learning_rate": 0.00011369604336170221, + "loss": 1.0682, + "step": 10106 + }, + { + "epoch": 0.97, + "grad_norm": 0.3167325272734246, + "learning_rate": 0.0001136803727743871, + "loss": 1.1381, + "step": 10107 + }, + { + "epoch": 0.97, + "grad_norm": 0.3310696902768376, + "learning_rate": 0.00011366470184471206, + "loss": 0.9888, + "step": 10108 + }, + { + "epoch": 0.97, + "grad_norm": 0.2920470605038157, + "learning_rate": 0.00011364903057306923, + "loss": 1.0723, + "step": 10109 + }, + { + "epoch": 0.97, + "grad_norm": 0.28721162596130884, + "learning_rate": 0.00011363335895985087, + "loss": 1.0795, + "step": 10110 + }, + { + "epoch": 0.97, + "grad_norm": 0.3123155431147055, + "learning_rate": 0.00011361768700544915, + "loss": 1.0195, + "step": 10111 + }, + { + "epoch": 0.97, + "grad_norm": 0.31432924829965664, + "learning_rate": 0.00011360201471025625, + "loss": 1.0262, + "step": 10112 + }, + { + "epoch": 0.97, + "grad_norm": 0.30371743385151373, + "learning_rate": 0.00011358634207466434, + "loss": 1.1198, + "step": 10113 + }, + { + "epoch": 0.97, + "grad_norm": 0.30399866249564966, + "learning_rate": 0.0001135706690990657, + "loss": 1.0732, + "step": 10114 + }, + { + "epoch": 0.97, + "grad_norm": 0.32184020587696033, + "learning_rate": 0.00011355499578385256, + "loss": 1.1721, + "step": 10115 + }, + { + "epoch": 0.97, + "grad_norm": 0.30947844684370757, + "learning_rate": 0.00011353932212941709, + "loss": 0.9335, + "step": 10116 + }, + { + "epoch": 0.97, + "grad_norm": 0.30689326685860674, + "learning_rate": 0.00011352364813615159, + "loss": 1.154, + "step": 10117 + }, + { + "epoch": 0.97, + "grad_norm": 0.35179570677545263, + "learning_rate": 0.0001135079738044483, + "loss": 1.1623, + "step": 10118 + }, + { + "epoch": 0.97, + "grad_norm": 0.2895264616854555, + "learning_rate": 0.00011349229913469948, + "loss": 1.1446, + "step": 10119 + }, + { + "epoch": 0.97, + "grad_norm": 0.31453418960821145, + "learning_rate": 0.00011347662412729738, + "loss": 1.1001, + "step": 10120 + }, + { + "epoch": 0.97, + "grad_norm": 0.2761763344790832, + "learning_rate": 0.00011346094878263431, + "loss": 0.9588, + "step": 10121 + }, + { + "epoch": 0.97, + "grad_norm": 0.30740807652927366, + "learning_rate": 0.00011344527310110256, + "loss": 1.0145, + "step": 10122 + }, + { + "epoch": 0.97, + "grad_norm": 0.2852359412921041, + "learning_rate": 0.00011342959708309435, + "loss": 0.9789, + "step": 10123 + }, + { + "epoch": 0.97, + "grad_norm": 0.2870738764698151, + "learning_rate": 0.00011341392072900205, + "loss": 1.0349, + "step": 10124 + }, + { + "epoch": 0.97, + "grad_norm": 0.3079179973519736, + "learning_rate": 0.00011339824403921797, + "loss": 1.0653, + "step": 10125 + }, + { + "epoch": 0.97, + "grad_norm": 0.256710403425938, + "learning_rate": 0.0001133825670141344, + "loss": 0.9875, + "step": 10126 + }, + { + "epoch": 0.97, + "grad_norm": 0.3171962319601306, + "learning_rate": 0.00011336688965414369, + "loss": 1.1135, + "step": 10127 + }, + { + "epoch": 0.97, + "grad_norm": 0.2964396038679801, + "learning_rate": 0.00011335121195963813, + "loss": 1.0385, + "step": 10128 + }, + { + "epoch": 0.97, + "grad_norm": 0.2750463815164073, + "learning_rate": 0.00011333553393101013, + "loss": 0.9907, + "step": 10129 + }, + { + "epoch": 0.97, + "grad_norm": 0.3245742501930826, + "learning_rate": 0.00011331985556865201, + "loss": 1.0089, + "step": 10130 + }, + { + "epoch": 0.97, + "grad_norm": 0.2976235078728115, + "learning_rate": 0.00011330417687295614, + "loss": 1.0739, + "step": 10131 + }, + { + "epoch": 0.97, + "grad_norm": 0.30953794415640545, + "learning_rate": 0.00011328849784431488, + "loss": 1.1504, + "step": 10132 + }, + { + "epoch": 0.97, + "grad_norm": 0.2779345999485772, + "learning_rate": 0.00011327281848312059, + "loss": 1.1356, + "step": 10133 + }, + { + "epoch": 0.97, + "grad_norm": 0.2802541009558856, + "learning_rate": 0.0001132571387897657, + "loss": 1.0917, + "step": 10134 + }, + { + "epoch": 0.97, + "grad_norm": 0.27849384342581196, + "learning_rate": 0.00011324145876464259, + "loss": 1.0923, + "step": 10135 + }, + { + "epoch": 0.97, + "grad_norm": 0.3318350576277318, + "learning_rate": 0.00011322577840814361, + "loss": 0.9506, + "step": 10136 + }, + { + "epoch": 0.97, + "grad_norm": 0.3459815664705523, + "learning_rate": 0.00011321009772066124, + "loss": 1.0129, + "step": 10137 + }, + { + "epoch": 0.97, + "grad_norm": 0.3037533943998662, + "learning_rate": 0.00011319441670258788, + "loss": 1.0418, + "step": 10138 + }, + { + "epoch": 0.97, + "grad_norm": 0.2520746414367993, + "learning_rate": 0.00011317873535431591, + "loss": 1.0216, + "step": 10139 + }, + { + "epoch": 0.97, + "grad_norm": 0.2661292266112255, + "learning_rate": 0.00011316305367623785, + "loss": 1.1706, + "step": 10140 + }, + { + "epoch": 0.97, + "grad_norm": 0.3063821264146261, + "learning_rate": 0.00011314737166874607, + "loss": 1.0417, + "step": 10141 + }, + { + "epoch": 0.97, + "grad_norm": 0.26323311620303635, + "learning_rate": 0.00011313168933223306, + "loss": 1.0204, + "step": 10142 + }, + { + "epoch": 0.97, + "grad_norm": 0.27238653921733935, + "learning_rate": 0.00011311600666709126, + "loss": 1.0871, + "step": 10143 + }, + { + "epoch": 0.97, + "grad_norm": 0.3013746324023429, + "learning_rate": 0.00011310032367371317, + "loss": 1.0356, + "step": 10144 + }, + { + "epoch": 0.97, + "grad_norm": 0.2574291995601369, + "learning_rate": 0.00011308464035249125, + "loss": 1.0812, + "step": 10145 + }, + { + "epoch": 0.97, + "grad_norm": 0.2869306171965447, + "learning_rate": 0.00011306895670381797, + "loss": 1.0173, + "step": 10146 + }, + { + "epoch": 0.97, + "grad_norm": 0.23922368061607324, + "learning_rate": 0.00011305327272808583, + "loss": 1.0289, + "step": 10147 + }, + { + "epoch": 0.97, + "grad_norm": 0.25159221628276673, + "learning_rate": 0.00011303758842568735, + "loss": 0.999, + "step": 10148 + }, + { + "epoch": 0.97, + "grad_norm": 0.2868191888678923, + "learning_rate": 0.00011302190379701503, + "loss": 1.0263, + "step": 10149 + }, + { + "epoch": 0.97, + "grad_norm": 0.361397853429607, + "learning_rate": 0.00011300621884246136, + "loss": 1.1683, + "step": 10150 + }, + { + "epoch": 0.97, + "grad_norm": 0.2826280133952878, + "learning_rate": 0.00011299053356241891, + "loss": 1.0963, + "step": 10151 + }, + { + "epoch": 0.97, + "grad_norm": 0.28972955414613405, + "learning_rate": 0.00011297484795728019, + "loss": 1.0244, + "step": 10152 + }, + { + "epoch": 0.97, + "grad_norm": 0.3320877897903177, + "learning_rate": 0.00011295916202743773, + "loss": 1.0985, + "step": 10153 + }, + { + "epoch": 0.97, + "grad_norm": 0.2928253376533602, + "learning_rate": 0.00011294347577328412, + "loss": 1.0557, + "step": 10154 + }, + { + "epoch": 0.97, + "grad_norm": 0.33688760633249853, + "learning_rate": 0.00011292778919521189, + "loss": 1.149, + "step": 10155 + }, + { + "epoch": 0.97, + "grad_norm": 0.27676103323848317, + "learning_rate": 0.00011291210229361362, + "loss": 1.1117, + "step": 10156 + }, + { + "epoch": 0.97, + "grad_norm": 0.29026256750352997, + "learning_rate": 0.00011289641506888182, + "loss": 1.0704, + "step": 10157 + }, + { + "epoch": 0.97, + "grad_norm": 0.29334540208045223, + "learning_rate": 0.0001128807275214092, + "loss": 0.9988, + "step": 10158 + }, + { + "epoch": 0.97, + "grad_norm": 0.2818605763391759, + "learning_rate": 0.00011286503965158822, + "loss": 1.0227, + "step": 10159 + }, + { + "epoch": 0.97, + "grad_norm": 0.27801097090219445, + "learning_rate": 0.00011284935145981157, + "loss": 0.9915, + "step": 10160 + }, + { + "epoch": 0.97, + "grad_norm": 0.2805850444759864, + "learning_rate": 0.0001128336629464718, + "loss": 1.0897, + "step": 10161 + }, + { + "epoch": 0.97, + "grad_norm": 0.3032491771748229, + "learning_rate": 0.00011281797411196156, + "loss": 1.0666, + "step": 10162 + }, + { + "epoch": 0.97, + "grad_norm": 0.300948028153539, + "learning_rate": 0.00011280228495667346, + "loss": 1.1215, + "step": 10163 + }, + { + "epoch": 0.97, + "grad_norm": 0.32505655549938, + "learning_rate": 0.00011278659548100015, + "loss": 1.0754, + "step": 10164 + }, + { + "epoch": 0.97, + "grad_norm": 0.2830441409159092, + "learning_rate": 0.00011277090568533424, + "loss": 1.1184, + "step": 10165 + }, + { + "epoch": 0.97, + "grad_norm": 0.27235511959421765, + "learning_rate": 0.0001127552155700684, + "loss": 1.0373, + "step": 10166 + }, + { + "epoch": 0.97, + "grad_norm": 0.3112142074976755, + "learning_rate": 0.00011273952513559525, + "loss": 1.0028, + "step": 10167 + }, + { + "epoch": 0.97, + "grad_norm": 0.27216215210937883, + "learning_rate": 0.0001127238343823075, + "loss": 1.0513, + "step": 10168 + }, + { + "epoch": 0.97, + "grad_norm": 0.2953527746729821, + "learning_rate": 0.0001127081433105978, + "loss": 1.0803, + "step": 10169 + }, + { + "epoch": 0.97, + "grad_norm": 0.2897815242186968, + "learning_rate": 0.0001126924519208588, + "loss": 1.0248, + "step": 10170 + }, + { + "epoch": 0.97, + "grad_norm": 0.28105812749428083, + "learning_rate": 0.00011267676021348323, + "loss": 1.013, + "step": 10171 + }, + { + "epoch": 0.97, + "grad_norm": 0.30722992978249025, + "learning_rate": 0.00011266106818886377, + "loss": 1.0101, + "step": 10172 + }, + { + "epoch": 0.97, + "grad_norm": 0.3033918428966205, + "learning_rate": 0.00011264537584739314, + "loss": 1.0618, + "step": 10173 + }, + { + "epoch": 0.97, + "grad_norm": 0.28043324409698095, + "learning_rate": 0.00011262968318946398, + "loss": 0.9475, + "step": 10174 + }, + { + "epoch": 0.97, + "grad_norm": 0.2609838079910845, + "learning_rate": 0.00011261399021546912, + "loss": 0.9683, + "step": 10175 + }, + { + "epoch": 0.97, + "grad_norm": 0.3252941464289039, + "learning_rate": 0.00011259829692580119, + "loss": 1.0946, + "step": 10176 + }, + { + "epoch": 0.97, + "grad_norm": 0.3102405322061509, + "learning_rate": 0.00011258260332085298, + "loss": 1.0548, + "step": 10177 + }, + { + "epoch": 0.97, + "grad_norm": 0.27928723327298144, + "learning_rate": 0.0001125669094010172, + "loss": 1.0014, + "step": 10178 + }, + { + "epoch": 0.97, + "grad_norm": 0.3043342368498864, + "learning_rate": 0.00011255121516668663, + "loss": 1.0738, + "step": 10179 + }, + { + "epoch": 0.97, + "grad_norm": 0.28720036803685933, + "learning_rate": 0.00011253552061825398, + "loss": 1.0355, + "step": 10180 + }, + { + "epoch": 0.97, + "grad_norm": 0.284815493784025, + "learning_rate": 0.00011251982575611209, + "loss": 1.0894, + "step": 10181 + }, + { + "epoch": 0.97, + "grad_norm": 0.2556934414454461, + "learning_rate": 0.00011250413058065365, + "loss": 0.9576, + "step": 10182 + }, + { + "epoch": 0.97, + "grad_norm": 0.26083285136088175, + "learning_rate": 0.00011248843509227152, + "loss": 0.945, + "step": 10183 + }, + { + "epoch": 0.97, + "grad_norm": 0.28387550518261473, + "learning_rate": 0.00011247273929135841, + "loss": 1.1312, + "step": 10184 + }, + { + "epoch": 0.97, + "grad_norm": 0.33228217976195246, + "learning_rate": 0.00011245704317830721, + "loss": 1.0897, + "step": 10185 + }, + { + "epoch": 0.97, + "grad_norm": 0.31237530100881616, + "learning_rate": 0.00011244134675351066, + "loss": 1.1166, + "step": 10186 + }, + { + "epoch": 0.97, + "grad_norm": 0.30031596382794623, + "learning_rate": 0.00011242565001736159, + "loss": 1.0411, + "step": 10187 + }, + { + "epoch": 0.97, + "grad_norm": 0.3052672362577054, + "learning_rate": 0.00011240995297025281, + "loss": 0.9902, + "step": 10188 + }, + { + "epoch": 0.97, + "grad_norm": 0.306885450267344, + "learning_rate": 0.00011239425561257717, + "loss": 1.0137, + "step": 10189 + }, + { + "epoch": 0.97, + "grad_norm": 0.29453938744454133, + "learning_rate": 0.00011237855794472748, + "loss": 1.0604, + "step": 10190 + }, + { + "epoch": 0.97, + "grad_norm": 0.24569112604498705, + "learning_rate": 0.00011236285996709659, + "loss": 1.128, + "step": 10191 + }, + { + "epoch": 0.98, + "grad_norm": 0.30552330478731954, + "learning_rate": 0.00011234716168007737, + "loss": 1.0256, + "step": 10192 + }, + { + "epoch": 0.98, + "grad_norm": 0.29514963250865506, + "learning_rate": 0.00011233146308406268, + "loss": 1.1758, + "step": 10193 + }, + { + "epoch": 0.98, + "grad_norm": 0.2895811885576239, + "learning_rate": 0.00011231576417944536, + "loss": 1.0462, + "step": 10194 + }, + { + "epoch": 0.98, + "grad_norm": 0.2674722074074362, + "learning_rate": 0.00011230006496661831, + "loss": 1.2296, + "step": 10195 + }, + { + "epoch": 0.98, + "grad_norm": 0.31631409210970807, + "learning_rate": 0.00011228436544597442, + "loss": 1.0523, + "step": 10196 + }, + { + "epoch": 0.98, + "grad_norm": 0.2383251012221512, + "learning_rate": 0.00011226866561790653, + "loss": 0.9241, + "step": 10197 + }, + { + "epoch": 0.98, + "grad_norm": 0.3245079733703714, + "learning_rate": 0.00011225296548280759, + "loss": 1.0727, + "step": 10198 + }, + { + "epoch": 0.98, + "grad_norm": 0.28495612155493694, + "learning_rate": 0.0001122372650410705, + "loss": 0.9841, + "step": 10199 + }, + { + "epoch": 0.98, + "grad_norm": 0.30013506380723926, + "learning_rate": 0.00011222156429308812, + "loss": 0.9932, + "step": 10200 + }, + { + "epoch": 0.98, + "grad_norm": 0.2786075519347718, + "learning_rate": 0.00011220586323925346, + "loss": 1.0858, + "step": 10201 + }, + { + "epoch": 0.98, + "grad_norm": 0.3138110381506769, + "learning_rate": 0.00011219016187995937, + "loss": 1.1049, + "step": 10202 + }, + { + "epoch": 0.98, + "grad_norm": 0.27581534819480813, + "learning_rate": 0.00011217446021559883, + "loss": 0.9373, + "step": 10203 + }, + { + "epoch": 0.98, + "grad_norm": 0.3292494819868315, + "learning_rate": 0.00011215875824656477, + "loss": 0.9851, + "step": 10204 + }, + { + "epoch": 0.98, + "grad_norm": 0.2655547214014843, + "learning_rate": 0.00011214305597325015, + "loss": 0.9125, + "step": 10205 + }, + { + "epoch": 0.98, + "grad_norm": 0.2747966640064946, + "learning_rate": 0.00011212735339604792, + "loss": 0.9829, + "step": 10206 + }, + { + "epoch": 0.98, + "grad_norm": 0.26757047246677396, + "learning_rate": 0.00011211165051535104, + "loss": 0.9451, + "step": 10207 + }, + { + "epoch": 0.98, + "grad_norm": 0.27347785909550587, + "learning_rate": 0.00011209594733155251, + "loss": 1.0511, + "step": 10208 + }, + { + "epoch": 0.98, + "grad_norm": 0.250251681777428, + "learning_rate": 0.00011208024384504527, + "loss": 1.0769, + "step": 10209 + }, + { + "epoch": 0.98, + "grad_norm": 0.2977263384929927, + "learning_rate": 0.00011206454005622237, + "loss": 1.0488, + "step": 10210 + }, + { + "epoch": 0.98, + "grad_norm": 0.26128674737229673, + "learning_rate": 0.00011204883596547676, + "loss": 0.9721, + "step": 10211 + }, + { + "epoch": 0.98, + "grad_norm": 0.2864777874901107, + "learning_rate": 0.00011203313157320146, + "loss": 0.9903, + "step": 10212 + }, + { + "epoch": 0.98, + "grad_norm": 0.2930793218779365, + "learning_rate": 0.00011201742687978946, + "loss": 1.0835, + "step": 10213 + }, + { + "epoch": 0.98, + "grad_norm": 0.2726110420231643, + "learning_rate": 0.0001120017218856338, + "loss": 1.0418, + "step": 10214 + }, + { + "epoch": 0.98, + "grad_norm": 0.26500863939608205, + "learning_rate": 0.00011198601659112753, + "loss": 0.9941, + "step": 10215 + }, + { + "epoch": 0.98, + "grad_norm": 0.35922537571617147, + "learning_rate": 0.00011197031099666366, + "loss": 1.1, + "step": 10216 + }, + { + "epoch": 0.98, + "grad_norm": 0.26403707584086666, + "learning_rate": 0.00011195460510263523, + "loss": 1.0841, + "step": 10217 + }, + { + "epoch": 0.98, + "grad_norm": 0.30153285229911025, + "learning_rate": 0.00011193889890943528, + "loss": 0.9745, + "step": 10218 + }, + { + "epoch": 0.98, + "grad_norm": 0.27058298329981373, + "learning_rate": 0.0001119231924174569, + "loss": 1.0014, + "step": 10219 + }, + { + "epoch": 0.98, + "grad_norm": 0.2890597865454572, + "learning_rate": 0.00011190748562709314, + "loss": 1.1932, + "step": 10220 + }, + { + "epoch": 0.98, + "grad_norm": 0.2559758601582075, + "learning_rate": 0.00011189177853873705, + "loss": 1.035, + "step": 10221 + }, + { + "epoch": 0.98, + "grad_norm": 0.2675198324495206, + "learning_rate": 0.00011187607115278173, + "loss": 1.0852, + "step": 10222 + }, + { + "epoch": 0.98, + "grad_norm": 0.2984560752244984, + "learning_rate": 0.00011186036346962025, + "loss": 1.0197, + "step": 10223 + }, + { + "epoch": 0.98, + "grad_norm": 0.2746333508360344, + "learning_rate": 0.00011184465548964575, + "loss": 1.1096, + "step": 10224 + }, + { + "epoch": 0.98, + "grad_norm": 0.2911661232884761, + "learning_rate": 0.00011182894721325128, + "loss": 1.0876, + "step": 10225 + }, + { + "epoch": 0.98, + "grad_norm": 0.2545901819238632, + "learning_rate": 0.00011181323864082999, + "loss": 1.008, + "step": 10226 + }, + { + "epoch": 0.98, + "grad_norm": 0.27438906716939765, + "learning_rate": 0.00011179752977277498, + "loss": 0.9276, + "step": 10227 + }, + { + "epoch": 0.98, + "grad_norm": 0.30116616901446114, + "learning_rate": 0.00011178182060947935, + "loss": 0.9543, + "step": 10228 + }, + { + "epoch": 0.98, + "grad_norm": 0.2752406915494225, + "learning_rate": 0.00011176611115133628, + "loss": 1.0355, + "step": 10229 + }, + { + "epoch": 0.98, + "grad_norm": 0.27687088339188193, + "learning_rate": 0.00011175040139873889, + "loss": 0.9992, + "step": 10230 + }, + { + "epoch": 0.98, + "grad_norm": 0.2650331819262323, + "learning_rate": 0.00011173469135208028, + "loss": 0.9954, + "step": 10231 + }, + { + "epoch": 0.98, + "grad_norm": 0.29487040103252415, + "learning_rate": 0.00011171898101175369, + "loss": 0.9526, + "step": 10232 + }, + { + "epoch": 0.98, + "grad_norm": 0.2784410390172419, + "learning_rate": 0.0001117032703781522, + "loss": 1.1314, + "step": 10233 + }, + { + "epoch": 0.98, + "grad_norm": 0.34321818169181995, + "learning_rate": 0.00011168755945166905, + "loss": 1.0451, + "step": 10234 + }, + { + "epoch": 0.98, + "grad_norm": 0.2934212795606079, + "learning_rate": 0.00011167184823269735, + "loss": 0.9916, + "step": 10235 + }, + { + "epoch": 0.98, + "grad_norm": 0.27720714282535736, + "learning_rate": 0.00011165613672163032, + "loss": 1.052, + "step": 10236 + }, + { + "epoch": 0.98, + "grad_norm": 0.2827387556355499, + "learning_rate": 0.00011164042491886115, + "loss": 1.0031, + "step": 10237 + }, + { + "epoch": 0.98, + "grad_norm": 0.3020361414751645, + "learning_rate": 0.00011162471282478299, + "loss": 1.1295, + "step": 10238 + }, + { + "epoch": 0.98, + "grad_norm": 0.2926727806203999, + "learning_rate": 0.00011160900043978915, + "loss": 1.0201, + "step": 10239 + }, + { + "epoch": 0.98, + "grad_norm": 0.29327346670931664, + "learning_rate": 0.00011159328776427274, + "loss": 1.0045, + "step": 10240 + }, + { + "epoch": 0.98, + "grad_norm": 0.3197321134655832, + "learning_rate": 0.00011157757479862701, + "loss": 0.9832, + "step": 10241 + }, + { + "epoch": 0.98, + "grad_norm": 0.2834245023692689, + "learning_rate": 0.00011156186154324522, + "loss": 0.9674, + "step": 10242 + }, + { + "epoch": 0.98, + "grad_norm": 0.2925004499767554, + "learning_rate": 0.00011154614799852055, + "loss": 1.0646, + "step": 10243 + }, + { + "epoch": 0.98, + "grad_norm": 0.25673872599009095, + "learning_rate": 0.0001115304341648463, + "loss": 1.04, + "step": 10244 + }, + { + "epoch": 0.98, + "grad_norm": 0.2808493957291415, + "learning_rate": 0.00011151472004261565, + "loss": 1.0743, + "step": 10245 + }, + { + "epoch": 0.98, + "grad_norm": 0.2908844238513358, + "learning_rate": 0.00011149900563222193, + "loss": 1.0376, + "step": 10246 + }, + { + "epoch": 0.98, + "grad_norm": 0.2887693925794027, + "learning_rate": 0.00011148329093405836, + "loss": 1.018, + "step": 10247 + }, + { + "epoch": 0.98, + "grad_norm": 0.29234813333297677, + "learning_rate": 0.0001114675759485182, + "loss": 1.0377, + "step": 10248 + }, + { + "epoch": 0.98, + "grad_norm": 0.28841549577391123, + "learning_rate": 0.00011145186067599478, + "loss": 1.1507, + "step": 10249 + }, + { + "epoch": 0.98, + "grad_norm": 0.31377105859709736, + "learning_rate": 0.00011143614511688132, + "loss": 0.9281, + "step": 10250 + }, + { + "epoch": 0.98, + "grad_norm": 0.27520050582114036, + "learning_rate": 0.00011142042927157114, + "loss": 1.0736, + "step": 10251 + }, + { + "epoch": 0.98, + "grad_norm": 0.2836489124077, + "learning_rate": 0.00011140471314045755, + "loss": 1.1397, + "step": 10252 + }, + { + "epoch": 0.98, + "grad_norm": 0.3196462341558277, + "learning_rate": 0.00011138899672393386, + "loss": 1.1109, + "step": 10253 + }, + { + "epoch": 0.98, + "grad_norm": 0.2919022639490787, + "learning_rate": 0.00011137328002239335, + "loss": 1.0623, + "step": 10254 + }, + { + "epoch": 0.98, + "grad_norm": 0.2668993177956516, + "learning_rate": 0.00011135756303622937, + "loss": 0.9756, + "step": 10255 + }, + { + "epoch": 0.98, + "grad_norm": 0.29098900383647164, + "learning_rate": 0.00011134184576583525, + "loss": 0.981, + "step": 10256 + }, + { + "epoch": 0.98, + "grad_norm": 0.2741127564871344, + "learning_rate": 0.00011132612821160428, + "loss": 1.0042, + "step": 10257 + }, + { + "epoch": 0.98, + "grad_norm": 0.3179337291974993, + "learning_rate": 0.00011131041037392984, + "loss": 0.9453, + "step": 10258 + }, + { + "epoch": 0.98, + "grad_norm": 0.2903023541952086, + "learning_rate": 0.00011129469225320527, + "loss": 1.1199, + "step": 10259 + }, + { + "epoch": 0.98, + "grad_norm": 0.2762878923470942, + "learning_rate": 0.00011127897384982396, + "loss": 1.0547, + "step": 10260 + }, + { + "epoch": 0.98, + "grad_norm": 0.29633017173721615, + "learning_rate": 0.00011126325516417921, + "loss": 1.0946, + "step": 10261 + }, + { + "epoch": 0.98, + "grad_norm": 0.30069391214448493, + "learning_rate": 0.00011124753619666441, + "loss": 1.1047, + "step": 10262 + }, + { + "epoch": 0.98, + "grad_norm": 0.30197537398471846, + "learning_rate": 0.000111231816947673, + "loss": 1.0716, + "step": 10263 + }, + { + "epoch": 0.98, + "grad_norm": 0.22716259808522002, + "learning_rate": 0.00011121609741759824, + "loss": 1.1495, + "step": 10264 + }, + { + "epoch": 0.98, + "grad_norm": 0.33447762653923324, + "learning_rate": 0.00011120037760683364, + "loss": 1.0615, + "step": 10265 + }, + { + "epoch": 0.98, + "grad_norm": 0.2677195592128131, + "learning_rate": 0.00011118465751577254, + "loss": 1.0661, + "step": 10266 + }, + { + "epoch": 0.98, + "grad_norm": 0.25916514691347986, + "learning_rate": 0.00011116893714480836, + "loss": 1.0618, + "step": 10267 + }, + { + "epoch": 0.98, + "grad_norm": 0.2735392836561538, + "learning_rate": 0.0001111532164943345, + "loss": 1.1209, + "step": 10268 + }, + { + "epoch": 0.98, + "grad_norm": 0.306277634485208, + "learning_rate": 0.0001111374955647444, + "loss": 1.0481, + "step": 10269 + }, + { + "epoch": 0.98, + "grad_norm": 0.24999171813245338, + "learning_rate": 0.00011112177435643147, + "loss": 0.9234, + "step": 10270 + }, + { + "epoch": 0.98, + "grad_norm": 0.26942078573492634, + "learning_rate": 0.00011110605286978914, + "loss": 1.0864, + "step": 10271 + }, + { + "epoch": 0.98, + "grad_norm": 0.26245411051479745, + "learning_rate": 0.00011109033110521086, + "loss": 1.1104, + "step": 10272 + }, + { + "epoch": 0.98, + "grad_norm": 0.31836586363546904, + "learning_rate": 0.00011107460906309008, + "loss": 0.9678, + "step": 10273 + }, + { + "epoch": 0.98, + "grad_norm": 0.3164102794109734, + "learning_rate": 0.00011105888674382025, + "loss": 1.063, + "step": 10274 + }, + { + "epoch": 0.98, + "grad_norm": 0.3168927687531968, + "learning_rate": 0.0001110431641477948, + "loss": 1.0617, + "step": 10275 + }, + { + "epoch": 0.98, + "grad_norm": 0.2901612455077041, + "learning_rate": 0.00011102744127540728, + "loss": 1.0798, + "step": 10276 + }, + { + "epoch": 0.98, + "grad_norm": 0.2808843978397793, + "learning_rate": 0.0001110117181270511, + "loss": 1.083, + "step": 10277 + }, + { + "epoch": 0.98, + "grad_norm": 0.28149992419883424, + "learning_rate": 0.00011099599470311972, + "loss": 1.0127, + "step": 10278 + }, + { + "epoch": 0.98, + "grad_norm": 0.31449595403015373, + "learning_rate": 0.00011098027100400667, + "loss": 1.1207, + "step": 10279 + }, + { + "epoch": 0.98, + "grad_norm": 0.243827419121298, + "learning_rate": 0.00011096454703010546, + "loss": 0.9526, + "step": 10280 + }, + { + "epoch": 0.98, + "grad_norm": 0.27945107442928213, + "learning_rate": 0.00011094882278180956, + "loss": 0.9607, + "step": 10281 + }, + { + "epoch": 0.98, + "grad_norm": 0.27291199800336474, + "learning_rate": 0.00011093309825951245, + "loss": 1.0538, + "step": 10282 + }, + { + "epoch": 0.98, + "grad_norm": 0.30765998442794057, + "learning_rate": 0.00011091737346360773, + "loss": 1.1567, + "step": 10283 + }, + { + "epoch": 0.98, + "grad_norm": 0.28547060490679954, + "learning_rate": 0.00011090164839448887, + "loss": 0.9414, + "step": 10284 + }, + { + "epoch": 0.98, + "grad_norm": 0.28516651031906826, + "learning_rate": 0.00011088592305254939, + "loss": 1.1272, + "step": 10285 + }, + { + "epoch": 0.98, + "grad_norm": 0.28976084124588714, + "learning_rate": 0.00011087019743818288, + "loss": 1.067, + "step": 10286 + }, + { + "epoch": 0.98, + "grad_norm": 0.30715911284891306, + "learning_rate": 0.00011085447155178279, + "loss": 1.0638, + "step": 10287 + }, + { + "epoch": 0.98, + "grad_norm": 0.26407524082018785, + "learning_rate": 0.00011083874539374277, + "loss": 1.0787, + "step": 10288 + }, + { + "epoch": 0.98, + "grad_norm": 0.2694386589995852, + "learning_rate": 0.00011082301896445633, + "loss": 1.0979, + "step": 10289 + }, + { + "epoch": 0.98, + "grad_norm": 0.27217656363948856, + "learning_rate": 0.00011080729226431703, + "loss": 1.141, + "step": 10290 + }, + { + "epoch": 0.98, + "grad_norm": 0.2794911223689666, + "learning_rate": 0.00011079156529371846, + "loss": 1.049, + "step": 10291 + }, + { + "epoch": 0.98, + "grad_norm": 0.27195359614242276, + "learning_rate": 0.00011077583805305418, + "loss": 0.9813, + "step": 10292 + }, + { + "epoch": 0.98, + "grad_norm": 0.29038357957473354, + "learning_rate": 0.00011076011054271778, + "loss": 1.1996, + "step": 10293 + }, + { + "epoch": 0.98, + "grad_norm": 0.28199425499385034, + "learning_rate": 0.00011074438276310287, + "loss": 1.1444, + "step": 10294 + }, + { + "epoch": 0.98, + "grad_norm": 0.30433530645019496, + "learning_rate": 0.00011072865471460301, + "loss": 1.0179, + "step": 10295 + }, + { + "epoch": 0.99, + "grad_norm": 0.26891004746639385, + "learning_rate": 0.00011071292639761181, + "loss": 1.0334, + "step": 10296 + }, + { + "epoch": 0.99, + "grad_norm": 0.29402614889194606, + "learning_rate": 0.0001106971978125229, + "loss": 1.0908, + "step": 10297 + }, + { + "epoch": 0.99, + "grad_norm": 0.2559539105537088, + "learning_rate": 0.00011068146895972993, + "loss": 1.1747, + "step": 10298 + }, + { + "epoch": 0.99, + "grad_norm": 0.2845273763898639, + "learning_rate": 0.00011066573983962642, + "loss": 1.0114, + "step": 10299 + }, + { + "epoch": 0.99, + "grad_norm": 0.3025475977583579, + "learning_rate": 0.00011065001045260613, + "loss": 1.0435, + "step": 10300 + }, + { + "epoch": 0.99, + "grad_norm": 0.2747465345583122, + "learning_rate": 0.00011063428079906259, + "loss": 1.1137, + "step": 10301 + }, + { + "epoch": 0.99, + "grad_norm": 0.2990549222362699, + "learning_rate": 0.0001106185508793895, + "loss": 1.0761, + "step": 10302 + }, + { + "epoch": 0.99, + "grad_norm": 0.3060379979713365, + "learning_rate": 0.00011060282069398052, + "loss": 1.0701, + "step": 10303 + }, + { + "epoch": 0.99, + "grad_norm": 0.25935605894516695, + "learning_rate": 0.00011058709024322929, + "loss": 1.0854, + "step": 10304 + }, + { + "epoch": 0.99, + "grad_norm": 0.284568963561047, + "learning_rate": 0.00011057135952752943, + "loss": 0.9767, + "step": 10305 + }, + { + "epoch": 0.99, + "grad_norm": 0.268579961216263, + "learning_rate": 0.00011055562854727471, + "loss": 1.0459, + "step": 10306 + }, + { + "epoch": 0.99, + "grad_norm": 0.26453950568162055, + "learning_rate": 0.00011053989730285869, + "loss": 1.0633, + "step": 10307 + }, + { + "epoch": 0.99, + "grad_norm": 0.2840144675918696, + "learning_rate": 0.00011052416579467518, + "loss": 0.8992, + "step": 10308 + }, + { + "epoch": 0.99, + "grad_norm": 0.2526273337620793, + "learning_rate": 0.00011050843402311777, + "loss": 1.0391, + "step": 10309 + }, + { + "epoch": 0.99, + "grad_norm": 0.2907758381732847, + "learning_rate": 0.00011049270198858019, + "loss": 1.1091, + "step": 10310 + }, + { + "epoch": 0.99, + "grad_norm": 0.29330977370977224, + "learning_rate": 0.00011047696969145618, + "loss": 1.1539, + "step": 10311 + }, + { + "epoch": 0.99, + "grad_norm": 0.28027168248307766, + "learning_rate": 0.00011046123713213939, + "loss": 1.0466, + "step": 10312 + }, + { + "epoch": 0.99, + "grad_norm": 0.29433174543880003, + "learning_rate": 0.00011044550431102358, + "loss": 1.0868, + "step": 10313 + }, + { + "epoch": 0.99, + "grad_norm": 0.28747079837458267, + "learning_rate": 0.00011042977122850247, + "loss": 0.9917, + "step": 10314 + }, + { + "epoch": 0.99, + "grad_norm": 0.2841412064889096, + "learning_rate": 0.00011041403788496976, + "loss": 1.23, + "step": 10315 + }, + { + "epoch": 0.99, + "grad_norm": 0.26187375710340965, + "learning_rate": 0.0001103983042808192, + "loss": 1.0055, + "step": 10316 + }, + { + "epoch": 0.99, + "grad_norm": 0.32286275918136254, + "learning_rate": 0.00011038257041644455, + "loss": 0.993, + "step": 10317 + }, + { + "epoch": 0.99, + "grad_norm": 0.28197212224874324, + "learning_rate": 0.00011036683629223958, + "loss": 1.003, + "step": 10318 + }, + { + "epoch": 0.99, + "grad_norm": 0.25905993814852957, + "learning_rate": 0.00011035110190859796, + "loss": 1.0416, + "step": 10319 + }, + { + "epoch": 0.99, + "grad_norm": 0.2894375367829976, + "learning_rate": 0.00011033536726591356, + "loss": 1.0795, + "step": 10320 + }, + { + "epoch": 0.99, + "grad_norm": 0.32587938767421387, + "learning_rate": 0.00011031963236458008, + "loss": 1.0801, + "step": 10321 + }, + { + "epoch": 0.99, + "grad_norm": 0.2678020975589778, + "learning_rate": 0.00011030389720499132, + "loss": 1.1145, + "step": 10322 + }, + { + "epoch": 0.99, + "grad_norm": 0.30583316214893913, + "learning_rate": 0.00011028816178754104, + "loss": 1.0778, + "step": 10323 + }, + { + "epoch": 0.99, + "grad_norm": 0.32483933693639894, + "learning_rate": 0.00011027242611262306, + "loss": 1.0419, + "step": 10324 + }, + { + "epoch": 0.99, + "grad_norm": 0.31586042610524007, + "learning_rate": 0.00011025669018063116, + "loss": 1.0958, + "step": 10325 + }, + { + "epoch": 0.99, + "grad_norm": 0.2878776556420424, + "learning_rate": 0.00011024095399195913, + "loss": 1.0656, + "step": 10326 + }, + { + "epoch": 0.99, + "grad_norm": 0.2830198312625606, + "learning_rate": 0.0001102252175470008, + "loss": 1.0493, + "step": 10327 + }, + { + "epoch": 0.99, + "grad_norm": 0.2803658012096297, + "learning_rate": 0.00011020948084614995, + "loss": 0.9544, + "step": 10328 + }, + { + "epoch": 0.99, + "grad_norm": 0.3160224185691885, + "learning_rate": 0.00011019374388980046, + "loss": 0.9955, + "step": 10329 + }, + { + "epoch": 0.99, + "grad_norm": 0.28845814066617365, + "learning_rate": 0.0001101780066783461, + "loss": 1.1675, + "step": 10330 + }, + { + "epoch": 0.99, + "grad_norm": 0.2892793784279133, + "learning_rate": 0.00011016226921218074, + "loss": 0.9273, + "step": 10331 + }, + { + "epoch": 0.99, + "grad_norm": 0.27909437771191264, + "learning_rate": 0.0001101465314916982, + "loss": 1.1146, + "step": 10332 + }, + { + "epoch": 0.99, + "grad_norm": 0.3021556255362156, + "learning_rate": 0.00011013079351729232, + "loss": 1.0174, + "step": 10333 + }, + { + "epoch": 0.99, + "grad_norm": 0.3088606738990712, + "learning_rate": 0.000110115055289357, + "loss": 1.0696, + "step": 10334 + }, + { + "epoch": 0.99, + "grad_norm": 0.2532028338885159, + "learning_rate": 0.00011009931680828604, + "loss": 1.0509, + "step": 10335 + }, + { + "epoch": 0.99, + "grad_norm": 0.2644967377127452, + "learning_rate": 0.00011008357807447334, + "loss": 1.011, + "step": 10336 + }, + { + "epoch": 0.99, + "grad_norm": 0.25911644503390924, + "learning_rate": 0.00011006783908831275, + "loss": 0.9568, + "step": 10337 + }, + { + "epoch": 0.99, + "grad_norm": 0.3093805850346706, + "learning_rate": 0.00011005209985019817, + "loss": 1.0442, + "step": 10338 + }, + { + "epoch": 0.99, + "grad_norm": 0.26605780506457216, + "learning_rate": 0.00011003636036052347, + "loss": 1.12, + "step": 10339 + }, + { + "epoch": 0.99, + "grad_norm": 0.2738908065467255, + "learning_rate": 0.00011002062061968255, + "loss": 1.0423, + "step": 10340 + }, + { + "epoch": 0.99, + "grad_norm": 0.38868786111630993, + "learning_rate": 0.00011000488062806929, + "loss": 1.0834, + "step": 10341 + }, + { + "epoch": 0.99, + "grad_norm": 0.3172034502101059, + "learning_rate": 0.00010998914038607762, + "loss": 1.0222, + "step": 10342 + }, + { + "epoch": 0.99, + "grad_norm": 0.2858843799078492, + "learning_rate": 0.0001099733998941014, + "loss": 1.0284, + "step": 10343 + }, + { + "epoch": 0.99, + "grad_norm": 0.2646838577150544, + "learning_rate": 0.00010995765915253462, + "loss": 1.1413, + "step": 10344 + }, + { + "epoch": 0.99, + "grad_norm": 0.27314529183764996, + "learning_rate": 0.00010994191816177115, + "loss": 1.0144, + "step": 10345 + }, + { + "epoch": 0.99, + "grad_norm": 0.25795307029613257, + "learning_rate": 0.0001099261769222049, + "loss": 1.1011, + "step": 10346 + }, + { + "epoch": 0.99, + "grad_norm": 0.269654266329036, + "learning_rate": 0.00010991043543422987, + "loss": 1.093, + "step": 10347 + }, + { + "epoch": 0.99, + "grad_norm": 0.2873513143757092, + "learning_rate": 0.00010989469369823993, + "loss": 1.0575, + "step": 10348 + }, + { + "epoch": 0.99, + "grad_norm": 0.2966705516629044, + "learning_rate": 0.0001098789517146291, + "loss": 0.9566, + "step": 10349 + }, + { + "epoch": 0.99, + "grad_norm": 0.30235296838928694, + "learning_rate": 0.00010986320948379125, + "loss": 1.0973, + "step": 10350 + }, + { + "epoch": 0.99, + "grad_norm": 0.22856559446563995, + "learning_rate": 0.00010984746700612043, + "loss": 1.0409, + "step": 10351 + }, + { + "epoch": 0.99, + "grad_norm": 0.29679007545691055, + "learning_rate": 0.00010983172428201055, + "loss": 1.0134, + "step": 10352 + }, + { + "epoch": 0.99, + "grad_norm": 0.3089775230936333, + "learning_rate": 0.00010981598131185558, + "loss": 1.0447, + "step": 10353 + }, + { + "epoch": 0.99, + "grad_norm": 0.29725469954817196, + "learning_rate": 0.00010980023809604951, + "loss": 1.0729, + "step": 10354 + }, + { + "epoch": 0.99, + "grad_norm": 0.2626951615117108, + "learning_rate": 0.00010978449463498632, + "loss": 0.9726, + "step": 10355 + }, + { + "epoch": 0.99, + "grad_norm": 0.27242407027742316, + "learning_rate": 0.00010976875092906003, + "loss": 1.0434, + "step": 10356 + }, + { + "epoch": 0.99, + "grad_norm": 0.2947561469672809, + "learning_rate": 0.00010975300697866456, + "loss": 1.0845, + "step": 10357 + }, + { + "epoch": 0.99, + "grad_norm": 0.29370486489779185, + "learning_rate": 0.00010973726278419398, + "loss": 0.9322, + "step": 10358 + }, + { + "epoch": 0.99, + "grad_norm": 0.2928377780175839, + "learning_rate": 0.00010972151834604229, + "loss": 1.0208, + "step": 10359 + }, + { + "epoch": 0.99, + "grad_norm": 0.32099727413336887, + "learning_rate": 0.00010970577366460349, + "loss": 1.0144, + "step": 10360 + }, + { + "epoch": 0.99, + "grad_norm": 0.2763946202818681, + "learning_rate": 0.00010969002874027161, + "loss": 1.206, + "step": 10361 + }, + { + "epoch": 0.99, + "grad_norm": 0.29031856568736375, + "learning_rate": 0.00010967428357344067, + "loss": 1.0013, + "step": 10362 + }, + { + "epoch": 0.99, + "grad_norm": 0.2600423194599826, + "learning_rate": 0.00010965853816450469, + "loss": 1.1104, + "step": 10363 + }, + { + "epoch": 0.99, + "grad_norm": 0.262125842100789, + "learning_rate": 0.00010964279251385774, + "loss": 1.0854, + "step": 10364 + }, + { + "epoch": 0.99, + "grad_norm": 0.2983375163299008, + "learning_rate": 0.00010962704662189383, + "loss": 1.0486, + "step": 10365 + }, + { + "epoch": 0.99, + "grad_norm": 0.27574617002830204, + "learning_rate": 0.00010961130048900705, + "loss": 1.0465, + "step": 10366 + }, + { + "epoch": 0.99, + "grad_norm": 0.2474828198501618, + "learning_rate": 0.0001095955541155914, + "loss": 1.0645, + "step": 10367 + }, + { + "epoch": 0.99, + "grad_norm": 0.33229191383287654, + "learning_rate": 0.000109579807502041, + "loss": 0.9869, + "step": 10368 + }, + { + "epoch": 0.99, + "grad_norm": 0.2982566291542284, + "learning_rate": 0.0001095640606487499, + "loss": 1.0616, + "step": 10369 + }, + { + "epoch": 0.99, + "grad_norm": 0.3011080575254373, + "learning_rate": 0.00010954831355611215, + "loss": 1.0475, + "step": 10370 + }, + { + "epoch": 0.99, + "grad_norm": 0.2954042572701708, + "learning_rate": 0.00010953256622452185, + "loss": 1.0212, + "step": 10371 + }, + { + "epoch": 0.99, + "grad_norm": 0.3015044088263099, + "learning_rate": 0.0001095168186543731, + "loss": 1.0839, + "step": 10372 + }, + { + "epoch": 0.99, + "grad_norm": 0.36161978315568827, + "learning_rate": 0.00010950107084605998, + "loss": 0.9966, + "step": 10373 + }, + { + "epoch": 0.99, + "grad_norm": 0.2947245497967197, + "learning_rate": 0.00010948532279997664, + "loss": 1.0052, + "step": 10374 + }, + { + "epoch": 0.99, + "grad_norm": 0.278629231797063, + "learning_rate": 0.00010946957451651709, + "loss": 0.974, + "step": 10375 + }, + { + "epoch": 0.99, + "grad_norm": 0.3081240007766056, + "learning_rate": 0.0001094538259960755, + "loss": 0.9513, + "step": 10376 + }, + { + "epoch": 0.99, + "grad_norm": 0.37939549753517904, + "learning_rate": 0.00010943807723904593, + "loss": 1.1191, + "step": 10377 + }, + { + "epoch": 0.99, + "grad_norm": 0.24633288069760065, + "learning_rate": 0.0001094223282458226, + "loss": 1.0712, + "step": 10378 + }, + { + "epoch": 0.99, + "grad_norm": 0.3096687909717489, + "learning_rate": 0.00010940657901679956, + "loss": 0.9914, + "step": 10379 + }, + { + "epoch": 0.99, + "grad_norm": 0.3063298337056405, + "learning_rate": 0.00010939082955237096, + "loss": 1.1432, + "step": 10380 + }, + { + "epoch": 0.99, + "grad_norm": 0.2745100518027808, + "learning_rate": 0.00010937507985293098, + "loss": 1.0335, + "step": 10381 + }, + { + "epoch": 0.99, + "grad_norm": 0.25605757624283976, + "learning_rate": 0.00010935932991887372, + "loss": 0.9888, + "step": 10382 + }, + { + "epoch": 0.99, + "grad_norm": 0.27108913673944884, + "learning_rate": 0.00010934357975059334, + "loss": 1.1554, + "step": 10383 + }, + { + "epoch": 0.99, + "grad_norm": 0.3465731349344641, + "learning_rate": 0.000109327829348484, + "loss": 1.0209, + "step": 10384 + }, + { + "epoch": 0.99, + "grad_norm": 0.2661351715791654, + "learning_rate": 0.00010931207871293987, + "loss": 1.1225, + "step": 10385 + }, + { + "epoch": 0.99, + "grad_norm": 0.2992239551094853, + "learning_rate": 0.00010929632784435513, + "loss": 1.1448, + "step": 10386 + }, + { + "epoch": 0.99, + "grad_norm": 0.2714660858795361, + "learning_rate": 0.00010928057674312393, + "loss": 1.1303, + "step": 10387 + }, + { + "epoch": 0.99, + "grad_norm": 0.3560152461135453, + "learning_rate": 0.00010926482540964047, + "loss": 1.0939, + "step": 10388 + }, + { + "epoch": 0.99, + "grad_norm": 0.33230188942732725, + "learning_rate": 0.00010924907384429892, + "loss": 1.0654, + "step": 10389 + }, + { + "epoch": 0.99, + "grad_norm": 0.2648343149387062, + "learning_rate": 0.0001092333220474935, + "loss": 1.0357, + "step": 10390 + }, + { + "epoch": 0.99, + "grad_norm": 0.2959171922667274, + "learning_rate": 0.00010921757001961839, + "loss": 1.0695, + "step": 10391 + }, + { + "epoch": 0.99, + "grad_norm": 0.26506456061345024, + "learning_rate": 0.00010920181776106779, + "loss": 1.1593, + "step": 10392 + }, + { + "epoch": 0.99, + "grad_norm": 0.3204886263895637, + "learning_rate": 0.00010918606527223593, + "loss": 1.0391, + "step": 10393 + }, + { + "epoch": 0.99, + "grad_norm": 0.24450093898680367, + "learning_rate": 0.00010917031255351699, + "loss": 0.9267, + "step": 10394 + }, + { + "epoch": 0.99, + "grad_norm": 0.28085674386349635, + "learning_rate": 0.00010915455960530526, + "loss": 1.0893, + "step": 10395 + }, + { + "epoch": 0.99, + "grad_norm": 0.28257173644304456, + "learning_rate": 0.00010913880642799488, + "loss": 1.0546, + "step": 10396 + }, + { + "epoch": 0.99, + "grad_norm": 0.3084129119694214, + "learning_rate": 0.00010912305302198014, + "loss": 1.0221, + "step": 10397 + }, + { + "epoch": 0.99, + "grad_norm": 0.2778700782623621, + "learning_rate": 0.00010910729938765528, + "loss": 1.0197, + "step": 10398 + }, + { + "epoch": 0.99, + "grad_norm": 0.24322367397269376, + "learning_rate": 0.00010909154552541449, + "loss": 0.9353, + "step": 10399 + }, + { + "epoch": 0.99, + "grad_norm": 0.27278544222892476, + "learning_rate": 0.0001090757914356521, + "loss": 1.076, + "step": 10400 + }, + { + "epoch": 1.0, + "grad_norm": 0.2869093663218051, + "learning_rate": 0.00010906003711876229, + "loss": 0.9683, + "step": 10401 + }, + { + "epoch": 1.0, + "grad_norm": 0.2941519770990777, + "learning_rate": 0.00010904428257513939, + "loss": 1.0158, + "step": 10402 + }, + { + "epoch": 1.0, + "grad_norm": 0.27338008253801777, + "learning_rate": 0.00010902852780517763, + "loss": 1.0386, + "step": 10403 + }, + { + "epoch": 1.0, + "grad_norm": 0.28389255664388, + "learning_rate": 0.00010901277280927124, + "loss": 1.1183, + "step": 10404 + }, + { + "epoch": 1.0, + "grad_norm": 0.3443690879581628, + "learning_rate": 0.0001089970175878146, + "loss": 1.098, + "step": 10405 + }, + { + "epoch": 1.0, + "grad_norm": 0.2551799172999775, + "learning_rate": 0.00010898126214120194, + "loss": 1.0581, + "step": 10406 + }, + { + "epoch": 1.0, + "grad_norm": 0.282716317579218, + "learning_rate": 0.0001089655064698275, + "loss": 0.9715, + "step": 10407 + }, + { + "epoch": 1.0, + "grad_norm": 0.31282993584728686, + "learning_rate": 0.00010894975057408568, + "loss": 1.047, + "step": 10408 + }, + { + "epoch": 1.0, + "grad_norm": 0.30036322798930976, + "learning_rate": 0.00010893399445437071, + "loss": 1.188, + "step": 10409 + }, + { + "epoch": 1.0, + "grad_norm": 0.3325931928877523, + "learning_rate": 0.0001089182381110769, + "loss": 0.9747, + "step": 10410 + }, + { + "epoch": 1.0, + "grad_norm": 0.3117101250717752, + "learning_rate": 0.00010890248154459858, + "loss": 1.0603, + "step": 10411 + }, + { + "epoch": 1.0, + "grad_norm": 0.28934935878323786, + "learning_rate": 0.00010888672475533006, + "loss": 1.093, + "step": 10412 + }, + { + "epoch": 1.0, + "grad_norm": 0.2716220894787029, + "learning_rate": 0.0001088709677436657, + "loss": 1.1221, + "step": 10413 + }, + { + "epoch": 1.0, + "grad_norm": 0.30194803612986443, + "learning_rate": 0.00010885521050999976, + "loss": 1.0063, + "step": 10414 + }, + { + "epoch": 1.0, + "grad_norm": 0.28931680238920227, + "learning_rate": 0.00010883945305472662, + "loss": 0.9928, + "step": 10415 + }, + { + "epoch": 1.0, + "grad_norm": 0.3229463397228268, + "learning_rate": 0.00010882369537824064, + "loss": 1.0712, + "step": 10416 + }, + { + "epoch": 1.0, + "grad_norm": 0.2954911003697874, + "learning_rate": 0.00010880793748093615, + "loss": 1.0046, + "step": 10417 + }, + { + "epoch": 1.0, + "grad_norm": 0.2872966086169036, + "learning_rate": 0.00010879217936320743, + "loss": 1.0814, + "step": 10418 + }, + { + "epoch": 1.0, + "grad_norm": 0.3310289766979793, + "learning_rate": 0.00010877642102544891, + "loss": 0.9988, + "step": 10419 + }, + { + "epoch": 1.0, + "grad_norm": 0.2661925275942424, + "learning_rate": 0.00010876066246805496, + "loss": 1.0533, + "step": 10420 + }, + { + "epoch": 1.0, + "grad_norm": 0.2868056408701368, + "learning_rate": 0.00010874490369141991, + "loss": 1.0454, + "step": 10421 + }, + { + "epoch": 1.0, + "grad_norm": 0.25382291752633346, + "learning_rate": 0.00010872914469593816, + "loss": 1.0787, + "step": 10422 + }, + { + "epoch": 1.0, + "grad_norm": 0.32950064922007866, + "learning_rate": 0.00010871338548200409, + "loss": 1.0575, + "step": 10423 + }, + { + "epoch": 1.0, + "grad_norm": 0.28542084992905475, + "learning_rate": 0.00010869762605001204, + "loss": 1.0273, + "step": 10424 + }, + { + "epoch": 1.0, + "grad_norm": 0.274404919205005, + "learning_rate": 0.00010868186640035645, + "loss": 1.1522, + "step": 10425 + }, + { + "epoch": 1.0, + "grad_norm": 0.27910073576406763, + "learning_rate": 0.0001086661065334317, + "loss": 0.9747, + "step": 10426 + }, + { + "epoch": 1.0, + "grad_norm": 0.2754135797678086, + "learning_rate": 0.00010865034644963219, + "loss": 1.0593, + "step": 10427 + }, + { + "epoch": 1.0, + "grad_norm": 0.3036570067584713, + "learning_rate": 0.00010863458614935228, + "loss": 0.9586, + "step": 10428 + }, + { + "epoch": 1.0, + "grad_norm": 0.25912219351365356, + "learning_rate": 0.00010861882563298648, + "loss": 1.1329, + "step": 10429 + }, + { + "epoch": 1.0, + "grad_norm": 0.30796423013527807, + "learning_rate": 0.00010860306490092916, + "loss": 0.9919, + "step": 10430 + }, + { + "epoch": 1.0, + "grad_norm": 0.2939383687791828, + "learning_rate": 0.00010858730395357468, + "loss": 0.9745, + "step": 10431 + }, + { + "epoch": 1.0, + "grad_norm": 0.27337663742366286, + "learning_rate": 0.00010857154279131754, + "loss": 0.9214, + "step": 10432 + }, + { + "epoch": 1.0, + "grad_norm": 0.26204823094059426, + "learning_rate": 0.00010855578141455216, + "loss": 1.0411, + "step": 10433 + }, + { + "epoch": 1.0, + "grad_norm": 0.26527110818824695, + "learning_rate": 0.00010854001982367296, + "loss": 1.0424, + "step": 10434 + }, + { + "epoch": 1.0, + "grad_norm": 0.25265488574920947, + "learning_rate": 0.00010852425801907442, + "loss": 0.9679, + "step": 10435 + }, + { + "epoch": 1.0, + "grad_norm": 0.26557773202249946, + "learning_rate": 0.00010850849600115096, + "loss": 1.0498, + "step": 10436 + }, + { + "epoch": 1.0, + "grad_norm": 0.3095199638697367, + "learning_rate": 0.00010849273377029705, + "loss": 0.9924, + "step": 10437 + }, + { + "epoch": 1.0, + "grad_norm": 0.25086882514656306, + "learning_rate": 0.00010847697132690713, + "loss": 1.0061, + "step": 10438 + }, + { + "epoch": 1.0, + "grad_norm": 0.26227634701423524, + "learning_rate": 0.00010846120867137567, + "loss": 1.022, + "step": 10439 + }, + { + "epoch": 1.0, + "grad_norm": 0.272747535197043, + "learning_rate": 0.00010844544580409717, + "loss": 1.049, + "step": 10440 + }, + { + "epoch": 1.0, + "grad_norm": 0.30486635134117596, + "learning_rate": 0.00010842968272546603, + "loss": 0.9928, + "step": 10441 + }, + { + "epoch": 1.0, + "grad_norm": 0.2911807248936414, + "learning_rate": 0.00010841391943587682, + "loss": 1.0824, + "step": 10442 + }, + { + "epoch": 1.0, + "grad_norm": 0.24115347100619006, + "learning_rate": 0.00010839815593572398, + "loss": 1.0671, + "step": 10443 + }, + { + "epoch": 1.0, + "grad_norm": 0.2749883873988357, + "learning_rate": 0.00010838239222540203, + "loss": 1.1308, + "step": 10444 + }, + { + "epoch": 1.0, + "grad_norm": 0.25592035706779787, + "learning_rate": 0.00010836662830530539, + "loss": 1.0291, + "step": 10445 + }, + { + "epoch": 1.0, + "grad_norm": 0.27534119534699625, + "learning_rate": 0.00010835086417582867, + "loss": 1.075, + "step": 10446 + }, + { + "epoch": 1.0, + "grad_norm": 0.29613831949975483, + "learning_rate": 0.00010833509983736632, + "loss": 1.0167, + "step": 10447 + }, + { + "epoch": 1.0, + "grad_norm": 0.2971382111384855, + "learning_rate": 0.00010831933529031284, + "loss": 1.0102, + "step": 10448 + }, + { + "epoch": 1.0, + "grad_norm": 0.2833249451857255, + "learning_rate": 0.00010830357053506277, + "loss": 1.1205, + "step": 10449 + }, + { + "epoch": 1.0, + "grad_norm": 0.28978501988105787, + "learning_rate": 0.00010828780557201066, + "loss": 1.0154, + "step": 10450 + }, + { + "epoch": 1.0, + "grad_norm": 0.32646478206285645, + "learning_rate": 0.00010827204040155094, + "loss": 0.9872, + "step": 10451 + }, + { + "epoch": 1.0, + "grad_norm": 0.28028521065676737, + "learning_rate": 0.00010825627502407827, + "loss": 0.9816, + "step": 10452 + }, + { + "epoch": 1.0, + "eval_loss": 1.123805046081543, + "eval_runtime": 4227.9159, + "eval_samples_per_second": 19.778, + "eval_steps_per_second": 2.472, + "step": 10452 + }, + { + "epoch": 1.0, + "grad_norm": 0.29639912410920427, + "learning_rate": 0.0001082405094399871, + "loss": 0.9907, + "step": 10453 + }, + { + "epoch": 1.0, + "grad_norm": 0.2803083164881628, + "learning_rate": 0.000108224743649672, + "loss": 1.0055, + "step": 10454 + }, + { + "epoch": 1.0, + "grad_norm": 0.28220785620723815, + "learning_rate": 0.00010820897765352753, + "loss": 0.8705, + "step": 10455 + }, + { + "epoch": 1.0, + "grad_norm": 0.26187787074552016, + "learning_rate": 0.00010819321145194825, + "loss": 1.0478, + "step": 10456 + }, + { + "epoch": 1.0, + "grad_norm": 0.30642156410488447, + "learning_rate": 0.00010817744504532869, + "loss": 0.9899, + "step": 10457 + }, + { + "epoch": 1.0, + "grad_norm": 0.2822114200472332, + "learning_rate": 0.00010816167843406341, + "loss": 1.0508, + "step": 10458 + }, + { + "epoch": 1.0, + "grad_norm": 0.2991241545458809, + "learning_rate": 0.00010814591161854704, + "loss": 1.0499, + "step": 10459 + }, + { + "epoch": 1.0, + "grad_norm": 0.2705201004160056, + "learning_rate": 0.00010813014459917408, + "loss": 1.0675, + "step": 10460 + }, + { + "epoch": 1.0, + "grad_norm": 0.2614129687537468, + "learning_rate": 0.00010811437737633916, + "loss": 0.9365, + "step": 10461 + }, + { + "epoch": 1.0, + "grad_norm": 0.2487323096899427, + "learning_rate": 0.00010809860995043683, + "loss": 0.9152, + "step": 10462 + }, + { + "epoch": 1.0, + "grad_norm": 0.3086555950388732, + "learning_rate": 0.0001080828423218617, + "loss": 0.9447, + "step": 10463 + }, + { + "epoch": 1.0, + "grad_norm": 0.29043731753857494, + "learning_rate": 0.0001080670744910084, + "loss": 1.0979, + "step": 10464 + }, + { + "epoch": 1.0, + "grad_norm": 0.2905732734908811, + "learning_rate": 0.00010805130645827141, + "loss": 0.9849, + "step": 10465 + }, + { + "epoch": 1.0, + "grad_norm": 0.28806319188983115, + "learning_rate": 0.00010803553822404548, + "loss": 1.1297, + "step": 10466 + }, + { + "epoch": 1.0, + "grad_norm": 0.3107020865126196, + "learning_rate": 0.00010801976978872515, + "loss": 1.0383, + "step": 10467 + }, + { + "epoch": 1.0, + "grad_norm": 0.30348775614812207, + "learning_rate": 0.00010800400115270501, + "loss": 1.0627, + "step": 10468 + }, + { + "epoch": 1.0, + "grad_norm": 0.2866713751972161, + "learning_rate": 0.00010798823231637974, + "loss": 1.0529, + "step": 10469 + }, + { + "epoch": 1.0, + "grad_norm": 0.3273451447470748, + "learning_rate": 0.00010797246328014392, + "loss": 0.9872, + "step": 10470 + }, + { + "epoch": 1.0, + "grad_norm": 0.27952784139827025, + "learning_rate": 0.00010795669404439221, + "loss": 0.9719, + "step": 10471 + }, + { + "epoch": 1.0, + "grad_norm": 0.29404482582036656, + "learning_rate": 0.00010794092460951921, + "loss": 1.115, + "step": 10472 + }, + { + "epoch": 1.0, + "grad_norm": 0.27090225353502845, + "learning_rate": 0.0001079251549759196, + "loss": 1.0585, + "step": 10473 + }, + { + "epoch": 1.0, + "grad_norm": 0.27432480124209896, + "learning_rate": 0.00010790938514398799, + "loss": 1.0617, + "step": 10474 + }, + { + "epoch": 1.0, + "grad_norm": 0.28049252995764806, + "learning_rate": 0.00010789361511411906, + "loss": 1.0838, + "step": 10475 + }, + { + "epoch": 1.0, + "grad_norm": 0.32813550108169526, + "learning_rate": 0.00010787784488670743, + "loss": 1.0085, + "step": 10476 + }, + { + "epoch": 1.0, + "grad_norm": 0.27638338743324226, + "learning_rate": 0.0001078620744621478, + "loss": 1.0194, + "step": 10477 + }, + { + "epoch": 1.0, + "grad_norm": 0.30804374690739317, + "learning_rate": 0.00010784630384083483, + "loss": 0.9786, + "step": 10478 + }, + { + "epoch": 1.0, + "grad_norm": 0.2910163923228523, + "learning_rate": 0.00010783053302316313, + "loss": 1.0401, + "step": 10479 + }, + { + "epoch": 1.0, + "grad_norm": 0.2828500205822147, + "learning_rate": 0.00010781476200952747, + "loss": 1.0473, + "step": 10480 + }, + { + "epoch": 1.0, + "grad_norm": 0.32327348115054194, + "learning_rate": 0.00010779899080032246, + "loss": 1.1565, + "step": 10481 + }, + { + "epoch": 1.0, + "grad_norm": 0.334972794747575, + "learning_rate": 0.00010778321939594279, + "loss": 1.0677, + "step": 10482 + }, + { + "epoch": 1.0, + "grad_norm": 0.32321418292603826, + "learning_rate": 0.00010776744779678316, + "loss": 1.0278, + "step": 10483 + }, + { + "epoch": 1.0, + "grad_norm": 0.3135882690623517, + "learning_rate": 0.0001077516760032383, + "loss": 1.015, + "step": 10484 + }, + { + "epoch": 1.0, + "grad_norm": 0.2874625150372571, + "learning_rate": 0.00010773590401570284, + "loss": 0.9992, + "step": 10485 + }, + { + "epoch": 1.0, + "grad_norm": 0.25604346957628077, + "learning_rate": 0.00010772013183457154, + "loss": 0.9291, + "step": 10486 + }, + { + "epoch": 1.0, + "grad_norm": 0.2866484747749706, + "learning_rate": 0.00010770435946023909, + "loss": 0.9683, + "step": 10487 + }, + { + "epoch": 1.0, + "grad_norm": 0.35487486827557696, + "learning_rate": 0.00010768858689310022, + "loss": 1.0071, + "step": 10488 + }, + { + "epoch": 1.0, + "grad_norm": 0.28682081035848683, + "learning_rate": 0.00010767281413354957, + "loss": 1.1483, + "step": 10489 + }, + { + "epoch": 1.0, + "grad_norm": 0.27960260510706303, + "learning_rate": 0.000107657041181982, + "loss": 1.165, + "step": 10490 + }, + { + "epoch": 1.0, + "grad_norm": 0.3159138216678741, + "learning_rate": 0.00010764126803879211, + "loss": 1.0175, + "step": 10491 + }, + { + "epoch": 1.0, + "grad_norm": 0.3044581665674588, + "learning_rate": 0.00010762549470437469, + "loss": 1.0653, + "step": 10492 + }, + { + "epoch": 1.0, + "grad_norm": 0.27403640469994983, + "learning_rate": 0.0001076097211791245, + "loss": 1.0574, + "step": 10493 + }, + { + "epoch": 1.0, + "grad_norm": 0.29408855681117685, + "learning_rate": 0.00010759394746343621, + "loss": 0.8724, + "step": 10494 + }, + { + "epoch": 1.0, + "grad_norm": 0.27086113032832987, + "learning_rate": 0.00010757817355770466, + "loss": 0.9765, + "step": 10495 + }, + { + "epoch": 1.0, + "grad_norm": 0.29546601202074696, + "learning_rate": 0.00010756239946232452, + "loss": 1.106, + "step": 10496 + }, + { + "epoch": 1.0, + "grad_norm": 0.279769203248748, + "learning_rate": 0.00010754662517769058, + "loss": 0.9804, + "step": 10497 + }, + { + "epoch": 1.0, + "grad_norm": 0.2661421141334049, + "learning_rate": 0.00010753085070419763, + "loss": 1.051, + "step": 10498 + }, + { + "epoch": 1.0, + "grad_norm": 0.2755811574683516, + "learning_rate": 0.00010751507604224038, + "loss": 0.9185, + "step": 10499 + }, + { + "epoch": 1.0, + "grad_norm": 0.32503272024326224, + "learning_rate": 0.00010749930119221365, + "loss": 0.9641, + "step": 10500 + }, + { + "epoch": 1.0, + "grad_norm": 0.27658930012848465, + "learning_rate": 0.0001074835261545122, + "loss": 0.9744, + "step": 10501 + }, + { + "epoch": 1.0, + "grad_norm": 0.3288461529640433, + "learning_rate": 0.00010746775092953076, + "loss": 1.0616, + "step": 10502 + }, + { + "epoch": 1.0, + "grad_norm": 0.2591238791373149, + "learning_rate": 0.00010745197551766422, + "loss": 1.0233, + "step": 10503 + }, + { + "epoch": 1.0, + "grad_norm": 0.3065770893385144, + "learning_rate": 0.00010743619991930728, + "loss": 1.0684, + "step": 10504 + }, + { + "epoch": 1.01, + "grad_norm": 0.2814613360598266, + "learning_rate": 0.00010742042413485479, + "loss": 1.0751, + "step": 10505 + }, + { + "epoch": 1.01, + "grad_norm": 0.30296999590796936, + "learning_rate": 0.00010740464816470147, + "loss": 1.1212, + "step": 10506 + }, + { + "epoch": 1.01, + "grad_norm": 0.30778721762559724, + "learning_rate": 0.00010738887200924221, + "loss": 0.982, + "step": 10507 + }, + { + "epoch": 1.01, + "grad_norm": 0.2859309351560786, + "learning_rate": 0.0001073730956688718, + "loss": 1.0611, + "step": 10508 + }, + { + "epoch": 1.01, + "grad_norm": 0.31642974782828703, + "learning_rate": 0.000107357319143985, + "loss": 1.0782, + "step": 10509 + }, + { + "epoch": 1.01, + "grad_norm": 0.29541534436480793, + "learning_rate": 0.00010734154243497667, + "loss": 0.9578, + "step": 10510 + }, + { + "epoch": 1.01, + "grad_norm": 0.34221830018515814, + "learning_rate": 0.00010732576554224165, + "loss": 1.0397, + "step": 10511 + }, + { + "epoch": 1.01, + "grad_norm": 0.2679732275776767, + "learning_rate": 0.0001073099884661747, + "loss": 1.0726, + "step": 10512 + }, + { + "epoch": 1.01, + "grad_norm": 0.27318632109146107, + "learning_rate": 0.00010729421120717074, + "loss": 0.9772, + "step": 10513 + }, + { + "epoch": 1.01, + "grad_norm": 0.3091991108252378, + "learning_rate": 0.00010727843376562456, + "loss": 1.0546, + "step": 10514 + }, + { + "epoch": 1.01, + "grad_norm": 0.33012671721648684, + "learning_rate": 0.00010726265614193097, + "loss": 1.0365, + "step": 10515 + }, + { + "epoch": 1.01, + "grad_norm": 0.28452819055590395, + "learning_rate": 0.00010724687833648485, + "loss": 0.9811, + "step": 10516 + }, + { + "epoch": 1.01, + "grad_norm": 0.2897717625839458, + "learning_rate": 0.00010723110034968104, + "loss": 0.9655, + "step": 10517 + }, + { + "epoch": 1.01, + "grad_norm": 0.31696048805261545, + "learning_rate": 0.00010721532218191443, + "loss": 1.0671, + "step": 10518 + }, + { + "epoch": 1.01, + "grad_norm": 0.29343898959763104, + "learning_rate": 0.00010719954383357981, + "loss": 1.0869, + "step": 10519 + }, + { + "epoch": 1.01, + "grad_norm": 0.3016620615896671, + "learning_rate": 0.00010718376530507213, + "loss": 1.0106, + "step": 10520 + }, + { + "epoch": 1.01, + "grad_norm": 0.2890262429542911, + "learning_rate": 0.00010716798659678617, + "loss": 1.051, + "step": 10521 + }, + { + "epoch": 1.01, + "grad_norm": 0.3011997317028774, + "learning_rate": 0.00010715220770911687, + "loss": 1.0591, + "step": 10522 + }, + { + "epoch": 1.01, + "grad_norm": 0.2827851935425907, + "learning_rate": 0.00010713642864245905, + "loss": 1.1236, + "step": 10523 + }, + { + "epoch": 1.01, + "grad_norm": 0.2834557938321255, + "learning_rate": 0.00010712064939720763, + "loss": 1.0493, + "step": 10524 + }, + { + "epoch": 1.01, + "grad_norm": 0.2877723965403218, + "learning_rate": 0.00010710486997375749, + "loss": 1.0139, + "step": 10525 + }, + { + "epoch": 1.01, + "grad_norm": 0.2800465872327851, + "learning_rate": 0.0001070890903725035, + "loss": 1.0812, + "step": 10526 + }, + { + "epoch": 1.01, + "grad_norm": 0.29324154831050125, + "learning_rate": 0.00010707331059384056, + "loss": 1.0524, + "step": 10527 + }, + { + "epoch": 1.01, + "grad_norm": 0.2890231260816284, + "learning_rate": 0.00010705753063816359, + "loss": 1.1326, + "step": 10528 + }, + { + "epoch": 1.01, + "grad_norm": 0.273772614161637, + "learning_rate": 0.00010704175050586749, + "loss": 1.2287, + "step": 10529 + }, + { + "epoch": 1.01, + "grad_norm": 0.2632480290724763, + "learning_rate": 0.00010702597019734714, + "loss": 1.0176, + "step": 10530 + }, + { + "epoch": 1.01, + "grad_norm": 0.2881129250056252, + "learning_rate": 0.0001070101897129975, + "loss": 1.063, + "step": 10531 + }, + { + "epoch": 1.01, + "grad_norm": 0.3188054288187872, + "learning_rate": 0.00010699440905321343, + "loss": 1.0637, + "step": 10532 + }, + { + "epoch": 1.01, + "grad_norm": 0.2751484776861917, + "learning_rate": 0.00010697862821838987, + "loss": 1.0734, + "step": 10533 + }, + { + "epoch": 1.01, + "grad_norm": 0.2843859342103799, + "learning_rate": 0.00010696284720892177, + "loss": 0.9541, + "step": 10534 + }, + { + "epoch": 1.01, + "grad_norm": 0.27455505977515676, + "learning_rate": 0.00010694706602520404, + "loss": 1.0755, + "step": 10535 + }, + { + "epoch": 1.01, + "grad_norm": 0.2721441651927573, + "learning_rate": 0.0001069312846676316, + "loss": 1.0389, + "step": 10536 + }, + { + "epoch": 1.01, + "grad_norm": 0.26685246335886553, + "learning_rate": 0.00010691550313659942, + "loss": 1.0029, + "step": 10537 + }, + { + "epoch": 1.01, + "grad_norm": 0.28382233330962003, + "learning_rate": 0.00010689972143250244, + "loss": 0.9487, + "step": 10538 + }, + { + "epoch": 1.01, + "grad_norm": 0.32729762560259823, + "learning_rate": 0.0001068839395557356, + "loss": 0.9003, + "step": 10539 + }, + { + "epoch": 1.01, + "grad_norm": 0.2713103274569532, + "learning_rate": 0.00010686815750669378, + "loss": 1.0844, + "step": 10540 + }, + { + "epoch": 1.01, + "grad_norm": 0.261814845491281, + "learning_rate": 0.00010685237528577208, + "loss": 0.9878, + "step": 10541 + }, + { + "epoch": 1.01, + "grad_norm": 0.2883832357164556, + "learning_rate": 0.00010683659289336536, + "loss": 0.969, + "step": 10542 + }, + { + "epoch": 1.01, + "grad_norm": 0.27295777071208516, + "learning_rate": 0.00010682081032986856, + "loss": 1.0401, + "step": 10543 + }, + { + "epoch": 1.01, + "grad_norm": 0.27723104696302703, + "learning_rate": 0.00010680502759567677, + "loss": 1.1157, + "step": 10544 + }, + { + "epoch": 1.01, + "grad_norm": 0.31203854926340235, + "learning_rate": 0.00010678924469118486, + "loss": 0.9992, + "step": 10545 + }, + { + "epoch": 1.01, + "grad_norm": 0.28490832596454174, + "learning_rate": 0.0001067734616167878, + "loss": 0.9855, + "step": 10546 + }, + { + "epoch": 1.01, + "grad_norm": 0.3056701728717963, + "learning_rate": 0.00010675767837288064, + "loss": 0.9262, + "step": 10547 + }, + { + "epoch": 1.01, + "grad_norm": 0.314926475959889, + "learning_rate": 0.00010674189495985834, + "loss": 0.8889, + "step": 10548 + }, + { + "epoch": 1.01, + "grad_norm": 0.30326228050193105, + "learning_rate": 0.00010672611137811586, + "loss": 1.0098, + "step": 10549 + }, + { + "epoch": 1.01, + "grad_norm": 0.29890442661545713, + "learning_rate": 0.0001067103276280482, + "loss": 1.0034, + "step": 10550 + }, + { + "epoch": 1.01, + "grad_norm": 0.270649023950682, + "learning_rate": 0.0001066945437100504, + "loss": 0.9459, + "step": 10551 + }, + { + "epoch": 1.01, + "grad_norm": 0.28818599874968776, + "learning_rate": 0.00010667875962451746, + "loss": 0.9588, + "step": 10552 + }, + { + "epoch": 1.01, + "grad_norm": 0.2635966292899574, + "learning_rate": 0.0001066629753718443, + "loss": 1.0566, + "step": 10553 + }, + { + "epoch": 1.01, + "grad_norm": 0.3045614432507198, + "learning_rate": 0.00010664719095242603, + "loss": 0.9978, + "step": 10554 + }, + { + "epoch": 1.01, + "grad_norm": 0.3176880731411001, + "learning_rate": 0.00010663140636665765, + "loss": 1.1307, + "step": 10555 + }, + { + "epoch": 1.01, + "grad_norm": 0.3285021663264456, + "learning_rate": 0.00010661562161493411, + "loss": 1.0902, + "step": 10556 + }, + { + "epoch": 1.01, + "grad_norm": 0.3012700234103322, + "learning_rate": 0.00010659983669765052, + "loss": 1.1009, + "step": 10557 + }, + { + "epoch": 1.01, + "grad_norm": 0.31595676725448874, + "learning_rate": 0.00010658405161520184, + "loss": 1.0825, + "step": 10558 + }, + { + "epoch": 1.01, + "grad_norm": 0.2991517860676054, + "learning_rate": 0.00010656826636798314, + "loss": 1.0432, + "step": 10559 + }, + { + "epoch": 1.01, + "grad_norm": 0.3022758257456784, + "learning_rate": 0.00010655248095638944, + "loss": 0.9573, + "step": 10560 + }, + { + "epoch": 1.01, + "grad_norm": 0.3304039459635002, + "learning_rate": 0.00010653669538081581, + "loss": 1.0303, + "step": 10561 + }, + { + "epoch": 1.01, + "grad_norm": 0.2860233237757736, + "learning_rate": 0.00010652090964165726, + "loss": 1.0053, + "step": 10562 + }, + { + "epoch": 1.01, + "grad_norm": 0.3179828954932696, + "learning_rate": 0.00010650512373930884, + "loss": 0.9719, + "step": 10563 + }, + { + "epoch": 1.01, + "grad_norm": 0.31230593369151893, + "learning_rate": 0.00010648933767416562, + "loss": 1.0558, + "step": 10564 + }, + { + "epoch": 1.01, + "grad_norm": 0.30446009654455125, + "learning_rate": 0.00010647355144662263, + "loss": 1.0372, + "step": 10565 + }, + { + "epoch": 1.01, + "grad_norm": 0.2817172027574087, + "learning_rate": 0.00010645776505707498, + "loss": 0.9653, + "step": 10566 + }, + { + "epoch": 1.01, + "grad_norm": 0.31446385954282674, + "learning_rate": 0.00010644197850591766, + "loss": 1.098, + "step": 10567 + }, + { + "epoch": 1.01, + "grad_norm": 0.2850404075952183, + "learning_rate": 0.00010642619179354581, + "loss": 1.0465, + "step": 10568 + }, + { + "epoch": 1.01, + "grad_norm": 0.283651312892408, + "learning_rate": 0.00010641040492035445, + "loss": 1.084, + "step": 10569 + }, + { + "epoch": 1.01, + "grad_norm": 0.27923596756204183, + "learning_rate": 0.00010639461788673868, + "loss": 1.0245, + "step": 10570 + }, + { + "epoch": 1.01, + "grad_norm": 0.28007558242622876, + "learning_rate": 0.00010637883069309361, + "loss": 0.9343, + "step": 10571 + }, + { + "epoch": 1.01, + "grad_norm": 0.3167807420604227, + "learning_rate": 0.00010636304333981427, + "loss": 0.9588, + "step": 10572 + }, + { + "epoch": 1.01, + "grad_norm": 0.2924018426149092, + "learning_rate": 0.00010634725582729574, + "loss": 1.0973, + "step": 10573 + }, + { + "epoch": 1.01, + "grad_norm": 0.2995762497700476, + "learning_rate": 0.0001063314681559332, + "loss": 1.0196, + "step": 10574 + }, + { + "epoch": 1.01, + "grad_norm": 0.2918023810025417, + "learning_rate": 0.00010631568032612167, + "loss": 1.0772, + "step": 10575 + }, + { + "epoch": 1.01, + "grad_norm": 0.25853342905828497, + "learning_rate": 0.00010629989233825626, + "loss": 0.9217, + "step": 10576 + }, + { + "epoch": 1.01, + "grad_norm": 0.2818979151914942, + "learning_rate": 0.0001062841041927321, + "loss": 1.1522, + "step": 10577 + }, + { + "epoch": 1.01, + "grad_norm": 0.2768018817902283, + "learning_rate": 0.00010626831588994424, + "loss": 0.9424, + "step": 10578 + }, + { + "epoch": 1.01, + "grad_norm": 0.31683144914971234, + "learning_rate": 0.00010625252743028788, + "loss": 1.0845, + "step": 10579 + }, + { + "epoch": 1.01, + "grad_norm": 0.30568406025011047, + "learning_rate": 0.00010623673881415808, + "loss": 1.0398, + "step": 10580 + }, + { + "epoch": 1.01, + "grad_norm": 0.31327040614652096, + "learning_rate": 0.00010622095004194996, + "loss": 1.0506, + "step": 10581 + }, + { + "epoch": 1.01, + "grad_norm": 0.31017079216506827, + "learning_rate": 0.00010620516111405868, + "loss": 1.1322, + "step": 10582 + }, + { + "epoch": 1.01, + "grad_norm": 0.29678119497066807, + "learning_rate": 0.00010618937203087932, + "loss": 1.0006, + "step": 10583 + }, + { + "epoch": 1.01, + "grad_norm": 0.2842622361902401, + "learning_rate": 0.00010617358279280704, + "loss": 0.9889, + "step": 10584 + }, + { + "epoch": 1.01, + "grad_norm": 0.33396498782730666, + "learning_rate": 0.000106157793400237, + "loss": 1.0577, + "step": 10585 + }, + { + "epoch": 1.01, + "grad_norm": 0.3160434588561971, + "learning_rate": 0.00010614200385356428, + "loss": 1.1507, + "step": 10586 + }, + { + "epoch": 1.01, + "grad_norm": 0.2810161421935083, + "learning_rate": 0.00010612621415318404, + "loss": 1.0433, + "step": 10587 + }, + { + "epoch": 1.01, + "grad_norm": 0.32114120815595104, + "learning_rate": 0.00010611042429949147, + "loss": 1.0064, + "step": 10588 + }, + { + "epoch": 1.01, + "grad_norm": 0.23632089697469708, + "learning_rate": 0.00010609463429288169, + "loss": 0.9958, + "step": 10589 + }, + { + "epoch": 1.01, + "grad_norm": 0.2647217545159937, + "learning_rate": 0.00010607884413374985, + "loss": 1.0367, + "step": 10590 + }, + { + "epoch": 1.01, + "grad_norm": 0.3005780284928043, + "learning_rate": 0.0001060630538224911, + "loss": 1.1317, + "step": 10591 + }, + { + "epoch": 1.01, + "grad_norm": 0.2965251528814514, + "learning_rate": 0.00010604726335950065, + "loss": 1.0527, + "step": 10592 + }, + { + "epoch": 1.01, + "grad_norm": 0.2525829154577615, + "learning_rate": 0.00010603147274517362, + "loss": 0.9343, + "step": 10593 + }, + { + "epoch": 1.01, + "grad_norm": 0.3133488377271471, + "learning_rate": 0.00010601568197990517, + "loss": 1.0207, + "step": 10594 + }, + { + "epoch": 1.01, + "grad_norm": 0.25317835508097, + "learning_rate": 0.00010599989106409054, + "loss": 0.9165, + "step": 10595 + }, + { + "epoch": 1.01, + "grad_norm": 0.32872702270742477, + "learning_rate": 0.00010598409999812485, + "loss": 1.0802, + "step": 10596 + }, + { + "epoch": 1.01, + "grad_norm": 0.29554748102419565, + "learning_rate": 0.00010596830878240326, + "loss": 0.9491, + "step": 10597 + }, + { + "epoch": 1.01, + "grad_norm": 0.2725979864149322, + "learning_rate": 0.00010595251741732104, + "loss": 1.028, + "step": 10598 + }, + { + "epoch": 1.01, + "grad_norm": 0.30398319997009854, + "learning_rate": 0.00010593672590327328, + "loss": 1.0505, + "step": 10599 + }, + { + "epoch": 1.01, + "grad_norm": 0.30856494962165254, + "learning_rate": 0.00010592093424065528, + "loss": 0.9594, + "step": 10600 + }, + { + "epoch": 1.01, + "grad_norm": 0.3036929135827336, + "learning_rate": 0.00010590514242986213, + "loss": 1.0855, + "step": 10601 + }, + { + "epoch": 1.01, + "grad_norm": 0.28128569475062754, + "learning_rate": 0.00010588935047128912, + "loss": 1.0412, + "step": 10602 + }, + { + "epoch": 1.01, + "grad_norm": 0.30370062816795834, + "learning_rate": 0.0001058735583653314, + "loss": 1.1351, + "step": 10603 + }, + { + "epoch": 1.01, + "grad_norm": 0.2575470929533984, + "learning_rate": 0.00010585776611238416, + "loss": 0.9584, + "step": 10604 + }, + { + "epoch": 1.01, + "grad_norm": 0.27591103520861515, + "learning_rate": 0.00010584197371284267, + "loss": 1.0565, + "step": 10605 + }, + { + "epoch": 1.01, + "grad_norm": 0.2661179370743896, + "learning_rate": 0.00010582618116710214, + "loss": 0.97, + "step": 10606 + }, + { + "epoch": 1.01, + "grad_norm": 0.2911892182776199, + "learning_rate": 0.00010581038847555773, + "loss": 0.9579, + "step": 10607 + }, + { + "epoch": 1.01, + "grad_norm": 0.30431833670601427, + "learning_rate": 0.00010579459563860469, + "loss": 0.9506, + "step": 10608 + }, + { + "epoch": 1.01, + "grad_norm": 0.2946821778047032, + "learning_rate": 0.00010577880265663829, + "loss": 0.9962, + "step": 10609 + }, + { + "epoch": 1.02, + "grad_norm": 0.3538251465062118, + "learning_rate": 0.00010576300953005372, + "loss": 1.001, + "step": 10610 + }, + { + "epoch": 1.02, + "grad_norm": 0.3153003556734997, + "learning_rate": 0.00010574721625924618, + "loss": 1.1265, + "step": 10611 + }, + { + "epoch": 1.02, + "grad_norm": 0.28491308996329834, + "learning_rate": 0.00010573142284461095, + "loss": 0.9945, + "step": 10612 + }, + { + "epoch": 1.02, + "grad_norm": 0.3178028949473492, + "learning_rate": 0.00010571562928654328, + "loss": 1.0415, + "step": 10613 + }, + { + "epoch": 1.02, + "grad_norm": 0.3079240854545629, + "learning_rate": 0.00010569983558543837, + "loss": 1.026, + "step": 10614 + }, + { + "epoch": 1.02, + "grad_norm": 0.2634763977806796, + "learning_rate": 0.00010568404174169151, + "loss": 1.0127, + "step": 10615 + }, + { + "epoch": 1.02, + "grad_norm": 0.34246526889020235, + "learning_rate": 0.00010566824775569792, + "loss": 0.9971, + "step": 10616 + }, + { + "epoch": 1.02, + "grad_norm": 0.29370250854624336, + "learning_rate": 0.00010565245362785288, + "loss": 1.0162, + "step": 10617 + }, + { + "epoch": 1.02, + "grad_norm": 0.309392986933538, + "learning_rate": 0.00010563665935855162, + "loss": 1.0153, + "step": 10618 + }, + { + "epoch": 1.02, + "grad_norm": 0.29678503910350873, + "learning_rate": 0.00010562086494818943, + "loss": 1.1059, + "step": 10619 + }, + { + "epoch": 1.02, + "grad_norm": 0.3353091960850999, + "learning_rate": 0.00010560507039716156, + "loss": 0.9912, + "step": 10620 + }, + { + "epoch": 1.02, + "grad_norm": 0.27407053368211787, + "learning_rate": 0.00010558927570586326, + "loss": 1.1091, + "step": 10621 + }, + { + "epoch": 1.02, + "grad_norm": 0.29426380722568135, + "learning_rate": 0.00010557348087468985, + "loss": 1.0746, + "step": 10622 + }, + { + "epoch": 1.02, + "grad_norm": 0.25833469409879717, + "learning_rate": 0.00010555768590403659, + "loss": 0.9799, + "step": 10623 + }, + { + "epoch": 1.02, + "grad_norm": 0.31062518487110347, + "learning_rate": 0.0001055418907942987, + "loss": 1.0788, + "step": 10624 + }, + { + "epoch": 1.02, + "grad_norm": 0.3192725237980823, + "learning_rate": 0.00010552609554587157, + "loss": 0.996, + "step": 10625 + }, + { + "epoch": 1.02, + "grad_norm": 0.315989138692609, + "learning_rate": 0.00010551030015915038, + "loss": 0.9851, + "step": 10626 + }, + { + "epoch": 1.02, + "grad_norm": 0.27751318878003567, + "learning_rate": 0.0001054945046345305, + "loss": 0.9926, + "step": 10627 + }, + { + "epoch": 1.02, + "grad_norm": 0.26385440905124996, + "learning_rate": 0.00010547870897240714, + "loss": 0.9903, + "step": 10628 + }, + { + "epoch": 1.02, + "grad_norm": 0.2934442085643062, + "learning_rate": 0.0001054629131731757, + "loss": 0.9892, + "step": 10629 + }, + { + "epoch": 1.02, + "grad_norm": 0.2970048445662831, + "learning_rate": 0.00010544711723723139, + "loss": 0.9469, + "step": 10630 + }, + { + "epoch": 1.02, + "grad_norm": 0.2675312540854728, + "learning_rate": 0.00010543132116496954, + "loss": 0.969, + "step": 10631 + }, + { + "epoch": 1.02, + "grad_norm": 0.29249170170998573, + "learning_rate": 0.00010541552495678549, + "loss": 0.9695, + "step": 10632 + }, + { + "epoch": 1.02, + "grad_norm": 0.3240128896263728, + "learning_rate": 0.00010539972861307452, + "loss": 0.9649, + "step": 10633 + }, + { + "epoch": 1.02, + "grad_norm": 0.3273474129118629, + "learning_rate": 0.00010538393213423192, + "loss": 0.9534, + "step": 10634 + }, + { + "epoch": 1.02, + "grad_norm": 0.29411632003394983, + "learning_rate": 0.00010536813552065307, + "loss": 1.0242, + "step": 10635 + }, + { + "epoch": 1.02, + "grad_norm": 0.30809535579100994, + "learning_rate": 0.00010535233877273324, + "loss": 1.0368, + "step": 10636 + }, + { + "epoch": 1.02, + "grad_norm": 0.3349466364379998, + "learning_rate": 0.00010533654189086779, + "loss": 0.9496, + "step": 10637 + }, + { + "epoch": 1.02, + "grad_norm": 0.2843017595730202, + "learning_rate": 0.00010532074487545201, + "loss": 1.0279, + "step": 10638 + }, + { + "epoch": 1.02, + "grad_norm": 0.3230195425665935, + "learning_rate": 0.00010530494772688125, + "loss": 1.0454, + "step": 10639 + }, + { + "epoch": 1.02, + "grad_norm": 0.282806797852005, + "learning_rate": 0.0001052891504455508, + "loss": 0.9824, + "step": 10640 + }, + { + "epoch": 1.02, + "grad_norm": 0.3043758305076119, + "learning_rate": 0.00010527335303185607, + "loss": 0.8774, + "step": 10641 + }, + { + "epoch": 1.02, + "grad_norm": 0.3265050479565585, + "learning_rate": 0.00010525755548619233, + "loss": 1.0144, + "step": 10642 + }, + { + "epoch": 1.02, + "grad_norm": 0.2665053716692157, + "learning_rate": 0.00010524175780895502, + "loss": 1.1015, + "step": 10643 + }, + { + "epoch": 1.02, + "grad_norm": 0.27038653290647474, + "learning_rate": 0.00010522596000053941, + "loss": 1.009, + "step": 10644 + }, + { + "epoch": 1.02, + "grad_norm": 0.286021555387563, + "learning_rate": 0.00010521016206134085, + "loss": 1.0968, + "step": 10645 + }, + { + "epoch": 1.02, + "grad_norm": 0.3346738333015948, + "learning_rate": 0.0001051943639917547, + "loss": 1.1447, + "step": 10646 + }, + { + "epoch": 1.02, + "grad_norm": 0.27292515432978154, + "learning_rate": 0.00010517856579217635, + "loss": 1.1505, + "step": 10647 + }, + { + "epoch": 1.02, + "grad_norm": 0.30010396180362503, + "learning_rate": 0.00010516276746300112, + "loss": 1.0886, + "step": 10648 + }, + { + "epoch": 1.02, + "grad_norm": 0.26261274212444535, + "learning_rate": 0.0001051469690046244, + "loss": 1.0966, + "step": 10649 + }, + { + "epoch": 1.02, + "grad_norm": 0.2916901126902084, + "learning_rate": 0.00010513117041744156, + "loss": 0.9437, + "step": 10650 + }, + { + "epoch": 1.02, + "grad_norm": 0.2841361245485583, + "learning_rate": 0.0001051153717018479, + "loss": 0.9242, + "step": 10651 + }, + { + "epoch": 1.02, + "grad_norm": 0.3123713822296836, + "learning_rate": 0.0001050995728582389, + "loss": 1.0902, + "step": 10652 + }, + { + "epoch": 1.02, + "grad_norm": 0.3028595246611414, + "learning_rate": 0.00010508377388700989, + "loss": 0.9975, + "step": 10653 + }, + { + "epoch": 1.02, + "grad_norm": 0.3027447290979864, + "learning_rate": 0.00010506797478855623, + "loss": 0.9977, + "step": 10654 + }, + { + "epoch": 1.02, + "grad_norm": 0.29891167007608194, + "learning_rate": 0.0001050521755632733, + "loss": 1.0605, + "step": 10655 + }, + { + "epoch": 1.02, + "grad_norm": 0.290896098176422, + "learning_rate": 0.00010503637621155652, + "loss": 0.9992, + "step": 10656 + }, + { + "epoch": 1.02, + "grad_norm": 0.28361853061687564, + "learning_rate": 0.00010502057673380125, + "loss": 1.0126, + "step": 10657 + }, + { + "epoch": 1.02, + "grad_norm": 0.27076011600498245, + "learning_rate": 0.00010500477713040289, + "loss": 0.9373, + "step": 10658 + }, + { + "epoch": 1.02, + "grad_norm": 0.33337216872039366, + "learning_rate": 0.00010498897740175684, + "loss": 1.043, + "step": 10659 + }, + { + "epoch": 1.02, + "grad_norm": 0.28306736027683893, + "learning_rate": 0.00010497317754825851, + "loss": 0.9406, + "step": 10660 + }, + { + "epoch": 1.02, + "grad_norm": 0.295085913977975, + "learning_rate": 0.00010495737757030326, + "loss": 1.145, + "step": 10661 + }, + { + "epoch": 1.02, + "grad_norm": 0.302522894369919, + "learning_rate": 0.00010494157746828655, + "loss": 1.0593, + "step": 10662 + }, + { + "epoch": 1.02, + "grad_norm": 0.30772741883901095, + "learning_rate": 0.00010492577724260371, + "loss": 1.0847, + "step": 10663 + }, + { + "epoch": 1.02, + "grad_norm": 0.31103678684542835, + "learning_rate": 0.00010490997689365025, + "loss": 0.931, + "step": 10664 + }, + { + "epoch": 1.02, + "grad_norm": 0.29135300257547003, + "learning_rate": 0.00010489417642182149, + "loss": 1.0858, + "step": 10665 + }, + { + "epoch": 1.02, + "grad_norm": 0.28936031685763214, + "learning_rate": 0.00010487837582751292, + "loss": 1.0685, + "step": 10666 + }, + { + "epoch": 1.02, + "grad_norm": 0.27709420150471, + "learning_rate": 0.00010486257511111991, + "loss": 0.9178, + "step": 10667 + }, + { + "epoch": 1.02, + "grad_norm": 0.29977112029256253, + "learning_rate": 0.0001048467742730379, + "loss": 1.017, + "step": 10668 + }, + { + "epoch": 1.02, + "grad_norm": 0.3240468749221804, + "learning_rate": 0.00010483097331366236, + "loss": 1.0078, + "step": 10669 + }, + { + "epoch": 1.02, + "grad_norm": 0.2965625821622611, + "learning_rate": 0.00010481517223338863, + "loss": 1.0083, + "step": 10670 + }, + { + "epoch": 1.02, + "grad_norm": 0.2603992936010485, + "learning_rate": 0.00010479937103261224, + "loss": 1.0853, + "step": 10671 + }, + { + "epoch": 1.02, + "grad_norm": 0.3051486217902884, + "learning_rate": 0.0001047835697117285, + "loss": 1.0453, + "step": 10672 + }, + { + "epoch": 1.02, + "grad_norm": 0.287363920870317, + "learning_rate": 0.000104767768271133, + "loss": 0.9786, + "step": 10673 + }, + { + "epoch": 1.02, + "grad_norm": 0.2889366035543418, + "learning_rate": 0.00010475196671122106, + "loss": 0.9687, + "step": 10674 + }, + { + "epoch": 1.02, + "grad_norm": 0.30073277334609233, + "learning_rate": 0.00010473616503238814, + "loss": 1.0212, + "step": 10675 + }, + { + "epoch": 1.02, + "grad_norm": 0.2939076158913809, + "learning_rate": 0.00010472036323502976, + "loss": 1.0627, + "step": 10676 + }, + { + "epoch": 1.02, + "grad_norm": 0.2923840707523673, + "learning_rate": 0.00010470456131954132, + "loss": 0.9532, + "step": 10677 + }, + { + "epoch": 1.02, + "grad_norm": 0.2817621657201065, + "learning_rate": 0.00010468875928631825, + "loss": 1.1329, + "step": 10678 + }, + { + "epoch": 1.02, + "grad_norm": 0.29952095021049363, + "learning_rate": 0.00010467295713575605, + "loss": 0.956, + "step": 10679 + }, + { + "epoch": 1.02, + "grad_norm": 0.33017449742336674, + "learning_rate": 0.00010465715486825017, + "loss": 0.9661, + "step": 10680 + }, + { + "epoch": 1.02, + "grad_norm": 0.2677311275383132, + "learning_rate": 0.00010464135248419602, + "loss": 1.0562, + "step": 10681 + }, + { + "epoch": 1.02, + "grad_norm": 0.30477794735713265, + "learning_rate": 0.00010462554998398915, + "loss": 1.0799, + "step": 10682 + }, + { + "epoch": 1.02, + "grad_norm": 0.3013040397211361, + "learning_rate": 0.00010460974736802496, + "loss": 1.1095, + "step": 10683 + }, + { + "epoch": 1.02, + "grad_norm": 0.29447328658909416, + "learning_rate": 0.00010459394463669896, + "loss": 1.0598, + "step": 10684 + }, + { + "epoch": 1.02, + "grad_norm": 0.29986400101599975, + "learning_rate": 0.00010457814179040659, + "loss": 0.956, + "step": 10685 + }, + { + "epoch": 1.02, + "grad_norm": 0.2923759797169043, + "learning_rate": 0.00010456233882954335, + "loss": 0.9665, + "step": 10686 + }, + { + "epoch": 1.02, + "grad_norm": 0.32869187012512935, + "learning_rate": 0.00010454653575450472, + "loss": 1.1011, + "step": 10687 + }, + { + "epoch": 1.02, + "grad_norm": 0.2885541988659791, + "learning_rate": 0.00010453073256568619, + "loss": 0.962, + "step": 10688 + }, + { + "epoch": 1.02, + "grad_norm": 0.2842674327072589, + "learning_rate": 0.0001045149292634832, + "loss": 1.0207, + "step": 10689 + }, + { + "epoch": 1.02, + "grad_norm": 0.25852926590072545, + "learning_rate": 0.00010449912584829132, + "loss": 0.9613, + "step": 10690 + }, + { + "epoch": 1.02, + "grad_norm": 0.3031203931702311, + "learning_rate": 0.00010448332232050595, + "loss": 0.8787, + "step": 10691 + }, + { + "epoch": 1.02, + "grad_norm": 0.2954944413816486, + "learning_rate": 0.0001044675186805226, + "loss": 1.0912, + "step": 10692 + }, + { + "epoch": 1.02, + "grad_norm": 0.311909180651197, + "learning_rate": 0.00010445171492873682, + "loss": 1.0584, + "step": 10693 + }, + { + "epoch": 1.02, + "grad_norm": 0.29806371805617865, + "learning_rate": 0.00010443591106554406, + "loss": 0.9583, + "step": 10694 + }, + { + "epoch": 1.02, + "grad_norm": 0.3295497783709313, + "learning_rate": 0.00010442010709133984, + "loss": 1.0521, + "step": 10695 + }, + { + "epoch": 1.02, + "grad_norm": 0.32343433426583795, + "learning_rate": 0.0001044043030065197, + "loss": 1.0705, + "step": 10696 + }, + { + "epoch": 1.02, + "grad_norm": 0.3142129481444155, + "learning_rate": 0.00010438849881147908, + "loss": 0.9875, + "step": 10697 + }, + { + "epoch": 1.02, + "grad_norm": 0.3193881211197199, + "learning_rate": 0.00010437269450661351, + "loss": 1.0379, + "step": 10698 + }, + { + "epoch": 1.02, + "grad_norm": 0.301208348981727, + "learning_rate": 0.00010435689009231851, + "loss": 1.0478, + "step": 10699 + }, + { + "epoch": 1.02, + "grad_norm": 0.30162005958999033, + "learning_rate": 0.00010434108556898961, + "loss": 0.947, + "step": 10700 + }, + { + "epoch": 1.02, + "grad_norm": 0.28239783699308063, + "learning_rate": 0.00010432528093702232, + "loss": 1.0402, + "step": 10701 + }, + { + "epoch": 1.02, + "grad_norm": 0.28301114338761785, + "learning_rate": 0.00010430947619681213, + "loss": 1.1518, + "step": 10702 + }, + { + "epoch": 1.02, + "grad_norm": 0.4739151683381904, + "learning_rate": 0.00010429367134875461, + "loss": 1.4219, + "step": 10703 + }, + { + "epoch": 1.02, + "grad_norm": 0.29800367774324227, + "learning_rate": 0.00010427786639324524, + "loss": 1.0607, + "step": 10704 + }, + { + "epoch": 1.02, + "grad_norm": 0.3016406250085099, + "learning_rate": 0.00010426206133067959, + "loss": 0.9957, + "step": 10705 + }, + { + "epoch": 1.02, + "grad_norm": 0.3248438185612142, + "learning_rate": 0.00010424625616145318, + "loss": 0.9672, + "step": 10706 + }, + { + "epoch": 1.02, + "grad_norm": 0.29367789172980335, + "learning_rate": 0.00010423045088596155, + "loss": 1.0007, + "step": 10707 + }, + { + "epoch": 1.02, + "grad_norm": 0.2983586566501756, + "learning_rate": 0.00010421464550460023, + "loss": 0.9435, + "step": 10708 + }, + { + "epoch": 1.02, + "grad_norm": 0.29716329077619436, + "learning_rate": 0.00010419884001776475, + "loss": 1.0453, + "step": 10709 + }, + { + "epoch": 1.02, + "grad_norm": 0.2696736812821209, + "learning_rate": 0.00010418303442585064, + "loss": 0.9272, + "step": 10710 + }, + { + "epoch": 1.02, + "grad_norm": 0.3269246168812922, + "learning_rate": 0.00010416722872925351, + "loss": 0.9214, + "step": 10711 + }, + { + "epoch": 1.02, + "grad_norm": 0.3429771881665985, + "learning_rate": 0.00010415142292836881, + "loss": 1.0104, + "step": 10712 + }, + { + "epoch": 1.02, + "grad_norm": 0.30073446194863124, + "learning_rate": 0.00010413561702359218, + "loss": 0.9899, + "step": 10713 + }, + { + "epoch": 1.03, + "grad_norm": 0.35001849702710186, + "learning_rate": 0.00010411981101531914, + "loss": 0.9315, + "step": 10714 + }, + { + "epoch": 1.03, + "grad_norm": 0.30124061028128457, + "learning_rate": 0.00010410400490394523, + "loss": 1.0858, + "step": 10715 + }, + { + "epoch": 1.03, + "grad_norm": 0.2971517413684335, + "learning_rate": 0.00010408819868986599, + "loss": 1.1092, + "step": 10716 + }, + { + "epoch": 1.03, + "grad_norm": 0.32694344046187873, + "learning_rate": 0.00010407239237347703, + "loss": 1.0487, + "step": 10717 + }, + { + "epoch": 1.03, + "grad_norm": 0.2930788143580028, + "learning_rate": 0.00010405658595517392, + "loss": 0.9528, + "step": 10718 + }, + { + "epoch": 1.03, + "grad_norm": 0.27990747948430694, + "learning_rate": 0.00010404077943535213, + "loss": 1.101, + "step": 10719 + }, + { + "epoch": 1.03, + "grad_norm": 0.27225881950185465, + "learning_rate": 0.00010402497281440735, + "loss": 1.005, + "step": 10720 + }, + { + "epoch": 1.03, + "grad_norm": 0.3373329248171265, + "learning_rate": 0.00010400916609273508, + "loss": 0.9353, + "step": 10721 + }, + { + "epoch": 1.03, + "grad_norm": 0.3233077602943812, + "learning_rate": 0.0001039933592707309, + "loss": 0.9416, + "step": 10722 + }, + { + "epoch": 1.03, + "grad_norm": 0.3133195823459084, + "learning_rate": 0.00010397755234879041, + "loss": 1.1353, + "step": 10723 + }, + { + "epoch": 1.03, + "grad_norm": 0.3063111863472075, + "learning_rate": 0.00010396174532730915, + "loss": 1.0607, + "step": 10724 + }, + { + "epoch": 1.03, + "grad_norm": 0.32068844383905254, + "learning_rate": 0.00010394593820668275, + "loss": 0.9932, + "step": 10725 + }, + { + "epoch": 1.03, + "grad_norm": 0.2915049207023837, + "learning_rate": 0.00010393013098730672, + "loss": 1.0547, + "step": 10726 + }, + { + "epoch": 1.03, + "grad_norm": 0.32715438229102545, + "learning_rate": 0.00010391432366957674, + "loss": 1.0404, + "step": 10727 + }, + { + "epoch": 1.03, + "grad_norm": 0.29732355289874624, + "learning_rate": 0.00010389851625388835, + "loss": 0.9203, + "step": 10728 + }, + { + "epoch": 1.03, + "grad_norm": 0.3465678305513601, + "learning_rate": 0.00010388270874063712, + "loss": 1.0947, + "step": 10729 + }, + { + "epoch": 1.03, + "grad_norm": 0.2814064313020933, + "learning_rate": 0.00010386690113021866, + "loss": 0.9988, + "step": 10730 + }, + { + "epoch": 1.03, + "grad_norm": 0.27392066376075563, + "learning_rate": 0.00010385109342302861, + "loss": 1.148, + "step": 10731 + }, + { + "epoch": 1.03, + "grad_norm": 0.340865613197669, + "learning_rate": 0.0001038352856194625, + "loss": 0.9641, + "step": 10732 + }, + { + "epoch": 1.03, + "grad_norm": 0.3097266033426417, + "learning_rate": 0.00010381947771991594, + "loss": 1.1042, + "step": 10733 + }, + { + "epoch": 1.03, + "grad_norm": 0.31442337880164023, + "learning_rate": 0.00010380366972478456, + "loss": 0.9651, + "step": 10734 + }, + { + "epoch": 1.03, + "grad_norm": 0.32102779734128245, + "learning_rate": 0.00010378786163446397, + "loss": 1.0644, + "step": 10735 + }, + { + "epoch": 1.03, + "grad_norm": 0.27588296310641836, + "learning_rate": 0.00010377205344934973, + "loss": 1.1013, + "step": 10736 + }, + { + "epoch": 1.03, + "grad_norm": 0.3123335024333068, + "learning_rate": 0.0001037562451698375, + "loss": 1.0448, + "step": 10737 + }, + { + "epoch": 1.03, + "grad_norm": 0.33780152633739785, + "learning_rate": 0.0001037404367963229, + "loss": 1.0009, + "step": 10738 + }, + { + "epoch": 1.03, + "grad_norm": 0.26383462801709284, + "learning_rate": 0.00010372462832920147, + "loss": 1.0365, + "step": 10739 + }, + { + "epoch": 1.03, + "grad_norm": 0.32317384864547366, + "learning_rate": 0.00010370881976886891, + "loss": 1.0352, + "step": 10740 + }, + { + "epoch": 1.03, + "grad_norm": 0.30778101723438633, + "learning_rate": 0.00010369301111572081, + "loss": 1.0034, + "step": 10741 + }, + { + "epoch": 1.03, + "grad_norm": 0.2855560749913106, + "learning_rate": 0.00010367720237015277, + "loss": 1.1115, + "step": 10742 + }, + { + "epoch": 1.03, + "grad_norm": 0.2652938911811925, + "learning_rate": 0.00010366139353256042, + "loss": 0.8872, + "step": 10743 + }, + { + "epoch": 1.03, + "grad_norm": 0.3262077138942635, + "learning_rate": 0.0001036455846033394, + "loss": 1.0192, + "step": 10744 + }, + { + "epoch": 1.03, + "grad_norm": 0.27625963914593427, + "learning_rate": 0.00010362977558288533, + "loss": 0.9853, + "step": 10745 + }, + { + "epoch": 1.03, + "grad_norm": 0.27196875040209284, + "learning_rate": 0.00010361396647159388, + "loss": 1.1458, + "step": 10746 + }, + { + "epoch": 1.03, + "grad_norm": 0.26751130167072784, + "learning_rate": 0.0001035981572698606, + "loss": 1.0489, + "step": 10747 + }, + { + "epoch": 1.03, + "grad_norm": 0.2758925415156718, + "learning_rate": 0.00010358234797808119, + "loss": 0.9932, + "step": 10748 + }, + { + "epoch": 1.03, + "grad_norm": 0.29867374880947756, + "learning_rate": 0.00010356653859665129, + "loss": 1.1153, + "step": 10749 + }, + { + "epoch": 1.03, + "grad_norm": 0.3257821562872946, + "learning_rate": 0.00010355072912596648, + "loss": 0.9278, + "step": 10750 + }, + { + "epoch": 1.03, + "grad_norm": 0.3265435300923386, + "learning_rate": 0.00010353491956642247, + "loss": 1.0361, + "step": 10751 + }, + { + "epoch": 1.03, + "grad_norm": 0.34401137041720287, + "learning_rate": 0.00010351910991841489, + "loss": 1.0791, + "step": 10752 + }, + { + "epoch": 1.03, + "grad_norm": 0.2951193467092975, + "learning_rate": 0.00010350330018233934, + "loss": 0.9878, + "step": 10753 + }, + { + "epoch": 1.03, + "grad_norm": 0.2928731276116953, + "learning_rate": 0.00010348749035859153, + "loss": 0.9585, + "step": 10754 + }, + { + "epoch": 1.03, + "grad_norm": 0.27288909902840447, + "learning_rate": 0.00010347168044756708, + "loss": 1.1302, + "step": 10755 + }, + { + "epoch": 1.03, + "grad_norm": 0.27074340683913134, + "learning_rate": 0.00010345587044966161, + "loss": 0.9533, + "step": 10756 + }, + { + "epoch": 1.03, + "grad_norm": 0.30830518043506033, + "learning_rate": 0.00010344006036527085, + "loss": 1.0573, + "step": 10757 + }, + { + "epoch": 1.03, + "grad_norm": 0.26623997426416124, + "learning_rate": 0.0001034242501947904, + "loss": 1.0606, + "step": 10758 + }, + { + "epoch": 1.03, + "grad_norm": 0.2997860936927183, + "learning_rate": 0.00010340843993861596, + "loss": 1.1109, + "step": 10759 + }, + { + "epoch": 1.03, + "grad_norm": 0.27594451338601317, + "learning_rate": 0.00010339262959714311, + "loss": 0.9936, + "step": 10760 + }, + { + "epoch": 1.03, + "grad_norm": 0.28136995779958734, + "learning_rate": 0.00010337681917076763, + "loss": 1.0036, + "step": 10761 + }, + { + "epoch": 1.03, + "grad_norm": 0.28292468913070634, + "learning_rate": 0.00010336100865988512, + "loss": 1.1285, + "step": 10762 + }, + { + "epoch": 1.03, + "grad_norm": 0.2724722146633755, + "learning_rate": 0.00010334519806489122, + "loss": 0.9742, + "step": 10763 + }, + { + "epoch": 1.03, + "grad_norm": 0.31114546757293443, + "learning_rate": 0.00010332938738618166, + "loss": 1.1156, + "step": 10764 + }, + { + "epoch": 1.03, + "grad_norm": 0.29334139679974747, + "learning_rate": 0.00010331357662415209, + "loss": 0.9573, + "step": 10765 + }, + { + "epoch": 1.03, + "grad_norm": 0.28232419321718283, + "learning_rate": 0.00010329776577919816, + "loss": 1.0625, + "step": 10766 + }, + { + "epoch": 1.03, + "grad_norm": 0.29489926600521627, + "learning_rate": 0.00010328195485171559, + "loss": 1.1003, + "step": 10767 + }, + { + "epoch": 1.03, + "grad_norm": 0.3020115608405508, + "learning_rate": 0.00010326614384210005, + "loss": 1.0903, + "step": 10768 + }, + { + "epoch": 1.03, + "grad_norm": 0.3266832967206907, + "learning_rate": 0.00010325033275074718, + "loss": 1.1303, + "step": 10769 + }, + { + "epoch": 1.03, + "grad_norm": 0.3295610728186336, + "learning_rate": 0.00010323452157805269, + "loss": 1.0864, + "step": 10770 + }, + { + "epoch": 1.03, + "grad_norm": 0.2766271639649193, + "learning_rate": 0.0001032187103244123, + "loss": 1.0284, + "step": 10771 + }, + { + "epoch": 1.03, + "grad_norm": 0.3426312167992186, + "learning_rate": 0.00010320289899022164, + "loss": 1.0299, + "step": 10772 + }, + { + "epoch": 1.03, + "grad_norm": 0.3778742393532663, + "learning_rate": 0.0001031870875758764, + "loss": 0.9261, + "step": 10773 + }, + { + "epoch": 1.03, + "grad_norm": 0.30542068377079806, + "learning_rate": 0.00010317127608177233, + "loss": 1.0832, + "step": 10774 + }, + { + "epoch": 1.03, + "grad_norm": 0.3169746975373242, + "learning_rate": 0.00010315546450830507, + "loss": 1.0967, + "step": 10775 + }, + { + "epoch": 1.03, + "grad_norm": 0.30531080576109226, + "learning_rate": 0.00010313965285587033, + "loss": 1.0219, + "step": 10776 + }, + { + "epoch": 1.03, + "grad_norm": 0.33416723424029476, + "learning_rate": 0.00010312384112486378, + "loss": 1.0873, + "step": 10777 + }, + { + "epoch": 1.03, + "grad_norm": 0.3042727747578678, + "learning_rate": 0.00010310802931568115, + "loss": 0.9928, + "step": 10778 + }, + { + "epoch": 1.03, + "grad_norm": 0.28498566345598364, + "learning_rate": 0.00010309221742871815, + "loss": 1.0425, + "step": 10779 + }, + { + "epoch": 1.03, + "grad_norm": 0.3205601317659082, + "learning_rate": 0.00010307640546437045, + "loss": 1.1003, + "step": 10780 + }, + { + "epoch": 1.03, + "grad_norm": 0.2819975935983172, + "learning_rate": 0.00010306059342303378, + "loss": 1.0121, + "step": 10781 + }, + { + "epoch": 1.03, + "grad_norm": 0.3116324515915003, + "learning_rate": 0.00010304478130510384, + "loss": 0.9589, + "step": 10782 + }, + { + "epoch": 1.03, + "grad_norm": 0.2975584101059, + "learning_rate": 0.00010302896911097633, + "loss": 0.9727, + "step": 10783 + }, + { + "epoch": 1.03, + "grad_norm": 0.29772956437354287, + "learning_rate": 0.00010301315684104693, + "loss": 1.081, + "step": 10784 + }, + { + "epoch": 1.03, + "grad_norm": 0.2680356349334504, + "learning_rate": 0.00010299734449571143, + "loss": 1.1008, + "step": 10785 + }, + { + "epoch": 1.03, + "grad_norm": 0.31579838424812323, + "learning_rate": 0.00010298153207536548, + "loss": 1.1605, + "step": 10786 + }, + { + "epoch": 1.03, + "grad_norm": 0.3279994084718613, + "learning_rate": 0.00010296571958040481, + "loss": 1.0504, + "step": 10787 + }, + { + "epoch": 1.03, + "grad_norm": 0.29758900584038767, + "learning_rate": 0.00010294990701122515, + "loss": 0.9593, + "step": 10788 + }, + { + "epoch": 1.03, + "grad_norm": 0.3108457120763608, + "learning_rate": 0.00010293409436822224, + "loss": 1.1341, + "step": 10789 + }, + { + "epoch": 1.03, + "grad_norm": 0.29142329962369634, + "learning_rate": 0.00010291828165179174, + "loss": 1.0805, + "step": 10790 + }, + { + "epoch": 1.03, + "grad_norm": 0.24857988026840014, + "learning_rate": 0.00010290246886232941, + "loss": 1.0518, + "step": 10791 + }, + { + "epoch": 1.03, + "grad_norm": 0.2725623943242712, + "learning_rate": 0.000102886656000231, + "loss": 0.9264, + "step": 10792 + }, + { + "epoch": 1.03, + "grad_norm": 0.3148137080741391, + "learning_rate": 0.00010287084306589221, + "loss": 1.0303, + "step": 10793 + }, + { + "epoch": 1.03, + "grad_norm": 0.2828941280105932, + "learning_rate": 0.00010285503005970874, + "loss": 0.9612, + "step": 10794 + }, + { + "epoch": 1.03, + "grad_norm": 0.3337110472597963, + "learning_rate": 0.00010283921698207635, + "loss": 1.1169, + "step": 10795 + }, + { + "epoch": 1.03, + "grad_norm": 0.2804647581987878, + "learning_rate": 0.0001028234038333908, + "loss": 0.8892, + "step": 10796 + }, + { + "epoch": 1.03, + "grad_norm": 0.32150954472875076, + "learning_rate": 0.00010280759061404775, + "loss": 0.9924, + "step": 10797 + }, + { + "epoch": 1.03, + "grad_norm": 0.30471676465834124, + "learning_rate": 0.00010279177732444301, + "loss": 1.0093, + "step": 10798 + }, + { + "epoch": 1.03, + "grad_norm": 0.3131695029072164, + "learning_rate": 0.00010277596396497229, + "loss": 1.1155, + "step": 10799 + }, + { + "epoch": 1.03, + "grad_norm": 0.3092578623321237, + "learning_rate": 0.00010276015053603133, + "loss": 1.0783, + "step": 10800 + }, + { + "epoch": 1.03, + "grad_norm": 0.30795276260962456, + "learning_rate": 0.00010274433703801584, + "loss": 1.0172, + "step": 10801 + }, + { + "epoch": 1.03, + "grad_norm": 0.34807480741220104, + "learning_rate": 0.00010272852347132162, + "loss": 1.0136, + "step": 10802 + }, + { + "epoch": 1.03, + "grad_norm": 0.26907588238064467, + "learning_rate": 0.00010271270983634438, + "loss": 0.9705, + "step": 10803 + }, + { + "epoch": 1.03, + "grad_norm": 0.30188248194309475, + "learning_rate": 0.00010269689613347986, + "loss": 1.0396, + "step": 10804 + }, + { + "epoch": 1.03, + "grad_norm": 0.2765175552523923, + "learning_rate": 0.00010268108236312384, + "loss": 1.1009, + "step": 10805 + }, + { + "epoch": 1.03, + "grad_norm": 0.2814617497437536, + "learning_rate": 0.00010266526852567204, + "loss": 1.0047, + "step": 10806 + }, + { + "epoch": 1.03, + "grad_norm": 0.3018055759653196, + "learning_rate": 0.0001026494546215202, + "loss": 1.0683, + "step": 10807 + }, + { + "epoch": 1.03, + "grad_norm": 0.26894771460248973, + "learning_rate": 0.0001026336406510641, + "loss": 1.0295, + "step": 10808 + }, + { + "epoch": 1.03, + "grad_norm": 0.3104536554973185, + "learning_rate": 0.0001026178266146995, + "loss": 1.0618, + "step": 10809 + }, + { + "epoch": 1.03, + "grad_norm": 0.28423569389276987, + "learning_rate": 0.00010260201251282214, + "loss": 0.9586, + "step": 10810 + }, + { + "epoch": 1.03, + "grad_norm": 0.30665783540186714, + "learning_rate": 0.00010258619834582777, + "loss": 0.9476, + "step": 10811 + }, + { + "epoch": 1.03, + "grad_norm": 0.2821738473672911, + "learning_rate": 0.00010257038411411217, + "loss": 0.9462, + "step": 10812 + }, + { + "epoch": 1.03, + "grad_norm": 0.2968128439484056, + "learning_rate": 0.0001025545698180711, + "loss": 1.0361, + "step": 10813 + }, + { + "epoch": 1.03, + "grad_norm": 0.2881926594905324, + "learning_rate": 0.0001025387554581003, + "loss": 1.0408, + "step": 10814 + }, + { + "epoch": 1.03, + "grad_norm": 0.2796825546356873, + "learning_rate": 0.00010252294103459556, + "loss": 1.0651, + "step": 10815 + }, + { + "epoch": 1.03, + "grad_norm": 0.2868103218900092, + "learning_rate": 0.00010250712654795263, + "loss": 0.9767, + "step": 10816 + }, + { + "epoch": 1.03, + "grad_norm": 0.31327018314499594, + "learning_rate": 0.00010249131199856727, + "loss": 1.05, + "step": 10817 + }, + { + "epoch": 1.03, + "grad_norm": 0.29576285530094, + "learning_rate": 0.00010247549738683529, + "loss": 1.0729, + "step": 10818 + }, + { + "epoch": 1.04, + "grad_norm": 0.2959220268056112, + "learning_rate": 0.00010245968271315243, + "loss": 1.1181, + "step": 10819 + }, + { + "epoch": 1.04, + "grad_norm": 0.26912705841852347, + "learning_rate": 0.00010244386797791446, + "loss": 1.0106, + "step": 10820 + }, + { + "epoch": 1.04, + "grad_norm": 0.32257170251400913, + "learning_rate": 0.00010242805318151714, + "loss": 1.0053, + "step": 10821 + }, + { + "epoch": 1.04, + "grad_norm": 0.28494424517358896, + "learning_rate": 0.00010241223832435627, + "loss": 1.0228, + "step": 10822 + }, + { + "epoch": 1.04, + "grad_norm": 0.298684357139771, + "learning_rate": 0.00010239642340682762, + "loss": 1.0332, + "step": 10823 + }, + { + "epoch": 1.04, + "grad_norm": 0.3037632380372488, + "learning_rate": 0.00010238060842932697, + "loss": 0.9796, + "step": 10824 + }, + { + "epoch": 1.04, + "grad_norm": 0.277765175827632, + "learning_rate": 0.00010236479339225012, + "loss": 0.987, + "step": 10825 + }, + { + "epoch": 1.04, + "grad_norm": 0.31024459895433304, + "learning_rate": 0.00010234897829599279, + "loss": 1.077, + "step": 10826 + }, + { + "epoch": 1.04, + "grad_norm": 0.3468673263243319, + "learning_rate": 0.00010233316314095083, + "loss": 0.9701, + "step": 10827 + }, + { + "epoch": 1.04, + "grad_norm": 0.25654724046392835, + "learning_rate": 0.00010231734792752, + "loss": 0.9774, + "step": 10828 + }, + { + "epoch": 1.04, + "grad_norm": 0.28735306355479445, + "learning_rate": 0.00010230153265609604, + "loss": 0.9187, + "step": 10829 + }, + { + "epoch": 1.04, + "grad_norm": 0.2799444846861041, + "learning_rate": 0.00010228571732707482, + "loss": 0.8845, + "step": 10830 + }, + { + "epoch": 1.04, + "grad_norm": 0.30209322671266775, + "learning_rate": 0.00010226990194085205, + "loss": 1.0444, + "step": 10831 + }, + { + "epoch": 1.04, + "grad_norm": 0.30560517782681385, + "learning_rate": 0.0001022540864978236, + "loss": 0.993, + "step": 10832 + }, + { + "epoch": 1.04, + "grad_norm": 0.2779804894620069, + "learning_rate": 0.00010223827099838518, + "loss": 0.9631, + "step": 10833 + }, + { + "epoch": 1.04, + "grad_norm": 0.279160430737164, + "learning_rate": 0.00010222245544293265, + "loss": 0.9473, + "step": 10834 + }, + { + "epoch": 1.04, + "grad_norm": 0.2962892131823493, + "learning_rate": 0.00010220663983186176, + "loss": 1.076, + "step": 10835 + }, + { + "epoch": 1.04, + "grad_norm": 0.294090680404834, + "learning_rate": 0.00010219082416556836, + "loss": 1.0012, + "step": 10836 + }, + { + "epoch": 1.04, + "grad_norm": 0.31062465092061836, + "learning_rate": 0.00010217500844444816, + "loss": 0.9496, + "step": 10837 + }, + { + "epoch": 1.04, + "grad_norm": 0.2915404189048358, + "learning_rate": 0.00010215919266889701, + "loss": 0.9219, + "step": 10838 + }, + { + "epoch": 1.04, + "grad_norm": 0.33148037321313023, + "learning_rate": 0.00010214337683931075, + "loss": 1.0291, + "step": 10839 + }, + { + "epoch": 1.04, + "grad_norm": 0.32611035051175113, + "learning_rate": 0.00010212756095608511, + "loss": 1.2383, + "step": 10840 + }, + { + "epoch": 1.04, + "grad_norm": 0.28484400173923136, + "learning_rate": 0.00010211174501961589, + "loss": 1.1488, + "step": 10841 + }, + { + "epoch": 1.04, + "grad_norm": 0.3288183581662409, + "learning_rate": 0.00010209592903029897, + "loss": 1.0382, + "step": 10842 + }, + { + "epoch": 1.04, + "grad_norm": 0.3409878368544353, + "learning_rate": 0.00010208011298853008, + "loss": 1.1781, + "step": 10843 + }, + { + "epoch": 1.04, + "grad_norm": 0.322054405614101, + "learning_rate": 0.00010206429689470509, + "loss": 1.1422, + "step": 10844 + }, + { + "epoch": 1.04, + "grad_norm": 0.3103486050947882, + "learning_rate": 0.00010204848074921973, + "loss": 1.0084, + "step": 10845 + }, + { + "epoch": 1.04, + "grad_norm": 0.29800791000717075, + "learning_rate": 0.00010203266455246989, + "loss": 1.1333, + "step": 10846 + }, + { + "epoch": 1.04, + "grad_norm": 0.3049302252168114, + "learning_rate": 0.00010201684830485133, + "loss": 0.8163, + "step": 10847 + }, + { + "epoch": 1.04, + "grad_norm": 0.2614499742825224, + "learning_rate": 0.00010200103200675983, + "loss": 1.0136, + "step": 10848 + }, + { + "epoch": 1.04, + "grad_norm": 0.2948209111033554, + "learning_rate": 0.00010198521565859129, + "loss": 1.0954, + "step": 10849 + }, + { + "epoch": 1.04, + "grad_norm": 0.3279067132089347, + "learning_rate": 0.00010196939926074148, + "loss": 1.067, + "step": 10850 + }, + { + "epoch": 1.04, + "grad_norm": 0.3441148812430696, + "learning_rate": 0.0001019535828136062, + "loss": 1.1084, + "step": 10851 + }, + { + "epoch": 1.04, + "grad_norm": 0.3086768974122604, + "learning_rate": 0.0001019377663175813, + "loss": 1.0635, + "step": 10852 + }, + { + "epoch": 1.04, + "grad_norm": 0.2883680585383462, + "learning_rate": 0.00010192194977306258, + "loss": 0.9054, + "step": 10853 + }, + { + "epoch": 1.04, + "grad_norm": 0.34503214185377656, + "learning_rate": 0.00010190613318044583, + "loss": 0.9868, + "step": 10854 + }, + { + "epoch": 1.04, + "grad_norm": 0.2825930369918334, + "learning_rate": 0.00010189031654012693, + "loss": 1.0846, + "step": 10855 + }, + { + "epoch": 1.04, + "grad_norm": 0.3152896391987517, + "learning_rate": 0.00010187449985250166, + "loss": 1.0362, + "step": 10856 + }, + { + "epoch": 1.04, + "grad_norm": 0.2964926299369092, + "learning_rate": 0.00010185868311796585, + "loss": 1.1441, + "step": 10857 + }, + { + "epoch": 1.04, + "grad_norm": 0.32000085890339136, + "learning_rate": 0.00010184286633691534, + "loss": 0.9924, + "step": 10858 + }, + { + "epoch": 1.04, + "grad_norm": 0.2707109581843347, + "learning_rate": 0.00010182704950974593, + "loss": 0.9284, + "step": 10859 + }, + { + "epoch": 1.04, + "grad_norm": 0.2955178154762135, + "learning_rate": 0.00010181123263685346, + "loss": 1.0855, + "step": 10860 + }, + { + "epoch": 1.04, + "grad_norm": 0.28320118519679194, + "learning_rate": 0.00010179541571863374, + "loss": 1.0063, + "step": 10861 + }, + { + "epoch": 1.04, + "grad_norm": 0.3120781734012364, + "learning_rate": 0.00010177959875548264, + "loss": 1.0878, + "step": 10862 + }, + { + "epoch": 1.04, + "grad_norm": 0.3099472609201921, + "learning_rate": 0.00010176378174779595, + "loss": 0.9902, + "step": 10863 + }, + { + "epoch": 1.04, + "grad_norm": 0.3468506131503533, + "learning_rate": 0.00010174796469596952, + "loss": 1.0854, + "step": 10864 + }, + { + "epoch": 1.04, + "grad_norm": 0.26989376826822614, + "learning_rate": 0.00010173214760039916, + "loss": 0.9601, + "step": 10865 + }, + { + "epoch": 1.04, + "grad_norm": 0.27633015358130286, + "learning_rate": 0.00010171633046148073, + "loss": 0.9817, + "step": 10866 + }, + { + "epoch": 1.04, + "grad_norm": 0.3555063970818768, + "learning_rate": 0.00010170051327961005, + "loss": 1.074, + "step": 10867 + }, + { + "epoch": 1.04, + "grad_norm": 0.29814753259824983, + "learning_rate": 0.00010168469605518293, + "loss": 1.0005, + "step": 10868 + }, + { + "epoch": 1.04, + "grad_norm": 0.3742066704379873, + "learning_rate": 0.00010166887878859528, + "loss": 0.9317, + "step": 10869 + }, + { + "epoch": 1.04, + "grad_norm": 0.28971233049582473, + "learning_rate": 0.00010165306148024285, + "loss": 1.0572, + "step": 10870 + }, + { + "epoch": 1.04, + "grad_norm": 0.30226772497950677, + "learning_rate": 0.00010163724413052153, + "loss": 1.0824, + "step": 10871 + }, + { + "epoch": 1.04, + "grad_norm": 0.27209641906793863, + "learning_rate": 0.00010162142673982716, + "loss": 1.0737, + "step": 10872 + }, + { + "epoch": 1.04, + "grad_norm": 0.3549491331766397, + "learning_rate": 0.00010160560930855556, + "loss": 0.9888, + "step": 10873 + }, + { + "epoch": 1.04, + "grad_norm": 0.26314348729797027, + "learning_rate": 0.00010158979183710258, + "loss": 1.1105, + "step": 10874 + }, + { + "epoch": 1.04, + "grad_norm": 0.26703577888709373, + "learning_rate": 0.00010157397432586405, + "loss": 1.0349, + "step": 10875 + }, + { + "epoch": 1.04, + "grad_norm": 0.2662565992274953, + "learning_rate": 0.00010155815677523584, + "loss": 1.0883, + "step": 10876 + }, + { + "epoch": 1.04, + "grad_norm": 0.26444506422497427, + "learning_rate": 0.00010154233918561376, + "loss": 0.9118, + "step": 10877 + }, + { + "epoch": 1.04, + "grad_norm": 0.3073158492270825, + "learning_rate": 0.00010152652155739366, + "loss": 0.9511, + "step": 10878 + }, + { + "epoch": 1.04, + "grad_norm": 0.3065381987536428, + "learning_rate": 0.00010151070389097144, + "loss": 1.0478, + "step": 10879 + }, + { + "epoch": 1.04, + "grad_norm": 0.3012668421267288, + "learning_rate": 0.00010149488618674287, + "loss": 0.9072, + "step": 10880 + }, + { + "epoch": 1.04, + "grad_norm": 0.2749985216639651, + "learning_rate": 0.00010147906844510387, + "loss": 1.0493, + "step": 10881 + }, + { + "epoch": 1.04, + "grad_norm": 0.27550061837296547, + "learning_rate": 0.00010146325066645022, + "loss": 1.0606, + "step": 10882 + }, + { + "epoch": 1.04, + "grad_norm": 0.3548316559876122, + "learning_rate": 0.00010144743285117781, + "loss": 1.0066, + "step": 10883 + }, + { + "epoch": 1.04, + "grad_norm": 0.292853800635852, + "learning_rate": 0.00010143161499968247, + "loss": 1.0099, + "step": 10884 + }, + { + "epoch": 1.04, + "grad_norm": 0.31770934489653696, + "learning_rate": 0.00010141579711236009, + "loss": 1.0791, + "step": 10885 + }, + { + "epoch": 1.04, + "grad_norm": 0.32674930421447435, + "learning_rate": 0.00010139997918960648, + "loss": 1.0743, + "step": 10886 + }, + { + "epoch": 1.04, + "grad_norm": 0.29481361471722517, + "learning_rate": 0.00010138416123181753, + "loss": 1.2049, + "step": 10887 + }, + { + "epoch": 1.04, + "grad_norm": 0.3370157803604361, + "learning_rate": 0.00010136834323938908, + "loss": 0.9902, + "step": 10888 + }, + { + "epoch": 1.04, + "grad_norm": 0.3203649078363609, + "learning_rate": 0.00010135252521271695, + "loss": 1.0235, + "step": 10889 + }, + { + "epoch": 1.04, + "grad_norm": 0.34792462786689077, + "learning_rate": 0.00010133670715219705, + "loss": 1.0215, + "step": 10890 + }, + { + "epoch": 1.04, + "grad_norm": 0.32736976890031116, + "learning_rate": 0.00010132088905822519, + "loss": 1.0383, + "step": 10891 + }, + { + "epoch": 1.04, + "grad_norm": 0.3195000692626494, + "learning_rate": 0.00010130507093119726, + "loss": 0.9895, + "step": 10892 + }, + { + "epoch": 1.04, + "grad_norm": 0.30780215187321597, + "learning_rate": 0.00010128925277150911, + "loss": 0.9965, + "step": 10893 + }, + { + "epoch": 1.04, + "grad_norm": 0.27105263777923727, + "learning_rate": 0.0001012734345795566, + "loss": 1.0354, + "step": 10894 + }, + { + "epoch": 1.04, + "grad_norm": 0.3112411840476932, + "learning_rate": 0.00010125761635573558, + "loss": 1.0474, + "step": 10895 + }, + { + "epoch": 1.04, + "grad_norm": 0.28521286637869614, + "learning_rate": 0.00010124179810044194, + "loss": 0.9423, + "step": 10896 + }, + { + "epoch": 1.04, + "grad_norm": 0.31452763635521297, + "learning_rate": 0.00010122597981407153, + "loss": 0.9653, + "step": 10897 + }, + { + "epoch": 1.04, + "grad_norm": 0.2642881169490843, + "learning_rate": 0.00010121016149702021, + "loss": 1.0532, + "step": 10898 + }, + { + "epoch": 1.04, + "grad_norm": 0.3275191890984719, + "learning_rate": 0.00010119434314968379, + "loss": 1.09, + "step": 10899 + }, + { + "epoch": 1.04, + "grad_norm": 0.2798478148322801, + "learning_rate": 0.00010117852477245822, + "loss": 1.0408, + "step": 10900 + }, + { + "epoch": 1.04, + "grad_norm": 0.3642098161214323, + "learning_rate": 0.00010116270636573933, + "loss": 1.009, + "step": 10901 + }, + { + "epoch": 1.04, + "grad_norm": 0.2679706418371611, + "learning_rate": 0.00010114688792992294, + "loss": 0.9701, + "step": 10902 + }, + { + "epoch": 1.04, + "grad_norm": 0.26741887370762657, + "learning_rate": 0.000101131069465405, + "loss": 1.0362, + "step": 10903 + }, + { + "epoch": 1.04, + "grad_norm": 0.27239109719545535, + "learning_rate": 0.00010111525097258134, + "loss": 1.0446, + "step": 10904 + }, + { + "epoch": 1.04, + "grad_norm": 0.2753229744898263, + "learning_rate": 0.00010109943245184781, + "loss": 0.9867, + "step": 10905 + }, + { + "epoch": 1.04, + "grad_norm": 0.3487009071204345, + "learning_rate": 0.00010108361390360029, + "loss": 1.097, + "step": 10906 + }, + { + "epoch": 1.04, + "grad_norm": 0.3141811216560684, + "learning_rate": 0.00010106779532823465, + "loss": 1.0046, + "step": 10907 + }, + { + "epoch": 1.04, + "grad_norm": 0.3113340621674816, + "learning_rate": 0.00010105197672614677, + "loss": 0.8404, + "step": 10908 + }, + { + "epoch": 1.04, + "grad_norm": 0.28097110924524415, + "learning_rate": 0.00010103615809773248, + "loss": 1.0516, + "step": 10909 + }, + { + "epoch": 1.04, + "grad_norm": 0.31286944931460015, + "learning_rate": 0.00010102033944338771, + "loss": 1.0882, + "step": 10910 + }, + { + "epoch": 1.04, + "grad_norm": 0.3089502607739295, + "learning_rate": 0.00010100452076350828, + "loss": 1.0772, + "step": 10911 + }, + { + "epoch": 1.04, + "grad_norm": 0.3075876561349412, + "learning_rate": 0.00010098870205849011, + "loss": 1.0187, + "step": 10912 + }, + { + "epoch": 1.04, + "grad_norm": 0.30761151895950406, + "learning_rate": 0.00010097288332872905, + "loss": 1.0755, + "step": 10913 + }, + { + "epoch": 1.04, + "grad_norm": 0.33658530968809713, + "learning_rate": 0.00010095706457462098, + "loss": 1.0033, + "step": 10914 + }, + { + "epoch": 1.04, + "grad_norm": 0.33500341053570665, + "learning_rate": 0.00010094124579656174, + "loss": 1.0672, + "step": 10915 + }, + { + "epoch": 1.04, + "grad_norm": 0.35429629339677454, + "learning_rate": 0.00010092542699494725, + "loss": 1.077, + "step": 10916 + }, + { + "epoch": 1.04, + "grad_norm": 0.321171203864763, + "learning_rate": 0.00010090960817017337, + "loss": 0.9483, + "step": 10917 + }, + { + "epoch": 1.04, + "grad_norm": 0.28811120167846993, + "learning_rate": 0.00010089378932263598, + "loss": 0.9483, + "step": 10918 + }, + { + "epoch": 1.04, + "grad_norm": 0.2822130756617273, + "learning_rate": 0.00010087797045273093, + "loss": 1.1146, + "step": 10919 + }, + { + "epoch": 1.04, + "grad_norm": 0.3201355316176018, + "learning_rate": 0.00010086215156085413, + "loss": 1.0515, + "step": 10920 + }, + { + "epoch": 1.04, + "grad_norm": 0.27313128229230826, + "learning_rate": 0.00010084633264740146, + "loss": 0.9963, + "step": 10921 + }, + { + "epoch": 1.04, + "grad_norm": 0.2918572599412112, + "learning_rate": 0.00010083051371276874, + "loss": 0.9659, + "step": 10922 + }, + { + "epoch": 1.05, + "grad_norm": 0.2741157181427605, + "learning_rate": 0.00010081469475735195, + "loss": 1.1455, + "step": 10923 + }, + { + "epoch": 1.05, + "grad_norm": 0.26582720205215565, + "learning_rate": 0.00010079887578154687, + "loss": 1.0602, + "step": 10924 + }, + { + "epoch": 1.05, + "grad_norm": 0.2712542361948409, + "learning_rate": 0.00010078305678574944, + "loss": 1.1032, + "step": 10925 + }, + { + "epoch": 1.05, + "grad_norm": 0.3196559559543027, + "learning_rate": 0.00010076723777035551, + "loss": 1.0117, + "step": 10926 + }, + { + "epoch": 1.05, + "grad_norm": 0.27074075848992707, + "learning_rate": 0.00010075141873576097, + "loss": 0.9755, + "step": 10927 + }, + { + "epoch": 1.05, + "grad_norm": 0.27879156546530104, + "learning_rate": 0.00010073559968236173, + "loss": 1.0821, + "step": 10928 + }, + { + "epoch": 1.05, + "grad_norm": 0.27928290425321756, + "learning_rate": 0.00010071978061055361, + "loss": 0.9995, + "step": 10929 + }, + { + "epoch": 1.05, + "grad_norm": 0.30358542686918144, + "learning_rate": 0.00010070396152073256, + "loss": 1.0649, + "step": 10930 + }, + { + "epoch": 1.05, + "grad_norm": 0.3082277004428636, + "learning_rate": 0.00010068814241329442, + "loss": 1.0548, + "step": 10931 + }, + { + "epoch": 1.05, + "grad_norm": 0.3096621156105449, + "learning_rate": 0.00010067232328863507, + "loss": 1.012, + "step": 10932 + }, + { + "epoch": 1.05, + "grad_norm": 0.28986010760063324, + "learning_rate": 0.00010065650414715042, + "loss": 1.052, + "step": 10933 + }, + { + "epoch": 1.05, + "grad_norm": 0.33228752944153356, + "learning_rate": 0.00010064068498923631, + "loss": 0.9252, + "step": 10934 + }, + { + "epoch": 1.05, + "grad_norm": 0.30720434318230716, + "learning_rate": 0.00010062486581528869, + "loss": 1.1103, + "step": 10935 + }, + { + "epoch": 1.05, + "grad_norm": 0.2704579289133573, + "learning_rate": 0.0001006090466257034, + "loss": 1.0302, + "step": 10936 + }, + { + "epoch": 1.05, + "grad_norm": 0.3140777444076708, + "learning_rate": 0.00010059322742087634, + "loss": 0.8072, + "step": 10937 + }, + { + "epoch": 1.05, + "grad_norm": 0.3557841251475421, + "learning_rate": 0.0001005774082012034, + "loss": 1.0069, + "step": 10938 + }, + { + "epoch": 1.05, + "grad_norm": 0.26120936557289426, + "learning_rate": 0.00010056158896708043, + "loss": 1.021, + "step": 10939 + }, + { + "epoch": 1.05, + "grad_norm": 0.3215923386682417, + "learning_rate": 0.00010054576971890337, + "loss": 1.1189, + "step": 10940 + }, + { + "epoch": 1.05, + "grad_norm": 0.28298403250109133, + "learning_rate": 0.00010052995045706808, + "loss": 1.0615, + "step": 10941 + }, + { + "epoch": 1.05, + "grad_norm": 0.30024478299692625, + "learning_rate": 0.00010051413118197044, + "loss": 1.1242, + "step": 10942 + }, + { + "epoch": 1.05, + "grad_norm": 0.31248723125768413, + "learning_rate": 0.00010049831189400633, + "loss": 1.0671, + "step": 10943 + }, + { + "epoch": 1.05, + "grad_norm": 0.3103821318110805, + "learning_rate": 0.00010048249259357168, + "loss": 1.0171, + "step": 10944 + }, + { + "epoch": 1.05, + "grad_norm": 0.3389066473912416, + "learning_rate": 0.00010046667328106231, + "loss": 1.1551, + "step": 10945 + }, + { + "epoch": 1.05, + "grad_norm": 0.28346719728184266, + "learning_rate": 0.00010045085395687417, + "loss": 1.054, + "step": 10946 + }, + { + "epoch": 1.05, + "grad_norm": 0.3177515451190992, + "learning_rate": 0.00010043503462140314, + "loss": 1.0008, + "step": 10947 + }, + { + "epoch": 1.05, + "grad_norm": 0.31069075057480483, + "learning_rate": 0.00010041921527504508, + "loss": 1.0878, + "step": 10948 + }, + { + "epoch": 1.05, + "grad_norm": 0.31903645022667737, + "learning_rate": 0.0001004033959181959, + "loss": 1.0848, + "step": 10949 + }, + { + "epoch": 1.05, + "grad_norm": 0.30184712453729473, + "learning_rate": 0.00010038757655125146, + "loss": 1.0707, + "step": 10950 + }, + { + "epoch": 1.05, + "grad_norm": 0.32145553152864575, + "learning_rate": 0.00010037175717460771, + "loss": 0.8426, + "step": 10951 + }, + { + "epoch": 1.05, + "grad_norm": 0.3015332106764296, + "learning_rate": 0.00010035593778866049, + "loss": 1.0184, + "step": 10952 + }, + { + "epoch": 1.05, + "grad_norm": 0.25682068133696634, + "learning_rate": 0.00010034011839380568, + "loss": 0.9661, + "step": 10953 + }, + { + "epoch": 1.05, + "grad_norm": 0.3106342393226833, + "learning_rate": 0.0001003242989904392, + "loss": 1.0081, + "step": 10954 + }, + { + "epoch": 1.05, + "grad_norm": 0.31152967030604023, + "learning_rate": 0.00010030847957895695, + "loss": 1.0106, + "step": 10955 + }, + { + "epoch": 1.05, + "grad_norm": 0.298463504474077, + "learning_rate": 0.00010029266015975478, + "loss": 0.9452, + "step": 10956 + }, + { + "epoch": 1.05, + "grad_norm": 0.3361069506951409, + "learning_rate": 0.00010027684073322865, + "loss": 1.0356, + "step": 10957 + }, + { + "epoch": 1.05, + "grad_norm": 0.31146290539442006, + "learning_rate": 0.00010026102129977437, + "loss": 0.9806, + "step": 10958 + }, + { + "epoch": 1.05, + "grad_norm": 0.26370846862413366, + "learning_rate": 0.00010024520185978789, + "loss": 1.0261, + "step": 10959 + }, + { + "epoch": 1.05, + "grad_norm": 0.3509166575413118, + "learning_rate": 0.00010022938241366506, + "loss": 1.0425, + "step": 10960 + }, + { + "epoch": 1.05, + "grad_norm": 0.33408391895314776, + "learning_rate": 0.00010021356296180182, + "loss": 0.9598, + "step": 10961 + }, + { + "epoch": 1.05, + "grad_norm": 0.32029946671812226, + "learning_rate": 0.00010019774350459401, + "loss": 0.9893, + "step": 10962 + }, + { + "epoch": 1.05, + "grad_norm": 0.25594405789858476, + "learning_rate": 0.00010018192404243753, + "loss": 1.0635, + "step": 10963 + }, + { + "epoch": 1.05, + "grad_norm": 0.30529397227552146, + "learning_rate": 0.00010016610457572833, + "loss": 0.9589, + "step": 10964 + }, + { + "epoch": 1.05, + "grad_norm": 0.28750821264027016, + "learning_rate": 0.00010015028510486221, + "loss": 0.9246, + "step": 10965 + }, + { + "epoch": 1.05, + "grad_norm": 0.3371281424238216, + "learning_rate": 0.00010013446563023516, + "loss": 1.0672, + "step": 10966 + }, + { + "epoch": 1.05, + "grad_norm": 0.2806206080369055, + "learning_rate": 0.00010011864615224298, + "loss": 0.9683, + "step": 10967 + }, + { + "epoch": 1.05, + "grad_norm": 0.29666288782621186, + "learning_rate": 0.00010010282667128164, + "loss": 0.9289, + "step": 10968 + }, + { + "epoch": 1.05, + "grad_norm": 0.29419429118893625, + "learning_rate": 0.00010008700718774697, + "loss": 0.9435, + "step": 10969 + }, + { + "epoch": 1.05, + "grad_norm": 0.31329751207890827, + "learning_rate": 0.0001000711877020349, + "loss": 0.9707, + "step": 10970 + }, + { + "epoch": 1.05, + "grad_norm": 0.3242828648857475, + "learning_rate": 0.00010005536821454131, + "loss": 0.9651, + "step": 10971 + }, + { + "epoch": 1.05, + "grad_norm": 0.285650985737405, + "learning_rate": 0.0001000395487256621, + "loss": 1.0609, + "step": 10972 + }, + { + "epoch": 1.05, + "grad_norm": 0.2827100986510454, + "learning_rate": 0.00010002372923579316, + "loss": 1.0454, + "step": 10973 + }, + { + "epoch": 1.05, + "grad_norm": 0.3363478535766906, + "learning_rate": 0.00010000790974533039, + "loss": 1.0492, + "step": 10974 + }, + { + "epoch": 1.05, + "grad_norm": 0.31461451328476236, + "learning_rate": 9.999209025466964e-05, + "loss": 1.0068, + "step": 10975 + }, + { + "epoch": 1.05, + "grad_norm": 0.28482192979060966, + "learning_rate": 9.997627076420687e-05, + "loss": 0.9749, + "step": 10976 + }, + { + "epoch": 1.05, + "grad_norm": 0.3009363426399415, + "learning_rate": 9.996045127433792e-05, + "loss": 1.029, + "step": 10977 + }, + { + "epoch": 1.05, + "grad_norm": 0.29271050014307154, + "learning_rate": 9.994463178545871e-05, + "loss": 1.0302, + "step": 10978 + }, + { + "epoch": 1.05, + "grad_norm": 0.3038293614431799, + "learning_rate": 9.992881229796511e-05, + "loss": 1.0454, + "step": 10979 + }, + { + "epoch": 1.05, + "grad_norm": 0.3302282169561924, + "learning_rate": 9.991299281225305e-05, + "loss": 1.0345, + "step": 10980 + }, + { + "epoch": 1.05, + "grad_norm": 0.3081455802149132, + "learning_rate": 9.989717332871839e-05, + "loss": 0.975, + "step": 10981 + }, + { + "epoch": 1.05, + "grad_norm": 0.32385063975877504, + "learning_rate": 9.988135384775705e-05, + "loss": 1.0514, + "step": 10982 + }, + { + "epoch": 1.05, + "grad_norm": 0.3159854051210647, + "learning_rate": 9.986553436976488e-05, + "loss": 1.0807, + "step": 10983 + }, + { + "epoch": 1.05, + "grad_norm": 0.3091894036208416, + "learning_rate": 9.984971489513781e-05, + "loss": 1.0269, + "step": 10984 + }, + { + "epoch": 1.05, + "grad_norm": 0.30578618341950853, + "learning_rate": 9.983389542427168e-05, + "loss": 0.9458, + "step": 10985 + }, + { + "epoch": 1.05, + "grad_norm": 0.30473234580215725, + "learning_rate": 9.981807595756246e-05, + "loss": 1.0073, + "step": 10986 + }, + { + "epoch": 1.05, + "grad_norm": 0.3000545622698162, + "learning_rate": 9.980225649540601e-05, + "loss": 1.0533, + "step": 10987 + }, + { + "epoch": 1.05, + "grad_norm": 0.24900274836798425, + "learning_rate": 9.978643703819822e-05, + "loss": 0.9516, + "step": 10988 + }, + { + "epoch": 1.05, + "grad_norm": 0.3280156457329766, + "learning_rate": 9.977061758633495e-05, + "loss": 1.0449, + "step": 10989 + }, + { + "epoch": 1.05, + "grad_norm": 0.31205903864388795, + "learning_rate": 9.975479814021215e-05, + "loss": 1.215, + "step": 10990 + }, + { + "epoch": 1.05, + "grad_norm": 0.3003690439137535, + "learning_rate": 9.973897870022567e-05, + "loss": 1.0729, + "step": 10991 + }, + { + "epoch": 1.05, + "grad_norm": 0.2691614405324527, + "learning_rate": 9.972315926677136e-05, + "loss": 0.9093, + "step": 10992 + }, + { + "epoch": 1.05, + "grad_norm": 0.2917625409485338, + "learning_rate": 9.970733984024522e-05, + "loss": 1.1094, + "step": 10993 + }, + { + "epoch": 1.05, + "grad_norm": 0.28608967837701155, + "learning_rate": 9.969152042104307e-05, + "loss": 0.9254, + "step": 10994 + }, + { + "epoch": 1.05, + "grad_norm": 0.28739790380684904, + "learning_rate": 9.967570100956082e-05, + "loss": 1.0262, + "step": 10995 + }, + { + "epoch": 1.05, + "grad_norm": 0.30625000870618024, + "learning_rate": 9.965988160619434e-05, + "loss": 0.9923, + "step": 10996 + }, + { + "epoch": 1.05, + "grad_norm": 0.2681667150799098, + "learning_rate": 9.964406221133955e-05, + "loss": 1.0123, + "step": 10997 + }, + { + "epoch": 1.05, + "grad_norm": 0.3238370080069252, + "learning_rate": 9.962824282539234e-05, + "loss": 0.9635, + "step": 10998 + }, + { + "epoch": 1.05, + "grad_norm": 0.30985436604973, + "learning_rate": 9.961242344874854e-05, + "loss": 0.9604, + "step": 10999 + }, + { + "epoch": 1.05, + "grad_norm": 0.2771520922161627, + "learning_rate": 9.959660408180412e-05, + "loss": 1.0222, + "step": 11000 + }, + { + "epoch": 1.05, + "grad_norm": 0.2656775926415813, + "learning_rate": 9.958078472495494e-05, + "loss": 1.0412, + "step": 11001 + }, + { + "epoch": 1.05, + "grad_norm": 0.3168181958331293, + "learning_rate": 9.956496537859688e-05, + "loss": 1.1077, + "step": 11002 + }, + { + "epoch": 1.05, + "grad_norm": 0.29995091403732577, + "learning_rate": 9.954914604312585e-05, + "loss": 1.0977, + "step": 11003 + }, + { + "epoch": 1.05, + "grad_norm": 0.24739457570890666, + "learning_rate": 9.953332671893772e-05, + "loss": 0.8737, + "step": 11004 + }, + { + "epoch": 1.05, + "grad_norm": 0.3240329219332351, + "learning_rate": 9.951750740642837e-05, + "loss": 1.0102, + "step": 11005 + }, + { + "epoch": 1.05, + "grad_norm": 0.30204102400925503, + "learning_rate": 9.950168810599367e-05, + "loss": 1.0299, + "step": 11006 + }, + { + "epoch": 1.05, + "grad_norm": 0.34618352634266286, + "learning_rate": 9.948586881802958e-05, + "loss": 1.1211, + "step": 11007 + }, + { + "epoch": 1.05, + "grad_norm": 0.27426250734306223, + "learning_rate": 9.947004954293194e-05, + "loss": 1.0424, + "step": 11008 + }, + { + "epoch": 1.05, + "grad_norm": 0.2854358797028317, + "learning_rate": 9.945423028109665e-05, + "loss": 1.0684, + "step": 11009 + }, + { + "epoch": 1.05, + "grad_norm": 0.31773528467607115, + "learning_rate": 9.94384110329196e-05, + "loss": 0.978, + "step": 11010 + }, + { + "epoch": 1.05, + "grad_norm": 0.2961906637760406, + "learning_rate": 9.942259179879664e-05, + "loss": 1.0515, + "step": 11011 + }, + { + "epoch": 1.05, + "grad_norm": 0.32489127711360727, + "learning_rate": 9.940677257912366e-05, + "loss": 1.084, + "step": 11012 + }, + { + "epoch": 1.05, + "grad_norm": 0.2850959788328847, + "learning_rate": 9.93909533742966e-05, + "loss": 1.0575, + "step": 11013 + }, + { + "epoch": 1.05, + "grad_norm": 0.3103726377840572, + "learning_rate": 9.937513418471133e-05, + "loss": 0.9549, + "step": 11014 + }, + { + "epoch": 1.05, + "grad_norm": 0.39032932172070933, + "learning_rate": 9.93593150107637e-05, + "loss": 0.9606, + "step": 11015 + }, + { + "epoch": 1.05, + "grad_norm": 0.28893029381682717, + "learning_rate": 9.934349585284961e-05, + "loss": 0.982, + "step": 11016 + }, + { + "epoch": 1.05, + "grad_norm": 0.29937901868845207, + "learning_rate": 9.932767671136496e-05, + "loss": 0.9773, + "step": 11017 + }, + { + "epoch": 1.05, + "grad_norm": 0.3541177259929554, + "learning_rate": 9.931185758670563e-05, + "loss": 0.9266, + "step": 11018 + }, + { + "epoch": 1.05, + "grad_norm": 0.29126504015374943, + "learning_rate": 9.929603847926747e-05, + "loss": 1.0369, + "step": 11019 + }, + { + "epoch": 1.05, + "grad_norm": 0.32321142437311406, + "learning_rate": 9.928021938944639e-05, + "loss": 1.0762, + "step": 11020 + }, + { + "epoch": 1.05, + "grad_norm": 0.3348885271348225, + "learning_rate": 9.92644003176383e-05, + "loss": 0.9692, + "step": 11021 + }, + { + "epoch": 1.05, + "grad_norm": 0.29026810867508235, + "learning_rate": 9.924858126423903e-05, + "loss": 1.0234, + "step": 11022 + }, + { + "epoch": 1.05, + "grad_norm": 0.2683276757492179, + "learning_rate": 9.923276222964452e-05, + "loss": 0.9801, + "step": 11023 + }, + { + "epoch": 1.05, + "grad_norm": 0.2936108587085574, + "learning_rate": 9.921694321425061e-05, + "loss": 1.0527, + "step": 11024 + }, + { + "epoch": 1.05, + "grad_norm": 0.3085269757844321, + "learning_rate": 9.920112421845317e-05, + "loss": 1.0103, + "step": 11025 + }, + { + "epoch": 1.05, + "grad_norm": 0.2994923987047925, + "learning_rate": 9.918530524264806e-05, + "loss": 0.9334, + "step": 11026 + }, + { + "epoch": 1.05, + "grad_norm": 0.27010855770940634, + "learning_rate": 9.916948628723125e-05, + "loss": 0.9728, + "step": 11027 + }, + { + "epoch": 1.06, + "grad_norm": 0.3213416667007468, + "learning_rate": 9.915366735259856e-05, + "loss": 1.0094, + "step": 11028 + }, + { + "epoch": 1.06, + "grad_norm": 0.2672595121446945, + "learning_rate": 9.913784843914588e-05, + "loss": 0.9648, + "step": 11029 + }, + { + "epoch": 1.06, + "grad_norm": 0.3163641316232268, + "learning_rate": 9.91220295472691e-05, + "loss": 1.051, + "step": 11030 + }, + { + "epoch": 1.06, + "grad_norm": 0.3298038713526849, + "learning_rate": 9.910621067736406e-05, + "loss": 1.1238, + "step": 11031 + }, + { + "epoch": 1.06, + "grad_norm": 0.2947110481342578, + "learning_rate": 9.909039182982667e-05, + "loss": 1.022, + "step": 11032 + }, + { + "epoch": 1.06, + "grad_norm": 0.26213848420790914, + "learning_rate": 9.907457300505276e-05, + "loss": 0.9779, + "step": 11033 + }, + { + "epoch": 1.06, + "grad_norm": 0.24087084981129292, + "learning_rate": 9.905875420343827e-05, + "loss": 1.0072, + "step": 11034 + }, + { + "epoch": 1.06, + "grad_norm": 0.3233777048948354, + "learning_rate": 9.904293542537905e-05, + "loss": 1.1236, + "step": 11035 + }, + { + "epoch": 1.06, + "grad_norm": 0.3039975439922752, + "learning_rate": 9.902711667127098e-05, + "loss": 0.956, + "step": 11036 + }, + { + "epoch": 1.06, + "grad_norm": 0.3063015037625037, + "learning_rate": 9.90112979415099e-05, + "loss": 0.9887, + "step": 11037 + }, + { + "epoch": 1.06, + "grad_norm": 0.28450034061682694, + "learning_rate": 9.899547923649173e-05, + "loss": 0.9795, + "step": 11038 + }, + { + "epoch": 1.06, + "grad_norm": 0.2785000252633791, + "learning_rate": 9.897966055661231e-05, + "loss": 1.0114, + "step": 11039 + }, + { + "epoch": 1.06, + "grad_norm": 0.3103592548129079, + "learning_rate": 9.896384190226754e-05, + "loss": 1.0084, + "step": 11040 + }, + { + "epoch": 1.06, + "grad_norm": 0.3297530752384266, + "learning_rate": 9.894802327385326e-05, + "loss": 1.1541, + "step": 11041 + }, + { + "epoch": 1.06, + "grad_norm": 0.2903040614607674, + "learning_rate": 9.893220467176537e-05, + "loss": 0.9143, + "step": 11042 + }, + { + "epoch": 1.06, + "grad_norm": 0.31194816645013596, + "learning_rate": 9.891638609639975e-05, + "loss": 1.0536, + "step": 11043 + }, + { + "epoch": 1.06, + "grad_norm": 0.2757808002975673, + "learning_rate": 9.890056754815224e-05, + "loss": 1.0037, + "step": 11044 + }, + { + "epoch": 1.06, + "grad_norm": 0.2563072039353289, + "learning_rate": 9.888474902741871e-05, + "loss": 0.8924, + "step": 11045 + }, + { + "epoch": 1.06, + "grad_norm": 0.28107060875964895, + "learning_rate": 9.886893053459499e-05, + "loss": 1.0705, + "step": 11046 + }, + { + "epoch": 1.06, + "grad_norm": 0.29124467735321796, + "learning_rate": 9.885311207007705e-05, + "loss": 1.0912, + "step": 11047 + }, + { + "epoch": 1.06, + "grad_norm": 0.28530010300811937, + "learning_rate": 9.88372936342607e-05, + "loss": 1.0735, + "step": 11048 + }, + { + "epoch": 1.06, + "grad_norm": 0.30426717584773466, + "learning_rate": 9.88214752275418e-05, + "loss": 0.9064, + "step": 11049 + }, + { + "epoch": 1.06, + "grad_norm": 0.30873689793695763, + "learning_rate": 9.880565685031623e-05, + "loss": 1.0888, + "step": 11050 + }, + { + "epoch": 1.06, + "grad_norm": 0.29513790119937, + "learning_rate": 9.878983850297985e-05, + "loss": 1.039, + "step": 11051 + }, + { + "epoch": 1.06, + "grad_norm": 0.3142028512339773, + "learning_rate": 9.877402018592852e-05, + "loss": 1.0165, + "step": 11052 + }, + { + "epoch": 1.06, + "grad_norm": 0.32293286473002614, + "learning_rate": 9.875820189955806e-05, + "loss": 1.0041, + "step": 11053 + }, + { + "epoch": 1.06, + "grad_norm": 0.2844444447903022, + "learning_rate": 9.874238364426441e-05, + "loss": 1.1329, + "step": 11054 + }, + { + "epoch": 1.06, + "grad_norm": 0.3279386950592109, + "learning_rate": 9.872656542044342e-05, + "loss": 1.0185, + "step": 11055 + }, + { + "epoch": 1.06, + "grad_norm": 0.2972295546173956, + "learning_rate": 9.871074722849092e-05, + "loss": 0.9677, + "step": 11056 + }, + { + "epoch": 1.06, + "grad_norm": 0.29875130114940784, + "learning_rate": 9.869492906880275e-05, + "loss": 1.1122, + "step": 11057 + }, + { + "epoch": 1.06, + "grad_norm": 0.31471653083993223, + "learning_rate": 9.867911094177485e-05, + "loss": 0.9991, + "step": 11058 + }, + { + "epoch": 1.06, + "grad_norm": 0.3000306847989355, + "learning_rate": 9.8663292847803e-05, + "loss": 1.0392, + "step": 11059 + }, + { + "epoch": 1.06, + "grad_norm": 0.3153655611824132, + "learning_rate": 9.864747478728308e-05, + "loss": 1.0236, + "step": 11060 + }, + { + "epoch": 1.06, + "grad_norm": 0.3024852883667774, + "learning_rate": 9.863165676061094e-05, + "loss": 0.9922, + "step": 11061 + }, + { + "epoch": 1.06, + "grad_norm": 0.31311775609042825, + "learning_rate": 9.861583876818249e-05, + "loss": 1.0863, + "step": 11062 + }, + { + "epoch": 1.06, + "grad_norm": 0.30586342358939, + "learning_rate": 9.860002081039354e-05, + "loss": 1.0023, + "step": 11063 + }, + { + "epoch": 1.06, + "grad_norm": 0.319747332380186, + "learning_rate": 9.858420288763995e-05, + "loss": 0.9993, + "step": 11064 + }, + { + "epoch": 1.06, + "grad_norm": 0.3125782744217087, + "learning_rate": 9.856838500031755e-05, + "loss": 0.9548, + "step": 11065 + }, + { + "epoch": 1.06, + "grad_norm": 0.26708993524592006, + "learning_rate": 9.855256714882224e-05, + "loss": 1.0433, + "step": 11066 + }, + { + "epoch": 1.06, + "grad_norm": 0.30703199653227103, + "learning_rate": 9.853674933354979e-05, + "loss": 1.0478, + "step": 11067 + }, + { + "epoch": 1.06, + "grad_norm": 0.3613901996930307, + "learning_rate": 9.852093155489615e-05, + "loss": 1.0566, + "step": 11068 + }, + { + "epoch": 1.06, + "grad_norm": 0.2922049814111139, + "learning_rate": 9.850511381325714e-05, + "loss": 0.9937, + "step": 11069 + }, + { + "epoch": 1.06, + "grad_norm": 0.30709113856112485, + "learning_rate": 9.84892961090286e-05, + "loss": 1.1265, + "step": 11070 + }, + { + "epoch": 1.06, + "grad_norm": 0.3129440889228055, + "learning_rate": 9.847347844260637e-05, + "loss": 1.0919, + "step": 11071 + }, + { + "epoch": 1.06, + "grad_norm": 0.3002241165146986, + "learning_rate": 9.84576608143863e-05, + "loss": 1.0452, + "step": 11072 + }, + { + "epoch": 1.06, + "grad_norm": 0.2683589222833353, + "learning_rate": 9.844184322476417e-05, + "loss": 0.9153, + "step": 11073 + }, + { + "epoch": 1.06, + "grad_norm": 0.3375047876329501, + "learning_rate": 9.842602567413596e-05, + "loss": 1.0327, + "step": 11074 + }, + { + "epoch": 1.06, + "grad_norm": 0.3361424968229276, + "learning_rate": 9.841020816289744e-05, + "loss": 0.9622, + "step": 11075 + }, + { + "epoch": 1.06, + "grad_norm": 0.3170221061545032, + "learning_rate": 9.839439069144447e-05, + "loss": 0.9295, + "step": 11076 + }, + { + "epoch": 1.06, + "grad_norm": 0.3257716322480005, + "learning_rate": 9.837857326017286e-05, + "loss": 1.1503, + "step": 11077 + }, + { + "epoch": 1.06, + "grad_norm": 0.26746704200499555, + "learning_rate": 9.836275586947848e-05, + "loss": 1.0125, + "step": 11078 + }, + { + "epoch": 1.06, + "grad_norm": 0.32354553868990843, + "learning_rate": 9.834693851975717e-05, + "loss": 1.1189, + "step": 11079 + }, + { + "epoch": 1.06, + "grad_norm": 0.30952972851219035, + "learning_rate": 9.833112121140474e-05, + "loss": 1.0119, + "step": 11080 + }, + { + "epoch": 1.06, + "grad_norm": 0.2798067758730765, + "learning_rate": 9.831530394481708e-05, + "loss": 0.9476, + "step": 11081 + }, + { + "epoch": 1.06, + "grad_norm": 0.2912537096895058, + "learning_rate": 9.829948672038996e-05, + "loss": 1.0766, + "step": 11082 + }, + { + "epoch": 1.06, + "grad_norm": 0.31179298576359327, + "learning_rate": 9.828366953851928e-05, + "loss": 0.9766, + "step": 11083 + }, + { + "epoch": 1.06, + "grad_norm": 0.31448633284448124, + "learning_rate": 9.826785239960087e-05, + "loss": 1.0681, + "step": 11084 + }, + { + "epoch": 1.06, + "grad_norm": 0.2740170288234518, + "learning_rate": 9.825203530403053e-05, + "loss": 1.0086, + "step": 11085 + }, + { + "epoch": 1.06, + "grad_norm": 0.2942696476708363, + "learning_rate": 9.82362182522041e-05, + "loss": 1.0429, + "step": 11086 + }, + { + "epoch": 1.06, + "grad_norm": 0.2873029476303662, + "learning_rate": 9.822040124451737e-05, + "loss": 1.0107, + "step": 11087 + }, + { + "epoch": 1.06, + "grad_norm": 0.31892251959283047, + "learning_rate": 9.820458428136626e-05, + "loss": 1.0427, + "step": 11088 + }, + { + "epoch": 1.06, + "grad_norm": 0.28808478931128667, + "learning_rate": 9.818876736314657e-05, + "loss": 0.9887, + "step": 11089 + }, + { + "epoch": 1.06, + "grad_norm": 0.264114776898352, + "learning_rate": 9.81729504902541e-05, + "loss": 1.0765, + "step": 11090 + }, + { + "epoch": 1.06, + "grad_norm": 0.28883451101795093, + "learning_rate": 9.81571336630847e-05, + "loss": 1.0653, + "step": 11091 + }, + { + "epoch": 1.06, + "grad_norm": 0.31562841505548916, + "learning_rate": 9.814131688203418e-05, + "loss": 0.9555, + "step": 11092 + }, + { + "epoch": 1.06, + "grad_norm": 0.29101266306807544, + "learning_rate": 9.812550014749838e-05, + "loss": 0.989, + "step": 11093 + }, + { + "epoch": 1.06, + "grad_norm": 0.3068587070686998, + "learning_rate": 9.810968345987308e-05, + "loss": 0.945, + "step": 11094 + }, + { + "epoch": 1.06, + "grad_norm": 0.3024077896549214, + "learning_rate": 9.809386681955418e-05, + "loss": 1.004, + "step": 11095 + }, + { + "epoch": 1.06, + "grad_norm": 0.3076740005670801, + "learning_rate": 9.807805022693746e-05, + "loss": 0.9752, + "step": 11096 + }, + { + "epoch": 1.06, + "grad_norm": 0.34647299911504187, + "learning_rate": 9.806223368241874e-05, + "loss": 1.0811, + "step": 11097 + }, + { + "epoch": 1.06, + "grad_norm": 0.2789480695143697, + "learning_rate": 9.804641718639382e-05, + "loss": 1.0362, + "step": 11098 + }, + { + "epoch": 1.06, + "grad_norm": 0.2841137934371156, + "learning_rate": 9.803060073925854e-05, + "loss": 1.0195, + "step": 11099 + }, + { + "epoch": 1.06, + "grad_norm": 0.29502177655393963, + "learning_rate": 9.801478434140872e-05, + "loss": 1.0806, + "step": 11100 + }, + { + "epoch": 1.06, + "grad_norm": 0.2993819942862502, + "learning_rate": 9.799896799324018e-05, + "loss": 1.0495, + "step": 11101 + }, + { + "epoch": 1.06, + "grad_norm": 0.40831615374028807, + "learning_rate": 9.79831516951487e-05, + "loss": 1.0197, + "step": 11102 + }, + { + "epoch": 1.06, + "grad_norm": 0.32820674014380014, + "learning_rate": 9.796733544753014e-05, + "loss": 0.9976, + "step": 11103 + }, + { + "epoch": 1.06, + "grad_norm": 0.3187619206508716, + "learning_rate": 9.795151925078028e-05, + "loss": 1.0653, + "step": 11104 + }, + { + "epoch": 1.06, + "grad_norm": 0.2768627539591872, + "learning_rate": 9.793570310529495e-05, + "loss": 0.9341, + "step": 11105 + }, + { + "epoch": 1.06, + "grad_norm": 0.3516149474448269, + "learning_rate": 9.791988701146995e-05, + "loss": 1.0665, + "step": 11106 + }, + { + "epoch": 1.06, + "grad_norm": 0.3261816798873697, + "learning_rate": 9.790407096970104e-05, + "loss": 1.1082, + "step": 11107 + }, + { + "epoch": 1.06, + "grad_norm": 0.3061480427634875, + "learning_rate": 9.788825498038409e-05, + "loss": 1.0956, + "step": 11108 + }, + { + "epoch": 1.06, + "grad_norm": 0.34703496164923214, + "learning_rate": 9.787243904391493e-05, + "loss": 1.0296, + "step": 11109 + }, + { + "epoch": 1.06, + "grad_norm": 0.2604112768506786, + "learning_rate": 9.785662316068929e-05, + "loss": 0.9273, + "step": 11110 + }, + { + "epoch": 1.06, + "grad_norm": 0.2790159937125621, + "learning_rate": 9.7840807331103e-05, + "loss": 1.0044, + "step": 11111 + }, + { + "epoch": 1.06, + "grad_norm": 0.31414165218748763, + "learning_rate": 9.782499155555188e-05, + "loss": 0.8503, + "step": 11112 + }, + { + "epoch": 1.06, + "grad_norm": 0.3014748462121707, + "learning_rate": 9.78091758344317e-05, + "loss": 0.9112, + "step": 11113 + }, + { + "epoch": 1.06, + "grad_norm": 0.2948288080541336, + "learning_rate": 9.779336016813822e-05, + "loss": 0.9785, + "step": 11114 + }, + { + "epoch": 1.06, + "grad_norm": 0.30036047740686683, + "learning_rate": 9.777754455706736e-05, + "loss": 1.1212, + "step": 11115 + }, + { + "epoch": 1.06, + "grad_norm": 0.2690003670064326, + "learning_rate": 9.776172900161483e-05, + "loss": 1.0631, + "step": 11116 + }, + { + "epoch": 1.06, + "grad_norm": 0.24044108228120348, + "learning_rate": 9.774591350217643e-05, + "loss": 1.0578, + "step": 11117 + }, + { + "epoch": 1.06, + "grad_norm": 0.2999799431842342, + "learning_rate": 9.773009805914796e-05, + "loss": 1.0166, + "step": 11118 + }, + { + "epoch": 1.06, + "grad_norm": 0.3154730850191107, + "learning_rate": 9.771428267292522e-05, + "loss": 1.023, + "step": 11119 + }, + { + "epoch": 1.06, + "grad_norm": 0.30538041366110374, + "learning_rate": 9.769846734390399e-05, + "loss": 0.9414, + "step": 11120 + }, + { + "epoch": 1.06, + "grad_norm": 0.288893573039383, + "learning_rate": 9.768265207248004e-05, + "loss": 0.9002, + "step": 11121 + }, + { + "epoch": 1.06, + "grad_norm": 0.2604082667269524, + "learning_rate": 9.76668368590492e-05, + "loss": 1.0386, + "step": 11122 + }, + { + "epoch": 1.06, + "grad_norm": 0.3109853220586414, + "learning_rate": 9.765102170400722e-05, + "loss": 1.0154, + "step": 11123 + }, + { + "epoch": 1.06, + "grad_norm": 0.3166347080387588, + "learning_rate": 9.763520660774992e-05, + "loss": 1.0493, + "step": 11124 + }, + { + "epoch": 1.06, + "grad_norm": 0.3685856793216986, + "learning_rate": 9.761939157067306e-05, + "loss": 0.987, + "step": 11125 + }, + { + "epoch": 1.06, + "grad_norm": 0.28882033380489214, + "learning_rate": 9.760357659317241e-05, + "loss": 1.0337, + "step": 11126 + }, + { + "epoch": 1.06, + "grad_norm": 0.3120265425578845, + "learning_rate": 9.758776167564378e-05, + "loss": 0.9829, + "step": 11127 + }, + { + "epoch": 1.06, + "grad_norm": 0.35012063990028647, + "learning_rate": 9.757194681848287e-05, + "loss": 1.0791, + "step": 11128 + }, + { + "epoch": 1.06, + "grad_norm": 0.3064204143143254, + "learning_rate": 9.755613202208556e-05, + "loss": 1.0422, + "step": 11129 + }, + { + "epoch": 1.06, + "grad_norm": 0.2877922394730586, + "learning_rate": 9.75403172868476e-05, + "loss": 1.0121, + "step": 11130 + }, + { + "epoch": 1.06, + "grad_norm": 0.3167566670203429, + "learning_rate": 9.752450261316473e-05, + "loss": 1.0494, + "step": 11131 + }, + { + "epoch": 1.07, + "grad_norm": 0.29110558061594977, + "learning_rate": 9.750868800143275e-05, + "loss": 1.1045, + "step": 11132 + }, + { + "epoch": 1.07, + "grad_norm": 0.31017488877050725, + "learning_rate": 9.74928734520474e-05, + "loss": 1.0409, + "step": 11133 + }, + { + "epoch": 1.07, + "grad_norm": 0.26068301710746605, + "learning_rate": 9.747705896540445e-05, + "loss": 1.0578, + "step": 11134 + }, + { + "epoch": 1.07, + "grad_norm": 0.3149583835407991, + "learning_rate": 9.74612445418997e-05, + "loss": 1.1197, + "step": 11135 + }, + { + "epoch": 1.07, + "grad_norm": 0.33060028200728037, + "learning_rate": 9.744543018192892e-05, + "loss": 1.0496, + "step": 11136 + }, + { + "epoch": 1.07, + "grad_norm": 0.3015712151203088, + "learning_rate": 9.742961588588785e-05, + "loss": 0.9791, + "step": 11137 + }, + { + "epoch": 1.07, + "grad_norm": 0.30699336374797065, + "learning_rate": 9.741380165417227e-05, + "loss": 1.0914, + "step": 11138 + }, + { + "epoch": 1.07, + "grad_norm": 0.27705037534240534, + "learning_rate": 9.739798748717791e-05, + "loss": 1.0177, + "step": 11139 + }, + { + "epoch": 1.07, + "grad_norm": 0.3463102933851776, + "learning_rate": 9.738217338530054e-05, + "loss": 1.1802, + "step": 11140 + }, + { + "epoch": 1.07, + "grad_norm": 0.2693236196018057, + "learning_rate": 9.736635934893592e-05, + "loss": 0.9029, + "step": 11141 + }, + { + "epoch": 1.07, + "grad_norm": 0.2742953391068999, + "learning_rate": 9.735054537847983e-05, + "loss": 1.1082, + "step": 11142 + }, + { + "epoch": 1.07, + "grad_norm": 0.30005341773063854, + "learning_rate": 9.7334731474328e-05, + "loss": 1.1169, + "step": 11143 + }, + { + "epoch": 1.07, + "grad_norm": 0.2994606411711211, + "learning_rate": 9.73189176368762e-05, + "loss": 0.9895, + "step": 11144 + }, + { + "epoch": 1.07, + "grad_norm": 0.2774797983966724, + "learning_rate": 9.730310386652016e-05, + "loss": 1.0043, + "step": 11145 + }, + { + "epoch": 1.07, + "grad_norm": 0.3482098025957065, + "learning_rate": 9.728729016365565e-05, + "loss": 1.062, + "step": 11146 + }, + { + "epoch": 1.07, + "grad_norm": 0.27324192801789443, + "learning_rate": 9.727147652867841e-05, + "loss": 1.0363, + "step": 11147 + }, + { + "epoch": 1.07, + "grad_norm": 0.35725466303817166, + "learning_rate": 9.725566296198416e-05, + "loss": 1.0682, + "step": 11148 + }, + { + "epoch": 1.07, + "grad_norm": 0.2886199734465701, + "learning_rate": 9.723984946396868e-05, + "loss": 1.0128, + "step": 11149 + }, + { + "epoch": 1.07, + "grad_norm": 0.304677558800044, + "learning_rate": 9.722403603502772e-05, + "loss": 1.0023, + "step": 11150 + }, + { + "epoch": 1.07, + "grad_norm": 0.2883743473567877, + "learning_rate": 9.7208222675557e-05, + "loss": 1.1518, + "step": 11151 + }, + { + "epoch": 1.07, + "grad_norm": 0.29595505323230814, + "learning_rate": 9.719240938595226e-05, + "loss": 0.8969, + "step": 11152 + }, + { + "epoch": 1.07, + "grad_norm": 0.3466469371923985, + "learning_rate": 9.717659616660925e-05, + "loss": 1.0664, + "step": 11153 + }, + { + "epoch": 1.07, + "grad_norm": 0.314026025263861, + "learning_rate": 9.716078301792368e-05, + "loss": 1.0407, + "step": 11154 + }, + { + "epoch": 1.07, + "grad_norm": 0.3146199075248267, + "learning_rate": 9.714496994029127e-05, + "loss": 1.1167, + "step": 11155 + }, + { + "epoch": 1.07, + "grad_norm": 0.28341525399798034, + "learning_rate": 9.712915693410782e-05, + "loss": 0.9903, + "step": 11156 + }, + { + "epoch": 1.07, + "grad_norm": 0.28013586118444034, + "learning_rate": 9.711334399976902e-05, + "loss": 0.9987, + "step": 11157 + }, + { + "epoch": 1.07, + "grad_norm": 0.32666537813959073, + "learning_rate": 9.709753113767061e-05, + "loss": 0.9513, + "step": 11158 + }, + { + "epoch": 1.07, + "grad_norm": 0.27427287637124265, + "learning_rate": 9.70817183482083e-05, + "loss": 0.9772, + "step": 11159 + }, + { + "epoch": 1.07, + "grad_norm": 0.32650780615141145, + "learning_rate": 9.706590563177779e-05, + "loss": 1.0062, + "step": 11160 + }, + { + "epoch": 1.07, + "grad_norm": 0.2738590241230193, + "learning_rate": 9.705009298877484e-05, + "loss": 1.0544, + "step": 11161 + }, + { + "epoch": 1.07, + "grad_norm": 0.3121780864604334, + "learning_rate": 9.70342804195952e-05, + "loss": 1.072, + "step": 11162 + }, + { + "epoch": 1.07, + "grad_norm": 0.2718689155620133, + "learning_rate": 9.701846792463454e-05, + "loss": 1.0316, + "step": 11163 + }, + { + "epoch": 1.07, + "grad_norm": 0.2754769828086489, + "learning_rate": 9.700265550428859e-05, + "loss": 1.0899, + "step": 11164 + }, + { + "epoch": 1.07, + "grad_norm": 0.33731439637116034, + "learning_rate": 9.698684315895308e-05, + "loss": 1.0327, + "step": 11165 + }, + { + "epoch": 1.07, + "grad_norm": 0.2598872227309607, + "learning_rate": 9.697103088902372e-05, + "loss": 1.0188, + "step": 11166 + }, + { + "epoch": 1.07, + "grad_norm": 0.2734707912985093, + "learning_rate": 9.695521869489621e-05, + "loss": 0.9912, + "step": 11167 + }, + { + "epoch": 1.07, + "grad_norm": 0.3018683842033648, + "learning_rate": 9.693940657696623e-05, + "loss": 0.9341, + "step": 11168 + }, + { + "epoch": 1.07, + "grad_norm": 0.28028745775759023, + "learning_rate": 9.692359453562956e-05, + "loss": 0.9901, + "step": 11169 + }, + { + "epoch": 1.07, + "grad_norm": 0.3796765908279408, + "learning_rate": 9.690778257128186e-05, + "loss": 1.0259, + "step": 11170 + }, + { + "epoch": 1.07, + "grad_norm": 0.3446826361693147, + "learning_rate": 9.689197068431887e-05, + "loss": 1.0049, + "step": 11171 + }, + { + "epoch": 1.07, + "grad_norm": 0.2953349913391379, + "learning_rate": 9.687615887513624e-05, + "loss": 1.0832, + "step": 11172 + }, + { + "epoch": 1.07, + "grad_norm": 0.32453302649409216, + "learning_rate": 9.686034714412973e-05, + "loss": 1.0242, + "step": 11173 + }, + { + "epoch": 1.07, + "grad_norm": 0.2984295286462301, + "learning_rate": 9.684453549169498e-05, + "loss": 1.0139, + "step": 11174 + }, + { + "epoch": 1.07, + "grad_norm": 0.3160435004731973, + "learning_rate": 9.682872391822768e-05, + "loss": 1.1389, + "step": 11175 + }, + { + "epoch": 1.07, + "grad_norm": 0.3147680188934432, + "learning_rate": 9.68129124241236e-05, + "loss": 0.9701, + "step": 11176 + }, + { + "epoch": 1.07, + "grad_norm": 0.27579119776621414, + "learning_rate": 9.679710100977838e-05, + "loss": 1.1025, + "step": 11177 + }, + { + "epoch": 1.07, + "grad_norm": 0.3384401727489251, + "learning_rate": 9.678128967558772e-05, + "loss": 1.0797, + "step": 11178 + }, + { + "epoch": 1.07, + "grad_norm": 0.30725435519827277, + "learning_rate": 9.676547842194732e-05, + "loss": 1.2237, + "step": 11179 + }, + { + "epoch": 1.07, + "grad_norm": 0.2851163836077202, + "learning_rate": 9.674966724925286e-05, + "loss": 1.0074, + "step": 11180 + }, + { + "epoch": 1.07, + "grad_norm": 0.28736070831447036, + "learning_rate": 9.673385615789998e-05, + "loss": 1.1641, + "step": 11181 + }, + { + "epoch": 1.07, + "grad_norm": 0.29216994287778986, + "learning_rate": 9.671804514828441e-05, + "loss": 1.1399, + "step": 11182 + }, + { + "epoch": 1.07, + "grad_norm": 0.3058261226901559, + "learning_rate": 9.670223422080185e-05, + "loss": 1.0444, + "step": 11183 + }, + { + "epoch": 1.07, + "grad_norm": 0.31618062192248253, + "learning_rate": 9.668642337584795e-05, + "loss": 1.1778, + "step": 11184 + }, + { + "epoch": 1.07, + "grad_norm": 0.30362075666215665, + "learning_rate": 9.667061261381835e-05, + "loss": 1.0237, + "step": 11185 + }, + { + "epoch": 1.07, + "grad_norm": 0.27978458981728077, + "learning_rate": 9.665480193510881e-05, + "loss": 1.1427, + "step": 11186 + }, + { + "epoch": 1.07, + "grad_norm": 0.30978604733420473, + "learning_rate": 9.663899134011494e-05, + "loss": 1.0243, + "step": 11187 + }, + { + "epoch": 1.07, + "grad_norm": 0.280201870768336, + "learning_rate": 9.662318082923242e-05, + "loss": 0.9926, + "step": 11188 + }, + { + "epoch": 1.07, + "grad_norm": 0.29002380120886634, + "learning_rate": 9.660737040285688e-05, + "loss": 1.0242, + "step": 11189 + }, + { + "epoch": 1.07, + "grad_norm": 0.3245953784128328, + "learning_rate": 9.659156006138408e-05, + "loss": 1.1044, + "step": 11190 + }, + { + "epoch": 1.07, + "grad_norm": 0.31586315324410397, + "learning_rate": 9.657574980520962e-05, + "loss": 1.0211, + "step": 11191 + }, + { + "epoch": 1.07, + "grad_norm": 0.3013061886658016, + "learning_rate": 9.655993963472918e-05, + "loss": 1.0151, + "step": 11192 + }, + { + "epoch": 1.07, + "grad_norm": 0.3115896648548936, + "learning_rate": 9.654412955033841e-05, + "loss": 1.0365, + "step": 11193 + }, + { + "epoch": 1.07, + "grad_norm": 0.3486842136436551, + "learning_rate": 9.652831955243297e-05, + "loss": 0.9742, + "step": 11194 + }, + { + "epoch": 1.07, + "grad_norm": 0.27626285244761273, + "learning_rate": 9.651250964140848e-05, + "loss": 1.028, + "step": 11195 + }, + { + "epoch": 1.07, + "grad_norm": 0.3024230044933347, + "learning_rate": 9.649669981766067e-05, + "loss": 1.0425, + "step": 11196 + }, + { + "epoch": 1.07, + "grad_norm": 0.30858684215548393, + "learning_rate": 9.648089008158514e-05, + "loss": 1.0814, + "step": 11197 + }, + { + "epoch": 1.07, + "grad_norm": 0.29707180999984284, + "learning_rate": 9.646508043357754e-05, + "loss": 1.076, + "step": 11198 + }, + { + "epoch": 1.07, + "grad_norm": 0.31667657891620526, + "learning_rate": 9.644927087403354e-05, + "loss": 1.1039, + "step": 11199 + }, + { + "epoch": 1.07, + "grad_norm": 0.2973083720036791, + "learning_rate": 9.643346140334876e-05, + "loss": 0.9704, + "step": 11200 + }, + { + "epoch": 1.07, + "grad_norm": 0.33945583294515436, + "learning_rate": 9.641765202191883e-05, + "loss": 1.0945, + "step": 11201 + }, + { + "epoch": 1.07, + "grad_norm": 0.27948217368979844, + "learning_rate": 9.640184273013942e-05, + "loss": 1.0741, + "step": 11202 + }, + { + "epoch": 1.07, + "grad_norm": 0.32175426842100446, + "learning_rate": 9.638603352840616e-05, + "loss": 1.0712, + "step": 11203 + }, + { + "epoch": 1.07, + "grad_norm": 0.3441623070581641, + "learning_rate": 9.63702244171147e-05, + "loss": 0.9833, + "step": 11204 + }, + { + "epoch": 1.07, + "grad_norm": 0.33854289772962076, + "learning_rate": 9.635441539666061e-05, + "loss": 0.9807, + "step": 11205 + }, + { + "epoch": 1.07, + "grad_norm": 0.3140299225351834, + "learning_rate": 9.633860646743961e-05, + "loss": 1.0847, + "step": 11206 + }, + { + "epoch": 1.07, + "grad_norm": 0.28940128392157494, + "learning_rate": 9.632279762984727e-05, + "loss": 1.1553, + "step": 11207 + }, + { + "epoch": 1.07, + "grad_norm": 0.3444340326709395, + "learning_rate": 9.630698888427924e-05, + "loss": 0.9758, + "step": 11208 + }, + { + "epoch": 1.07, + "grad_norm": 0.280625737815087, + "learning_rate": 9.629118023113109e-05, + "loss": 0.9314, + "step": 11209 + }, + { + "epoch": 1.07, + "grad_norm": 0.31728811890232406, + "learning_rate": 9.627537167079852e-05, + "loss": 0.9531, + "step": 11210 + }, + { + "epoch": 1.07, + "grad_norm": 0.31632896302195185, + "learning_rate": 9.625956320367712e-05, + "loss": 1.0399, + "step": 11211 + }, + { + "epoch": 1.07, + "grad_norm": 0.3205225044699653, + "learning_rate": 9.624375483016252e-05, + "loss": 0.9811, + "step": 11212 + }, + { + "epoch": 1.07, + "grad_norm": 0.3011912476784293, + "learning_rate": 9.622794655065028e-05, + "loss": 0.9514, + "step": 11213 + }, + { + "epoch": 1.07, + "grad_norm": 0.37595951086341384, + "learning_rate": 9.621213836553608e-05, + "loss": 1.0572, + "step": 11214 + }, + { + "epoch": 1.07, + "grad_norm": 0.2935027925461575, + "learning_rate": 9.619633027521549e-05, + "loss": 0.944, + "step": 11215 + }, + { + "epoch": 1.07, + "grad_norm": 0.30123605888536614, + "learning_rate": 9.618052228008407e-05, + "loss": 0.8717, + "step": 11216 + }, + { + "epoch": 1.07, + "grad_norm": 0.30420875661332886, + "learning_rate": 9.616471438053752e-05, + "loss": 0.9572, + "step": 11217 + }, + { + "epoch": 1.07, + "grad_norm": 0.288761969641041, + "learning_rate": 9.614890657697143e-05, + "loss": 1.108, + "step": 11218 + }, + { + "epoch": 1.07, + "grad_norm": 0.35771443143229414, + "learning_rate": 9.613309886978135e-05, + "loss": 0.9237, + "step": 11219 + }, + { + "epoch": 1.07, + "grad_norm": 0.311300205332723, + "learning_rate": 9.61172912593629e-05, + "loss": 0.8953, + "step": 11220 + }, + { + "epoch": 1.07, + "grad_norm": 0.3265059826367189, + "learning_rate": 9.61014837461117e-05, + "loss": 1.0096, + "step": 11221 + }, + { + "epoch": 1.07, + "grad_norm": 0.2718517730473976, + "learning_rate": 9.608567633042325e-05, + "loss": 0.9915, + "step": 11222 + }, + { + "epoch": 1.07, + "grad_norm": 0.3142834511718656, + "learning_rate": 9.606986901269327e-05, + "loss": 1.049, + "step": 11223 + }, + { + "epoch": 1.07, + "grad_norm": 0.27209691680220205, + "learning_rate": 9.605406179331727e-05, + "loss": 1.0594, + "step": 11224 + }, + { + "epoch": 1.07, + "grad_norm": 0.30389846297233947, + "learning_rate": 9.603825467269087e-05, + "loss": 1.0593, + "step": 11225 + }, + { + "epoch": 1.07, + "grad_norm": 0.3201865087266764, + "learning_rate": 9.602244765120961e-05, + "loss": 1.0534, + "step": 11226 + }, + { + "epoch": 1.07, + "grad_norm": 0.27393420573867794, + "learning_rate": 9.600664072926912e-05, + "loss": 1.0087, + "step": 11227 + }, + { + "epoch": 1.07, + "grad_norm": 0.3128851492377119, + "learning_rate": 9.599083390726496e-05, + "loss": 1.1419, + "step": 11228 + }, + { + "epoch": 1.07, + "grad_norm": 0.2589052009872461, + "learning_rate": 9.597502718559267e-05, + "loss": 1.0095, + "step": 11229 + }, + { + "epoch": 1.07, + "grad_norm": 0.28529030218411794, + "learning_rate": 9.595922056464785e-05, + "loss": 0.8703, + "step": 11230 + }, + { + "epoch": 1.07, + "grad_norm": 0.3219566367545767, + "learning_rate": 9.594341404482612e-05, + "loss": 1.0301, + "step": 11231 + }, + { + "epoch": 1.07, + "grad_norm": 0.33699602329220935, + "learning_rate": 9.592760762652298e-05, + "loss": 1.0419, + "step": 11232 + }, + { + "epoch": 1.07, + "grad_norm": 0.29538533428142194, + "learning_rate": 9.591180131013403e-05, + "loss": 1.1378, + "step": 11233 + }, + { + "epoch": 1.07, + "grad_norm": 0.3178515614745168, + "learning_rate": 9.589599509605482e-05, + "loss": 0.9248, + "step": 11234 + }, + { + "epoch": 1.07, + "grad_norm": 0.2812402415214333, + "learning_rate": 9.58801889846809e-05, + "loss": 1.0483, + "step": 11235 + }, + { + "epoch": 1.07, + "grad_norm": 0.33886486066038896, + "learning_rate": 9.586438297640782e-05, + "loss": 1.0654, + "step": 11236 + }, + { + "epoch": 1.08, + "grad_norm": 0.2862778908207863, + "learning_rate": 9.584857707163118e-05, + "loss": 1.0667, + "step": 11237 + }, + { + "epoch": 1.08, + "grad_norm": 0.27208864632524254, + "learning_rate": 9.583277127074651e-05, + "loss": 0.982, + "step": 11238 + }, + { + "epoch": 1.08, + "grad_norm": 0.31863568359855954, + "learning_rate": 9.581696557414937e-05, + "loss": 1.0643, + "step": 11239 + }, + { + "epoch": 1.08, + "grad_norm": 0.3642044782084103, + "learning_rate": 9.580115998223528e-05, + "loss": 1.0146, + "step": 11240 + }, + { + "epoch": 1.08, + "grad_norm": 0.3193198576988049, + "learning_rate": 9.578535449539981e-05, + "loss": 0.9768, + "step": 11241 + }, + { + "epoch": 1.08, + "grad_norm": 0.2928041235114697, + "learning_rate": 9.576954911403847e-05, + "loss": 1.0559, + "step": 11242 + }, + { + "epoch": 1.08, + "grad_norm": 0.28790622901990937, + "learning_rate": 9.575374383854682e-05, + "loss": 0.975, + "step": 11243 + }, + { + "epoch": 1.08, + "grad_norm": 0.2932271986158786, + "learning_rate": 9.573793866932042e-05, + "loss": 1.0007, + "step": 11244 + }, + { + "epoch": 1.08, + "grad_norm": 0.2983555987359424, + "learning_rate": 9.572213360675479e-05, + "loss": 1.0497, + "step": 11245 + }, + { + "epoch": 1.08, + "grad_norm": 0.2961792428270764, + "learning_rate": 9.570632865124541e-05, + "loss": 1.0835, + "step": 11246 + }, + { + "epoch": 1.08, + "grad_norm": 0.30760872321979127, + "learning_rate": 9.56905238031879e-05, + "loss": 0.921, + "step": 11247 + }, + { + "epoch": 1.08, + "grad_norm": 0.3428824785594951, + "learning_rate": 9.567471906297773e-05, + "loss": 0.9248, + "step": 11248 + }, + { + "epoch": 1.08, + "grad_norm": 0.3336663531021088, + "learning_rate": 9.565891443101044e-05, + "loss": 1.18, + "step": 11249 + }, + { + "epoch": 1.08, + "grad_norm": 0.33961226685835655, + "learning_rate": 9.56431099076815e-05, + "loss": 0.8973, + "step": 11250 + }, + { + "epoch": 1.08, + "grad_norm": 0.31372285477080086, + "learning_rate": 9.562730549338653e-05, + "loss": 1.0866, + "step": 11251 + }, + { + "epoch": 1.08, + "grad_norm": 0.3190631671287586, + "learning_rate": 9.561150118852096e-05, + "loss": 1.0845, + "step": 11252 + }, + { + "epoch": 1.08, + "grad_norm": 0.3002816007603442, + "learning_rate": 9.559569699348034e-05, + "loss": 1.0991, + "step": 11253 + }, + { + "epoch": 1.08, + "grad_norm": 0.26516518152169, + "learning_rate": 9.557989290866018e-05, + "loss": 0.9682, + "step": 11254 + }, + { + "epoch": 1.08, + "grad_norm": 0.3104188717630717, + "learning_rate": 9.556408893445596e-05, + "loss": 1.0069, + "step": 11255 + }, + { + "epoch": 1.08, + "grad_norm": 0.30275435422266955, + "learning_rate": 9.554828507126317e-05, + "loss": 1.018, + "step": 11256 + }, + { + "epoch": 1.08, + "grad_norm": 0.326419989604932, + "learning_rate": 9.553248131947739e-05, + "loss": 1.138, + "step": 11257 + }, + { + "epoch": 1.08, + "grad_norm": 0.30564644079207814, + "learning_rate": 9.551667767949408e-05, + "loss": 0.9481, + "step": 11258 + }, + { + "epoch": 1.08, + "grad_norm": 0.2860117393125948, + "learning_rate": 9.550087415170872e-05, + "loss": 1.0854, + "step": 11259 + }, + { + "epoch": 1.08, + "grad_norm": 0.2964694974401944, + "learning_rate": 9.548507073651681e-05, + "loss": 1.0739, + "step": 11260 + }, + { + "epoch": 1.08, + "grad_norm": 0.2899974288418561, + "learning_rate": 9.546926743431385e-05, + "loss": 0.9728, + "step": 11261 + }, + { + "epoch": 1.08, + "grad_norm": 0.3080073605457522, + "learning_rate": 9.54534642454953e-05, + "loss": 1.1175, + "step": 11262 + }, + { + "epoch": 1.08, + "grad_norm": 0.2754666024285171, + "learning_rate": 9.543766117045664e-05, + "loss": 0.8522, + "step": 11263 + }, + { + "epoch": 1.08, + "grad_norm": 0.31541168491908067, + "learning_rate": 9.542185820959342e-05, + "loss": 0.9325, + "step": 11264 + }, + { + "epoch": 1.08, + "grad_norm": 0.3147013461299667, + "learning_rate": 9.540605536330108e-05, + "loss": 1.0921, + "step": 11265 + }, + { + "epoch": 1.08, + "grad_norm": 0.30910790630492974, + "learning_rate": 9.539025263197508e-05, + "loss": 1.027, + "step": 11266 + }, + { + "epoch": 1.08, + "grad_norm": 0.30731822848171453, + "learning_rate": 9.537445001601088e-05, + "loss": 0.9955, + "step": 11267 + }, + { + "epoch": 1.08, + "grad_norm": 0.3386003125887747, + "learning_rate": 9.5358647515804e-05, + "loss": 1.0671, + "step": 11268 + }, + { + "epoch": 1.08, + "grad_norm": 0.2841475028417147, + "learning_rate": 9.53428451317499e-05, + "loss": 0.9961, + "step": 11269 + }, + { + "epoch": 1.08, + "grad_norm": 0.31603121587153393, + "learning_rate": 9.532704286424398e-05, + "loss": 1.0922, + "step": 11270 + }, + { + "epoch": 1.08, + "grad_norm": 0.28410543881471806, + "learning_rate": 9.531124071368176e-05, + "loss": 1.0448, + "step": 11271 + }, + { + "epoch": 1.08, + "grad_norm": 0.2962057475008243, + "learning_rate": 9.52954386804587e-05, + "loss": 0.9652, + "step": 11272 + }, + { + "epoch": 1.08, + "grad_norm": 0.31901956254905467, + "learning_rate": 9.527963676497025e-05, + "loss": 1.0494, + "step": 11273 + }, + { + "epoch": 1.08, + "grad_norm": 0.33820716310499843, + "learning_rate": 9.526383496761187e-05, + "loss": 1.0535, + "step": 11274 + }, + { + "epoch": 1.08, + "grad_norm": 0.30806458766268185, + "learning_rate": 9.5248033288779e-05, + "loss": 1.0063, + "step": 11275 + }, + { + "epoch": 1.08, + "grad_norm": 0.276102310015319, + "learning_rate": 9.523223172886706e-05, + "loss": 1.0177, + "step": 11276 + }, + { + "epoch": 1.08, + "grad_norm": 0.29222231731100207, + "learning_rate": 9.521643028827149e-05, + "loss": 1.0704, + "step": 11277 + }, + { + "epoch": 1.08, + "grad_norm": 0.30019502082120103, + "learning_rate": 9.52006289673878e-05, + "loss": 1.0228, + "step": 11278 + }, + { + "epoch": 1.08, + "grad_norm": 0.3101052461989163, + "learning_rate": 9.518482776661137e-05, + "loss": 1.1038, + "step": 11279 + }, + { + "epoch": 1.08, + "grad_norm": 0.3174914779014836, + "learning_rate": 9.516902668633767e-05, + "loss": 0.9325, + "step": 11280 + }, + { + "epoch": 1.08, + "grad_norm": 0.3196488171490318, + "learning_rate": 9.515322572696211e-05, + "loss": 1.0235, + "step": 11281 + }, + { + "epoch": 1.08, + "grad_norm": 0.291647365183429, + "learning_rate": 9.513742488888011e-05, + "loss": 1.0026, + "step": 11282 + }, + { + "epoch": 1.08, + "grad_norm": 0.27147211541990285, + "learning_rate": 9.512162417248708e-05, + "loss": 1.0467, + "step": 11283 + }, + { + "epoch": 1.08, + "grad_norm": 0.3118410180966839, + "learning_rate": 9.51058235781785e-05, + "loss": 0.9806, + "step": 11284 + }, + { + "epoch": 1.08, + "grad_norm": 0.2934804593757332, + "learning_rate": 9.509002310634977e-05, + "loss": 0.991, + "step": 11285 + }, + { + "epoch": 1.08, + "grad_norm": 0.2889154724387504, + "learning_rate": 9.50742227573963e-05, + "loss": 0.8904, + "step": 11286 + }, + { + "epoch": 1.08, + "grad_norm": 0.28459368028028376, + "learning_rate": 9.505842253171349e-05, + "loss": 1.0417, + "step": 11287 + }, + { + "epoch": 1.08, + "grad_norm": 0.2728441819275197, + "learning_rate": 9.504262242969676e-05, + "loss": 1.0601, + "step": 11288 + }, + { + "epoch": 1.08, + "grad_norm": 0.30646180068543477, + "learning_rate": 9.502682245174154e-05, + "loss": 0.9853, + "step": 11289 + }, + { + "epoch": 1.08, + "grad_norm": 0.2949158274937865, + "learning_rate": 9.501102259824317e-05, + "loss": 0.9912, + "step": 11290 + }, + { + "epoch": 1.08, + "grad_norm": 0.3002443993751763, + "learning_rate": 9.499522286959711e-05, + "loss": 1.0374, + "step": 11291 + }, + { + "epoch": 1.08, + "grad_norm": 0.34345123541204026, + "learning_rate": 9.497942326619877e-05, + "loss": 0.9417, + "step": 11292 + }, + { + "epoch": 1.08, + "grad_norm": 0.29534934246112376, + "learning_rate": 9.49636237884435e-05, + "loss": 1.0331, + "step": 11293 + }, + { + "epoch": 1.08, + "grad_norm": 0.2874464931044368, + "learning_rate": 9.494782443672672e-05, + "loss": 1.1295, + "step": 11294 + }, + { + "epoch": 1.08, + "grad_norm": 0.2786044430189825, + "learning_rate": 9.49320252114438e-05, + "loss": 0.9808, + "step": 11295 + }, + { + "epoch": 1.08, + "grad_norm": 0.28824221732457467, + "learning_rate": 9.491622611299015e-05, + "loss": 1.0217, + "step": 11296 + }, + { + "epoch": 1.08, + "grad_norm": 0.32504383547596655, + "learning_rate": 9.490042714176108e-05, + "loss": 0.9224, + "step": 11297 + }, + { + "epoch": 1.08, + "grad_norm": 0.3266615912759899, + "learning_rate": 9.488462829815208e-05, + "loss": 0.9361, + "step": 11298 + }, + { + "epoch": 1.08, + "grad_norm": 0.29180458152583866, + "learning_rate": 9.486882958255848e-05, + "loss": 0.9602, + "step": 11299 + }, + { + "epoch": 1.08, + "grad_norm": 0.3292567151578694, + "learning_rate": 9.485303099537561e-05, + "loss": 1.0342, + "step": 11300 + }, + { + "epoch": 1.08, + "grad_norm": 0.26317473051977797, + "learning_rate": 9.48372325369989e-05, + "loss": 0.9519, + "step": 11301 + }, + { + "epoch": 1.08, + "grad_norm": 0.31524973393662514, + "learning_rate": 9.482143420782369e-05, + "loss": 1.0454, + "step": 11302 + }, + { + "epoch": 1.08, + "grad_norm": 0.30470530396183576, + "learning_rate": 9.480563600824533e-05, + "loss": 1.0452, + "step": 11303 + }, + { + "epoch": 1.08, + "grad_norm": 0.3036942172095975, + "learning_rate": 9.478983793865916e-05, + "loss": 1.0253, + "step": 11304 + }, + { + "epoch": 1.08, + "grad_norm": 0.5346387191663518, + "learning_rate": 9.477403999946062e-05, + "loss": 1.0048, + "step": 11305 + }, + { + "epoch": 1.08, + "grad_norm": 0.3143426493365793, + "learning_rate": 9.4758242191045e-05, + "loss": 1.138, + "step": 11306 + }, + { + "epoch": 1.08, + "grad_norm": 0.3327574551301448, + "learning_rate": 9.474244451380767e-05, + "loss": 1.0821, + "step": 11307 + }, + { + "epoch": 1.08, + "grad_norm": 0.27545403823517156, + "learning_rate": 9.472664696814395e-05, + "loss": 0.955, + "step": 11308 + }, + { + "epoch": 1.08, + "grad_norm": 0.30457274560929365, + "learning_rate": 9.471084955444922e-05, + "loss": 1.0261, + "step": 11309 + }, + { + "epoch": 1.08, + "grad_norm": 0.2790843677303593, + "learning_rate": 9.469505227311882e-05, + "loss": 1.025, + "step": 11310 + }, + { + "epoch": 1.08, + "grad_norm": 0.2970597639202619, + "learning_rate": 9.467925512454803e-05, + "loss": 1.0451, + "step": 11311 + }, + { + "epoch": 1.08, + "grad_norm": 0.31171701993420287, + "learning_rate": 9.466345810913222e-05, + "loss": 0.9135, + "step": 11312 + }, + { + "epoch": 1.08, + "grad_norm": 0.27993251925042006, + "learning_rate": 9.464766122726676e-05, + "loss": 0.9887, + "step": 11313 + }, + { + "epoch": 1.08, + "grad_norm": 0.2922667032538277, + "learning_rate": 9.463186447934695e-05, + "loss": 1.0199, + "step": 11314 + }, + { + "epoch": 1.08, + "grad_norm": 0.3467964501244379, + "learning_rate": 9.461606786576809e-05, + "loss": 0.9126, + "step": 11315 + }, + { + "epoch": 1.08, + "grad_norm": 0.32272308778885384, + "learning_rate": 9.460027138692551e-05, + "loss": 0.973, + "step": 11316 + }, + { + "epoch": 1.08, + "grad_norm": 0.2957010820081167, + "learning_rate": 9.458447504321451e-05, + "loss": 1.0252, + "step": 11317 + }, + { + "epoch": 1.08, + "grad_norm": 0.3153203095395208, + "learning_rate": 9.456867883503046e-05, + "loss": 1.0042, + "step": 11318 + }, + { + "epoch": 1.08, + "grad_norm": 0.28735329855136416, + "learning_rate": 9.455288276276863e-05, + "loss": 1.0259, + "step": 11319 + }, + { + "epoch": 1.08, + "grad_norm": 0.30615238060809274, + "learning_rate": 9.453708682682433e-05, + "loss": 1.0052, + "step": 11320 + }, + { + "epoch": 1.08, + "grad_norm": 0.34818462613776774, + "learning_rate": 9.452129102759288e-05, + "loss": 1.1266, + "step": 11321 + }, + { + "epoch": 1.08, + "grad_norm": 0.31179219468032693, + "learning_rate": 9.450549536546955e-05, + "loss": 1.0085, + "step": 11322 + }, + { + "epoch": 1.08, + "grad_norm": 0.3597898869474432, + "learning_rate": 9.448969984084966e-05, + "loss": 1.0189, + "step": 11323 + }, + { + "epoch": 1.08, + "grad_norm": 0.31339958823715486, + "learning_rate": 9.447390445412844e-05, + "loss": 1.0094, + "step": 11324 + }, + { + "epoch": 1.08, + "grad_norm": 0.3129387342093304, + "learning_rate": 9.44581092057013e-05, + "loss": 0.9364, + "step": 11325 + }, + { + "epoch": 1.08, + "grad_norm": 0.30837591831156597, + "learning_rate": 9.444231409596343e-05, + "loss": 1.017, + "step": 11326 + }, + { + "epoch": 1.08, + "grad_norm": 0.2867113652014837, + "learning_rate": 9.442651912531017e-05, + "loss": 1.0343, + "step": 11327 + }, + { + "epoch": 1.08, + "grad_norm": 0.2689799780211522, + "learning_rate": 9.441072429413675e-05, + "loss": 1.0119, + "step": 11328 + }, + { + "epoch": 1.08, + "grad_norm": 0.34454332476688837, + "learning_rate": 9.439492960283846e-05, + "loss": 1.0482, + "step": 11329 + }, + { + "epoch": 1.08, + "grad_norm": 0.31595660809197285, + "learning_rate": 9.43791350518106e-05, + "loss": 0.8727, + "step": 11330 + }, + { + "epoch": 1.08, + "grad_norm": 0.2960284605225842, + "learning_rate": 9.436334064144839e-05, + "loss": 1.0367, + "step": 11331 + }, + { + "epoch": 1.08, + "grad_norm": 0.3067019582636595, + "learning_rate": 9.434754637214713e-05, + "loss": 1.028, + "step": 11332 + }, + { + "epoch": 1.08, + "grad_norm": 0.346300091533587, + "learning_rate": 9.433175224430209e-05, + "loss": 1.0285, + "step": 11333 + }, + { + "epoch": 1.08, + "grad_norm": 0.3286939552905764, + "learning_rate": 9.431595825830851e-05, + "loss": 1.0644, + "step": 11334 + }, + { + "epoch": 1.08, + "grad_norm": 0.3441616179095328, + "learning_rate": 9.430016441456165e-05, + "loss": 0.9362, + "step": 11335 + }, + { + "epoch": 1.08, + "grad_norm": 0.29040310171735395, + "learning_rate": 9.428437071345677e-05, + "loss": 1.0723, + "step": 11336 + }, + { + "epoch": 1.08, + "grad_norm": 0.2821602761958069, + "learning_rate": 9.426857715538909e-05, + "loss": 1.0848, + "step": 11337 + }, + { + "epoch": 1.08, + "grad_norm": 0.27715265286944324, + "learning_rate": 9.425278374075383e-05, + "loss": 1.0501, + "step": 11338 + }, + { + "epoch": 1.08, + "grad_norm": 0.31061090102509803, + "learning_rate": 9.423699046994632e-05, + "loss": 1.027, + "step": 11339 + }, + { + "epoch": 1.08, + "grad_norm": 0.2854829127526615, + "learning_rate": 9.422119734336174e-05, + "loss": 1.1199, + "step": 11340 + }, + { + "epoch": 1.09, + "grad_norm": 0.29084146648661213, + "learning_rate": 9.420540436139532e-05, + "loss": 1.0736, + "step": 11341 + }, + { + "epoch": 1.09, + "grad_norm": 0.2792119116253644, + "learning_rate": 9.41896115244423e-05, + "loss": 0.9902, + "step": 11342 + }, + { + "epoch": 1.09, + "grad_norm": 0.33600519144110635, + "learning_rate": 9.417381883289791e-05, + "loss": 1.0085, + "step": 11343 + }, + { + "epoch": 1.09, + "grad_norm": 0.28063651243466686, + "learning_rate": 9.415802628715732e-05, + "loss": 1.0405, + "step": 11344 + }, + { + "epoch": 1.09, + "grad_norm": 0.34808798033394683, + "learning_rate": 9.414223388761583e-05, + "loss": 1.0148, + "step": 11345 + }, + { + "epoch": 1.09, + "grad_norm": 0.3368144168754586, + "learning_rate": 9.412644163466863e-05, + "loss": 0.9782, + "step": 11346 + }, + { + "epoch": 1.09, + "grad_norm": 0.3216670237526899, + "learning_rate": 9.411064952871092e-05, + "loss": 0.9462, + "step": 11347 + }, + { + "epoch": 1.09, + "grad_norm": 0.2813803712641204, + "learning_rate": 9.409485757013789e-05, + "loss": 1.0282, + "step": 11348 + }, + { + "epoch": 1.09, + "grad_norm": 0.28475782305840425, + "learning_rate": 9.407906575934475e-05, + "loss": 1.0004, + "step": 11349 + }, + { + "epoch": 1.09, + "grad_norm": 0.3227036737981328, + "learning_rate": 9.406327409672673e-05, + "loss": 1.0156, + "step": 11350 + }, + { + "epoch": 1.09, + "grad_norm": 0.29786238308535357, + "learning_rate": 9.4047482582679e-05, + "loss": 0.9632, + "step": 11351 + }, + { + "epoch": 1.09, + "grad_norm": 0.2864203386878321, + "learning_rate": 9.403169121759675e-05, + "loss": 1.0202, + "step": 11352 + }, + { + "epoch": 1.09, + "grad_norm": 0.26297171784196716, + "learning_rate": 9.401590000187518e-05, + "loss": 1.1274, + "step": 11353 + }, + { + "epoch": 1.09, + "grad_norm": 0.2668862522598809, + "learning_rate": 9.40001089359095e-05, + "loss": 0.9523, + "step": 11354 + }, + { + "epoch": 1.09, + "grad_norm": 0.3430070050906079, + "learning_rate": 9.398431802009484e-05, + "loss": 1.1081, + "step": 11355 + }, + { + "epoch": 1.09, + "grad_norm": 0.29441500082460426, + "learning_rate": 9.396852725482642e-05, + "loss": 1.0317, + "step": 11356 + }, + { + "epoch": 1.09, + "grad_norm": 0.26867609023926886, + "learning_rate": 9.39527366404994e-05, + "loss": 1.0126, + "step": 11357 + }, + { + "epoch": 1.09, + "grad_norm": 0.3256555743057207, + "learning_rate": 9.393694617750889e-05, + "loss": 0.9547, + "step": 11358 + }, + { + "epoch": 1.09, + "grad_norm": 0.3483720479097684, + "learning_rate": 9.392115586625016e-05, + "loss": 0.9336, + "step": 11359 + }, + { + "epoch": 1.09, + "grad_norm": 0.3091207023329838, + "learning_rate": 9.390536570711832e-05, + "loss": 1.016, + "step": 11360 + }, + { + "epoch": 1.09, + "grad_norm": 0.32129910140386914, + "learning_rate": 9.388957570050854e-05, + "loss": 1.0441, + "step": 11361 + }, + { + "epoch": 1.09, + "grad_norm": 0.3458771120761256, + "learning_rate": 9.387378584681598e-05, + "loss": 1.0524, + "step": 11362 + }, + { + "epoch": 1.09, + "grad_norm": 0.3204588688123865, + "learning_rate": 9.385799614643576e-05, + "loss": 1.0433, + "step": 11363 + }, + { + "epoch": 1.09, + "grad_norm": 0.2848065439528938, + "learning_rate": 9.384220659976305e-05, + "loss": 1.0546, + "step": 11364 + }, + { + "epoch": 1.09, + "grad_norm": 0.3093210062084415, + "learning_rate": 9.382641720719295e-05, + "loss": 1.1099, + "step": 11365 + }, + { + "epoch": 1.09, + "grad_norm": 0.2739357378603301, + "learning_rate": 9.381062796912068e-05, + "loss": 0.9881, + "step": 11366 + }, + { + "epoch": 1.09, + "grad_norm": 0.3136317607534424, + "learning_rate": 9.379483888594133e-05, + "loss": 0.9967, + "step": 11367 + }, + { + "epoch": 1.09, + "grad_norm": 0.28011192674245033, + "learning_rate": 9.377904995805006e-05, + "loss": 1.0224, + "step": 11368 + }, + { + "epoch": 1.09, + "grad_norm": 0.3166833432706861, + "learning_rate": 9.376326118584196e-05, + "loss": 0.9859, + "step": 11369 + }, + { + "epoch": 1.09, + "grad_norm": 0.27588998656466385, + "learning_rate": 9.374747256971214e-05, + "loss": 1.067, + "step": 11370 + }, + { + "epoch": 1.09, + "grad_norm": 0.2818186446126681, + "learning_rate": 9.373168411005577e-05, + "loss": 1.0404, + "step": 11371 + }, + { + "epoch": 1.09, + "grad_norm": 0.3353788855470054, + "learning_rate": 9.371589580726794e-05, + "loss": 1.0473, + "step": 11372 + }, + { + "epoch": 1.09, + "grad_norm": 0.3000641889791246, + "learning_rate": 9.370010766174378e-05, + "loss": 1.043, + "step": 11373 + }, + { + "epoch": 1.09, + "grad_norm": 0.25729686117403217, + "learning_rate": 9.368431967387836e-05, + "loss": 0.8858, + "step": 11374 + }, + { + "epoch": 1.09, + "grad_norm": 0.3686860884663247, + "learning_rate": 9.366853184406683e-05, + "loss": 1.099, + "step": 11375 + }, + { + "epoch": 1.09, + "grad_norm": 0.2687120423883095, + "learning_rate": 9.365274417270427e-05, + "loss": 1.0448, + "step": 11376 + }, + { + "epoch": 1.09, + "grad_norm": 0.3267753269788522, + "learning_rate": 9.363695666018577e-05, + "loss": 1.0987, + "step": 11377 + }, + { + "epoch": 1.09, + "grad_norm": 0.3143760562385636, + "learning_rate": 9.36211693069064e-05, + "loss": 1.1225, + "step": 11378 + }, + { + "epoch": 1.09, + "grad_norm": 0.30618025888651784, + "learning_rate": 9.36053821132613e-05, + "loss": 1.0366, + "step": 11379 + }, + { + "epoch": 1.09, + "grad_norm": 0.274474680102085, + "learning_rate": 9.358959507964555e-05, + "loss": 1.0066, + "step": 11380 + }, + { + "epoch": 1.09, + "grad_norm": 0.3362406133050426, + "learning_rate": 9.357380820645422e-05, + "loss": 1.0135, + "step": 11381 + }, + { + "epoch": 1.09, + "grad_norm": 0.28042700080219135, + "learning_rate": 9.355802149408235e-05, + "loss": 1.0675, + "step": 11382 + }, + { + "epoch": 1.09, + "grad_norm": 0.2949732912479607, + "learning_rate": 9.354223494292507e-05, + "loss": 1.0539, + "step": 11383 + }, + { + "epoch": 1.09, + "grad_norm": 0.28388179921490925, + "learning_rate": 9.35264485533774e-05, + "loss": 1.0232, + "step": 11384 + }, + { + "epoch": 1.09, + "grad_norm": 0.3005951726639967, + "learning_rate": 9.351066232583439e-05, + "loss": 0.9737, + "step": 11385 + }, + { + "epoch": 1.09, + "grad_norm": 0.34345652040216745, + "learning_rate": 9.349487626069116e-05, + "loss": 1.0533, + "step": 11386 + }, + { + "epoch": 1.09, + "grad_norm": 0.30727935140779933, + "learning_rate": 9.347909035834276e-05, + "loss": 0.9171, + "step": 11387 + }, + { + "epoch": 1.09, + "grad_norm": 0.28567339519252133, + "learning_rate": 9.346330461918422e-05, + "loss": 1.0498, + "step": 11388 + }, + { + "epoch": 1.09, + "grad_norm": 0.24439161354431146, + "learning_rate": 9.344751904361057e-05, + "loss": 0.9598, + "step": 11389 + }, + { + "epoch": 1.09, + "grad_norm": 0.2558864794457524, + "learning_rate": 9.343173363201686e-05, + "loss": 0.8985, + "step": 11390 + }, + { + "epoch": 1.09, + "grad_norm": 0.3095327105788236, + "learning_rate": 9.34159483847982e-05, + "loss": 1.0948, + "step": 11391 + }, + { + "epoch": 1.09, + "grad_norm": 0.2725700808941792, + "learning_rate": 9.340016330234951e-05, + "loss": 1.1604, + "step": 11392 + }, + { + "epoch": 1.09, + "grad_norm": 0.324499042847467, + "learning_rate": 9.33843783850659e-05, + "loss": 1.0637, + "step": 11393 + }, + { + "epoch": 1.09, + "grad_norm": 0.2802496266920151, + "learning_rate": 9.336859363334238e-05, + "loss": 1.055, + "step": 11394 + }, + { + "epoch": 1.09, + "grad_norm": 0.30263930109196974, + "learning_rate": 9.335280904757398e-05, + "loss": 0.9199, + "step": 11395 + }, + { + "epoch": 1.09, + "grad_norm": 0.2981791166657182, + "learning_rate": 9.333702462815572e-05, + "loss": 1.1352, + "step": 11396 + }, + { + "epoch": 1.09, + "grad_norm": 0.329400949774662, + "learning_rate": 9.33212403754826e-05, + "loss": 0.9752, + "step": 11397 + }, + { + "epoch": 1.09, + "grad_norm": 0.290464765622106, + "learning_rate": 9.330545628994963e-05, + "loss": 1.0982, + "step": 11398 + }, + { + "epoch": 1.09, + "grad_norm": 0.3037067873332023, + "learning_rate": 9.32896723719518e-05, + "loss": 1.0474, + "step": 11399 + }, + { + "epoch": 1.09, + "grad_norm": 0.30171440149763074, + "learning_rate": 9.327388862188417e-05, + "loss": 1.0026, + "step": 11400 + }, + { + "epoch": 1.09, + "grad_norm": 0.2990361636281973, + "learning_rate": 9.32581050401417e-05, + "loss": 1.0549, + "step": 11401 + }, + { + "epoch": 1.09, + "grad_norm": 0.3178701545686714, + "learning_rate": 9.324232162711939e-05, + "loss": 1.0622, + "step": 11402 + }, + { + "epoch": 1.09, + "grad_norm": 0.3084387465255457, + "learning_rate": 9.322653838321223e-05, + "loss": 1.1182, + "step": 11403 + }, + { + "epoch": 1.09, + "grad_norm": 0.2941259016159747, + "learning_rate": 9.32107553088152e-05, + "loss": 0.9839, + "step": 11404 + }, + { + "epoch": 1.09, + "grad_norm": 0.26454359674303857, + "learning_rate": 9.319497240432324e-05, + "loss": 0.9985, + "step": 11405 + }, + { + "epoch": 1.09, + "grad_norm": 0.2648134235595188, + "learning_rate": 9.317918967013143e-05, + "loss": 1.0794, + "step": 11406 + }, + { + "epoch": 1.09, + "grad_norm": 0.2697848626847198, + "learning_rate": 9.316340710663467e-05, + "loss": 0.8672, + "step": 11407 + }, + { + "epoch": 1.09, + "grad_norm": 0.27416359005417, + "learning_rate": 9.314762471422796e-05, + "loss": 0.9171, + "step": 11408 + }, + { + "epoch": 1.09, + "grad_norm": 0.30186988519303243, + "learning_rate": 9.313184249330623e-05, + "loss": 0.997, + "step": 11409 + }, + { + "epoch": 1.09, + "grad_norm": 0.3150710658250788, + "learning_rate": 9.311606044426447e-05, + "loss": 1.0059, + "step": 11410 + }, + { + "epoch": 1.09, + "grad_norm": 0.3027934256055935, + "learning_rate": 9.310027856749759e-05, + "loss": 0.9684, + "step": 11411 + }, + { + "epoch": 1.09, + "grad_norm": 0.2782409447766919, + "learning_rate": 9.308449686340058e-05, + "loss": 1.1251, + "step": 11412 + }, + { + "epoch": 1.09, + "grad_norm": 0.30736818542725314, + "learning_rate": 9.306871533236841e-05, + "loss": 1.041, + "step": 11413 + }, + { + "epoch": 1.09, + "grad_norm": 0.3329482926060789, + "learning_rate": 9.3052933974796e-05, + "loss": 1.0978, + "step": 11414 + }, + { + "epoch": 1.09, + "grad_norm": 0.2823244173303807, + "learning_rate": 9.303715279107825e-05, + "loss": 1.0254, + "step": 11415 + }, + { + "epoch": 1.09, + "grad_norm": 0.31167930726228615, + "learning_rate": 9.302137178161014e-05, + "loss": 1.1244, + "step": 11416 + }, + { + "epoch": 1.09, + "grad_norm": 0.33216313102601064, + "learning_rate": 9.300559094678662e-05, + "loss": 1.137, + "step": 11417 + }, + { + "epoch": 1.09, + "grad_norm": 0.31078241453182676, + "learning_rate": 9.298981028700255e-05, + "loss": 1.0385, + "step": 11418 + }, + { + "epoch": 1.09, + "grad_norm": 0.30901821032212323, + "learning_rate": 9.297402980265285e-05, + "loss": 1.0952, + "step": 11419 + }, + { + "epoch": 1.09, + "grad_norm": 0.3116118077969148, + "learning_rate": 9.295824949413252e-05, + "loss": 1.0547, + "step": 11420 + }, + { + "epoch": 1.09, + "grad_norm": 0.3228127141903917, + "learning_rate": 9.294246936183642e-05, + "loss": 0.9571, + "step": 11421 + }, + { + "epoch": 1.09, + "grad_norm": 0.31134118377544123, + "learning_rate": 9.292668940615945e-05, + "loss": 1.0091, + "step": 11422 + }, + { + "epoch": 1.09, + "grad_norm": 0.2894565469130807, + "learning_rate": 9.291090962749654e-05, + "loss": 1.0866, + "step": 11423 + }, + { + "epoch": 1.09, + "grad_norm": 0.3207596878851169, + "learning_rate": 9.289513002624256e-05, + "loss": 1.1001, + "step": 11424 + }, + { + "epoch": 1.09, + "grad_norm": 0.3008933836675878, + "learning_rate": 9.287935060279242e-05, + "loss": 1.0942, + "step": 11425 + }, + { + "epoch": 1.09, + "grad_norm": 0.31898012655997837, + "learning_rate": 9.286357135754096e-05, + "loss": 0.9011, + "step": 11426 + }, + { + "epoch": 1.09, + "grad_norm": 0.31756259293555517, + "learning_rate": 9.284779229088316e-05, + "loss": 0.9861, + "step": 11427 + }, + { + "epoch": 1.09, + "grad_norm": 0.25934743332475524, + "learning_rate": 9.283201340321384e-05, + "loss": 1.115, + "step": 11428 + }, + { + "epoch": 1.09, + "grad_norm": 0.27490936517210524, + "learning_rate": 9.281623469492791e-05, + "loss": 1.0215, + "step": 11429 + }, + { + "epoch": 1.09, + "grad_norm": 0.3536928124407862, + "learning_rate": 9.280045616642021e-05, + "loss": 1.0158, + "step": 11430 + }, + { + "epoch": 1.09, + "grad_norm": 0.2999133766349949, + "learning_rate": 9.278467781808558e-05, + "loss": 0.9771, + "step": 11431 + }, + { + "epoch": 1.09, + "grad_norm": 0.3027277862397413, + "learning_rate": 9.276889965031898e-05, + "loss": 1.0906, + "step": 11432 + }, + { + "epoch": 1.09, + "grad_norm": 0.33572047280697964, + "learning_rate": 9.275312166351516e-05, + "loss": 0.9944, + "step": 11433 + }, + { + "epoch": 1.09, + "grad_norm": 0.2830163963885555, + "learning_rate": 9.273734385806906e-05, + "loss": 0.9997, + "step": 11434 + }, + { + "epoch": 1.09, + "grad_norm": 0.28912019742652556, + "learning_rate": 9.272156623437546e-05, + "loss": 1.0173, + "step": 11435 + }, + { + "epoch": 1.09, + "grad_norm": 0.2932458271277523, + "learning_rate": 9.270578879282929e-05, + "loss": 1.0472, + "step": 11436 + }, + { + "epoch": 1.09, + "grad_norm": 0.29133880168115567, + "learning_rate": 9.269001153382532e-05, + "loss": 1.139, + "step": 11437 + }, + { + "epoch": 1.09, + "grad_norm": 0.2782953550250367, + "learning_rate": 9.26742344577584e-05, + "loss": 0.9452, + "step": 11438 + }, + { + "epoch": 1.09, + "grad_norm": 0.29197560457905414, + "learning_rate": 9.265845756502333e-05, + "loss": 1.0428, + "step": 11439 + }, + { + "epoch": 1.09, + "grad_norm": 0.3137637895455744, + "learning_rate": 9.264268085601501e-05, + "loss": 1.1102, + "step": 11440 + }, + { + "epoch": 1.09, + "grad_norm": 0.313732491407767, + "learning_rate": 9.262690433112824e-05, + "loss": 0.9453, + "step": 11441 + }, + { + "epoch": 1.09, + "grad_norm": 0.2948275168811159, + "learning_rate": 9.261112799075782e-05, + "loss": 1.0232, + "step": 11442 + }, + { + "epoch": 1.09, + "grad_norm": 0.3207179025444539, + "learning_rate": 9.259535183529854e-05, + "loss": 0.9935, + "step": 11443 + }, + { + "epoch": 1.09, + "grad_norm": 0.33296662263486504, + "learning_rate": 9.257957586514528e-05, + "loss": 0.9602, + "step": 11444 + }, + { + "epoch": 1.09, + "grad_norm": 0.3039140964087672, + "learning_rate": 9.256380008069277e-05, + "loss": 0.9889, + "step": 11445 + }, + { + "epoch": 1.1, + "grad_norm": 0.2659490046378622, + "learning_rate": 9.254802448233578e-05, + "loss": 1.1225, + "step": 11446 + }, + { + "epoch": 1.1, + "grad_norm": 0.3485396021150113, + "learning_rate": 9.253224907046922e-05, + "loss": 1.155, + "step": 11447 + }, + { + "epoch": 1.1, + "grad_norm": 0.3325029217831625, + "learning_rate": 9.251647384548782e-05, + "loss": 0.9161, + "step": 11448 + }, + { + "epoch": 1.1, + "grad_norm": 0.32119326087743194, + "learning_rate": 9.250069880778636e-05, + "loss": 1.1258, + "step": 11449 + }, + { + "epoch": 1.1, + "grad_norm": 0.31906200206879254, + "learning_rate": 9.248492395775964e-05, + "loss": 1.1553, + "step": 11450 + }, + { + "epoch": 1.1, + "grad_norm": 0.291756267109159, + "learning_rate": 9.246914929580242e-05, + "loss": 1.0573, + "step": 11451 + }, + { + "epoch": 1.1, + "grad_norm": 0.3350488358646355, + "learning_rate": 9.245337482230944e-05, + "loss": 1.0467, + "step": 11452 + }, + { + "epoch": 1.1, + "grad_norm": 0.2938823951153432, + "learning_rate": 9.243760053767549e-05, + "loss": 1.0042, + "step": 11453 + }, + { + "epoch": 1.1, + "grad_norm": 0.29971917343496535, + "learning_rate": 9.242182644229538e-05, + "loss": 1.087, + "step": 11454 + }, + { + "epoch": 1.1, + "grad_norm": 0.2767862302395918, + "learning_rate": 9.240605253656381e-05, + "loss": 1.0103, + "step": 11455 + }, + { + "epoch": 1.1, + "grad_norm": 0.32509399046587334, + "learning_rate": 9.239027882087553e-05, + "loss": 1.023, + "step": 11456 + }, + { + "epoch": 1.1, + "grad_norm": 0.32283353742020854, + "learning_rate": 9.237450529562534e-05, + "loss": 1.0256, + "step": 11457 + }, + { + "epoch": 1.1, + "grad_norm": 0.2821938198614643, + "learning_rate": 9.235873196120794e-05, + "loss": 1.0549, + "step": 11458 + }, + { + "epoch": 1.1, + "grad_norm": 0.3171160147643407, + "learning_rate": 9.234295881801807e-05, + "loss": 1.022, + "step": 11459 + }, + { + "epoch": 1.1, + "grad_norm": 0.282672579704511, + "learning_rate": 9.232718586645042e-05, + "loss": 1.0495, + "step": 11460 + }, + { + "epoch": 1.1, + "grad_norm": 0.2922971956148487, + "learning_rate": 9.231141310689981e-05, + "loss": 1.1067, + "step": 11461 + }, + { + "epoch": 1.1, + "grad_norm": 0.3081991267459038, + "learning_rate": 9.229564053976093e-05, + "loss": 1.162, + "step": 11462 + }, + { + "epoch": 1.1, + "grad_norm": 0.3141656655877061, + "learning_rate": 9.227986816542848e-05, + "loss": 1.0741, + "step": 11463 + }, + { + "epoch": 1.1, + "grad_norm": 0.3417354103763981, + "learning_rate": 9.226409598429718e-05, + "loss": 0.9149, + "step": 11464 + }, + { + "epoch": 1.1, + "grad_norm": 0.3368641471398126, + "learning_rate": 9.224832399676175e-05, + "loss": 1.1188, + "step": 11465 + }, + { + "epoch": 1.1, + "grad_norm": 0.2778215624136298, + "learning_rate": 9.223255220321682e-05, + "loss": 1.0223, + "step": 11466 + }, + { + "epoch": 1.1, + "grad_norm": 0.3717177865759146, + "learning_rate": 9.221678060405722e-05, + "loss": 1.0674, + "step": 11467 + }, + { + "epoch": 1.1, + "grad_norm": 0.31036843572206185, + "learning_rate": 9.220100919967757e-05, + "loss": 1.0755, + "step": 11468 + }, + { + "epoch": 1.1, + "grad_norm": 0.32832979081626973, + "learning_rate": 9.218523799047256e-05, + "loss": 0.9299, + "step": 11469 + }, + { + "epoch": 1.1, + "grad_norm": 0.2905558787389144, + "learning_rate": 9.216946697683688e-05, + "loss": 0.983, + "step": 11470 + }, + { + "epoch": 1.1, + "grad_norm": 0.30379320935257625, + "learning_rate": 9.215369615916522e-05, + "loss": 0.9752, + "step": 11471 + }, + { + "epoch": 1.1, + "grad_norm": 0.3187181800924588, + "learning_rate": 9.213792553785224e-05, + "loss": 1.0215, + "step": 11472 + }, + { + "epoch": 1.1, + "grad_norm": 0.3436434455672564, + "learning_rate": 9.212215511329257e-05, + "loss": 1.0582, + "step": 11473 + }, + { + "epoch": 1.1, + "grad_norm": 0.3131320969917745, + "learning_rate": 9.210638488588095e-05, + "loss": 1.0672, + "step": 11474 + }, + { + "epoch": 1.1, + "grad_norm": 0.28965727589388895, + "learning_rate": 9.209061485601203e-05, + "loss": 0.9681, + "step": 11475 + }, + { + "epoch": 1.1, + "grad_norm": 0.32207889448114263, + "learning_rate": 9.207484502408041e-05, + "loss": 1.0191, + "step": 11476 + }, + { + "epoch": 1.1, + "grad_norm": 0.34166953642999415, + "learning_rate": 9.205907539048081e-05, + "loss": 0.9953, + "step": 11477 + }, + { + "epoch": 1.1, + "grad_norm": 0.3422674676358954, + "learning_rate": 9.204330595560783e-05, + "loss": 1.1038, + "step": 11478 + }, + { + "epoch": 1.1, + "grad_norm": 0.30249488532693336, + "learning_rate": 9.20275367198561e-05, + "loss": 1.0697, + "step": 11479 + }, + { + "epoch": 1.1, + "grad_norm": 0.30819044677169616, + "learning_rate": 9.201176768362027e-05, + "loss": 0.9549, + "step": 11480 + }, + { + "epoch": 1.1, + "grad_norm": 0.2951289542227046, + "learning_rate": 9.199599884729498e-05, + "loss": 0.936, + "step": 11481 + }, + { + "epoch": 1.1, + "grad_norm": 0.31307124674708064, + "learning_rate": 9.198023021127487e-05, + "loss": 1.0378, + "step": 11482 + }, + { + "epoch": 1.1, + "grad_norm": 0.3004897203419622, + "learning_rate": 9.196446177595454e-05, + "loss": 1.0783, + "step": 11483 + }, + { + "epoch": 1.1, + "grad_norm": 0.29088616283893437, + "learning_rate": 9.19486935417286e-05, + "loss": 1.1352, + "step": 11484 + }, + { + "epoch": 1.1, + "grad_norm": 0.32439454706922444, + "learning_rate": 9.193292550899166e-05, + "loss": 0.9648, + "step": 11485 + }, + { + "epoch": 1.1, + "grad_norm": 0.27462395778614596, + "learning_rate": 9.191715767813834e-05, + "loss": 1.013, + "step": 11486 + }, + { + "epoch": 1.1, + "grad_norm": 0.30839950765568264, + "learning_rate": 9.190139004956318e-05, + "loss": 0.9677, + "step": 11487 + }, + { + "epoch": 1.1, + "grad_norm": 0.2812608894599483, + "learning_rate": 9.188562262366085e-05, + "loss": 1.0264, + "step": 11488 + }, + { + "epoch": 1.1, + "grad_norm": 0.31203703683570183, + "learning_rate": 9.186985540082593e-05, + "loss": 1.045, + "step": 11489 + }, + { + "epoch": 1.1, + "grad_norm": 0.29769459287168876, + "learning_rate": 9.1854088381453e-05, + "loss": 1.0909, + "step": 11490 + }, + { + "epoch": 1.1, + "grad_norm": 0.35594042857398484, + "learning_rate": 9.183832156593661e-05, + "loss": 0.9989, + "step": 11491 + }, + { + "epoch": 1.1, + "grad_norm": 0.2842469313330223, + "learning_rate": 9.182255495467134e-05, + "loss": 1.0304, + "step": 11492 + }, + { + "epoch": 1.1, + "grad_norm": 0.2849143187283397, + "learning_rate": 9.180678854805177e-05, + "loss": 1.058, + "step": 11493 + }, + { + "epoch": 1.1, + "grad_norm": 0.25565205249750705, + "learning_rate": 9.179102234647247e-05, + "loss": 1.0759, + "step": 11494 + }, + { + "epoch": 1.1, + "grad_norm": 0.2528700059460578, + "learning_rate": 9.1775256350328e-05, + "loss": 1.1179, + "step": 11495 + }, + { + "epoch": 1.1, + "grad_norm": 0.3079968684707797, + "learning_rate": 9.175949056001293e-05, + "loss": 1.011, + "step": 11496 + }, + { + "epoch": 1.1, + "grad_norm": 0.31596368903900224, + "learning_rate": 9.174372497592175e-05, + "loss": 1.0106, + "step": 11497 + }, + { + "epoch": 1.1, + "grad_norm": 0.303578013850767, + "learning_rate": 9.172795959844907e-05, + "loss": 1.0226, + "step": 11498 + }, + { + "epoch": 1.1, + "grad_norm": 0.3171349662375542, + "learning_rate": 9.171219442798939e-05, + "loss": 1.0857, + "step": 11499 + }, + { + "epoch": 1.1, + "grad_norm": 0.2793431071552088, + "learning_rate": 9.169642946493724e-05, + "loss": 1.0148, + "step": 11500 + }, + { + "epoch": 1.1, + "grad_norm": 0.30418153772545836, + "learning_rate": 9.168066470968716e-05, + "loss": 0.9923, + "step": 11501 + }, + { + "epoch": 1.1, + "grad_norm": 0.3592224126880653, + "learning_rate": 9.166490016263369e-05, + "loss": 0.905, + "step": 11502 + }, + { + "epoch": 1.1, + "grad_norm": 0.27660228051630525, + "learning_rate": 9.164913582417134e-05, + "loss": 0.9769, + "step": 11503 + }, + { + "epoch": 1.1, + "grad_norm": 0.2940427129940845, + "learning_rate": 9.163337169469462e-05, + "loss": 1.0329, + "step": 11504 + }, + { + "epoch": 1.1, + "grad_norm": 0.3330674663285783, + "learning_rate": 9.161760777459802e-05, + "loss": 1.0291, + "step": 11505 + }, + { + "epoch": 1.1, + "grad_norm": 0.30061804787165797, + "learning_rate": 9.160184406427605e-05, + "loss": 0.973, + "step": 11506 + }, + { + "epoch": 1.1, + "grad_norm": 0.29775155793223596, + "learning_rate": 9.158608056412318e-05, + "loss": 0.9982, + "step": 11507 + }, + { + "epoch": 1.1, + "grad_norm": 0.30213185242713764, + "learning_rate": 9.157031727453396e-05, + "loss": 1.0166, + "step": 11508 + }, + { + "epoch": 1.1, + "grad_norm": 0.3131984103857842, + "learning_rate": 9.155455419590286e-05, + "loss": 1.0081, + "step": 11509 + }, + { + "epoch": 1.1, + "grad_norm": 0.32843669053242885, + "learning_rate": 9.153879132862435e-05, + "loss": 1.0108, + "step": 11510 + }, + { + "epoch": 1.1, + "grad_norm": 0.30944209386078386, + "learning_rate": 9.152302867309291e-05, + "loss": 1.0622, + "step": 11511 + }, + { + "epoch": 1.1, + "grad_norm": 0.27280496329694115, + "learning_rate": 9.150726622970299e-05, + "loss": 1.0448, + "step": 11512 + }, + { + "epoch": 1.1, + "grad_norm": 0.326362142011431, + "learning_rate": 9.149150399884907e-05, + "loss": 0.9973, + "step": 11513 + }, + { + "epoch": 1.1, + "grad_norm": 0.3231129739777904, + "learning_rate": 9.147574198092558e-05, + "loss": 1.0161, + "step": 11514 + }, + { + "epoch": 1.1, + "grad_norm": 0.28392393264345045, + "learning_rate": 9.145998017632703e-05, + "loss": 0.9239, + "step": 11515 + }, + { + "epoch": 1.1, + "grad_norm": 0.3252500430774593, + "learning_rate": 9.144421858544787e-05, + "loss": 0.9223, + "step": 11516 + }, + { + "epoch": 1.1, + "grad_norm": 0.28746372755000366, + "learning_rate": 9.142845720868249e-05, + "loss": 1.0051, + "step": 11517 + }, + { + "epoch": 1.1, + "grad_norm": 0.31685420893630095, + "learning_rate": 9.141269604642534e-05, + "loss": 1.0036, + "step": 11518 + }, + { + "epoch": 1.1, + "grad_norm": 0.31718224154373814, + "learning_rate": 9.139693509907089e-05, + "loss": 0.9745, + "step": 11519 + }, + { + "epoch": 1.1, + "grad_norm": 0.2737437144522178, + "learning_rate": 9.138117436701356e-05, + "loss": 1.0389, + "step": 11520 + }, + { + "epoch": 1.1, + "grad_norm": 0.34668426240599215, + "learning_rate": 9.13654138506477e-05, + "loss": 1.0746, + "step": 11521 + }, + { + "epoch": 1.1, + "grad_norm": 0.3439423925974035, + "learning_rate": 9.134965355036783e-05, + "loss": 0.9459, + "step": 11522 + }, + { + "epoch": 1.1, + "grad_norm": 0.3178396113025507, + "learning_rate": 9.133389346656832e-05, + "loss": 1.0417, + "step": 11523 + }, + { + "epoch": 1.1, + "grad_norm": 0.29774274898286124, + "learning_rate": 9.131813359964358e-05, + "loss": 1.0099, + "step": 11524 + }, + { + "epoch": 1.1, + "grad_norm": 0.29797843524760165, + "learning_rate": 9.130237394998799e-05, + "loss": 1.0688, + "step": 11525 + }, + { + "epoch": 1.1, + "grad_norm": 0.3322198445888856, + "learning_rate": 9.128661451799596e-05, + "loss": 0.9914, + "step": 11526 + }, + { + "epoch": 1.1, + "grad_norm": 0.30523296553003915, + "learning_rate": 9.127085530406183e-05, + "loss": 0.9826, + "step": 11527 + }, + { + "epoch": 1.1, + "grad_norm": 0.3179870323969011, + "learning_rate": 9.12550963085801e-05, + "loss": 1.092, + "step": 11528 + }, + { + "epoch": 1.1, + "grad_norm": 0.26387318639761115, + "learning_rate": 9.123933753194506e-05, + "loss": 0.9746, + "step": 11529 + }, + { + "epoch": 1.1, + "grad_norm": 0.2530434472395067, + "learning_rate": 9.12235789745511e-05, + "loss": 1.0581, + "step": 11530 + }, + { + "epoch": 1.1, + "grad_norm": 0.33004092529450124, + "learning_rate": 9.120782063679259e-05, + "loss": 1.0147, + "step": 11531 + }, + { + "epoch": 1.1, + "grad_norm": 0.2962880288732999, + "learning_rate": 9.119206251906392e-05, + "loss": 0.9801, + "step": 11532 + }, + { + "epoch": 1.1, + "grad_norm": 0.30013874651353006, + "learning_rate": 9.11763046217594e-05, + "loss": 1.1443, + "step": 11533 + }, + { + "epoch": 1.1, + "grad_norm": 0.3097484220746625, + "learning_rate": 9.116054694527336e-05, + "loss": 0.9745, + "step": 11534 + }, + { + "epoch": 1.1, + "grad_norm": 0.3040971573675076, + "learning_rate": 9.114478949000024e-05, + "loss": 0.8803, + "step": 11535 + }, + { + "epoch": 1.1, + "grad_norm": 0.27941541969393435, + "learning_rate": 9.112903225633432e-05, + "loss": 1.083, + "step": 11536 + }, + { + "epoch": 1.1, + "grad_norm": 0.31056592308114583, + "learning_rate": 9.111327524466994e-05, + "loss": 1.1157, + "step": 11537 + }, + { + "epoch": 1.1, + "grad_norm": 0.3333865233310131, + "learning_rate": 9.109751845540143e-05, + "loss": 1.0621, + "step": 11538 + }, + { + "epoch": 1.1, + "grad_norm": 0.3056956465744153, + "learning_rate": 9.108176188892312e-05, + "loss": 1.0277, + "step": 11539 + }, + { + "epoch": 1.1, + "grad_norm": 0.2993256937789053, + "learning_rate": 9.106600554562933e-05, + "loss": 1.015, + "step": 11540 + }, + { + "epoch": 1.1, + "grad_norm": 0.2770770544509905, + "learning_rate": 9.105024942591433e-05, + "loss": 0.983, + "step": 11541 + }, + { + "epoch": 1.1, + "grad_norm": 0.32764010439774555, + "learning_rate": 9.103449353017248e-05, + "loss": 0.9087, + "step": 11542 + }, + { + "epoch": 1.1, + "grad_norm": 0.31576441897742125, + "learning_rate": 9.101873785879808e-05, + "loss": 1.0997, + "step": 11543 + }, + { + "epoch": 1.1, + "grad_norm": 0.2644271724889086, + "learning_rate": 9.100298241218542e-05, + "loss": 1.1208, + "step": 11544 + }, + { + "epoch": 1.1, + "grad_norm": 0.3004191651031753, + "learning_rate": 9.098722719072877e-05, + "loss": 1.0425, + "step": 11545 + }, + { + "epoch": 1.1, + "grad_norm": 0.34357032464646825, + "learning_rate": 9.097147219482242e-05, + "loss": 1.0867, + "step": 11546 + }, + { + "epoch": 1.1, + "grad_norm": 0.3057899623466431, + "learning_rate": 9.095571742486066e-05, + "loss": 1.0534, + "step": 11547 + }, + { + "epoch": 1.1, + "grad_norm": 0.2704566011775238, + "learning_rate": 9.09399628812377e-05, + "loss": 1.0528, + "step": 11548 + }, + { + "epoch": 1.1, + "grad_norm": 0.36137686198768354, + "learning_rate": 9.092420856434792e-05, + "loss": 1.02, + "step": 11549 + }, + { + "epoch": 1.1, + "grad_norm": 0.32097296335662157, + "learning_rate": 9.090845447458552e-05, + "loss": 1.0489, + "step": 11550 + }, + { + "epoch": 1.11, + "grad_norm": 0.3252633951288977, + "learning_rate": 9.089270061234476e-05, + "loss": 1.0137, + "step": 11551 + }, + { + "epoch": 1.11, + "grad_norm": 0.29665019376601004, + "learning_rate": 9.08769469780199e-05, + "loss": 0.9347, + "step": 11552 + }, + { + "epoch": 1.11, + "grad_norm": 0.31529265393290284, + "learning_rate": 9.086119357200516e-05, + "loss": 0.9909, + "step": 11553 + }, + { + "epoch": 1.11, + "grad_norm": 0.29295610496610625, + "learning_rate": 9.084544039469481e-05, + "loss": 1.1287, + "step": 11554 + }, + { + "epoch": 1.11, + "grad_norm": 0.2754158849465959, + "learning_rate": 9.0829687446483e-05, + "loss": 0.9546, + "step": 11555 + }, + { + "epoch": 1.11, + "grad_norm": 0.301917984580733, + "learning_rate": 9.081393472776409e-05, + "loss": 1.053, + "step": 11556 + }, + { + "epoch": 1.11, + "grad_norm": 0.34561631688928013, + "learning_rate": 9.079818223893223e-05, + "loss": 0.9786, + "step": 11557 + }, + { + "epoch": 1.11, + "grad_norm": 0.31405969013331286, + "learning_rate": 9.078242998038165e-05, + "loss": 1.0003, + "step": 11558 + }, + { + "epoch": 1.11, + "grad_norm": 0.2944408515132265, + "learning_rate": 9.076667795250652e-05, + "loss": 0.9979, + "step": 11559 + }, + { + "epoch": 1.11, + "grad_norm": 0.3172864439013365, + "learning_rate": 9.07509261557011e-05, + "loss": 1.0077, + "step": 11560 + }, + { + "epoch": 1.11, + "grad_norm": 0.2693123307660103, + "learning_rate": 9.073517459035954e-05, + "loss": 1.1205, + "step": 11561 + }, + { + "epoch": 1.11, + "grad_norm": 0.3082942158517124, + "learning_rate": 9.071942325687609e-05, + "loss": 1.0586, + "step": 11562 + }, + { + "epoch": 1.11, + "grad_norm": 0.2756053972315942, + "learning_rate": 9.070367215564488e-05, + "loss": 1.0403, + "step": 11563 + }, + { + "epoch": 1.11, + "grad_norm": 0.2924828383192969, + "learning_rate": 9.068792128706014e-05, + "loss": 1.0012, + "step": 11564 + }, + { + "epoch": 1.11, + "grad_norm": 0.2953049784008131, + "learning_rate": 9.067217065151603e-05, + "loss": 0.9264, + "step": 11565 + }, + { + "epoch": 1.11, + "grad_norm": 0.30748368325245784, + "learning_rate": 9.06564202494067e-05, + "loss": 0.9648, + "step": 11566 + }, + { + "epoch": 1.11, + "grad_norm": 0.2717016973210812, + "learning_rate": 9.064067008112633e-05, + "loss": 0.9824, + "step": 11567 + }, + { + "epoch": 1.11, + "grad_norm": 0.26604258877201326, + "learning_rate": 9.062492014706903e-05, + "loss": 1.0435, + "step": 11568 + }, + { + "epoch": 1.11, + "grad_norm": 0.3034045874964776, + "learning_rate": 9.060917044762903e-05, + "loss": 0.9813, + "step": 11569 + }, + { + "epoch": 1.11, + "grad_norm": 0.30381804279845065, + "learning_rate": 9.059342098320045e-05, + "loss": 1.0738, + "step": 11570 + }, + { + "epoch": 1.11, + "grad_norm": 0.30964165790980497, + "learning_rate": 9.057767175417743e-05, + "loss": 1.0835, + "step": 11571 + }, + { + "epoch": 1.11, + "grad_norm": 0.3117184200847612, + "learning_rate": 9.056192276095408e-05, + "loss": 1.0265, + "step": 11572 + }, + { + "epoch": 1.11, + "grad_norm": 0.3049581054218351, + "learning_rate": 9.054617400392456e-05, + "loss": 0.9814, + "step": 11573 + }, + { + "epoch": 1.11, + "grad_norm": 0.3147599452535716, + "learning_rate": 9.053042548348296e-05, + "loss": 1.0385, + "step": 11574 + }, + { + "epoch": 1.11, + "grad_norm": 0.3113444815054467, + "learning_rate": 9.051467720002337e-05, + "loss": 0.9167, + "step": 11575 + }, + { + "epoch": 1.11, + "grad_norm": 0.33450108541284995, + "learning_rate": 9.049892915394e-05, + "loss": 0.9391, + "step": 11576 + }, + { + "epoch": 1.11, + "grad_norm": 0.3081020765961142, + "learning_rate": 9.04831813456269e-05, + "loss": 1.1142, + "step": 11577 + }, + { + "epoch": 1.11, + "grad_norm": 0.28855697769969846, + "learning_rate": 9.046743377547817e-05, + "loss": 1.0489, + "step": 11578 + }, + { + "epoch": 1.11, + "grad_norm": 0.28949168463664404, + "learning_rate": 9.045168644388786e-05, + "loss": 1.0908, + "step": 11579 + }, + { + "epoch": 1.11, + "grad_norm": 0.2765563482457284, + "learning_rate": 9.043593935125014e-05, + "loss": 0.9773, + "step": 11580 + }, + { + "epoch": 1.11, + "grad_norm": 0.3121841176373514, + "learning_rate": 9.042019249795904e-05, + "loss": 0.9939, + "step": 11581 + }, + { + "epoch": 1.11, + "grad_norm": 0.3057006410699669, + "learning_rate": 9.040444588440862e-05, + "loss": 1.0594, + "step": 11582 + }, + { + "epoch": 1.11, + "grad_norm": 0.2806504541989742, + "learning_rate": 9.038869951099298e-05, + "loss": 1.0067, + "step": 11583 + }, + { + "epoch": 1.11, + "grad_norm": 0.3532908454364322, + "learning_rate": 9.037295337810618e-05, + "loss": 1.0812, + "step": 11584 + }, + { + "epoch": 1.11, + "grad_norm": 0.3077562774091088, + "learning_rate": 9.03572074861423e-05, + "loss": 1.0685, + "step": 11585 + }, + { + "epoch": 1.11, + "grad_norm": 0.3225284208852558, + "learning_rate": 9.034146183549535e-05, + "loss": 1.0896, + "step": 11586 + }, + { + "epoch": 1.11, + "grad_norm": 0.3133522161879127, + "learning_rate": 9.032571642655937e-05, + "loss": 1.0914, + "step": 11587 + }, + { + "epoch": 1.11, + "grad_norm": 0.33910746131907543, + "learning_rate": 9.030997125972838e-05, + "loss": 1.1032, + "step": 11588 + }, + { + "epoch": 1.11, + "grad_norm": 0.34685592483424277, + "learning_rate": 9.029422633539651e-05, + "loss": 0.9716, + "step": 11589 + }, + { + "epoch": 1.11, + "grad_norm": 0.31762626139556777, + "learning_rate": 9.027848165395772e-05, + "loss": 0.9745, + "step": 11590 + }, + { + "epoch": 1.11, + "grad_norm": 0.34194620649038404, + "learning_rate": 9.026273721580603e-05, + "loss": 1.0366, + "step": 11591 + }, + { + "epoch": 1.11, + "grad_norm": 0.30462936177146177, + "learning_rate": 9.024699302133545e-05, + "loss": 1.0687, + "step": 11592 + }, + { + "epoch": 1.11, + "grad_norm": 0.30561849997499935, + "learning_rate": 9.023124907094003e-05, + "loss": 0.9369, + "step": 11593 + }, + { + "epoch": 1.11, + "grad_norm": 0.30916424622368943, + "learning_rate": 9.021550536501371e-05, + "loss": 1.1038, + "step": 11594 + }, + { + "epoch": 1.11, + "grad_norm": 0.2792151502470113, + "learning_rate": 9.01997619039505e-05, + "loss": 0.9955, + "step": 11595 + }, + { + "epoch": 1.11, + "grad_norm": 0.27822593400704726, + "learning_rate": 9.018401868814443e-05, + "loss": 0.9971, + "step": 11596 + }, + { + "epoch": 1.11, + "grad_norm": 0.29187554969244056, + "learning_rate": 9.016827571798946e-05, + "loss": 1.0278, + "step": 11597 + }, + { + "epoch": 1.11, + "grad_norm": 0.3153895430638414, + "learning_rate": 9.01525329938796e-05, + "loss": 1.0814, + "step": 11598 + }, + { + "epoch": 1.11, + "grad_norm": 0.3258400526134067, + "learning_rate": 9.013679051620876e-05, + "loss": 1.1013, + "step": 11599 + }, + { + "epoch": 1.11, + "grad_norm": 0.2973570417974947, + "learning_rate": 9.012104828537093e-05, + "loss": 0.9316, + "step": 11600 + }, + { + "epoch": 1.11, + "grad_norm": 0.27594197275786353, + "learning_rate": 9.010530630176008e-05, + "loss": 1.0267, + "step": 11601 + }, + { + "epoch": 1.11, + "grad_norm": 0.31056027598835695, + "learning_rate": 9.008956456577016e-05, + "loss": 1.0303, + "step": 11602 + }, + { + "epoch": 1.11, + "grad_norm": 0.2868664019921097, + "learning_rate": 9.007382307779512e-05, + "loss": 1.071, + "step": 11603 + }, + { + "epoch": 1.11, + "grad_norm": 0.2980290771080433, + "learning_rate": 9.005808183822889e-05, + "loss": 1.0191, + "step": 11604 + }, + { + "epoch": 1.11, + "grad_norm": 0.29689397100512677, + "learning_rate": 9.004234084746541e-05, + "loss": 0.9075, + "step": 11605 + }, + { + "epoch": 1.11, + "grad_norm": 0.2856243096814176, + "learning_rate": 9.002660010589862e-05, + "loss": 1.048, + "step": 11606 + }, + { + "epoch": 1.11, + "grad_norm": 0.32142754342231983, + "learning_rate": 9.001085961392243e-05, + "loss": 0.9657, + "step": 11607 + }, + { + "epoch": 1.11, + "grad_norm": 0.3027339608527746, + "learning_rate": 8.999511937193076e-05, + "loss": 0.9955, + "step": 11608 + }, + { + "epoch": 1.11, + "grad_norm": 0.2663985878165352, + "learning_rate": 8.997937938031746e-05, + "loss": 1.1229, + "step": 11609 + }, + { + "epoch": 1.11, + "grad_norm": 0.2958121413384028, + "learning_rate": 8.996363963947655e-05, + "loss": 1.1138, + "step": 11610 + }, + { + "epoch": 1.11, + "grad_norm": 0.3212330405433351, + "learning_rate": 8.994790014980185e-05, + "loss": 1.0326, + "step": 11611 + }, + { + "epoch": 1.11, + "grad_norm": 0.287489967261577, + "learning_rate": 8.993216091168727e-05, + "loss": 1.0219, + "step": 11612 + }, + { + "epoch": 1.11, + "grad_norm": 0.32670203595094327, + "learning_rate": 8.99164219255267e-05, + "loss": 1.0819, + "step": 11613 + }, + { + "epoch": 1.11, + "grad_norm": 0.31518022496511894, + "learning_rate": 8.990068319171399e-05, + "loss": 1.1037, + "step": 11614 + }, + { + "epoch": 1.11, + "grad_norm": 0.27312415187773204, + "learning_rate": 8.988494471064304e-05, + "loss": 1.072, + "step": 11615 + }, + { + "epoch": 1.11, + "grad_norm": 0.27318689526500484, + "learning_rate": 8.986920648270767e-05, + "loss": 0.9259, + "step": 11616 + }, + { + "epoch": 1.11, + "grad_norm": 0.29185588547167524, + "learning_rate": 8.985346850830182e-05, + "loss": 1.0583, + "step": 11617 + }, + { + "epoch": 1.11, + "grad_norm": 0.3271937382560927, + "learning_rate": 8.983773078781928e-05, + "loss": 1.0061, + "step": 11618 + }, + { + "epoch": 1.11, + "grad_norm": 0.3409497343389582, + "learning_rate": 8.982199332165393e-05, + "loss": 0.9311, + "step": 11619 + }, + { + "epoch": 1.11, + "grad_norm": 0.25624038991872944, + "learning_rate": 8.980625611019956e-05, + "loss": 1.0758, + "step": 11620 + }, + { + "epoch": 1.11, + "grad_norm": 0.33343570323906646, + "learning_rate": 8.979051915385007e-05, + "loss": 0.9912, + "step": 11621 + }, + { + "epoch": 1.11, + "grad_norm": 0.30172605504130623, + "learning_rate": 8.977478245299923e-05, + "loss": 0.9904, + "step": 11622 + }, + { + "epoch": 1.11, + "grad_norm": 0.3266283382736568, + "learning_rate": 8.97590460080409e-05, + "loss": 1.0768, + "step": 11623 + }, + { + "epoch": 1.11, + "grad_norm": 0.312243061243717, + "learning_rate": 8.974330981936886e-05, + "loss": 1.0217, + "step": 11624 + }, + { + "epoch": 1.11, + "grad_norm": 0.31494972213253647, + "learning_rate": 8.972757388737695e-05, + "loss": 0.8384, + "step": 11625 + }, + { + "epoch": 1.11, + "grad_norm": 0.29769755829921885, + "learning_rate": 8.971183821245899e-05, + "loss": 0.9893, + "step": 11626 + }, + { + "epoch": 1.11, + "grad_norm": 0.2984152827081916, + "learning_rate": 8.969610279500871e-05, + "loss": 0.9959, + "step": 11627 + }, + { + "epoch": 1.11, + "grad_norm": 0.30945663871442164, + "learning_rate": 8.968036763541996e-05, + "loss": 1.061, + "step": 11628 + }, + { + "epoch": 1.11, + "grad_norm": 0.3176657617133282, + "learning_rate": 8.966463273408644e-05, + "loss": 0.8833, + "step": 11629 + }, + { + "epoch": 1.11, + "grad_norm": 0.2830912269606958, + "learning_rate": 8.964889809140203e-05, + "loss": 0.8836, + "step": 11630 + }, + { + "epoch": 1.11, + "grad_norm": 0.30273768656164357, + "learning_rate": 8.963316370776044e-05, + "loss": 1.0514, + "step": 11631 + }, + { + "epoch": 1.11, + "grad_norm": 0.27881601584808563, + "learning_rate": 8.961742958355546e-05, + "loss": 0.825, + "step": 11632 + }, + { + "epoch": 1.11, + "grad_norm": 0.27971225704036073, + "learning_rate": 8.960169571918082e-05, + "loss": 0.9603, + "step": 11633 + }, + { + "epoch": 1.11, + "grad_norm": 0.28631688754549506, + "learning_rate": 8.958596211503028e-05, + "loss": 1.0752, + "step": 11634 + }, + { + "epoch": 1.11, + "grad_norm": 0.31223405960222345, + "learning_rate": 8.95702287714976e-05, + "loss": 0.9417, + "step": 11635 + }, + { + "epoch": 1.11, + "grad_norm": 0.2988992583628658, + "learning_rate": 8.955449568897643e-05, + "loss": 0.8458, + "step": 11636 + }, + { + "epoch": 1.11, + "grad_norm": 0.2955696413290493, + "learning_rate": 8.953876286786062e-05, + "loss": 0.9783, + "step": 11637 + }, + { + "epoch": 1.11, + "grad_norm": 0.31785992145596753, + "learning_rate": 8.952303030854385e-05, + "loss": 1.0413, + "step": 11638 + }, + { + "epoch": 1.11, + "grad_norm": 0.28287992282544655, + "learning_rate": 8.950729801141982e-05, + "loss": 1.0306, + "step": 11639 + }, + { + "epoch": 1.11, + "grad_norm": 0.26431126911931047, + "learning_rate": 8.949156597688226e-05, + "loss": 0.995, + "step": 11640 + }, + { + "epoch": 1.11, + "grad_norm": 0.30956284939684153, + "learning_rate": 8.947583420532485e-05, + "loss": 1.008, + "step": 11641 + }, + { + "epoch": 1.11, + "grad_norm": 0.31689002294978424, + "learning_rate": 8.946010269714132e-05, + "loss": 1.1427, + "step": 11642 + }, + { + "epoch": 1.11, + "grad_norm": 0.3066634902666201, + "learning_rate": 8.944437145272531e-05, + "loss": 1.0436, + "step": 11643 + }, + { + "epoch": 1.11, + "grad_norm": 0.2776894458642188, + "learning_rate": 8.942864047247058e-05, + "loss": 0.8923, + "step": 11644 + }, + { + "epoch": 1.11, + "grad_norm": 0.31342171746340125, + "learning_rate": 8.941290975677073e-05, + "loss": 1.0752, + "step": 11645 + }, + { + "epoch": 1.11, + "grad_norm": 0.33620986986226853, + "learning_rate": 8.939717930601949e-05, + "loss": 0.8881, + "step": 11646 + }, + { + "epoch": 1.11, + "grad_norm": 0.2789130553424382, + "learning_rate": 8.938144912061051e-05, + "loss": 1.0919, + "step": 11647 + }, + { + "epoch": 1.11, + "grad_norm": 0.291214541246641, + "learning_rate": 8.936571920093743e-05, + "loss": 1.0412, + "step": 11648 + }, + { + "epoch": 1.11, + "grad_norm": 0.33084823857976936, + "learning_rate": 8.934998954739388e-05, + "loss": 0.9611, + "step": 11649 + }, + { + "epoch": 1.11, + "grad_norm": 0.3423609578815596, + "learning_rate": 8.933426016037357e-05, + "loss": 0.9669, + "step": 11650 + }, + { + "epoch": 1.11, + "grad_norm": 0.33401659736196293, + "learning_rate": 8.931853104027011e-05, + "loss": 0.9459, + "step": 11651 + }, + { + "epoch": 1.11, + "grad_norm": 0.3139284342432189, + "learning_rate": 8.930280218747711e-05, + "loss": 1.0599, + "step": 11652 + }, + { + "epoch": 1.11, + "grad_norm": 0.3126628473971068, + "learning_rate": 8.928707360238821e-05, + "loss": 0.9502, + "step": 11653 + }, + { + "epoch": 1.11, + "grad_norm": 0.28569830775176513, + "learning_rate": 8.927134528539703e-05, + "loss": 0.9486, + "step": 11654 + }, + { + "epoch": 1.12, + "grad_norm": 0.31318275379583177, + "learning_rate": 8.925561723689718e-05, + "loss": 0.986, + "step": 11655 + }, + { + "epoch": 1.12, + "grad_norm": 0.29231951702892467, + "learning_rate": 8.923988945728222e-05, + "loss": 0.8899, + "step": 11656 + }, + { + "epoch": 1.12, + "grad_norm": 0.294417486457446, + "learning_rate": 8.922416194694583e-05, + "loss": 1.0267, + "step": 11657 + }, + { + "epoch": 1.12, + "grad_norm": 0.32202809023360823, + "learning_rate": 8.920843470628155e-05, + "loss": 1.2393, + "step": 11658 + }, + { + "epoch": 1.12, + "grad_norm": 0.32472583357433943, + "learning_rate": 8.919270773568299e-05, + "loss": 1.1154, + "step": 11659 + }, + { + "epoch": 1.12, + "grad_norm": 0.28661638745764834, + "learning_rate": 8.917698103554371e-05, + "loss": 1.0422, + "step": 11660 + }, + { + "epoch": 1.12, + "grad_norm": 0.3352193426924155, + "learning_rate": 8.916125460625726e-05, + "loss": 0.9476, + "step": 11661 + }, + { + "epoch": 1.12, + "grad_norm": 0.32385674642232737, + "learning_rate": 8.914552844821722e-05, + "loss": 1.0053, + "step": 11662 + }, + { + "epoch": 1.12, + "grad_norm": 0.3076695022677086, + "learning_rate": 8.912980256181715e-05, + "loss": 1.0336, + "step": 11663 + }, + { + "epoch": 1.12, + "grad_norm": 0.31611489578336077, + "learning_rate": 8.911407694745064e-05, + "loss": 1.0939, + "step": 11664 + }, + { + "epoch": 1.12, + "grad_norm": 0.28613230282905333, + "learning_rate": 8.909835160551114e-05, + "loss": 1.0907, + "step": 11665 + }, + { + "epoch": 1.12, + "grad_norm": 0.25982599564873426, + "learning_rate": 8.908262653639229e-05, + "loss": 1.0832, + "step": 11666 + }, + { + "epoch": 1.12, + "grad_norm": 0.31099031305930674, + "learning_rate": 8.906690174048757e-05, + "loss": 1.1827, + "step": 11667 + }, + { + "epoch": 1.12, + "grad_norm": 0.3090862531619347, + "learning_rate": 8.905117721819049e-05, + "loss": 1.1341, + "step": 11668 + }, + { + "epoch": 1.12, + "grad_norm": 0.2943083956063802, + "learning_rate": 8.903545296989459e-05, + "loss": 1.0842, + "step": 11669 + }, + { + "epoch": 1.12, + "grad_norm": 0.30092176656396746, + "learning_rate": 8.901972899599334e-05, + "loss": 0.9583, + "step": 11670 + }, + { + "epoch": 1.12, + "grad_norm": 0.2810965579033296, + "learning_rate": 8.90040052968803e-05, + "loss": 0.9157, + "step": 11671 + }, + { + "epoch": 1.12, + "grad_norm": 0.29140343554175696, + "learning_rate": 8.898828187294894e-05, + "loss": 1.1027, + "step": 11672 + }, + { + "epoch": 1.12, + "grad_norm": 0.26491943255092726, + "learning_rate": 8.897255872459274e-05, + "loss": 1.1205, + "step": 11673 + }, + { + "epoch": 1.12, + "grad_norm": 0.26025018700276176, + "learning_rate": 8.89568358522052e-05, + "loss": 1.0237, + "step": 11674 + }, + { + "epoch": 1.12, + "grad_norm": 0.28868494429671426, + "learning_rate": 8.894111325617979e-05, + "loss": 1.0619, + "step": 11675 + }, + { + "epoch": 1.12, + "grad_norm": 0.2783153266707279, + "learning_rate": 8.892539093690995e-05, + "loss": 1.0694, + "step": 11676 + }, + { + "epoch": 1.12, + "grad_norm": 0.32290295292138715, + "learning_rate": 8.890966889478914e-05, + "loss": 1.0954, + "step": 11677 + }, + { + "epoch": 1.12, + "grad_norm": 0.2889147041270084, + "learning_rate": 8.889394713021087e-05, + "loss": 1.0054, + "step": 11678 + }, + { + "epoch": 1.12, + "grad_norm": 0.28627148492044224, + "learning_rate": 8.887822564356855e-05, + "loss": 1.0448, + "step": 11679 + }, + { + "epoch": 1.12, + "grad_norm": 0.311836885192235, + "learning_rate": 8.886250443525563e-05, + "loss": 1.0467, + "step": 11680 + }, + { + "epoch": 1.12, + "grad_norm": 0.3397149342425237, + "learning_rate": 8.884678350566554e-05, + "loss": 1.0439, + "step": 11681 + }, + { + "epoch": 1.12, + "grad_norm": 0.2979677857009591, + "learning_rate": 8.883106285519166e-05, + "loss": 1.0175, + "step": 11682 + }, + { + "epoch": 1.12, + "grad_norm": 0.32030205063759615, + "learning_rate": 8.881534248422747e-05, + "loss": 1.1184, + "step": 11683 + }, + { + "epoch": 1.12, + "grad_norm": 0.3091709937059686, + "learning_rate": 8.879962239316637e-05, + "loss": 1.0006, + "step": 11684 + }, + { + "epoch": 1.12, + "grad_norm": 0.356937090522235, + "learning_rate": 8.878390258240177e-05, + "loss": 1.0289, + "step": 11685 + }, + { + "epoch": 1.12, + "grad_norm": 0.3244434284068347, + "learning_rate": 8.876818305232704e-05, + "loss": 1.027, + "step": 11686 + }, + { + "epoch": 1.12, + "grad_norm": 0.2850736865534277, + "learning_rate": 8.87524638033356e-05, + "loss": 0.9322, + "step": 11687 + }, + { + "epoch": 1.12, + "grad_norm": 0.3160310930899856, + "learning_rate": 8.873674483582083e-05, + "loss": 1.0196, + "step": 11688 + }, + { + "epoch": 1.12, + "grad_norm": 0.36299397858523413, + "learning_rate": 8.872102615017609e-05, + "loss": 1.0825, + "step": 11689 + }, + { + "epoch": 1.12, + "grad_norm": 0.3264850910977147, + "learning_rate": 8.870530774679472e-05, + "loss": 0.9365, + "step": 11690 + }, + { + "epoch": 1.12, + "grad_norm": 0.2925294959684692, + "learning_rate": 8.868958962607017e-05, + "loss": 0.9864, + "step": 11691 + }, + { + "epoch": 1.12, + "grad_norm": 0.31668369003897534, + "learning_rate": 8.867387178839573e-05, + "loss": 1.0051, + "step": 11692 + }, + { + "epoch": 1.12, + "grad_norm": 0.35160727943648445, + "learning_rate": 8.865815423416479e-05, + "loss": 1.0386, + "step": 11693 + }, + { + "epoch": 1.12, + "grad_norm": 0.32992830615280117, + "learning_rate": 8.864243696377066e-05, + "loss": 1.052, + "step": 11694 + }, + { + "epoch": 1.12, + "grad_norm": 0.2637815025996669, + "learning_rate": 8.862671997760667e-05, + "loss": 0.9698, + "step": 11695 + }, + { + "epoch": 1.12, + "grad_norm": 0.3188305733519182, + "learning_rate": 8.861100327606618e-05, + "loss": 1.0698, + "step": 11696 + }, + { + "epoch": 1.12, + "grad_norm": 0.3475546970574943, + "learning_rate": 8.859528685954244e-05, + "loss": 1.1835, + "step": 11697 + }, + { + "epoch": 1.12, + "grad_norm": 0.27144489013773826, + "learning_rate": 8.857957072842887e-05, + "loss": 1.1211, + "step": 11698 + }, + { + "epoch": 1.12, + "grad_norm": 0.3669736493218948, + "learning_rate": 8.85638548831187e-05, + "loss": 1.0322, + "step": 11699 + }, + { + "epoch": 1.12, + "grad_norm": 0.28156764259026584, + "learning_rate": 8.854813932400525e-05, + "loss": 0.9653, + "step": 11700 + }, + { + "epoch": 1.12, + "grad_norm": 0.31122772183217706, + "learning_rate": 8.853242405148181e-05, + "loss": 0.9717, + "step": 11701 + }, + { + "epoch": 1.12, + "grad_norm": 0.2738946682901121, + "learning_rate": 8.851670906594167e-05, + "loss": 0.9793, + "step": 11702 + }, + { + "epoch": 1.12, + "grad_norm": 0.30393118464410657, + "learning_rate": 8.85009943677781e-05, + "loss": 1.0298, + "step": 11703 + }, + { + "epoch": 1.12, + "grad_norm": 0.308274581737524, + "learning_rate": 8.848527995738435e-05, + "loss": 1.0017, + "step": 11704 + }, + { + "epoch": 1.12, + "grad_norm": 0.31927879846628043, + "learning_rate": 8.846956583515373e-05, + "loss": 1.1971, + "step": 11705 + }, + { + "epoch": 1.12, + "grad_norm": 0.32238558866430234, + "learning_rate": 8.845385200147947e-05, + "loss": 1.0519, + "step": 11706 + }, + { + "epoch": 1.12, + "grad_norm": 0.3492948677598376, + "learning_rate": 8.84381384567548e-05, + "loss": 1.0174, + "step": 11707 + }, + { + "epoch": 1.12, + "grad_norm": 0.31114609342185734, + "learning_rate": 8.8422425201373e-05, + "loss": 0.879, + "step": 11708 + }, + { + "epoch": 1.12, + "grad_norm": 0.31973703529169883, + "learning_rate": 8.84067122357273e-05, + "loss": 0.9319, + "step": 11709 + }, + { + "epoch": 1.12, + "grad_norm": 0.31198006319721916, + "learning_rate": 8.839099956021086e-05, + "loss": 0.9905, + "step": 11710 + }, + { + "epoch": 1.12, + "grad_norm": 0.3370496137516742, + "learning_rate": 8.837528717521699e-05, + "loss": 1.0856, + "step": 11711 + }, + { + "epoch": 1.12, + "grad_norm": 0.3261229724492701, + "learning_rate": 8.835957508113888e-05, + "loss": 1.0555, + "step": 11712 + }, + { + "epoch": 1.12, + "grad_norm": 0.3755784251741156, + "learning_rate": 8.83438632783697e-05, + "loss": 0.9602, + "step": 11713 + }, + { + "epoch": 1.12, + "grad_norm": 0.31734761914354137, + "learning_rate": 8.832815176730268e-05, + "loss": 1.0604, + "step": 11714 + }, + { + "epoch": 1.12, + "grad_norm": 0.3018408352358852, + "learning_rate": 8.8312440548331e-05, + "loss": 1.0933, + "step": 11715 + }, + { + "epoch": 1.12, + "grad_norm": 0.35730740339380435, + "learning_rate": 8.829672962184783e-05, + "loss": 1.0828, + "step": 11716 + }, + { + "epoch": 1.12, + "grad_norm": 0.3004230587582097, + "learning_rate": 8.828101898824632e-05, + "loss": 1.0454, + "step": 11717 + }, + { + "epoch": 1.12, + "grad_norm": 0.30832669068564544, + "learning_rate": 8.82653086479197e-05, + "loss": 1.1244, + "step": 11718 + }, + { + "epoch": 1.12, + "grad_norm": 0.335162908097521, + "learning_rate": 8.824959860126114e-05, + "loss": 1.1418, + "step": 11719 + }, + { + "epoch": 1.12, + "grad_norm": 0.2839347189102202, + "learning_rate": 8.823388884866373e-05, + "loss": 0.996, + "step": 11720 + }, + { + "epoch": 1.12, + "grad_norm": 0.3182672218338143, + "learning_rate": 8.821817939052067e-05, + "loss": 0.9392, + "step": 11721 + }, + { + "epoch": 1.12, + "grad_norm": 0.3309090406568504, + "learning_rate": 8.820247022722507e-05, + "loss": 1.058, + "step": 11722 + }, + { + "epoch": 1.12, + "grad_norm": 0.3282209617121481, + "learning_rate": 8.818676135917003e-05, + "loss": 1.0165, + "step": 11723 + }, + { + "epoch": 1.12, + "grad_norm": 0.3062200558202659, + "learning_rate": 8.817105278674872e-05, + "loss": 1.0344, + "step": 11724 + }, + { + "epoch": 1.12, + "grad_norm": 0.3277105500126028, + "learning_rate": 8.815534451035427e-05, + "loss": 1.0719, + "step": 11725 + }, + { + "epoch": 1.12, + "grad_norm": 0.34554397223711425, + "learning_rate": 8.813963653037976e-05, + "loss": 0.9784, + "step": 11726 + }, + { + "epoch": 1.12, + "grad_norm": 0.31682541496563876, + "learning_rate": 8.81239288472183e-05, + "loss": 1.0072, + "step": 11727 + }, + { + "epoch": 1.12, + "grad_norm": 0.290221098231797, + "learning_rate": 8.810822146126299e-05, + "loss": 1.133, + "step": 11728 + }, + { + "epoch": 1.12, + "grad_norm": 0.3204798957778579, + "learning_rate": 8.809251437290691e-05, + "loss": 1.1214, + "step": 11729 + }, + { + "epoch": 1.12, + "grad_norm": 0.27334652835710305, + "learning_rate": 8.807680758254314e-05, + "loss": 0.9232, + "step": 11730 + }, + { + "epoch": 1.12, + "grad_norm": 0.313508978111253, + "learning_rate": 8.806110109056472e-05, + "loss": 1.0357, + "step": 11731 + }, + { + "epoch": 1.12, + "grad_norm": 0.31789118802327204, + "learning_rate": 8.804539489736478e-05, + "loss": 1.0602, + "step": 11732 + }, + { + "epoch": 1.12, + "grad_norm": 0.29638575374665055, + "learning_rate": 8.802968900333635e-05, + "loss": 0.9543, + "step": 11733 + }, + { + "epoch": 1.12, + "grad_norm": 0.2947843517621017, + "learning_rate": 8.80139834088725e-05, + "loss": 1.0415, + "step": 11734 + }, + { + "epoch": 1.12, + "grad_norm": 0.2934964077403165, + "learning_rate": 8.799827811436622e-05, + "loss": 1.0719, + "step": 11735 + }, + { + "epoch": 1.12, + "grad_norm": 0.31213977643081936, + "learning_rate": 8.798257312021058e-05, + "loss": 1.0165, + "step": 11736 + }, + { + "epoch": 1.12, + "grad_norm": 0.3149087221225925, + "learning_rate": 8.79668684267986e-05, + "loss": 1.0387, + "step": 11737 + }, + { + "epoch": 1.12, + "grad_norm": 0.29696616688376803, + "learning_rate": 8.795116403452325e-05, + "loss": 1.0229, + "step": 11738 + }, + { + "epoch": 1.12, + "grad_norm": 0.2661667160910295, + "learning_rate": 8.793545994377765e-05, + "loss": 0.9392, + "step": 11739 + }, + { + "epoch": 1.12, + "grad_norm": 0.3046180369087137, + "learning_rate": 8.791975615495474e-05, + "loss": 1.1403, + "step": 11740 + }, + { + "epoch": 1.12, + "grad_norm": 0.3820366272116155, + "learning_rate": 8.790405266844753e-05, + "loss": 1.1185, + "step": 11741 + }, + { + "epoch": 1.12, + "grad_norm": 0.24104062440177565, + "learning_rate": 8.7888349484649e-05, + "loss": 1.0083, + "step": 11742 + }, + { + "epoch": 1.12, + "grad_norm": 0.32096200544736164, + "learning_rate": 8.787264660395212e-05, + "loss": 0.9791, + "step": 11743 + }, + { + "epoch": 1.12, + "grad_norm": 0.30532835353781995, + "learning_rate": 8.785694402674986e-05, + "loss": 0.9012, + "step": 11744 + }, + { + "epoch": 1.12, + "grad_norm": 0.333191518387627, + "learning_rate": 8.784124175343524e-05, + "loss": 1.0628, + "step": 11745 + }, + { + "epoch": 1.12, + "grad_norm": 0.31306334267072666, + "learning_rate": 8.782553978440118e-05, + "loss": 1.0878, + "step": 11746 + }, + { + "epoch": 1.12, + "grad_norm": 0.28742636933876836, + "learning_rate": 8.780983812004065e-05, + "loss": 1.0097, + "step": 11747 + }, + { + "epoch": 1.12, + "grad_norm": 0.2794674440758732, + "learning_rate": 8.779413676074656e-05, + "loss": 1.1142, + "step": 11748 + }, + { + "epoch": 1.12, + "grad_norm": 0.28026170152681756, + "learning_rate": 8.777843570691189e-05, + "loss": 1.0398, + "step": 11749 + }, + { + "epoch": 1.12, + "grad_norm": 0.29873903541825475, + "learning_rate": 8.776273495892955e-05, + "loss": 1.0607, + "step": 11750 + }, + { + "epoch": 1.12, + "grad_norm": 0.3107128598808398, + "learning_rate": 8.774703451719243e-05, + "loss": 1.0445, + "step": 11751 + }, + { + "epoch": 1.12, + "grad_norm": 0.3248443877746381, + "learning_rate": 8.773133438209348e-05, + "loss": 0.9466, + "step": 11752 + }, + { + "epoch": 1.12, + "grad_norm": 0.31555771379167835, + "learning_rate": 8.77156345540256e-05, + "loss": 0.9583, + "step": 11753 + }, + { + "epoch": 1.12, + "grad_norm": 0.3046447609897317, + "learning_rate": 8.769993503338171e-05, + "loss": 1.0561, + "step": 11754 + }, + { + "epoch": 1.12, + "grad_norm": 0.31151454624517777, + "learning_rate": 8.768423582055466e-05, + "loss": 0.9807, + "step": 11755 + }, + { + "epoch": 1.12, + "grad_norm": 0.31859028634900255, + "learning_rate": 8.766853691593736e-05, + "loss": 0.928, + "step": 11756 + }, + { + "epoch": 1.12, + "grad_norm": 0.3405565407726896, + "learning_rate": 8.765283831992266e-05, + "loss": 1.0601, + "step": 11757 + }, + { + "epoch": 1.12, + "grad_norm": 0.29045655336969156, + "learning_rate": 8.76371400329034e-05, + "loss": 1.0594, + "step": 11758 + }, + { + "epoch": 1.12, + "grad_norm": 0.29727194932359413, + "learning_rate": 8.762144205527253e-05, + "loss": 1.0492, + "step": 11759 + }, + { + "epoch": 1.13, + "grad_norm": 0.29393228805917765, + "learning_rate": 8.760574438742286e-05, + "loss": 1.095, + "step": 11760 + }, + { + "epoch": 1.13, + "grad_norm": 0.3336296128174042, + "learning_rate": 8.759004702974722e-05, + "loss": 1.1061, + "step": 11761 + }, + { + "epoch": 1.13, + "grad_norm": 0.30414695446469087, + "learning_rate": 8.757434998263845e-05, + "loss": 1.0356, + "step": 11762 + }, + { + "epoch": 1.13, + "grad_norm": 0.3082968547654127, + "learning_rate": 8.755865324648938e-05, + "loss": 1.081, + "step": 11763 + }, + { + "epoch": 1.13, + "grad_norm": 0.30902704442047174, + "learning_rate": 8.754295682169281e-05, + "loss": 1.1735, + "step": 11764 + }, + { + "epoch": 1.13, + "grad_norm": 0.322669527521351, + "learning_rate": 8.752726070864158e-05, + "loss": 1.0706, + "step": 11765 + }, + { + "epoch": 1.13, + "grad_norm": 0.361129076727717, + "learning_rate": 8.75115649077285e-05, + "loss": 1.0307, + "step": 11766 + }, + { + "epoch": 1.13, + "grad_norm": 0.30587856181161655, + "learning_rate": 8.749586941934636e-05, + "loss": 0.9579, + "step": 11767 + }, + { + "epoch": 1.13, + "grad_norm": 0.34375706835277664, + "learning_rate": 8.748017424388794e-05, + "loss": 1.0316, + "step": 11768 + }, + { + "epoch": 1.13, + "grad_norm": 0.3130585608347365, + "learning_rate": 8.746447938174604e-05, + "loss": 1.1016, + "step": 11769 + }, + { + "epoch": 1.13, + "grad_norm": 0.2992835129740237, + "learning_rate": 8.744878483331342e-05, + "loss": 1.0469, + "step": 11770 + }, + { + "epoch": 1.13, + "grad_norm": 0.32583328662299327, + "learning_rate": 8.743309059898281e-05, + "loss": 1.0707, + "step": 11771 + }, + { + "epoch": 1.13, + "grad_norm": 0.3290551116371758, + "learning_rate": 8.741739667914703e-05, + "loss": 1.0328, + "step": 11772 + }, + { + "epoch": 1.13, + "grad_norm": 0.32902614786385054, + "learning_rate": 8.740170307419882e-05, + "loss": 0.9208, + "step": 11773 + }, + { + "epoch": 1.13, + "grad_norm": 0.2800208661283637, + "learning_rate": 8.738600978453091e-05, + "loss": 0.9725, + "step": 11774 + }, + { + "epoch": 1.13, + "grad_norm": 0.28445989285841317, + "learning_rate": 8.737031681053603e-05, + "loss": 0.91, + "step": 11775 + }, + { + "epoch": 1.13, + "grad_norm": 0.30295137542091016, + "learning_rate": 8.735462415260691e-05, + "loss": 1.0021, + "step": 11776 + }, + { + "epoch": 1.13, + "grad_norm": 0.3185673764141133, + "learning_rate": 8.733893181113626e-05, + "loss": 1.0135, + "step": 11777 + }, + { + "epoch": 1.13, + "grad_norm": 0.29855680394090717, + "learning_rate": 8.732323978651676e-05, + "loss": 1.0855, + "step": 11778 + }, + { + "epoch": 1.13, + "grad_norm": 0.3090837036457281, + "learning_rate": 8.73075480791412e-05, + "loss": 1.0388, + "step": 11779 + }, + { + "epoch": 1.13, + "grad_norm": 0.33543756569125666, + "learning_rate": 8.729185668940223e-05, + "loss": 1.1234, + "step": 11780 + }, + { + "epoch": 1.13, + "grad_norm": 0.32821962056393456, + "learning_rate": 8.727616561769252e-05, + "loss": 1.0283, + "step": 11781 + }, + { + "epoch": 1.13, + "grad_norm": 0.30129756660683893, + "learning_rate": 8.726047486440477e-05, + "loss": 1.1068, + "step": 11782 + }, + { + "epoch": 1.13, + "grad_norm": 0.30721081177880943, + "learning_rate": 8.724478442993165e-05, + "loss": 0.9544, + "step": 11783 + }, + { + "epoch": 1.13, + "grad_norm": 0.2977355614104344, + "learning_rate": 8.72290943146658e-05, + "loss": 1.0995, + "step": 11784 + }, + { + "epoch": 1.13, + "grad_norm": 0.3308848972508445, + "learning_rate": 8.721340451899985e-05, + "loss": 1.1038, + "step": 11785 + }, + { + "epoch": 1.13, + "grad_norm": 0.30623548626019165, + "learning_rate": 8.719771504332655e-05, + "loss": 1.0619, + "step": 11786 + }, + { + "epoch": 1.13, + "grad_norm": 0.2885561933358289, + "learning_rate": 8.718202588803845e-05, + "loss": 1.0407, + "step": 11787 + }, + { + "epoch": 1.13, + "grad_norm": 0.3516382885788529, + "learning_rate": 8.716633705352823e-05, + "loss": 1.1285, + "step": 11788 + }, + { + "epoch": 1.13, + "grad_norm": 0.2861859210047231, + "learning_rate": 8.715064854018846e-05, + "loss": 1.0219, + "step": 11789 + }, + { + "epoch": 1.13, + "grad_norm": 0.30359596619761764, + "learning_rate": 8.71349603484118e-05, + "loss": 1.1126, + "step": 11790 + }, + { + "epoch": 1.13, + "grad_norm": 0.31168415979248204, + "learning_rate": 8.711927247859087e-05, + "loss": 1.0375, + "step": 11791 + }, + { + "epoch": 1.13, + "grad_norm": 0.3180508406494047, + "learning_rate": 8.710358493111819e-05, + "loss": 1.0157, + "step": 11792 + }, + { + "epoch": 1.13, + "grad_norm": 0.3228930982348067, + "learning_rate": 8.708789770638641e-05, + "loss": 1.0209, + "step": 11793 + }, + { + "epoch": 1.13, + "grad_norm": 0.30811737138875095, + "learning_rate": 8.707221080478813e-05, + "loss": 1.1696, + "step": 11794 + }, + { + "epoch": 1.13, + "grad_norm": 0.30151828581676104, + "learning_rate": 8.705652422671591e-05, + "loss": 1.1165, + "step": 11795 + }, + { + "epoch": 1.13, + "grad_norm": 0.28914836669109695, + "learning_rate": 8.704083797256228e-05, + "loss": 0.9861, + "step": 11796 + }, + { + "epoch": 1.13, + "grad_norm": 0.3239326590358015, + "learning_rate": 8.702515204271985e-05, + "loss": 0.959, + "step": 11797 + }, + { + "epoch": 1.13, + "grad_norm": 0.28597698215000256, + "learning_rate": 8.700946643758113e-05, + "loss": 1.0847, + "step": 11798 + }, + { + "epoch": 1.13, + "grad_norm": 0.28268334547918067, + "learning_rate": 8.699378115753864e-05, + "loss": 0.9484, + "step": 11799 + }, + { + "epoch": 1.13, + "grad_norm": 0.27593099919795827, + "learning_rate": 8.697809620298498e-05, + "loss": 1.0066, + "step": 11800 + }, + { + "epoch": 1.13, + "grad_norm": 0.27181854647406384, + "learning_rate": 8.696241157431266e-05, + "loss": 1.0284, + "step": 11801 + }, + { + "epoch": 1.13, + "grad_norm": 0.32609495158389945, + "learning_rate": 8.694672727191419e-05, + "loss": 1.0009, + "step": 11802 + }, + { + "epoch": 1.13, + "grad_norm": 0.29285643032671044, + "learning_rate": 8.693104329618205e-05, + "loss": 0.9513, + "step": 11803 + }, + { + "epoch": 1.13, + "grad_norm": 0.2982046335227466, + "learning_rate": 8.691535964750879e-05, + "loss": 0.9839, + "step": 11804 + }, + { + "epoch": 1.13, + "grad_norm": 0.29207884763792824, + "learning_rate": 8.689967632628683e-05, + "loss": 0.9365, + "step": 11805 + }, + { + "epoch": 1.13, + "grad_norm": 0.27467443537203357, + "learning_rate": 8.688399333290873e-05, + "loss": 1.0572, + "step": 11806 + }, + { + "epoch": 1.13, + "grad_norm": 0.30621137633822965, + "learning_rate": 8.686831066776697e-05, + "loss": 0.9592, + "step": 11807 + }, + { + "epoch": 1.13, + "grad_norm": 0.30769532529834404, + "learning_rate": 8.685262833125396e-05, + "loss": 1.0677, + "step": 11808 + }, + { + "epoch": 1.13, + "grad_norm": 0.34001535527259263, + "learning_rate": 8.683694632376218e-05, + "loss": 0.9436, + "step": 11809 + }, + { + "epoch": 1.13, + "grad_norm": 0.30261346099553693, + "learning_rate": 8.682126464568411e-05, + "loss": 1.0503, + "step": 11810 + }, + { + "epoch": 1.13, + "grad_norm": 0.3377769587129517, + "learning_rate": 8.680558329741217e-05, + "loss": 1.0026, + "step": 11811 + }, + { + "epoch": 1.13, + "grad_norm": 0.2867519498752575, + "learning_rate": 8.678990227933878e-05, + "loss": 1.0166, + "step": 11812 + }, + { + "epoch": 1.13, + "grad_norm": 0.3253171694454577, + "learning_rate": 8.67742215918564e-05, + "loss": 0.9547, + "step": 11813 + }, + { + "epoch": 1.13, + "grad_norm": 0.29605089417091124, + "learning_rate": 8.675854123535745e-05, + "loss": 0.9361, + "step": 11814 + }, + { + "epoch": 1.13, + "grad_norm": 0.30370683186358644, + "learning_rate": 8.674286121023432e-05, + "loss": 0.9554, + "step": 11815 + }, + { + "epoch": 1.13, + "grad_norm": 0.27140176637657093, + "learning_rate": 8.672718151687943e-05, + "loss": 1.0744, + "step": 11816 + }, + { + "epoch": 1.13, + "grad_norm": 0.33202882155992763, + "learning_rate": 8.671150215568515e-05, + "loss": 0.9769, + "step": 11817 + }, + { + "epoch": 1.13, + "grad_norm": 0.2644516051875476, + "learning_rate": 8.66958231270439e-05, + "loss": 1.0416, + "step": 11818 + }, + { + "epoch": 1.13, + "grad_norm": 0.28124522388950374, + "learning_rate": 8.668014443134799e-05, + "loss": 1.0023, + "step": 11819 + }, + { + "epoch": 1.13, + "grad_norm": 0.2926456226498094, + "learning_rate": 8.666446606898988e-05, + "loss": 1.0498, + "step": 11820 + }, + { + "epoch": 1.13, + "grad_norm": 0.29133376219692897, + "learning_rate": 8.664878804036188e-05, + "loss": 1.0512, + "step": 11821 + }, + { + "epoch": 1.13, + "grad_norm": 0.3344670092893148, + "learning_rate": 8.663311034585635e-05, + "loss": 0.9974, + "step": 11822 + }, + { + "epoch": 1.13, + "grad_norm": 0.3500231420643422, + "learning_rate": 8.661743298586564e-05, + "loss": 0.9743, + "step": 11823 + }, + { + "epoch": 1.13, + "grad_norm": 0.31447008266799525, + "learning_rate": 8.660175596078207e-05, + "loss": 1.0313, + "step": 11824 + }, + { + "epoch": 1.13, + "grad_norm": 0.30942221757837385, + "learning_rate": 8.658607927099798e-05, + "loss": 0.9948, + "step": 11825 + }, + { + "epoch": 1.13, + "grad_norm": 0.2649144410638964, + "learning_rate": 8.657040291690566e-05, + "loss": 1.0601, + "step": 11826 + }, + { + "epoch": 1.13, + "grad_norm": 0.3430980746964851, + "learning_rate": 8.655472689889748e-05, + "loss": 1.0965, + "step": 11827 + }, + { + "epoch": 1.13, + "grad_norm": 0.32546165846143654, + "learning_rate": 8.653905121736571e-05, + "loss": 1.0108, + "step": 11828 + }, + { + "epoch": 1.13, + "grad_norm": 0.30362154432573085, + "learning_rate": 8.652337587270264e-05, + "loss": 1.0487, + "step": 11829 + }, + { + "epoch": 1.13, + "grad_norm": 0.28565198116500046, + "learning_rate": 8.650770086530053e-05, + "loss": 1.0841, + "step": 11830 + }, + { + "epoch": 1.13, + "grad_norm": 0.24869245021703726, + "learning_rate": 8.649202619555172e-05, + "loss": 0.9842, + "step": 11831 + }, + { + "epoch": 1.13, + "grad_norm": 0.33783487459344, + "learning_rate": 8.647635186384842e-05, + "loss": 0.9542, + "step": 11832 + }, + { + "epoch": 1.13, + "grad_norm": 0.34260039750953986, + "learning_rate": 8.646067787058294e-05, + "loss": 1.0903, + "step": 11833 + }, + { + "epoch": 1.13, + "grad_norm": 0.32569204381473477, + "learning_rate": 8.644500421614747e-05, + "loss": 1.0232, + "step": 11834 + }, + { + "epoch": 1.13, + "grad_norm": 0.3100379534873475, + "learning_rate": 8.642933090093432e-05, + "loss": 1.1247, + "step": 11835 + }, + { + "epoch": 1.13, + "grad_norm": 0.2763889763573153, + "learning_rate": 8.641365792533569e-05, + "loss": 0.9429, + "step": 11836 + }, + { + "epoch": 1.13, + "grad_norm": 0.30510322342339763, + "learning_rate": 8.639798528974382e-05, + "loss": 1.0536, + "step": 11837 + }, + { + "epoch": 1.13, + "grad_norm": 0.275387050111551, + "learning_rate": 8.638231299455089e-05, + "loss": 1.0423, + "step": 11838 + }, + { + "epoch": 1.13, + "grad_norm": 0.2910180566275121, + "learning_rate": 8.636664104014911e-05, + "loss": 1.0107, + "step": 11839 + }, + { + "epoch": 1.13, + "grad_norm": 0.27488725950319126, + "learning_rate": 8.635096942693075e-05, + "loss": 1.0189, + "step": 11840 + }, + { + "epoch": 1.13, + "grad_norm": 0.29942772590823175, + "learning_rate": 8.633529815528795e-05, + "loss": 1.0553, + "step": 11841 + }, + { + "epoch": 1.13, + "grad_norm": 0.2788638692664182, + "learning_rate": 8.631962722561292e-05, + "loss": 1.1867, + "step": 11842 + }, + { + "epoch": 1.13, + "grad_norm": 0.3113945625820475, + "learning_rate": 8.63039566382978e-05, + "loss": 1.042, + "step": 11843 + }, + { + "epoch": 1.13, + "grad_norm": 0.3094309666647985, + "learning_rate": 8.628828639373479e-05, + "loss": 0.9281, + "step": 11844 + }, + { + "epoch": 1.13, + "grad_norm": 0.2850915232368822, + "learning_rate": 8.627261649231602e-05, + "loss": 0.9867, + "step": 11845 + }, + { + "epoch": 1.13, + "grad_norm": 0.3242532037626142, + "learning_rate": 8.625694693443358e-05, + "loss": 1.1184, + "step": 11846 + }, + { + "epoch": 1.13, + "grad_norm": 0.2851352325547713, + "learning_rate": 8.624127772047977e-05, + "loss": 0.9683, + "step": 11847 + }, + { + "epoch": 1.13, + "grad_norm": 0.32932041875620255, + "learning_rate": 8.62256088508466e-05, + "loss": 1.0244, + "step": 11848 + }, + { + "epoch": 1.13, + "grad_norm": 0.33072773766041846, + "learning_rate": 8.620994032592624e-05, + "loss": 0.9823, + "step": 11849 + }, + { + "epoch": 1.13, + "grad_norm": 0.31249857477662385, + "learning_rate": 8.619427214611078e-05, + "loss": 1.203, + "step": 11850 + }, + { + "epoch": 1.13, + "grad_norm": 0.31567333168813816, + "learning_rate": 8.617860431179233e-05, + "loss": 1.0565, + "step": 11851 + }, + { + "epoch": 1.13, + "grad_norm": 0.2834981310960545, + "learning_rate": 8.616293682336299e-05, + "loss": 1.0067, + "step": 11852 + }, + { + "epoch": 1.13, + "grad_norm": 0.2710601777991362, + "learning_rate": 8.614726968121485e-05, + "loss": 1.016, + "step": 11853 + }, + { + "epoch": 1.13, + "grad_norm": 0.29154931528696243, + "learning_rate": 8.613160288574e-05, + "loss": 1.0881, + "step": 11854 + }, + { + "epoch": 1.13, + "grad_norm": 0.330233949759841, + "learning_rate": 8.611593643733051e-05, + "loss": 0.9853, + "step": 11855 + }, + { + "epoch": 1.13, + "grad_norm": 0.34604183347630596, + "learning_rate": 8.610027033637842e-05, + "loss": 1.0645, + "step": 11856 + }, + { + "epoch": 1.13, + "grad_norm": 0.2928952171002599, + "learning_rate": 8.608460458327583e-05, + "loss": 1.0578, + "step": 11857 + }, + { + "epoch": 1.13, + "grad_norm": 0.29650039029115177, + "learning_rate": 8.606893917841474e-05, + "loss": 0.974, + "step": 11858 + }, + { + "epoch": 1.13, + "grad_norm": 0.28378564027064557, + "learning_rate": 8.605327412218723e-05, + "loss": 1.1073, + "step": 11859 + }, + { + "epoch": 1.13, + "grad_norm": 0.28121518941415935, + "learning_rate": 8.603760941498522e-05, + "loss": 1.0705, + "step": 11860 + }, + { + "epoch": 1.13, + "grad_norm": 0.29879822152916036, + "learning_rate": 8.602194505720087e-05, + "loss": 0.9652, + "step": 11861 + }, + { + "epoch": 1.13, + "grad_norm": 0.31106621247094984, + "learning_rate": 8.600628104922612e-05, + "loss": 1.006, + "step": 11862 + }, + { + "epoch": 1.13, + "grad_norm": 0.2990807244709715, + "learning_rate": 8.599061739145299e-05, + "loss": 1.0577, + "step": 11863 + }, + { + "epoch": 1.14, + "grad_norm": 0.2844949451917499, + "learning_rate": 8.597495408427347e-05, + "loss": 1.028, + "step": 11864 + }, + { + "epoch": 1.14, + "grad_norm": 0.30803023881084896, + "learning_rate": 8.595929112807954e-05, + "loss": 1.0838, + "step": 11865 + }, + { + "epoch": 1.14, + "grad_norm": 0.2947064452535329, + "learning_rate": 8.594362852326313e-05, + "loss": 1.0791, + "step": 11866 + }, + { + "epoch": 1.14, + "grad_norm": 0.30571016874465307, + "learning_rate": 8.592796627021627e-05, + "loss": 1.0467, + "step": 11867 + }, + { + "epoch": 1.14, + "grad_norm": 0.3186681058497015, + "learning_rate": 8.591230436933091e-05, + "loss": 1.0603, + "step": 11868 + }, + { + "epoch": 1.14, + "grad_norm": 0.2857203823972958, + "learning_rate": 8.589664282099899e-05, + "loss": 0.9875, + "step": 11869 + }, + { + "epoch": 1.14, + "grad_norm": 0.2799431137513968, + "learning_rate": 8.588098162561246e-05, + "loss": 0.9576, + "step": 11870 + }, + { + "epoch": 1.14, + "grad_norm": 0.34181834994581756, + "learning_rate": 8.58653207835632e-05, + "loss": 1.0174, + "step": 11871 + }, + { + "epoch": 1.14, + "grad_norm": 0.33537731836829465, + "learning_rate": 8.584966029524319e-05, + "loss": 1.0364, + "step": 11872 + }, + { + "epoch": 1.14, + "grad_norm": 0.3728683756769394, + "learning_rate": 8.583400016104432e-05, + "loss": 0.9726, + "step": 11873 + }, + { + "epoch": 1.14, + "grad_norm": 0.3052409130796934, + "learning_rate": 8.581834038135851e-05, + "loss": 0.9402, + "step": 11874 + }, + { + "epoch": 1.14, + "grad_norm": 0.3117793676187802, + "learning_rate": 8.580268095657761e-05, + "loss": 1.056, + "step": 11875 + }, + { + "epoch": 1.14, + "grad_norm": 0.32520833460873727, + "learning_rate": 8.578702188709359e-05, + "loss": 0.9516, + "step": 11876 + }, + { + "epoch": 1.14, + "grad_norm": 0.29535146609744267, + "learning_rate": 8.577136317329827e-05, + "loss": 0.9522, + "step": 11877 + }, + { + "epoch": 1.14, + "grad_norm": 0.2682231737825024, + "learning_rate": 8.575570481558352e-05, + "loss": 0.9586, + "step": 11878 + }, + { + "epoch": 1.14, + "grad_norm": 0.33724451177352555, + "learning_rate": 8.574004681434122e-05, + "loss": 1.0682, + "step": 11879 + }, + { + "epoch": 1.14, + "grad_norm": 0.2963937729560492, + "learning_rate": 8.572438916996316e-05, + "loss": 1.0416, + "step": 11880 + }, + { + "epoch": 1.14, + "grad_norm": 0.31520515299882, + "learning_rate": 8.570873188284128e-05, + "loss": 1.0857, + "step": 11881 + }, + { + "epoch": 1.14, + "grad_norm": 0.30147966136771354, + "learning_rate": 8.569307495336736e-05, + "loss": 1.0053, + "step": 11882 + }, + { + "epoch": 1.14, + "grad_norm": 0.3319813199380165, + "learning_rate": 8.567741838193324e-05, + "loss": 0.9836, + "step": 11883 + }, + { + "epoch": 1.14, + "grad_norm": 0.2920875954459454, + "learning_rate": 8.566176216893072e-05, + "loss": 0.897, + "step": 11884 + }, + { + "epoch": 1.14, + "grad_norm": 0.3146924489171864, + "learning_rate": 8.564610631475161e-05, + "loss": 1.082, + "step": 11885 + }, + { + "epoch": 1.14, + "grad_norm": 0.2999674039699899, + "learning_rate": 8.563045081978773e-05, + "loss": 1.0641, + "step": 11886 + }, + { + "epoch": 1.14, + "grad_norm": 0.28403172341229044, + "learning_rate": 8.561479568443078e-05, + "loss": 1.0011, + "step": 11887 + }, + { + "epoch": 1.14, + "grad_norm": 0.32269253934162867, + "learning_rate": 8.559914090907266e-05, + "loss": 0.9911, + "step": 11888 + }, + { + "epoch": 1.14, + "grad_norm": 0.31596010031828053, + "learning_rate": 8.55834864941051e-05, + "loss": 0.9134, + "step": 11889 + }, + { + "epoch": 1.14, + "grad_norm": 0.2921069744707491, + "learning_rate": 8.556783243991984e-05, + "loss": 1.1171, + "step": 11890 + }, + { + "epoch": 1.14, + "grad_norm": 0.2907036291179976, + "learning_rate": 8.555217874690866e-05, + "loss": 0.9039, + "step": 11891 + }, + { + "epoch": 1.14, + "grad_norm": 0.3113868894900095, + "learning_rate": 8.553652541546324e-05, + "loss": 1.0583, + "step": 11892 + }, + { + "epoch": 1.14, + "grad_norm": 0.3202622587626687, + "learning_rate": 8.552087244597538e-05, + "loss": 0.9521, + "step": 11893 + }, + { + "epoch": 1.14, + "grad_norm": 0.3109902550591866, + "learning_rate": 8.55052198388368e-05, + "loss": 0.9923, + "step": 11894 + }, + { + "epoch": 1.14, + "grad_norm": 0.3048702190862498, + "learning_rate": 8.54895675944392e-05, + "loss": 1.0201, + "step": 11895 + }, + { + "epoch": 1.14, + "grad_norm": 0.28202816333623226, + "learning_rate": 8.547391571317427e-05, + "loss": 1.0275, + "step": 11896 + }, + { + "epoch": 1.14, + "grad_norm": 0.30740681970613987, + "learning_rate": 8.545826419543377e-05, + "loss": 1.0503, + "step": 11897 + }, + { + "epoch": 1.14, + "grad_norm": 0.3159798248830845, + "learning_rate": 8.544261304160932e-05, + "loss": 0.8949, + "step": 11898 + }, + { + "epoch": 1.14, + "grad_norm": 0.31168704004158576, + "learning_rate": 8.542696225209264e-05, + "loss": 1.0214, + "step": 11899 + }, + { + "epoch": 1.14, + "grad_norm": 0.30711258045297957, + "learning_rate": 8.541131182727536e-05, + "loss": 0.9988, + "step": 11900 + }, + { + "epoch": 1.14, + "grad_norm": 0.2769325730448447, + "learning_rate": 8.539566176754918e-05, + "loss": 1.1316, + "step": 11901 + }, + { + "epoch": 1.14, + "grad_norm": 0.25697574557752956, + "learning_rate": 8.538001207330577e-05, + "loss": 0.9369, + "step": 11902 + }, + { + "epoch": 1.14, + "grad_norm": 0.3466838146024124, + "learning_rate": 8.536436274493673e-05, + "loss": 1.0705, + "step": 11903 + }, + { + "epoch": 1.14, + "grad_norm": 0.25542202605106246, + "learning_rate": 8.534871378283373e-05, + "loss": 0.9368, + "step": 11904 + }, + { + "epoch": 1.14, + "grad_norm": 0.3149231011222603, + "learning_rate": 8.533306518738836e-05, + "loss": 1.0069, + "step": 11905 + }, + { + "epoch": 1.14, + "grad_norm": 0.29491285224515057, + "learning_rate": 8.531741695899226e-05, + "loss": 1.0269, + "step": 11906 + }, + { + "epoch": 1.14, + "grad_norm": 0.33309319946649235, + "learning_rate": 8.530176909803698e-05, + "loss": 1.0467, + "step": 11907 + }, + { + "epoch": 1.14, + "grad_norm": 0.3038994434204986, + "learning_rate": 8.52861216049142e-05, + "loss": 1.1545, + "step": 11908 + }, + { + "epoch": 1.14, + "grad_norm": 0.35963260431807964, + "learning_rate": 8.527047448001549e-05, + "loss": 1.0049, + "step": 11909 + }, + { + "epoch": 1.14, + "grad_norm": 0.2919270611385858, + "learning_rate": 8.52548277237324e-05, + "loss": 0.9582, + "step": 11910 + }, + { + "epoch": 1.14, + "grad_norm": 0.3187957943429259, + "learning_rate": 8.523918133645651e-05, + "loss": 1.027, + "step": 11911 + }, + { + "epoch": 1.14, + "grad_norm": 0.2930576340570574, + "learning_rate": 8.522353531857939e-05, + "loss": 1.0845, + "step": 11912 + }, + { + "epoch": 1.14, + "grad_norm": 0.2982517349451332, + "learning_rate": 8.520788967049258e-05, + "loss": 1.007, + "step": 11913 + }, + { + "epoch": 1.14, + "grad_norm": 0.3011079753995403, + "learning_rate": 8.519224439258762e-05, + "loss": 1.0469, + "step": 11914 + }, + { + "epoch": 1.14, + "grad_norm": 0.3253103783030644, + "learning_rate": 8.517659948525607e-05, + "loss": 0.9936, + "step": 11915 + }, + { + "epoch": 1.14, + "grad_norm": 0.3475575621954856, + "learning_rate": 8.51609549488894e-05, + "loss": 0.985, + "step": 11916 + }, + { + "epoch": 1.14, + "grad_norm": 0.31773566645254236, + "learning_rate": 8.51453107838792e-05, + "loss": 1.092, + "step": 11917 + }, + { + "epoch": 1.14, + "grad_norm": 0.2954701786289484, + "learning_rate": 8.512966699061692e-05, + "loss": 0.9746, + "step": 11918 + }, + { + "epoch": 1.14, + "grad_norm": 0.33243607536418507, + "learning_rate": 8.511402356949408e-05, + "loss": 1.0805, + "step": 11919 + }, + { + "epoch": 1.14, + "grad_norm": 0.3439263078427064, + "learning_rate": 8.509838052090213e-05, + "loss": 0.958, + "step": 11920 + }, + { + "epoch": 1.14, + "grad_norm": 0.2724456177565354, + "learning_rate": 8.508273784523255e-05, + "loss": 1.168, + "step": 11921 + }, + { + "epoch": 1.14, + "grad_norm": 0.29646226501292205, + "learning_rate": 8.506709554287688e-05, + "loss": 1.0709, + "step": 11922 + }, + { + "epoch": 1.14, + "grad_norm": 0.26975077689927063, + "learning_rate": 8.505145361422651e-05, + "loss": 1.0544, + "step": 11923 + }, + { + "epoch": 1.14, + "grad_norm": 0.34913464075436507, + "learning_rate": 8.503581205967293e-05, + "loss": 0.9051, + "step": 11924 + }, + { + "epoch": 1.14, + "grad_norm": 0.3427194897321703, + "learning_rate": 8.502017087960754e-05, + "loss": 1.0259, + "step": 11925 + }, + { + "epoch": 1.14, + "grad_norm": 0.34875041099481124, + "learning_rate": 8.500453007442179e-05, + "loss": 1.0719, + "step": 11926 + }, + { + "epoch": 1.14, + "grad_norm": 0.2974344432279597, + "learning_rate": 8.498888964450706e-05, + "loss": 1.0815, + "step": 11927 + }, + { + "epoch": 1.14, + "grad_norm": 0.3008260506605528, + "learning_rate": 8.497324959025485e-05, + "loss": 1.1976, + "step": 11928 + }, + { + "epoch": 1.14, + "grad_norm": 0.30103767946716553, + "learning_rate": 8.495760991205651e-05, + "loss": 1.0106, + "step": 11929 + }, + { + "epoch": 1.14, + "grad_norm": 0.28039604697216197, + "learning_rate": 8.494197061030342e-05, + "loss": 1.0317, + "step": 11930 + }, + { + "epoch": 1.14, + "grad_norm": 0.3095289242452046, + "learning_rate": 8.4926331685387e-05, + "loss": 0.9769, + "step": 11931 + }, + { + "epoch": 1.14, + "grad_norm": 0.3187529979372792, + "learning_rate": 8.491069313769859e-05, + "loss": 1.0189, + "step": 11932 + }, + { + "epoch": 1.14, + "grad_norm": 0.3006079442617397, + "learning_rate": 8.489505496762955e-05, + "loss": 1.1971, + "step": 11933 + }, + { + "epoch": 1.14, + "grad_norm": 0.3301101188462815, + "learning_rate": 8.487941717557125e-05, + "loss": 1.0978, + "step": 11934 + }, + { + "epoch": 1.14, + "grad_norm": 0.28643757034553596, + "learning_rate": 8.486377976191506e-05, + "loss": 0.9968, + "step": 11935 + }, + { + "epoch": 1.14, + "grad_norm": 0.31957336369639566, + "learning_rate": 8.48481427270523e-05, + "loss": 1.0517, + "step": 11936 + }, + { + "epoch": 1.14, + "grad_norm": 0.30459489173667564, + "learning_rate": 8.483250607137428e-05, + "loss": 1.0803, + "step": 11937 + }, + { + "epoch": 1.14, + "grad_norm": 0.3238470251011299, + "learning_rate": 8.481686979527233e-05, + "loss": 1.0844, + "step": 11938 + }, + { + "epoch": 1.14, + "grad_norm": 0.313680961340711, + "learning_rate": 8.480123389913776e-05, + "loss": 1.1253, + "step": 11939 + }, + { + "epoch": 1.14, + "grad_norm": 0.35043858704285974, + "learning_rate": 8.478559838336186e-05, + "loss": 1.0169, + "step": 11940 + }, + { + "epoch": 1.14, + "grad_norm": 0.3119449649459266, + "learning_rate": 8.47699632483359e-05, + "loss": 1.0611, + "step": 11941 + }, + { + "epoch": 1.14, + "grad_norm": 0.2888170911908191, + "learning_rate": 8.47543284944512e-05, + "loss": 1.0091, + "step": 11942 + }, + { + "epoch": 1.14, + "grad_norm": 0.32266303216249037, + "learning_rate": 8.473869412209902e-05, + "loss": 1.0318, + "step": 11943 + }, + { + "epoch": 1.14, + "grad_norm": 0.29481213520925764, + "learning_rate": 8.472306013167061e-05, + "loss": 1.0373, + "step": 11944 + }, + { + "epoch": 1.14, + "grad_norm": 0.2881410595379304, + "learning_rate": 8.470742652355722e-05, + "loss": 1.0283, + "step": 11945 + }, + { + "epoch": 1.14, + "grad_norm": 0.3285631553797132, + "learning_rate": 8.469179329815009e-05, + "loss": 1.07, + "step": 11946 + }, + { + "epoch": 1.14, + "grad_norm": 0.30563610891600146, + "learning_rate": 8.467616045584047e-05, + "loss": 1.0107, + "step": 11947 + }, + { + "epoch": 1.14, + "grad_norm": 0.3072181354097928, + "learning_rate": 8.466052799701951e-05, + "loss": 1.0242, + "step": 11948 + }, + { + "epoch": 1.14, + "grad_norm": 0.3262030402317795, + "learning_rate": 8.464489592207852e-05, + "loss": 0.9325, + "step": 11949 + }, + { + "epoch": 1.14, + "grad_norm": 0.30609772256898393, + "learning_rate": 8.462926423140865e-05, + "loss": 1.0921, + "step": 11950 + }, + { + "epoch": 1.14, + "grad_norm": 0.3170012769022536, + "learning_rate": 8.461363292540112e-05, + "loss": 1.047, + "step": 11951 + }, + { + "epoch": 1.14, + "grad_norm": 0.2922757347893426, + "learning_rate": 8.45980020044471e-05, + "loss": 1.0299, + "step": 11952 + }, + { + "epoch": 1.14, + "grad_norm": 0.2601252766380446, + "learning_rate": 8.458237146893772e-05, + "loss": 0.987, + "step": 11953 + }, + { + "epoch": 1.14, + "grad_norm": 0.29512394304791134, + "learning_rate": 8.456674131926418e-05, + "loss": 1.0488, + "step": 11954 + }, + { + "epoch": 1.14, + "grad_norm": 0.29142637151951367, + "learning_rate": 8.455111155581768e-05, + "loss": 0.9187, + "step": 11955 + }, + { + "epoch": 1.14, + "grad_norm": 0.28278792629375377, + "learning_rate": 8.453548217898931e-05, + "loss": 1.0659, + "step": 11956 + }, + { + "epoch": 1.14, + "grad_norm": 0.277813407778333, + "learning_rate": 8.45198531891702e-05, + "loss": 0.908, + "step": 11957 + }, + { + "epoch": 1.14, + "grad_norm": 0.32063121801380473, + "learning_rate": 8.450422458675152e-05, + "loss": 1.0276, + "step": 11958 + }, + { + "epoch": 1.14, + "grad_norm": 0.3091718706439481, + "learning_rate": 8.448859637212435e-05, + "loss": 0.9032, + "step": 11959 + }, + { + "epoch": 1.14, + "grad_norm": 0.30669748337184594, + "learning_rate": 8.447296854567982e-05, + "loss": 1.0204, + "step": 11960 + }, + { + "epoch": 1.14, + "grad_norm": 0.3131448982549371, + "learning_rate": 8.445734110780894e-05, + "loss": 1.0364, + "step": 11961 + }, + { + "epoch": 1.14, + "grad_norm": 0.296551145574413, + "learning_rate": 8.444171405890292e-05, + "loss": 1.0667, + "step": 11962 + }, + { + "epoch": 1.14, + "grad_norm": 0.27695107060797824, + "learning_rate": 8.442608739935279e-05, + "loss": 1.0695, + "step": 11963 + }, + { + "epoch": 1.14, + "grad_norm": 0.33706839656551024, + "learning_rate": 8.441046112954961e-05, + "loss": 1.0493, + "step": 11964 + }, + { + "epoch": 1.14, + "grad_norm": 0.33804130783497127, + "learning_rate": 8.439483524988443e-05, + "loss": 0.9816, + "step": 11965 + }, + { + "epoch": 1.14, + "grad_norm": 0.29538678949100833, + "learning_rate": 8.437920976074831e-05, + "loss": 1.0881, + "step": 11966 + }, + { + "epoch": 1.14, + "grad_norm": 0.351513194683739, + "learning_rate": 8.436358466253229e-05, + "loss": 1.0761, + "step": 11967 + }, + { + "epoch": 1.14, + "grad_norm": 0.35440588716678245, + "learning_rate": 8.434795995562734e-05, + "loss": 1.0186, + "step": 11968 + }, + { + "epoch": 1.15, + "grad_norm": 0.3055558275083587, + "learning_rate": 8.433233564042457e-05, + "loss": 1.0491, + "step": 11969 + }, + { + "epoch": 1.15, + "grad_norm": 0.309657102579547, + "learning_rate": 8.431671171731496e-05, + "loss": 0.9487, + "step": 11970 + }, + { + "epoch": 1.15, + "grad_norm": 0.3384184521147817, + "learning_rate": 8.43010881866895e-05, + "loss": 1.0393, + "step": 11971 + }, + { + "epoch": 1.15, + "grad_norm": 0.31835791023469556, + "learning_rate": 8.428546504893915e-05, + "loss": 1.0516, + "step": 11972 + }, + { + "epoch": 1.15, + "grad_norm": 0.3042894875648419, + "learning_rate": 8.426984230445493e-05, + "loss": 1.0794, + "step": 11973 + }, + { + "epoch": 1.15, + "grad_norm": 0.3364792874136485, + "learning_rate": 8.425421995362776e-05, + "loss": 1.0228, + "step": 11974 + }, + { + "epoch": 1.15, + "grad_norm": 0.2857527420490243, + "learning_rate": 8.423859799684864e-05, + "loss": 1.0498, + "step": 11975 + }, + { + "epoch": 1.15, + "grad_norm": 0.31760925263083734, + "learning_rate": 8.422297643450853e-05, + "loss": 1.1176, + "step": 11976 + }, + { + "epoch": 1.15, + "grad_norm": 0.3382539370612801, + "learning_rate": 8.420735526699836e-05, + "loss": 1.0701, + "step": 11977 + }, + { + "epoch": 1.15, + "grad_norm": 0.31683668494237005, + "learning_rate": 8.419173449470901e-05, + "loss": 1.0265, + "step": 11978 + }, + { + "epoch": 1.15, + "grad_norm": 0.34508301718444545, + "learning_rate": 8.417611411803147e-05, + "loss": 0.9461, + "step": 11979 + }, + { + "epoch": 1.15, + "grad_norm": 0.30899855868831116, + "learning_rate": 8.416049413735662e-05, + "loss": 1.1135, + "step": 11980 + }, + { + "epoch": 1.15, + "grad_norm": 0.3151666005549788, + "learning_rate": 8.414487455307536e-05, + "loss": 1.0614, + "step": 11981 + }, + { + "epoch": 1.15, + "grad_norm": 0.30891988716290564, + "learning_rate": 8.412925536557853e-05, + "loss": 1.0521, + "step": 11982 + }, + { + "epoch": 1.15, + "grad_norm": 0.32419565452116916, + "learning_rate": 8.41136365752571e-05, + "loss": 1.029, + "step": 11983 + }, + { + "epoch": 1.15, + "grad_norm": 0.3201124305763516, + "learning_rate": 8.409801818250191e-05, + "loss": 0.8993, + "step": 11984 + }, + { + "epoch": 1.15, + "grad_norm": 0.2757534896461394, + "learning_rate": 8.40824001877038e-05, + "loss": 1.1687, + "step": 11985 + }, + { + "epoch": 1.15, + "grad_norm": 0.28602749439031644, + "learning_rate": 8.406678259125365e-05, + "loss": 1.0336, + "step": 11986 + }, + { + "epoch": 1.15, + "grad_norm": 0.3235735022158296, + "learning_rate": 8.405116539354226e-05, + "loss": 1.1011, + "step": 11987 + }, + { + "epoch": 1.15, + "grad_norm": 0.27168793494931687, + "learning_rate": 8.403554859496044e-05, + "loss": 0.9155, + "step": 11988 + }, + { + "epoch": 1.15, + "grad_norm": 0.348251182950838, + "learning_rate": 8.401993219589908e-05, + "loss": 1.0617, + "step": 11989 + }, + { + "epoch": 1.15, + "grad_norm": 0.2935436769214298, + "learning_rate": 8.400431619674898e-05, + "loss": 1.0783, + "step": 11990 + }, + { + "epoch": 1.15, + "grad_norm": 0.29091867965279034, + "learning_rate": 8.398870059790091e-05, + "loss": 1.0062, + "step": 11991 + }, + { + "epoch": 1.15, + "grad_norm": 0.30811303110009003, + "learning_rate": 8.397308539974567e-05, + "loss": 1.0498, + "step": 11992 + }, + { + "epoch": 1.15, + "grad_norm": 0.3375971475777704, + "learning_rate": 8.395747060267404e-05, + "loss": 1.0828, + "step": 11993 + }, + { + "epoch": 1.15, + "grad_norm": 0.26006696144404073, + "learning_rate": 8.394185620707676e-05, + "loss": 1.0363, + "step": 11994 + }, + { + "epoch": 1.15, + "grad_norm": 0.33577286914122667, + "learning_rate": 8.392624221334465e-05, + "loss": 1.0211, + "step": 11995 + }, + { + "epoch": 1.15, + "grad_norm": 0.2732792671110833, + "learning_rate": 8.391062862186843e-05, + "loss": 1.0098, + "step": 11996 + }, + { + "epoch": 1.15, + "grad_norm": 0.3349378855079747, + "learning_rate": 8.389501543303883e-05, + "loss": 1.0575, + "step": 11997 + }, + { + "epoch": 1.15, + "grad_norm": 0.3308733411870625, + "learning_rate": 8.387940264724659e-05, + "loss": 1.0011, + "step": 11998 + }, + { + "epoch": 1.15, + "grad_norm": 0.28545093764400836, + "learning_rate": 8.386379026488244e-05, + "loss": 0.9784, + "step": 11999 + }, + { + "epoch": 1.15, + "grad_norm": 0.3310553108583504, + "learning_rate": 8.384817828633708e-05, + "loss": 0.9417, + "step": 12000 + }, + { + "epoch": 1.15, + "grad_norm": 0.334414820703518, + "learning_rate": 8.383256671200123e-05, + "loss": 1.0726, + "step": 12001 + }, + { + "epoch": 1.15, + "grad_norm": 0.2931481464979092, + "learning_rate": 8.38169555422655e-05, + "loss": 0.9116, + "step": 12002 + }, + { + "epoch": 1.15, + "grad_norm": 0.3324438305466467, + "learning_rate": 8.380134477752067e-05, + "loss": 1.1738, + "step": 12003 + }, + { + "epoch": 1.15, + "grad_norm": 0.3043806541874354, + "learning_rate": 8.378573441815736e-05, + "loss": 1.0627, + "step": 12004 + }, + { + "epoch": 1.15, + "grad_norm": 0.3044276956844582, + "learning_rate": 8.377012446456625e-05, + "loss": 1.0045, + "step": 12005 + }, + { + "epoch": 1.15, + "grad_norm": 0.3077294081275074, + "learning_rate": 8.375451491713797e-05, + "loss": 0.9851, + "step": 12006 + }, + { + "epoch": 1.15, + "grad_norm": 0.2780555858669826, + "learning_rate": 8.373890577626316e-05, + "loss": 1.0731, + "step": 12007 + }, + { + "epoch": 1.15, + "grad_norm": 0.3177365365568838, + "learning_rate": 8.372329704233246e-05, + "loss": 1.0161, + "step": 12008 + }, + { + "epoch": 1.15, + "grad_norm": 0.30172885463631877, + "learning_rate": 8.370768871573644e-05, + "loss": 1.0367, + "step": 12009 + }, + { + "epoch": 1.15, + "grad_norm": 0.3484601207771935, + "learning_rate": 8.369208079686578e-05, + "loss": 1.0598, + "step": 12010 + }, + { + "epoch": 1.15, + "grad_norm": 0.2948978050146606, + "learning_rate": 8.367647328611106e-05, + "loss": 1.0882, + "step": 12011 + }, + { + "epoch": 1.15, + "grad_norm": 0.34506141402727936, + "learning_rate": 8.366086618386284e-05, + "loss": 0.9403, + "step": 12012 + }, + { + "epoch": 1.15, + "grad_norm": 0.32581616749299086, + "learning_rate": 8.364525949051172e-05, + "loss": 1.0273, + "step": 12013 + }, + { + "epoch": 1.15, + "grad_norm": 0.3105475266963726, + "learning_rate": 8.362965320644826e-05, + "loss": 0.9783, + "step": 12014 + }, + { + "epoch": 1.15, + "grad_norm": 0.34592488238578806, + "learning_rate": 8.361404733206298e-05, + "loss": 0.9762, + "step": 12015 + }, + { + "epoch": 1.15, + "grad_norm": 0.32517169992626255, + "learning_rate": 8.35984418677465e-05, + "loss": 1.0157, + "step": 12016 + }, + { + "epoch": 1.15, + "grad_norm": 0.3372130317916621, + "learning_rate": 8.358283681388933e-05, + "loss": 0.9621, + "step": 12017 + }, + { + "epoch": 1.15, + "grad_norm": 0.3339495062174599, + "learning_rate": 8.356723217088199e-05, + "loss": 1.1185, + "step": 12018 + }, + { + "epoch": 1.15, + "grad_norm": 0.31004829429034486, + "learning_rate": 8.355162793911496e-05, + "loss": 0.969, + "step": 12019 + }, + { + "epoch": 1.15, + "grad_norm": 0.29236346095477483, + "learning_rate": 8.353602411897878e-05, + "loss": 1.0343, + "step": 12020 + }, + { + "epoch": 1.15, + "grad_norm": 0.31904623830288975, + "learning_rate": 8.352042071086397e-05, + "loss": 1.0984, + "step": 12021 + }, + { + "epoch": 1.15, + "grad_norm": 0.3285227425192462, + "learning_rate": 8.350481771516097e-05, + "loss": 1.0936, + "step": 12022 + }, + { + "epoch": 1.15, + "grad_norm": 0.27971498610491, + "learning_rate": 8.348921513226026e-05, + "loss": 1.0002, + "step": 12023 + }, + { + "epoch": 1.15, + "grad_norm": 0.33919822280327566, + "learning_rate": 8.347361296255234e-05, + "loss": 1.1207, + "step": 12024 + }, + { + "epoch": 1.15, + "grad_norm": 0.282603588595529, + "learning_rate": 8.345801120642764e-05, + "loss": 1.022, + "step": 12025 + }, + { + "epoch": 1.15, + "grad_norm": 0.32836611012737915, + "learning_rate": 8.344240986427661e-05, + "loss": 1.0063, + "step": 12026 + }, + { + "epoch": 1.15, + "grad_norm": 0.2948805515982776, + "learning_rate": 8.342680893648969e-05, + "loss": 1.0339, + "step": 12027 + }, + { + "epoch": 1.15, + "grad_norm": 0.32087135530218674, + "learning_rate": 8.341120842345729e-05, + "loss": 1.0107, + "step": 12028 + }, + { + "epoch": 1.15, + "grad_norm": 0.29601319119798924, + "learning_rate": 8.339560832556976e-05, + "loss": 1.0491, + "step": 12029 + }, + { + "epoch": 1.15, + "grad_norm": 0.267593232544949, + "learning_rate": 8.338000864321763e-05, + "loss": 1.0343, + "step": 12030 + }, + { + "epoch": 1.15, + "grad_norm": 0.2852657453547207, + "learning_rate": 8.336440937679122e-05, + "loss": 0.9441, + "step": 12031 + }, + { + "epoch": 1.15, + "grad_norm": 0.2982185252837074, + "learning_rate": 8.334881052668093e-05, + "loss": 1.0164, + "step": 12032 + }, + { + "epoch": 1.15, + "grad_norm": 0.298219200453631, + "learning_rate": 8.333321209327711e-05, + "loss": 0.94, + "step": 12033 + }, + { + "epoch": 1.15, + "grad_norm": 0.32139061692427306, + "learning_rate": 8.331761407697014e-05, + "loss": 0.9457, + "step": 12034 + }, + { + "epoch": 1.15, + "grad_norm": 0.28511551645269306, + "learning_rate": 8.330201647815037e-05, + "loss": 0.919, + "step": 12035 + }, + { + "epoch": 1.15, + "grad_norm": 0.3059906893727021, + "learning_rate": 8.328641929720807e-05, + "loss": 1.0269, + "step": 12036 + }, + { + "epoch": 1.15, + "grad_norm": 0.2919129971684272, + "learning_rate": 8.327082253453368e-05, + "loss": 1.1741, + "step": 12037 + }, + { + "epoch": 1.15, + "grad_norm": 0.3020886627390625, + "learning_rate": 8.325522619051748e-05, + "loss": 1.1174, + "step": 12038 + }, + { + "epoch": 1.15, + "grad_norm": 0.3158416378023349, + "learning_rate": 8.323963026554975e-05, + "loss": 0.9932, + "step": 12039 + }, + { + "epoch": 1.15, + "grad_norm": 0.29398865371057314, + "learning_rate": 8.32240347600208e-05, + "loss": 0.947, + "step": 12040 + }, + { + "epoch": 1.15, + "grad_norm": 0.2780978994466656, + "learning_rate": 8.320843967432095e-05, + "loss": 1.0723, + "step": 12041 + }, + { + "epoch": 1.15, + "grad_norm": 0.2980497260028954, + "learning_rate": 8.319284500884043e-05, + "loss": 1.0665, + "step": 12042 + }, + { + "epoch": 1.15, + "grad_norm": 0.31580472798767045, + "learning_rate": 8.31772507639695e-05, + "loss": 1.1173, + "step": 12043 + }, + { + "epoch": 1.15, + "grad_norm": 0.3024942305027789, + "learning_rate": 8.316165694009848e-05, + "loss": 1.0386, + "step": 12044 + }, + { + "epoch": 1.15, + "grad_norm": 0.3710105956125254, + "learning_rate": 8.314606353761758e-05, + "loss": 0.9958, + "step": 12045 + }, + { + "epoch": 1.15, + "grad_norm": 0.30557304803794905, + "learning_rate": 8.313047055691703e-05, + "loss": 0.9625, + "step": 12046 + }, + { + "epoch": 1.15, + "grad_norm": 0.31519929013566106, + "learning_rate": 8.311487799838707e-05, + "loss": 1.0622, + "step": 12047 + }, + { + "epoch": 1.15, + "grad_norm": 0.303346491676697, + "learning_rate": 8.309928586241787e-05, + "loss": 1.0133, + "step": 12048 + }, + { + "epoch": 1.15, + "grad_norm": 0.3420522052431417, + "learning_rate": 8.308369414939965e-05, + "loss": 1.0211, + "step": 12049 + }, + { + "epoch": 1.15, + "grad_norm": 0.3176336801417099, + "learning_rate": 8.306810285972264e-05, + "loss": 1.0359, + "step": 12050 + }, + { + "epoch": 1.15, + "grad_norm": 0.34122489286671037, + "learning_rate": 8.3052511993777e-05, + "loss": 0.913, + "step": 12051 + }, + { + "epoch": 1.15, + "grad_norm": 0.3544283746586811, + "learning_rate": 8.303692155195291e-05, + "loss": 0.9547, + "step": 12052 + }, + { + "epoch": 1.15, + "grad_norm": 0.31418970116756534, + "learning_rate": 8.30213315346405e-05, + "loss": 1.0835, + "step": 12053 + }, + { + "epoch": 1.15, + "grad_norm": 0.3158262928973319, + "learning_rate": 8.300574194222995e-05, + "loss": 1.1034, + "step": 12054 + }, + { + "epoch": 1.15, + "grad_norm": 0.28436613732603644, + "learning_rate": 8.299015277511139e-05, + "loss": 1.0055, + "step": 12055 + }, + { + "epoch": 1.15, + "grad_norm": 0.2908092819058805, + "learning_rate": 8.297456403367489e-05, + "loss": 1.0936, + "step": 12056 + }, + { + "epoch": 1.15, + "grad_norm": 0.3191322524328025, + "learning_rate": 8.295897571831069e-05, + "loss": 0.9567, + "step": 12057 + }, + { + "epoch": 1.15, + "grad_norm": 0.29563363912781343, + "learning_rate": 8.294338782940881e-05, + "loss": 0.9836, + "step": 12058 + }, + { + "epoch": 1.15, + "grad_norm": 0.30477096623220157, + "learning_rate": 8.292780036735938e-05, + "loss": 0.9803, + "step": 12059 + }, + { + "epoch": 1.15, + "grad_norm": 0.29911835846375767, + "learning_rate": 8.291221333255245e-05, + "loss": 0.9989, + "step": 12060 + }, + { + "epoch": 1.15, + "grad_norm": 0.3128437837181349, + "learning_rate": 8.289662672537815e-05, + "loss": 1.0472, + "step": 12061 + }, + { + "epoch": 1.15, + "grad_norm": 0.3160022763433083, + "learning_rate": 8.288104054622651e-05, + "loss": 1.039, + "step": 12062 + }, + { + "epoch": 1.15, + "grad_norm": 0.31327274978086433, + "learning_rate": 8.286545479548758e-05, + "loss": 1.0977, + "step": 12063 + }, + { + "epoch": 1.15, + "grad_norm": 0.29422087493109444, + "learning_rate": 8.28498694735514e-05, + "loss": 0.9366, + "step": 12064 + }, + { + "epoch": 1.15, + "grad_norm": 0.327826036196354, + "learning_rate": 8.283428458080804e-05, + "loss": 1.0126, + "step": 12065 + }, + { + "epoch": 1.15, + "grad_norm": 0.2733588046984156, + "learning_rate": 8.28187001176475e-05, + "loss": 0.9866, + "step": 12066 + }, + { + "epoch": 1.15, + "grad_norm": 0.3090842825194732, + "learning_rate": 8.280311608445978e-05, + "loss": 0.9373, + "step": 12067 + }, + { + "epoch": 1.15, + "grad_norm": 0.28599811401908276, + "learning_rate": 8.27875324816349e-05, + "loss": 0.9805, + "step": 12068 + }, + { + "epoch": 1.15, + "grad_norm": 0.3088929579121635, + "learning_rate": 8.277194930956283e-05, + "loss": 0.9706, + "step": 12069 + }, + { + "epoch": 1.15, + "grad_norm": 0.35001046856756646, + "learning_rate": 8.27563665686335e-05, + "loss": 1.0988, + "step": 12070 + }, + { + "epoch": 1.15, + "grad_norm": 0.3213185691404636, + "learning_rate": 8.274078425923699e-05, + "loss": 1.0616, + "step": 12071 + }, + { + "epoch": 1.15, + "grad_norm": 0.30455005972366983, + "learning_rate": 8.27252023817632e-05, + "loss": 0.9658, + "step": 12072 + }, + { + "epoch": 1.16, + "grad_norm": 0.3478933009068606, + "learning_rate": 8.270962093660207e-05, + "loss": 1.1285, + "step": 12073 + }, + { + "epoch": 1.16, + "grad_norm": 0.35018600758377005, + "learning_rate": 8.269403992414355e-05, + "loss": 1.0877, + "step": 12074 + }, + { + "epoch": 1.16, + "grad_norm": 0.3168646215368207, + "learning_rate": 8.267845934477756e-05, + "loss": 1.054, + "step": 12075 + }, + { + "epoch": 1.16, + "grad_norm": 0.30048640693662276, + "learning_rate": 8.266287919889396e-05, + "loss": 0.958, + "step": 12076 + }, + { + "epoch": 1.16, + "grad_norm": 0.34954798920367947, + "learning_rate": 8.264729948688276e-05, + "loss": 1.0861, + "step": 12077 + }, + { + "epoch": 1.16, + "grad_norm": 0.3241089972559607, + "learning_rate": 8.263172020913377e-05, + "loss": 1.1327, + "step": 12078 + }, + { + "epoch": 1.16, + "grad_norm": 0.2902801875929258, + "learning_rate": 8.261614136603693e-05, + "loss": 1.024, + "step": 12079 + }, + { + "epoch": 1.16, + "grad_norm": 0.3289214619033638, + "learning_rate": 8.260056295798206e-05, + "loss": 1.0359, + "step": 12080 + }, + { + "epoch": 1.16, + "grad_norm": 0.31802191395514956, + "learning_rate": 8.258498498535902e-05, + "loss": 1.1287, + "step": 12081 + }, + { + "epoch": 1.16, + "grad_norm": 0.2900305972248621, + "learning_rate": 8.256940744855768e-05, + "loss": 1.0295, + "step": 12082 + }, + { + "epoch": 1.16, + "grad_norm": 0.30886920345503244, + "learning_rate": 8.255383034796788e-05, + "loss": 1.0577, + "step": 12083 + }, + { + "epoch": 1.16, + "grad_norm": 0.337716402906366, + "learning_rate": 8.253825368397942e-05, + "loss": 1.1208, + "step": 12084 + }, + { + "epoch": 1.16, + "grad_norm": 0.3145418031740912, + "learning_rate": 8.252267745698216e-05, + "loss": 1.0372, + "step": 12085 + }, + { + "epoch": 1.16, + "grad_norm": 0.3536039244353038, + "learning_rate": 8.250710166736589e-05, + "loss": 1.0469, + "step": 12086 + }, + { + "epoch": 1.16, + "grad_norm": 0.2809228275556616, + "learning_rate": 8.24915263155204e-05, + "loss": 1.0608, + "step": 12087 + }, + { + "epoch": 1.16, + "grad_norm": 0.3447816894263733, + "learning_rate": 8.247595140183545e-05, + "loss": 1.1263, + "step": 12088 + }, + { + "epoch": 1.16, + "grad_norm": 0.28255879933177985, + "learning_rate": 8.246037692670084e-05, + "loss": 1.1108, + "step": 12089 + }, + { + "epoch": 1.16, + "grad_norm": 0.27884593553245607, + "learning_rate": 8.244480289050626e-05, + "loss": 1.0261, + "step": 12090 + }, + { + "epoch": 1.16, + "grad_norm": 0.28410950200991947, + "learning_rate": 8.242922929364159e-05, + "loss": 1.1062, + "step": 12091 + }, + { + "epoch": 1.16, + "grad_norm": 0.2849798826455596, + "learning_rate": 8.241365613649649e-05, + "loss": 0.9854, + "step": 12092 + }, + { + "epoch": 1.16, + "grad_norm": 0.2909420223438542, + "learning_rate": 8.239808341946069e-05, + "loss": 0.9739, + "step": 12093 + }, + { + "epoch": 1.16, + "grad_norm": 0.2592709107796627, + "learning_rate": 8.238251114292392e-05, + "loss": 1.0329, + "step": 12094 + }, + { + "epoch": 1.16, + "grad_norm": 0.38000245514905184, + "learning_rate": 8.236693930727588e-05, + "loss": 1.1019, + "step": 12095 + }, + { + "epoch": 1.16, + "grad_norm": 0.3009015840659236, + "learning_rate": 8.235136791290627e-05, + "loss": 0.9988, + "step": 12096 + }, + { + "epoch": 1.16, + "grad_norm": 0.3209233076105538, + "learning_rate": 8.233579696020473e-05, + "loss": 0.9695, + "step": 12097 + }, + { + "epoch": 1.16, + "grad_norm": 0.27757216847019334, + "learning_rate": 8.2320226449561e-05, + "loss": 1.0313, + "step": 12098 + }, + { + "epoch": 1.16, + "grad_norm": 0.26767453027729204, + "learning_rate": 8.230465638136474e-05, + "loss": 0.8862, + "step": 12099 + }, + { + "epoch": 1.16, + "grad_norm": 0.33464735866812406, + "learning_rate": 8.228908675600555e-05, + "loss": 1.0785, + "step": 12100 + }, + { + "epoch": 1.16, + "grad_norm": 0.3247410732124524, + "learning_rate": 8.227351757387307e-05, + "loss": 1.0422, + "step": 12101 + }, + { + "epoch": 1.16, + "grad_norm": 0.3050245268594372, + "learning_rate": 8.2257948835357e-05, + "loss": 1.0602, + "step": 12102 + }, + { + "epoch": 1.16, + "grad_norm": 0.29296553640306267, + "learning_rate": 8.22423805408469e-05, + "loss": 1.0511, + "step": 12103 + }, + { + "epoch": 1.16, + "grad_norm": 0.3006967627369799, + "learning_rate": 8.222681269073236e-05, + "loss": 1.1552, + "step": 12104 + }, + { + "epoch": 1.16, + "grad_norm": 0.32009090305414417, + "learning_rate": 8.221124528540301e-05, + "loss": 1.0742, + "step": 12105 + }, + { + "epoch": 1.16, + "grad_norm": 0.28081851298702487, + "learning_rate": 8.219567832524844e-05, + "loss": 1.1017, + "step": 12106 + }, + { + "epoch": 1.16, + "grad_norm": 0.31487631491941787, + "learning_rate": 8.218011181065821e-05, + "loss": 0.9797, + "step": 12107 + }, + { + "epoch": 1.16, + "grad_norm": 0.31410302517700434, + "learning_rate": 8.216454574202188e-05, + "loss": 1.0515, + "step": 12108 + }, + { + "epoch": 1.16, + "grad_norm": 0.33686957495840186, + "learning_rate": 8.214898011972902e-05, + "loss": 1.0318, + "step": 12109 + }, + { + "epoch": 1.16, + "grad_norm": 0.2933285933596056, + "learning_rate": 8.213341494416908e-05, + "loss": 0.918, + "step": 12110 + }, + { + "epoch": 1.16, + "grad_norm": 0.3220577953179028, + "learning_rate": 8.211785021573173e-05, + "loss": 1.0787, + "step": 12111 + }, + { + "epoch": 1.16, + "grad_norm": 0.2918802595342865, + "learning_rate": 8.21022859348064e-05, + "loss": 1.1291, + "step": 12112 + }, + { + "epoch": 1.16, + "grad_norm": 0.2790072842304904, + "learning_rate": 8.208672210178262e-05, + "loss": 0.9405, + "step": 12113 + }, + { + "epoch": 1.16, + "grad_norm": 0.27644915281719873, + "learning_rate": 8.207115871704987e-05, + "loss": 1.0249, + "step": 12114 + }, + { + "epoch": 1.16, + "grad_norm": 0.28353166098898425, + "learning_rate": 8.205559578099766e-05, + "loss": 1.019, + "step": 12115 + }, + { + "epoch": 1.16, + "grad_norm": 0.3456474640012215, + "learning_rate": 8.204003329401542e-05, + "loss": 1.0418, + "step": 12116 + }, + { + "epoch": 1.16, + "grad_norm": 0.29412059475332225, + "learning_rate": 8.20244712564926e-05, + "loss": 0.9115, + "step": 12117 + }, + { + "epoch": 1.16, + "grad_norm": 0.2757008487136062, + "learning_rate": 8.200890966881874e-05, + "loss": 1.0413, + "step": 12118 + }, + { + "epoch": 1.16, + "grad_norm": 0.3117862555841153, + "learning_rate": 8.19933485313832e-05, + "loss": 1.0128, + "step": 12119 + }, + { + "epoch": 1.16, + "grad_norm": 0.29220792052021777, + "learning_rate": 8.197778784457543e-05, + "loss": 0.9852, + "step": 12120 + }, + { + "epoch": 1.16, + "grad_norm": 0.30127152379232763, + "learning_rate": 8.196222760878487e-05, + "loss": 0.957, + "step": 12121 + }, + { + "epoch": 1.16, + "grad_norm": 0.32534021242015054, + "learning_rate": 8.194666782440086e-05, + "loss": 1.1049, + "step": 12122 + }, + { + "epoch": 1.16, + "grad_norm": 0.2815359899044726, + "learning_rate": 8.193110849181286e-05, + "loss": 1.0134, + "step": 12123 + }, + { + "epoch": 1.16, + "grad_norm": 0.3233727025778913, + "learning_rate": 8.191554961141021e-05, + "loss": 1.038, + "step": 12124 + }, + { + "epoch": 1.16, + "grad_norm": 0.3326397037989504, + "learning_rate": 8.18999911835823e-05, + "loss": 1.0196, + "step": 12125 + }, + { + "epoch": 1.16, + "grad_norm": 0.2720929678081213, + "learning_rate": 8.188443320871848e-05, + "loss": 0.9181, + "step": 12126 + }, + { + "epoch": 1.16, + "grad_norm": 0.29179549689535805, + "learning_rate": 8.186887568720813e-05, + "loss": 1.0079, + "step": 12127 + }, + { + "epoch": 1.16, + "grad_norm": 0.2909338851580608, + "learning_rate": 8.185331861944054e-05, + "loss": 1.12, + "step": 12128 + }, + { + "epoch": 1.16, + "grad_norm": 0.2785646388506291, + "learning_rate": 8.183776200580509e-05, + "loss": 0.9823, + "step": 12129 + }, + { + "epoch": 1.16, + "grad_norm": 0.34928312206926204, + "learning_rate": 8.182220584669103e-05, + "loss": 0.9977, + "step": 12130 + }, + { + "epoch": 1.16, + "grad_norm": 0.33078050429483374, + "learning_rate": 8.180665014248767e-05, + "loss": 0.997, + "step": 12131 + }, + { + "epoch": 1.16, + "grad_norm": 0.3349152395389837, + "learning_rate": 8.179109489358434e-05, + "loss": 0.9363, + "step": 12132 + }, + { + "epoch": 1.16, + "grad_norm": 0.34761961229478167, + "learning_rate": 8.177554010037033e-05, + "loss": 0.8869, + "step": 12133 + }, + { + "epoch": 1.16, + "grad_norm": 0.3165384071530606, + "learning_rate": 8.175998576323488e-05, + "loss": 1.0299, + "step": 12134 + }, + { + "epoch": 1.16, + "grad_norm": 0.29808005535502036, + "learning_rate": 8.174443188256723e-05, + "loss": 1.0513, + "step": 12135 + }, + { + "epoch": 1.16, + "grad_norm": 0.29460052873528203, + "learning_rate": 8.172887845875667e-05, + "loss": 1.0127, + "step": 12136 + }, + { + "epoch": 1.16, + "grad_norm": 0.312115831433222, + "learning_rate": 8.171332549219234e-05, + "loss": 0.9628, + "step": 12137 + }, + { + "epoch": 1.16, + "grad_norm": 0.35196569853886855, + "learning_rate": 8.169777298326359e-05, + "loss": 1.0581, + "step": 12138 + }, + { + "epoch": 1.16, + "grad_norm": 0.29921808299542807, + "learning_rate": 8.168222093235959e-05, + "loss": 1.0661, + "step": 12139 + }, + { + "epoch": 1.16, + "grad_norm": 0.3181872274762372, + "learning_rate": 8.16666693398695e-05, + "loss": 0.9506, + "step": 12140 + }, + { + "epoch": 1.16, + "grad_norm": 0.25385167654686425, + "learning_rate": 8.165111820618255e-05, + "loss": 0.8761, + "step": 12141 + }, + { + "epoch": 1.16, + "grad_norm": 0.3389372804623743, + "learning_rate": 8.163556753168786e-05, + "loss": 1.0844, + "step": 12142 + }, + { + "epoch": 1.16, + "grad_norm": 0.31112135468231833, + "learning_rate": 8.162001731677466e-05, + "loss": 1.0819, + "step": 12143 + }, + { + "epoch": 1.16, + "grad_norm": 0.29849913554471885, + "learning_rate": 8.160446756183207e-05, + "loss": 1.0582, + "step": 12144 + }, + { + "epoch": 1.16, + "grad_norm": 0.28774470868716173, + "learning_rate": 8.158891826724925e-05, + "loss": 0.9933, + "step": 12145 + }, + { + "epoch": 1.16, + "grad_norm": 0.3132805478745228, + "learning_rate": 8.15733694334153e-05, + "loss": 1.0488, + "step": 12146 + }, + { + "epoch": 1.16, + "grad_norm": 0.2928103765292576, + "learning_rate": 8.155782106071939e-05, + "loss": 0.983, + "step": 12147 + }, + { + "epoch": 1.16, + "grad_norm": 0.3466711104164778, + "learning_rate": 8.154227314955059e-05, + "loss": 0.9853, + "step": 12148 + }, + { + "epoch": 1.16, + "grad_norm": 0.2889379219999669, + "learning_rate": 8.1526725700298e-05, + "loss": 0.9961, + "step": 12149 + }, + { + "epoch": 1.16, + "grad_norm": 0.33118722291845515, + "learning_rate": 8.151117871335072e-05, + "loss": 1.0467, + "step": 12150 + }, + { + "epoch": 1.16, + "grad_norm": 0.30094939848827285, + "learning_rate": 8.149563218909776e-05, + "loss": 1.0352, + "step": 12151 + }, + { + "epoch": 1.16, + "grad_norm": 0.3102228831806439, + "learning_rate": 8.148008612792827e-05, + "loss": 1.0852, + "step": 12152 + }, + { + "epoch": 1.16, + "grad_norm": 0.3317703942715706, + "learning_rate": 8.14645405302313e-05, + "loss": 0.8635, + "step": 12153 + }, + { + "epoch": 1.16, + "grad_norm": 0.33503329462363657, + "learning_rate": 8.144899539639583e-05, + "loss": 1.1268, + "step": 12154 + }, + { + "epoch": 1.16, + "grad_norm": 0.29957575865543434, + "learning_rate": 8.14334507268109e-05, + "loss": 0.9745, + "step": 12155 + }, + { + "epoch": 1.16, + "grad_norm": 0.2903525873147151, + "learning_rate": 8.141790652186555e-05, + "loss": 1.0131, + "step": 12156 + }, + { + "epoch": 1.16, + "grad_norm": 0.30757466491873214, + "learning_rate": 8.140236278194875e-05, + "loss": 1.0639, + "step": 12157 + }, + { + "epoch": 1.16, + "grad_norm": 0.2884899248944729, + "learning_rate": 8.138681950744947e-05, + "loss": 1.0293, + "step": 12158 + }, + { + "epoch": 1.16, + "grad_norm": 0.31425809984877723, + "learning_rate": 8.137127669875678e-05, + "loss": 1.0429, + "step": 12159 + }, + { + "epoch": 1.16, + "grad_norm": 0.29168373412009607, + "learning_rate": 8.135573435625961e-05, + "loss": 0.9563, + "step": 12160 + }, + { + "epoch": 1.16, + "grad_norm": 0.28596064751240896, + "learning_rate": 8.134019248034688e-05, + "loss": 1.0231, + "step": 12161 + }, + { + "epoch": 1.16, + "grad_norm": 0.32615940497173795, + "learning_rate": 8.132465107140757e-05, + "loss": 1.0353, + "step": 12162 + }, + { + "epoch": 1.16, + "grad_norm": 0.29456517704285945, + "learning_rate": 8.130911012983059e-05, + "loss": 1.0979, + "step": 12163 + }, + { + "epoch": 1.16, + "grad_norm": 0.2830638212959876, + "learning_rate": 8.12935696560049e-05, + "loss": 1.0866, + "step": 12164 + }, + { + "epoch": 1.16, + "grad_norm": 0.27736355579912353, + "learning_rate": 8.127802965031935e-05, + "loss": 0.9667, + "step": 12165 + }, + { + "epoch": 1.16, + "grad_norm": 0.3630307275180466, + "learning_rate": 8.12624901131629e-05, + "loss": 0.8778, + "step": 12166 + }, + { + "epoch": 1.16, + "grad_norm": 0.3124018475688689, + "learning_rate": 8.124695104492439e-05, + "loss": 1.1194, + "step": 12167 + }, + { + "epoch": 1.16, + "grad_norm": 0.3300287575025695, + "learning_rate": 8.123141244599275e-05, + "loss": 1.0008, + "step": 12168 + }, + { + "epoch": 1.16, + "grad_norm": 0.2886060091856945, + "learning_rate": 8.121587431675678e-05, + "loss": 1.1073, + "step": 12169 + }, + { + "epoch": 1.16, + "grad_norm": 0.3208301703447157, + "learning_rate": 8.12003366576054e-05, + "loss": 0.9803, + "step": 12170 + }, + { + "epoch": 1.16, + "grad_norm": 0.27847159643143865, + "learning_rate": 8.118479946892734e-05, + "loss": 1.0158, + "step": 12171 + }, + { + "epoch": 1.16, + "grad_norm": 0.282344884614418, + "learning_rate": 8.116926275111155e-05, + "loss": 0.9918, + "step": 12172 + }, + { + "epoch": 1.16, + "grad_norm": 0.30160497847576295, + "learning_rate": 8.115372650454679e-05, + "loss": 0.9656, + "step": 12173 + }, + { + "epoch": 1.16, + "grad_norm": 0.3065544349143686, + "learning_rate": 8.113819072962188e-05, + "loss": 1.0003, + "step": 12174 + }, + { + "epoch": 1.16, + "grad_norm": 0.32011445260314614, + "learning_rate": 8.112265542672559e-05, + "loss": 1.0632, + "step": 12175 + }, + { + "epoch": 1.16, + "grad_norm": 0.2784938352297704, + "learning_rate": 8.110712059624671e-05, + "loss": 1.033, + "step": 12176 + }, + { + "epoch": 1.16, + "grad_norm": 0.31620044595872643, + "learning_rate": 8.109158623857402e-05, + "loss": 0.9842, + "step": 12177 + }, + { + "epoch": 1.17, + "grad_norm": 0.28236431277412044, + "learning_rate": 8.10760523540962e-05, + "loss": 0.9406, + "step": 12178 + }, + { + "epoch": 1.17, + "grad_norm": 0.3542710025740123, + "learning_rate": 8.106051894320214e-05, + "loss": 0.978, + "step": 12179 + }, + { + "epoch": 1.17, + "grad_norm": 0.32123694463332436, + "learning_rate": 8.104498600628048e-05, + "loss": 1.031, + "step": 12180 + }, + { + "epoch": 1.17, + "grad_norm": 0.3228459049622118, + "learning_rate": 8.102945354371995e-05, + "loss": 0.985, + "step": 12181 + }, + { + "epoch": 1.17, + "grad_norm": 0.32675311084490927, + "learning_rate": 8.101392155590929e-05, + "loss": 0.9775, + "step": 12182 + }, + { + "epoch": 1.17, + "grad_norm": 0.27487572702803686, + "learning_rate": 8.099839004323716e-05, + "loss": 0.8435, + "step": 12183 + }, + { + "epoch": 1.17, + "grad_norm": 0.33534918584190876, + "learning_rate": 8.098285900609226e-05, + "loss": 1.0039, + "step": 12184 + }, + { + "epoch": 1.17, + "grad_norm": 0.3040615239606208, + "learning_rate": 8.096732844486324e-05, + "loss": 0.9543, + "step": 12185 + }, + { + "epoch": 1.17, + "grad_norm": 0.28419044728701137, + "learning_rate": 8.095179835993882e-05, + "loss": 1.0187, + "step": 12186 + }, + { + "epoch": 1.17, + "grad_norm": 0.27245986307624936, + "learning_rate": 8.093626875170759e-05, + "loss": 0.8554, + "step": 12187 + }, + { + "epoch": 1.17, + "grad_norm": 0.31563390972613314, + "learning_rate": 8.092073962055824e-05, + "loss": 1.1577, + "step": 12188 + }, + { + "epoch": 1.17, + "grad_norm": 0.3138588141677654, + "learning_rate": 8.090521096687937e-05, + "loss": 0.9965, + "step": 12189 + }, + { + "epoch": 1.17, + "grad_norm": 0.3041083697725484, + "learning_rate": 8.088968279105959e-05, + "loss": 0.9237, + "step": 12190 + }, + { + "epoch": 1.17, + "grad_norm": 0.331542403202878, + "learning_rate": 8.087415509348753e-05, + "loss": 1.0584, + "step": 12191 + }, + { + "epoch": 1.17, + "grad_norm": 0.34150800784880825, + "learning_rate": 8.08586278745517e-05, + "loss": 0.9645, + "step": 12192 + }, + { + "epoch": 1.17, + "grad_norm": 0.32493807595032537, + "learning_rate": 8.084310113464078e-05, + "loss": 1.0426, + "step": 12193 + }, + { + "epoch": 1.17, + "grad_norm": 0.3289029684956461, + "learning_rate": 8.08275748741433e-05, + "loss": 1.1279, + "step": 12194 + }, + { + "epoch": 1.17, + "grad_norm": 0.26778998485203714, + "learning_rate": 8.08120490934478e-05, + "loss": 1.0539, + "step": 12195 + }, + { + "epoch": 1.17, + "grad_norm": 0.29948695856970675, + "learning_rate": 8.079652379294283e-05, + "loss": 1.0223, + "step": 12196 + }, + { + "epoch": 1.17, + "grad_norm": 0.29135742243385926, + "learning_rate": 8.078099897301693e-05, + "loss": 1.0539, + "step": 12197 + }, + { + "epoch": 1.17, + "grad_norm": 0.3300143216546912, + "learning_rate": 8.076547463405857e-05, + "loss": 1.1557, + "step": 12198 + }, + { + "epoch": 1.17, + "grad_norm": 0.32237829439236537, + "learning_rate": 8.074995077645633e-05, + "loss": 1.1086, + "step": 12199 + }, + { + "epoch": 1.17, + "grad_norm": 0.29209468066598265, + "learning_rate": 8.073442740059867e-05, + "loss": 1.0664, + "step": 12200 + }, + { + "epoch": 1.17, + "grad_norm": 0.2939188425157372, + "learning_rate": 8.071890450687407e-05, + "loss": 1.1082, + "step": 12201 + }, + { + "epoch": 1.17, + "grad_norm": 0.33872320088891394, + "learning_rate": 8.070338209567101e-05, + "loss": 1.049, + "step": 12202 + }, + { + "epoch": 1.17, + "grad_norm": 0.3005657797531301, + "learning_rate": 8.068786016737794e-05, + "loss": 0.9898, + "step": 12203 + }, + { + "epoch": 1.17, + "grad_norm": 0.3194092445573554, + "learning_rate": 8.067233872238327e-05, + "loss": 1.0167, + "step": 12204 + }, + { + "epoch": 1.17, + "grad_norm": 0.32264465817189764, + "learning_rate": 8.065681776107548e-05, + "loss": 0.9112, + "step": 12205 + }, + { + "epoch": 1.17, + "grad_norm": 0.2975469302971063, + "learning_rate": 8.064129728384303e-05, + "loss": 1.0385, + "step": 12206 + }, + { + "epoch": 1.17, + "grad_norm": 0.2916211844697667, + "learning_rate": 8.062577729107425e-05, + "loss": 1.0468, + "step": 12207 + }, + { + "epoch": 1.17, + "grad_norm": 0.2671249598719891, + "learning_rate": 8.061025778315755e-05, + "loss": 1.1052, + "step": 12208 + }, + { + "epoch": 1.17, + "grad_norm": 0.28232862364723843, + "learning_rate": 8.059473876048136e-05, + "loss": 1.0864, + "step": 12209 + }, + { + "epoch": 1.17, + "grad_norm": 0.29168757775597787, + "learning_rate": 8.057922022343403e-05, + "loss": 1.1971, + "step": 12210 + }, + { + "epoch": 1.17, + "grad_norm": 0.30748151952279507, + "learning_rate": 8.056370217240392e-05, + "loss": 1.0514, + "step": 12211 + }, + { + "epoch": 1.17, + "grad_norm": 0.31031370208596176, + "learning_rate": 8.054818460777933e-05, + "loss": 1.0625, + "step": 12212 + }, + { + "epoch": 1.17, + "grad_norm": 0.3270810001837514, + "learning_rate": 8.05326675299487e-05, + "loss": 1.0236, + "step": 12213 + }, + { + "epoch": 1.17, + "grad_norm": 0.3295068973073696, + "learning_rate": 8.05171509393003e-05, + "loss": 1.1372, + "step": 12214 + }, + { + "epoch": 1.17, + "grad_norm": 0.38857062256517527, + "learning_rate": 8.050163483622243e-05, + "loss": 1.0202, + "step": 12215 + }, + { + "epoch": 1.17, + "grad_norm": 0.32083461753370496, + "learning_rate": 8.048611922110341e-05, + "loss": 0.9206, + "step": 12216 + }, + { + "epoch": 1.17, + "grad_norm": 0.30909081827683543, + "learning_rate": 8.047060409433152e-05, + "loss": 0.9244, + "step": 12217 + }, + { + "epoch": 1.17, + "grad_norm": 0.2942769491727334, + "learning_rate": 8.045508945629503e-05, + "loss": 0.992, + "step": 12218 + }, + { + "epoch": 1.17, + "grad_norm": 0.31281728799393754, + "learning_rate": 8.043957530738217e-05, + "loss": 1.0383, + "step": 12219 + }, + { + "epoch": 1.17, + "grad_norm": 0.32800256239565917, + "learning_rate": 8.042406164798129e-05, + "loss": 1.0378, + "step": 12220 + }, + { + "epoch": 1.17, + "grad_norm": 0.320568016419705, + "learning_rate": 8.040854847848055e-05, + "loss": 1.0007, + "step": 12221 + }, + { + "epoch": 1.17, + "grad_norm": 0.30527029998455957, + "learning_rate": 8.03930357992682e-05, + "loss": 1.0627, + "step": 12222 + }, + { + "epoch": 1.17, + "grad_norm": 0.2947368178919167, + "learning_rate": 8.037752361073246e-05, + "loss": 0.9685, + "step": 12223 + }, + { + "epoch": 1.17, + "grad_norm": 0.3055903707183276, + "learning_rate": 8.036201191326152e-05, + "loss": 1.0945, + "step": 12224 + }, + { + "epoch": 1.17, + "grad_norm": 0.3564748089847177, + "learning_rate": 8.034650070724355e-05, + "loss": 0.9215, + "step": 12225 + }, + { + "epoch": 1.17, + "grad_norm": 0.30995777281817544, + "learning_rate": 8.033098999306676e-05, + "loss": 1.0594, + "step": 12226 + }, + { + "epoch": 1.17, + "grad_norm": 0.3544559456232826, + "learning_rate": 8.031547977111932e-05, + "loss": 1.0455, + "step": 12227 + }, + { + "epoch": 1.17, + "grad_norm": 0.2795769569334886, + "learning_rate": 8.029997004178934e-05, + "loss": 1.0252, + "step": 12228 + }, + { + "epoch": 1.17, + "grad_norm": 0.2671566345779031, + "learning_rate": 8.028446080546503e-05, + "loss": 0.9947, + "step": 12229 + }, + { + "epoch": 1.17, + "grad_norm": 0.3480874908611991, + "learning_rate": 8.026895206253447e-05, + "loss": 1.0177, + "step": 12230 + }, + { + "epoch": 1.17, + "grad_norm": 0.32924599466598686, + "learning_rate": 8.025344381338579e-05, + "loss": 0.9498, + "step": 12231 + }, + { + "epoch": 1.17, + "grad_norm": 0.30484505615906604, + "learning_rate": 8.023793605840704e-05, + "loss": 1.0566, + "step": 12232 + }, + { + "epoch": 1.17, + "grad_norm": 0.34086733221141163, + "learning_rate": 8.022242879798641e-05, + "loss": 0.9815, + "step": 12233 + }, + { + "epoch": 1.17, + "grad_norm": 0.2892147272067814, + "learning_rate": 8.020692203251192e-05, + "loss": 0.9736, + "step": 12234 + }, + { + "epoch": 1.17, + "grad_norm": 0.3303874234402778, + "learning_rate": 8.019141576237165e-05, + "loss": 0.9499, + "step": 12235 + }, + { + "epoch": 1.17, + "grad_norm": 0.2664912563017257, + "learning_rate": 8.017590998795364e-05, + "loss": 1.0254, + "step": 12236 + }, + { + "epoch": 1.17, + "grad_norm": 0.28744805441001325, + "learning_rate": 8.016040470964597e-05, + "loss": 1.0254, + "step": 12237 + }, + { + "epoch": 1.17, + "grad_norm": 0.31786487698905075, + "learning_rate": 8.014489992783661e-05, + "loss": 0.9391, + "step": 12238 + }, + { + "epoch": 1.17, + "grad_norm": 0.30137124938308957, + "learning_rate": 8.012939564291358e-05, + "loss": 0.9556, + "step": 12239 + }, + { + "epoch": 1.17, + "grad_norm": 0.2701465264987846, + "learning_rate": 8.011389185526494e-05, + "loss": 0.9749, + "step": 12240 + }, + { + "epoch": 1.17, + "grad_norm": 0.29343637837370634, + "learning_rate": 8.009838856527867e-05, + "loss": 0.9783, + "step": 12241 + }, + { + "epoch": 1.17, + "grad_norm": 0.31029115374480615, + "learning_rate": 8.008288577334273e-05, + "loss": 0.9929, + "step": 12242 + }, + { + "epoch": 1.17, + "grad_norm": 0.32203199453003367, + "learning_rate": 8.00673834798451e-05, + "loss": 1.1768, + "step": 12243 + }, + { + "epoch": 1.17, + "grad_norm": 0.29978406311879835, + "learning_rate": 8.005188168517372e-05, + "loss": 1.0172, + "step": 12244 + }, + { + "epoch": 1.17, + "grad_norm": 0.28241166012590846, + "learning_rate": 8.00363803897165e-05, + "loss": 1.0464, + "step": 12245 + }, + { + "epoch": 1.17, + "grad_norm": 0.3381203167904085, + "learning_rate": 8.002087959386143e-05, + "loss": 1.1159, + "step": 12246 + }, + { + "epoch": 1.17, + "grad_norm": 0.28179127121630815, + "learning_rate": 8.000537929799642e-05, + "loss": 1.0706, + "step": 12247 + }, + { + "epoch": 1.17, + "grad_norm": 0.29612955334258184, + "learning_rate": 7.998987950250935e-05, + "loss": 1.0605, + "step": 12248 + }, + { + "epoch": 1.17, + "grad_norm": 0.3161274890700582, + "learning_rate": 7.997438020778811e-05, + "loss": 0.9907, + "step": 12249 + }, + { + "epoch": 1.17, + "grad_norm": 0.29328843129573673, + "learning_rate": 7.99588814142206e-05, + "loss": 0.9246, + "step": 12250 + }, + { + "epoch": 1.17, + "grad_norm": 0.32889714567530826, + "learning_rate": 7.99433831221947e-05, + "loss": 0.9952, + "step": 12251 + }, + { + "epoch": 1.17, + "grad_norm": 0.2762563400796828, + "learning_rate": 7.992788533209822e-05, + "loss": 0.9975, + "step": 12252 + }, + { + "epoch": 1.17, + "grad_norm": 0.2722571008036665, + "learning_rate": 7.991238804431898e-05, + "loss": 0.9701, + "step": 12253 + }, + { + "epoch": 1.17, + "grad_norm": 0.3218513036668454, + "learning_rate": 7.989689125924491e-05, + "loss": 1.0599, + "step": 12254 + }, + { + "epoch": 1.17, + "grad_norm": 0.3346988591746516, + "learning_rate": 7.988139497726376e-05, + "loss": 1.1059, + "step": 12255 + }, + { + "epoch": 1.17, + "grad_norm": 0.3216864164964715, + "learning_rate": 7.986589919876338e-05, + "loss": 1.1213, + "step": 12256 + }, + { + "epoch": 1.17, + "grad_norm": 0.2907493802134934, + "learning_rate": 7.985040392413149e-05, + "loss": 1.134, + "step": 12257 + }, + { + "epoch": 1.17, + "grad_norm": 0.33776683497127985, + "learning_rate": 7.983490915375591e-05, + "loss": 1.0616, + "step": 12258 + }, + { + "epoch": 1.17, + "grad_norm": 0.311133425481472, + "learning_rate": 7.981941488802436e-05, + "loss": 0.9661, + "step": 12259 + }, + { + "epoch": 1.17, + "grad_norm": 0.2940204620946952, + "learning_rate": 7.980392112732469e-05, + "loss": 1.0113, + "step": 12260 + }, + { + "epoch": 1.17, + "grad_norm": 0.2933233104228555, + "learning_rate": 7.978842787204457e-05, + "loss": 1.0601, + "step": 12261 + }, + { + "epoch": 1.17, + "grad_norm": 0.29817122229119214, + "learning_rate": 7.977293512257175e-05, + "loss": 0.9531, + "step": 12262 + }, + { + "epoch": 1.17, + "grad_norm": 0.2812187704993087, + "learning_rate": 7.975744287929394e-05, + "loss": 1.0495, + "step": 12263 + }, + { + "epoch": 1.17, + "grad_norm": 0.33232923556961336, + "learning_rate": 7.974195114259885e-05, + "loss": 0.9766, + "step": 12264 + }, + { + "epoch": 1.17, + "grad_norm": 0.31429122070008303, + "learning_rate": 7.972645991287416e-05, + "loss": 0.9902, + "step": 12265 + }, + { + "epoch": 1.17, + "grad_norm": 0.2794581671793385, + "learning_rate": 7.97109691905075e-05, + "loss": 1.0652, + "step": 12266 + }, + { + "epoch": 1.17, + "grad_norm": 0.3539325172656202, + "learning_rate": 7.969547897588666e-05, + "loss": 0.9815, + "step": 12267 + }, + { + "epoch": 1.17, + "grad_norm": 0.33094793178243953, + "learning_rate": 7.96799892693992e-05, + "loss": 0.9638, + "step": 12268 + }, + { + "epoch": 1.17, + "grad_norm": 0.2966346371872266, + "learning_rate": 7.966450007143278e-05, + "loss": 1.0355, + "step": 12269 + }, + { + "epoch": 1.17, + "grad_norm": 0.3352095101304133, + "learning_rate": 7.9649011382375e-05, + "loss": 1.0228, + "step": 12270 + }, + { + "epoch": 1.17, + "grad_norm": 0.2884829222214186, + "learning_rate": 7.963352320261352e-05, + "loss": 0.9925, + "step": 12271 + }, + { + "epoch": 1.17, + "grad_norm": 0.3281196499786753, + "learning_rate": 7.961803553253593e-05, + "loss": 0.9458, + "step": 12272 + }, + { + "epoch": 1.17, + "grad_norm": 0.3045358157267079, + "learning_rate": 7.960254837252977e-05, + "loss": 0.9238, + "step": 12273 + }, + { + "epoch": 1.17, + "grad_norm": 0.3392685479685694, + "learning_rate": 7.958706172298269e-05, + "loss": 1.0082, + "step": 12274 + }, + { + "epoch": 1.17, + "grad_norm": 0.28290119649288276, + "learning_rate": 7.957157558428224e-05, + "loss": 0.9994, + "step": 12275 + }, + { + "epoch": 1.17, + "grad_norm": 0.3089214693107056, + "learning_rate": 7.955608995681593e-05, + "loss": 1.0229, + "step": 12276 + }, + { + "epoch": 1.17, + "grad_norm": 0.30605802206135707, + "learning_rate": 7.954060484097131e-05, + "loss": 1.0369, + "step": 12277 + }, + { + "epoch": 1.17, + "grad_norm": 0.3148483338988176, + "learning_rate": 7.952512023713593e-05, + "loss": 1.0261, + "step": 12278 + }, + { + "epoch": 1.17, + "grad_norm": 0.31422973632482387, + "learning_rate": 7.950963614569728e-05, + "loss": 0.98, + "step": 12279 + }, + { + "epoch": 1.17, + "grad_norm": 0.38995834206390306, + "learning_rate": 7.949415256704282e-05, + "loss": 1.1162, + "step": 12280 + }, + { + "epoch": 1.17, + "grad_norm": 0.3016962129464625, + "learning_rate": 7.947866950156011e-05, + "loss": 0.9948, + "step": 12281 + }, + { + "epoch": 1.18, + "grad_norm": 0.30030553533390786, + "learning_rate": 7.94631869496366e-05, + "loss": 0.9899, + "step": 12282 + }, + { + "epoch": 1.18, + "grad_norm": 0.28171657435386244, + "learning_rate": 7.944770491165975e-05, + "loss": 1.1642, + "step": 12283 + }, + { + "epoch": 1.18, + "grad_norm": 0.27160173891494466, + "learning_rate": 7.9432223388017e-05, + "loss": 0.9781, + "step": 12284 + }, + { + "epoch": 1.18, + "grad_norm": 0.29545241426261404, + "learning_rate": 7.941674237909578e-05, + "loss": 1.0287, + "step": 12285 + }, + { + "epoch": 1.18, + "grad_norm": 0.28192703415076736, + "learning_rate": 7.94012618852835e-05, + "loss": 0.9069, + "step": 12286 + }, + { + "epoch": 1.18, + "grad_norm": 0.27891707405870264, + "learning_rate": 7.93857819069676e-05, + "loss": 1.0039, + "step": 12287 + }, + { + "epoch": 1.18, + "grad_norm": 0.2793985224056609, + "learning_rate": 7.937030244453546e-05, + "loss": 1.0765, + "step": 12288 + }, + { + "epoch": 1.18, + "grad_norm": 0.328990725464406, + "learning_rate": 7.935482349837449e-05, + "loss": 0.9799, + "step": 12289 + }, + { + "epoch": 1.18, + "grad_norm": 0.3118452497705599, + "learning_rate": 7.933934506887199e-05, + "loss": 1.0068, + "step": 12290 + }, + { + "epoch": 1.18, + "grad_norm": 0.2985203906130354, + "learning_rate": 7.93238671564154e-05, + "loss": 1.008, + "step": 12291 + }, + { + "epoch": 1.18, + "grad_norm": 0.32655520553372946, + "learning_rate": 7.930838976139203e-05, + "loss": 0.9784, + "step": 12292 + }, + { + "epoch": 1.18, + "grad_norm": 0.3074843372179691, + "learning_rate": 7.929291288418918e-05, + "loss": 1.009, + "step": 12293 + }, + { + "epoch": 1.18, + "grad_norm": 0.32737587643200866, + "learning_rate": 7.927743652519421e-05, + "loss": 0.9706, + "step": 12294 + }, + { + "epoch": 1.18, + "grad_norm": 0.28206200909140744, + "learning_rate": 7.926196068479444e-05, + "loss": 1.1241, + "step": 12295 + }, + { + "epoch": 1.18, + "grad_norm": 0.3086157472460379, + "learning_rate": 7.924648536337713e-05, + "loss": 1.0234, + "step": 12296 + }, + { + "epoch": 1.18, + "grad_norm": 0.2817849472599397, + "learning_rate": 7.923101056132956e-05, + "loss": 1.0082, + "step": 12297 + }, + { + "epoch": 1.18, + "grad_norm": 0.3809537738197977, + "learning_rate": 7.921553627903902e-05, + "loss": 1.0516, + "step": 12298 + }, + { + "epoch": 1.18, + "grad_norm": 0.30459578334959286, + "learning_rate": 7.920006251689273e-05, + "loss": 1.0122, + "step": 12299 + }, + { + "epoch": 1.18, + "grad_norm": 0.30932683032992603, + "learning_rate": 7.918458927527793e-05, + "loss": 1.0504, + "step": 12300 + }, + { + "epoch": 1.18, + "grad_norm": 0.2826377262252809, + "learning_rate": 7.916911655458188e-05, + "loss": 0.9762, + "step": 12301 + }, + { + "epoch": 1.18, + "grad_norm": 0.3127341754581663, + "learning_rate": 7.91536443551918e-05, + "loss": 1.1407, + "step": 12302 + }, + { + "epoch": 1.18, + "grad_norm": 0.30140899246338837, + "learning_rate": 7.913817267749485e-05, + "loss": 1.0923, + "step": 12303 + }, + { + "epoch": 1.18, + "grad_norm": 0.30122369834511387, + "learning_rate": 7.912270152187825e-05, + "loss": 1.1318, + "step": 12304 + }, + { + "epoch": 1.18, + "grad_norm": 0.3234063288646688, + "learning_rate": 7.910723088872918e-05, + "loss": 1.0211, + "step": 12305 + }, + { + "epoch": 1.18, + "grad_norm": 0.34158613551946476, + "learning_rate": 7.909176077843476e-05, + "loss": 1.0094, + "step": 12306 + }, + { + "epoch": 1.18, + "grad_norm": 0.2985490528389644, + "learning_rate": 7.907629119138215e-05, + "loss": 1.0682, + "step": 12307 + }, + { + "epoch": 1.18, + "grad_norm": 0.3182097579890309, + "learning_rate": 7.906082212795853e-05, + "loss": 1.057, + "step": 12308 + }, + { + "epoch": 1.18, + "grad_norm": 0.31709403141254006, + "learning_rate": 7.9045353588551e-05, + "loss": 1.1068, + "step": 12309 + }, + { + "epoch": 1.18, + "grad_norm": 0.2932837906060155, + "learning_rate": 7.902988557354667e-05, + "loss": 0.9857, + "step": 12310 + }, + { + "epoch": 1.18, + "grad_norm": 0.2886269795013318, + "learning_rate": 7.901441808333262e-05, + "loss": 1.1166, + "step": 12311 + }, + { + "epoch": 1.18, + "grad_norm": 0.28363022477149885, + "learning_rate": 7.899895111829594e-05, + "loss": 1.0655, + "step": 12312 + }, + { + "epoch": 1.18, + "grad_norm": 0.3108189131764565, + "learning_rate": 7.898348467882373e-05, + "loss": 1.0137, + "step": 12313 + }, + { + "epoch": 1.18, + "grad_norm": 0.3185091669460191, + "learning_rate": 7.8968018765303e-05, + "loss": 1.0278, + "step": 12314 + }, + { + "epoch": 1.18, + "grad_norm": 0.3036159120606576, + "learning_rate": 7.89525533781208e-05, + "loss": 1.0065, + "step": 12315 + }, + { + "epoch": 1.18, + "grad_norm": 0.308038682991964, + "learning_rate": 7.893708851766422e-05, + "loss": 1.1366, + "step": 12316 + }, + { + "epoch": 1.18, + "grad_norm": 0.2514470880499811, + "learning_rate": 7.892162418432024e-05, + "loss": 0.9699, + "step": 12317 + }, + { + "epoch": 1.18, + "grad_norm": 0.30452656362076785, + "learning_rate": 7.890616037847585e-05, + "loss": 1.0369, + "step": 12318 + }, + { + "epoch": 1.18, + "grad_norm": 0.2608030778603675, + "learning_rate": 7.889069710051807e-05, + "loss": 0.9445, + "step": 12319 + }, + { + "epoch": 1.18, + "grad_norm": 0.2712469119707307, + "learning_rate": 7.887523435083379e-05, + "loss": 1.0402, + "step": 12320 + }, + { + "epoch": 1.18, + "grad_norm": 0.35155292748426464, + "learning_rate": 7.88597721298101e-05, + "loss": 1.0747, + "step": 12321 + }, + { + "epoch": 1.18, + "grad_norm": 0.299220971901389, + "learning_rate": 7.88443104378339e-05, + "loss": 0.9736, + "step": 12322 + }, + { + "epoch": 1.18, + "grad_norm": 0.2734940350583464, + "learning_rate": 7.882884927529215e-05, + "loss": 0.9087, + "step": 12323 + }, + { + "epoch": 1.18, + "grad_norm": 0.30159324344467164, + "learning_rate": 7.881338864257173e-05, + "loss": 1.0923, + "step": 12324 + }, + { + "epoch": 1.18, + "grad_norm": 0.30559926557580125, + "learning_rate": 7.879792854005959e-05, + "loss": 1.0743, + "step": 12325 + }, + { + "epoch": 1.18, + "grad_norm": 0.2903574140873278, + "learning_rate": 7.878246896814259e-05, + "loss": 1.0754, + "step": 12326 + }, + { + "epoch": 1.18, + "grad_norm": 0.3005767706211161, + "learning_rate": 7.876700992720762e-05, + "loss": 1.1639, + "step": 12327 + }, + { + "epoch": 1.18, + "grad_norm": 0.2987374040275679, + "learning_rate": 7.87515514176416e-05, + "loss": 0.96, + "step": 12328 + }, + { + "epoch": 1.18, + "grad_norm": 0.3046707886831216, + "learning_rate": 7.873609343983137e-05, + "loss": 1.078, + "step": 12329 + }, + { + "epoch": 1.18, + "grad_norm": 0.310741425213357, + "learning_rate": 7.872063599416375e-05, + "loss": 1.0126, + "step": 12330 + }, + { + "epoch": 1.18, + "grad_norm": 0.3330194595708015, + "learning_rate": 7.870517908102558e-05, + "loss": 1.0553, + "step": 12331 + }, + { + "epoch": 1.18, + "grad_norm": 0.35603401299721493, + "learning_rate": 7.868972270080369e-05, + "loss": 1.06, + "step": 12332 + }, + { + "epoch": 1.18, + "grad_norm": 0.316429236345029, + "learning_rate": 7.867426685388491e-05, + "loss": 0.9243, + "step": 12333 + }, + { + "epoch": 1.18, + "grad_norm": 0.3172564006494952, + "learning_rate": 7.865881154065596e-05, + "loss": 1.025, + "step": 12334 + }, + { + "epoch": 1.18, + "grad_norm": 0.3198371241965819, + "learning_rate": 7.864335676150367e-05, + "loss": 0.9908, + "step": 12335 + }, + { + "epoch": 1.18, + "grad_norm": 0.2886500197420304, + "learning_rate": 7.862790251681483e-05, + "loss": 1.0349, + "step": 12336 + }, + { + "epoch": 1.18, + "grad_norm": 0.31700249086337595, + "learning_rate": 7.861244880697615e-05, + "loss": 1.077, + "step": 12337 + }, + { + "epoch": 1.18, + "grad_norm": 0.33520957153940134, + "learning_rate": 7.859699563237439e-05, + "loss": 1.0007, + "step": 12338 + }, + { + "epoch": 1.18, + "grad_norm": 0.3454329333316163, + "learning_rate": 7.858154299339626e-05, + "loss": 1.1705, + "step": 12339 + }, + { + "epoch": 1.18, + "grad_norm": 0.33863856404837667, + "learning_rate": 7.856609089042846e-05, + "loss": 1.111, + "step": 12340 + }, + { + "epoch": 1.18, + "grad_norm": 0.33268883045152764, + "learning_rate": 7.855063932385769e-05, + "loss": 1.097, + "step": 12341 + }, + { + "epoch": 1.18, + "grad_norm": 0.30501705251173716, + "learning_rate": 7.853518829407068e-05, + "loss": 1.1099, + "step": 12342 + }, + { + "epoch": 1.18, + "grad_norm": 0.30272146551500634, + "learning_rate": 7.851973780145408e-05, + "loss": 1.0182, + "step": 12343 + }, + { + "epoch": 1.18, + "grad_norm": 0.3123579019923319, + "learning_rate": 7.850428784639454e-05, + "loss": 0.8987, + "step": 12344 + }, + { + "epoch": 1.18, + "grad_norm": 0.3585319871501846, + "learning_rate": 7.848883842927871e-05, + "loss": 1.0998, + "step": 12345 + }, + { + "epoch": 1.18, + "grad_norm": 0.34708903099937316, + "learning_rate": 7.847338955049321e-05, + "loss": 1.0377, + "step": 12346 + }, + { + "epoch": 1.18, + "grad_norm": 0.30003026325680615, + "learning_rate": 7.845794121042467e-05, + "loss": 1.0124, + "step": 12347 + }, + { + "epoch": 1.18, + "grad_norm": 0.28586704387932904, + "learning_rate": 7.844249340945964e-05, + "loss": 0.988, + "step": 12348 + }, + { + "epoch": 1.18, + "grad_norm": 0.32170014210794406, + "learning_rate": 7.842704614798482e-05, + "loss": 1.016, + "step": 12349 + }, + { + "epoch": 1.18, + "grad_norm": 0.3171550075569755, + "learning_rate": 7.841159942638672e-05, + "loss": 0.9821, + "step": 12350 + }, + { + "epoch": 1.18, + "grad_norm": 0.26568260764835633, + "learning_rate": 7.83961532450519e-05, + "loss": 1.0615, + "step": 12351 + }, + { + "epoch": 1.18, + "grad_norm": 0.29884159360015244, + "learning_rate": 7.838070760436691e-05, + "loss": 0.9523, + "step": 12352 + }, + { + "epoch": 1.18, + "grad_norm": 0.3132809892398003, + "learning_rate": 7.836526250471832e-05, + "loss": 1.045, + "step": 12353 + }, + { + "epoch": 1.18, + "grad_norm": 0.3157318556605002, + "learning_rate": 7.83498179464926e-05, + "loss": 1.0337, + "step": 12354 + }, + { + "epoch": 1.18, + "grad_norm": 0.31352971934822854, + "learning_rate": 7.833437393007634e-05, + "loss": 1.0201, + "step": 12355 + }, + { + "epoch": 1.18, + "grad_norm": 0.2963780854091285, + "learning_rate": 7.831893045585595e-05, + "loss": 0.9985, + "step": 12356 + }, + { + "epoch": 1.18, + "grad_norm": 0.3346135388061129, + "learning_rate": 7.830348752421799e-05, + "loss": 1.0579, + "step": 12357 + }, + { + "epoch": 1.18, + "grad_norm": 0.3104055438492123, + "learning_rate": 7.828804513554887e-05, + "loss": 0.9738, + "step": 12358 + }, + { + "epoch": 1.18, + "grad_norm": 0.30796241765339066, + "learning_rate": 7.827260329023507e-05, + "loss": 1.0137, + "step": 12359 + }, + { + "epoch": 1.18, + "grad_norm": 0.3148368632682338, + "learning_rate": 7.825716198866304e-05, + "loss": 0.9783, + "step": 12360 + }, + { + "epoch": 1.18, + "grad_norm": 0.3054952600637796, + "learning_rate": 7.824172123121913e-05, + "loss": 1.0232, + "step": 12361 + }, + { + "epoch": 1.18, + "grad_norm": 0.31968104841528405, + "learning_rate": 7.822628101828988e-05, + "loss": 0.9749, + "step": 12362 + }, + { + "epoch": 1.18, + "grad_norm": 0.33753906520840093, + "learning_rate": 7.821084135026165e-05, + "loss": 0.9029, + "step": 12363 + }, + { + "epoch": 1.18, + "grad_norm": 0.3405559147576027, + "learning_rate": 7.819540222752077e-05, + "loss": 1.0033, + "step": 12364 + }, + { + "epoch": 1.18, + "grad_norm": 0.3144541600688221, + "learning_rate": 7.817996365045368e-05, + "loss": 1.0055, + "step": 12365 + }, + { + "epoch": 1.18, + "grad_norm": 0.2812694210877648, + "learning_rate": 7.816452561944671e-05, + "loss": 0.9763, + "step": 12366 + }, + { + "epoch": 1.18, + "grad_norm": 0.33565571260548627, + "learning_rate": 7.814908813488619e-05, + "loss": 1.0829, + "step": 12367 + }, + { + "epoch": 1.18, + "grad_norm": 0.3539818480747366, + "learning_rate": 7.813365119715846e-05, + "loss": 0.9116, + "step": 12368 + }, + { + "epoch": 1.18, + "grad_norm": 0.3455116531588368, + "learning_rate": 7.811821480664987e-05, + "loss": 0.9168, + "step": 12369 + }, + { + "epoch": 1.18, + "grad_norm": 0.2957548804219064, + "learning_rate": 7.810277896374672e-05, + "loss": 1.0092, + "step": 12370 + }, + { + "epoch": 1.18, + "grad_norm": 0.2874469864542806, + "learning_rate": 7.808734366883528e-05, + "loss": 1.0853, + "step": 12371 + }, + { + "epoch": 1.18, + "grad_norm": 0.30424316332188667, + "learning_rate": 7.807190892230182e-05, + "loss": 1.047, + "step": 12372 + }, + { + "epoch": 1.18, + "grad_norm": 0.29718041469381873, + "learning_rate": 7.805647472453263e-05, + "loss": 1.1259, + "step": 12373 + }, + { + "epoch": 1.18, + "grad_norm": 0.3406109616714969, + "learning_rate": 7.804104107591397e-05, + "loss": 1.0928, + "step": 12374 + }, + { + "epoch": 1.18, + "grad_norm": 0.3135442237820493, + "learning_rate": 7.802560797683203e-05, + "loss": 0.978, + "step": 12375 + }, + { + "epoch": 1.18, + "grad_norm": 0.30705290848399436, + "learning_rate": 7.801017542767305e-05, + "loss": 0.9722, + "step": 12376 + }, + { + "epoch": 1.18, + "grad_norm": 0.26551263025404126, + "learning_rate": 7.799474342882328e-05, + "loss": 1.0647, + "step": 12377 + }, + { + "epoch": 1.18, + "grad_norm": 0.299884582712083, + "learning_rate": 7.797931198066889e-05, + "loss": 0.9272, + "step": 12378 + }, + { + "epoch": 1.18, + "grad_norm": 0.2899964176354625, + "learning_rate": 7.796388108359604e-05, + "loss": 1.043, + "step": 12379 + }, + { + "epoch": 1.18, + "grad_norm": 0.24377624726591784, + "learning_rate": 7.794845073799092e-05, + "loss": 1.0205, + "step": 12380 + }, + { + "epoch": 1.18, + "grad_norm": 0.2854054996084487, + "learning_rate": 7.793302094423966e-05, + "loss": 1.048, + "step": 12381 + }, + { + "epoch": 1.18, + "grad_norm": 0.3098148320622186, + "learning_rate": 7.791759170272845e-05, + "loss": 1.0322, + "step": 12382 + }, + { + "epoch": 1.18, + "grad_norm": 0.25481605140962066, + "learning_rate": 7.790216301384338e-05, + "loss": 0.9978, + "step": 12383 + }, + { + "epoch": 1.18, + "grad_norm": 0.35079675819394096, + "learning_rate": 7.788673487797056e-05, + "loss": 1.0347, + "step": 12384 + }, + { + "epoch": 1.18, + "grad_norm": 0.37389967399202384, + "learning_rate": 7.787130729549611e-05, + "loss": 0.9769, + "step": 12385 + }, + { + "epoch": 1.18, + "grad_norm": 0.2861786851677904, + "learning_rate": 7.785588026680611e-05, + "loss": 1.0028, + "step": 12386 + }, + { + "epoch": 1.19, + "grad_norm": 0.25619279023602265, + "learning_rate": 7.78404537922866e-05, + "loss": 1.1135, + "step": 12387 + }, + { + "epoch": 1.19, + "grad_norm": 0.29183497297093675, + "learning_rate": 7.782502787232362e-05, + "loss": 1.0561, + "step": 12388 + }, + { + "epoch": 1.19, + "grad_norm": 0.27699611402376784, + "learning_rate": 7.780960250730333e-05, + "loss": 0.9218, + "step": 12389 + }, + { + "epoch": 1.19, + "grad_norm": 0.3059746857199482, + "learning_rate": 7.779417769761166e-05, + "loss": 1.023, + "step": 12390 + }, + { + "epoch": 1.19, + "grad_norm": 0.3142363931934752, + "learning_rate": 7.777875344363464e-05, + "loss": 1.0027, + "step": 12391 + }, + { + "epoch": 1.19, + "grad_norm": 0.32790305642016415, + "learning_rate": 7.77633297457583e-05, + "loss": 0.9807, + "step": 12392 + }, + { + "epoch": 1.19, + "grad_norm": 0.32201389375582545, + "learning_rate": 7.774790660436858e-05, + "loss": 1.0229, + "step": 12393 + }, + { + "epoch": 1.19, + "grad_norm": 0.3216969393958999, + "learning_rate": 7.77324840198515e-05, + "loss": 1.1007, + "step": 12394 + }, + { + "epoch": 1.19, + "grad_norm": 0.39124579350828753, + "learning_rate": 7.771706199259297e-05, + "loss": 0.958, + "step": 12395 + }, + { + "epoch": 1.19, + "grad_norm": 0.3333052433489322, + "learning_rate": 7.7701640522979e-05, + "loss": 1.0283, + "step": 12396 + }, + { + "epoch": 1.19, + "grad_norm": 0.30573752379650077, + "learning_rate": 7.768621961139548e-05, + "loss": 0.9808, + "step": 12397 + }, + { + "epoch": 1.19, + "grad_norm": 0.2810593121738832, + "learning_rate": 7.767079925822835e-05, + "loss": 1.0895, + "step": 12398 + }, + { + "epoch": 1.19, + "grad_norm": 0.3223188454928677, + "learning_rate": 7.76553794638635e-05, + "loss": 1.0367, + "step": 12399 + }, + { + "epoch": 1.19, + "grad_norm": 0.2801598729135209, + "learning_rate": 7.763996022868681e-05, + "loss": 1.0517, + "step": 12400 + }, + { + "epoch": 1.19, + "grad_norm": 0.3046947179396429, + "learning_rate": 7.762454155308418e-05, + "loss": 1.0233, + "step": 12401 + }, + { + "epoch": 1.19, + "grad_norm": 0.2920041654899579, + "learning_rate": 7.760912343744142e-05, + "loss": 0.8909, + "step": 12402 + }, + { + "epoch": 1.19, + "grad_norm": 0.26475301681294816, + "learning_rate": 7.759370588214446e-05, + "loss": 1.0895, + "step": 12403 + }, + { + "epoch": 1.19, + "grad_norm": 0.29194333616478235, + "learning_rate": 7.75782888875791e-05, + "loss": 0.8518, + "step": 12404 + }, + { + "epoch": 1.19, + "grad_norm": 0.31770133637463416, + "learning_rate": 7.756287245413114e-05, + "loss": 1.0424, + "step": 12405 + }, + { + "epoch": 1.19, + "grad_norm": 0.33668750990111546, + "learning_rate": 7.75474565821864e-05, + "loss": 1.0492, + "step": 12406 + }, + { + "epoch": 1.19, + "grad_norm": 0.29495677193949144, + "learning_rate": 7.753204127213067e-05, + "loss": 1.0277, + "step": 12407 + }, + { + "epoch": 1.19, + "grad_norm": 0.31733469067130476, + "learning_rate": 7.751662652434973e-05, + "loss": 1.0724, + "step": 12408 + }, + { + "epoch": 1.19, + "grad_norm": 0.2868088264490977, + "learning_rate": 7.750121233922929e-05, + "loss": 1.0381, + "step": 12409 + }, + { + "epoch": 1.19, + "grad_norm": 0.3123971118946946, + "learning_rate": 7.748579871715521e-05, + "loss": 1.0845, + "step": 12410 + }, + { + "epoch": 1.19, + "grad_norm": 0.3140383014660475, + "learning_rate": 7.747038565851316e-05, + "loss": 0.9709, + "step": 12411 + }, + { + "epoch": 1.19, + "grad_norm": 0.328875097208197, + "learning_rate": 7.745497316368886e-05, + "loss": 1.1393, + "step": 12412 + }, + { + "epoch": 1.19, + "grad_norm": 0.28790679750915155, + "learning_rate": 7.743956123306804e-05, + "loss": 0.9578, + "step": 12413 + }, + { + "epoch": 1.19, + "grad_norm": 0.3072968742441174, + "learning_rate": 7.742414986703634e-05, + "loss": 0.9813, + "step": 12414 + }, + { + "epoch": 1.19, + "grad_norm": 0.29977822768998413, + "learning_rate": 7.740873906597949e-05, + "loss": 1.0492, + "step": 12415 + }, + { + "epoch": 1.19, + "grad_norm": 0.27375450936458406, + "learning_rate": 7.739332883028316e-05, + "loss": 1.0678, + "step": 12416 + }, + { + "epoch": 1.19, + "grad_norm": 0.3082695095690427, + "learning_rate": 7.737791916033297e-05, + "loss": 0.9531, + "step": 12417 + }, + { + "epoch": 1.19, + "grad_norm": 0.33805879667993677, + "learning_rate": 7.736251005651458e-05, + "loss": 1.132, + "step": 12418 + }, + { + "epoch": 1.19, + "grad_norm": 0.3363395945855833, + "learning_rate": 7.734710151921361e-05, + "loss": 1.0047, + "step": 12419 + }, + { + "epoch": 1.19, + "grad_norm": 0.3378710587612075, + "learning_rate": 7.733169354881566e-05, + "loss": 1.0914, + "step": 12420 + }, + { + "epoch": 1.19, + "grad_norm": 0.3060324062483996, + "learning_rate": 7.731628614570634e-05, + "loss": 1.0905, + "step": 12421 + }, + { + "epoch": 1.19, + "grad_norm": 0.30505123806775913, + "learning_rate": 7.730087931027114e-05, + "loss": 1.0002, + "step": 12422 + }, + { + "epoch": 1.19, + "grad_norm": 0.29949459320705657, + "learning_rate": 7.728547304289578e-05, + "loss": 1.0043, + "step": 12423 + }, + { + "epoch": 1.19, + "grad_norm": 0.32735891709222203, + "learning_rate": 7.72700673439657e-05, + "loss": 1.1045, + "step": 12424 + }, + { + "epoch": 1.19, + "grad_norm": 0.3000278979240446, + "learning_rate": 7.72546622138665e-05, + "loss": 1.1701, + "step": 12425 + }, + { + "epoch": 1.19, + "grad_norm": 0.3349585641064477, + "learning_rate": 7.723925765298365e-05, + "loss": 1.0613, + "step": 12426 + }, + { + "epoch": 1.19, + "grad_norm": 0.2757630508520786, + "learning_rate": 7.722385366170269e-05, + "loss": 0.9394, + "step": 12427 + }, + { + "epoch": 1.19, + "grad_norm": 0.32218066024404646, + "learning_rate": 7.720845024040911e-05, + "loss": 0.9407, + "step": 12428 + }, + { + "epoch": 1.19, + "grad_norm": 0.27465328219081003, + "learning_rate": 7.719304738948833e-05, + "loss": 1.0375, + "step": 12429 + }, + { + "epoch": 1.19, + "grad_norm": 0.33208436983904593, + "learning_rate": 7.717764510932593e-05, + "loss": 1.0826, + "step": 12430 + }, + { + "epoch": 1.19, + "grad_norm": 0.26504806007851084, + "learning_rate": 7.71622434003073e-05, + "loss": 1.0089, + "step": 12431 + }, + { + "epoch": 1.19, + "grad_norm": 0.30198635066609786, + "learning_rate": 7.714684226281788e-05, + "loss": 1.0501, + "step": 12432 + }, + { + "epoch": 1.19, + "grad_norm": 0.3313352995962886, + "learning_rate": 7.71314416972431e-05, + "loss": 0.8977, + "step": 12433 + }, + { + "epoch": 1.19, + "grad_norm": 0.2809565296230356, + "learning_rate": 7.711604170396833e-05, + "loss": 0.9953, + "step": 12434 + }, + { + "epoch": 1.19, + "grad_norm": 0.30181594121596617, + "learning_rate": 7.710064228337903e-05, + "loss": 1.0296, + "step": 12435 + }, + { + "epoch": 1.19, + "grad_norm": 0.2881840800491161, + "learning_rate": 7.708524343586054e-05, + "loss": 0.993, + "step": 12436 + }, + { + "epoch": 1.19, + "grad_norm": 0.30945344732579577, + "learning_rate": 7.706984516179824e-05, + "loss": 0.9768, + "step": 12437 + }, + { + "epoch": 1.19, + "grad_norm": 0.33742458906877704, + "learning_rate": 7.705444746157745e-05, + "loss": 0.9933, + "step": 12438 + }, + { + "epoch": 1.19, + "grad_norm": 0.30603531517740457, + "learning_rate": 7.703905033558357e-05, + "loss": 1.009, + "step": 12439 + }, + { + "epoch": 1.19, + "grad_norm": 0.3248143453390403, + "learning_rate": 7.702365378420186e-05, + "loss": 1.0707, + "step": 12440 + }, + { + "epoch": 1.19, + "grad_norm": 0.3673899202773331, + "learning_rate": 7.700825780781769e-05, + "loss": 1.0988, + "step": 12441 + }, + { + "epoch": 1.19, + "grad_norm": 0.3332558924103378, + "learning_rate": 7.699286240681625e-05, + "loss": 1.0239, + "step": 12442 + }, + { + "epoch": 1.19, + "grad_norm": 0.31029940352834484, + "learning_rate": 7.697746758158293e-05, + "loss": 1.0039, + "step": 12443 + }, + { + "epoch": 1.19, + "grad_norm": 0.3581781204573279, + "learning_rate": 7.696207333250297e-05, + "loss": 0.9635, + "step": 12444 + }, + { + "epoch": 1.19, + "grad_norm": 0.2860138179431936, + "learning_rate": 7.694667965996159e-05, + "loss": 1.0496, + "step": 12445 + }, + { + "epoch": 1.19, + "grad_norm": 0.32193692491888987, + "learning_rate": 7.693128656434404e-05, + "loss": 1.0003, + "step": 12446 + }, + { + "epoch": 1.19, + "grad_norm": 0.3275116156391719, + "learning_rate": 7.691589404603555e-05, + "loss": 1.0488, + "step": 12447 + }, + { + "epoch": 1.19, + "grad_norm": 0.32568082378621993, + "learning_rate": 7.690050210542131e-05, + "loss": 0.9052, + "step": 12448 + }, + { + "epoch": 1.19, + "grad_norm": 0.32600957380499135, + "learning_rate": 7.688511074288649e-05, + "loss": 1.0434, + "step": 12449 + }, + { + "epoch": 1.19, + "grad_norm": 0.28289995109292526, + "learning_rate": 7.686971995881633e-05, + "loss": 1.0383, + "step": 12450 + }, + { + "epoch": 1.19, + "grad_norm": 0.3425586431805106, + "learning_rate": 7.685432975359598e-05, + "loss": 1.0996, + "step": 12451 + }, + { + "epoch": 1.19, + "grad_norm": 0.32172692722898105, + "learning_rate": 7.683894012761057e-05, + "loss": 0.9594, + "step": 12452 + }, + { + "epoch": 1.19, + "grad_norm": 0.3121341543252601, + "learning_rate": 7.682355108124523e-05, + "loss": 1.1488, + "step": 12453 + }, + { + "epoch": 1.19, + "grad_norm": 0.3215263002782254, + "learning_rate": 7.68081626148851e-05, + "loss": 1.0497, + "step": 12454 + }, + { + "epoch": 1.19, + "grad_norm": 0.2732297954782772, + "learning_rate": 7.679277472891524e-05, + "loss": 0.9752, + "step": 12455 + }, + { + "epoch": 1.19, + "grad_norm": 0.31636581796877966, + "learning_rate": 7.677738742372079e-05, + "loss": 1.0298, + "step": 12456 + }, + { + "epoch": 1.19, + "grad_norm": 0.29918943988647995, + "learning_rate": 7.676200069968684e-05, + "loss": 1.0228, + "step": 12457 + }, + { + "epoch": 1.19, + "grad_norm": 0.3045539547443703, + "learning_rate": 7.674661455719842e-05, + "loss": 1.0262, + "step": 12458 + }, + { + "epoch": 1.19, + "grad_norm": 0.3276751219992979, + "learning_rate": 7.673122899664057e-05, + "loss": 1.0519, + "step": 12459 + }, + { + "epoch": 1.19, + "grad_norm": 0.330642891510667, + "learning_rate": 7.671584401839836e-05, + "loss": 1.1034, + "step": 12460 + }, + { + "epoch": 1.19, + "grad_norm": 0.3288056899671209, + "learning_rate": 7.670045962285676e-05, + "loss": 0.9898, + "step": 12461 + }, + { + "epoch": 1.19, + "grad_norm": 0.3352069767947915, + "learning_rate": 7.668507581040083e-05, + "loss": 1.072, + "step": 12462 + }, + { + "epoch": 1.19, + "grad_norm": 0.32985844462948527, + "learning_rate": 7.666969258141548e-05, + "loss": 1.0581, + "step": 12463 + }, + { + "epoch": 1.19, + "grad_norm": 0.31144292463059264, + "learning_rate": 7.665430993628578e-05, + "loss": 1.0893, + "step": 12464 + }, + { + "epoch": 1.19, + "grad_norm": 0.30395263697443875, + "learning_rate": 7.663892787539665e-05, + "loss": 1.0675, + "step": 12465 + }, + { + "epoch": 1.19, + "grad_norm": 0.31498975338067814, + "learning_rate": 7.662354639913303e-05, + "loss": 1.0288, + "step": 12466 + }, + { + "epoch": 1.19, + "grad_norm": 0.3049262846804138, + "learning_rate": 7.660816550787984e-05, + "loss": 1.0342, + "step": 12467 + }, + { + "epoch": 1.19, + "grad_norm": 0.2710603887533777, + "learning_rate": 7.659278520202203e-05, + "loss": 1.0075, + "step": 12468 + }, + { + "epoch": 1.19, + "grad_norm": 0.32555968079638403, + "learning_rate": 7.657740548194447e-05, + "loss": 0.9182, + "step": 12469 + }, + { + "epoch": 1.19, + "grad_norm": 0.34294554676367356, + "learning_rate": 7.656202634803202e-05, + "loss": 1.0875, + "step": 12470 + }, + { + "epoch": 1.19, + "grad_norm": 0.3392489446872539, + "learning_rate": 7.654664780066963e-05, + "loss": 1.0303, + "step": 12471 + }, + { + "epoch": 1.19, + "grad_norm": 0.3175461239405868, + "learning_rate": 7.653126984024211e-05, + "loss": 1.0642, + "step": 12472 + }, + { + "epoch": 1.19, + "grad_norm": 0.30985205563029955, + "learning_rate": 7.651589246713433e-05, + "loss": 1.0621, + "step": 12473 + }, + { + "epoch": 1.19, + "grad_norm": 0.2877745439988797, + "learning_rate": 7.650051568173108e-05, + "loss": 1.066, + "step": 12474 + }, + { + "epoch": 1.19, + "grad_norm": 0.31771299245561274, + "learning_rate": 7.648513948441719e-05, + "loss": 0.9234, + "step": 12475 + }, + { + "epoch": 1.19, + "grad_norm": 0.3554512080219655, + "learning_rate": 7.646976387557746e-05, + "loss": 1.0443, + "step": 12476 + }, + { + "epoch": 1.19, + "grad_norm": 0.2745960904382958, + "learning_rate": 7.64543888555967e-05, + "loss": 0.941, + "step": 12477 + }, + { + "epoch": 1.19, + "grad_norm": 0.29126503669911075, + "learning_rate": 7.643901442485964e-05, + "loss": 1.0104, + "step": 12478 + }, + { + "epoch": 1.19, + "grad_norm": 0.33094136139538477, + "learning_rate": 7.642364058375104e-05, + "loss": 0.9816, + "step": 12479 + }, + { + "epoch": 1.19, + "grad_norm": 0.31325898697585786, + "learning_rate": 7.640826733265568e-05, + "loss": 1.0257, + "step": 12480 + }, + { + "epoch": 1.19, + "grad_norm": 0.2880302526248939, + "learning_rate": 7.639289467195825e-05, + "loss": 1.0043, + "step": 12481 + }, + { + "epoch": 1.19, + "grad_norm": 0.356445940870874, + "learning_rate": 7.637752260204347e-05, + "loss": 1.115, + "step": 12482 + }, + { + "epoch": 1.19, + "grad_norm": 0.27080421502916197, + "learning_rate": 7.636215112329599e-05, + "loss": 0.9761, + "step": 12483 + }, + { + "epoch": 1.19, + "grad_norm": 0.3171107762226307, + "learning_rate": 7.634678023610059e-05, + "loss": 1.0256, + "step": 12484 + }, + { + "epoch": 1.19, + "grad_norm": 0.34195929941086645, + "learning_rate": 7.633140994084185e-05, + "loss": 1.034, + "step": 12485 + }, + { + "epoch": 1.19, + "grad_norm": 0.2587756910040122, + "learning_rate": 7.631604023790446e-05, + "loss": 1.0167, + "step": 12486 + }, + { + "epoch": 1.19, + "grad_norm": 0.32123964911357517, + "learning_rate": 7.630067112767306e-05, + "loss": 1.098, + "step": 12487 + }, + { + "epoch": 1.19, + "grad_norm": 0.2642552201405098, + "learning_rate": 7.628530261053226e-05, + "loss": 0.955, + "step": 12488 + }, + { + "epoch": 1.19, + "grad_norm": 0.2535223253137955, + "learning_rate": 7.626993468686667e-05, + "loss": 0.9868, + "step": 12489 + }, + { + "epoch": 1.19, + "grad_norm": 0.35052081906383675, + "learning_rate": 7.625456735706082e-05, + "loss": 1.0337, + "step": 12490 + }, + { + "epoch": 1.2, + "grad_norm": 0.3027509481018106, + "learning_rate": 7.62392006214994e-05, + "loss": 0.9965, + "step": 12491 + }, + { + "epoch": 1.2, + "grad_norm": 0.2854382709218523, + "learning_rate": 7.62238344805669e-05, + "loss": 0.9976, + "step": 12492 + }, + { + "epoch": 1.2, + "grad_norm": 0.2955052842717166, + "learning_rate": 7.620846893464791e-05, + "loss": 1.0324, + "step": 12493 + }, + { + "epoch": 1.2, + "grad_norm": 0.3463277269721645, + "learning_rate": 7.619310398412691e-05, + "loss": 1.0199, + "step": 12494 + }, + { + "epoch": 1.2, + "grad_norm": 0.2891343975181203, + "learning_rate": 7.617773962938847e-05, + "loss": 1.0474, + "step": 12495 + }, + { + "epoch": 1.2, + "grad_norm": 0.26969917780852953, + "learning_rate": 7.616237587081702e-05, + "loss": 0.9353, + "step": 12496 + }, + { + "epoch": 1.2, + "grad_norm": 0.31328611986403426, + "learning_rate": 7.614701270879711e-05, + "loss": 1.0175, + "step": 12497 + }, + { + "epoch": 1.2, + "grad_norm": 0.3093137995220943, + "learning_rate": 7.613165014371323e-05, + "loss": 1.0208, + "step": 12498 + }, + { + "epoch": 1.2, + "grad_norm": 0.26913783246996925, + "learning_rate": 7.61162881759498e-05, + "loss": 1.0519, + "step": 12499 + }, + { + "epoch": 1.2, + "grad_norm": 0.30939097892185274, + "learning_rate": 7.610092680589124e-05, + "loss": 1.08, + "step": 12500 + }, + { + "epoch": 1.2, + "grad_norm": 0.36067792799714266, + "learning_rate": 7.608556603392203e-05, + "loss": 1.0152, + "step": 12501 + }, + { + "epoch": 1.2, + "grad_norm": 0.3198455396492707, + "learning_rate": 7.607020586042657e-05, + "loss": 1.0745, + "step": 12502 + }, + { + "epoch": 1.2, + "grad_norm": 0.25451101771077417, + "learning_rate": 7.605484628578921e-05, + "loss": 0.9806, + "step": 12503 + }, + { + "epoch": 1.2, + "grad_norm": 0.3243200709381897, + "learning_rate": 7.603948731039437e-05, + "loss": 0.9884, + "step": 12504 + }, + { + "epoch": 1.2, + "grad_norm": 0.32342941903024824, + "learning_rate": 7.602412893462646e-05, + "loss": 1.086, + "step": 12505 + }, + { + "epoch": 1.2, + "grad_norm": 0.3050803890633507, + "learning_rate": 7.600877115886977e-05, + "loss": 1.0321, + "step": 12506 + }, + { + "epoch": 1.2, + "grad_norm": 0.32440480055667686, + "learning_rate": 7.599341398350868e-05, + "loss": 1.0257, + "step": 12507 + }, + { + "epoch": 1.2, + "grad_norm": 0.3375773512898762, + "learning_rate": 7.597805740892748e-05, + "loss": 1.0804, + "step": 12508 + }, + { + "epoch": 1.2, + "grad_norm": 0.3102749311936819, + "learning_rate": 7.596270143551048e-05, + "loss": 0.9983, + "step": 12509 + }, + { + "epoch": 1.2, + "grad_norm": 0.27457398657827303, + "learning_rate": 7.594734606364197e-05, + "loss": 1.0395, + "step": 12510 + }, + { + "epoch": 1.2, + "grad_norm": 0.3038316126302666, + "learning_rate": 7.593199129370626e-05, + "loss": 1.0561, + "step": 12511 + }, + { + "epoch": 1.2, + "grad_norm": 0.3133107745079505, + "learning_rate": 7.59166371260876e-05, + "loss": 1.0812, + "step": 12512 + }, + { + "epoch": 1.2, + "grad_norm": 0.3177868334455486, + "learning_rate": 7.590128356117024e-05, + "loss": 1.0879, + "step": 12513 + }, + { + "epoch": 1.2, + "grad_norm": 0.29822839579089, + "learning_rate": 7.588593059933839e-05, + "loss": 0.9493, + "step": 12514 + }, + { + "epoch": 1.2, + "grad_norm": 0.2826453186796218, + "learning_rate": 7.58705782409763e-05, + "loss": 1.0583, + "step": 12515 + }, + { + "epoch": 1.2, + "grad_norm": 0.2826205789186307, + "learning_rate": 7.585522648646811e-05, + "loss": 1.0568, + "step": 12516 + }, + { + "epoch": 1.2, + "grad_norm": 0.3425610160214411, + "learning_rate": 7.583987533619806e-05, + "loss": 1.0564, + "step": 12517 + }, + { + "epoch": 1.2, + "grad_norm": 0.2906921003101311, + "learning_rate": 7.582452479055034e-05, + "loss": 1.0786, + "step": 12518 + }, + { + "epoch": 1.2, + "grad_norm": 0.32060689454570296, + "learning_rate": 7.580917484990908e-05, + "loss": 0.9505, + "step": 12519 + }, + { + "epoch": 1.2, + "grad_norm": 0.34403166742441765, + "learning_rate": 7.57938255146584e-05, + "loss": 1.1108, + "step": 12520 + }, + { + "epoch": 1.2, + "grad_norm": 0.29767509757080973, + "learning_rate": 7.577847678518248e-05, + "loss": 0.9858, + "step": 12521 + }, + { + "epoch": 1.2, + "grad_norm": 0.29313731927031567, + "learning_rate": 7.57631286618654e-05, + "loss": 0.9599, + "step": 12522 + }, + { + "epoch": 1.2, + "grad_norm": 0.34803071402435415, + "learning_rate": 7.574778114509124e-05, + "loss": 1.0065, + "step": 12523 + }, + { + "epoch": 1.2, + "grad_norm": 0.3191605933165996, + "learning_rate": 7.573243423524408e-05, + "loss": 1.048, + "step": 12524 + }, + { + "epoch": 1.2, + "grad_norm": 0.2975574591860823, + "learning_rate": 7.571708793270802e-05, + "loss": 1.0094, + "step": 12525 + }, + { + "epoch": 1.2, + "grad_norm": 0.30315519572034993, + "learning_rate": 7.570174223786711e-05, + "loss": 1.1068, + "step": 12526 + }, + { + "epoch": 1.2, + "grad_norm": 0.33161303501498524, + "learning_rate": 7.568639715110537e-05, + "loss": 1.0606, + "step": 12527 + }, + { + "epoch": 1.2, + "grad_norm": 0.3265952926812042, + "learning_rate": 7.567105267280683e-05, + "loss": 0.8967, + "step": 12528 + }, + { + "epoch": 1.2, + "grad_norm": 0.32302644679166254, + "learning_rate": 7.565570880335548e-05, + "loss": 1.0292, + "step": 12529 + }, + { + "epoch": 1.2, + "grad_norm": 0.33249265397225314, + "learning_rate": 7.564036554313531e-05, + "loss": 1.0692, + "step": 12530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3194511281607851, + "learning_rate": 7.562502289253027e-05, + "loss": 1.0634, + "step": 12531 + }, + { + "epoch": 1.2, + "grad_norm": 0.3836156448086532, + "learning_rate": 7.560968085192439e-05, + "loss": 1.0656, + "step": 12532 + }, + { + "epoch": 1.2, + "grad_norm": 0.29305333254778504, + "learning_rate": 7.559433942170158e-05, + "loss": 0.9811, + "step": 12533 + }, + { + "epoch": 1.2, + "grad_norm": 0.2560245960897966, + "learning_rate": 7.557899860224575e-05, + "loss": 1.042, + "step": 12534 + }, + { + "epoch": 1.2, + "grad_norm": 0.32479941195609396, + "learning_rate": 7.556365839394086e-05, + "loss": 1.0564, + "step": 12535 + }, + { + "epoch": 1.2, + "grad_norm": 0.3164591700894538, + "learning_rate": 7.554831879717075e-05, + "loss": 0.8918, + "step": 12536 + }, + { + "epoch": 1.2, + "grad_norm": 0.30369225187047577, + "learning_rate": 7.553297981231929e-05, + "loss": 1.065, + "step": 12537 + }, + { + "epoch": 1.2, + "grad_norm": 0.3190559057196423, + "learning_rate": 7.551764143977043e-05, + "loss": 1.0298, + "step": 12538 + }, + { + "epoch": 1.2, + "grad_norm": 0.3006451547735177, + "learning_rate": 7.550230367990797e-05, + "loss": 0.9404, + "step": 12539 + }, + { + "epoch": 1.2, + "grad_norm": 0.2999733564496193, + "learning_rate": 7.548696653311578e-05, + "loss": 0.9833, + "step": 12540 + }, + { + "epoch": 1.2, + "grad_norm": 0.3276206988083299, + "learning_rate": 7.547162999977762e-05, + "loss": 0.9366, + "step": 12541 + }, + { + "epoch": 1.2, + "grad_norm": 0.35237842950146797, + "learning_rate": 7.545629408027736e-05, + "loss": 0.9527, + "step": 12542 + }, + { + "epoch": 1.2, + "grad_norm": 0.2974112574711427, + "learning_rate": 7.544095877499876e-05, + "loss": 1.0258, + "step": 12543 + }, + { + "epoch": 1.2, + "grad_norm": 0.3041901659037143, + "learning_rate": 7.542562408432559e-05, + "loss": 1.0321, + "step": 12544 + }, + { + "epoch": 1.2, + "grad_norm": 0.3372365366055644, + "learning_rate": 7.541029000864161e-05, + "loss": 1.1276, + "step": 12545 + }, + { + "epoch": 1.2, + "grad_norm": 0.29622129141030584, + "learning_rate": 7.53949565483306e-05, + "loss": 1.1893, + "step": 12546 + }, + { + "epoch": 1.2, + "grad_norm": 0.3432704567732825, + "learning_rate": 7.537962370377629e-05, + "loss": 0.988, + "step": 12547 + }, + { + "epoch": 1.2, + "grad_norm": 0.31490843064710083, + "learning_rate": 7.536429147536235e-05, + "loss": 1.04, + "step": 12548 + }, + { + "epoch": 1.2, + "grad_norm": 0.28996273213324986, + "learning_rate": 7.53489598634725e-05, + "loss": 1.036, + "step": 12549 + }, + { + "epoch": 1.2, + "grad_norm": 0.26970176230394066, + "learning_rate": 7.533362886849042e-05, + "loss": 0.9904, + "step": 12550 + }, + { + "epoch": 1.2, + "grad_norm": 0.30069424304907116, + "learning_rate": 7.531829849079974e-05, + "loss": 1.0299, + "step": 12551 + }, + { + "epoch": 1.2, + "grad_norm": 0.2974413844065117, + "learning_rate": 7.53029687307842e-05, + "loss": 0.9375, + "step": 12552 + }, + { + "epoch": 1.2, + "grad_norm": 0.3164223556778576, + "learning_rate": 7.528763958882737e-05, + "loss": 1.0891, + "step": 12553 + }, + { + "epoch": 1.2, + "grad_norm": 0.2507215417306478, + "learning_rate": 7.527231106531292e-05, + "loss": 1.0641, + "step": 12554 + }, + { + "epoch": 1.2, + "grad_norm": 0.3573799032221247, + "learning_rate": 7.52569831606244e-05, + "loss": 0.9937, + "step": 12555 + }, + { + "epoch": 1.2, + "grad_norm": 0.3003377701876464, + "learning_rate": 7.524165587514545e-05, + "loss": 1.0829, + "step": 12556 + }, + { + "epoch": 1.2, + "grad_norm": 0.33133245298473735, + "learning_rate": 7.52263292092596e-05, + "loss": 1.1611, + "step": 12557 + }, + { + "epoch": 1.2, + "grad_norm": 0.31150872251308265, + "learning_rate": 7.52110031633504e-05, + "loss": 1.0186, + "step": 12558 + }, + { + "epoch": 1.2, + "grad_norm": 0.34701369422659617, + "learning_rate": 7.519567773780147e-05, + "loss": 0.954, + "step": 12559 + }, + { + "epoch": 1.2, + "grad_norm": 0.30314612697578336, + "learning_rate": 7.518035293299627e-05, + "loss": 0.9844, + "step": 12560 + }, + { + "epoch": 1.2, + "grad_norm": 0.32194120667557274, + "learning_rate": 7.516502874931834e-05, + "loss": 1.0787, + "step": 12561 + }, + { + "epoch": 1.2, + "grad_norm": 0.3267502802221382, + "learning_rate": 7.51497051871512e-05, + "loss": 1.0435, + "step": 12562 + }, + { + "epoch": 1.2, + "grad_norm": 0.2779493437240841, + "learning_rate": 7.513438224687829e-05, + "loss": 1.0397, + "step": 12563 + }, + { + "epoch": 1.2, + "grad_norm": 0.29789816057875546, + "learning_rate": 7.511905992888307e-05, + "loss": 1.093, + "step": 12564 + }, + { + "epoch": 1.2, + "grad_norm": 0.2766730839087854, + "learning_rate": 7.510373823354903e-05, + "loss": 0.9415, + "step": 12565 + }, + { + "epoch": 1.2, + "grad_norm": 0.3581235775264526, + "learning_rate": 7.508841716125958e-05, + "loss": 0.9722, + "step": 12566 + }, + { + "epoch": 1.2, + "grad_norm": 0.28341636891166583, + "learning_rate": 7.507309671239817e-05, + "loss": 1.056, + "step": 12567 + }, + { + "epoch": 1.2, + "grad_norm": 0.323016302109987, + "learning_rate": 7.505777688734818e-05, + "loss": 1.0924, + "step": 12568 + }, + { + "epoch": 1.2, + "grad_norm": 0.3083575249821192, + "learning_rate": 7.504245768649298e-05, + "loss": 1.0276, + "step": 12569 + }, + { + "epoch": 1.2, + "grad_norm": 0.2703112883640274, + "learning_rate": 7.502713911021598e-05, + "loss": 0.967, + "step": 12570 + }, + { + "epoch": 1.2, + "grad_norm": 0.28952243614027684, + "learning_rate": 7.501182115890047e-05, + "loss": 1.0659, + "step": 12571 + }, + { + "epoch": 1.2, + "grad_norm": 0.3171625214427002, + "learning_rate": 7.499650383292988e-05, + "loss": 1.0845, + "step": 12572 + }, + { + "epoch": 1.2, + "grad_norm": 0.32941242515288593, + "learning_rate": 7.49811871326875e-05, + "loss": 1.0076, + "step": 12573 + }, + { + "epoch": 1.2, + "grad_norm": 0.29256044987201857, + "learning_rate": 7.496587105855664e-05, + "loss": 0.9966, + "step": 12574 + }, + { + "epoch": 1.2, + "grad_norm": 0.3297993456173237, + "learning_rate": 7.495055561092059e-05, + "loss": 0.9438, + "step": 12575 + }, + { + "epoch": 1.2, + "grad_norm": 0.2978215071310354, + "learning_rate": 7.493524079016262e-05, + "loss": 1.1151, + "step": 12576 + }, + { + "epoch": 1.2, + "grad_norm": 0.3041267912825211, + "learning_rate": 7.4919926596666e-05, + "loss": 0.9192, + "step": 12577 + }, + { + "epoch": 1.2, + "grad_norm": 0.3085878756821412, + "learning_rate": 7.490461303081395e-05, + "loss": 1.1097, + "step": 12578 + }, + { + "epoch": 1.2, + "grad_norm": 0.3364309465534316, + "learning_rate": 7.488930009298976e-05, + "loss": 1.082, + "step": 12579 + }, + { + "epoch": 1.2, + "grad_norm": 0.286769175074176, + "learning_rate": 7.487398778357661e-05, + "loss": 1.0623, + "step": 12580 + }, + { + "epoch": 1.2, + "grad_norm": 0.2848520521195217, + "learning_rate": 7.485867610295774e-05, + "loss": 1.06, + "step": 12581 + }, + { + "epoch": 1.2, + "grad_norm": 0.33344845533801126, + "learning_rate": 7.484336505151625e-05, + "loss": 0.9751, + "step": 12582 + }, + { + "epoch": 1.2, + "grad_norm": 0.31394802425509566, + "learning_rate": 7.482805462963539e-05, + "loss": 1.0357, + "step": 12583 + }, + { + "epoch": 1.2, + "grad_norm": 0.35057526281731927, + "learning_rate": 7.481274483769829e-05, + "loss": 1.0568, + "step": 12584 + }, + { + "epoch": 1.2, + "grad_norm": 0.3145246209660702, + "learning_rate": 7.479743567608804e-05, + "loss": 1.0675, + "step": 12585 + }, + { + "epoch": 1.2, + "grad_norm": 0.3008811364108454, + "learning_rate": 7.478212714518781e-05, + "loss": 0.9427, + "step": 12586 + }, + { + "epoch": 1.2, + "grad_norm": 0.3288837613241213, + "learning_rate": 7.476681924538073e-05, + "loss": 0.9971, + "step": 12587 + }, + { + "epoch": 1.2, + "grad_norm": 0.3353219771292112, + "learning_rate": 7.475151197704984e-05, + "loss": 1.0206, + "step": 12588 + }, + { + "epoch": 1.2, + "grad_norm": 0.31150211300882047, + "learning_rate": 7.473620534057825e-05, + "loss": 1.0476, + "step": 12589 + }, + { + "epoch": 1.2, + "grad_norm": 0.30249266485881804, + "learning_rate": 7.472089933634898e-05, + "loss": 1.0078, + "step": 12590 + }, + { + "epoch": 1.2, + "grad_norm": 0.3445272752103807, + "learning_rate": 7.47055939647451e-05, + "loss": 0.9945, + "step": 12591 + }, + { + "epoch": 1.2, + "grad_norm": 0.3085537215117119, + "learning_rate": 7.469028922614957e-05, + "loss": 1.0473, + "step": 12592 + }, + { + "epoch": 1.2, + "grad_norm": 0.31963740095008464, + "learning_rate": 7.467498512094554e-05, + "loss": 0.9542, + "step": 12593 + }, + { + "epoch": 1.2, + "grad_norm": 0.3007454488266599, + "learning_rate": 7.465968164951589e-05, + "loss": 0.8486, + "step": 12594 + }, + { + "epoch": 1.2, + "grad_norm": 0.2708076475963503, + "learning_rate": 7.464437881224364e-05, + "loss": 0.9964, + "step": 12595 + }, + { + "epoch": 1.21, + "grad_norm": 0.35532753186905025, + "learning_rate": 7.462907660951175e-05, + "loss": 1.0961, + "step": 12596 + }, + { + "epoch": 1.21, + "grad_norm": 0.3338203346694441, + "learning_rate": 7.461377504170316e-05, + "loss": 1.061, + "step": 12597 + }, + { + "epoch": 1.21, + "grad_norm": 0.2931894615617442, + "learning_rate": 7.459847410920077e-05, + "loss": 1.0244, + "step": 12598 + }, + { + "epoch": 1.21, + "grad_norm": 0.33897645109177493, + "learning_rate": 7.458317381238755e-05, + "loss": 0.9719, + "step": 12599 + }, + { + "epoch": 1.21, + "grad_norm": 0.3703199233094078, + "learning_rate": 7.45678741516464e-05, + "loss": 1.064, + "step": 12600 + }, + { + "epoch": 1.21, + "grad_norm": 0.3452669869297856, + "learning_rate": 7.455257512736018e-05, + "loss": 1.0458, + "step": 12601 + }, + { + "epoch": 1.21, + "grad_norm": 0.2940443427122791, + "learning_rate": 7.453727673991176e-05, + "loss": 0.9822, + "step": 12602 + }, + { + "epoch": 1.21, + "grad_norm": 0.32576722149527104, + "learning_rate": 7.452197898968396e-05, + "loss": 1.0804, + "step": 12603 + }, + { + "epoch": 1.21, + "grad_norm": 0.2969791219923735, + "learning_rate": 7.450668187705968e-05, + "loss": 0.9395, + "step": 12604 + }, + { + "epoch": 1.21, + "grad_norm": 0.3346734722141974, + "learning_rate": 7.449138540242169e-05, + "loss": 1.1349, + "step": 12605 + }, + { + "epoch": 1.21, + "grad_norm": 0.338188677814113, + "learning_rate": 7.447608956615279e-05, + "loss": 1.0706, + "step": 12606 + }, + { + "epoch": 1.21, + "grad_norm": 0.273803792378492, + "learning_rate": 7.446079436863584e-05, + "loss": 1.0917, + "step": 12607 + }, + { + "epoch": 1.21, + "grad_norm": 0.3121074747089549, + "learning_rate": 7.444549981025355e-05, + "loss": 0.9967, + "step": 12608 + }, + { + "epoch": 1.21, + "grad_norm": 0.27994607206950717, + "learning_rate": 7.44302058913887e-05, + "loss": 0.9487, + "step": 12609 + }, + { + "epoch": 1.21, + "grad_norm": 0.3034574049969598, + "learning_rate": 7.4414912612424e-05, + "loss": 0.8934, + "step": 12610 + }, + { + "epoch": 1.21, + "grad_norm": 0.3045804157874529, + "learning_rate": 7.439961997374219e-05, + "loss": 0.9892, + "step": 12611 + }, + { + "epoch": 1.21, + "grad_norm": 0.303470085618402, + "learning_rate": 7.438432797572595e-05, + "loss": 0.9942, + "step": 12612 + }, + { + "epoch": 1.21, + "grad_norm": 0.315693046501465, + "learning_rate": 7.436903661875803e-05, + "loss": 1.1001, + "step": 12613 + }, + { + "epoch": 1.21, + "grad_norm": 0.30294804323375646, + "learning_rate": 7.435374590322109e-05, + "loss": 1.1, + "step": 12614 + }, + { + "epoch": 1.21, + "grad_norm": 0.28971319593064354, + "learning_rate": 7.433845582949776e-05, + "loss": 0.9678, + "step": 12615 + }, + { + "epoch": 1.21, + "grad_norm": 0.31771450076369895, + "learning_rate": 7.43231663979707e-05, + "loss": 1.1042, + "step": 12616 + }, + { + "epoch": 1.21, + "grad_norm": 0.34385470192410883, + "learning_rate": 7.430787760902255e-05, + "loss": 1.1894, + "step": 12617 + }, + { + "epoch": 1.21, + "grad_norm": 0.33325462398950845, + "learning_rate": 7.429258946303592e-05, + "loss": 0.9978, + "step": 12618 + }, + { + "epoch": 1.21, + "grad_norm": 0.34471800884780596, + "learning_rate": 7.427730196039333e-05, + "loss": 1.0182, + "step": 12619 + }, + { + "epoch": 1.21, + "grad_norm": 0.30063737634957605, + "learning_rate": 7.426201510147748e-05, + "loss": 1.0861, + "step": 12620 + }, + { + "epoch": 1.21, + "grad_norm": 0.3241265377074341, + "learning_rate": 7.424672888667088e-05, + "loss": 0.8974, + "step": 12621 + }, + { + "epoch": 1.21, + "grad_norm": 0.32167342575700664, + "learning_rate": 7.423144331635608e-05, + "loss": 1.0186, + "step": 12622 + }, + { + "epoch": 1.21, + "grad_norm": 0.3068810223170261, + "learning_rate": 7.421615839091557e-05, + "loss": 0.9966, + "step": 12623 + }, + { + "epoch": 1.21, + "grad_norm": 0.31954358917544184, + "learning_rate": 7.420087411073193e-05, + "loss": 0.9589, + "step": 12624 + }, + { + "epoch": 1.21, + "grad_norm": 0.29290911780471596, + "learning_rate": 7.418559047618761e-05, + "loss": 1.0324, + "step": 12625 + }, + { + "epoch": 1.21, + "grad_norm": 0.2984667936176138, + "learning_rate": 7.417030748766512e-05, + "loss": 1.0274, + "step": 12626 + }, + { + "epoch": 1.21, + "grad_norm": 0.3272457526988126, + "learning_rate": 7.415502514554691e-05, + "loss": 1.0065, + "step": 12627 + }, + { + "epoch": 1.21, + "grad_norm": 0.2841819043417003, + "learning_rate": 7.413974345021546e-05, + "loss": 1.011, + "step": 12628 + }, + { + "epoch": 1.21, + "grad_norm": 0.3381593743015684, + "learning_rate": 7.412446240205317e-05, + "loss": 1.0319, + "step": 12629 + }, + { + "epoch": 1.21, + "grad_norm": 0.2759001947324678, + "learning_rate": 7.410918200144247e-05, + "loss": 0.941, + "step": 12630 + }, + { + "epoch": 1.21, + "grad_norm": 0.3104709471146378, + "learning_rate": 7.40939022487658e-05, + "loss": 0.9963, + "step": 12631 + }, + { + "epoch": 1.21, + "grad_norm": 0.27014867810375626, + "learning_rate": 7.407862314440542e-05, + "loss": 0.9204, + "step": 12632 + }, + { + "epoch": 1.21, + "grad_norm": 0.310544003933227, + "learning_rate": 7.406334468874385e-05, + "loss": 0.9951, + "step": 12633 + }, + { + "epoch": 1.21, + "grad_norm": 0.3264760090727911, + "learning_rate": 7.404806688216337e-05, + "loss": 1.0534, + "step": 12634 + }, + { + "epoch": 1.21, + "grad_norm": 0.27203173849419804, + "learning_rate": 7.403278972504634e-05, + "loss": 0.9886, + "step": 12635 + }, + { + "epoch": 1.21, + "grad_norm": 0.35428909229516625, + "learning_rate": 7.401751321777506e-05, + "loss": 0.9481, + "step": 12636 + }, + { + "epoch": 1.21, + "grad_norm": 0.2837269864017178, + "learning_rate": 7.400223736073185e-05, + "loss": 1.0568, + "step": 12637 + }, + { + "epoch": 1.21, + "grad_norm": 0.3371408986240263, + "learning_rate": 7.398696215429896e-05, + "loss": 1.061, + "step": 12638 + }, + { + "epoch": 1.21, + "grad_norm": 0.3065418184235235, + "learning_rate": 7.397168759885868e-05, + "loss": 0.9496, + "step": 12639 + }, + { + "epoch": 1.21, + "grad_norm": 0.29879085226817403, + "learning_rate": 7.39564136947933e-05, + "loss": 0.9942, + "step": 12640 + }, + { + "epoch": 1.21, + "grad_norm": 0.38567070302711, + "learning_rate": 7.394114044248505e-05, + "loss": 1.0371, + "step": 12641 + }, + { + "epoch": 1.21, + "grad_norm": 0.34401316635479706, + "learning_rate": 7.392586784231613e-05, + "loss": 1.0379, + "step": 12642 + }, + { + "epoch": 1.21, + "grad_norm": 0.3186073313120979, + "learning_rate": 7.391059589466875e-05, + "loss": 1.0138, + "step": 12643 + }, + { + "epoch": 1.21, + "grad_norm": 0.2951702091360404, + "learning_rate": 7.389532459992509e-05, + "loss": 0.8796, + "step": 12644 + }, + { + "epoch": 1.21, + "grad_norm": 0.32349612030125785, + "learning_rate": 7.388005395846735e-05, + "loss": 0.9865, + "step": 12645 + }, + { + "epoch": 1.21, + "grad_norm": 0.2850387600179255, + "learning_rate": 7.386478397067767e-05, + "loss": 1.0115, + "step": 12646 + }, + { + "epoch": 1.21, + "grad_norm": 0.3013542551268381, + "learning_rate": 7.38495146369382e-05, + "loss": 1.1475, + "step": 12647 + }, + { + "epoch": 1.21, + "grad_norm": 0.2557846733093818, + "learning_rate": 7.383424595763104e-05, + "loss": 0.9523, + "step": 12648 + }, + { + "epoch": 1.21, + "grad_norm": 0.28133045608742535, + "learning_rate": 7.381897793313836e-05, + "loss": 1.051, + "step": 12649 + }, + { + "epoch": 1.21, + "grad_norm": 0.30551177597779183, + "learning_rate": 7.380371056384219e-05, + "loss": 1.0187, + "step": 12650 + }, + { + "epoch": 1.21, + "grad_norm": 0.3183074130306196, + "learning_rate": 7.378844385012464e-05, + "loss": 1.0099, + "step": 12651 + }, + { + "epoch": 1.21, + "grad_norm": 0.28700674431484713, + "learning_rate": 7.377317779236776e-05, + "loss": 0.9993, + "step": 12652 + }, + { + "epoch": 1.21, + "grad_norm": 0.3151665935928956, + "learning_rate": 7.375791239095353e-05, + "loss": 0.9516, + "step": 12653 + }, + { + "epoch": 1.21, + "grad_norm": 0.32764345802009426, + "learning_rate": 7.374264764626408e-05, + "loss": 1.0453, + "step": 12654 + }, + { + "epoch": 1.21, + "grad_norm": 0.34131666992223836, + "learning_rate": 7.372738355868138e-05, + "loss": 1.0307, + "step": 12655 + }, + { + "epoch": 1.21, + "grad_norm": 0.31348072728988213, + "learning_rate": 7.371212012858743e-05, + "loss": 1.1565, + "step": 12656 + }, + { + "epoch": 1.21, + "grad_norm": 0.3273032943985443, + "learning_rate": 7.369685735636417e-05, + "loss": 1.0106, + "step": 12657 + }, + { + "epoch": 1.21, + "grad_norm": 0.32302486570716904, + "learning_rate": 7.368159524239363e-05, + "loss": 1.0463, + "step": 12658 + }, + { + "epoch": 1.21, + "grad_norm": 0.28565044396415, + "learning_rate": 7.366633378705763e-05, + "loss": 0.9733, + "step": 12659 + }, + { + "epoch": 1.21, + "grad_norm": 0.3139807824457875, + "learning_rate": 7.365107299073821e-05, + "loss": 1.0017, + "step": 12660 + }, + { + "epoch": 1.21, + "grad_norm": 0.3255359579579072, + "learning_rate": 7.363581285381726e-05, + "loss": 0.9736, + "step": 12661 + }, + { + "epoch": 1.21, + "grad_norm": 0.3544187177715832, + "learning_rate": 7.362055337667668e-05, + "loss": 1.0616, + "step": 12662 + }, + { + "epoch": 1.21, + "grad_norm": 0.2801974539059134, + "learning_rate": 7.360529455969831e-05, + "loss": 0.9914, + "step": 12663 + }, + { + "epoch": 1.21, + "grad_norm": 0.30190121900520017, + "learning_rate": 7.359003640326403e-05, + "loss": 1.0003, + "step": 12664 + }, + { + "epoch": 1.21, + "grad_norm": 0.33548796606435216, + "learning_rate": 7.357477890775569e-05, + "loss": 0.9993, + "step": 12665 + }, + { + "epoch": 1.21, + "grad_norm": 0.28758926830993725, + "learning_rate": 7.35595220735551e-05, + "loss": 1.049, + "step": 12666 + }, + { + "epoch": 1.21, + "grad_norm": 0.31898388109806525, + "learning_rate": 7.354426590104412e-05, + "loss": 1.0445, + "step": 12667 + }, + { + "epoch": 1.21, + "grad_norm": 0.26870707098795193, + "learning_rate": 7.352901039060447e-05, + "loss": 1.1359, + "step": 12668 + }, + { + "epoch": 1.21, + "grad_norm": 0.2836342120226928, + "learning_rate": 7.351375554261801e-05, + "loss": 0.9269, + "step": 12669 + }, + { + "epoch": 1.21, + "grad_norm": 0.30008783043854304, + "learning_rate": 7.349850135746645e-05, + "loss": 0.9753, + "step": 12670 + }, + { + "epoch": 1.21, + "grad_norm": 0.35123876787670427, + "learning_rate": 7.348324783553157e-05, + "loss": 0.9926, + "step": 12671 + }, + { + "epoch": 1.21, + "grad_norm": 0.3149054131931207, + "learning_rate": 7.346799497719506e-05, + "loss": 1.0741, + "step": 12672 + }, + { + "epoch": 1.21, + "grad_norm": 0.3456075987274667, + "learning_rate": 7.345274278283861e-05, + "loss": 1.0524, + "step": 12673 + }, + { + "epoch": 1.21, + "grad_norm": 0.3716050380925594, + "learning_rate": 7.343749125284401e-05, + "loss": 1.008, + "step": 12674 + }, + { + "epoch": 1.21, + "grad_norm": 0.32921699877929295, + "learning_rate": 7.342224038759286e-05, + "loss": 1.0442, + "step": 12675 + }, + { + "epoch": 1.21, + "grad_norm": 0.3043311371894201, + "learning_rate": 7.340699018746687e-05, + "loss": 1.0107, + "step": 12676 + }, + { + "epoch": 1.21, + "grad_norm": 0.3094352020291517, + "learning_rate": 7.339174065284766e-05, + "loss": 1.0913, + "step": 12677 + }, + { + "epoch": 1.21, + "grad_norm": 0.3580445915737288, + "learning_rate": 7.337649178411686e-05, + "loss": 1.0203, + "step": 12678 + }, + { + "epoch": 1.21, + "grad_norm": 0.30046338573250675, + "learning_rate": 7.33612435816561e-05, + "loss": 1.0695, + "step": 12679 + }, + { + "epoch": 1.21, + "grad_norm": 0.2918899583684976, + "learning_rate": 7.334599604584689e-05, + "loss": 1.0339, + "step": 12680 + }, + { + "epoch": 1.21, + "grad_norm": 0.2912602029574193, + "learning_rate": 7.333074917707094e-05, + "loss": 1.1508, + "step": 12681 + }, + { + "epoch": 1.21, + "grad_norm": 0.32425961625129374, + "learning_rate": 7.331550297570975e-05, + "loss": 1.0083, + "step": 12682 + }, + { + "epoch": 1.21, + "grad_norm": 0.2944293225594804, + "learning_rate": 7.330025744214487e-05, + "loss": 0.9999, + "step": 12683 + }, + { + "epoch": 1.21, + "grad_norm": 0.308453575552525, + "learning_rate": 7.328501257675783e-05, + "loss": 1.0308, + "step": 12684 + }, + { + "epoch": 1.21, + "grad_norm": 0.3098699208988919, + "learning_rate": 7.326976837993011e-05, + "loss": 1.0211, + "step": 12685 + }, + { + "epoch": 1.21, + "grad_norm": 0.28097005556013877, + "learning_rate": 7.325452485204326e-05, + "loss": 0.9159, + "step": 12686 + }, + { + "epoch": 1.21, + "grad_norm": 0.29311810361884305, + "learning_rate": 7.323928199347872e-05, + "loss": 1.1114, + "step": 12687 + }, + { + "epoch": 1.21, + "grad_norm": 0.33670439008356573, + "learning_rate": 7.322403980461798e-05, + "loss": 0.9749, + "step": 12688 + }, + { + "epoch": 1.21, + "grad_norm": 0.32240954524502036, + "learning_rate": 7.320879828584247e-05, + "loss": 0.9682, + "step": 12689 + }, + { + "epoch": 1.21, + "grad_norm": 0.30045288525179137, + "learning_rate": 7.319355743753362e-05, + "loss": 0.978, + "step": 12690 + }, + { + "epoch": 1.21, + "grad_norm": 0.31893323177278193, + "learning_rate": 7.317831726007285e-05, + "loss": 1.0157, + "step": 12691 + }, + { + "epoch": 1.21, + "grad_norm": 0.2701466667181898, + "learning_rate": 7.316307775384155e-05, + "loss": 0.9935, + "step": 12692 + }, + { + "epoch": 1.21, + "grad_norm": 0.3207543961992831, + "learning_rate": 7.314783891922103e-05, + "loss": 0.8961, + "step": 12693 + }, + { + "epoch": 1.21, + "grad_norm": 0.31004605776188393, + "learning_rate": 7.313260075659278e-05, + "loss": 1.0437, + "step": 12694 + }, + { + "epoch": 1.21, + "grad_norm": 0.3413541871661538, + "learning_rate": 7.31173632663381e-05, + "loss": 0.952, + "step": 12695 + }, + { + "epoch": 1.21, + "grad_norm": 0.3069793297077493, + "learning_rate": 7.310212644883828e-05, + "loss": 0.9804, + "step": 12696 + }, + { + "epoch": 1.21, + "grad_norm": 0.30983676964747997, + "learning_rate": 7.308689030447465e-05, + "loss": 0.9817, + "step": 12697 + }, + { + "epoch": 1.21, + "grad_norm": 0.31833103142493574, + "learning_rate": 7.307165483362849e-05, + "loss": 1.0457, + "step": 12698 + }, + { + "epoch": 1.21, + "grad_norm": 0.32454139037465585, + "learning_rate": 7.305642003668111e-05, + "loss": 0.9911, + "step": 12699 + }, + { + "epoch": 1.22, + "grad_norm": 0.28596633006528793, + "learning_rate": 7.30411859140137e-05, + "loss": 0.9297, + "step": 12700 + }, + { + "epoch": 1.22, + "grad_norm": 0.2900557176912113, + "learning_rate": 7.302595246600761e-05, + "loss": 1.1395, + "step": 12701 + }, + { + "epoch": 1.22, + "grad_norm": 0.3173473395308535, + "learning_rate": 7.3010719693044e-05, + "loss": 0.9986, + "step": 12702 + }, + { + "epoch": 1.22, + "grad_norm": 0.32574208304461094, + "learning_rate": 7.29954875955041e-05, + "loss": 0.9211, + "step": 12703 + }, + { + "epoch": 1.22, + "grad_norm": 0.32323458054377585, + "learning_rate": 7.29802561737691e-05, + "loss": 1.0276, + "step": 12704 + }, + { + "epoch": 1.22, + "grad_norm": 0.34438493658403874, + "learning_rate": 7.296502542822012e-05, + "loss": 1.0728, + "step": 12705 + }, + { + "epoch": 1.22, + "grad_norm": 0.31489508975120684, + "learning_rate": 7.294979535923843e-05, + "loss": 1.0841, + "step": 12706 + }, + { + "epoch": 1.22, + "grad_norm": 0.34299831784271073, + "learning_rate": 7.293456596720506e-05, + "loss": 1.0936, + "step": 12707 + }, + { + "epoch": 1.22, + "grad_norm": 0.34265398692334303, + "learning_rate": 7.291933725250122e-05, + "loss": 1.0364, + "step": 12708 + }, + { + "epoch": 1.22, + "grad_norm": 0.30561201743740757, + "learning_rate": 7.290410921550797e-05, + "loss": 0.9866, + "step": 12709 + }, + { + "epoch": 1.22, + "grad_norm": 0.30869676104129073, + "learning_rate": 7.288888185660643e-05, + "loss": 1.2066, + "step": 12710 + }, + { + "epoch": 1.22, + "grad_norm": 0.2979883922797523, + "learning_rate": 7.287365517617765e-05, + "loss": 1.0553, + "step": 12711 + }, + { + "epoch": 1.22, + "grad_norm": 0.2780248696017988, + "learning_rate": 7.285842917460272e-05, + "loss": 1.0278, + "step": 12712 + }, + { + "epoch": 1.22, + "grad_norm": 0.3198337761331097, + "learning_rate": 7.284320385226265e-05, + "loss": 1.0383, + "step": 12713 + }, + { + "epoch": 1.22, + "grad_norm": 0.29971801245228835, + "learning_rate": 7.28279792095384e-05, + "loss": 1.0873, + "step": 12714 + }, + { + "epoch": 1.22, + "grad_norm": 0.32814448832959653, + "learning_rate": 7.281275524681113e-05, + "loss": 1.1088, + "step": 12715 + }, + { + "epoch": 1.22, + "grad_norm": 0.2816327890873952, + "learning_rate": 7.279753196446173e-05, + "loss": 1.0025, + "step": 12716 + }, + { + "epoch": 1.22, + "grad_norm": 0.2948804316298728, + "learning_rate": 7.278230936287119e-05, + "loss": 0.9998, + "step": 12717 + }, + { + "epoch": 1.22, + "grad_norm": 0.2937068388917655, + "learning_rate": 7.276708744242047e-05, + "loss": 1.0168, + "step": 12718 + }, + { + "epoch": 1.22, + "grad_norm": 0.329083138696946, + "learning_rate": 7.275186620349048e-05, + "loss": 1.0208, + "step": 12719 + }, + { + "epoch": 1.22, + "grad_norm": 0.3045683871034369, + "learning_rate": 7.273664564646215e-05, + "loss": 0.988, + "step": 12720 + }, + { + "epoch": 1.22, + "grad_norm": 0.2746841759316994, + "learning_rate": 7.272142577171642e-05, + "loss": 0.9925, + "step": 12721 + }, + { + "epoch": 1.22, + "grad_norm": 0.42226398842727586, + "learning_rate": 7.270620657963417e-05, + "loss": 0.9973, + "step": 12722 + }, + { + "epoch": 1.22, + "grad_norm": 0.31888762544077415, + "learning_rate": 7.269098807059623e-05, + "loss": 0.9627, + "step": 12723 + }, + { + "epoch": 1.22, + "grad_norm": 0.3061771050718617, + "learning_rate": 7.267577024498348e-05, + "loss": 1.0382, + "step": 12724 + }, + { + "epoch": 1.22, + "grad_norm": 0.26596906242545715, + "learning_rate": 7.266055310317677e-05, + "loss": 0.9049, + "step": 12725 + }, + { + "epoch": 1.22, + "grad_norm": 0.34159521182384167, + "learning_rate": 7.264533664555688e-05, + "loss": 1.0226, + "step": 12726 + }, + { + "epoch": 1.22, + "grad_norm": 0.32132944631781635, + "learning_rate": 7.263012087250462e-05, + "loss": 1.1246, + "step": 12727 + }, + { + "epoch": 1.22, + "grad_norm": 0.30519705035336175, + "learning_rate": 7.26149057844008e-05, + "loss": 0.9644, + "step": 12728 + }, + { + "epoch": 1.22, + "grad_norm": 0.3080831384370028, + "learning_rate": 7.25996913816262e-05, + "loss": 1.0366, + "step": 12729 + }, + { + "epoch": 1.22, + "grad_norm": 0.29731646870420186, + "learning_rate": 7.258447766456152e-05, + "loss": 1.0241, + "step": 12730 + }, + { + "epoch": 1.22, + "grad_norm": 0.2725971542408783, + "learning_rate": 7.256926463358751e-05, + "loss": 1.0241, + "step": 12731 + }, + { + "epoch": 1.22, + "grad_norm": 0.28686420371796106, + "learning_rate": 7.255405228908491e-05, + "loss": 1.0193, + "step": 12732 + }, + { + "epoch": 1.22, + "grad_norm": 0.31951160873855816, + "learning_rate": 7.253884063143439e-05, + "loss": 1.0335, + "step": 12733 + }, + { + "epoch": 1.22, + "grad_norm": 0.2725511634029356, + "learning_rate": 7.252362966101663e-05, + "loss": 1.1956, + "step": 12734 + }, + { + "epoch": 1.22, + "grad_norm": 0.31948713048629, + "learning_rate": 7.250841937821231e-05, + "loss": 1.1195, + "step": 12735 + }, + { + "epoch": 1.22, + "grad_norm": 0.2970835797027157, + "learning_rate": 7.249320978340209e-05, + "loss": 1.1254, + "step": 12736 + }, + { + "epoch": 1.22, + "grad_norm": 0.28269459385274337, + "learning_rate": 7.247800087696658e-05, + "loss": 0.8624, + "step": 12737 + }, + { + "epoch": 1.22, + "grad_norm": 0.3371577822126246, + "learning_rate": 7.246279265928639e-05, + "loss": 1.0597, + "step": 12738 + }, + { + "epoch": 1.22, + "grad_norm": 0.33185572643408234, + "learning_rate": 7.244758513074214e-05, + "loss": 1.0439, + "step": 12739 + }, + { + "epoch": 1.22, + "grad_norm": 0.2904064019405541, + "learning_rate": 7.243237829171436e-05, + "loss": 1.0759, + "step": 12740 + }, + { + "epoch": 1.22, + "grad_norm": 0.315897784106347, + "learning_rate": 7.241717214258362e-05, + "loss": 1.0918, + "step": 12741 + }, + { + "epoch": 1.22, + "grad_norm": 0.32390974414136164, + "learning_rate": 7.240196668373051e-05, + "loss": 1.0578, + "step": 12742 + }, + { + "epoch": 1.22, + "grad_norm": 0.32248715562508884, + "learning_rate": 7.238676191553554e-05, + "loss": 0.9639, + "step": 12743 + }, + { + "epoch": 1.22, + "grad_norm": 0.31539009766117565, + "learning_rate": 7.237155783837921e-05, + "loss": 1.0557, + "step": 12744 + }, + { + "epoch": 1.22, + "grad_norm": 0.28998643673519425, + "learning_rate": 7.2356354452642e-05, + "loss": 1.0657, + "step": 12745 + }, + { + "epoch": 1.22, + "grad_norm": 0.2818517155499948, + "learning_rate": 7.234115175870439e-05, + "loss": 0.8769, + "step": 12746 + }, + { + "epoch": 1.22, + "grad_norm": 0.32644228447819806, + "learning_rate": 7.23259497569468e-05, + "loss": 0.9125, + "step": 12747 + }, + { + "epoch": 1.22, + "grad_norm": 0.33301765951360657, + "learning_rate": 7.231074844774976e-05, + "loss": 0.9738, + "step": 12748 + }, + { + "epoch": 1.22, + "grad_norm": 0.295704159779775, + "learning_rate": 7.229554783149364e-05, + "loss": 0.9872, + "step": 12749 + }, + { + "epoch": 1.22, + "grad_norm": 0.2925818281176169, + "learning_rate": 7.22803479085588e-05, + "loss": 1.1752, + "step": 12750 + }, + { + "epoch": 1.22, + "grad_norm": 0.3251643319161377, + "learning_rate": 7.226514867932573e-05, + "loss": 0.9534, + "step": 12751 + }, + { + "epoch": 1.22, + "grad_norm": 0.3054164052431106, + "learning_rate": 7.224995014417473e-05, + "loss": 1.1958, + "step": 12752 + }, + { + "epoch": 1.22, + "grad_norm": 0.3567533157608515, + "learning_rate": 7.223475230348618e-05, + "loss": 1.104, + "step": 12753 + }, + { + "epoch": 1.22, + "grad_norm": 0.32062259687723244, + "learning_rate": 7.221955515764034e-05, + "loss": 1.0662, + "step": 12754 + }, + { + "epoch": 1.22, + "grad_norm": 0.31204408011268053, + "learning_rate": 7.220435870701765e-05, + "loss": 0.9741, + "step": 12755 + }, + { + "epoch": 1.22, + "grad_norm": 0.34015714321740464, + "learning_rate": 7.218916295199834e-05, + "loss": 0.9548, + "step": 12756 + }, + { + "epoch": 1.22, + "grad_norm": 0.32669865204307386, + "learning_rate": 7.217396789296272e-05, + "loss": 1.0586, + "step": 12757 + }, + { + "epoch": 1.22, + "grad_norm": 0.34090345907622205, + "learning_rate": 7.215877353029103e-05, + "loss": 0.9981, + "step": 12758 + }, + { + "epoch": 1.22, + "grad_norm": 0.3060762600574691, + "learning_rate": 7.214357986436355e-05, + "loss": 0.9619, + "step": 12759 + }, + { + "epoch": 1.22, + "grad_norm": 0.2585406804057839, + "learning_rate": 7.212838689556047e-05, + "loss": 0.9239, + "step": 12760 + }, + { + "epoch": 1.22, + "grad_norm": 0.2972243203202676, + "learning_rate": 7.211319462426198e-05, + "loss": 0.966, + "step": 12761 + }, + { + "epoch": 1.22, + "grad_norm": 0.3111011483056727, + "learning_rate": 7.209800305084838e-05, + "loss": 1.0119, + "step": 12762 + }, + { + "epoch": 1.22, + "grad_norm": 0.3066118883081151, + "learning_rate": 7.208281217569977e-05, + "loss": 1.1021, + "step": 12763 + }, + { + "epoch": 1.22, + "grad_norm": 0.3554334876133222, + "learning_rate": 7.206762199919634e-05, + "loss": 1.0074, + "step": 12764 + }, + { + "epoch": 1.22, + "grad_norm": 0.29954588658507075, + "learning_rate": 7.205243252171822e-05, + "loss": 0.9337, + "step": 12765 + }, + { + "epoch": 1.22, + "grad_norm": 0.25779090755655476, + "learning_rate": 7.203724374364555e-05, + "loss": 0.9767, + "step": 12766 + }, + { + "epoch": 1.22, + "grad_norm": 0.33284033135571545, + "learning_rate": 7.202205566535843e-05, + "loss": 0.9572, + "step": 12767 + }, + { + "epoch": 1.22, + "grad_norm": 0.27604024622582757, + "learning_rate": 7.200686828723691e-05, + "loss": 0.98, + "step": 12768 + }, + { + "epoch": 1.22, + "grad_norm": 0.28558549592601873, + "learning_rate": 7.199168160966114e-05, + "loss": 1.0764, + "step": 12769 + }, + { + "epoch": 1.22, + "grad_norm": 0.34251377624103946, + "learning_rate": 7.197649563301114e-05, + "loss": 1.0308, + "step": 12770 + }, + { + "epoch": 1.22, + "grad_norm": 0.2870096581485938, + "learning_rate": 7.196131035766693e-05, + "loss": 0.9359, + "step": 12771 + }, + { + "epoch": 1.22, + "grad_norm": 0.2910567427642542, + "learning_rate": 7.194612578400857e-05, + "loss": 1.0634, + "step": 12772 + }, + { + "epoch": 1.22, + "grad_norm": 0.3308096661702366, + "learning_rate": 7.193094191241604e-05, + "loss": 0.971, + "step": 12773 + }, + { + "epoch": 1.22, + "grad_norm": 0.28159446515610626, + "learning_rate": 7.191575874326932e-05, + "loss": 1.0092, + "step": 12774 + }, + { + "epoch": 1.22, + "grad_norm": 0.3239785446867856, + "learning_rate": 7.190057627694837e-05, + "loss": 0.9327, + "step": 12775 + }, + { + "epoch": 1.22, + "grad_norm": 0.2853496402493381, + "learning_rate": 7.188539451383318e-05, + "loss": 0.9967, + "step": 12776 + }, + { + "epoch": 1.22, + "grad_norm": 0.3031671442946889, + "learning_rate": 7.187021345430367e-05, + "loss": 0.9484, + "step": 12777 + }, + { + "epoch": 1.22, + "grad_norm": 0.28230727611298667, + "learning_rate": 7.185503309873975e-05, + "loss": 1.012, + "step": 12778 + }, + { + "epoch": 1.22, + "grad_norm": 0.3169230851543999, + "learning_rate": 7.183985344752131e-05, + "loss": 1.0048, + "step": 12779 + }, + { + "epoch": 1.22, + "grad_norm": 0.32017968594325535, + "learning_rate": 7.182467450102824e-05, + "loss": 0.9304, + "step": 12780 + }, + { + "epoch": 1.22, + "grad_norm": 0.3095542998610912, + "learning_rate": 7.180949625964036e-05, + "loss": 1.0407, + "step": 12781 + }, + { + "epoch": 1.22, + "grad_norm": 0.2924169959827623, + "learning_rate": 7.179431872373759e-05, + "loss": 1.0497, + "step": 12782 + }, + { + "epoch": 1.22, + "grad_norm": 0.2898882882551537, + "learning_rate": 7.177914189369972e-05, + "loss": 1.0007, + "step": 12783 + }, + { + "epoch": 1.22, + "grad_norm": 0.31069697297215176, + "learning_rate": 7.176396576990656e-05, + "loss": 1.1283, + "step": 12784 + }, + { + "epoch": 1.22, + "grad_norm": 0.2844022927988444, + "learning_rate": 7.174879035273789e-05, + "loss": 1.0732, + "step": 12785 + }, + { + "epoch": 1.22, + "grad_norm": 0.28446633247835745, + "learning_rate": 7.173361564257351e-05, + "loss": 0.9356, + "step": 12786 + }, + { + "epoch": 1.22, + "grad_norm": 0.3424844325035409, + "learning_rate": 7.171844163979317e-05, + "loss": 0.9632, + "step": 12787 + }, + { + "epoch": 1.22, + "grad_norm": 0.33260893922622464, + "learning_rate": 7.170326834477655e-05, + "loss": 0.9791, + "step": 12788 + }, + { + "epoch": 1.22, + "grad_norm": 0.3232028885591371, + "learning_rate": 7.168809575790347e-05, + "loss": 1.1126, + "step": 12789 + }, + { + "epoch": 1.22, + "grad_norm": 0.38578456017231716, + "learning_rate": 7.167292387955358e-05, + "loss": 1.0655, + "step": 12790 + }, + { + "epoch": 1.22, + "grad_norm": 0.2932377742676138, + "learning_rate": 7.165775271010657e-05, + "loss": 1.1135, + "step": 12791 + }, + { + "epoch": 1.22, + "grad_norm": 0.2700077348962263, + "learning_rate": 7.164258224994211e-05, + "loss": 0.9769, + "step": 12792 + }, + { + "epoch": 1.22, + "grad_norm": 0.3114163181268787, + "learning_rate": 7.162741249943986e-05, + "loss": 0.9605, + "step": 12793 + }, + { + "epoch": 1.22, + "grad_norm": 0.28647067224044676, + "learning_rate": 7.161224345897945e-05, + "loss": 0.9748, + "step": 12794 + }, + { + "epoch": 1.22, + "grad_norm": 0.3149592882517595, + "learning_rate": 7.159707512894042e-05, + "loss": 1.0394, + "step": 12795 + }, + { + "epoch": 1.22, + "grad_norm": 0.3384699232951792, + "learning_rate": 7.158190750970251e-05, + "loss": 1.1901, + "step": 12796 + }, + { + "epoch": 1.22, + "grad_norm": 0.3465112413613725, + "learning_rate": 7.156674060164521e-05, + "loss": 1.0218, + "step": 12797 + }, + { + "epoch": 1.22, + "grad_norm": 0.31692049265341515, + "learning_rate": 7.15515744051481e-05, + "loss": 0.9661, + "step": 12798 + }, + { + "epoch": 1.22, + "grad_norm": 0.3234227315435233, + "learning_rate": 7.153640892059071e-05, + "loss": 0.9093, + "step": 12799 + }, + { + "epoch": 1.22, + "grad_norm": 0.31966382793817927, + "learning_rate": 7.15212441483526e-05, + "loss": 0.9978, + "step": 12800 + }, + { + "epoch": 1.22, + "grad_norm": 0.29391721735517395, + "learning_rate": 7.150608008881323e-05, + "loss": 1.0359, + "step": 12801 + }, + { + "epoch": 1.22, + "grad_norm": 0.2809792783926624, + "learning_rate": 7.149091674235208e-05, + "loss": 1.0169, + "step": 12802 + }, + { + "epoch": 1.22, + "grad_norm": 0.2818391239299371, + "learning_rate": 7.147575410934869e-05, + "loss": 0.9457, + "step": 12803 + }, + { + "epoch": 1.22, + "grad_norm": 0.2850996809533879, + "learning_rate": 7.146059219018248e-05, + "loss": 1.0191, + "step": 12804 + }, + { + "epoch": 1.23, + "grad_norm": 0.2979389377746218, + "learning_rate": 7.14454309852329e-05, + "loss": 1.0415, + "step": 12805 + }, + { + "epoch": 1.23, + "grad_norm": 0.3227096017034147, + "learning_rate": 7.143027049487934e-05, + "loss": 1.1384, + "step": 12806 + }, + { + "epoch": 1.23, + "grad_norm": 0.31823987510512514, + "learning_rate": 7.141511071950123e-05, + "loss": 1.0451, + "step": 12807 + }, + { + "epoch": 1.23, + "grad_norm": 0.30720143262379745, + "learning_rate": 7.139995165947789e-05, + "loss": 1.0845, + "step": 12808 + }, + { + "epoch": 1.23, + "grad_norm": 0.332063985087617, + "learning_rate": 7.138479331518877e-05, + "loss": 1.0663, + "step": 12809 + }, + { + "epoch": 1.23, + "grad_norm": 0.3258814601049584, + "learning_rate": 7.13696356870132e-05, + "loss": 0.9837, + "step": 12810 + }, + { + "epoch": 1.23, + "grad_norm": 0.29747936638728856, + "learning_rate": 7.135447877533048e-05, + "loss": 0.94, + "step": 12811 + }, + { + "epoch": 1.23, + "grad_norm": 0.34262178951745376, + "learning_rate": 7.133932258051992e-05, + "loss": 0.9995, + "step": 12812 + }, + { + "epoch": 1.23, + "grad_norm": 0.29741970665478756, + "learning_rate": 7.132416710296083e-05, + "loss": 1.017, + "step": 12813 + }, + { + "epoch": 1.23, + "grad_norm": 0.3077002191524436, + "learning_rate": 7.130901234303248e-05, + "loss": 0.9876, + "step": 12814 + }, + { + "epoch": 1.23, + "grad_norm": 0.3358545025008494, + "learning_rate": 7.129385830111412e-05, + "loss": 0.9762, + "step": 12815 + }, + { + "epoch": 1.23, + "grad_norm": 0.32362434515188787, + "learning_rate": 7.1278704977585e-05, + "loss": 0.9953, + "step": 12816 + }, + { + "epoch": 1.23, + "grad_norm": 0.30150706541697947, + "learning_rate": 7.126355237282433e-05, + "loss": 0.9734, + "step": 12817 + }, + { + "epoch": 1.23, + "grad_norm": 0.29911280559664766, + "learning_rate": 7.124840048721135e-05, + "loss": 1.0222, + "step": 12818 + }, + { + "epoch": 1.23, + "grad_norm": 0.2924865282420192, + "learning_rate": 7.12332493211252e-05, + "loss": 1.0488, + "step": 12819 + }, + { + "epoch": 1.23, + "grad_norm": 0.3163081013064063, + "learning_rate": 7.121809887494508e-05, + "loss": 0.9756, + "step": 12820 + }, + { + "epoch": 1.23, + "grad_norm": 0.3023274413077954, + "learning_rate": 7.120294914905013e-05, + "loss": 1.0519, + "step": 12821 + }, + { + "epoch": 1.23, + "grad_norm": 0.33310472494668686, + "learning_rate": 7.118780014381941e-05, + "loss": 0.9845, + "step": 12822 + }, + { + "epoch": 1.23, + "grad_norm": 0.32854467609071464, + "learning_rate": 7.117265185963215e-05, + "loss": 1.0946, + "step": 12823 + }, + { + "epoch": 1.23, + "grad_norm": 0.31710353932565094, + "learning_rate": 7.11575042968674e-05, + "loss": 1.0286, + "step": 12824 + }, + { + "epoch": 1.23, + "grad_norm": 0.3194403041482773, + "learning_rate": 7.114235745590424e-05, + "loss": 1.0737, + "step": 12825 + }, + { + "epoch": 1.23, + "grad_norm": 0.28522299248290933, + "learning_rate": 7.112721133712171e-05, + "loss": 1.1489, + "step": 12826 + }, + { + "epoch": 1.23, + "grad_norm": 0.3054921782348778, + "learning_rate": 7.111206594089887e-05, + "loss": 1.0294, + "step": 12827 + }, + { + "epoch": 1.23, + "grad_norm": 0.28960822719207824, + "learning_rate": 7.109692126761472e-05, + "loss": 1.0881, + "step": 12828 + }, + { + "epoch": 1.23, + "grad_norm": 0.3105250698817918, + "learning_rate": 7.108177731764824e-05, + "loss": 1.0487, + "step": 12829 + }, + { + "epoch": 1.23, + "grad_norm": 0.3042144344197343, + "learning_rate": 7.106663409137853e-05, + "loss": 1.0829, + "step": 12830 + }, + { + "epoch": 1.23, + "grad_norm": 0.30276343916184595, + "learning_rate": 7.105149158918445e-05, + "loss": 1.0799, + "step": 12831 + }, + { + "epoch": 1.23, + "grad_norm": 0.2878067317675511, + "learning_rate": 7.1036349811445e-05, + "loss": 0.9547, + "step": 12832 + }, + { + "epoch": 1.23, + "grad_norm": 0.29648436179440046, + "learning_rate": 7.102120875853908e-05, + "loss": 1.1063, + "step": 12833 + }, + { + "epoch": 1.23, + "grad_norm": 0.3207430856800112, + "learning_rate": 7.100606843084563e-05, + "loss": 1.0839, + "step": 12834 + }, + { + "epoch": 1.23, + "grad_norm": 0.25932261258859696, + "learning_rate": 7.099092882874357e-05, + "loss": 0.9986, + "step": 12835 + }, + { + "epoch": 1.23, + "grad_norm": 0.3345840411747824, + "learning_rate": 7.09757899526117e-05, + "loss": 1.0286, + "step": 12836 + }, + { + "epoch": 1.23, + "grad_norm": 0.30204605434139176, + "learning_rate": 7.096065180282893e-05, + "loss": 1.1782, + "step": 12837 + }, + { + "epoch": 1.23, + "grad_norm": 0.2645433780254072, + "learning_rate": 7.094551437977412e-05, + "loss": 0.9151, + "step": 12838 + }, + { + "epoch": 1.23, + "grad_norm": 0.29061389012011624, + "learning_rate": 7.093037768382608e-05, + "loss": 1.0708, + "step": 12839 + }, + { + "epoch": 1.23, + "grad_norm": 0.35047107956297124, + "learning_rate": 7.091524171536361e-05, + "loss": 1.0278, + "step": 12840 + }, + { + "epoch": 1.23, + "grad_norm": 0.2792610010348014, + "learning_rate": 7.090010647476548e-05, + "loss": 1.0231, + "step": 12841 + }, + { + "epoch": 1.23, + "grad_norm": 0.2826574217001319, + "learning_rate": 7.088497196241044e-05, + "loss": 1.0813, + "step": 12842 + }, + { + "epoch": 1.23, + "grad_norm": 0.3090155308667243, + "learning_rate": 7.086983817867732e-05, + "loss": 1.114, + "step": 12843 + }, + { + "epoch": 1.23, + "grad_norm": 0.3042335718890938, + "learning_rate": 7.085470512394481e-05, + "loss": 0.9796, + "step": 12844 + }, + { + "epoch": 1.23, + "grad_norm": 0.3377681613933235, + "learning_rate": 7.083957279859162e-05, + "loss": 0.9566, + "step": 12845 + }, + { + "epoch": 1.23, + "grad_norm": 0.2701925933747922, + "learning_rate": 7.082444120299644e-05, + "loss": 1.0148, + "step": 12846 + }, + { + "epoch": 1.23, + "grad_norm": 0.33366173205023547, + "learning_rate": 7.080931033753797e-05, + "loss": 1.0172, + "step": 12847 + }, + { + "epoch": 1.23, + "grad_norm": 0.2684589102915182, + "learning_rate": 7.079418020259484e-05, + "loss": 0.9886, + "step": 12848 + }, + { + "epoch": 1.23, + "grad_norm": 0.3033365220684362, + "learning_rate": 7.077905079854566e-05, + "loss": 1.082, + "step": 12849 + }, + { + "epoch": 1.23, + "grad_norm": 0.3423038287636813, + "learning_rate": 7.076392212576916e-05, + "loss": 1.0437, + "step": 12850 + }, + { + "epoch": 1.23, + "grad_norm": 0.33870474473117973, + "learning_rate": 7.074879418464386e-05, + "loss": 1.039, + "step": 12851 + }, + { + "epoch": 1.23, + "grad_norm": 0.36640105832126824, + "learning_rate": 7.07336669755484e-05, + "loss": 0.8851, + "step": 12852 + }, + { + "epoch": 1.23, + "grad_norm": 0.30561958587030286, + "learning_rate": 7.071854049886127e-05, + "loss": 1.1217, + "step": 12853 + }, + { + "epoch": 1.23, + "grad_norm": 0.30390240720857264, + "learning_rate": 7.070341475496109e-05, + "loss": 1.1275, + "step": 12854 + }, + { + "epoch": 1.23, + "grad_norm": 0.3122214668841066, + "learning_rate": 7.068828974422637e-05, + "loss": 1.0194, + "step": 12855 + }, + { + "epoch": 1.23, + "grad_norm": 0.2786145466642042, + "learning_rate": 7.06731654670356e-05, + "loss": 1.0524, + "step": 12856 + }, + { + "epoch": 1.23, + "grad_norm": 0.28476731608134775, + "learning_rate": 7.06580419237673e-05, + "loss": 1.1226, + "step": 12857 + }, + { + "epoch": 1.23, + "grad_norm": 0.2994457862517522, + "learning_rate": 7.064291911479997e-05, + "loss": 0.9424, + "step": 12858 + }, + { + "epoch": 1.23, + "grad_norm": 0.29376903480698935, + "learning_rate": 7.062779704051203e-05, + "loss": 1.0241, + "step": 12859 + }, + { + "epoch": 1.23, + "grad_norm": 0.330713822109918, + "learning_rate": 7.061267570128192e-05, + "loss": 1.1163, + "step": 12860 + }, + { + "epoch": 1.23, + "grad_norm": 0.30144001438837353, + "learning_rate": 7.059755509748809e-05, + "loss": 0.9836, + "step": 12861 + }, + { + "epoch": 1.23, + "grad_norm": 0.31678287234377844, + "learning_rate": 7.058243522950891e-05, + "loss": 0.9856, + "step": 12862 + }, + { + "epoch": 1.23, + "grad_norm": 0.26184485009418096, + "learning_rate": 7.056731609772272e-05, + "loss": 1.022, + "step": 12863 + }, + { + "epoch": 1.23, + "grad_norm": 0.33546316179058894, + "learning_rate": 7.055219770250801e-05, + "loss": 1.0067, + "step": 12864 + }, + { + "epoch": 1.23, + "grad_norm": 0.2664926799194354, + "learning_rate": 7.053708004424304e-05, + "loss": 0.9291, + "step": 12865 + }, + { + "epoch": 1.23, + "grad_norm": 0.2961959011384652, + "learning_rate": 7.052196312330616e-05, + "loss": 1.0345, + "step": 12866 + }, + { + "epoch": 1.23, + "grad_norm": 0.3077662843149288, + "learning_rate": 7.05068469400757e-05, + "loss": 0.9814, + "step": 12867 + }, + { + "epoch": 1.23, + "grad_norm": 0.28974843150114854, + "learning_rate": 7.049173149492991e-05, + "loss": 0.974, + "step": 12868 + }, + { + "epoch": 1.23, + "grad_norm": 0.3080288309743229, + "learning_rate": 7.047661678824706e-05, + "loss": 0.9669, + "step": 12869 + }, + { + "epoch": 1.23, + "grad_norm": 0.27186428240245103, + "learning_rate": 7.046150282040547e-05, + "loss": 1.0147, + "step": 12870 + }, + { + "epoch": 1.23, + "grad_norm": 0.3178473028236003, + "learning_rate": 7.044638959178332e-05, + "loss": 1.0134, + "step": 12871 + }, + { + "epoch": 1.23, + "grad_norm": 0.33144144366219686, + "learning_rate": 7.043127710275885e-05, + "loss": 1.0603, + "step": 12872 + }, + { + "epoch": 1.23, + "grad_norm": 0.30342828498498453, + "learning_rate": 7.041616535371026e-05, + "loss": 1.1458, + "step": 12873 + }, + { + "epoch": 1.23, + "grad_norm": 0.49065755550557194, + "learning_rate": 7.04010543450157e-05, + "loss": 0.9901, + "step": 12874 + }, + { + "epoch": 1.23, + "grad_norm": 0.2712623971811776, + "learning_rate": 7.038594407705339e-05, + "loss": 1.0221, + "step": 12875 + }, + { + "epoch": 1.23, + "grad_norm": 0.3468256353003316, + "learning_rate": 7.03708345502014e-05, + "loss": 0.9951, + "step": 12876 + }, + { + "epoch": 1.23, + "grad_norm": 0.3197134530463721, + "learning_rate": 7.035572576483793e-05, + "loss": 1.0066, + "step": 12877 + }, + { + "epoch": 1.23, + "grad_norm": 0.30531905324626407, + "learning_rate": 7.034061772134103e-05, + "loss": 1.1065, + "step": 12878 + }, + { + "epoch": 1.23, + "grad_norm": 0.297657504645266, + "learning_rate": 7.032551042008883e-05, + "loss": 1.0802, + "step": 12879 + }, + { + "epoch": 1.23, + "grad_norm": 0.301680156940239, + "learning_rate": 7.031040386145938e-05, + "loss": 1.023, + "step": 12880 + }, + { + "epoch": 1.23, + "grad_norm": 0.2997706509140674, + "learning_rate": 7.029529804583074e-05, + "loss": 1.025, + "step": 12881 + }, + { + "epoch": 1.23, + "grad_norm": 0.29671911940311707, + "learning_rate": 7.028019297358093e-05, + "loss": 0.9737, + "step": 12882 + }, + { + "epoch": 1.23, + "grad_norm": 0.30491560852062927, + "learning_rate": 7.026508864508793e-05, + "loss": 1.1795, + "step": 12883 + }, + { + "epoch": 1.23, + "grad_norm": 0.2786233380625429, + "learning_rate": 7.024998506072981e-05, + "loss": 1.1415, + "step": 12884 + }, + { + "epoch": 1.23, + "grad_norm": 0.31141511007306105, + "learning_rate": 7.023488222088453e-05, + "loss": 1.025, + "step": 12885 + }, + { + "epoch": 1.23, + "grad_norm": 0.3084255582868199, + "learning_rate": 7.021978012593e-05, + "loss": 1.0495, + "step": 12886 + }, + { + "epoch": 1.23, + "grad_norm": 0.31468524861079, + "learning_rate": 7.020467877624422e-05, + "loss": 1.0314, + "step": 12887 + }, + { + "epoch": 1.23, + "grad_norm": 0.29424301485383475, + "learning_rate": 7.018957817220505e-05, + "loss": 1.07, + "step": 12888 + }, + { + "epoch": 1.23, + "grad_norm": 0.31132143217539177, + "learning_rate": 7.017447831419044e-05, + "loss": 0.9208, + "step": 12889 + }, + { + "epoch": 1.23, + "grad_norm": 0.3259440865030501, + "learning_rate": 7.01593792025782e-05, + "loss": 0.9154, + "step": 12890 + }, + { + "epoch": 1.23, + "grad_norm": 0.2884891812674482, + "learning_rate": 7.014428083774629e-05, + "loss": 1.0024, + "step": 12891 + }, + { + "epoch": 1.23, + "grad_norm": 0.3263369886575254, + "learning_rate": 7.012918322007254e-05, + "loss": 0.9482, + "step": 12892 + }, + { + "epoch": 1.23, + "grad_norm": 0.291157825536298, + "learning_rate": 7.011408634993473e-05, + "loss": 1.0374, + "step": 12893 + }, + { + "epoch": 1.23, + "grad_norm": 0.3050633325895519, + "learning_rate": 7.009899022771067e-05, + "loss": 0.9741, + "step": 12894 + }, + { + "epoch": 1.23, + "grad_norm": 0.288534863952808, + "learning_rate": 7.008389485377821e-05, + "loss": 0.8529, + "step": 12895 + }, + { + "epoch": 1.23, + "grad_norm": 0.3106460234330395, + "learning_rate": 7.006880022851508e-05, + "loss": 1.004, + "step": 12896 + }, + { + "epoch": 1.23, + "grad_norm": 0.31446574291000234, + "learning_rate": 7.0053706352299e-05, + "loss": 1.018, + "step": 12897 + }, + { + "epoch": 1.23, + "grad_norm": 0.3048324102774257, + "learning_rate": 7.003861322550775e-05, + "loss": 1.0074, + "step": 12898 + }, + { + "epoch": 1.23, + "grad_norm": 0.30356109183794866, + "learning_rate": 7.002352084851904e-05, + "loss": 1.0376, + "step": 12899 + }, + { + "epoch": 1.23, + "grad_norm": 0.3021797239131508, + "learning_rate": 7.000842922171058e-05, + "loss": 0.9563, + "step": 12900 + }, + { + "epoch": 1.23, + "grad_norm": 0.2969648936394849, + "learning_rate": 6.999333834546003e-05, + "loss": 0.984, + "step": 12901 + }, + { + "epoch": 1.23, + "grad_norm": 0.3345067207557389, + "learning_rate": 6.997824822014503e-05, + "loss": 1.036, + "step": 12902 + }, + { + "epoch": 1.23, + "grad_norm": 0.2842262158106226, + "learning_rate": 6.99631588461432e-05, + "loss": 1.0615, + "step": 12903 + }, + { + "epoch": 1.23, + "grad_norm": 0.28952814400989707, + "learning_rate": 6.994807022383223e-05, + "loss": 1.0825, + "step": 12904 + }, + { + "epoch": 1.23, + "grad_norm": 0.309814766686036, + "learning_rate": 6.993298235358969e-05, + "loss": 1.1242, + "step": 12905 + }, + { + "epoch": 1.23, + "grad_norm": 0.31014573093966963, + "learning_rate": 6.991789523579319e-05, + "loss": 1.0502, + "step": 12906 + }, + { + "epoch": 1.23, + "grad_norm": 0.3436593877209916, + "learning_rate": 6.990280887082024e-05, + "loss": 1.0936, + "step": 12907 + }, + { + "epoch": 1.23, + "grad_norm": 0.35485774036973666, + "learning_rate": 6.988772325904843e-05, + "loss": 1.0344, + "step": 12908 + }, + { + "epoch": 1.24, + "grad_norm": 0.3178238567087694, + "learning_rate": 6.987263840085526e-05, + "loss": 1.1573, + "step": 12909 + }, + { + "epoch": 1.24, + "grad_norm": 0.32338839883771314, + "learning_rate": 6.985755429661821e-05, + "loss": 1.0217, + "step": 12910 + }, + { + "epoch": 1.24, + "grad_norm": 0.2943936094804158, + "learning_rate": 6.984247094671485e-05, + "loss": 1.0479, + "step": 12911 + }, + { + "epoch": 1.24, + "grad_norm": 0.36409737005433424, + "learning_rate": 6.98273883515226e-05, + "loss": 1.0729, + "step": 12912 + }, + { + "epoch": 1.24, + "grad_norm": 0.2962094189138959, + "learning_rate": 6.981230651141894e-05, + "loss": 0.9703, + "step": 12913 + }, + { + "epoch": 1.24, + "grad_norm": 0.3095698380656648, + "learning_rate": 6.979722542678127e-05, + "loss": 1.0471, + "step": 12914 + }, + { + "epoch": 1.24, + "grad_norm": 0.3379883700242694, + "learning_rate": 6.978214509798699e-05, + "loss": 0.9649, + "step": 12915 + }, + { + "epoch": 1.24, + "grad_norm": 0.33227482064672853, + "learning_rate": 6.976706552541354e-05, + "loss": 1.0397, + "step": 12916 + }, + { + "epoch": 1.24, + "grad_norm": 0.3423528875125707, + "learning_rate": 6.975198670943825e-05, + "loss": 0.9931, + "step": 12917 + }, + { + "epoch": 1.24, + "grad_norm": 0.3133921897150818, + "learning_rate": 6.973690865043854e-05, + "loss": 1.0794, + "step": 12918 + }, + { + "epoch": 1.24, + "grad_norm": 0.29561861924987204, + "learning_rate": 6.972183134879169e-05, + "loss": 1.0626, + "step": 12919 + }, + { + "epoch": 1.24, + "grad_norm": 0.3311250363928045, + "learning_rate": 6.970675480487506e-05, + "loss": 1.1235, + "step": 12920 + }, + { + "epoch": 1.24, + "grad_norm": 0.30123920194262577, + "learning_rate": 6.96916790190659e-05, + "loss": 0.9478, + "step": 12921 + }, + { + "epoch": 1.24, + "grad_norm": 0.29681009819758514, + "learning_rate": 6.967660399174155e-05, + "loss": 1.006, + "step": 12922 + }, + { + "epoch": 1.24, + "grad_norm": 0.3380029093560818, + "learning_rate": 6.966152972327923e-05, + "loss": 0.959, + "step": 12923 + }, + { + "epoch": 1.24, + "grad_norm": 0.2935849745722062, + "learning_rate": 6.964645621405616e-05, + "loss": 0.8451, + "step": 12924 + }, + { + "epoch": 1.24, + "grad_norm": 0.2917306622663801, + "learning_rate": 6.963138346444964e-05, + "loss": 0.9386, + "step": 12925 + }, + { + "epoch": 1.24, + "grad_norm": 0.2974374395696289, + "learning_rate": 6.961631147483685e-05, + "loss": 0.9229, + "step": 12926 + }, + { + "epoch": 1.24, + "grad_norm": 0.32152140771652304, + "learning_rate": 6.960124024559493e-05, + "loss": 1.0356, + "step": 12927 + }, + { + "epoch": 1.24, + "grad_norm": 0.3312406008675425, + "learning_rate": 6.95861697771011e-05, + "loss": 0.9461, + "step": 12928 + }, + { + "epoch": 1.24, + "grad_norm": 0.3142169688192605, + "learning_rate": 6.957110006973249e-05, + "loss": 0.9499, + "step": 12929 + }, + { + "epoch": 1.24, + "grad_norm": 0.277686184685768, + "learning_rate": 6.955603112386618e-05, + "loss": 0.9586, + "step": 12930 + }, + { + "epoch": 1.24, + "grad_norm": 0.31337022914165463, + "learning_rate": 6.954096293987935e-05, + "loss": 1.0029, + "step": 12931 + }, + { + "epoch": 1.24, + "grad_norm": 0.2953145037967198, + "learning_rate": 6.952589551814908e-05, + "loss": 1.0401, + "step": 12932 + }, + { + "epoch": 1.24, + "grad_norm": 0.3031839546970206, + "learning_rate": 6.95108288590524e-05, + "loss": 1.0054, + "step": 12933 + }, + { + "epoch": 1.24, + "grad_norm": 0.3342249554447079, + "learning_rate": 6.949576296296643e-05, + "loss": 0.8775, + "step": 12934 + }, + { + "epoch": 1.24, + "grad_norm": 0.30743131867638746, + "learning_rate": 6.948069783026813e-05, + "loss": 1.1789, + "step": 12935 + }, + { + "epoch": 1.24, + "grad_norm": 0.3010100287286379, + "learning_rate": 6.946563346133455e-05, + "loss": 0.9281, + "step": 12936 + }, + { + "epoch": 1.24, + "grad_norm": 0.31498321591819406, + "learning_rate": 6.945056985654267e-05, + "loss": 0.924, + "step": 12937 + }, + { + "epoch": 1.24, + "grad_norm": 0.3373811187788921, + "learning_rate": 6.94355070162695e-05, + "loss": 0.9905, + "step": 12938 + }, + { + "epoch": 1.24, + "grad_norm": 0.30828100932315, + "learning_rate": 6.942044494089196e-05, + "loss": 1.0166, + "step": 12939 + }, + { + "epoch": 1.24, + "grad_norm": 0.3551684184996189, + "learning_rate": 6.940538363078701e-05, + "loss": 1.0128, + "step": 12940 + }, + { + "epoch": 1.24, + "grad_norm": 0.27099571885275114, + "learning_rate": 6.939032308633159e-05, + "loss": 1.0082, + "step": 12941 + }, + { + "epoch": 1.24, + "grad_norm": 0.32232845482149886, + "learning_rate": 6.937526330790256e-05, + "loss": 1.0506, + "step": 12942 + }, + { + "epoch": 1.24, + "grad_norm": 0.29003303165527483, + "learning_rate": 6.93602042958768e-05, + "loss": 0.9706, + "step": 12943 + }, + { + "epoch": 1.24, + "grad_norm": 0.3302398479895162, + "learning_rate": 6.934514605063116e-05, + "loss": 1.0496, + "step": 12944 + }, + { + "epoch": 1.24, + "grad_norm": 0.2799300058952538, + "learning_rate": 6.933008857254255e-05, + "loss": 1.007, + "step": 12945 + }, + { + "epoch": 1.24, + "grad_norm": 0.31508573458512257, + "learning_rate": 6.931503186198774e-05, + "loss": 1.1495, + "step": 12946 + }, + { + "epoch": 1.24, + "grad_norm": 0.33761404798416156, + "learning_rate": 6.929997591934354e-05, + "loss": 1.0925, + "step": 12947 + }, + { + "epoch": 1.24, + "grad_norm": 0.3934018167117008, + "learning_rate": 6.928492074498672e-05, + "loss": 1.074, + "step": 12948 + }, + { + "epoch": 1.24, + "grad_norm": 0.36032971394924507, + "learning_rate": 6.926986633929409e-05, + "loss": 1.0323, + "step": 12949 + }, + { + "epoch": 1.24, + "grad_norm": 0.3093786759779727, + "learning_rate": 6.925481270264236e-05, + "loss": 0.9305, + "step": 12950 + }, + { + "epoch": 1.24, + "grad_norm": 0.3423729499939024, + "learning_rate": 6.923975983540821e-05, + "loss": 1.0079, + "step": 12951 + }, + { + "epoch": 1.24, + "grad_norm": 0.3219853224119802, + "learning_rate": 6.922470773796845e-05, + "loss": 1.1014, + "step": 12952 + }, + { + "epoch": 1.24, + "grad_norm": 0.2919791284653352, + "learning_rate": 6.920965641069974e-05, + "loss": 1.0576, + "step": 12953 + }, + { + "epoch": 1.24, + "grad_norm": 0.32424176742094146, + "learning_rate": 6.919460585397871e-05, + "loss": 1.0116, + "step": 12954 + }, + { + "epoch": 1.24, + "grad_norm": 0.39779324281728284, + "learning_rate": 6.917955606818204e-05, + "loss": 1.1059, + "step": 12955 + }, + { + "epoch": 1.24, + "grad_norm": 0.31626807030704573, + "learning_rate": 6.916450705368631e-05, + "loss": 1.0393, + "step": 12956 + }, + { + "epoch": 1.24, + "grad_norm": 0.29686452042426115, + "learning_rate": 6.91494588108682e-05, + "loss": 1.0796, + "step": 12957 + }, + { + "epoch": 1.24, + "grad_norm": 0.24997834208129394, + "learning_rate": 6.913441134010426e-05, + "loss": 0.9884, + "step": 12958 + }, + { + "epoch": 1.24, + "grad_norm": 0.3469728865691748, + "learning_rate": 6.911936464177108e-05, + "loss": 1.1667, + "step": 12959 + }, + { + "epoch": 1.24, + "grad_norm": 0.2933903543227457, + "learning_rate": 6.91043187162452e-05, + "loss": 1.059, + "step": 12960 + }, + { + "epoch": 1.24, + "grad_norm": 0.28387567698216587, + "learning_rate": 6.90892735639032e-05, + "loss": 1.0557, + "step": 12961 + }, + { + "epoch": 1.24, + "grad_norm": 0.2653266369566171, + "learning_rate": 6.907422918512153e-05, + "loss": 0.9759, + "step": 12962 + }, + { + "epoch": 1.24, + "grad_norm": 0.2808086273686549, + "learning_rate": 6.905918558027671e-05, + "loss": 1.064, + "step": 12963 + }, + { + "epoch": 1.24, + "grad_norm": 0.29212305326149374, + "learning_rate": 6.904414274974518e-05, + "loss": 1.0641, + "step": 12964 + }, + { + "epoch": 1.24, + "grad_norm": 0.3015521692882197, + "learning_rate": 6.902910069390348e-05, + "loss": 0.9554, + "step": 12965 + }, + { + "epoch": 1.24, + "grad_norm": 0.3469454704103203, + "learning_rate": 6.9014059413128e-05, + "loss": 1.031, + "step": 12966 + }, + { + "epoch": 1.24, + "grad_norm": 0.30867586174998257, + "learning_rate": 6.899901890779514e-05, + "loss": 0.9834, + "step": 12967 + }, + { + "epoch": 1.24, + "grad_norm": 0.3022755900528362, + "learning_rate": 6.898397917828133e-05, + "loss": 0.9626, + "step": 12968 + }, + { + "epoch": 1.24, + "grad_norm": 0.26705231602520063, + "learning_rate": 6.896894022496293e-05, + "loss": 1.0378, + "step": 12969 + }, + { + "epoch": 1.24, + "grad_norm": 0.3171906671220136, + "learning_rate": 6.89539020482163e-05, + "loss": 1.1245, + "step": 12970 + }, + { + "epoch": 1.24, + "grad_norm": 0.30790941281033, + "learning_rate": 6.893886464841775e-05, + "loss": 1.089, + "step": 12971 + }, + { + "epoch": 1.24, + "grad_norm": 0.2965793164950096, + "learning_rate": 6.892382802594367e-05, + "loss": 1.0312, + "step": 12972 + }, + { + "epoch": 1.24, + "grad_norm": 0.31692275163087436, + "learning_rate": 6.890879218117032e-05, + "loss": 0.9607, + "step": 12973 + }, + { + "epoch": 1.24, + "grad_norm": 0.30500875697025115, + "learning_rate": 6.889375711447399e-05, + "loss": 1.0103, + "step": 12974 + }, + { + "epoch": 1.24, + "grad_norm": 0.3093215037429503, + "learning_rate": 6.887872282623094e-05, + "loss": 0.9803, + "step": 12975 + }, + { + "epoch": 1.24, + "grad_norm": 0.3515236358775855, + "learning_rate": 6.886368931681742e-05, + "loss": 1.0182, + "step": 12976 + }, + { + "epoch": 1.24, + "grad_norm": 0.3258919601695083, + "learning_rate": 6.88486565866096e-05, + "loss": 1.0374, + "step": 12977 + }, + { + "epoch": 1.24, + "grad_norm": 0.30327519567816896, + "learning_rate": 6.883362463598373e-05, + "loss": 1.1874, + "step": 12978 + }, + { + "epoch": 1.24, + "grad_norm": 0.32030392537222707, + "learning_rate": 6.881859346531602e-05, + "loss": 1.0311, + "step": 12979 + }, + { + "epoch": 1.24, + "grad_norm": 0.3348282064814355, + "learning_rate": 6.880356307498261e-05, + "loss": 1.0009, + "step": 12980 + }, + { + "epoch": 1.24, + "grad_norm": 0.28205199521036534, + "learning_rate": 6.878853346535962e-05, + "loss": 0.944, + "step": 12981 + }, + { + "epoch": 1.24, + "grad_norm": 0.3029350562319963, + "learning_rate": 6.877350463682321e-05, + "loss": 0.931, + "step": 12982 + }, + { + "epoch": 1.24, + "grad_norm": 0.31025309329783823, + "learning_rate": 6.875847658974947e-05, + "loss": 1.1085, + "step": 12983 + }, + { + "epoch": 1.24, + "grad_norm": 0.33689656975767707, + "learning_rate": 6.874344932451449e-05, + "loss": 1.0016, + "step": 12984 + }, + { + "epoch": 1.24, + "grad_norm": 0.3126560418395297, + "learning_rate": 6.87284228414943e-05, + "loss": 0.9523, + "step": 12985 + }, + { + "epoch": 1.24, + "grad_norm": 0.3038828294098562, + "learning_rate": 6.871339714106502e-05, + "loss": 1.0921, + "step": 12986 + }, + { + "epoch": 1.24, + "grad_norm": 0.33650628901521507, + "learning_rate": 6.869837222360262e-05, + "loss": 1.1398, + "step": 12987 + }, + { + "epoch": 1.24, + "grad_norm": 0.29083775906426323, + "learning_rate": 6.868334808948316e-05, + "loss": 0.9054, + "step": 12988 + }, + { + "epoch": 1.24, + "grad_norm": 0.3402948163288892, + "learning_rate": 6.866832473908256e-05, + "loss": 0.9682, + "step": 12989 + }, + { + "epoch": 1.24, + "grad_norm": 0.31098875703261053, + "learning_rate": 6.865330217277685e-05, + "loss": 1.0661, + "step": 12990 + }, + { + "epoch": 1.24, + "grad_norm": 0.31346854808626423, + "learning_rate": 6.863828039094191e-05, + "loss": 1.0374, + "step": 12991 + }, + { + "epoch": 1.24, + "grad_norm": 0.281560599053753, + "learning_rate": 6.862325939395376e-05, + "loss": 1.0591, + "step": 12992 + }, + { + "epoch": 1.24, + "grad_norm": 0.31938723621958387, + "learning_rate": 6.860823918218823e-05, + "loss": 1.1253, + "step": 12993 + }, + { + "epoch": 1.24, + "grad_norm": 0.29549871895225766, + "learning_rate": 6.859321975602128e-05, + "loss": 1.0521, + "step": 12994 + }, + { + "epoch": 1.24, + "grad_norm": 0.32993288890164413, + "learning_rate": 6.857820111582874e-05, + "loss": 0.9558, + "step": 12995 + }, + { + "epoch": 1.24, + "grad_norm": 0.3085447233599797, + "learning_rate": 6.856318326198645e-05, + "loss": 1.047, + "step": 12996 + }, + { + "epoch": 1.24, + "grad_norm": 0.368126687731009, + "learning_rate": 6.854816619487024e-05, + "loss": 1.1702, + "step": 12997 + }, + { + "epoch": 1.24, + "grad_norm": 0.29132304884747195, + "learning_rate": 6.853314991485595e-05, + "loss": 1.0209, + "step": 12998 + }, + { + "epoch": 1.24, + "grad_norm": 0.37241075069213675, + "learning_rate": 6.851813442231936e-05, + "loss": 1.1053, + "step": 12999 + }, + { + "epoch": 1.24, + "grad_norm": 0.3275720490666176, + "learning_rate": 6.850311971763625e-05, + "loss": 0.9859, + "step": 13000 + }, + { + "epoch": 1.24, + "grad_norm": 0.29727998742904743, + "learning_rate": 6.848810580118235e-05, + "loss": 1.095, + "step": 13001 + }, + { + "epoch": 1.24, + "grad_norm": 0.3154667926296613, + "learning_rate": 6.847309267333341e-05, + "loss": 1.0238, + "step": 13002 + }, + { + "epoch": 1.24, + "grad_norm": 0.2904023029821885, + "learning_rate": 6.845808033446517e-05, + "loss": 1.0185, + "step": 13003 + }, + { + "epoch": 1.24, + "grad_norm": 0.28845568583790404, + "learning_rate": 6.844306878495327e-05, + "loss": 0.983, + "step": 13004 + }, + { + "epoch": 1.24, + "grad_norm": 0.3258389816944525, + "learning_rate": 6.842805802517337e-05, + "loss": 1.07, + "step": 13005 + }, + { + "epoch": 1.24, + "grad_norm": 0.27846995728299445, + "learning_rate": 6.84130480555012e-05, + "loss": 0.9465, + "step": 13006 + }, + { + "epoch": 1.24, + "grad_norm": 0.30427389646511294, + "learning_rate": 6.839803887631236e-05, + "loss": 0.9068, + "step": 13007 + }, + { + "epoch": 1.24, + "grad_norm": 0.31593428659572387, + "learning_rate": 6.838303048798244e-05, + "loss": 0.8974, + "step": 13008 + }, + { + "epoch": 1.24, + "grad_norm": 0.27544113300597706, + "learning_rate": 6.836802289088708e-05, + "loss": 1.0792, + "step": 13009 + }, + { + "epoch": 1.24, + "grad_norm": 0.3172296104000656, + "learning_rate": 6.835301608540181e-05, + "loss": 1.1059, + "step": 13010 + }, + { + "epoch": 1.24, + "grad_norm": 0.3152889869463746, + "learning_rate": 6.833801007190221e-05, + "loss": 0.9937, + "step": 13011 + }, + { + "epoch": 1.24, + "grad_norm": 0.303578806734777, + "learning_rate": 6.832300485076376e-05, + "loss": 0.9686, + "step": 13012 + }, + { + "epoch": 1.24, + "grad_norm": 0.3349212496513893, + "learning_rate": 6.830800042236207e-05, + "loss": 1.0408, + "step": 13013 + }, + { + "epoch": 1.25, + "grad_norm": 0.28939574070782365, + "learning_rate": 6.829299678707258e-05, + "loss": 1.0869, + "step": 13014 + }, + { + "epoch": 1.25, + "grad_norm": 0.32079348805444724, + "learning_rate": 6.827799394527077e-05, + "loss": 0.8248, + "step": 13015 + }, + { + "epoch": 1.25, + "grad_norm": 0.31894095737178735, + "learning_rate": 6.82629918973321e-05, + "loss": 1.0749, + "step": 13016 + }, + { + "epoch": 1.25, + "grad_norm": 0.35248874210851755, + "learning_rate": 6.824799064363201e-05, + "loss": 1.0903, + "step": 13017 + }, + { + "epoch": 1.25, + "grad_norm": 0.30057719002115635, + "learning_rate": 6.823299018454588e-05, + "loss": 1.0519, + "step": 13018 + }, + { + "epoch": 1.25, + "grad_norm": 0.2984566500099061, + "learning_rate": 6.821799052044914e-05, + "loss": 1.0689, + "step": 13019 + }, + { + "epoch": 1.25, + "grad_norm": 0.2958290348196392, + "learning_rate": 6.820299165171717e-05, + "loss": 1.0159, + "step": 13020 + }, + { + "epoch": 1.25, + "grad_norm": 0.3681125489161926, + "learning_rate": 6.818799357872533e-05, + "loss": 0.9837, + "step": 13021 + }, + { + "epoch": 1.25, + "grad_norm": 0.3057689455069618, + "learning_rate": 6.817299630184893e-05, + "loss": 1.0484, + "step": 13022 + }, + { + "epoch": 1.25, + "grad_norm": 0.28390185717231736, + "learning_rate": 6.815799982146331e-05, + "loss": 1.0318, + "step": 13023 + }, + { + "epoch": 1.25, + "grad_norm": 0.2949114488422249, + "learning_rate": 6.814300413794377e-05, + "loss": 0.9794, + "step": 13024 + }, + { + "epoch": 1.25, + "grad_norm": 0.3121888821421138, + "learning_rate": 6.812800925166554e-05, + "loss": 1.0512, + "step": 13025 + }, + { + "epoch": 1.25, + "grad_norm": 0.3287379179488328, + "learning_rate": 6.811301516300391e-05, + "loss": 1.0345, + "step": 13026 + }, + { + "epoch": 1.25, + "grad_norm": 0.339869490874344, + "learning_rate": 6.809802187233414e-05, + "loss": 0.991, + "step": 13027 + }, + { + "epoch": 1.25, + "grad_norm": 0.341408152601276, + "learning_rate": 6.808302938003141e-05, + "loss": 1.057, + "step": 13028 + }, + { + "epoch": 1.25, + "grad_norm": 0.2878191237195913, + "learning_rate": 6.806803768647094e-05, + "loss": 1.0783, + "step": 13029 + }, + { + "epoch": 1.25, + "grad_norm": 0.2977570823041321, + "learning_rate": 6.80530467920279e-05, + "loss": 0.9734, + "step": 13030 + }, + { + "epoch": 1.25, + "grad_norm": 0.29081670466126136, + "learning_rate": 6.803805669707744e-05, + "loss": 1.1133, + "step": 13031 + }, + { + "epoch": 1.25, + "grad_norm": 0.32950963267312533, + "learning_rate": 6.802306740199464e-05, + "loss": 1.0221, + "step": 13032 + }, + { + "epoch": 1.25, + "grad_norm": 0.3176922722390941, + "learning_rate": 6.800807890715475e-05, + "loss": 1.0578, + "step": 13033 + }, + { + "epoch": 1.25, + "grad_norm": 0.28714044984496045, + "learning_rate": 6.799309121293275e-05, + "loss": 1.0669, + "step": 13034 + }, + { + "epoch": 1.25, + "grad_norm": 0.31439143610601467, + "learning_rate": 6.79781043197038e-05, + "loss": 0.8188, + "step": 13035 + }, + { + "epoch": 1.25, + "grad_norm": 0.3183169770709285, + "learning_rate": 6.79631182278429e-05, + "loss": 0.9677, + "step": 13036 + }, + { + "epoch": 1.25, + "grad_norm": 0.36800984007028514, + "learning_rate": 6.794813293772509e-05, + "loss": 1.1014, + "step": 13037 + }, + { + "epoch": 1.25, + "grad_norm": 0.3035487885802878, + "learning_rate": 6.793314844972537e-05, + "loss": 1.067, + "step": 13038 + }, + { + "epoch": 1.25, + "grad_norm": 0.2838836067338343, + "learning_rate": 6.791816476421877e-05, + "loss": 1.0028, + "step": 13039 + }, + { + "epoch": 1.25, + "grad_norm": 0.32150946098234195, + "learning_rate": 6.790318188158029e-05, + "loss": 1.0501, + "step": 13040 + }, + { + "epoch": 1.25, + "grad_norm": 0.30527958998487065, + "learning_rate": 6.788819980218485e-05, + "loss": 1.0788, + "step": 13041 + }, + { + "epoch": 1.25, + "grad_norm": 0.32063433647991246, + "learning_rate": 6.787321852640738e-05, + "loss": 1.017, + "step": 13042 + }, + { + "epoch": 1.25, + "grad_norm": 0.3093373586181076, + "learning_rate": 6.785823805462281e-05, + "loss": 1.0889, + "step": 13043 + }, + { + "epoch": 1.25, + "grad_norm": 0.2659071901723031, + "learning_rate": 6.784325838720605e-05, + "loss": 1.0552, + "step": 13044 + }, + { + "epoch": 1.25, + "grad_norm": 0.3028978359906059, + "learning_rate": 6.782827952453194e-05, + "loss": 0.9846, + "step": 13045 + }, + { + "epoch": 1.25, + "grad_norm": 0.3096084317057199, + "learning_rate": 6.781330146697533e-05, + "loss": 0.9886, + "step": 13046 + }, + { + "epoch": 1.25, + "grad_norm": 0.28321011587726924, + "learning_rate": 6.779832421491111e-05, + "loss": 1.0657, + "step": 13047 + }, + { + "epoch": 1.25, + "grad_norm": 0.27389421279275905, + "learning_rate": 6.778334776871407e-05, + "loss": 0.951, + "step": 13048 + }, + { + "epoch": 1.25, + "grad_norm": 0.27187850929399177, + "learning_rate": 6.776837212875899e-05, + "loss": 1.021, + "step": 13049 + }, + { + "epoch": 1.25, + "grad_norm": 0.3092818371287998, + "learning_rate": 6.775339729542067e-05, + "loss": 0.9605, + "step": 13050 + }, + { + "epoch": 1.25, + "grad_norm": 0.28734269155021097, + "learning_rate": 6.773842326907384e-05, + "loss": 1.158, + "step": 13051 + }, + { + "epoch": 1.25, + "grad_norm": 0.31511039746165515, + "learning_rate": 6.77234500500932e-05, + "loss": 0.9713, + "step": 13052 + }, + { + "epoch": 1.25, + "grad_norm": 0.3211482927491413, + "learning_rate": 6.770847763885355e-05, + "loss": 0.9783, + "step": 13053 + }, + { + "epoch": 1.25, + "grad_norm": 0.33419503292304487, + "learning_rate": 6.769350603572955e-05, + "loss": 1.0441, + "step": 13054 + }, + { + "epoch": 1.25, + "grad_norm": 0.2866515790623548, + "learning_rate": 6.767853524109585e-05, + "loss": 1.05, + "step": 13055 + }, + { + "epoch": 1.25, + "grad_norm": 0.2837191991173485, + "learning_rate": 6.766356525532713e-05, + "loss": 0.9757, + "step": 13056 + }, + { + "epoch": 1.25, + "grad_norm": 0.29499609222281936, + "learning_rate": 6.764859607879802e-05, + "loss": 1.0169, + "step": 13057 + }, + { + "epoch": 1.25, + "grad_norm": 0.30361440687181057, + "learning_rate": 6.76336277118831e-05, + "loss": 1.0071, + "step": 13058 + }, + { + "epoch": 1.25, + "grad_norm": 0.34846546918522836, + "learning_rate": 6.761866015495697e-05, + "loss": 1.0386, + "step": 13059 + }, + { + "epoch": 1.25, + "grad_norm": 0.3215343882146052, + "learning_rate": 6.760369340839426e-05, + "loss": 1.0652, + "step": 13060 + }, + { + "epoch": 1.25, + "grad_norm": 0.3385572234604098, + "learning_rate": 6.758872747256947e-05, + "loss": 1.1262, + "step": 13061 + }, + { + "epoch": 1.25, + "grad_norm": 0.24167702669161023, + "learning_rate": 6.757376234785715e-05, + "loss": 1.0145, + "step": 13062 + }, + { + "epoch": 1.25, + "grad_norm": 0.3295403792453386, + "learning_rate": 6.755879803463179e-05, + "loss": 1.0901, + "step": 13063 + }, + { + "epoch": 1.25, + "grad_norm": 0.3619302957829145, + "learning_rate": 6.754383453326792e-05, + "loss": 1.0484, + "step": 13064 + }, + { + "epoch": 1.25, + "grad_norm": 0.28735873958470753, + "learning_rate": 6.752887184413997e-05, + "loss": 0.8727, + "step": 13065 + }, + { + "epoch": 1.25, + "eval_loss": 1.1270205974578857, + "eval_runtime": 4231.0355, + "eval_samples_per_second": 19.763, + "eval_steps_per_second": 2.471, + "step": 13065 + }, + { + "epoch": 1.25, + "grad_norm": 0.34334165664434835, + "learning_rate": 6.751390996762241e-05, + "loss": 1.0174, + "step": 13066 + }, + { + "epoch": 1.25, + "grad_norm": 0.3257373236171967, + "learning_rate": 6.749894890408966e-05, + "loss": 0.9352, + "step": 13067 + }, + { + "epoch": 1.25, + "grad_norm": 0.3156729906680264, + "learning_rate": 6.748398865391616e-05, + "loss": 1.0791, + "step": 13068 + }, + { + "epoch": 1.25, + "grad_norm": 0.3497806839064356, + "learning_rate": 6.746902921747629e-05, + "loss": 1.1097, + "step": 13069 + }, + { + "epoch": 1.25, + "grad_norm": 0.3186760135205117, + "learning_rate": 6.745407059514439e-05, + "loss": 1.0213, + "step": 13070 + }, + { + "epoch": 1.25, + "grad_norm": 0.29754122666307026, + "learning_rate": 6.743911278729485e-05, + "loss": 0.9889, + "step": 13071 + }, + { + "epoch": 1.25, + "grad_norm": 0.33152997305950527, + "learning_rate": 6.742415579430196e-05, + "loss": 1.0333, + "step": 13072 + }, + { + "epoch": 1.25, + "grad_norm": 0.3150743428858849, + "learning_rate": 6.740919961654e-05, + "loss": 1.0305, + "step": 13073 + }, + { + "epoch": 1.25, + "grad_norm": 0.3256220216887421, + "learning_rate": 6.739424425438336e-05, + "loss": 1.0095, + "step": 13074 + }, + { + "epoch": 1.25, + "grad_norm": 0.3659560418264322, + "learning_rate": 6.737928970820624e-05, + "loss": 1.0276, + "step": 13075 + }, + { + "epoch": 1.25, + "grad_norm": 0.3105052581935409, + "learning_rate": 6.736433597838289e-05, + "loss": 0.9975, + "step": 13076 + }, + { + "epoch": 1.25, + "grad_norm": 0.30650164880320846, + "learning_rate": 6.734938306528755e-05, + "loss": 0.8458, + "step": 13077 + }, + { + "epoch": 1.25, + "grad_norm": 0.2818924662432329, + "learning_rate": 6.733443096929442e-05, + "loss": 0.9995, + "step": 13078 + }, + { + "epoch": 1.25, + "grad_norm": 0.2843750208310862, + "learning_rate": 6.73194796907777e-05, + "loss": 0.8233, + "step": 13079 + }, + { + "epoch": 1.25, + "grad_norm": 0.2677018163385261, + "learning_rate": 6.730452923011148e-05, + "loss": 1.0314, + "step": 13080 + }, + { + "epoch": 1.25, + "grad_norm": 0.33598995691097605, + "learning_rate": 6.728957958767e-05, + "loss": 1.0681, + "step": 13081 + }, + { + "epoch": 1.25, + "grad_norm": 0.291429007194965, + "learning_rate": 6.727463076382737e-05, + "loss": 1.0008, + "step": 13082 + }, + { + "epoch": 1.25, + "grad_norm": 0.322182178948345, + "learning_rate": 6.725968275895762e-05, + "loss": 0.9433, + "step": 13083 + }, + { + "epoch": 1.25, + "grad_norm": 0.3025031190017297, + "learning_rate": 6.724473557343493e-05, + "loss": 1.0388, + "step": 13084 + }, + { + "epoch": 1.25, + "grad_norm": 0.3006730508815951, + "learning_rate": 6.722978920763332e-05, + "loss": 1.0221, + "step": 13085 + }, + { + "epoch": 1.25, + "grad_norm": 0.32339068488799366, + "learning_rate": 6.72148436619268e-05, + "loss": 1.1101, + "step": 13086 + }, + { + "epoch": 1.25, + "grad_norm": 0.3141696749560235, + "learning_rate": 6.719989893668941e-05, + "loss": 1.0523, + "step": 13087 + }, + { + "epoch": 1.25, + "grad_norm": 0.2926680233300492, + "learning_rate": 6.71849550322952e-05, + "loss": 0.9787, + "step": 13088 + }, + { + "epoch": 1.25, + "grad_norm": 0.2864215163736861, + "learning_rate": 6.717001194911812e-05, + "loss": 1.1273, + "step": 13089 + }, + { + "epoch": 1.25, + "grad_norm": 0.29894818210133156, + "learning_rate": 6.715506968753212e-05, + "loss": 0.9817, + "step": 13090 + }, + { + "epoch": 1.25, + "grad_norm": 0.3030293742867469, + "learning_rate": 6.714012824791114e-05, + "loss": 0.9994, + "step": 13091 + }, + { + "epoch": 1.25, + "grad_norm": 0.3155685917170279, + "learning_rate": 6.71251876306291e-05, + "loss": 1.0036, + "step": 13092 + }, + { + "epoch": 1.25, + "grad_norm": 0.30785279017885175, + "learning_rate": 6.711024783605986e-05, + "loss": 1.0258, + "step": 13093 + }, + { + "epoch": 1.25, + "grad_norm": 0.33944212088754044, + "learning_rate": 6.709530886457738e-05, + "loss": 1.1419, + "step": 13094 + }, + { + "epoch": 1.25, + "grad_norm": 0.2978007674421392, + "learning_rate": 6.708037071655548e-05, + "loss": 1.0515, + "step": 13095 + }, + { + "epoch": 1.25, + "grad_norm": 0.34151153111562627, + "learning_rate": 6.706543339236798e-05, + "loss": 1.1082, + "step": 13096 + }, + { + "epoch": 1.25, + "grad_norm": 0.3184647943244512, + "learning_rate": 6.705049689238872e-05, + "loss": 1.1515, + "step": 13097 + }, + { + "epoch": 1.25, + "grad_norm": 0.3293873721026734, + "learning_rate": 6.703556121699148e-05, + "loss": 0.9804, + "step": 13098 + }, + { + "epoch": 1.25, + "grad_norm": 0.3552888503680203, + "learning_rate": 6.702062636655004e-05, + "loss": 1.0776, + "step": 13099 + }, + { + "epoch": 1.25, + "grad_norm": 0.33161282887662397, + "learning_rate": 6.70056923414381e-05, + "loss": 1.0061, + "step": 13100 + }, + { + "epoch": 1.25, + "grad_norm": 0.31949511904691497, + "learning_rate": 6.699075914202949e-05, + "loss": 1.0558, + "step": 13101 + }, + { + "epoch": 1.25, + "grad_norm": 0.320961589133692, + "learning_rate": 6.697582676869788e-05, + "loss": 1.1106, + "step": 13102 + }, + { + "epoch": 1.25, + "grad_norm": 0.33352365988628585, + "learning_rate": 6.696089522181696e-05, + "loss": 1.064, + "step": 13103 + }, + { + "epoch": 1.25, + "grad_norm": 0.3162559298097819, + "learning_rate": 6.694596450176038e-05, + "loss": 1.0224, + "step": 13104 + }, + { + "epoch": 1.25, + "grad_norm": 0.30852905212727266, + "learning_rate": 6.693103460890184e-05, + "loss": 0.9235, + "step": 13105 + }, + { + "epoch": 1.25, + "grad_norm": 0.2968521100452835, + "learning_rate": 6.691610554361494e-05, + "loss": 0.991, + "step": 13106 + }, + { + "epoch": 1.25, + "grad_norm": 0.31364456382889216, + "learning_rate": 6.690117730627328e-05, + "loss": 1.1531, + "step": 13107 + }, + { + "epoch": 1.25, + "grad_norm": 0.3359769123837669, + "learning_rate": 6.688624989725044e-05, + "loss": 1.0059, + "step": 13108 + }, + { + "epoch": 1.25, + "grad_norm": 0.2851701775773068, + "learning_rate": 6.687132331692003e-05, + "loss": 0.9848, + "step": 13109 + }, + { + "epoch": 1.25, + "grad_norm": 0.3189991411469636, + "learning_rate": 6.685639756565558e-05, + "loss": 0.9593, + "step": 13110 + }, + { + "epoch": 1.25, + "grad_norm": 0.2862527035548171, + "learning_rate": 6.684147264383061e-05, + "loss": 1.0756, + "step": 13111 + }, + { + "epoch": 1.25, + "grad_norm": 0.33660591640224374, + "learning_rate": 6.682654855181863e-05, + "loss": 1.0146, + "step": 13112 + }, + { + "epoch": 1.25, + "grad_norm": 0.2885258497158104, + "learning_rate": 6.681162528999309e-05, + "loss": 0.9532, + "step": 13113 + }, + { + "epoch": 1.25, + "grad_norm": 0.2897049912989281, + "learning_rate": 6.67967028587275e-05, + "loss": 0.981, + "step": 13114 + }, + { + "epoch": 1.25, + "grad_norm": 0.2946594935646359, + "learning_rate": 6.678178125839532e-05, + "loss": 1.0276, + "step": 13115 + }, + { + "epoch": 1.25, + "grad_norm": 0.2603491879150714, + "learning_rate": 6.676686048936991e-05, + "loss": 1.0467, + "step": 13116 + }, + { + "epoch": 1.25, + "grad_norm": 0.31548079173526045, + "learning_rate": 6.675194055202472e-05, + "loss": 1.0579, + "step": 13117 + }, + { + "epoch": 1.26, + "grad_norm": 0.2775870408570143, + "learning_rate": 6.673702144673311e-05, + "loss": 1.015, + "step": 13118 + }, + { + "epoch": 1.26, + "grad_norm": 0.2990275984523671, + "learning_rate": 6.672210317386844e-05, + "loss": 1.1151, + "step": 13119 + }, + { + "epoch": 1.26, + "grad_norm": 0.3291697180051336, + "learning_rate": 6.670718573380401e-05, + "loss": 0.8656, + "step": 13120 + }, + { + "epoch": 1.26, + "grad_norm": 0.2984594102560302, + "learning_rate": 6.669226912691324e-05, + "loss": 1.0361, + "step": 13121 + }, + { + "epoch": 1.26, + "grad_norm": 0.2982972484161112, + "learning_rate": 6.667735335356934e-05, + "loss": 1.0323, + "step": 13122 + }, + { + "epoch": 1.26, + "grad_norm": 0.30150493528952005, + "learning_rate": 6.666243841414564e-05, + "loss": 1.0207, + "step": 13123 + }, + { + "epoch": 1.26, + "grad_norm": 0.3193623762920771, + "learning_rate": 6.664752430901535e-05, + "loss": 1.0363, + "step": 13124 + }, + { + "epoch": 1.26, + "grad_norm": 0.3186107821270777, + "learning_rate": 6.663261103855171e-05, + "loss": 0.9583, + "step": 13125 + }, + { + "epoch": 1.26, + "grad_norm": 0.3064115020519285, + "learning_rate": 6.661769860312799e-05, + "loss": 1.0646, + "step": 13126 + }, + { + "epoch": 1.26, + "grad_norm": 0.3030835308641411, + "learning_rate": 6.660278700311731e-05, + "loss": 0.9979, + "step": 13127 + }, + { + "epoch": 1.26, + "grad_norm": 0.3097112524102825, + "learning_rate": 6.658787623889286e-05, + "loss": 0.9336, + "step": 13128 + }, + { + "epoch": 1.26, + "grad_norm": 0.3144354896470203, + "learning_rate": 6.657296631082784e-05, + "loss": 0.9731, + "step": 13129 + }, + { + "epoch": 1.26, + "grad_norm": 0.3564814419532164, + "learning_rate": 6.655805721929535e-05, + "loss": 1.2013, + "step": 13130 + }, + { + "epoch": 1.26, + "grad_norm": 0.26866122472736137, + "learning_rate": 6.654314896466848e-05, + "loss": 0.9897, + "step": 13131 + }, + { + "epoch": 1.26, + "grad_norm": 0.3410658703367055, + "learning_rate": 6.652824154732035e-05, + "loss": 1.0399, + "step": 13132 + }, + { + "epoch": 1.26, + "grad_norm": 0.31413509883347607, + "learning_rate": 6.6513334967624e-05, + "loss": 1.018, + "step": 13133 + }, + { + "epoch": 1.26, + "grad_norm": 0.3172305815538136, + "learning_rate": 6.649842922595246e-05, + "loss": 1.0529, + "step": 13134 + }, + { + "epoch": 1.26, + "grad_norm": 0.30637587897488944, + "learning_rate": 6.648352432267881e-05, + "loss": 0.9933, + "step": 13135 + }, + { + "epoch": 1.26, + "grad_norm": 0.289662810218198, + "learning_rate": 6.646862025817604e-05, + "loss": 1.0215, + "step": 13136 + }, + { + "epoch": 1.26, + "grad_norm": 0.3085650997973449, + "learning_rate": 6.645371703281711e-05, + "loss": 0.9511, + "step": 13137 + }, + { + "epoch": 1.26, + "grad_norm": 0.3228386142906535, + "learning_rate": 6.6438814646975e-05, + "loss": 0.9935, + "step": 13138 + }, + { + "epoch": 1.26, + "grad_norm": 0.2990172468230154, + "learning_rate": 6.642391310102265e-05, + "loss": 1.0419, + "step": 13139 + }, + { + "epoch": 1.26, + "grad_norm": 0.3104557122874441, + "learning_rate": 6.640901239533296e-05, + "loss": 1.0889, + "step": 13140 + }, + { + "epoch": 1.26, + "grad_norm": 0.3028259798198168, + "learning_rate": 6.639411253027883e-05, + "loss": 0.9745, + "step": 13141 + }, + { + "epoch": 1.26, + "grad_norm": 0.2764985950898903, + "learning_rate": 6.637921350623317e-05, + "loss": 0.9509, + "step": 13142 + }, + { + "epoch": 1.26, + "grad_norm": 0.2997359220216105, + "learning_rate": 6.636431532356884e-05, + "loss": 1.0054, + "step": 13143 + }, + { + "epoch": 1.26, + "grad_norm": 0.2792374027047244, + "learning_rate": 6.634941798265866e-05, + "loss": 1.0527, + "step": 13144 + }, + { + "epoch": 1.26, + "grad_norm": 0.3292651408642156, + "learning_rate": 6.633452148387542e-05, + "loss": 1.0884, + "step": 13145 + }, + { + "epoch": 1.26, + "grad_norm": 0.27921620123598884, + "learning_rate": 6.631962582759195e-05, + "loss": 1.107, + "step": 13146 + }, + { + "epoch": 1.26, + "grad_norm": 0.3153803098431158, + "learning_rate": 6.6304731014181e-05, + "loss": 1.0225, + "step": 13147 + }, + { + "epoch": 1.26, + "grad_norm": 0.30387360374517297, + "learning_rate": 6.628983704401535e-05, + "loss": 1.0162, + "step": 13148 + }, + { + "epoch": 1.26, + "grad_norm": 0.30591833557294396, + "learning_rate": 6.627494391746768e-05, + "loss": 0.9066, + "step": 13149 + }, + { + "epoch": 1.26, + "grad_norm": 0.25773761698767156, + "learning_rate": 6.626005163491076e-05, + "loss": 0.9625, + "step": 13150 + }, + { + "epoch": 1.26, + "grad_norm": 0.3150222243571564, + "learning_rate": 6.624516019671727e-05, + "loss": 1.0725, + "step": 13151 + }, + { + "epoch": 1.26, + "grad_norm": 0.35417683225648416, + "learning_rate": 6.623026960325984e-05, + "loss": 0.9673, + "step": 13152 + }, + { + "epoch": 1.26, + "grad_norm": 0.32154600960742113, + "learning_rate": 6.621537985491116e-05, + "loss": 1.0562, + "step": 13153 + }, + { + "epoch": 1.26, + "grad_norm": 0.2903230060183063, + "learning_rate": 6.620049095204377e-05, + "loss": 0.9988, + "step": 13154 + }, + { + "epoch": 1.26, + "grad_norm": 0.29100544237776266, + "learning_rate": 6.618560289503039e-05, + "loss": 0.925, + "step": 13155 + }, + { + "epoch": 1.26, + "grad_norm": 0.3564623355507773, + "learning_rate": 6.617071568424353e-05, + "loss": 1.0546, + "step": 13156 + }, + { + "epoch": 1.26, + "grad_norm": 0.31178251876153784, + "learning_rate": 6.615582932005579e-05, + "loss": 1.0626, + "step": 13157 + }, + { + "epoch": 1.26, + "grad_norm": 0.3272460774266918, + "learning_rate": 6.614094380283969e-05, + "loss": 0.9884, + "step": 13158 + }, + { + "epoch": 1.26, + "grad_norm": 0.302577092793733, + "learning_rate": 6.612605913296774e-05, + "loss": 1.0018, + "step": 13159 + }, + { + "epoch": 1.26, + "grad_norm": 0.25953002421902105, + "learning_rate": 6.611117531081246e-05, + "loss": 0.953, + "step": 13160 + }, + { + "epoch": 1.26, + "grad_norm": 0.31520930317757356, + "learning_rate": 6.609629233674627e-05, + "loss": 0.919, + "step": 13161 + }, + { + "epoch": 1.26, + "grad_norm": 0.3468131719918124, + "learning_rate": 6.608141021114171e-05, + "loss": 1.0301, + "step": 13162 + }, + { + "epoch": 1.26, + "grad_norm": 0.2912283900833401, + "learning_rate": 6.606652893437118e-05, + "loss": 0.9637, + "step": 13163 + }, + { + "epoch": 1.26, + "grad_norm": 0.3241833562769252, + "learning_rate": 6.60516485068071e-05, + "loss": 1.0621, + "step": 13164 + }, + { + "epoch": 1.26, + "grad_norm": 0.2905468042472747, + "learning_rate": 6.603676892882184e-05, + "loss": 1.1091, + "step": 13165 + }, + { + "epoch": 1.26, + "grad_norm": 0.3049647060005846, + "learning_rate": 6.602189020078776e-05, + "loss": 1.032, + "step": 13166 + }, + { + "epoch": 1.26, + "grad_norm": 0.34256114510958113, + "learning_rate": 6.600701232307727e-05, + "loss": 0.9223, + "step": 13167 + }, + { + "epoch": 1.26, + "grad_norm": 0.3266331376007605, + "learning_rate": 6.59921352960626e-05, + "loss": 1.0985, + "step": 13168 + }, + { + "epoch": 1.26, + "grad_norm": 0.30977372835977407, + "learning_rate": 6.597725912011619e-05, + "loss": 1.0795, + "step": 13169 + }, + { + "epoch": 1.26, + "grad_norm": 0.268515513002906, + "learning_rate": 6.596238379561019e-05, + "loss": 1.154, + "step": 13170 + }, + { + "epoch": 1.26, + "grad_norm": 0.3216645208992887, + "learning_rate": 6.594750932291698e-05, + "loss": 0.8679, + "step": 13171 + }, + { + "epoch": 1.26, + "grad_norm": 0.2997150259270586, + "learning_rate": 6.593263570240873e-05, + "loss": 1.0179, + "step": 13172 + }, + { + "epoch": 1.26, + "grad_norm": 0.3704946458531994, + "learning_rate": 6.591776293445769e-05, + "loss": 1.0137, + "step": 13173 + }, + { + "epoch": 1.26, + "grad_norm": 0.3108061353441055, + "learning_rate": 6.5902891019436e-05, + "loss": 1.0049, + "step": 13174 + }, + { + "epoch": 1.26, + "grad_norm": 0.3439956110938851, + "learning_rate": 6.588801995771594e-05, + "loss": 1.0039, + "step": 13175 + }, + { + "epoch": 1.26, + "grad_norm": 0.2813590728608357, + "learning_rate": 6.587314974966963e-05, + "loss": 1.037, + "step": 13176 + }, + { + "epoch": 1.26, + "grad_norm": 0.28291687406857274, + "learning_rate": 6.585828039566919e-05, + "loss": 0.99, + "step": 13177 + }, + { + "epoch": 1.26, + "grad_norm": 0.2937576596880494, + "learning_rate": 6.584341189608672e-05, + "loss": 0.9574, + "step": 13178 + }, + { + "epoch": 1.26, + "grad_norm": 0.3712042545976829, + "learning_rate": 6.582854425129435e-05, + "loss": 0.9655, + "step": 13179 + }, + { + "epoch": 1.26, + "grad_norm": 0.3086705774481812, + "learning_rate": 6.581367746166413e-05, + "loss": 1.005, + "step": 13180 + }, + { + "epoch": 1.26, + "grad_norm": 0.3693768881143896, + "learning_rate": 6.57988115275681e-05, + "loss": 0.9686, + "step": 13181 + }, + { + "epoch": 1.26, + "grad_norm": 0.351501084320053, + "learning_rate": 6.578394644937833e-05, + "loss": 1.1088, + "step": 13182 + }, + { + "epoch": 1.26, + "grad_norm": 0.2812315288020852, + "learning_rate": 6.57690822274668e-05, + "loss": 0.9964, + "step": 13183 + }, + { + "epoch": 1.26, + "grad_norm": 0.31453082015009093, + "learning_rate": 6.57542188622055e-05, + "loss": 0.9929, + "step": 13184 + }, + { + "epoch": 1.26, + "grad_norm": 0.243414762404747, + "learning_rate": 6.57393563539664e-05, + "loss": 0.9083, + "step": 13185 + }, + { + "epoch": 1.26, + "grad_norm": 0.2996891676471609, + "learning_rate": 6.572449470312141e-05, + "loss": 1.1293, + "step": 13186 + }, + { + "epoch": 1.26, + "grad_norm": 0.28284237806270135, + "learning_rate": 6.570963391004252e-05, + "loss": 1.0954, + "step": 13187 + }, + { + "epoch": 1.26, + "grad_norm": 0.32774278638115867, + "learning_rate": 6.569477397510156e-05, + "loss": 1.0018, + "step": 13188 + }, + { + "epoch": 1.26, + "grad_norm": 0.31440929748315394, + "learning_rate": 6.567991489867047e-05, + "loss": 0.9625, + "step": 13189 + }, + { + "epoch": 1.26, + "grad_norm": 0.29789820388812616, + "learning_rate": 6.566505668112103e-05, + "loss": 1.0228, + "step": 13190 + }, + { + "epoch": 1.26, + "grad_norm": 0.2864270063820604, + "learning_rate": 6.56501993228252e-05, + "loss": 1.1285, + "step": 13191 + }, + { + "epoch": 1.26, + "grad_norm": 0.3240729277370607, + "learning_rate": 6.563534282415466e-05, + "loss": 1.1263, + "step": 13192 + }, + { + "epoch": 1.26, + "grad_norm": 0.38431794843213174, + "learning_rate": 6.56204871854813e-05, + "loss": 1.1232, + "step": 13193 + }, + { + "epoch": 1.26, + "grad_norm": 0.33630300128280644, + "learning_rate": 6.560563240717686e-05, + "loss": 1.135, + "step": 13194 + }, + { + "epoch": 1.26, + "grad_norm": 0.35634315668073246, + "learning_rate": 6.559077848961301e-05, + "loss": 1.0326, + "step": 13195 + }, + { + "epoch": 1.26, + "grad_norm": 0.30556861917054506, + "learning_rate": 6.557592543316162e-05, + "loss": 1.032, + "step": 13196 + }, + { + "epoch": 1.26, + "grad_norm": 0.28833929550282733, + "learning_rate": 6.556107323819434e-05, + "loss": 1.106, + "step": 13197 + }, + { + "epoch": 1.26, + "grad_norm": 0.3818847042336026, + "learning_rate": 6.554622190508282e-05, + "loss": 0.9811, + "step": 13198 + }, + { + "epoch": 1.26, + "grad_norm": 0.32379489941248546, + "learning_rate": 6.553137143419877e-05, + "loss": 1.0414, + "step": 13199 + }, + { + "epoch": 1.26, + "grad_norm": 0.29769713136849013, + "learning_rate": 6.55165218259138e-05, + "loss": 1.0874, + "step": 13200 + }, + { + "epoch": 1.26, + "grad_norm": 0.3412620387838125, + "learning_rate": 6.550167308059955e-05, + "loss": 0.9915, + "step": 13201 + }, + { + "epoch": 1.26, + "grad_norm": 0.35062592789916125, + "learning_rate": 6.548682519862757e-05, + "loss": 0.971, + "step": 13202 + }, + { + "epoch": 1.26, + "grad_norm": 0.27040196578498205, + "learning_rate": 6.547197818036952e-05, + "loss": 1.0356, + "step": 13203 + }, + { + "epoch": 1.26, + "grad_norm": 0.31229780044956773, + "learning_rate": 6.545713202619692e-05, + "loss": 1.0994, + "step": 13204 + }, + { + "epoch": 1.26, + "grad_norm": 0.28222592703768373, + "learning_rate": 6.54422867364813e-05, + "loss": 1.076, + "step": 13205 + }, + { + "epoch": 1.26, + "grad_norm": 0.2918069788898459, + "learning_rate": 6.542744231159417e-05, + "loss": 1.0367, + "step": 13206 + }, + { + "epoch": 1.26, + "grad_norm": 0.2813966566709833, + "learning_rate": 6.5412598751907e-05, + "loss": 1.0393, + "step": 13207 + }, + { + "epoch": 1.26, + "grad_norm": 0.28325252713658844, + "learning_rate": 6.539775605779128e-05, + "loss": 1.05, + "step": 13208 + }, + { + "epoch": 1.26, + "grad_norm": 0.32594481785572477, + "learning_rate": 6.538291422961849e-05, + "loss": 0.9957, + "step": 13209 + }, + { + "epoch": 1.26, + "grad_norm": 0.3331488543598937, + "learning_rate": 6.536807326776002e-05, + "loss": 1.0106, + "step": 13210 + }, + { + "epoch": 1.26, + "grad_norm": 0.31824598672214643, + "learning_rate": 6.535323317258729e-05, + "loss": 1.0642, + "step": 13211 + }, + { + "epoch": 1.26, + "grad_norm": 0.3080694777209946, + "learning_rate": 6.533839394447165e-05, + "loss": 1.0315, + "step": 13212 + }, + { + "epoch": 1.26, + "grad_norm": 0.31579340554357316, + "learning_rate": 6.532355558378452e-05, + "loss": 1.0816, + "step": 13213 + }, + { + "epoch": 1.26, + "grad_norm": 0.29561629273397716, + "learning_rate": 6.530871809089719e-05, + "loss": 1.0545, + "step": 13214 + }, + { + "epoch": 1.26, + "grad_norm": 0.29120284538827196, + "learning_rate": 6.529388146618096e-05, + "loss": 1.0675, + "step": 13215 + }, + { + "epoch": 1.26, + "grad_norm": 0.30504361171370553, + "learning_rate": 6.527904571000719e-05, + "loss": 0.9615, + "step": 13216 + }, + { + "epoch": 1.26, + "grad_norm": 0.29947393317586357, + "learning_rate": 6.526421082274714e-05, + "loss": 1.1112, + "step": 13217 + }, + { + "epoch": 1.26, + "grad_norm": 0.31019092771160806, + "learning_rate": 6.524937680477202e-05, + "loss": 1.0317, + "step": 13218 + }, + { + "epoch": 1.26, + "grad_norm": 0.30517242087133284, + "learning_rate": 6.52345436564531e-05, + "loss": 1.0927, + "step": 13219 + }, + { + "epoch": 1.26, + "grad_norm": 0.31329219397578467, + "learning_rate": 6.521971137816156e-05, + "loss": 0.9949, + "step": 13220 + }, + { + "epoch": 1.26, + "grad_norm": 0.30860340131974634, + "learning_rate": 6.520487997026863e-05, + "loss": 1.0257, + "step": 13221 + }, + { + "epoch": 1.26, + "grad_norm": 0.3275338957723101, + "learning_rate": 6.519004943314537e-05, + "loss": 1.0195, + "step": 13222 + }, + { + "epoch": 1.27, + "grad_norm": 0.32790402317725237, + "learning_rate": 6.517521976716307e-05, + "loss": 1.1224, + "step": 13223 + }, + { + "epoch": 1.27, + "grad_norm": 0.34342013372372643, + "learning_rate": 6.516039097269277e-05, + "loss": 1.0332, + "step": 13224 + }, + { + "epoch": 1.27, + "grad_norm": 0.27995030588935627, + "learning_rate": 6.514556305010559e-05, + "loss": 1.0409, + "step": 13225 + }, + { + "epoch": 1.27, + "grad_norm": 0.3520242258471274, + "learning_rate": 6.513073599977258e-05, + "loss": 1.0828, + "step": 13226 + }, + { + "epoch": 1.27, + "grad_norm": 0.31743766695926795, + "learning_rate": 6.511590982206482e-05, + "loss": 1.0006, + "step": 13227 + }, + { + "epoch": 1.27, + "grad_norm": 0.3281995377996116, + "learning_rate": 6.510108451735335e-05, + "loss": 1.0446, + "step": 13228 + }, + { + "epoch": 1.27, + "grad_norm": 0.27294198572759787, + "learning_rate": 6.508626008600916e-05, + "loss": 1.0032, + "step": 13229 + }, + { + "epoch": 1.27, + "grad_norm": 0.2764020788260177, + "learning_rate": 6.507143652840326e-05, + "loss": 1.0559, + "step": 13230 + }, + { + "epoch": 1.27, + "grad_norm": 0.29634870867209506, + "learning_rate": 6.50566138449066e-05, + "loss": 1.0628, + "step": 13231 + }, + { + "epoch": 1.27, + "grad_norm": 0.3297325417111914, + "learning_rate": 6.504179203589015e-05, + "loss": 1.0547, + "step": 13232 + }, + { + "epoch": 1.27, + "grad_norm": 0.27250678939737205, + "learning_rate": 6.502697110172483e-05, + "loss": 0.9029, + "step": 13233 + }, + { + "epoch": 1.27, + "grad_norm": 0.29931913051667247, + "learning_rate": 6.501215104278155e-05, + "loss": 0.9869, + "step": 13234 + }, + { + "epoch": 1.27, + "grad_norm": 0.28706646634305283, + "learning_rate": 6.499733185943113e-05, + "loss": 1.048, + "step": 13235 + }, + { + "epoch": 1.27, + "grad_norm": 0.3284501784467761, + "learning_rate": 6.498251355204451e-05, + "loss": 0.9194, + "step": 13236 + }, + { + "epoch": 1.27, + "grad_norm": 0.33594433952967734, + "learning_rate": 6.496769612099252e-05, + "loss": 1.1366, + "step": 13237 + }, + { + "epoch": 1.27, + "grad_norm": 0.3527532211842117, + "learning_rate": 6.495287956664593e-05, + "loss": 1.1062, + "step": 13238 + }, + { + "epoch": 1.27, + "grad_norm": 0.3179085760042008, + "learning_rate": 6.493806388937555e-05, + "loss": 0.8702, + "step": 13239 + }, + { + "epoch": 1.27, + "grad_norm": 0.27852036778008615, + "learning_rate": 6.492324908955217e-05, + "loss": 1.0066, + "step": 13240 + }, + { + "epoch": 1.27, + "grad_norm": 0.29094213051380197, + "learning_rate": 6.490843516754654e-05, + "loss": 1.0244, + "step": 13241 + }, + { + "epoch": 1.27, + "grad_norm": 0.30735652517488987, + "learning_rate": 6.489362212372931e-05, + "loss": 1.1004, + "step": 13242 + }, + { + "epoch": 1.27, + "grad_norm": 0.33384013762407866, + "learning_rate": 6.48788099584713e-05, + "loss": 1.02, + "step": 13243 + }, + { + "epoch": 1.27, + "grad_norm": 0.30358652576042033, + "learning_rate": 6.486399867214314e-05, + "loss": 0.957, + "step": 13244 + }, + { + "epoch": 1.27, + "grad_norm": 0.3415936812995742, + "learning_rate": 6.48491882651155e-05, + "loss": 1.1138, + "step": 13245 + }, + { + "epoch": 1.27, + "grad_norm": 0.30812971823516655, + "learning_rate": 6.483437873775902e-05, + "loss": 0.9892, + "step": 13246 + }, + { + "epoch": 1.27, + "grad_norm": 0.3471061918894918, + "learning_rate": 6.48195700904443e-05, + "loss": 1.0541, + "step": 13247 + }, + { + "epoch": 1.27, + "grad_norm": 0.367929017867239, + "learning_rate": 6.480476232354194e-05, + "loss": 0.9451, + "step": 13248 + }, + { + "epoch": 1.27, + "grad_norm": 0.3397645492979088, + "learning_rate": 6.47899554374225e-05, + "loss": 0.9598, + "step": 13249 + }, + { + "epoch": 1.27, + "grad_norm": 0.3501395531861487, + "learning_rate": 6.477514943245659e-05, + "loss": 1.0323, + "step": 13250 + }, + { + "epoch": 1.27, + "grad_norm": 0.31403466215616654, + "learning_rate": 6.476034430901471e-05, + "loss": 1.2369, + "step": 13251 + }, + { + "epoch": 1.27, + "grad_norm": 0.3381401058354748, + "learning_rate": 6.474554006746734e-05, + "loss": 0.904, + "step": 13252 + }, + { + "epoch": 1.27, + "grad_norm": 0.32581299112433276, + "learning_rate": 6.4730736708185e-05, + "loss": 0.9307, + "step": 13253 + }, + { + "epoch": 1.27, + "grad_norm": 0.3270709437092426, + "learning_rate": 6.471593423153814e-05, + "loss": 1.0714, + "step": 13254 + }, + { + "epoch": 1.27, + "grad_norm": 0.3093203273855013, + "learning_rate": 6.470113263789721e-05, + "loss": 1.0784, + "step": 13255 + }, + { + "epoch": 1.27, + "grad_norm": 0.3145088585305941, + "learning_rate": 6.468633192763257e-05, + "loss": 1.1087, + "step": 13256 + }, + { + "epoch": 1.27, + "grad_norm": 0.31698270015441415, + "learning_rate": 6.467153210111471e-05, + "loss": 0.9748, + "step": 13257 + }, + { + "epoch": 1.27, + "grad_norm": 0.30130219040471967, + "learning_rate": 6.465673315871395e-05, + "loss": 1.0331, + "step": 13258 + }, + { + "epoch": 1.27, + "grad_norm": 0.3423223978816797, + "learning_rate": 6.464193510080067e-05, + "loss": 0.8514, + "step": 13259 + }, + { + "epoch": 1.27, + "grad_norm": 0.31368276349860236, + "learning_rate": 6.46271379277452e-05, + "loss": 1.0183, + "step": 13260 + }, + { + "epoch": 1.27, + "grad_norm": 0.3086402800612419, + "learning_rate": 6.461234163991781e-05, + "loss": 1.0447, + "step": 13261 + }, + { + "epoch": 1.27, + "grad_norm": 0.24128595157348462, + "learning_rate": 6.459754623768881e-05, + "loss": 1.0872, + "step": 13262 + }, + { + "epoch": 1.27, + "grad_norm": 0.29399047960189006, + "learning_rate": 6.458275172142843e-05, + "loss": 1.0048, + "step": 13263 + }, + { + "epoch": 1.27, + "grad_norm": 0.31622203404368454, + "learning_rate": 6.4567958091507e-05, + "loss": 1.0443, + "step": 13264 + }, + { + "epoch": 1.27, + "grad_norm": 0.3107880652879291, + "learning_rate": 6.455316534829467e-05, + "loss": 1.0912, + "step": 13265 + }, + { + "epoch": 1.27, + "grad_norm": 0.2922366990027794, + "learning_rate": 6.453837349216166e-05, + "loss": 1.0411, + "step": 13266 + }, + { + "epoch": 1.27, + "grad_norm": 0.3122637724737602, + "learning_rate": 6.452358252347815e-05, + "loss": 1.061, + "step": 13267 + }, + { + "epoch": 1.27, + "grad_norm": 0.3160009143251018, + "learning_rate": 6.450879244261427e-05, + "loss": 1.0266, + "step": 13268 + }, + { + "epoch": 1.27, + "grad_norm": 0.3080289610614102, + "learning_rate": 6.449400324994012e-05, + "loss": 0.8842, + "step": 13269 + }, + { + "epoch": 1.27, + "grad_norm": 0.3353990307618989, + "learning_rate": 6.447921494582591e-05, + "loss": 1.1166, + "step": 13270 + }, + { + "epoch": 1.27, + "grad_norm": 0.27854382722858284, + "learning_rate": 6.446442753064167e-05, + "loss": 0.976, + "step": 13271 + }, + { + "epoch": 1.27, + "grad_norm": 0.2862048187481964, + "learning_rate": 6.444964100475743e-05, + "loss": 1.0603, + "step": 13272 + }, + { + "epoch": 1.27, + "grad_norm": 0.33694207336276855, + "learning_rate": 6.44348553685433e-05, + "loss": 1.0616, + "step": 13273 + }, + { + "epoch": 1.27, + "grad_norm": 0.3564622543179985, + "learning_rate": 6.442007062236925e-05, + "loss": 1.0871, + "step": 13274 + }, + { + "epoch": 1.27, + "grad_norm": 0.3084492795951558, + "learning_rate": 6.440528676660531e-05, + "loss": 0.8913, + "step": 13275 + }, + { + "epoch": 1.27, + "grad_norm": 0.312433909708562, + "learning_rate": 6.439050380162139e-05, + "loss": 1.1461, + "step": 13276 + }, + { + "epoch": 1.27, + "grad_norm": 0.2603672282171556, + "learning_rate": 6.437572172778754e-05, + "loss": 0.9572, + "step": 13277 + }, + { + "epoch": 1.27, + "grad_norm": 0.30683948199531896, + "learning_rate": 6.436094054547363e-05, + "loss": 1.0621, + "step": 13278 + }, + { + "epoch": 1.27, + "grad_norm": 0.28907823324716986, + "learning_rate": 6.43461602550496e-05, + "loss": 0.9163, + "step": 13279 + }, + { + "epoch": 1.27, + "grad_norm": 0.27975073873089495, + "learning_rate": 6.43313808568853e-05, + "loss": 0.9575, + "step": 13280 + }, + { + "epoch": 1.27, + "grad_norm": 0.3311376651150422, + "learning_rate": 6.431660235135061e-05, + "loss": 1.0076, + "step": 13281 + }, + { + "epoch": 1.27, + "grad_norm": 0.3391278684407142, + "learning_rate": 6.43018247388154e-05, + "loss": 1.0003, + "step": 13282 + }, + { + "epoch": 1.27, + "grad_norm": 0.3106227397931226, + "learning_rate": 6.42870480196494e-05, + "loss": 1.0197, + "step": 13283 + }, + { + "epoch": 1.27, + "grad_norm": 0.32869930658049934, + "learning_rate": 6.427227219422251e-05, + "loss": 1.0338, + "step": 13284 + }, + { + "epoch": 1.27, + "grad_norm": 0.3237093856114065, + "learning_rate": 6.425749726290447e-05, + "loss": 1.0305, + "step": 13285 + }, + { + "epoch": 1.27, + "grad_norm": 0.3141793575020953, + "learning_rate": 6.4242723226065e-05, + "loss": 1.072, + "step": 13286 + }, + { + "epoch": 1.27, + "grad_norm": 0.2882475497909208, + "learning_rate": 6.422795008407387e-05, + "loss": 1.1101, + "step": 13287 + }, + { + "epoch": 1.27, + "grad_norm": 0.3006742129978756, + "learning_rate": 6.421317783730077e-05, + "loss": 0.9859, + "step": 13288 + }, + { + "epoch": 1.27, + "grad_norm": 0.31291057392209815, + "learning_rate": 6.419840648611537e-05, + "loss": 1.0301, + "step": 13289 + }, + { + "epoch": 1.27, + "grad_norm": 0.30936378351291954, + "learning_rate": 6.418363603088735e-05, + "loss": 1.0942, + "step": 13290 + }, + { + "epoch": 1.27, + "grad_norm": 0.28915204722450066, + "learning_rate": 6.416886647198635e-05, + "loss": 1.0105, + "step": 13291 + }, + { + "epoch": 1.27, + "grad_norm": 0.326971963961884, + "learning_rate": 6.415409780978201e-05, + "loss": 1.0692, + "step": 13292 + }, + { + "epoch": 1.27, + "grad_norm": 0.3445698603686449, + "learning_rate": 6.413933004464387e-05, + "loss": 1.0388, + "step": 13293 + }, + { + "epoch": 1.27, + "grad_norm": 0.34615454386566374, + "learning_rate": 6.412456317694155e-05, + "loss": 0.9609, + "step": 13294 + }, + { + "epoch": 1.27, + "grad_norm": 0.351404779312152, + "learning_rate": 6.410979720704458e-05, + "loss": 1.0856, + "step": 13295 + }, + { + "epoch": 1.27, + "grad_norm": 0.3325557622438565, + "learning_rate": 6.409503213532248e-05, + "loss": 1.0434, + "step": 13296 + }, + { + "epoch": 1.27, + "grad_norm": 0.30198819866829896, + "learning_rate": 6.408026796214477e-05, + "loss": 1.0125, + "step": 13297 + }, + { + "epoch": 1.27, + "grad_norm": 0.3590128931255267, + "learning_rate": 6.406550468788093e-05, + "loss": 1.028, + "step": 13298 + }, + { + "epoch": 1.27, + "grad_norm": 0.29928196885810626, + "learning_rate": 6.405074231290044e-05, + "loss": 0.9843, + "step": 13299 + }, + { + "epoch": 1.27, + "grad_norm": 0.3017134472561098, + "learning_rate": 6.40359808375727e-05, + "loss": 1.096, + "step": 13300 + }, + { + "epoch": 1.27, + "grad_norm": 0.32795083278890447, + "learning_rate": 6.402122026226715e-05, + "loss": 1.0652, + "step": 13301 + }, + { + "epoch": 1.27, + "grad_norm": 0.33967098880907765, + "learning_rate": 6.400646058735318e-05, + "loss": 1.0197, + "step": 13302 + }, + { + "epoch": 1.27, + "grad_norm": 0.3040657213344615, + "learning_rate": 6.399170181320012e-05, + "loss": 1.0918, + "step": 13303 + }, + { + "epoch": 1.27, + "grad_norm": 0.2969012071183049, + "learning_rate": 6.397694394017736e-05, + "loss": 1.011, + "step": 13304 + }, + { + "epoch": 1.27, + "grad_norm": 0.30121058563648534, + "learning_rate": 6.396218696865424e-05, + "loss": 0.9086, + "step": 13305 + }, + { + "epoch": 1.27, + "grad_norm": 0.29271968422690253, + "learning_rate": 6.394743089900005e-05, + "loss": 0.9856, + "step": 13306 + }, + { + "epoch": 1.27, + "grad_norm": 0.31728725010401665, + "learning_rate": 6.393267573158405e-05, + "loss": 1.0504, + "step": 13307 + }, + { + "epoch": 1.27, + "grad_norm": 0.30953997899802044, + "learning_rate": 6.391792146677551e-05, + "loss": 1.001, + "step": 13308 + }, + { + "epoch": 1.27, + "grad_norm": 0.28772852822451106, + "learning_rate": 6.390316810494365e-05, + "loss": 0.9168, + "step": 13309 + }, + { + "epoch": 1.27, + "grad_norm": 0.29469339207612383, + "learning_rate": 6.388841564645766e-05, + "loss": 1.0837, + "step": 13310 + }, + { + "epoch": 1.27, + "grad_norm": 0.3153903957889136, + "learning_rate": 6.387366409168681e-05, + "loss": 1.016, + "step": 13311 + }, + { + "epoch": 1.27, + "grad_norm": 0.3192746450532912, + "learning_rate": 6.385891344100022e-05, + "loss": 1.0116, + "step": 13312 + }, + { + "epoch": 1.27, + "grad_norm": 0.3013791021074644, + "learning_rate": 6.384416369476703e-05, + "loss": 1.1336, + "step": 13313 + }, + { + "epoch": 1.27, + "grad_norm": 0.2824212205167852, + "learning_rate": 6.382941485335635e-05, + "loss": 0.908, + "step": 13314 + }, + { + "epoch": 1.27, + "grad_norm": 0.3029212563747884, + "learning_rate": 6.381466691713732e-05, + "loss": 0.897, + "step": 13315 + }, + { + "epoch": 1.27, + "grad_norm": 0.2798130997012647, + "learning_rate": 6.379991988647899e-05, + "loss": 1.0033, + "step": 13316 + }, + { + "epoch": 1.27, + "grad_norm": 0.35059807856739084, + "learning_rate": 6.378517376175038e-05, + "loss": 1.0595, + "step": 13317 + }, + { + "epoch": 1.27, + "grad_norm": 0.31386439388864645, + "learning_rate": 6.377042854332057e-05, + "loss": 1.0191, + "step": 13318 + }, + { + "epoch": 1.27, + "grad_norm": 0.3125244690091981, + "learning_rate": 6.375568423155858e-05, + "loss": 1.0423, + "step": 13319 + }, + { + "epoch": 1.27, + "grad_norm": 0.35232627163280883, + "learning_rate": 6.374094082683337e-05, + "loss": 0.9997, + "step": 13320 + }, + { + "epoch": 1.27, + "grad_norm": 0.30091016669536913, + "learning_rate": 6.37261983295139e-05, + "loss": 0.9598, + "step": 13321 + }, + { + "epoch": 1.27, + "grad_norm": 0.31477652223700076, + "learning_rate": 6.371145673996912e-05, + "loss": 1.0405, + "step": 13322 + }, + { + "epoch": 1.27, + "grad_norm": 0.27773373105158017, + "learning_rate": 6.369671605856793e-05, + "loss": 1.0439, + "step": 13323 + }, + { + "epoch": 1.27, + "grad_norm": 0.33513360838844797, + "learning_rate": 6.368197628567919e-05, + "loss": 1.0593, + "step": 13324 + }, + { + "epoch": 1.27, + "grad_norm": 0.2917463817016475, + "learning_rate": 6.366723742167187e-05, + "loss": 1.1744, + "step": 13325 + }, + { + "epoch": 1.27, + "grad_norm": 0.29954313212765843, + "learning_rate": 6.365249946691477e-05, + "loss": 0.9569, + "step": 13326 + }, + { + "epoch": 1.28, + "grad_norm": 0.31972153201067294, + "learning_rate": 6.363776242177671e-05, + "loss": 1.1025, + "step": 13327 + }, + { + "epoch": 1.28, + "grad_norm": 0.3028116511491954, + "learning_rate": 6.36230262866265e-05, + "loss": 1.0191, + "step": 13328 + }, + { + "epoch": 1.28, + "grad_norm": 0.2917237961862753, + "learning_rate": 6.360829106183292e-05, + "loss": 0.9454, + "step": 13329 + }, + { + "epoch": 1.28, + "grad_norm": 0.2708991701061947, + "learning_rate": 6.359355674776468e-05, + "loss": 1.0874, + "step": 13330 + }, + { + "epoch": 1.28, + "grad_norm": 0.32453365123883776, + "learning_rate": 6.357882334479061e-05, + "loss": 1.0846, + "step": 13331 + }, + { + "epoch": 1.28, + "grad_norm": 0.3315829124740008, + "learning_rate": 6.356409085327938e-05, + "loss": 1.0059, + "step": 13332 + }, + { + "epoch": 1.28, + "grad_norm": 0.2866949276528045, + "learning_rate": 6.354935927359968e-05, + "loss": 0.9694, + "step": 13333 + }, + { + "epoch": 1.28, + "grad_norm": 0.32033274457829486, + "learning_rate": 6.353462860612014e-05, + "loss": 1.0447, + "step": 13334 + }, + { + "epoch": 1.28, + "grad_norm": 0.2807059521503393, + "learning_rate": 6.351989885120946e-05, + "loss": 1.0673, + "step": 13335 + }, + { + "epoch": 1.28, + "grad_norm": 0.2532112508731774, + "learning_rate": 6.350517000923623e-05, + "loss": 0.9232, + "step": 13336 + }, + { + "epoch": 1.28, + "grad_norm": 0.2676795927172339, + "learning_rate": 6.349044208056904e-05, + "loss": 0.8303, + "step": 13337 + }, + { + "epoch": 1.28, + "grad_norm": 0.31322610419497887, + "learning_rate": 6.34757150655765e-05, + "loss": 0.9627, + "step": 13338 + }, + { + "epoch": 1.28, + "grad_norm": 0.2813026858380566, + "learning_rate": 6.346098896462716e-05, + "loss": 0.8147, + "step": 13339 + }, + { + "epoch": 1.28, + "grad_norm": 0.3631765429787687, + "learning_rate": 6.344626377808952e-05, + "loss": 1.0102, + "step": 13340 + }, + { + "epoch": 1.28, + "grad_norm": 0.2535367685322601, + "learning_rate": 6.34315395063321e-05, + "loss": 0.8944, + "step": 13341 + }, + { + "epoch": 1.28, + "grad_norm": 0.3141524042123934, + "learning_rate": 6.34168161497234e-05, + "loss": 1.029, + "step": 13342 + }, + { + "epoch": 1.28, + "grad_norm": 0.28624018270242696, + "learning_rate": 6.340209370863186e-05, + "loss": 0.9936, + "step": 13343 + }, + { + "epoch": 1.28, + "grad_norm": 0.2628472341534729, + "learning_rate": 6.338737218342589e-05, + "loss": 1.0461, + "step": 13344 + }, + { + "epoch": 1.28, + "grad_norm": 0.3117570872540829, + "learning_rate": 6.337265157447398e-05, + "loss": 1.029, + "step": 13345 + }, + { + "epoch": 1.28, + "grad_norm": 0.3350708515424918, + "learning_rate": 6.33579318821445e-05, + "loss": 1.1499, + "step": 13346 + }, + { + "epoch": 1.28, + "grad_norm": 0.3386144976494899, + "learning_rate": 6.334321310680578e-05, + "loss": 1.0552, + "step": 13347 + }, + { + "epoch": 1.28, + "grad_norm": 0.28686197088948046, + "learning_rate": 6.33284952488262e-05, + "loss": 0.9579, + "step": 13348 + }, + { + "epoch": 1.28, + "grad_norm": 0.2524117080056282, + "learning_rate": 6.331377830857407e-05, + "loss": 1.047, + "step": 13349 + }, + { + "epoch": 1.28, + "grad_norm": 0.3557310296847978, + "learning_rate": 6.329906228641769e-05, + "loss": 1.0141, + "step": 13350 + }, + { + "epoch": 1.28, + "grad_norm": 0.3078290032784936, + "learning_rate": 6.328434718272532e-05, + "loss": 1.041, + "step": 13351 + }, + { + "epoch": 1.28, + "grad_norm": 0.3026451567728435, + "learning_rate": 6.326963299786526e-05, + "loss": 0.9124, + "step": 13352 + }, + { + "epoch": 1.28, + "grad_norm": 0.308857094669802, + "learning_rate": 6.325491973220572e-05, + "loss": 1.1023, + "step": 13353 + }, + { + "epoch": 1.28, + "grad_norm": 0.3631101903618612, + "learning_rate": 6.32402073861149e-05, + "loss": 0.9117, + "step": 13354 + }, + { + "epoch": 1.28, + "grad_norm": 0.34187903045819107, + "learning_rate": 6.322549595996099e-05, + "loss": 1.0464, + "step": 13355 + }, + { + "epoch": 1.28, + "grad_norm": 0.35663470220573723, + "learning_rate": 6.321078545411216e-05, + "loss": 1.086, + "step": 13356 + }, + { + "epoch": 1.28, + "grad_norm": 0.2858230872817101, + "learning_rate": 6.319607586893655e-05, + "loss": 1.061, + "step": 13357 + }, + { + "epoch": 1.28, + "grad_norm": 0.31116349492244944, + "learning_rate": 6.318136720480225e-05, + "loss": 0.9371, + "step": 13358 + }, + { + "epoch": 1.28, + "grad_norm": 0.34110924689281735, + "learning_rate": 6.316665946207739e-05, + "loss": 1.0461, + "step": 13359 + }, + { + "epoch": 1.28, + "grad_norm": 0.2697132612458151, + "learning_rate": 6.315195264113004e-05, + "loss": 0.972, + "step": 13360 + }, + { + "epoch": 1.28, + "grad_norm": 0.25494575647610046, + "learning_rate": 6.313724674232823e-05, + "loss": 0.9551, + "step": 13361 + }, + { + "epoch": 1.28, + "grad_norm": 0.3593922499674843, + "learning_rate": 6.312254176603997e-05, + "loss": 1.0708, + "step": 13362 + }, + { + "epoch": 1.28, + "grad_norm": 0.36025652236940986, + "learning_rate": 6.310783771263332e-05, + "loss": 1.0668, + "step": 13363 + }, + { + "epoch": 1.28, + "grad_norm": 0.33466808471271897, + "learning_rate": 6.309313458247615e-05, + "loss": 0.9984, + "step": 13364 + }, + { + "epoch": 1.28, + "grad_norm": 0.2900928393459147, + "learning_rate": 6.307843237593651e-05, + "loss": 1.0171, + "step": 13365 + }, + { + "epoch": 1.28, + "grad_norm": 0.28830893294808024, + "learning_rate": 6.306373109338233e-05, + "loss": 0.9711, + "step": 13366 + }, + { + "epoch": 1.28, + "grad_norm": 0.31087713077510104, + "learning_rate": 6.304903073518148e-05, + "loss": 1.0552, + "step": 13367 + }, + { + "epoch": 1.28, + "grad_norm": 0.3434512358303855, + "learning_rate": 6.303433130170186e-05, + "loss": 1.0434, + "step": 13368 + }, + { + "epoch": 1.28, + "grad_norm": 0.3015444096580982, + "learning_rate": 6.301963279331134e-05, + "loss": 0.9382, + "step": 13369 + }, + { + "epoch": 1.28, + "grad_norm": 0.29659652082814103, + "learning_rate": 6.300493521037774e-05, + "loss": 0.9614, + "step": 13370 + }, + { + "epoch": 1.28, + "grad_norm": 0.30900563458408864, + "learning_rate": 6.299023855326885e-05, + "loss": 0.9844, + "step": 13371 + }, + { + "epoch": 1.28, + "grad_norm": 0.30085099153614153, + "learning_rate": 6.297554282235254e-05, + "loss": 1.141, + "step": 13372 + }, + { + "epoch": 1.28, + "grad_norm": 0.3317683520732442, + "learning_rate": 6.296084801799653e-05, + "loss": 1.0432, + "step": 13373 + }, + { + "epoch": 1.28, + "grad_norm": 0.40415628114505414, + "learning_rate": 6.294615414056859e-05, + "loss": 1.1302, + "step": 13374 + }, + { + "epoch": 1.28, + "grad_norm": 0.3020073973732862, + "learning_rate": 6.29314611904364e-05, + "loss": 1.1025, + "step": 13375 + }, + { + "epoch": 1.28, + "grad_norm": 0.30233766957256636, + "learning_rate": 6.291676916796771e-05, + "loss": 1.0419, + "step": 13376 + }, + { + "epoch": 1.28, + "grad_norm": 0.3428612463986098, + "learning_rate": 6.290207807353019e-05, + "loss": 1.1284, + "step": 13377 + }, + { + "epoch": 1.28, + "grad_norm": 0.32093266176932894, + "learning_rate": 6.288738790749147e-05, + "loss": 1.0349, + "step": 13378 + }, + { + "epoch": 1.28, + "grad_norm": 0.2965037307426092, + "learning_rate": 6.287269867021916e-05, + "loss": 1.0099, + "step": 13379 + }, + { + "epoch": 1.28, + "grad_norm": 0.29278825694058114, + "learning_rate": 6.285801036208094e-05, + "loss": 0.9139, + "step": 13380 + }, + { + "epoch": 1.28, + "grad_norm": 0.29252086516558845, + "learning_rate": 6.284332298344435e-05, + "loss": 1.0648, + "step": 13381 + }, + { + "epoch": 1.28, + "grad_norm": 0.2998321333377936, + "learning_rate": 6.282863653467697e-05, + "loss": 1.0697, + "step": 13382 + }, + { + "epoch": 1.28, + "grad_norm": 0.33983454677975167, + "learning_rate": 6.28139510161463e-05, + "loss": 1.0488, + "step": 13383 + }, + { + "epoch": 1.28, + "grad_norm": 0.2690874170281866, + "learning_rate": 6.279926642821988e-05, + "loss": 1.0128, + "step": 13384 + }, + { + "epoch": 1.28, + "grad_norm": 0.31070778715844943, + "learning_rate": 6.278458277126517e-05, + "loss": 1.0134, + "step": 13385 + }, + { + "epoch": 1.28, + "grad_norm": 0.3295537236925883, + "learning_rate": 6.276990004564969e-05, + "loss": 1.0062, + "step": 13386 + }, + { + "epoch": 1.28, + "grad_norm": 0.31962269477972755, + "learning_rate": 6.275521825174086e-05, + "loss": 1.0602, + "step": 13387 + }, + { + "epoch": 1.28, + "grad_norm": 0.31051737266617574, + "learning_rate": 6.27405373899061e-05, + "loss": 1.0245, + "step": 13388 + }, + { + "epoch": 1.28, + "grad_norm": 0.2939279658260786, + "learning_rate": 6.272585746051282e-05, + "loss": 1.0248, + "step": 13389 + }, + { + "epoch": 1.28, + "grad_norm": 0.3315377098588913, + "learning_rate": 6.271117846392837e-05, + "loss": 1.0206, + "step": 13390 + }, + { + "epoch": 1.28, + "grad_norm": 0.2910080601553969, + "learning_rate": 6.26965004005201e-05, + "loss": 1.0354, + "step": 13391 + }, + { + "epoch": 1.28, + "grad_norm": 0.265224435368858, + "learning_rate": 6.268182327065537e-05, + "loss": 1.0479, + "step": 13392 + }, + { + "epoch": 1.28, + "grad_norm": 0.33240546145479527, + "learning_rate": 6.266714707470147e-05, + "loss": 0.9279, + "step": 13393 + }, + { + "epoch": 1.28, + "grad_norm": 0.28953216761121886, + "learning_rate": 6.265247181302567e-05, + "loss": 0.9726, + "step": 13394 + }, + { + "epoch": 1.28, + "grad_norm": 0.32608024519851697, + "learning_rate": 6.263779748599525e-05, + "loss": 1.016, + "step": 13395 + }, + { + "epoch": 1.28, + "grad_norm": 0.297408928290725, + "learning_rate": 6.262312409397739e-05, + "loss": 1.0635, + "step": 13396 + }, + { + "epoch": 1.28, + "grad_norm": 0.34933479439224696, + "learning_rate": 6.260845163733938e-05, + "loss": 1.1263, + "step": 13397 + }, + { + "epoch": 1.28, + "grad_norm": 0.36035840178211964, + "learning_rate": 6.259378011644834e-05, + "loss": 0.9814, + "step": 13398 + }, + { + "epoch": 1.28, + "grad_norm": 0.32522597756612287, + "learning_rate": 6.257910953167148e-05, + "loss": 1.0282, + "step": 13399 + }, + { + "epoch": 1.28, + "grad_norm": 0.2765933586474454, + "learning_rate": 6.25644398833759e-05, + "loss": 1.0294, + "step": 13400 + }, + { + "epoch": 1.28, + "grad_norm": 0.3481922125140964, + "learning_rate": 6.254977117192876e-05, + "loss": 1.0434, + "step": 13401 + }, + { + "epoch": 1.28, + "grad_norm": 0.3454511523291118, + "learning_rate": 6.253510339769714e-05, + "loss": 1.0433, + "step": 13402 + }, + { + "epoch": 1.28, + "grad_norm": 0.27930844687870204, + "learning_rate": 6.252043656104809e-05, + "loss": 0.9583, + "step": 13403 + }, + { + "epoch": 1.28, + "grad_norm": 0.30385206456499175, + "learning_rate": 6.250577066234869e-05, + "loss": 0.9919, + "step": 13404 + }, + { + "epoch": 1.28, + "grad_norm": 0.2942010464840959, + "learning_rate": 6.249110570196588e-05, + "loss": 0.9708, + "step": 13405 + }, + { + "epoch": 1.28, + "grad_norm": 0.29423090492934834, + "learning_rate": 6.247644168026679e-05, + "loss": 1.0279, + "step": 13406 + }, + { + "epoch": 1.28, + "grad_norm": 0.324970849831718, + "learning_rate": 6.24617785976183e-05, + "loss": 0.9942, + "step": 13407 + }, + { + "epoch": 1.28, + "grad_norm": 0.3134231235880247, + "learning_rate": 6.244711645438741e-05, + "loss": 1.0101, + "step": 13408 + }, + { + "epoch": 1.28, + "grad_norm": 0.31387026077014407, + "learning_rate": 6.243245525094103e-05, + "loss": 1.0754, + "step": 13409 + }, + { + "epoch": 1.28, + "grad_norm": 0.295745851266857, + "learning_rate": 6.241779498764606e-05, + "loss": 0.9849, + "step": 13410 + }, + { + "epoch": 1.28, + "grad_norm": 0.31686917010298876, + "learning_rate": 6.24031356648694e-05, + "loss": 1.0452, + "step": 13411 + }, + { + "epoch": 1.28, + "grad_norm": 0.3114617031404934, + "learning_rate": 6.238847728297786e-05, + "loss": 1.0274, + "step": 13412 + }, + { + "epoch": 1.28, + "grad_norm": 0.30184955114051365, + "learning_rate": 6.237381984233834e-05, + "loss": 1.0688, + "step": 13413 + }, + { + "epoch": 1.28, + "grad_norm": 0.3581047238378434, + "learning_rate": 6.235916334331764e-05, + "loss": 0.9413, + "step": 13414 + }, + { + "epoch": 1.28, + "grad_norm": 0.2635122320557983, + "learning_rate": 6.234450778628253e-05, + "loss": 0.9552, + "step": 13415 + }, + { + "epoch": 1.28, + "grad_norm": 0.3350646795830523, + "learning_rate": 6.232985317159977e-05, + "loss": 1.0323, + "step": 13416 + }, + { + "epoch": 1.28, + "grad_norm": 0.2804650452781182, + "learning_rate": 6.23151994996361e-05, + "loss": 1.1817, + "step": 13417 + }, + { + "epoch": 1.28, + "grad_norm": 0.3224093047123526, + "learning_rate": 6.230054677075825e-05, + "loss": 1.1832, + "step": 13418 + }, + { + "epoch": 1.28, + "grad_norm": 0.3321708651265014, + "learning_rate": 6.228589498533293e-05, + "loss": 1.0537, + "step": 13419 + }, + { + "epoch": 1.28, + "grad_norm": 0.30673855858628807, + "learning_rate": 6.227124414372676e-05, + "loss": 0.9969, + "step": 13420 + }, + { + "epoch": 1.28, + "grad_norm": 0.30145237051220103, + "learning_rate": 6.225659424630643e-05, + "loss": 1.0496, + "step": 13421 + }, + { + "epoch": 1.28, + "grad_norm": 0.28746480856525036, + "learning_rate": 6.224194529343857e-05, + "loss": 0.9988, + "step": 13422 + }, + { + "epoch": 1.28, + "grad_norm": 0.3403950553075962, + "learning_rate": 6.222729728548974e-05, + "loss": 1.0529, + "step": 13423 + }, + { + "epoch": 1.28, + "grad_norm": 0.29692142428075124, + "learning_rate": 6.221265022282654e-05, + "loss": 1.1073, + "step": 13424 + }, + { + "epoch": 1.28, + "grad_norm": 0.27566735356761196, + "learning_rate": 6.219800410581549e-05, + "loss": 1.0941, + "step": 13425 + }, + { + "epoch": 1.28, + "grad_norm": 0.3283958222915972, + "learning_rate": 6.218335893482317e-05, + "loss": 1.0304, + "step": 13426 + }, + { + "epoch": 1.28, + "grad_norm": 0.3460080324415158, + "learning_rate": 6.216871471021606e-05, + "loss": 0.9972, + "step": 13427 + }, + { + "epoch": 1.28, + "grad_norm": 0.30750247761905647, + "learning_rate": 6.215407143236065e-05, + "loss": 1.116, + "step": 13428 + }, + { + "epoch": 1.28, + "grad_norm": 0.29316167041343366, + "learning_rate": 6.213942910162338e-05, + "loss": 1.07, + "step": 13429 + }, + { + "epoch": 1.28, + "grad_norm": 0.31009086912351536, + "learning_rate": 6.212478771837069e-05, + "loss": 0.9916, + "step": 13430 + }, + { + "epoch": 1.28, + "grad_norm": 0.3138439289610912, + "learning_rate": 6.2110147282969e-05, + "loss": 1.0339, + "step": 13431 + }, + { + "epoch": 1.29, + "grad_norm": 0.3473291181727181, + "learning_rate": 6.209550779578464e-05, + "loss": 1.0527, + "step": 13432 + }, + { + "epoch": 1.29, + "grad_norm": 0.34048981852266014, + "learning_rate": 6.208086925718406e-05, + "loss": 1.0569, + "step": 13433 + }, + { + "epoch": 1.29, + "grad_norm": 0.31384577944541203, + "learning_rate": 6.206623166753356e-05, + "loss": 1.1227, + "step": 13434 + }, + { + "epoch": 1.29, + "grad_norm": 0.321385620374341, + "learning_rate": 6.205159502719946e-05, + "loss": 1.0603, + "step": 13435 + }, + { + "epoch": 1.29, + "grad_norm": 0.3067559292871543, + "learning_rate": 6.203695933654803e-05, + "loss": 0.9968, + "step": 13436 + }, + { + "epoch": 1.29, + "grad_norm": 0.3075136075744453, + "learning_rate": 6.202232459594555e-05, + "loss": 1.0891, + "step": 13437 + }, + { + "epoch": 1.29, + "grad_norm": 0.32409193012197546, + "learning_rate": 6.200769080575826e-05, + "loss": 1.09, + "step": 13438 + }, + { + "epoch": 1.29, + "grad_norm": 0.31808769604493664, + "learning_rate": 6.199305796635238e-05, + "loss": 0.9756, + "step": 13439 + }, + { + "epoch": 1.29, + "grad_norm": 0.2828954832621876, + "learning_rate": 6.197842607809413e-05, + "loss": 1.0899, + "step": 13440 + }, + { + "epoch": 1.29, + "grad_norm": 0.3127816300726818, + "learning_rate": 6.196379514134964e-05, + "loss": 1.0992, + "step": 13441 + }, + { + "epoch": 1.29, + "grad_norm": 0.2924873535381391, + "learning_rate": 6.19491651564851e-05, + "loss": 1.0866, + "step": 13442 + }, + { + "epoch": 1.29, + "grad_norm": 0.2961126124157564, + "learning_rate": 6.193453612386662e-05, + "loss": 1.0482, + "step": 13443 + }, + { + "epoch": 1.29, + "grad_norm": 0.3389680185896986, + "learning_rate": 6.191990804386029e-05, + "loss": 1.076, + "step": 13444 + }, + { + "epoch": 1.29, + "grad_norm": 0.33032289841579066, + "learning_rate": 6.19052809168322e-05, + "loss": 1.0602, + "step": 13445 + }, + { + "epoch": 1.29, + "grad_norm": 0.3171442243786885, + "learning_rate": 6.189065474314836e-05, + "loss": 1.0191, + "step": 13446 + }, + { + "epoch": 1.29, + "grad_norm": 0.3466415764999532, + "learning_rate": 6.187602952317488e-05, + "loss": 1.0391, + "step": 13447 + }, + { + "epoch": 1.29, + "grad_norm": 0.33660196878097876, + "learning_rate": 6.186140525727771e-05, + "loss": 0.9372, + "step": 13448 + }, + { + "epoch": 1.29, + "grad_norm": 0.32316469573472706, + "learning_rate": 6.184678194582285e-05, + "loss": 1.0457, + "step": 13449 + }, + { + "epoch": 1.29, + "grad_norm": 0.36437001979705236, + "learning_rate": 6.183215958917625e-05, + "loss": 0.9972, + "step": 13450 + }, + { + "epoch": 1.29, + "grad_norm": 0.29320862971167017, + "learning_rate": 6.181753818770384e-05, + "loss": 1.0602, + "step": 13451 + }, + { + "epoch": 1.29, + "grad_norm": 0.2687399943623842, + "learning_rate": 6.18029177417715e-05, + "loss": 1.0284, + "step": 13452 + }, + { + "epoch": 1.29, + "grad_norm": 0.3155461751720622, + "learning_rate": 6.178829825174519e-05, + "loss": 1.1006, + "step": 13453 + }, + { + "epoch": 1.29, + "grad_norm": 0.3121282192005124, + "learning_rate": 6.177367971799071e-05, + "loss": 1.018, + "step": 13454 + }, + { + "epoch": 1.29, + "grad_norm": 0.2812554737198285, + "learning_rate": 6.175906214087395e-05, + "loss": 0.9792, + "step": 13455 + }, + { + "epoch": 1.29, + "grad_norm": 0.301751469841568, + "learning_rate": 6.174444552076066e-05, + "loss": 1.0044, + "step": 13456 + }, + { + "epoch": 1.29, + "grad_norm": 0.30154125848558777, + "learning_rate": 6.17298298580167e-05, + "loss": 1.0851, + "step": 13457 + }, + { + "epoch": 1.29, + "grad_norm": 0.32768344417326806, + "learning_rate": 6.171521515300776e-05, + "loss": 1.0483, + "step": 13458 + }, + { + "epoch": 1.29, + "grad_norm": 0.29482247650529003, + "learning_rate": 6.170060140609961e-05, + "loss": 0.9683, + "step": 13459 + }, + { + "epoch": 1.29, + "grad_norm": 0.2918421417206647, + "learning_rate": 6.168598861765802e-05, + "loss": 1.0435, + "step": 13460 + }, + { + "epoch": 1.29, + "grad_norm": 0.3024185116916278, + "learning_rate": 6.167137678804862e-05, + "loss": 1.0524, + "step": 13461 + }, + { + "epoch": 1.29, + "grad_norm": 0.3304585487693831, + "learning_rate": 6.165676591763711e-05, + "loss": 0.9248, + "step": 13462 + }, + { + "epoch": 1.29, + "grad_norm": 0.27378537535450653, + "learning_rate": 6.164215600678916e-05, + "loss": 1.0589, + "step": 13463 + }, + { + "epoch": 1.29, + "grad_norm": 0.3100714419598875, + "learning_rate": 6.162754705587033e-05, + "loss": 1.085, + "step": 13464 + }, + { + "epoch": 1.29, + "grad_norm": 0.315810603858923, + "learning_rate": 6.161293906524628e-05, + "loss": 1.0508, + "step": 13465 + }, + { + "epoch": 1.29, + "grad_norm": 0.27245830436980345, + "learning_rate": 6.159833203528249e-05, + "loss": 1.0229, + "step": 13466 + }, + { + "epoch": 1.29, + "grad_norm": 0.33216817998827963, + "learning_rate": 6.158372596634463e-05, + "loss": 1.0674, + "step": 13467 + }, + { + "epoch": 1.29, + "grad_norm": 0.32067407439621287, + "learning_rate": 6.156912085879817e-05, + "loss": 1.0399, + "step": 13468 + }, + { + "epoch": 1.29, + "grad_norm": 0.3147146637914321, + "learning_rate": 6.15545167130086e-05, + "loss": 0.9983, + "step": 13469 + }, + { + "epoch": 1.29, + "grad_norm": 0.2796804165054073, + "learning_rate": 6.153991352934143e-05, + "loss": 0.937, + "step": 13470 + }, + { + "epoch": 1.29, + "grad_norm": 0.30617475731459287, + "learning_rate": 6.152531130816209e-05, + "loss": 1.0376, + "step": 13471 + }, + { + "epoch": 1.29, + "grad_norm": 0.3178749271830109, + "learning_rate": 6.151071004983601e-05, + "loss": 1.0426, + "step": 13472 + }, + { + "epoch": 1.29, + "grad_norm": 0.313835928190806, + "learning_rate": 6.149610975472858e-05, + "loss": 1.0836, + "step": 13473 + }, + { + "epoch": 1.29, + "grad_norm": 0.32278674890066755, + "learning_rate": 6.148151042320522e-05, + "loss": 1.0537, + "step": 13474 + }, + { + "epoch": 1.29, + "grad_norm": 0.2788926490276658, + "learning_rate": 6.146691205563128e-05, + "loss": 1.0487, + "step": 13475 + }, + { + "epoch": 1.29, + "grad_norm": 0.3204678458333768, + "learning_rate": 6.145231465237206e-05, + "loss": 0.9921, + "step": 13476 + }, + { + "epoch": 1.29, + "grad_norm": 0.3010025128812618, + "learning_rate": 6.143771821379291e-05, + "loss": 1.0155, + "step": 13477 + }, + { + "epoch": 1.29, + "grad_norm": 0.3348229465602061, + "learning_rate": 6.14231227402591e-05, + "loss": 0.9848, + "step": 13478 + }, + { + "epoch": 1.29, + "grad_norm": 0.34615756601831194, + "learning_rate": 6.140852823213585e-05, + "loss": 1.0841, + "step": 13479 + }, + { + "epoch": 1.29, + "grad_norm": 0.290458724537119, + "learning_rate": 6.139393468978849e-05, + "loss": 1.0424, + "step": 13480 + }, + { + "epoch": 1.29, + "grad_norm": 0.2867167537614711, + "learning_rate": 6.137934211358216e-05, + "loss": 1.1001, + "step": 13481 + }, + { + "epoch": 1.29, + "grad_norm": 0.35914852485893184, + "learning_rate": 6.136475050388204e-05, + "loss": 1.0888, + "step": 13482 + }, + { + "epoch": 1.29, + "grad_norm": 0.27690110894115216, + "learning_rate": 6.135015986105334e-05, + "loss": 1.0415, + "step": 13483 + }, + { + "epoch": 1.29, + "grad_norm": 0.294275001494397, + "learning_rate": 6.133557018546119e-05, + "loss": 1.007, + "step": 13484 + }, + { + "epoch": 1.29, + "grad_norm": 0.28332402828920883, + "learning_rate": 6.132098147747067e-05, + "loss": 0.9004, + "step": 13485 + }, + { + "epoch": 1.29, + "grad_norm": 0.3063807073556935, + "learning_rate": 6.130639373744689e-05, + "loss": 0.9566, + "step": 13486 + }, + { + "epoch": 1.29, + "grad_norm": 0.33338859620711125, + "learning_rate": 6.129180696575495e-05, + "loss": 1.0329, + "step": 13487 + }, + { + "epoch": 1.29, + "grad_norm": 0.32013174840283776, + "learning_rate": 6.127722116275986e-05, + "loss": 1.0355, + "step": 13488 + }, + { + "epoch": 1.29, + "grad_norm": 0.32922842814329517, + "learning_rate": 6.126263632882668e-05, + "loss": 0.9719, + "step": 13489 + }, + { + "epoch": 1.29, + "grad_norm": 0.3000291598915307, + "learning_rate": 6.124805246432034e-05, + "loss": 0.9715, + "step": 13490 + }, + { + "epoch": 1.29, + "grad_norm": 0.35669258015375566, + "learning_rate": 6.123346956960586e-05, + "loss": 1.0643, + "step": 13491 + }, + { + "epoch": 1.29, + "grad_norm": 0.31697576542483835, + "learning_rate": 6.121888764504814e-05, + "loss": 1.0548, + "step": 13492 + }, + { + "epoch": 1.29, + "grad_norm": 0.31822689984036334, + "learning_rate": 6.120430669101211e-05, + "loss": 1.0563, + "step": 13493 + }, + { + "epoch": 1.29, + "grad_norm": 0.3050599347519924, + "learning_rate": 6.118972670786272e-05, + "loss": 1.0605, + "step": 13494 + }, + { + "epoch": 1.29, + "grad_norm": 0.32943509536984444, + "learning_rate": 6.117514769596482e-05, + "loss": 1.1155, + "step": 13495 + }, + { + "epoch": 1.29, + "grad_norm": 0.28798464731875484, + "learning_rate": 6.116056965568324e-05, + "loss": 1.0507, + "step": 13496 + }, + { + "epoch": 1.29, + "grad_norm": 0.2888890754700727, + "learning_rate": 6.114599258738282e-05, + "loss": 0.9401, + "step": 13497 + }, + { + "epoch": 1.29, + "grad_norm": 0.33937455911579967, + "learning_rate": 6.113141649142836e-05, + "loss": 0.9778, + "step": 13498 + }, + { + "epoch": 1.29, + "grad_norm": 0.3053617234111141, + "learning_rate": 6.111684136818461e-05, + "loss": 0.9793, + "step": 13499 + }, + { + "epoch": 1.29, + "grad_norm": 0.30870357672251847, + "learning_rate": 6.110226721801634e-05, + "loss": 1.0813, + "step": 13500 + }, + { + "epoch": 1.29, + "grad_norm": 0.29850936915762943, + "learning_rate": 6.108769404128829e-05, + "loss": 0.9751, + "step": 13501 + }, + { + "epoch": 1.29, + "grad_norm": 0.35143672632106454, + "learning_rate": 6.107312183836515e-05, + "loss": 1.0362, + "step": 13502 + }, + { + "epoch": 1.29, + "grad_norm": 0.30567821753701674, + "learning_rate": 6.105855060961159e-05, + "loss": 1.046, + "step": 13503 + }, + { + "epoch": 1.29, + "grad_norm": 0.3303079790988385, + "learning_rate": 6.104398035539229e-05, + "loss": 1.0767, + "step": 13504 + }, + { + "epoch": 1.29, + "grad_norm": 0.32561024463830057, + "learning_rate": 6.102941107607187e-05, + "loss": 1.0508, + "step": 13505 + }, + { + "epoch": 1.29, + "grad_norm": 0.31352981469576374, + "learning_rate": 6.1014842772014924e-05, + "loss": 1.0611, + "step": 13506 + }, + { + "epoch": 1.29, + "grad_norm": 0.3310124279724771, + "learning_rate": 6.1000275443585995e-05, + "loss": 1.0442, + "step": 13507 + }, + { + "epoch": 1.29, + "grad_norm": 0.2807413152987664, + "learning_rate": 6.0985709091149736e-05, + "loss": 1.0629, + "step": 13508 + }, + { + "epoch": 1.29, + "grad_norm": 0.34139462813910343, + "learning_rate": 6.097114371507062e-05, + "loss": 0.9548, + "step": 13509 + }, + { + "epoch": 1.29, + "grad_norm": 0.2999481675917698, + "learning_rate": 6.095657931571317e-05, + "loss": 1.0284, + "step": 13510 + }, + { + "epoch": 1.29, + "grad_norm": 0.3395667781706842, + "learning_rate": 6.0942015893441836e-05, + "loss": 0.9792, + "step": 13511 + }, + { + "epoch": 1.29, + "grad_norm": 0.31120791044694857, + "learning_rate": 6.092745344862112e-05, + "loss": 1.0914, + "step": 13512 + }, + { + "epoch": 1.29, + "grad_norm": 0.268891381677966, + "learning_rate": 6.09128919816154e-05, + "loss": 1.1537, + "step": 13513 + }, + { + "epoch": 1.29, + "grad_norm": 0.31110610437530534, + "learning_rate": 6.089833149278916e-05, + "loss": 1.1222, + "step": 13514 + }, + { + "epoch": 1.29, + "grad_norm": 0.32784940139632723, + "learning_rate": 6.088377198250674e-05, + "loss": 0.9886, + "step": 13515 + }, + { + "epoch": 1.29, + "grad_norm": 0.29048459127810744, + "learning_rate": 6.0869213451132525e-05, + "loss": 1.0702, + "step": 13516 + }, + { + "epoch": 1.29, + "grad_norm": 0.35412299523676066, + "learning_rate": 6.085465589903083e-05, + "loss": 0.9408, + "step": 13517 + }, + { + "epoch": 1.29, + "grad_norm": 0.2947227744146206, + "learning_rate": 6.084009932656597e-05, + "loss": 1.1549, + "step": 13518 + }, + { + "epoch": 1.29, + "grad_norm": 0.35412712458955764, + "learning_rate": 6.0825543734102205e-05, + "loss": 1.0194, + "step": 13519 + }, + { + "epoch": 1.29, + "grad_norm": 0.33299068845337426, + "learning_rate": 6.081098912200385e-05, + "loss": 1.0737, + "step": 13520 + }, + { + "epoch": 1.29, + "grad_norm": 0.31868761436004633, + "learning_rate": 6.079643549063514e-05, + "loss": 0.935, + "step": 13521 + }, + { + "epoch": 1.29, + "grad_norm": 0.28222036861619754, + "learning_rate": 6.078188284036026e-05, + "loss": 0.9751, + "step": 13522 + }, + { + "epoch": 1.29, + "grad_norm": 0.28240814042498813, + "learning_rate": 6.0767331171543384e-05, + "loss": 1.0835, + "step": 13523 + }, + { + "epoch": 1.29, + "grad_norm": 0.26582465678953177, + "learning_rate": 6.075278048454873e-05, + "loss": 1.002, + "step": 13524 + }, + { + "epoch": 1.29, + "grad_norm": 0.31534335711281664, + "learning_rate": 6.073823077974041e-05, + "loss": 1.0143, + "step": 13525 + }, + { + "epoch": 1.29, + "grad_norm": 0.3067604204283215, + "learning_rate": 6.072368205748254e-05, + "loss": 0.959, + "step": 13526 + }, + { + "epoch": 1.29, + "grad_norm": 0.3076414284158878, + "learning_rate": 6.070913431813917e-05, + "loss": 1.0217, + "step": 13527 + }, + { + "epoch": 1.29, + "grad_norm": 0.3144893288853825, + "learning_rate": 6.0694587562074446e-05, + "loss": 1.1028, + "step": 13528 + }, + { + "epoch": 1.29, + "grad_norm": 0.3077943367632876, + "learning_rate": 6.0680041789652375e-05, + "loss": 1.0829, + "step": 13529 + }, + { + "epoch": 1.29, + "grad_norm": 0.27190789887639194, + "learning_rate": 6.066549700123697e-05, + "loss": 0.9614, + "step": 13530 + }, + { + "epoch": 1.29, + "grad_norm": 0.3003779844292193, + "learning_rate": 6.065095319719223e-05, + "loss": 1.0541, + "step": 13531 + }, + { + "epoch": 1.29, + "grad_norm": 0.3021646039012606, + "learning_rate": 6.06364103778821e-05, + "loss": 0.982, + "step": 13532 + }, + { + "epoch": 1.29, + "grad_norm": 0.3681260854758522, + "learning_rate": 6.0621868543670546e-05, + "loss": 0.9544, + "step": 13533 + }, + { + "epoch": 1.29, + "grad_norm": 0.27255310220080425, + "learning_rate": 6.0607327694921434e-05, + "loss": 1.0909, + "step": 13534 + }, + { + "epoch": 1.29, + "grad_norm": 0.31583740680571976, + "learning_rate": 6.0592787831998754e-05, + "loss": 0.9948, + "step": 13535 + }, + { + "epoch": 1.3, + "grad_norm": 0.2857506053876081, + "learning_rate": 6.0578248955266316e-05, + "loss": 1.0044, + "step": 13536 + }, + { + "epoch": 1.3, + "grad_norm": 0.3192520418189311, + "learning_rate": 6.056371106508798e-05, + "loss": 0.9872, + "step": 13537 + }, + { + "epoch": 1.3, + "grad_norm": 0.27841998223781655, + "learning_rate": 6.054917416182757e-05, + "loss": 0.9211, + "step": 13538 + }, + { + "epoch": 1.3, + "grad_norm": 0.28884480611925883, + "learning_rate": 6.053463824584885e-05, + "loss": 1.1376, + "step": 13539 + }, + { + "epoch": 1.3, + "grad_norm": 0.2621365836034702, + "learning_rate": 6.0520103317515584e-05, + "loss": 1.0465, + "step": 13540 + }, + { + "epoch": 1.3, + "grad_norm": 0.32248348679479777, + "learning_rate": 6.0505569377191585e-05, + "loss": 1.0348, + "step": 13541 + }, + { + "epoch": 1.3, + "grad_norm": 0.3167824805463549, + "learning_rate": 6.0491036425240524e-05, + "loss": 1.0694, + "step": 13542 + }, + { + "epoch": 1.3, + "grad_norm": 0.2959404962886227, + "learning_rate": 6.047650446202611e-05, + "loss": 1.011, + "step": 13543 + }, + { + "epoch": 1.3, + "grad_norm": 0.343776309947398, + "learning_rate": 6.0461973487911995e-05, + "loss": 1.0316, + "step": 13544 + }, + { + "epoch": 1.3, + "grad_norm": 0.27576171931990023, + "learning_rate": 6.044744350326186e-05, + "loss": 0.9416, + "step": 13545 + }, + { + "epoch": 1.3, + "grad_norm": 0.336705784219406, + "learning_rate": 6.0432914508439306e-05, + "loss": 1.0389, + "step": 13546 + }, + { + "epoch": 1.3, + "grad_norm": 0.25623672874883213, + "learning_rate": 6.041838650380791e-05, + "loss": 1.1208, + "step": 13547 + }, + { + "epoch": 1.3, + "grad_norm": 0.2790687740739954, + "learning_rate": 6.040385948973128e-05, + "loss": 1.0436, + "step": 13548 + }, + { + "epoch": 1.3, + "grad_norm": 0.307419974937371, + "learning_rate": 6.038933346657297e-05, + "loss": 1.0333, + "step": 13549 + }, + { + "epoch": 1.3, + "grad_norm": 0.3110381000804843, + "learning_rate": 6.037480843469648e-05, + "loss": 1.0958, + "step": 13550 + }, + { + "epoch": 1.3, + "grad_norm": 0.3083093386274324, + "learning_rate": 6.036028439446532e-05, + "loss": 1.0978, + "step": 13551 + }, + { + "epoch": 1.3, + "grad_norm": 0.2776712385333701, + "learning_rate": 6.0345761346242946e-05, + "loss": 0.9628, + "step": 13552 + }, + { + "epoch": 1.3, + "grad_norm": 0.3064493864077216, + "learning_rate": 6.0331239290392814e-05, + "loss": 1.0229, + "step": 13553 + }, + { + "epoch": 1.3, + "grad_norm": 0.31073566223381166, + "learning_rate": 6.0316718227278314e-05, + "loss": 1.046, + "step": 13554 + }, + { + "epoch": 1.3, + "grad_norm": 0.3239211934425204, + "learning_rate": 6.030219815726292e-05, + "loss": 0.9803, + "step": 13555 + }, + { + "epoch": 1.3, + "grad_norm": 0.3395265237734337, + "learning_rate": 6.0287679080709966e-05, + "loss": 1.0015, + "step": 13556 + }, + { + "epoch": 1.3, + "grad_norm": 0.29286466293590663, + "learning_rate": 6.027316099798281e-05, + "loss": 0.903, + "step": 13557 + }, + { + "epoch": 1.3, + "grad_norm": 0.300204089985078, + "learning_rate": 6.025864390944476e-05, + "loss": 0.9985, + "step": 13558 + }, + { + "epoch": 1.3, + "grad_norm": 0.2893008451878455, + "learning_rate": 6.024412781545912e-05, + "loss": 1.0645, + "step": 13559 + }, + { + "epoch": 1.3, + "grad_norm": 0.344539371497186, + "learning_rate": 6.022961271638915e-05, + "loss": 1.0347, + "step": 13560 + }, + { + "epoch": 1.3, + "grad_norm": 0.32381474080721895, + "learning_rate": 6.021509861259811e-05, + "loss": 0.9131, + "step": 13561 + }, + { + "epoch": 1.3, + "grad_norm": 0.29835208713743316, + "learning_rate": 6.020058550444927e-05, + "loss": 0.9891, + "step": 13562 + }, + { + "epoch": 1.3, + "grad_norm": 0.32870772510171165, + "learning_rate": 6.018607339230577e-05, + "loss": 0.944, + "step": 13563 + }, + { + "epoch": 1.3, + "grad_norm": 0.3604754063044376, + "learning_rate": 6.01715622765308e-05, + "loss": 1.0832, + "step": 13564 + }, + { + "epoch": 1.3, + "grad_norm": 0.26934763758420677, + "learning_rate": 6.0157052157487524e-05, + "loss": 1.0848, + "step": 13565 + }, + { + "epoch": 1.3, + "grad_norm": 0.26758666029006944, + "learning_rate": 6.0142543035539056e-05, + "loss": 0.9615, + "step": 13566 + }, + { + "epoch": 1.3, + "grad_norm": 0.30975077223713854, + "learning_rate": 6.0128034911048517e-05, + "loss": 1.1076, + "step": 13567 + }, + { + "epoch": 1.3, + "grad_norm": 0.2953810627900647, + "learning_rate": 6.011352778437891e-05, + "loss": 1.0551, + "step": 13568 + }, + { + "epoch": 1.3, + "grad_norm": 0.3350127263074774, + "learning_rate": 6.0099021655893375e-05, + "loss": 0.9668, + "step": 13569 + }, + { + "epoch": 1.3, + "grad_norm": 0.32906863893573357, + "learning_rate": 6.0084516525954904e-05, + "loss": 0.9749, + "step": 13570 + }, + { + "epoch": 1.3, + "grad_norm": 0.3017949306262433, + "learning_rate": 6.007001239492649e-05, + "loss": 1.1479, + "step": 13571 + }, + { + "epoch": 1.3, + "grad_norm": 0.31734503169063466, + "learning_rate": 6.0055509263171117e-05, + "loss": 1.0154, + "step": 13572 + }, + { + "epoch": 1.3, + "grad_norm": 0.358169668971772, + "learning_rate": 6.004100713105172e-05, + "loss": 0.9639, + "step": 13573 + }, + { + "epoch": 1.3, + "grad_norm": 0.34390244121012314, + "learning_rate": 6.00265059989312e-05, + "loss": 1.1001, + "step": 13574 + }, + { + "epoch": 1.3, + "grad_norm": 0.30214145589538904, + "learning_rate": 6.0012005867172525e-05, + "loss": 0.9698, + "step": 13575 + }, + { + "epoch": 1.3, + "grad_norm": 0.31972196897946425, + "learning_rate": 5.999750673613854e-05, + "loss": 1.0577, + "step": 13576 + }, + { + "epoch": 1.3, + "grad_norm": 0.31545814369340336, + "learning_rate": 5.998300860619208e-05, + "loss": 0.9717, + "step": 13577 + }, + { + "epoch": 1.3, + "grad_norm": 0.30316686191820136, + "learning_rate": 5.9968511477696e-05, + "loss": 1.0714, + "step": 13578 + }, + { + "epoch": 1.3, + "grad_norm": 0.27482920373197045, + "learning_rate": 5.995401535101306e-05, + "loss": 1.0369, + "step": 13579 + }, + { + "epoch": 1.3, + "grad_norm": 0.28025009449843985, + "learning_rate": 5.993952022650606e-05, + "loss": 0.9217, + "step": 13580 + }, + { + "epoch": 1.3, + "grad_norm": 0.30058194401057775, + "learning_rate": 5.9925026104537693e-05, + "loss": 0.8773, + "step": 13581 + }, + { + "epoch": 1.3, + "grad_norm": 0.3184862561597624, + "learning_rate": 5.991053298547079e-05, + "loss": 1.0006, + "step": 13582 + }, + { + "epoch": 1.3, + "grad_norm": 0.3660134086378921, + "learning_rate": 5.989604086966799e-05, + "loss": 1.0557, + "step": 13583 + }, + { + "epoch": 1.3, + "grad_norm": 0.3382105432037931, + "learning_rate": 5.988154975749197e-05, + "loss": 1.052, + "step": 13584 + }, + { + "epoch": 1.3, + "grad_norm": 0.3143046368322412, + "learning_rate": 5.986705964930537e-05, + "loss": 1.1205, + "step": 13585 + }, + { + "epoch": 1.3, + "grad_norm": 0.32785750134242114, + "learning_rate": 5.985257054547082e-05, + "loss": 1.0427, + "step": 13586 + }, + { + "epoch": 1.3, + "grad_norm": 0.27524721365245397, + "learning_rate": 5.983808244635094e-05, + "loss": 0.9989, + "step": 13587 + }, + { + "epoch": 1.3, + "grad_norm": 0.3205314122451055, + "learning_rate": 5.982359535230827e-05, + "loss": 1.0776, + "step": 13588 + }, + { + "epoch": 1.3, + "grad_norm": 0.31567272554298786, + "learning_rate": 5.980910926370538e-05, + "loss": 0.981, + "step": 13589 + }, + { + "epoch": 1.3, + "grad_norm": 0.37184253815417806, + "learning_rate": 5.97946241809048e-05, + "loss": 1.0848, + "step": 13590 + }, + { + "epoch": 1.3, + "grad_norm": 0.34621875793845364, + "learning_rate": 5.978014010426902e-05, + "loss": 1.0837, + "step": 13591 + }, + { + "epoch": 1.3, + "grad_norm": 0.31082129368814826, + "learning_rate": 5.9765657034160505e-05, + "loss": 0.9886, + "step": 13592 + }, + { + "epoch": 1.3, + "grad_norm": 0.34204660931703473, + "learning_rate": 5.975117497094172e-05, + "loss": 1.057, + "step": 13593 + }, + { + "epoch": 1.3, + "grad_norm": 0.3402812523573254, + "learning_rate": 5.9736693914975074e-05, + "loss": 1.0257, + "step": 13594 + }, + { + "epoch": 1.3, + "grad_norm": 0.29481199476560166, + "learning_rate": 5.972221386662292e-05, + "loss": 0.8726, + "step": 13595 + }, + { + "epoch": 1.3, + "grad_norm": 0.3377266126481497, + "learning_rate": 5.970773482624773e-05, + "loss": 1.0942, + "step": 13596 + }, + { + "epoch": 1.3, + "grad_norm": 0.3419060907135372, + "learning_rate": 5.9693256794211796e-05, + "loss": 1.0005, + "step": 13597 + }, + { + "epoch": 1.3, + "grad_norm": 0.3331717204590433, + "learning_rate": 5.967877977087743e-05, + "loss": 1.0173, + "step": 13598 + }, + { + "epoch": 1.3, + "grad_norm": 0.34243313473811193, + "learning_rate": 5.9664303756606954e-05, + "loss": 0.9683, + "step": 13599 + }, + { + "epoch": 1.3, + "grad_norm": 0.3177355479412364, + "learning_rate": 5.9649828751762614e-05, + "loss": 1.0014, + "step": 13600 + }, + { + "epoch": 1.3, + "grad_norm": 0.33090559825672444, + "learning_rate": 5.963535475670665e-05, + "loss": 1.1014, + "step": 13601 + }, + { + "epoch": 1.3, + "grad_norm": 0.3062538948847134, + "learning_rate": 5.9620881771801316e-05, + "loss": 1.0826, + "step": 13602 + }, + { + "epoch": 1.3, + "grad_norm": 0.32872325077637954, + "learning_rate": 5.9606409797408814e-05, + "loss": 1.1035, + "step": 13603 + }, + { + "epoch": 1.3, + "grad_norm": 0.3401242805056878, + "learning_rate": 5.959193883389129e-05, + "loss": 0.9656, + "step": 13604 + }, + { + "epoch": 1.3, + "grad_norm": 0.303528913790367, + "learning_rate": 5.957746888161087e-05, + "loss": 0.9202, + "step": 13605 + }, + { + "epoch": 1.3, + "grad_norm": 0.2655626119965656, + "learning_rate": 5.956299994092971e-05, + "loss": 0.9378, + "step": 13606 + }, + { + "epoch": 1.3, + "grad_norm": 0.29712035545902504, + "learning_rate": 5.954853201220989e-05, + "loss": 1.1963, + "step": 13607 + }, + { + "epoch": 1.3, + "grad_norm": 0.29878331965042354, + "learning_rate": 5.953406509581346e-05, + "loss": 1.0879, + "step": 13608 + }, + { + "epoch": 1.3, + "grad_norm": 0.28703453113727223, + "learning_rate": 5.951959919210248e-05, + "loss": 1.1262, + "step": 13609 + }, + { + "epoch": 1.3, + "grad_norm": 0.2949616217881919, + "learning_rate": 5.9505134301439e-05, + "loss": 1.0623, + "step": 13610 + }, + { + "epoch": 1.3, + "grad_norm": 0.3312919749964221, + "learning_rate": 5.9490670424184993e-05, + "loss": 0.916, + "step": 13611 + }, + { + "epoch": 1.3, + "grad_norm": 0.30634390097199393, + "learning_rate": 5.94762075607024e-05, + "loss": 1.0581, + "step": 13612 + }, + { + "epoch": 1.3, + "grad_norm": 0.3104569583334209, + "learning_rate": 5.946174571135319e-05, + "loss": 0.8879, + "step": 13613 + }, + { + "epoch": 1.3, + "grad_norm": 0.31369570668370816, + "learning_rate": 5.9447284876499264e-05, + "loss": 1.0554, + "step": 13614 + }, + { + "epoch": 1.3, + "grad_norm": 0.3022058380125859, + "learning_rate": 5.943282505650247e-05, + "loss": 1.0097, + "step": 13615 + }, + { + "epoch": 1.3, + "grad_norm": 0.3285789832527581, + "learning_rate": 5.941836625172479e-05, + "loss": 0.9732, + "step": 13616 + }, + { + "epoch": 1.3, + "grad_norm": 0.33044941497582764, + "learning_rate": 5.940390846252799e-05, + "loss": 1.0568, + "step": 13617 + }, + { + "epoch": 1.3, + "grad_norm": 0.26471366416045516, + "learning_rate": 5.938945168927388e-05, + "loss": 0.9762, + "step": 13618 + }, + { + "epoch": 1.3, + "grad_norm": 0.3718378059116555, + "learning_rate": 5.9374995932324286e-05, + "loss": 0.9183, + "step": 13619 + }, + { + "epoch": 1.3, + "grad_norm": 0.30588270747806195, + "learning_rate": 5.936054119204093e-05, + "loss": 0.9371, + "step": 13620 + }, + { + "epoch": 1.3, + "grad_norm": 0.31872976569580386, + "learning_rate": 5.934608746878558e-05, + "loss": 1.0477, + "step": 13621 + }, + { + "epoch": 1.3, + "grad_norm": 0.32708776237896264, + "learning_rate": 5.933163476291992e-05, + "loss": 1.0112, + "step": 13622 + }, + { + "epoch": 1.3, + "grad_norm": 0.3532209426287805, + "learning_rate": 5.931718307480567e-05, + "loss": 1.0483, + "step": 13623 + }, + { + "epoch": 1.3, + "grad_norm": 0.30714873057145337, + "learning_rate": 5.9302732404804504e-05, + "loss": 1.1078, + "step": 13624 + }, + { + "epoch": 1.3, + "grad_norm": 0.3328046110899251, + "learning_rate": 5.9288282753278025e-05, + "loss": 1.036, + "step": 13625 + }, + { + "epoch": 1.3, + "grad_norm": 0.3195649918792432, + "learning_rate": 5.927383412058783e-05, + "loss": 1.0596, + "step": 13626 + }, + { + "epoch": 1.3, + "grad_norm": 0.3102104060447815, + "learning_rate": 5.925938650709557e-05, + "loss": 1.0474, + "step": 13627 + }, + { + "epoch": 1.3, + "grad_norm": 0.31708291716972264, + "learning_rate": 5.9244939913162753e-05, + "loss": 0.8843, + "step": 13628 + }, + { + "epoch": 1.3, + "grad_norm": 0.27867646265102813, + "learning_rate": 5.923049433915091e-05, + "loss": 1.0432, + "step": 13629 + }, + { + "epoch": 1.3, + "grad_norm": 0.32646270802984945, + "learning_rate": 5.921604978542157e-05, + "loss": 1.048, + "step": 13630 + }, + { + "epoch": 1.3, + "grad_norm": 0.28273553216281677, + "learning_rate": 5.920160625233624e-05, + "loss": 1.1098, + "step": 13631 + }, + { + "epoch": 1.3, + "grad_norm": 0.2947350625808941, + "learning_rate": 5.918716374025635e-05, + "loss": 1.0068, + "step": 13632 + }, + { + "epoch": 1.3, + "grad_norm": 0.28117460790289905, + "learning_rate": 5.9172722249543344e-05, + "loss": 1.0803, + "step": 13633 + }, + { + "epoch": 1.3, + "grad_norm": 0.3321154301320244, + "learning_rate": 5.915828178055862e-05, + "loss": 0.9001, + "step": 13634 + }, + { + "epoch": 1.3, + "grad_norm": 0.3190651167587497, + "learning_rate": 5.9143842333663526e-05, + "loss": 0.9995, + "step": 13635 + }, + { + "epoch": 1.3, + "grad_norm": 0.2876177997119053, + "learning_rate": 5.912940390921948e-05, + "loss": 0.9873, + "step": 13636 + }, + { + "epoch": 1.3, + "grad_norm": 0.33249691782947477, + "learning_rate": 5.9114966507587786e-05, + "loss": 0.9789, + "step": 13637 + }, + { + "epoch": 1.3, + "grad_norm": 0.3001782720435861, + "learning_rate": 5.910053012912977e-05, + "loss": 1.0542, + "step": 13638 + }, + { + "epoch": 1.3, + "grad_norm": 0.3067904594034863, + "learning_rate": 5.908609477420668e-05, + "loss": 1.0163, + "step": 13639 + }, + { + "epoch": 1.3, + "grad_norm": 0.3026752936231184, + "learning_rate": 5.907166044317979e-05, + "loss": 1.0061, + "step": 13640 + }, + { + "epoch": 1.31, + "grad_norm": 0.3016197478051115, + "learning_rate": 5.905722713641031e-05, + "loss": 0.9543, + "step": 13641 + }, + { + "epoch": 1.31, + "grad_norm": 0.2901019597943787, + "learning_rate": 5.904279485425942e-05, + "loss": 1.1153, + "step": 13642 + }, + { + "epoch": 1.31, + "grad_norm": 0.3117349012552433, + "learning_rate": 5.902836359708836e-05, + "loss": 1.0272, + "step": 13643 + }, + { + "epoch": 1.31, + "grad_norm": 0.28696620172284415, + "learning_rate": 5.901393336525827e-05, + "loss": 0.9204, + "step": 13644 + }, + { + "epoch": 1.31, + "grad_norm": 0.32342860396583045, + "learning_rate": 5.899950415913024e-05, + "loss": 1.0558, + "step": 13645 + }, + { + "epoch": 1.31, + "grad_norm": 0.2917464923084643, + "learning_rate": 5.8985075979065384e-05, + "loss": 1.029, + "step": 13646 + }, + { + "epoch": 1.31, + "grad_norm": 0.33311077916959003, + "learning_rate": 5.8970648825424756e-05, + "loss": 0.9803, + "step": 13647 + }, + { + "epoch": 1.31, + "grad_norm": 0.29958212533934203, + "learning_rate": 5.895622269856946e-05, + "loss": 1.0029, + "step": 13648 + }, + { + "epoch": 1.31, + "grad_norm": 0.30010203984232997, + "learning_rate": 5.894179759886047e-05, + "loss": 0.9786, + "step": 13649 + }, + { + "epoch": 1.31, + "grad_norm": 0.30906967178331307, + "learning_rate": 5.892737352665878e-05, + "loss": 0.977, + "step": 13650 + }, + { + "epoch": 1.31, + "grad_norm": 0.3064818145851665, + "learning_rate": 5.8912950482325415e-05, + "loss": 1.0313, + "step": 13651 + }, + { + "epoch": 1.31, + "grad_norm": 0.3347338724525073, + "learning_rate": 5.889852846622128e-05, + "loss": 1.0908, + "step": 13652 + }, + { + "epoch": 1.31, + "grad_norm": 0.30404469419506414, + "learning_rate": 5.8884107478707304e-05, + "loss": 0.9177, + "step": 13653 + }, + { + "epoch": 1.31, + "grad_norm": 0.3196262524995382, + "learning_rate": 5.886968752014438e-05, + "loss": 0.918, + "step": 13654 + }, + { + "epoch": 1.31, + "grad_norm": 0.3205761508906189, + "learning_rate": 5.8855268590893373e-05, + "loss": 1.0714, + "step": 13655 + }, + { + "epoch": 1.31, + "grad_norm": 0.3007061184090071, + "learning_rate": 5.8840850691315085e-05, + "loss": 1.0737, + "step": 13656 + }, + { + "epoch": 1.31, + "grad_norm": 0.3099479969360668, + "learning_rate": 5.882643382177042e-05, + "loss": 1.1404, + "step": 13657 + }, + { + "epoch": 1.31, + "grad_norm": 0.32000429677287257, + "learning_rate": 5.8812017982620125e-05, + "loss": 1.0713, + "step": 13658 + }, + { + "epoch": 1.31, + "grad_norm": 0.3198128960142032, + "learning_rate": 5.879760317422497e-05, + "loss": 0.9326, + "step": 13659 + }, + { + "epoch": 1.31, + "grad_norm": 0.3222793584439414, + "learning_rate": 5.8783189396945694e-05, + "loss": 0.9933, + "step": 13660 + }, + { + "epoch": 1.31, + "grad_norm": 0.43049232204952703, + "learning_rate": 5.8768776651143e-05, + "loss": 1.0403, + "step": 13661 + }, + { + "epoch": 1.31, + "grad_norm": 0.296162616851968, + "learning_rate": 5.875436493717753e-05, + "loss": 1.0743, + "step": 13662 + }, + { + "epoch": 1.31, + "grad_norm": 0.3322689647626359, + "learning_rate": 5.873995425541006e-05, + "loss": 1.0582, + "step": 13663 + }, + { + "epoch": 1.31, + "grad_norm": 0.30300844569623925, + "learning_rate": 5.872554460620116e-05, + "loss": 1.1863, + "step": 13664 + }, + { + "epoch": 1.31, + "grad_norm": 0.31260882882307806, + "learning_rate": 5.871113598991144e-05, + "loss": 1.0895, + "step": 13665 + }, + { + "epoch": 1.31, + "grad_norm": 0.3098702624229806, + "learning_rate": 5.869672840690149e-05, + "loss": 1.0131, + "step": 13666 + }, + { + "epoch": 1.31, + "grad_norm": 0.2860408986375777, + "learning_rate": 5.868232185753185e-05, + "loss": 0.9936, + "step": 13667 + }, + { + "epoch": 1.31, + "grad_norm": 0.3500225548060834, + "learning_rate": 5.866791634216311e-05, + "loss": 0.9331, + "step": 13668 + }, + { + "epoch": 1.31, + "grad_norm": 0.28434196260810884, + "learning_rate": 5.86535118611557e-05, + "loss": 1.029, + "step": 13669 + }, + { + "epoch": 1.31, + "grad_norm": 0.30995711089044264, + "learning_rate": 5.863910841487017e-05, + "loss": 1.0576, + "step": 13670 + }, + { + "epoch": 1.31, + "grad_norm": 0.32739774767325525, + "learning_rate": 5.862470600366692e-05, + "loss": 0.9601, + "step": 13671 + }, + { + "epoch": 1.31, + "grad_norm": 0.27482915592837626, + "learning_rate": 5.861030462790643e-05, + "loss": 1.0029, + "step": 13672 + }, + { + "epoch": 1.31, + "grad_norm": 0.29408511347436717, + "learning_rate": 5.859590428794908e-05, + "loss": 0.9769, + "step": 13673 + }, + { + "epoch": 1.31, + "grad_norm": 0.331283834826914, + "learning_rate": 5.858150498415525e-05, + "loss": 1.0844, + "step": 13674 + }, + { + "epoch": 1.31, + "grad_norm": 0.31235162433756786, + "learning_rate": 5.856710671688528e-05, + "loss": 1.0348, + "step": 13675 + }, + { + "epoch": 1.31, + "grad_norm": 0.34377923888608936, + "learning_rate": 5.855270948649947e-05, + "loss": 1.1007, + "step": 13676 + }, + { + "epoch": 1.31, + "grad_norm": 0.33344998306965096, + "learning_rate": 5.8538313293358196e-05, + "loss": 1.0778, + "step": 13677 + }, + { + "epoch": 1.31, + "grad_norm": 0.32125503582684983, + "learning_rate": 5.852391813782168e-05, + "loss": 0.9647, + "step": 13678 + }, + { + "epoch": 1.31, + "grad_norm": 0.2743328867438318, + "learning_rate": 5.8509524020250184e-05, + "loss": 0.9961, + "step": 13679 + }, + { + "epoch": 1.31, + "grad_norm": 0.33821070773159095, + "learning_rate": 5.849513094100392e-05, + "loss": 1.0122, + "step": 13680 + }, + { + "epoch": 1.31, + "grad_norm": 0.37196787128829917, + "learning_rate": 5.84807389004431e-05, + "loss": 0.9033, + "step": 13681 + }, + { + "epoch": 1.31, + "grad_norm": 0.2885750253779786, + "learning_rate": 5.846634789892787e-05, + "loss": 0.9716, + "step": 13682 + }, + { + "epoch": 1.31, + "grad_norm": 0.34220525237593835, + "learning_rate": 5.8451957936818356e-05, + "loss": 1.1822, + "step": 13683 + }, + { + "epoch": 1.31, + "grad_norm": 0.2977946528490143, + "learning_rate": 5.843756901447475e-05, + "loss": 1.0505, + "step": 13684 + }, + { + "epoch": 1.31, + "grad_norm": 0.27862950021222904, + "learning_rate": 5.842318113225709e-05, + "loss": 0.9036, + "step": 13685 + }, + { + "epoch": 1.31, + "grad_norm": 0.25826033382206137, + "learning_rate": 5.8408794290525456e-05, + "loss": 1.0024, + "step": 13686 + }, + { + "epoch": 1.31, + "grad_norm": 0.2975983790863633, + "learning_rate": 5.8394408489639884e-05, + "loss": 1.1138, + "step": 13687 + }, + { + "epoch": 1.31, + "grad_norm": 0.3126701537206989, + "learning_rate": 5.83800237299604e-05, + "loss": 1.0041, + "step": 13688 + }, + { + "epoch": 1.31, + "grad_norm": 0.29700061229586616, + "learning_rate": 5.836564001184695e-05, + "loss": 1.0582, + "step": 13689 + }, + { + "epoch": 1.31, + "grad_norm": 0.3032022710619518, + "learning_rate": 5.835125733565951e-05, + "loss": 0.9616, + "step": 13690 + }, + { + "epoch": 1.31, + "grad_norm": 0.32397394498157384, + "learning_rate": 5.8336875701758054e-05, + "loss": 1.0333, + "step": 13691 + }, + { + "epoch": 1.31, + "grad_norm": 0.33809047007871157, + "learning_rate": 5.83224951105025e-05, + "loss": 0.9995, + "step": 13692 + }, + { + "epoch": 1.31, + "grad_norm": 0.35190299449985196, + "learning_rate": 5.830811556225265e-05, + "loss": 1.0006, + "step": 13693 + }, + { + "epoch": 1.31, + "grad_norm": 0.3373486112757616, + "learning_rate": 5.829373705736846e-05, + "loss": 0.9068, + "step": 13694 + }, + { + "epoch": 1.31, + "grad_norm": 0.3144604652192984, + "learning_rate": 5.827935959620967e-05, + "loss": 1.0055, + "step": 13695 + }, + { + "epoch": 1.31, + "grad_norm": 0.29159479287631507, + "learning_rate": 5.8264983179136124e-05, + "loss": 0.979, + "step": 13696 + }, + { + "epoch": 1.31, + "grad_norm": 0.3374680840915462, + "learning_rate": 5.8250607806507605e-05, + "loss": 1.0608, + "step": 13697 + }, + { + "epoch": 1.31, + "grad_norm": 0.33776957355873044, + "learning_rate": 5.82362334786839e-05, + "loss": 1.0471, + "step": 13698 + }, + { + "epoch": 1.31, + "grad_norm": 0.3405721911935171, + "learning_rate": 5.822186019602467e-05, + "loss": 1.0206, + "step": 13699 + }, + { + "epoch": 1.31, + "grad_norm": 0.31895640795178715, + "learning_rate": 5.820748795888964e-05, + "loss": 1.0765, + "step": 13700 + }, + { + "epoch": 1.31, + "grad_norm": 0.2813743459842793, + "learning_rate": 5.8193116767638524e-05, + "loss": 0.9736, + "step": 13701 + }, + { + "epoch": 1.31, + "grad_norm": 0.29426074160372934, + "learning_rate": 5.81787466226309e-05, + "loss": 0.9856, + "step": 13702 + }, + { + "epoch": 1.31, + "grad_norm": 0.32207489064197253, + "learning_rate": 5.816437752422643e-05, + "loss": 1.0663, + "step": 13703 + }, + { + "epoch": 1.31, + "grad_norm": 0.35428780895296036, + "learning_rate": 5.81500094727847e-05, + "loss": 1.041, + "step": 13704 + }, + { + "epoch": 1.31, + "grad_norm": 0.32302501327998173, + "learning_rate": 5.8135642468665316e-05, + "loss": 0.9619, + "step": 13705 + }, + { + "epoch": 1.31, + "grad_norm": 0.3345676144413117, + "learning_rate": 5.812127651222776e-05, + "loss": 0.9857, + "step": 13706 + }, + { + "epoch": 1.31, + "grad_norm": 0.28162591548347443, + "learning_rate": 5.810691160383163e-05, + "loss": 0.9463, + "step": 13707 + }, + { + "epoch": 1.31, + "grad_norm": 0.30258265772577037, + "learning_rate": 5.80925477438363e-05, + "loss": 0.8712, + "step": 13708 + }, + { + "epoch": 1.31, + "grad_norm": 0.2758473124177535, + "learning_rate": 5.8078184932601366e-05, + "loss": 1.0028, + "step": 13709 + }, + { + "epoch": 1.31, + "grad_norm": 0.30648643947110416, + "learning_rate": 5.806382317048611e-05, + "loss": 1.0271, + "step": 13710 + }, + { + "epoch": 1.31, + "grad_norm": 0.3365100996700143, + "learning_rate": 5.804946245785011e-05, + "loss": 1.1265, + "step": 13711 + }, + { + "epoch": 1.31, + "grad_norm": 0.324891039584826, + "learning_rate": 5.8035102795052644e-05, + "loss": 1.0343, + "step": 13712 + }, + { + "epoch": 1.31, + "grad_norm": 0.3503100656514255, + "learning_rate": 5.802074418245313e-05, + "loss": 1.0898, + "step": 13713 + }, + { + "epoch": 1.31, + "grad_norm": 0.29321468608140244, + "learning_rate": 5.800638662041086e-05, + "loss": 1.0558, + "step": 13714 + }, + { + "epoch": 1.31, + "grad_norm": 0.29699947981762637, + "learning_rate": 5.799203010928518e-05, + "loss": 1.0501, + "step": 13715 + }, + { + "epoch": 1.31, + "grad_norm": 0.3342895368686125, + "learning_rate": 5.797767464943531e-05, + "loss": 0.9389, + "step": 13716 + }, + { + "epoch": 1.31, + "grad_norm": 0.3110277192173238, + "learning_rate": 5.796332024122055e-05, + "loss": 1.0238, + "step": 13717 + }, + { + "epoch": 1.31, + "grad_norm": 0.31105924664547424, + "learning_rate": 5.794896688500011e-05, + "loss": 1.1088, + "step": 13718 + }, + { + "epoch": 1.31, + "grad_norm": 0.26959911519719176, + "learning_rate": 5.793461458113325e-05, + "loss": 1.0304, + "step": 13719 + }, + { + "epoch": 1.31, + "grad_norm": 0.313909770439909, + "learning_rate": 5.7920263329979066e-05, + "loss": 0.9939, + "step": 13720 + }, + { + "epoch": 1.31, + "grad_norm": 0.37558012945934677, + "learning_rate": 5.790591313189672e-05, + "loss": 1.1126, + "step": 13721 + }, + { + "epoch": 1.31, + "grad_norm": 0.30079083403146334, + "learning_rate": 5.78915639872454e-05, + "loss": 1.1522, + "step": 13722 + }, + { + "epoch": 1.31, + "grad_norm": 0.35137532893757595, + "learning_rate": 5.7877215896384105e-05, + "loss": 1.0256, + "step": 13723 + }, + { + "epoch": 1.31, + "grad_norm": 0.3253820983438573, + "learning_rate": 5.7862868859671957e-05, + "loss": 1.0104, + "step": 13724 + }, + { + "epoch": 1.31, + "grad_norm": 0.29720978984435115, + "learning_rate": 5.7848522877468014e-05, + "loss": 0.9805, + "step": 13725 + }, + { + "epoch": 1.31, + "grad_norm": 0.31261323468245905, + "learning_rate": 5.783417795013131e-05, + "loss": 1.0461, + "step": 13726 + }, + { + "epoch": 1.31, + "grad_norm": 0.32178937450832296, + "learning_rate": 5.7819834078020765e-05, + "loss": 1.1596, + "step": 13727 + }, + { + "epoch": 1.31, + "grad_norm": 0.30370028512567, + "learning_rate": 5.780549126149542e-05, + "loss": 1.0929, + "step": 13728 + }, + { + "epoch": 1.31, + "grad_norm": 0.2706360878812979, + "learning_rate": 5.779114950091413e-05, + "loss": 0.8463, + "step": 13729 + }, + { + "epoch": 1.31, + "grad_norm": 0.2827140163451032, + "learning_rate": 5.777680879663585e-05, + "loss": 1.04, + "step": 13730 + }, + { + "epoch": 1.31, + "grad_norm": 0.31652263532319247, + "learning_rate": 5.776246914901947e-05, + "loss": 1.0164, + "step": 13731 + }, + { + "epoch": 1.31, + "grad_norm": 0.2603242164857661, + "learning_rate": 5.7748130558423884e-05, + "loss": 1.0336, + "step": 13732 + }, + { + "epoch": 1.31, + "grad_norm": 0.3107406441105725, + "learning_rate": 5.773379302520785e-05, + "loss": 1.1207, + "step": 13733 + }, + { + "epoch": 1.31, + "grad_norm": 0.34361450294985746, + "learning_rate": 5.771945654973023e-05, + "loss": 0.96, + "step": 13734 + }, + { + "epoch": 1.31, + "grad_norm": 0.2740627660421482, + "learning_rate": 5.770512113234976e-05, + "loss": 0.9866, + "step": 13735 + }, + { + "epoch": 1.31, + "grad_norm": 0.28703049254301627, + "learning_rate": 5.769078677342525e-05, + "loss": 0.9065, + "step": 13736 + }, + { + "epoch": 1.31, + "grad_norm": 0.3497428944232676, + "learning_rate": 5.7676453473315317e-05, + "loss": 0.9744, + "step": 13737 + }, + { + "epoch": 1.31, + "grad_norm": 0.2993469328576167, + "learning_rate": 5.7662121232378816e-05, + "loss": 1.0594, + "step": 13738 + }, + { + "epoch": 1.31, + "grad_norm": 0.335868006606924, + "learning_rate": 5.7647790050974294e-05, + "loss": 1.0498, + "step": 13739 + }, + { + "epoch": 1.31, + "grad_norm": 0.3227025926313374, + "learning_rate": 5.7633459929460486e-05, + "loss": 1.1022, + "step": 13740 + }, + { + "epoch": 1.31, + "grad_norm": 0.3218942918680506, + "learning_rate": 5.7619130868195924e-05, + "loss": 1.0139, + "step": 13741 + }, + { + "epoch": 1.31, + "grad_norm": 0.3299451911675308, + "learning_rate": 5.7604802867539256e-05, + "loss": 1.0892, + "step": 13742 + }, + { + "epoch": 1.31, + "grad_norm": 0.3000833328938117, + "learning_rate": 5.7590475927849074e-05, + "loss": 0.9518, + "step": 13743 + }, + { + "epoch": 1.31, + "grad_norm": 0.3381745969707472, + "learning_rate": 5.7576150049483844e-05, + "loss": 1.0529, + "step": 13744 + }, + { + "epoch": 1.31, + "grad_norm": 0.32616434966975816, + "learning_rate": 5.756182523280212e-05, + "loss": 1.0288, + "step": 13745 + }, + { + "epoch": 1.32, + "grad_norm": 0.29352993220799495, + "learning_rate": 5.7547501478162405e-05, + "loss": 1.0305, + "step": 13746 + }, + { + "epoch": 1.32, + "grad_norm": 0.320994039592085, + "learning_rate": 5.753317878592317e-05, + "loss": 1.0819, + "step": 13747 + }, + { + "epoch": 1.32, + "grad_norm": 0.3061222662593735, + "learning_rate": 5.7518857156442796e-05, + "loss": 0.9331, + "step": 13748 + }, + { + "epoch": 1.32, + "grad_norm": 0.3459018877099051, + "learning_rate": 5.7504536590079746e-05, + "loss": 1.0435, + "step": 13749 + }, + { + "epoch": 1.32, + "grad_norm": 0.35436806865828063, + "learning_rate": 5.749021708719235e-05, + "loss": 1.0236, + "step": 13750 + }, + { + "epoch": 1.32, + "grad_norm": 0.31898292960883623, + "learning_rate": 5.7475898648138984e-05, + "loss": 1.0477, + "step": 13751 + }, + { + "epoch": 1.32, + "grad_norm": 0.353224825881362, + "learning_rate": 5.746158127327799e-05, + "loss": 1.0063, + "step": 13752 + }, + { + "epoch": 1.32, + "grad_norm": 0.2965324407278904, + "learning_rate": 5.7447264962967685e-05, + "loss": 0.9671, + "step": 13753 + }, + { + "epoch": 1.32, + "grad_norm": 0.3424197957522554, + "learning_rate": 5.7432949717566276e-05, + "loss": 1.1084, + "step": 13754 + }, + { + "epoch": 1.32, + "grad_norm": 0.2854551164740142, + "learning_rate": 5.74186355374321e-05, + "loss": 1.0453, + "step": 13755 + }, + { + "epoch": 1.32, + "grad_norm": 0.33660154041841545, + "learning_rate": 5.740432242292328e-05, + "loss": 1.0459, + "step": 13756 + }, + { + "epoch": 1.32, + "grad_norm": 0.3391787076665786, + "learning_rate": 5.739001037439806e-05, + "loss": 1.0212, + "step": 13757 + }, + { + "epoch": 1.32, + "grad_norm": 0.3060077001616304, + "learning_rate": 5.737569939221463e-05, + "loss": 1.0062, + "step": 13758 + }, + { + "epoch": 1.32, + "grad_norm": 0.32536268999263107, + "learning_rate": 5.736138947673113e-05, + "loss": 0.9801, + "step": 13759 + }, + { + "epoch": 1.32, + "grad_norm": 0.3397346701654385, + "learning_rate": 5.7347080628305625e-05, + "loss": 0.9798, + "step": 13760 + }, + { + "epoch": 1.32, + "grad_norm": 0.32186381224997124, + "learning_rate": 5.733277284729628e-05, + "loss": 1.037, + "step": 13761 + }, + { + "epoch": 1.32, + "grad_norm": 0.3593245295804644, + "learning_rate": 5.731846613406107e-05, + "loss": 0.9742, + "step": 13762 + }, + { + "epoch": 1.32, + "grad_norm": 0.27820303090503457, + "learning_rate": 5.730416048895806e-05, + "loss": 1.0764, + "step": 13763 + }, + { + "epoch": 1.32, + "grad_norm": 0.31821681812177177, + "learning_rate": 5.728985591234528e-05, + "loss": 0.9706, + "step": 13764 + }, + { + "epoch": 1.32, + "grad_norm": 0.32527840766321936, + "learning_rate": 5.727555240458069e-05, + "loss": 1.0961, + "step": 13765 + }, + { + "epoch": 1.32, + "grad_norm": 0.3031322463082369, + "learning_rate": 5.726124996602229e-05, + "loss": 1.1442, + "step": 13766 + }, + { + "epoch": 1.32, + "grad_norm": 0.3013079474605395, + "learning_rate": 5.724694859702793e-05, + "loss": 1.0065, + "step": 13767 + }, + { + "epoch": 1.32, + "grad_norm": 0.3321891858874202, + "learning_rate": 5.7232648297955585e-05, + "loss": 0.9933, + "step": 13768 + }, + { + "epoch": 1.32, + "grad_norm": 0.2728031189255866, + "learning_rate": 5.721834906916306e-05, + "loss": 1.039, + "step": 13769 + }, + { + "epoch": 1.32, + "grad_norm": 0.28942797798410047, + "learning_rate": 5.720405091100829e-05, + "loss": 1.0616, + "step": 13770 + }, + { + "epoch": 1.32, + "grad_norm": 0.29074085665840577, + "learning_rate": 5.718975382384895e-05, + "loss": 1.0783, + "step": 13771 + }, + { + "epoch": 1.32, + "grad_norm": 0.3306039497573358, + "learning_rate": 5.7175457808042995e-05, + "loss": 1.0191, + "step": 13772 + }, + { + "epoch": 1.32, + "grad_norm": 0.31952555928270077, + "learning_rate": 5.71611628639481e-05, + "loss": 0.9788, + "step": 13773 + }, + { + "epoch": 1.32, + "grad_norm": 0.325520320777763, + "learning_rate": 5.714686899192205e-05, + "loss": 0.9924, + "step": 13774 + }, + { + "epoch": 1.32, + "grad_norm": 0.29840290448763956, + "learning_rate": 5.713257619232252e-05, + "loss": 1.0854, + "step": 13775 + }, + { + "epoch": 1.32, + "grad_norm": 0.3555420944012683, + "learning_rate": 5.7118284465507246e-05, + "loss": 1.063, + "step": 13776 + }, + { + "epoch": 1.32, + "grad_norm": 0.3294158973803371, + "learning_rate": 5.7103993811833804e-05, + "loss": 0.9536, + "step": 13777 + }, + { + "epoch": 1.32, + "grad_norm": 0.30428784762693173, + "learning_rate": 5.7089704231659866e-05, + "loss": 1.0653, + "step": 13778 + }, + { + "epoch": 1.32, + "grad_norm": 0.28705286562151866, + "learning_rate": 5.707541572534307e-05, + "loss": 1.0129, + "step": 13779 + }, + { + "epoch": 1.32, + "grad_norm": 0.31488908407103267, + "learning_rate": 5.7061128293240995e-05, + "loss": 1.0222, + "step": 13780 + }, + { + "epoch": 1.32, + "grad_norm": 0.33519055629084155, + "learning_rate": 5.704684193571115e-05, + "loss": 1.104, + "step": 13781 + }, + { + "epoch": 1.32, + "grad_norm": 0.2847134969968392, + "learning_rate": 5.703255665311107e-05, + "loss": 1.0928, + "step": 13782 + }, + { + "epoch": 1.32, + "grad_norm": 0.3059933512576839, + "learning_rate": 5.701827244579829e-05, + "loss": 0.9632, + "step": 13783 + }, + { + "epoch": 1.32, + "grad_norm": 0.3349008578972912, + "learning_rate": 5.7003989314130234e-05, + "loss": 0.9905, + "step": 13784 + }, + { + "epoch": 1.32, + "grad_norm": 0.32282653401320777, + "learning_rate": 5.698970725846435e-05, + "loss": 0.9669, + "step": 13785 + }, + { + "epoch": 1.32, + "grad_norm": 0.30414199224975097, + "learning_rate": 5.697542627915809e-05, + "loss": 1.034, + "step": 13786 + }, + { + "epoch": 1.32, + "grad_norm": 0.3031946088666856, + "learning_rate": 5.6961146376568864e-05, + "loss": 0.8886, + "step": 13787 + }, + { + "epoch": 1.32, + "grad_norm": 0.29067549309609636, + "learning_rate": 5.694686755105396e-05, + "loss": 1.0743, + "step": 13788 + }, + { + "epoch": 1.32, + "grad_norm": 0.2935082041281169, + "learning_rate": 5.693258980297078e-05, + "loss": 1.2209, + "step": 13789 + }, + { + "epoch": 1.32, + "grad_norm": 0.27902260530466166, + "learning_rate": 5.691831313267658e-05, + "loss": 1.091, + "step": 13790 + }, + { + "epoch": 1.32, + "grad_norm": 0.312117388479116, + "learning_rate": 5.6904037540528665e-05, + "loss": 1.1047, + "step": 13791 + }, + { + "epoch": 1.32, + "grad_norm": 0.30502965027774637, + "learning_rate": 5.68897630268843e-05, + "loss": 1.0618, + "step": 13792 + }, + { + "epoch": 1.32, + "grad_norm": 0.3656600510185515, + "learning_rate": 5.687548959210074e-05, + "loss": 1.016, + "step": 13793 + }, + { + "epoch": 1.32, + "grad_norm": 0.31364902363572744, + "learning_rate": 5.6861217236535126e-05, + "loss": 1.1666, + "step": 13794 + }, + { + "epoch": 1.32, + "grad_norm": 0.2640008712367884, + "learning_rate": 5.6846945960544696e-05, + "loss": 1.0238, + "step": 13795 + }, + { + "epoch": 1.32, + "grad_norm": 0.3254968974105255, + "learning_rate": 5.683267576448653e-05, + "loss": 0.9643, + "step": 13796 + }, + { + "epoch": 1.32, + "grad_norm": 0.32545407953372135, + "learning_rate": 5.681840664871782e-05, + "loss": 0.9314, + "step": 13797 + }, + { + "epoch": 1.32, + "grad_norm": 0.2942956492204795, + "learning_rate": 5.680413861359554e-05, + "loss": 0.8732, + "step": 13798 + }, + { + "epoch": 1.32, + "grad_norm": 0.2980565328344895, + "learning_rate": 5.678987165947691e-05, + "loss": 0.9626, + "step": 13799 + }, + { + "epoch": 1.32, + "grad_norm": 0.2904259995099312, + "learning_rate": 5.6775605786718875e-05, + "loss": 1.0623, + "step": 13800 + }, + { + "epoch": 1.32, + "grad_norm": 0.34236600431963804, + "learning_rate": 5.67613409956785e-05, + "loss": 1.0703, + "step": 13801 + }, + { + "epoch": 1.32, + "grad_norm": 0.3227473242707066, + "learning_rate": 5.674707728671271e-05, + "loss": 1.0118, + "step": 13802 + }, + { + "epoch": 1.32, + "grad_norm": 0.2833517041476476, + "learning_rate": 5.673281466017848e-05, + "loss": 1.02, + "step": 13803 + }, + { + "epoch": 1.32, + "grad_norm": 0.3170182510212465, + "learning_rate": 5.671855311643282e-05, + "loss": 0.9346, + "step": 13804 + }, + { + "epoch": 1.32, + "grad_norm": 0.2747144399473775, + "learning_rate": 5.67042926558325e-05, + "loss": 1.069, + "step": 13805 + }, + { + "epoch": 1.32, + "grad_norm": 0.3180460309817628, + "learning_rate": 5.669003327873449e-05, + "loss": 0.9405, + "step": 13806 + }, + { + "epoch": 1.32, + "grad_norm": 0.32351360168149945, + "learning_rate": 5.667577498549561e-05, + "loss": 1.0536, + "step": 13807 + }, + { + "epoch": 1.32, + "grad_norm": 0.2784122462655272, + "learning_rate": 5.666151777647273e-05, + "loss": 0.8804, + "step": 13808 + }, + { + "epoch": 1.32, + "grad_norm": 0.3040727834686434, + "learning_rate": 5.6647261652022546e-05, + "loss": 1.0601, + "step": 13809 + }, + { + "epoch": 1.32, + "grad_norm": 0.3212293043325248, + "learning_rate": 5.6633006612501946e-05, + "loss": 1.0708, + "step": 13810 + }, + { + "epoch": 1.32, + "grad_norm": 0.3169196712842093, + "learning_rate": 5.661875265826757e-05, + "loss": 1.0257, + "step": 13811 + }, + { + "epoch": 1.32, + "grad_norm": 0.36490583144571886, + "learning_rate": 5.6604499789676166e-05, + "loss": 0.877, + "step": 13812 + }, + { + "epoch": 1.32, + "grad_norm": 0.3268623304200821, + "learning_rate": 5.659024800708443e-05, + "loss": 1.1125, + "step": 13813 + }, + { + "epoch": 1.32, + "grad_norm": 0.27812688501276633, + "learning_rate": 5.657599731084906e-05, + "loss": 1.112, + "step": 13814 + }, + { + "epoch": 1.32, + "grad_norm": 0.3666739073965871, + "learning_rate": 5.6561747701326603e-05, + "loss": 0.9768, + "step": 13815 + }, + { + "epoch": 1.32, + "grad_norm": 0.3033163699359745, + "learning_rate": 5.654749917887375e-05, + "loss": 1.1155, + "step": 13816 + }, + { + "epoch": 1.32, + "grad_norm": 0.32310730465513815, + "learning_rate": 5.6533251743847e-05, + "loss": 1.1351, + "step": 13817 + }, + { + "epoch": 1.32, + "grad_norm": 0.33323623536072516, + "learning_rate": 5.6519005396602956e-05, + "loss": 1.0511, + "step": 13818 + }, + { + "epoch": 1.32, + "grad_norm": 0.3278391118104683, + "learning_rate": 5.6504760137498125e-05, + "loss": 1.079, + "step": 13819 + }, + { + "epoch": 1.32, + "grad_norm": 0.32560006922967744, + "learning_rate": 5.6490515966889036e-05, + "loss": 1.0094, + "step": 13820 + }, + { + "epoch": 1.32, + "grad_norm": 0.3070547698957363, + "learning_rate": 5.6476272885132086e-05, + "loss": 0.9901, + "step": 13821 + }, + { + "epoch": 1.32, + "grad_norm": 0.3362524803933979, + "learning_rate": 5.6462030892583816e-05, + "loss": 1.0237, + "step": 13822 + }, + { + "epoch": 1.32, + "grad_norm": 0.3034634721886298, + "learning_rate": 5.644778998960054e-05, + "loss": 1.0014, + "step": 13823 + }, + { + "epoch": 1.32, + "grad_norm": 0.3264271438524375, + "learning_rate": 5.6433550176538686e-05, + "loss": 0.9912, + "step": 13824 + }, + { + "epoch": 1.32, + "grad_norm": 0.30947265395138285, + "learning_rate": 5.641931145375463e-05, + "loss": 0.9426, + "step": 13825 + }, + { + "epoch": 1.32, + "grad_norm": 0.2931366716350053, + "learning_rate": 5.6405073821604726e-05, + "loss": 0.912, + "step": 13826 + }, + { + "epoch": 1.32, + "grad_norm": 0.3015874684990806, + "learning_rate": 5.6390837280445206e-05, + "loss": 1.0447, + "step": 13827 + }, + { + "epoch": 1.32, + "grad_norm": 0.26709258102693023, + "learning_rate": 5.6376601830632405e-05, + "loss": 1.0872, + "step": 13828 + }, + { + "epoch": 1.32, + "grad_norm": 0.3487322262361247, + "learning_rate": 5.636236747252258e-05, + "loss": 1.0255, + "step": 13829 + }, + { + "epoch": 1.32, + "grad_norm": 0.3316721563253536, + "learning_rate": 5.63481342064719e-05, + "loss": 1.0601, + "step": 13830 + }, + { + "epoch": 1.32, + "grad_norm": 0.29860186598835664, + "learning_rate": 5.633390203283664e-05, + "loss": 0.984, + "step": 13831 + }, + { + "epoch": 1.32, + "grad_norm": 0.3028468869792273, + "learning_rate": 5.631967095197285e-05, + "loss": 1.0423, + "step": 13832 + }, + { + "epoch": 1.32, + "grad_norm": 0.30996553876032507, + "learning_rate": 5.630544096423682e-05, + "loss": 1.0397, + "step": 13833 + }, + { + "epoch": 1.32, + "grad_norm": 0.3043914001751101, + "learning_rate": 5.6291212069984546e-05, + "loss": 0.9085, + "step": 13834 + }, + { + "epoch": 1.32, + "grad_norm": 0.3126530388570813, + "learning_rate": 5.62769842695722e-05, + "loss": 0.9198, + "step": 13835 + }, + { + "epoch": 1.32, + "grad_norm": 0.31004788093736524, + "learning_rate": 5.6262757563355774e-05, + "loss": 1.0543, + "step": 13836 + }, + { + "epoch": 1.32, + "grad_norm": 0.3364073069285353, + "learning_rate": 5.624853195169135e-05, + "loss": 1.0259, + "step": 13837 + }, + { + "epoch": 1.32, + "grad_norm": 0.3232246443359867, + "learning_rate": 5.623430743493489e-05, + "loss": 1.064, + "step": 13838 + }, + { + "epoch": 1.32, + "grad_norm": 0.3063977501198385, + "learning_rate": 5.622008401344239e-05, + "loss": 1.1063, + "step": 13839 + }, + { + "epoch": 1.32, + "grad_norm": 0.31350488233405444, + "learning_rate": 5.620586168756978e-05, + "loss": 1.0635, + "step": 13840 + }, + { + "epoch": 1.32, + "grad_norm": 0.3224403143468549, + "learning_rate": 5.6191640457673066e-05, + "loss": 1.024, + "step": 13841 + }, + { + "epoch": 1.32, + "grad_norm": 0.2816315513455265, + "learning_rate": 5.617742032410803e-05, + "loss": 1.0361, + "step": 13842 + }, + { + "epoch": 1.32, + "grad_norm": 0.24568093351224346, + "learning_rate": 5.616320128723065e-05, + "loss": 0.9595, + "step": 13843 + }, + { + "epoch": 1.32, + "grad_norm": 0.2393163165148189, + "learning_rate": 5.6148983347396646e-05, + "loss": 0.9914, + "step": 13844 + }, + { + "epoch": 1.32, + "grad_norm": 0.30294070389827676, + "learning_rate": 5.61347665049619e-05, + "loss": 1.0128, + "step": 13845 + }, + { + "epoch": 1.32, + "grad_norm": 0.32011614643685055, + "learning_rate": 5.61205507602822e-05, + "loss": 1.0776, + "step": 13846 + }, + { + "epoch": 1.32, + "grad_norm": 0.27843445110310905, + "learning_rate": 5.610633611371332e-05, + "loss": 0.9738, + "step": 13847 + }, + { + "epoch": 1.32, + "grad_norm": 0.3062184324526837, + "learning_rate": 5.609212256561094e-05, + "loss": 0.9973, + "step": 13848 + }, + { + "epoch": 1.32, + "grad_norm": 0.3223682655464429, + "learning_rate": 5.607791011633078e-05, + "loss": 1.1062, + "step": 13849 + }, + { + "epoch": 1.33, + "grad_norm": 0.30233346529892824, + "learning_rate": 5.606369876622857e-05, + "loss": 1.069, + "step": 13850 + }, + { + "epoch": 1.33, + "grad_norm": 0.31921967798559375, + "learning_rate": 5.6049488515659855e-05, + "loss": 0.9541, + "step": 13851 + }, + { + "epoch": 1.33, + "grad_norm": 0.3030580037925566, + "learning_rate": 5.603527936498032e-05, + "loss": 1.0051, + "step": 13852 + }, + { + "epoch": 1.33, + "grad_norm": 0.29823087602879245, + "learning_rate": 5.602107131454555e-05, + "loss": 1.0191, + "step": 13853 + }, + { + "epoch": 1.33, + "grad_norm": 0.316650093495455, + "learning_rate": 5.6006864364711166e-05, + "loss": 1.0987, + "step": 13854 + }, + { + "epoch": 1.33, + "grad_norm": 0.32156107519917615, + "learning_rate": 5.5992658515832584e-05, + "loss": 1.0399, + "step": 13855 + }, + { + "epoch": 1.33, + "grad_norm": 0.28852265838415087, + "learning_rate": 5.597845376826544e-05, + "loss": 0.9806, + "step": 13856 + }, + { + "epoch": 1.33, + "grad_norm": 0.3024463996538198, + "learning_rate": 5.59642501223651e-05, + "loss": 1.0802, + "step": 13857 + }, + { + "epoch": 1.33, + "grad_norm": 0.30258661563249106, + "learning_rate": 5.595004757848713e-05, + "loss": 1.0607, + "step": 13858 + }, + { + "epoch": 1.33, + "grad_norm": 0.3201985833070001, + "learning_rate": 5.5935846136986817e-05, + "loss": 1.0589, + "step": 13859 + }, + { + "epoch": 1.33, + "grad_norm": 0.3093086741289812, + "learning_rate": 5.5921645798219726e-05, + "loss": 0.957, + "step": 13860 + }, + { + "epoch": 1.33, + "grad_norm": 0.26223166038922885, + "learning_rate": 5.590744656254112e-05, + "loss": 1.0567, + "step": 13861 + }, + { + "epoch": 1.33, + "grad_norm": 0.3235212630975704, + "learning_rate": 5.589324843030641e-05, + "loss": 1.0751, + "step": 13862 + }, + { + "epoch": 1.33, + "grad_norm": 0.3174609569161984, + "learning_rate": 5.587905140187084e-05, + "loss": 1.011, + "step": 13863 + }, + { + "epoch": 1.33, + "grad_norm": 0.31942261953708556, + "learning_rate": 5.586485547758973e-05, + "loss": 0.8676, + "step": 13864 + }, + { + "epoch": 1.33, + "grad_norm": 0.3043676892232189, + "learning_rate": 5.5850660657818385e-05, + "loss": 1.0367, + "step": 13865 + }, + { + "epoch": 1.33, + "grad_norm": 0.32338541771158363, + "learning_rate": 5.583646694291198e-05, + "loss": 1.0172, + "step": 13866 + }, + { + "epoch": 1.33, + "grad_norm": 0.3014027535125411, + "learning_rate": 5.582227433322574e-05, + "loss": 1.0587, + "step": 13867 + }, + { + "epoch": 1.33, + "grad_norm": 0.29010791513402306, + "learning_rate": 5.580808282911485e-05, + "loss": 0.9496, + "step": 13868 + }, + { + "epoch": 1.33, + "grad_norm": 0.3078657522441318, + "learning_rate": 5.57938924309345e-05, + "loss": 0.9646, + "step": 13869 + }, + { + "epoch": 1.33, + "grad_norm": 0.35234100454904593, + "learning_rate": 5.577970313903973e-05, + "loss": 1.0406, + "step": 13870 + }, + { + "epoch": 1.33, + "grad_norm": 0.33613024172478434, + "learning_rate": 5.576551495378573e-05, + "loss": 1.1406, + "step": 13871 + }, + { + "epoch": 1.33, + "grad_norm": 0.32030741162382, + "learning_rate": 5.575132787552747e-05, + "loss": 1.1264, + "step": 13872 + }, + { + "epoch": 1.33, + "grad_norm": 0.26623912266186556, + "learning_rate": 5.573714190462005e-05, + "loss": 1.0044, + "step": 13873 + }, + { + "epoch": 1.33, + "grad_norm": 0.3560349234673051, + "learning_rate": 5.572295704141846e-05, + "loss": 1.01, + "step": 13874 + }, + { + "epoch": 1.33, + "grad_norm": 0.3613897583210373, + "learning_rate": 5.570877328627774e-05, + "loss": 1.0413, + "step": 13875 + }, + { + "epoch": 1.33, + "grad_norm": 0.27838611565881866, + "learning_rate": 5.5694590639552755e-05, + "loss": 0.9655, + "step": 13876 + }, + { + "epoch": 1.33, + "grad_norm": 0.29720744514442893, + "learning_rate": 5.568040910159854e-05, + "loss": 1.0722, + "step": 13877 + }, + { + "epoch": 1.33, + "grad_norm": 0.27012534094704976, + "learning_rate": 5.5666228672769893e-05, + "loss": 0.958, + "step": 13878 + }, + { + "epoch": 1.33, + "grad_norm": 0.33399488916799247, + "learning_rate": 5.565204935342174e-05, + "loss": 1.0474, + "step": 13879 + }, + { + "epoch": 1.33, + "grad_norm": 0.32730386531301825, + "learning_rate": 5.5637871143908925e-05, + "loss": 1.1516, + "step": 13880 + }, + { + "epoch": 1.33, + "grad_norm": 0.3155761859081546, + "learning_rate": 5.562369404458631e-05, + "loss": 1.0936, + "step": 13881 + }, + { + "epoch": 1.33, + "grad_norm": 0.3346000829885797, + "learning_rate": 5.560951805580859e-05, + "loss": 1.0446, + "step": 13882 + }, + { + "epoch": 1.33, + "grad_norm": 0.3045668127540829, + "learning_rate": 5.5595343177930625e-05, + "loss": 1.0707, + "step": 13883 + }, + { + "epoch": 1.33, + "grad_norm": 0.31473379982604643, + "learning_rate": 5.558116941130707e-05, + "loss": 1.038, + "step": 13884 + }, + { + "epoch": 1.33, + "grad_norm": 0.2802492961154177, + "learning_rate": 5.5566996756292646e-05, + "loss": 1.0318, + "step": 13885 + }, + { + "epoch": 1.33, + "grad_norm": 0.3105267086885536, + "learning_rate": 5.5552825213242065e-05, + "loss": 1.0425, + "step": 13886 + }, + { + "epoch": 1.33, + "grad_norm": 0.2706786866706816, + "learning_rate": 5.5538654782510014e-05, + "loss": 0.9509, + "step": 13887 + }, + { + "epoch": 1.33, + "grad_norm": 0.30962200670488665, + "learning_rate": 5.5524485464451014e-05, + "loss": 0.9799, + "step": 13888 + }, + { + "epoch": 1.33, + "grad_norm": 0.2790936846267907, + "learning_rate": 5.551031725941973e-05, + "loss": 0.9653, + "step": 13889 + }, + { + "epoch": 1.33, + "grad_norm": 0.35365489900583047, + "learning_rate": 5.5496150167770754e-05, + "loss": 1.1505, + "step": 13890 + }, + { + "epoch": 1.33, + "grad_norm": 0.3105173617695424, + "learning_rate": 5.548198418985855e-05, + "loss": 0.9564, + "step": 13891 + }, + { + "epoch": 1.33, + "grad_norm": 0.3296542388629761, + "learning_rate": 5.546781932603771e-05, + "loss": 0.9954, + "step": 13892 + }, + { + "epoch": 1.33, + "grad_norm": 0.2625581574390105, + "learning_rate": 5.54536555766626e-05, + "loss": 1.0254, + "step": 13893 + }, + { + "epoch": 1.33, + "grad_norm": 0.3233168000322383, + "learning_rate": 5.543949294208783e-05, + "loss": 0.9767, + "step": 13894 + }, + { + "epoch": 1.33, + "grad_norm": 0.3242235757870707, + "learning_rate": 5.5425331422667724e-05, + "loss": 1.0148, + "step": 13895 + }, + { + "epoch": 1.33, + "grad_norm": 0.3374910151068779, + "learning_rate": 5.541117101875675e-05, + "loss": 0.9956, + "step": 13896 + }, + { + "epoch": 1.33, + "grad_norm": 0.3249475342438086, + "learning_rate": 5.53970117307092e-05, + "loss": 0.9762, + "step": 13897 + }, + { + "epoch": 1.33, + "grad_norm": 0.3052140372018797, + "learning_rate": 5.538285355887951e-05, + "loss": 1.0545, + "step": 13898 + }, + { + "epoch": 1.33, + "grad_norm": 0.3481744811241573, + "learning_rate": 5.536869650362191e-05, + "loss": 1.0226, + "step": 13899 + }, + { + "epoch": 1.33, + "grad_norm": 0.28354239389542923, + "learning_rate": 5.5354540565290723e-05, + "loss": 1.0471, + "step": 13900 + }, + { + "epoch": 1.33, + "grad_norm": 0.30379964097385265, + "learning_rate": 5.534038574424021e-05, + "loss": 1.0434, + "step": 13901 + }, + { + "epoch": 1.33, + "grad_norm": 0.2984527997351274, + "learning_rate": 5.5326232040824664e-05, + "loss": 0.9756, + "step": 13902 + }, + { + "epoch": 1.33, + "grad_norm": 0.29633260858988997, + "learning_rate": 5.53120794553982e-05, + "loss": 1.0383, + "step": 13903 + }, + { + "epoch": 1.33, + "grad_norm": 0.2905510144662787, + "learning_rate": 5.5297927988315066e-05, + "loss": 0.9568, + "step": 13904 + }, + { + "epoch": 1.33, + "grad_norm": 0.28557429659347794, + "learning_rate": 5.528377763992934e-05, + "loss": 0.9197, + "step": 13905 + }, + { + "epoch": 1.33, + "grad_norm": 0.2920100943085033, + "learning_rate": 5.5269628410595176e-05, + "loss": 0.9554, + "step": 13906 + }, + { + "epoch": 1.33, + "grad_norm": 0.3336837160626649, + "learning_rate": 5.525548030066668e-05, + "loss": 1.1404, + "step": 13907 + }, + { + "epoch": 1.33, + "grad_norm": 0.3201013625667117, + "learning_rate": 5.5241333310497944e-05, + "loss": 0.9958, + "step": 13908 + }, + { + "epoch": 1.33, + "grad_norm": 0.31817743731916603, + "learning_rate": 5.522718744044295e-05, + "loss": 1.029, + "step": 13909 + }, + { + "epoch": 1.33, + "grad_norm": 0.3517140067197475, + "learning_rate": 5.521304269085571e-05, + "loss": 1.0415, + "step": 13910 + }, + { + "epoch": 1.33, + "grad_norm": 0.337385626240159, + "learning_rate": 5.519889906209027e-05, + "loss": 0.9227, + "step": 13911 + }, + { + "epoch": 1.33, + "grad_norm": 0.28494428273666783, + "learning_rate": 5.5184756554500504e-05, + "loss": 1.1024, + "step": 13912 + }, + { + "epoch": 1.33, + "grad_norm": 0.2834900512777567, + "learning_rate": 5.517061516844035e-05, + "loss": 1.0618, + "step": 13913 + }, + { + "epoch": 1.33, + "grad_norm": 0.33093064815617323, + "learning_rate": 5.5156474904263747e-05, + "loss": 1.0549, + "step": 13914 + }, + { + "epoch": 1.33, + "grad_norm": 0.3242411883674369, + "learning_rate": 5.514233576232457e-05, + "loss": 1.0117, + "step": 13915 + }, + { + "epoch": 1.33, + "grad_norm": 0.32001901466829386, + "learning_rate": 5.512819774297661e-05, + "loss": 1.0042, + "step": 13916 + }, + { + "epoch": 1.33, + "grad_norm": 0.27514972760253326, + "learning_rate": 5.5114060846573725e-05, + "loss": 1.0174, + "step": 13917 + }, + { + "epoch": 1.33, + "grad_norm": 0.29068783239678986, + "learning_rate": 5.509992507346964e-05, + "loss": 1.018, + "step": 13918 + }, + { + "epoch": 1.33, + "grad_norm": 0.2935468013246177, + "learning_rate": 5.508579042401819e-05, + "loss": 0.9735, + "step": 13919 + }, + { + "epoch": 1.33, + "grad_norm": 0.3014600651209888, + "learning_rate": 5.507165689857299e-05, + "loss": 1.0391, + "step": 13920 + }, + { + "epoch": 1.33, + "grad_norm": 0.26375827438253446, + "learning_rate": 5.505752449748788e-05, + "loss": 0.938, + "step": 13921 + }, + { + "epoch": 1.33, + "grad_norm": 0.36826284851554414, + "learning_rate": 5.504339322111642e-05, + "loss": 1.0383, + "step": 13922 + }, + { + "epoch": 1.33, + "grad_norm": 0.3339393240956095, + "learning_rate": 5.502926306981235e-05, + "loss": 1.0661, + "step": 13923 + }, + { + "epoch": 1.33, + "grad_norm": 0.32109147831907275, + "learning_rate": 5.5015134043929196e-05, + "loss": 1.0284, + "step": 13924 + }, + { + "epoch": 1.33, + "grad_norm": 0.2866246981369521, + "learning_rate": 5.500100614382062e-05, + "loss": 1.1312, + "step": 13925 + }, + { + "epoch": 1.33, + "grad_norm": 0.30774294920677364, + "learning_rate": 5.4986879369840114e-05, + "loss": 0.8922, + "step": 13926 + }, + { + "epoch": 1.33, + "grad_norm": 0.26386531310883193, + "learning_rate": 5.497275372234123e-05, + "loss": 0.9812, + "step": 13927 + }, + { + "epoch": 1.33, + "grad_norm": 0.33512654616875315, + "learning_rate": 5.495862920167749e-05, + "loss": 0.9856, + "step": 13928 + }, + { + "epoch": 1.33, + "grad_norm": 0.2786086728936053, + "learning_rate": 5.4944505808202405e-05, + "loss": 0.9319, + "step": 13929 + }, + { + "epoch": 1.33, + "grad_norm": 0.28980838093999395, + "learning_rate": 5.493038354226934e-05, + "loss": 0.9869, + "step": 13930 + }, + { + "epoch": 1.33, + "grad_norm": 0.30339613463609694, + "learning_rate": 5.491626240423173e-05, + "loss": 1.0088, + "step": 13931 + }, + { + "epoch": 1.33, + "grad_norm": 0.3182939786586784, + "learning_rate": 5.490214239444304e-05, + "loss": 0.9681, + "step": 13932 + }, + { + "epoch": 1.33, + "grad_norm": 0.29000641248575026, + "learning_rate": 5.488802351325656e-05, + "loss": 1.0268, + "step": 13933 + }, + { + "epoch": 1.33, + "grad_norm": 0.31256305956919905, + "learning_rate": 5.4873905761025625e-05, + "loss": 0.963, + "step": 13934 + }, + { + "epoch": 1.33, + "grad_norm": 0.2870102004998422, + "learning_rate": 5.4859789138103555e-05, + "loss": 0.9896, + "step": 13935 + }, + { + "epoch": 1.33, + "grad_norm": 0.31798567827002794, + "learning_rate": 5.484567364484368e-05, + "loss": 1.0475, + "step": 13936 + }, + { + "epoch": 1.33, + "grad_norm": 0.33111734030522977, + "learning_rate": 5.483155928159915e-05, + "loss": 1.0698, + "step": 13937 + }, + { + "epoch": 1.33, + "grad_norm": 0.2896951646916223, + "learning_rate": 5.481744604872329e-05, + "loss": 1.0182, + "step": 13938 + }, + { + "epoch": 1.33, + "grad_norm": 0.29883342934443613, + "learning_rate": 5.480333394656919e-05, + "loss": 1.0119, + "step": 13939 + }, + { + "epoch": 1.33, + "grad_norm": 0.29492373732946964, + "learning_rate": 5.4789222975490075e-05, + "loss": 1.1608, + "step": 13940 + }, + { + "epoch": 1.33, + "grad_norm": 0.29999372629323157, + "learning_rate": 5.477511313583905e-05, + "loss": 1.1203, + "step": 13941 + }, + { + "epoch": 1.33, + "grad_norm": 0.32684372023778546, + "learning_rate": 5.476100442796929e-05, + "loss": 1.056, + "step": 13942 + }, + { + "epoch": 1.33, + "grad_norm": 0.29889698250531893, + "learning_rate": 5.474689685223379e-05, + "loss": 1.0727, + "step": 13943 + }, + { + "epoch": 1.33, + "grad_norm": 0.30525629843789637, + "learning_rate": 5.473279040898567e-05, + "loss": 0.8609, + "step": 13944 + }, + { + "epoch": 1.33, + "grad_norm": 0.3401896260341286, + "learning_rate": 5.4718685098577885e-05, + "loss": 0.9985, + "step": 13945 + }, + { + "epoch": 1.33, + "grad_norm": 0.3013432828102024, + "learning_rate": 5.47045809213635e-05, + "loss": 1.0319, + "step": 13946 + }, + { + "epoch": 1.33, + "grad_norm": 0.3158516783557926, + "learning_rate": 5.469047787769537e-05, + "loss": 0.9326, + "step": 13947 + }, + { + "epoch": 1.33, + "grad_norm": 0.3033540164343825, + "learning_rate": 5.4676375967926574e-05, + "loss": 1.1346, + "step": 13948 + }, + { + "epoch": 1.33, + "grad_norm": 0.30135559357158176, + "learning_rate": 5.4662275192409916e-05, + "loss": 0.9651, + "step": 13949 + }, + { + "epoch": 1.33, + "grad_norm": 0.3283664163755309, + "learning_rate": 5.464817555149837e-05, + "loss": 1.0311, + "step": 13950 + }, + { + "epoch": 1.33, + "grad_norm": 0.31058180466281676, + "learning_rate": 5.463407704554469e-05, + "loss": 0.9357, + "step": 13951 + }, + { + "epoch": 1.33, + "grad_norm": 0.311393873684762, + "learning_rate": 5.4619979674901735e-05, + "loss": 1.1314, + "step": 13952 + }, + { + "epoch": 1.33, + "grad_norm": 0.318569463980189, + "learning_rate": 5.460588343992235e-05, + "loss": 1.1452, + "step": 13953 + }, + { + "epoch": 1.33, + "grad_norm": 0.32765193551517224, + "learning_rate": 5.459178834095918e-05, + "loss": 1.0291, + "step": 13954 + }, + { + "epoch": 1.34, + "grad_norm": 0.27657380269094395, + "learning_rate": 5.457769437836513e-05, + "loss": 0.9926, + "step": 13955 + }, + { + "epoch": 1.34, + "grad_norm": 0.2945566873213993, + "learning_rate": 5.456360155249278e-05, + "loss": 1.2216, + "step": 13956 + }, + { + "epoch": 1.34, + "grad_norm": 0.31765242733073074, + "learning_rate": 5.4549509863694913e-05, + "loss": 0.9227, + "step": 13957 + }, + { + "epoch": 1.34, + "grad_norm": 0.30851184628406997, + "learning_rate": 5.4535419312324065e-05, + "loss": 1.0636, + "step": 13958 + }, + { + "epoch": 1.34, + "grad_norm": 0.2666943250751297, + "learning_rate": 5.4521329898732974e-05, + "loss": 1.0119, + "step": 13959 + }, + { + "epoch": 1.34, + "grad_norm": 0.2550333746132659, + "learning_rate": 5.450724162327414e-05, + "loss": 0.9512, + "step": 13960 + }, + { + "epoch": 1.34, + "grad_norm": 0.3028187855436863, + "learning_rate": 5.449315448630017e-05, + "loss": 1.0709, + "step": 13961 + }, + { + "epoch": 1.34, + "grad_norm": 0.34764286362964303, + "learning_rate": 5.447906848816362e-05, + "loss": 1.0175, + "step": 13962 + }, + { + "epoch": 1.34, + "grad_norm": 0.3167101619830756, + "learning_rate": 5.4464983629217023e-05, + "loss": 1.06, + "step": 13963 + }, + { + "epoch": 1.34, + "grad_norm": 0.30832636548807085, + "learning_rate": 5.44508999098128e-05, + "loss": 0.9417, + "step": 13964 + }, + { + "epoch": 1.34, + "grad_norm": 0.28461723647793713, + "learning_rate": 5.443681733030346e-05, + "loss": 1.0872, + "step": 13965 + }, + { + "epoch": 1.34, + "grad_norm": 0.29126656921448657, + "learning_rate": 5.442273589104136e-05, + "loss": 1.1104, + "step": 13966 + }, + { + "epoch": 1.34, + "grad_norm": 0.2890901766161314, + "learning_rate": 5.440865559237895e-05, + "loss": 1.0167, + "step": 13967 + }, + { + "epoch": 1.34, + "grad_norm": 0.30931067840865145, + "learning_rate": 5.439457643466859e-05, + "loss": 0.9208, + "step": 13968 + }, + { + "epoch": 1.34, + "grad_norm": 0.32361999203188485, + "learning_rate": 5.438049841826264e-05, + "loss": 0.9551, + "step": 13969 + }, + { + "epoch": 1.34, + "grad_norm": 0.296893392103769, + "learning_rate": 5.436642154351336e-05, + "loss": 1.0041, + "step": 13970 + }, + { + "epoch": 1.34, + "grad_norm": 0.31418955371920254, + "learning_rate": 5.4352345810773054e-05, + "loss": 1.135, + "step": 13971 + }, + { + "epoch": 1.34, + "grad_norm": 0.3360782745431793, + "learning_rate": 5.4338271220394024e-05, + "loss": 0.9591, + "step": 13972 + }, + { + "epoch": 1.34, + "grad_norm": 0.29929961402607275, + "learning_rate": 5.4324197772728414e-05, + "loss": 0.8999, + "step": 13973 + }, + { + "epoch": 1.34, + "grad_norm": 0.29617316450064285, + "learning_rate": 5.431012546812846e-05, + "loss": 1.1329, + "step": 13974 + }, + { + "epoch": 1.34, + "grad_norm": 0.31764129492085186, + "learning_rate": 5.429605430694633e-05, + "loss": 1.086, + "step": 13975 + }, + { + "epoch": 1.34, + "grad_norm": 0.2944716437225841, + "learning_rate": 5.4281984289534214e-05, + "loss": 0.9976, + "step": 13976 + }, + { + "epoch": 1.34, + "grad_norm": 0.3092925165452444, + "learning_rate": 5.426791541624412e-05, + "loss": 1.0037, + "step": 13977 + }, + { + "epoch": 1.34, + "grad_norm": 0.32057975304787945, + "learning_rate": 5.425384768742824e-05, + "loss": 0.9566, + "step": 13978 + }, + { + "epoch": 1.34, + "grad_norm": 0.32152450432308227, + "learning_rate": 5.423978110343852e-05, + "loss": 1.1159, + "step": 13979 + }, + { + "epoch": 1.34, + "grad_norm": 0.30924126831911625, + "learning_rate": 5.4225715664627085e-05, + "loss": 0.9811, + "step": 13980 + }, + { + "epoch": 1.34, + "grad_norm": 0.30155415405985353, + "learning_rate": 5.42116513713458e-05, + "loss": 1.0433, + "step": 13981 + }, + { + "epoch": 1.34, + "grad_norm": 0.3109177160626664, + "learning_rate": 5.4197588223946795e-05, + "loss": 1.0052, + "step": 13982 + }, + { + "epoch": 1.34, + "grad_norm": 0.3369339869927878, + "learning_rate": 5.418352622278189e-05, + "loss": 0.9847, + "step": 13983 + }, + { + "epoch": 1.34, + "grad_norm": 0.3549269115750836, + "learning_rate": 5.4169465368203075e-05, + "loss": 1.0838, + "step": 13984 + }, + { + "epoch": 1.34, + "grad_norm": 0.33860880161914586, + "learning_rate": 5.415540566056216e-05, + "loss": 1.1012, + "step": 13985 + }, + { + "epoch": 1.34, + "grad_norm": 0.325030229571302, + "learning_rate": 5.4141347100211057e-05, + "loss": 1.0484, + "step": 13986 + }, + { + "epoch": 1.34, + "grad_norm": 0.3364326835704884, + "learning_rate": 5.412728968750154e-05, + "loss": 0.9932, + "step": 13987 + }, + { + "epoch": 1.34, + "grad_norm": 0.33173528359922627, + "learning_rate": 5.411323342278542e-05, + "loss": 1.0545, + "step": 13988 + }, + { + "epoch": 1.34, + "grad_norm": 0.3088626904592197, + "learning_rate": 5.4099178306414465e-05, + "loss": 1.0575, + "step": 13989 + }, + { + "epoch": 1.34, + "grad_norm": 0.29601904914030897, + "learning_rate": 5.408512433874047e-05, + "loss": 0.9568, + "step": 13990 + }, + { + "epoch": 1.34, + "grad_norm": 0.30118700512567287, + "learning_rate": 5.407107152011506e-05, + "loss": 1.0116, + "step": 13991 + }, + { + "epoch": 1.34, + "grad_norm": 0.34335681240801774, + "learning_rate": 5.405701985088994e-05, + "loss": 0.9807, + "step": 13992 + }, + { + "epoch": 1.34, + "grad_norm": 0.30055066924167667, + "learning_rate": 5.404296933141681e-05, + "loss": 1.0213, + "step": 13993 + }, + { + "epoch": 1.34, + "grad_norm": 0.2830753935637788, + "learning_rate": 5.4028919962047234e-05, + "loss": 1.0789, + "step": 13994 + }, + { + "epoch": 1.34, + "grad_norm": 0.3121651512647466, + "learning_rate": 5.4014871743132814e-05, + "loss": 1.0163, + "step": 13995 + }, + { + "epoch": 1.34, + "grad_norm": 0.28394767249162345, + "learning_rate": 5.400082467502513e-05, + "loss": 1.1827, + "step": 13996 + }, + { + "epoch": 1.34, + "grad_norm": 0.34179484205569505, + "learning_rate": 5.398677875807576e-05, + "loss": 1.0383, + "step": 13997 + }, + { + "epoch": 1.34, + "grad_norm": 0.3589473352396655, + "learning_rate": 5.3972733992636125e-05, + "loss": 1.0343, + "step": 13998 + }, + { + "epoch": 1.34, + "grad_norm": 0.2981904605886854, + "learning_rate": 5.395869037905778e-05, + "loss": 0.9929, + "step": 13999 + }, + { + "epoch": 1.34, + "grad_norm": 0.3059139655315304, + "learning_rate": 5.3944647917692124e-05, + "loss": 1.0323, + "step": 14000 + }, + { + "epoch": 1.34, + "grad_norm": 0.3545547509800312, + "learning_rate": 5.39306066088906e-05, + "loss": 1.0726, + "step": 14001 + }, + { + "epoch": 1.34, + "grad_norm": 0.29489730681279797, + "learning_rate": 5.391656645300458e-05, + "loss": 0.9696, + "step": 14002 + }, + { + "epoch": 1.34, + "grad_norm": 0.2676039370498071, + "learning_rate": 5.390252745038551e-05, + "loss": 1.0024, + "step": 14003 + }, + { + "epoch": 1.34, + "grad_norm": 0.3483516589639446, + "learning_rate": 5.38884896013846e-05, + "loss": 0.9411, + "step": 14004 + }, + { + "epoch": 1.34, + "grad_norm": 0.316604785980228, + "learning_rate": 5.387445290635327e-05, + "loss": 1.0321, + "step": 14005 + }, + { + "epoch": 1.34, + "grad_norm": 0.30315675908744816, + "learning_rate": 5.3860417365642693e-05, + "loss": 1.0132, + "step": 14006 + }, + { + "epoch": 1.34, + "grad_norm": 0.31669931861066625, + "learning_rate": 5.3846382979604226e-05, + "loss": 1.0295, + "step": 14007 + }, + { + "epoch": 1.34, + "grad_norm": 0.3756785481386469, + "learning_rate": 5.383234974858894e-05, + "loss": 1.0144, + "step": 14008 + }, + { + "epoch": 1.34, + "grad_norm": 0.3376786005614308, + "learning_rate": 5.38183176729482e-05, + "loss": 1.0631, + "step": 14009 + }, + { + "epoch": 1.34, + "grad_norm": 0.31605758898308417, + "learning_rate": 5.3804286753033037e-05, + "loss": 0.9717, + "step": 14010 + }, + { + "epoch": 1.34, + "grad_norm": 0.32917576298223844, + "learning_rate": 5.379025698919469e-05, + "loss": 0.8276, + "step": 14011 + }, + { + "epoch": 1.34, + "grad_norm": 0.2893421866738128, + "learning_rate": 5.377622838178414e-05, + "loss": 0.9645, + "step": 14012 + }, + { + "epoch": 1.34, + "grad_norm": 0.293989137018777, + "learning_rate": 5.376220093115254e-05, + "loss": 0.9263, + "step": 14013 + }, + { + "epoch": 1.34, + "grad_norm": 0.3281772529452037, + "learning_rate": 5.374817463765096e-05, + "loss": 1.0837, + "step": 14014 + }, + { + "epoch": 1.34, + "grad_norm": 0.3036640647152903, + "learning_rate": 5.373414950163034e-05, + "loss": 0.9723, + "step": 14015 + }, + { + "epoch": 1.34, + "grad_norm": 0.3543493748896448, + "learning_rate": 5.37201255234417e-05, + "loss": 0.9963, + "step": 14016 + }, + { + "epoch": 1.34, + "grad_norm": 0.38046882986680536, + "learning_rate": 5.3706102703436003e-05, + "loss": 1.0362, + "step": 14017 + }, + { + "epoch": 1.34, + "grad_norm": 0.33482476602536587, + "learning_rate": 5.3692081041964216e-05, + "loss": 0.9928, + "step": 14018 + }, + { + "epoch": 1.34, + "grad_norm": 0.28210595016151624, + "learning_rate": 5.3678060539377185e-05, + "loss": 1.0202, + "step": 14019 + }, + { + "epoch": 1.34, + "grad_norm": 0.3095735034322811, + "learning_rate": 5.366404119602582e-05, + "loss": 0.9862, + "step": 14020 + }, + { + "epoch": 1.34, + "grad_norm": 0.2863016272262814, + "learning_rate": 5.3650023012260916e-05, + "loss": 0.9351, + "step": 14021 + }, + { + "epoch": 1.34, + "grad_norm": 0.28741260002918034, + "learning_rate": 5.36360059884333e-05, + "loss": 1.0048, + "step": 14022 + }, + { + "epoch": 1.34, + "grad_norm": 0.31190090017288136, + "learning_rate": 5.36219901248938e-05, + "loss": 1.0311, + "step": 14023 + }, + { + "epoch": 1.34, + "grad_norm": 0.3444457739315814, + "learning_rate": 5.360797542199316e-05, + "loss": 1.0446, + "step": 14024 + }, + { + "epoch": 1.34, + "grad_norm": 0.2873929311931004, + "learning_rate": 5.359396188008207e-05, + "loss": 1.0308, + "step": 14025 + }, + { + "epoch": 1.34, + "grad_norm": 0.31016848796576185, + "learning_rate": 5.3579949499511286e-05, + "loss": 0.9733, + "step": 14026 + }, + { + "epoch": 1.34, + "grad_norm": 0.3354827439023934, + "learning_rate": 5.3565938280631386e-05, + "loss": 1.1157, + "step": 14027 + }, + { + "epoch": 1.34, + "grad_norm": 0.29398714358702577, + "learning_rate": 5.355192822379307e-05, + "loss": 0.9833, + "step": 14028 + }, + { + "epoch": 1.34, + "grad_norm": 0.3426411946142527, + "learning_rate": 5.3537919329346955e-05, + "loss": 0.9782, + "step": 14029 + }, + { + "epoch": 1.34, + "grad_norm": 0.3052579104079863, + "learning_rate": 5.3523911597643626e-05, + "loss": 1.0896, + "step": 14030 + }, + { + "epoch": 1.34, + "grad_norm": 0.28984525537959543, + "learning_rate": 5.3509905029033594e-05, + "loss": 0.8939, + "step": 14031 + }, + { + "epoch": 1.34, + "grad_norm": 0.321348285803822, + "learning_rate": 5.349589962386744e-05, + "loss": 0.9874, + "step": 14032 + }, + { + "epoch": 1.34, + "grad_norm": 0.27765976466971737, + "learning_rate": 5.3481895382495596e-05, + "loss": 1.1012, + "step": 14033 + }, + { + "epoch": 1.34, + "grad_norm": 0.3090585751989251, + "learning_rate": 5.346789230526854e-05, + "loss": 1.1468, + "step": 14034 + }, + { + "epoch": 1.34, + "grad_norm": 0.3047055467458917, + "learning_rate": 5.345389039253673e-05, + "loss": 1.0999, + "step": 14035 + }, + { + "epoch": 1.34, + "grad_norm": 0.3404330233167928, + "learning_rate": 5.3439889644650606e-05, + "loss": 1.0697, + "step": 14036 + }, + { + "epoch": 1.34, + "grad_norm": 0.30431556068882387, + "learning_rate": 5.342589006196046e-05, + "loss": 0.9503, + "step": 14037 + }, + { + "epoch": 1.34, + "grad_norm": 0.29869549567157366, + "learning_rate": 5.3411891644816704e-05, + "loss": 1.0263, + "step": 14038 + }, + { + "epoch": 1.34, + "grad_norm": 0.31029292761976063, + "learning_rate": 5.339789439356966e-05, + "loss": 0.9728, + "step": 14039 + }, + { + "epoch": 1.34, + "grad_norm": 0.3462319619272705, + "learning_rate": 5.338389830856956e-05, + "loss": 1.0047, + "step": 14040 + }, + { + "epoch": 1.34, + "grad_norm": 0.31672964912421686, + "learning_rate": 5.3369903390166754e-05, + "loss": 1.1013, + "step": 14041 + }, + { + "epoch": 1.34, + "grad_norm": 0.3174189600423441, + "learning_rate": 5.335590963871133e-05, + "loss": 1.0309, + "step": 14042 + }, + { + "epoch": 1.34, + "grad_norm": 0.31170957053418147, + "learning_rate": 5.334191705455367e-05, + "loss": 1.1061, + "step": 14043 + }, + { + "epoch": 1.34, + "grad_norm": 0.30122102887811947, + "learning_rate": 5.332792563804381e-05, + "loss": 1.0456, + "step": 14044 + }, + { + "epoch": 1.34, + "grad_norm": 0.2996493561501193, + "learning_rate": 5.331393538953199e-05, + "loss": 1.0365, + "step": 14045 + }, + { + "epoch": 1.34, + "grad_norm": 0.308690754192563, + "learning_rate": 5.3299946309368234e-05, + "loss": 1.0432, + "step": 14046 + }, + { + "epoch": 1.34, + "grad_norm": 0.3008714009711711, + "learning_rate": 5.3285958397902715e-05, + "loss": 1.0598, + "step": 14047 + }, + { + "epoch": 1.34, + "grad_norm": 0.29762518987011105, + "learning_rate": 5.3271971655485406e-05, + "loss": 1.1147, + "step": 14048 + }, + { + "epoch": 1.34, + "grad_norm": 0.31529193588195403, + "learning_rate": 5.3257986082466375e-05, + "loss": 1.0331, + "step": 14049 + }, + { + "epoch": 1.34, + "grad_norm": 0.3161987871838107, + "learning_rate": 5.324400167919561e-05, + "loss": 1.0504, + "step": 14050 + }, + { + "epoch": 1.34, + "grad_norm": 0.2763531773695376, + "learning_rate": 5.3230018446023135e-05, + "loss": 1.0159, + "step": 14051 + }, + { + "epoch": 1.34, + "grad_norm": 0.29583434201805475, + "learning_rate": 5.321603638329879e-05, + "loss": 0.9844, + "step": 14052 + }, + { + "epoch": 1.34, + "grad_norm": 0.34663105331373556, + "learning_rate": 5.320205549137256e-05, + "loss": 0.9067, + "step": 14053 + }, + { + "epoch": 1.34, + "grad_norm": 0.31542980126203557, + "learning_rate": 5.318807577059433e-05, + "loss": 0.8958, + "step": 14054 + }, + { + "epoch": 1.34, + "grad_norm": 0.29329028664522067, + "learning_rate": 5.317409722131388e-05, + "loss": 0.9326, + "step": 14055 + }, + { + "epoch": 1.34, + "grad_norm": 0.29031512002711235, + "learning_rate": 5.316011984388108e-05, + "loss": 1.1487, + "step": 14056 + }, + { + "epoch": 1.34, + "grad_norm": 0.27138463784076855, + "learning_rate": 5.314614363864572e-05, + "loss": 1.0655, + "step": 14057 + }, + { + "epoch": 1.34, + "grad_norm": 0.32438761973286445, + "learning_rate": 5.31321686059576e-05, + "loss": 0.9911, + "step": 14058 + }, + { + "epoch": 1.35, + "grad_norm": 0.2942006516408754, + "learning_rate": 5.3118194746166386e-05, + "loss": 0.9949, + "step": 14059 + }, + { + "epoch": 1.35, + "grad_norm": 0.3106096223641047, + "learning_rate": 5.310422205962184e-05, + "loss": 1.0073, + "step": 14060 + }, + { + "epoch": 1.35, + "grad_norm": 0.3572938180268862, + "learning_rate": 5.309025054667358e-05, + "loss": 0.9984, + "step": 14061 + }, + { + "epoch": 1.35, + "grad_norm": 0.3285370831872955, + "learning_rate": 5.3076280207671266e-05, + "loss": 1.0436, + "step": 14062 + }, + { + "epoch": 1.35, + "grad_norm": 0.2602638048903277, + "learning_rate": 5.306231104296455e-05, + "loss": 1.0773, + "step": 14063 + }, + { + "epoch": 1.35, + "grad_norm": 0.31301892338167847, + "learning_rate": 5.3048343052903024e-05, + "loss": 0.9082, + "step": 14064 + }, + { + "epoch": 1.35, + "grad_norm": 0.2891939024113613, + "learning_rate": 5.303437623783618e-05, + "loss": 1.0854, + "step": 14065 + }, + { + "epoch": 1.35, + "grad_norm": 0.311864330109885, + "learning_rate": 5.3020410598113646e-05, + "loss": 1.076, + "step": 14066 + }, + { + "epoch": 1.35, + "grad_norm": 0.31837313598157674, + "learning_rate": 5.300644613408481e-05, + "loss": 1.0643, + "step": 14067 + }, + { + "epoch": 1.35, + "grad_norm": 0.2953231639375391, + "learning_rate": 5.2992482846099234e-05, + "loss": 0.9706, + "step": 14068 + }, + { + "epoch": 1.35, + "grad_norm": 0.2650047661589804, + "learning_rate": 5.297852073450623e-05, + "loss": 1.1381, + "step": 14069 + }, + { + "epoch": 1.35, + "grad_norm": 0.2852993388402205, + "learning_rate": 5.296455979965539e-05, + "loss": 0.9082, + "step": 14070 + }, + { + "epoch": 1.35, + "grad_norm": 0.33687354351218385, + "learning_rate": 5.295060004189596e-05, + "loss": 0.9643, + "step": 14071 + }, + { + "epoch": 1.35, + "grad_norm": 0.34616678826555847, + "learning_rate": 5.293664146157736e-05, + "loss": 1.0366, + "step": 14072 + }, + { + "epoch": 1.35, + "grad_norm": 0.3105474577814933, + "learning_rate": 5.292268405904885e-05, + "loss": 0.9914, + "step": 14073 + }, + { + "epoch": 1.35, + "grad_norm": 0.3097272204900178, + "learning_rate": 5.2908727834659765e-05, + "loss": 0.9894, + "step": 14074 + }, + { + "epoch": 1.35, + "grad_norm": 0.3037007097703506, + "learning_rate": 5.289477278875941e-05, + "loss": 0.9791, + "step": 14075 + }, + { + "epoch": 1.35, + "grad_norm": 0.3235903842785318, + "learning_rate": 5.2880818921696916e-05, + "loss": 0.9658, + "step": 14076 + }, + { + "epoch": 1.35, + "grad_norm": 0.24578753378555843, + "learning_rate": 5.2866866233821555e-05, + "loss": 1.0067, + "step": 14077 + }, + { + "epoch": 1.35, + "grad_norm": 0.2732240331043465, + "learning_rate": 5.2852914725482474e-05, + "loss": 1.0858, + "step": 14078 + }, + { + "epoch": 1.35, + "grad_norm": 0.3284778195379323, + "learning_rate": 5.283896439702888e-05, + "loss": 0.9676, + "step": 14079 + }, + { + "epoch": 1.35, + "grad_norm": 0.34460761731664424, + "learning_rate": 5.282501524880979e-05, + "loss": 1.0279, + "step": 14080 + }, + { + "epoch": 1.35, + "grad_norm": 0.30622334377206206, + "learning_rate": 5.28110672811744e-05, + "loss": 1.0863, + "step": 14081 + }, + { + "epoch": 1.35, + "grad_norm": 0.3373881110582242, + "learning_rate": 5.2797120494471656e-05, + "loss": 1.0953, + "step": 14082 + }, + { + "epoch": 1.35, + "grad_norm": 0.3125318280735094, + "learning_rate": 5.278317488905065e-05, + "loss": 0.9709, + "step": 14083 + }, + { + "epoch": 1.35, + "grad_norm": 0.3117152920651228, + "learning_rate": 5.2769230465260356e-05, + "loss": 0.8976, + "step": 14084 + }, + { + "epoch": 1.35, + "grad_norm": 0.35476833331927576, + "learning_rate": 5.27552872234498e-05, + "loss": 0.9974, + "step": 14085 + }, + { + "epoch": 1.35, + "grad_norm": 0.3409335975418433, + "learning_rate": 5.2741345163967824e-05, + "loss": 1.0578, + "step": 14086 + }, + { + "epoch": 1.35, + "grad_norm": 0.3276779038298904, + "learning_rate": 5.272740428716343e-05, + "loss": 1.0396, + "step": 14087 + }, + { + "epoch": 1.35, + "grad_norm": 0.3309780661420768, + "learning_rate": 5.271346459338542e-05, + "loss": 0.968, + "step": 14088 + }, + { + "epoch": 1.35, + "grad_norm": 0.3545896643503232, + "learning_rate": 5.2699526082982676e-05, + "loss": 1.0716, + "step": 14089 + }, + { + "epoch": 1.35, + "grad_norm": 0.31998196296140186, + "learning_rate": 5.2685588756304006e-05, + "loss": 1.0154, + "step": 14090 + }, + { + "epoch": 1.35, + "grad_norm": 0.29899840005867523, + "learning_rate": 5.267165261369826e-05, + "loss": 0.8588, + "step": 14091 + }, + { + "epoch": 1.35, + "grad_norm": 0.31499479253604773, + "learning_rate": 5.2657717655514124e-05, + "loss": 1.0086, + "step": 14092 + }, + { + "epoch": 1.35, + "grad_norm": 0.3051684524076439, + "learning_rate": 5.264378388210038e-05, + "loss": 1.0053, + "step": 14093 + }, + { + "epoch": 1.35, + "grad_norm": 0.30780899183436977, + "learning_rate": 5.2629851293805685e-05, + "loss": 1.0075, + "step": 14094 + }, + { + "epoch": 1.35, + "grad_norm": 0.3034350176554476, + "learning_rate": 5.2615919890978715e-05, + "loss": 1.0653, + "step": 14095 + }, + { + "epoch": 1.35, + "grad_norm": 0.3140578675029522, + "learning_rate": 5.260198967396814e-05, + "loss": 1.0814, + "step": 14096 + }, + { + "epoch": 1.35, + "grad_norm": 0.29776250849901603, + "learning_rate": 5.2588060643122595e-05, + "loss": 1.106, + "step": 14097 + }, + { + "epoch": 1.35, + "grad_norm": 0.34576707762072373, + "learning_rate": 5.2574132798790595e-05, + "loss": 0.9833, + "step": 14098 + }, + { + "epoch": 1.35, + "grad_norm": 0.2929152866876682, + "learning_rate": 5.2560206141320734e-05, + "loss": 1.1065, + "step": 14099 + }, + { + "epoch": 1.35, + "grad_norm": 0.29669722731906767, + "learning_rate": 5.2546280671061555e-05, + "loss": 0.8831, + "step": 14100 + }, + { + "epoch": 1.35, + "grad_norm": 0.31795423054400845, + "learning_rate": 5.2532356388361494e-05, + "loss": 0.9696, + "step": 14101 + }, + { + "epoch": 1.35, + "grad_norm": 0.32848838724484813, + "learning_rate": 5.251843329356908e-05, + "loss": 1.0848, + "step": 14102 + }, + { + "epoch": 1.35, + "grad_norm": 0.33921981643621085, + "learning_rate": 5.250451138703264e-05, + "loss": 1.066, + "step": 14103 + }, + { + "epoch": 1.35, + "grad_norm": 0.30021911129959533, + "learning_rate": 5.2490590669100735e-05, + "loss": 1.0137, + "step": 14104 + }, + { + "epoch": 1.35, + "grad_norm": 0.3098128700174503, + "learning_rate": 5.2476671140121604e-05, + "loss": 0.9187, + "step": 14105 + }, + { + "epoch": 1.35, + "grad_norm": 0.3135078610073723, + "learning_rate": 5.2462752800443706e-05, + "loss": 1.0295, + "step": 14106 + }, + { + "epoch": 1.35, + "grad_norm": 0.34500370448013884, + "learning_rate": 5.244883565041523e-05, + "loss": 1.0818, + "step": 14107 + }, + { + "epoch": 1.35, + "grad_norm": 0.3247095958341011, + "learning_rate": 5.2434919690384585e-05, + "loss": 1.1057, + "step": 14108 + }, + { + "epoch": 1.35, + "grad_norm": 0.3307453008707443, + "learning_rate": 5.242100492069992e-05, + "loss": 0.9289, + "step": 14109 + }, + { + "epoch": 1.35, + "grad_norm": 0.31223346550564524, + "learning_rate": 5.2407091341709514e-05, + "loss": 1.0532, + "step": 14110 + }, + { + "epoch": 1.35, + "grad_norm": 0.3179561978599248, + "learning_rate": 5.2393178953761557e-05, + "loss": 1.0495, + "step": 14111 + }, + { + "epoch": 1.35, + "grad_norm": 0.3895280185333459, + "learning_rate": 5.2379267757204254e-05, + "loss": 0.9884, + "step": 14112 + }, + { + "epoch": 1.35, + "grad_norm": 0.324782869293075, + "learning_rate": 5.236535775238567e-05, + "loss": 0.9787, + "step": 14113 + }, + { + "epoch": 1.35, + "grad_norm": 0.3234191362922148, + "learning_rate": 5.235144893965398e-05, + "loss": 0.9953, + "step": 14114 + }, + { + "epoch": 1.35, + "grad_norm": 0.32854422087145974, + "learning_rate": 5.233754131935719e-05, + "loss": 0.8807, + "step": 14115 + }, + { + "epoch": 1.35, + "grad_norm": 0.3023391055731823, + "learning_rate": 5.232363489184338e-05, + "loss": 1.1916, + "step": 14116 + }, + { + "epoch": 1.35, + "grad_norm": 0.316636015766911, + "learning_rate": 5.2309729657460574e-05, + "loss": 0.943, + "step": 14117 + }, + { + "epoch": 1.35, + "grad_norm": 0.33550650036627105, + "learning_rate": 5.229582561655678e-05, + "loss": 1.0353, + "step": 14118 + }, + { + "epoch": 1.35, + "grad_norm": 0.2839798398813967, + "learning_rate": 5.228192276947991e-05, + "loss": 1.0716, + "step": 14119 + }, + { + "epoch": 1.35, + "grad_norm": 0.27599233360099995, + "learning_rate": 5.22680211165779e-05, + "loss": 1.0095, + "step": 14120 + }, + { + "epoch": 1.35, + "grad_norm": 0.2990950363753826, + "learning_rate": 5.22541206581987e-05, + "loss": 1.0525, + "step": 14121 + }, + { + "epoch": 1.35, + "grad_norm": 0.28778722260310835, + "learning_rate": 5.224022139469011e-05, + "loss": 0.9744, + "step": 14122 + }, + { + "epoch": 1.35, + "grad_norm": 0.3150913905151712, + "learning_rate": 5.222632332639999e-05, + "loss": 1.0604, + "step": 14123 + }, + { + "epoch": 1.35, + "grad_norm": 0.32210862990741, + "learning_rate": 5.221242645367616e-05, + "loss": 1.0245, + "step": 14124 + }, + { + "epoch": 1.35, + "grad_norm": 0.3049368178693386, + "learning_rate": 5.219853077686642e-05, + "loss": 0.9894, + "step": 14125 + }, + { + "epoch": 1.35, + "grad_norm": 0.3789197509564292, + "learning_rate": 5.2184636296318444e-05, + "loss": 0.9785, + "step": 14126 + }, + { + "epoch": 1.35, + "grad_norm": 0.32488312595501134, + "learning_rate": 5.217074301238005e-05, + "loss": 0.9993, + "step": 14127 + }, + { + "epoch": 1.35, + "grad_norm": 0.31768732895208107, + "learning_rate": 5.215685092539883e-05, + "loss": 1.0464, + "step": 14128 + }, + { + "epoch": 1.35, + "grad_norm": 0.3100765867133566, + "learning_rate": 5.214296003572252e-05, + "loss": 0.9039, + "step": 14129 + }, + { + "epoch": 1.35, + "grad_norm": 0.3012046565603547, + "learning_rate": 5.212907034369863e-05, + "loss": 1.1117, + "step": 14130 + }, + { + "epoch": 1.35, + "grad_norm": 0.32539554809656335, + "learning_rate": 5.2115181849674924e-05, + "loss": 1.0046, + "step": 14131 + }, + { + "epoch": 1.35, + "grad_norm": 0.29600535127030003, + "learning_rate": 5.2101294553998835e-05, + "loss": 0.9859, + "step": 14132 + }, + { + "epoch": 1.35, + "grad_norm": 0.324345628017839, + "learning_rate": 5.2087408457018e-05, + "loss": 1.1485, + "step": 14133 + }, + { + "epoch": 1.35, + "grad_norm": 0.3031890166424321, + "learning_rate": 5.207352355907985e-05, + "loss": 0.9793, + "step": 14134 + }, + { + "epoch": 1.35, + "grad_norm": 0.3101522427766782, + "learning_rate": 5.205963986053193e-05, + "loss": 1.0894, + "step": 14135 + }, + { + "epoch": 1.35, + "grad_norm": 0.27650276035460297, + "learning_rate": 5.204575736172162e-05, + "loss": 0.9278, + "step": 14136 + }, + { + "epoch": 1.35, + "grad_norm": 0.2887797643347604, + "learning_rate": 5.2031876062996354e-05, + "loss": 0.9578, + "step": 14137 + }, + { + "epoch": 1.35, + "grad_norm": 0.2588624176325902, + "learning_rate": 5.201799596470354e-05, + "loss": 0.9478, + "step": 14138 + }, + { + "epoch": 1.35, + "grad_norm": 0.2709730058878256, + "learning_rate": 5.200411706719057e-05, + "loss": 1.1578, + "step": 14139 + }, + { + "epoch": 1.35, + "grad_norm": 0.30323446392079456, + "learning_rate": 5.1990239370804694e-05, + "loss": 1.0288, + "step": 14140 + }, + { + "epoch": 1.35, + "grad_norm": 0.2980613559085131, + "learning_rate": 5.197636287589326e-05, + "loss": 1.0387, + "step": 14141 + }, + { + "epoch": 1.35, + "grad_norm": 0.3114343520559088, + "learning_rate": 5.196248758280357e-05, + "loss": 1.0127, + "step": 14142 + }, + { + "epoch": 1.35, + "grad_norm": 0.29097398893876586, + "learning_rate": 5.194861349188276e-05, + "loss": 1.002, + "step": 14143 + }, + { + "epoch": 1.35, + "grad_norm": 0.3160034444728277, + "learning_rate": 5.1934740603478116e-05, + "loss": 1.126, + "step": 14144 + }, + { + "epoch": 1.35, + "grad_norm": 0.3132481258701965, + "learning_rate": 5.192086891793678e-05, + "loss": 0.9708, + "step": 14145 + }, + { + "epoch": 1.35, + "grad_norm": 0.2906772263283179, + "learning_rate": 5.190699843560597e-05, + "loss": 0.9726, + "step": 14146 + }, + { + "epoch": 1.35, + "grad_norm": 0.26769650009429563, + "learning_rate": 5.1893129156832696e-05, + "loss": 0.9498, + "step": 14147 + }, + { + "epoch": 1.35, + "grad_norm": 0.2884612764371217, + "learning_rate": 5.187926108196415e-05, + "loss": 1.0027, + "step": 14148 + }, + { + "epoch": 1.35, + "grad_norm": 0.2739373841324632, + "learning_rate": 5.1865394211347294e-05, + "loss": 0.9222, + "step": 14149 + }, + { + "epoch": 1.35, + "grad_norm": 0.32860009242628613, + "learning_rate": 5.1851528545329196e-05, + "loss": 1.1758, + "step": 14150 + }, + { + "epoch": 1.35, + "grad_norm": 0.31984178469337354, + "learning_rate": 5.183766408425686e-05, + "loss": 1.0204, + "step": 14151 + }, + { + "epoch": 1.35, + "grad_norm": 0.3189236580646693, + "learning_rate": 5.1823800828477285e-05, + "loss": 1.021, + "step": 14152 + }, + { + "epoch": 1.35, + "grad_norm": 0.2700439745774501, + "learning_rate": 5.180993877833735e-05, + "loss": 1.1131, + "step": 14153 + }, + { + "epoch": 1.35, + "grad_norm": 0.3422680923725331, + "learning_rate": 5.1796077934184e-05, + "loss": 1.0613, + "step": 14154 + }, + { + "epoch": 1.35, + "grad_norm": 0.27714803732782944, + "learning_rate": 5.178221829636407e-05, + "loss": 1.0171, + "step": 14155 + }, + { + "epoch": 1.35, + "grad_norm": 0.3216835571609934, + "learning_rate": 5.176835986522443e-05, + "loss": 1.0454, + "step": 14156 + }, + { + "epoch": 1.35, + "grad_norm": 0.32185281650574843, + "learning_rate": 5.175450264111188e-05, + "loss": 1.0164, + "step": 14157 + }, + { + "epoch": 1.35, + "grad_norm": 0.31563840459595427, + "learning_rate": 5.174064662437328e-05, + "loss": 1.0989, + "step": 14158 + }, + { + "epoch": 1.35, + "grad_norm": 0.319405501931305, + "learning_rate": 5.172679181535528e-05, + "loss": 1.0971, + "step": 14159 + }, + { + "epoch": 1.35, + "grad_norm": 0.2881031578023901, + "learning_rate": 5.171293821440466e-05, + "loss": 0.9981, + "step": 14160 + }, + { + "epoch": 1.35, + "grad_norm": 0.28049822794280455, + "learning_rate": 5.169908582186814e-05, + "loss": 1.0037, + "step": 14161 + }, + { + "epoch": 1.35, + "grad_norm": 0.3031584443634287, + "learning_rate": 5.168523463809232e-05, + "loss": 1.1441, + "step": 14162 + }, + { + "epoch": 1.35, + "grad_norm": 0.34092907078138307, + "learning_rate": 5.16713846634239e-05, + "loss": 1.0578, + "step": 14163 + }, + { + "epoch": 1.36, + "grad_norm": 0.311557127637738, + "learning_rate": 5.1657535898209384e-05, + "loss": 1.02, + "step": 14164 + }, + { + "epoch": 1.36, + "grad_norm": 0.35142178844570465, + "learning_rate": 5.1643688342795496e-05, + "loss": 0.9925, + "step": 14165 + }, + { + "epoch": 1.36, + "grad_norm": 0.3456236975437613, + "learning_rate": 5.1629841997528635e-05, + "loss": 1.0718, + "step": 14166 + }, + { + "epoch": 1.36, + "grad_norm": 0.30917381467050736, + "learning_rate": 5.161599686275544e-05, + "loss": 0.9447, + "step": 14167 + }, + { + "epoch": 1.36, + "grad_norm": 0.3021697576466236, + "learning_rate": 5.160215293882229e-05, + "loss": 0.9613, + "step": 14168 + }, + { + "epoch": 1.36, + "grad_norm": 0.33677715706868244, + "learning_rate": 5.158831022607571e-05, + "loss": 0.9982, + "step": 14169 + }, + { + "epoch": 1.36, + "grad_norm": 0.26888773963111867, + "learning_rate": 5.157446872486205e-05, + "loss": 0.8918, + "step": 14170 + }, + { + "epoch": 1.36, + "grad_norm": 0.3539822970440161, + "learning_rate": 5.156062843552776e-05, + "loss": 1.0106, + "step": 14171 + }, + { + "epoch": 1.36, + "grad_norm": 0.3045217144266334, + "learning_rate": 5.154678935841917e-05, + "loss": 1.0061, + "step": 14172 + }, + { + "epoch": 1.36, + "grad_norm": 0.3264455056130133, + "learning_rate": 5.1532951493882674e-05, + "loss": 0.9769, + "step": 14173 + }, + { + "epoch": 1.36, + "grad_norm": 0.30672715464118017, + "learning_rate": 5.151911484226448e-05, + "loss": 0.9831, + "step": 14174 + }, + { + "epoch": 1.36, + "grad_norm": 0.29830597027092154, + "learning_rate": 5.1505279403910967e-05, + "loss": 0.9841, + "step": 14175 + }, + { + "epoch": 1.36, + "grad_norm": 0.32411414366153724, + "learning_rate": 5.149144517916826e-05, + "loss": 1.0005, + "step": 14176 + }, + { + "epoch": 1.36, + "grad_norm": 0.3783846861477258, + "learning_rate": 5.1477612168382624e-05, + "loss": 1.0342, + "step": 14177 + }, + { + "epoch": 1.36, + "grad_norm": 0.3198723757172489, + "learning_rate": 5.1463780371900236e-05, + "loss": 1.0442, + "step": 14178 + }, + { + "epoch": 1.36, + "grad_norm": 0.33137666492689694, + "learning_rate": 5.1449949790067296e-05, + "loss": 0.9791, + "step": 14179 + }, + { + "epoch": 1.36, + "grad_norm": 0.3175783964000346, + "learning_rate": 5.1436120423229826e-05, + "loss": 0.8672, + "step": 14180 + }, + { + "epoch": 1.36, + "grad_norm": 0.2815044564980019, + "learning_rate": 5.1422292271733986e-05, + "loss": 1.0878, + "step": 14181 + }, + { + "epoch": 1.36, + "grad_norm": 0.3354622189714739, + "learning_rate": 5.1408465335925825e-05, + "loss": 0.9266, + "step": 14182 + }, + { + "epoch": 1.36, + "grad_norm": 0.3674854199298497, + "learning_rate": 5.139463961615133e-05, + "loss": 0.9082, + "step": 14183 + }, + { + "epoch": 1.36, + "grad_norm": 0.2670741837053459, + "learning_rate": 5.138081511275652e-05, + "loss": 1.0482, + "step": 14184 + }, + { + "epoch": 1.36, + "grad_norm": 0.3032190247931152, + "learning_rate": 5.1366991826087374e-05, + "loss": 1.0429, + "step": 14185 + }, + { + "epoch": 1.36, + "grad_norm": 0.2941932990156685, + "learning_rate": 5.135316975648986e-05, + "loss": 0.9958, + "step": 14186 + }, + { + "epoch": 1.36, + "grad_norm": 0.28578828714307497, + "learning_rate": 5.1339348904309806e-05, + "loss": 0.8557, + "step": 14187 + }, + { + "epoch": 1.36, + "grad_norm": 0.2816173755543984, + "learning_rate": 5.132552926989317e-05, + "loss": 0.9614, + "step": 14188 + }, + { + "epoch": 1.36, + "grad_norm": 0.30200683438259907, + "learning_rate": 5.13117108535857e-05, + "loss": 1.1145, + "step": 14189 + }, + { + "epoch": 1.36, + "grad_norm": 0.28894431874014587, + "learning_rate": 5.129789365573332e-05, + "loss": 1.0016, + "step": 14190 + }, + { + "epoch": 1.36, + "grad_norm": 0.33635324009387957, + "learning_rate": 5.128407767668169e-05, + "loss": 0.9952, + "step": 14191 + }, + { + "epoch": 1.36, + "grad_norm": 0.30958392214566965, + "learning_rate": 5.1270262916776704e-05, + "loss": 1.0125, + "step": 14192 + }, + { + "epoch": 1.36, + "grad_norm": 0.33245263423674304, + "learning_rate": 5.125644937636396e-05, + "loss": 1.028, + "step": 14193 + }, + { + "epoch": 1.36, + "grad_norm": 0.33571129384815396, + "learning_rate": 5.124263705578925e-05, + "loss": 1.0675, + "step": 14194 + }, + { + "epoch": 1.36, + "grad_norm": 0.3278490479355475, + "learning_rate": 5.122882595539816e-05, + "loss": 1.0572, + "step": 14195 + }, + { + "epoch": 1.36, + "grad_norm": 0.31917460630744426, + "learning_rate": 5.121501607553638e-05, + "loss": 1.0717, + "step": 14196 + }, + { + "epoch": 1.36, + "grad_norm": 0.3502010169927981, + "learning_rate": 5.120120741654946e-05, + "loss": 0.9556, + "step": 14197 + }, + { + "epoch": 1.36, + "grad_norm": 0.451216291002631, + "learning_rate": 5.118739997878299e-05, + "loss": 0.9966, + "step": 14198 + }, + { + "epoch": 1.36, + "grad_norm": 0.34389329269221786, + "learning_rate": 5.1173593762582504e-05, + "loss": 0.9877, + "step": 14199 + }, + { + "epoch": 1.36, + "grad_norm": 0.3134832448677587, + "learning_rate": 5.115978876829356e-05, + "loss": 1.0043, + "step": 14200 + }, + { + "epoch": 1.36, + "grad_norm": 0.27949552550299644, + "learning_rate": 5.1145984996261556e-05, + "loss": 1.0158, + "step": 14201 + }, + { + "epoch": 1.36, + "grad_norm": 0.32927008463748897, + "learning_rate": 5.1132182446831986e-05, + "loss": 1.0013, + "step": 14202 + }, + { + "epoch": 1.36, + "grad_norm": 0.280282681722434, + "learning_rate": 5.11183811203503e-05, + "loss": 0.9031, + "step": 14203 + }, + { + "epoch": 1.36, + "grad_norm": 0.29127357707951285, + "learning_rate": 5.110458101716181e-05, + "loss": 0.8706, + "step": 14204 + }, + { + "epoch": 1.36, + "grad_norm": 0.2891380828709362, + "learning_rate": 5.109078213761191e-05, + "loss": 1.0127, + "step": 14205 + }, + { + "epoch": 1.36, + "grad_norm": 0.3300803775215953, + "learning_rate": 5.107698448204592e-05, + "loss": 1.0064, + "step": 14206 + }, + { + "epoch": 1.36, + "grad_norm": 0.32461888870203603, + "learning_rate": 5.1063188050809186e-05, + "loss": 1.1272, + "step": 14207 + }, + { + "epoch": 1.36, + "grad_norm": 0.365841036641064, + "learning_rate": 5.104939284424689e-05, + "loss": 0.9685, + "step": 14208 + }, + { + "epoch": 1.36, + "grad_norm": 0.2873351255029833, + "learning_rate": 5.1035598862704335e-05, + "loss": 1.0531, + "step": 14209 + }, + { + "epoch": 1.36, + "grad_norm": 0.305809662283866, + "learning_rate": 5.102180610652665e-05, + "loss": 1.1116, + "step": 14210 + }, + { + "epoch": 1.36, + "grad_norm": 0.30681843430130473, + "learning_rate": 5.100801457605906e-05, + "loss": 1.0128, + "step": 14211 + }, + { + "epoch": 1.36, + "grad_norm": 0.3457506889061676, + "learning_rate": 5.0994224271646684e-05, + "loss": 1.0103, + "step": 14212 + }, + { + "epoch": 1.36, + "grad_norm": 0.31209671382589976, + "learning_rate": 5.098043519363469e-05, + "loss": 1.042, + "step": 14213 + }, + { + "epoch": 1.36, + "grad_norm": 0.3164114349205943, + "learning_rate": 5.096664734236807e-05, + "loss": 1.0998, + "step": 14214 + }, + { + "epoch": 1.36, + "grad_norm": 0.3101631011633101, + "learning_rate": 5.095286071819197e-05, + "loss": 1.0899, + "step": 14215 + }, + { + "epoch": 1.36, + "grad_norm": 0.33391839707507154, + "learning_rate": 5.09390753214513e-05, + "loss": 0.8777, + "step": 14216 + }, + { + "epoch": 1.36, + "grad_norm": 0.27880294675426254, + "learning_rate": 5.092529115249114e-05, + "loss": 1.015, + "step": 14217 + }, + { + "epoch": 1.36, + "grad_norm": 0.3103952144732892, + "learning_rate": 5.091150821165633e-05, + "loss": 1.0094, + "step": 14218 + }, + { + "epoch": 1.36, + "grad_norm": 0.31072521119735746, + "learning_rate": 5.089772649929198e-05, + "loss": 1.0441, + "step": 14219 + }, + { + "epoch": 1.36, + "grad_norm": 0.32539004738049604, + "learning_rate": 5.088394601574282e-05, + "loss": 1.0514, + "step": 14220 + }, + { + "epoch": 1.36, + "grad_norm": 0.32694555838330974, + "learning_rate": 5.087016676135383e-05, + "loss": 0.8942, + "step": 14221 + }, + { + "epoch": 1.36, + "grad_norm": 0.3208273593001083, + "learning_rate": 5.085638873646976e-05, + "loss": 1.0018, + "step": 14222 + }, + { + "epoch": 1.36, + "grad_norm": 0.31417223843602665, + "learning_rate": 5.084261194143545e-05, + "loss": 0.9095, + "step": 14223 + }, + { + "epoch": 1.36, + "grad_norm": 0.35150215770408405, + "learning_rate": 5.08288363765957e-05, + "loss": 0.977, + "step": 14224 + }, + { + "epoch": 1.36, + "grad_norm": 0.29869534915186036, + "learning_rate": 5.081506204229518e-05, + "loss": 1.0875, + "step": 14225 + }, + { + "epoch": 1.36, + "grad_norm": 0.32600837354255485, + "learning_rate": 5.0801288938878646e-05, + "loss": 0.9753, + "step": 14226 + }, + { + "epoch": 1.36, + "grad_norm": 0.3271222654123467, + "learning_rate": 5.078751706669078e-05, + "loss": 1.0371, + "step": 14227 + }, + { + "epoch": 1.36, + "grad_norm": 0.32775869136192193, + "learning_rate": 5.0773746426076265e-05, + "loss": 1.0398, + "step": 14228 + }, + { + "epoch": 1.36, + "grad_norm": 0.3287902019717911, + "learning_rate": 5.0759977017379645e-05, + "loss": 1.0743, + "step": 14229 + }, + { + "epoch": 1.36, + "grad_norm": 0.34494346648639623, + "learning_rate": 5.0746208840945586e-05, + "loss": 1.0377, + "step": 14230 + }, + { + "epoch": 1.36, + "grad_norm": 0.27753290806748715, + "learning_rate": 5.073244189711856e-05, + "loss": 0.9812, + "step": 14231 + }, + { + "epoch": 1.36, + "grad_norm": 0.28331029047003553, + "learning_rate": 5.071867618624315e-05, + "loss": 1.0031, + "step": 14232 + }, + { + "epoch": 1.36, + "grad_norm": 0.2966089265753014, + "learning_rate": 5.070491170866384e-05, + "loss": 1.0919, + "step": 14233 + }, + { + "epoch": 1.36, + "grad_norm": 0.29576762238459126, + "learning_rate": 5.069114846472513e-05, + "loss": 0.9197, + "step": 14234 + }, + { + "epoch": 1.36, + "grad_norm": 0.3229448792322541, + "learning_rate": 5.067738645477138e-05, + "loss": 0.9392, + "step": 14235 + }, + { + "epoch": 1.36, + "grad_norm": 0.29832462415666205, + "learning_rate": 5.066362567914706e-05, + "loss": 1.0536, + "step": 14236 + }, + { + "epoch": 1.36, + "grad_norm": 0.31057192145396, + "learning_rate": 5.0649866138196486e-05, + "loss": 1.0427, + "step": 14237 + }, + { + "epoch": 1.36, + "grad_norm": 0.31226703282192453, + "learning_rate": 5.063610783226408e-05, + "loss": 0.9544, + "step": 14238 + }, + { + "epoch": 1.36, + "grad_norm": 0.3077210697371082, + "learning_rate": 5.062235076169401e-05, + "loss": 1.03, + "step": 14239 + }, + { + "epoch": 1.36, + "grad_norm": 0.2758817464101371, + "learning_rate": 5.060859492683072e-05, + "loss": 0.9939, + "step": 14240 + }, + { + "epoch": 1.36, + "grad_norm": 0.29870237671899846, + "learning_rate": 5.059484032801834e-05, + "loss": 0.9768, + "step": 14241 + }, + { + "epoch": 1.36, + "grad_norm": 0.31777106569272995, + "learning_rate": 5.058108696560113e-05, + "loss": 0.9634, + "step": 14242 + }, + { + "epoch": 1.36, + "grad_norm": 0.3403047525028026, + "learning_rate": 5.056733483992333e-05, + "loss": 1.1097, + "step": 14243 + }, + { + "epoch": 1.36, + "grad_norm": 0.30286000585467543, + "learning_rate": 5.0553583951329e-05, + "loss": 1.0268, + "step": 14244 + }, + { + "epoch": 1.36, + "grad_norm": 0.3140630974645926, + "learning_rate": 5.0539834300162306e-05, + "loss": 1.0267, + "step": 14245 + }, + { + "epoch": 1.36, + "grad_norm": 0.3192416817102148, + "learning_rate": 5.052608588676734e-05, + "loss": 0.9858, + "step": 14246 + }, + { + "epoch": 1.36, + "grad_norm": 0.3335546241775897, + "learning_rate": 5.0512338711488206e-05, + "loss": 1.0174, + "step": 14247 + }, + { + "epoch": 1.36, + "grad_norm": 0.2885477405039649, + "learning_rate": 5.0498592774668865e-05, + "loss": 0.9648, + "step": 14248 + }, + { + "epoch": 1.36, + "grad_norm": 0.3139427468355865, + "learning_rate": 5.048484807665338e-05, + "loss": 1.0176, + "step": 14249 + }, + { + "epoch": 1.36, + "grad_norm": 0.3179955298448443, + "learning_rate": 5.047110461778566e-05, + "loss": 0.927, + "step": 14250 + }, + { + "epoch": 1.36, + "grad_norm": 0.32079524120806474, + "learning_rate": 5.0457362398409704e-05, + "loss": 0.9658, + "step": 14251 + }, + { + "epoch": 1.36, + "grad_norm": 0.3039002039886431, + "learning_rate": 5.044362141886932e-05, + "loss": 1.0079, + "step": 14252 + }, + { + "epoch": 1.36, + "grad_norm": 0.278127898241909, + "learning_rate": 5.042988167950854e-05, + "loss": 1.0935, + "step": 14253 + }, + { + "epoch": 1.36, + "grad_norm": 0.2868579210636773, + "learning_rate": 5.0416143180671075e-05, + "loss": 1.0751, + "step": 14254 + }, + { + "epoch": 1.36, + "grad_norm": 0.3072427467373777, + "learning_rate": 5.0402405922700824e-05, + "loss": 1.0985, + "step": 14255 + }, + { + "epoch": 1.36, + "grad_norm": 0.3021446590544224, + "learning_rate": 5.038866990594151e-05, + "loss": 1.0262, + "step": 14256 + }, + { + "epoch": 1.36, + "grad_norm": 0.27350584018381324, + "learning_rate": 5.037493513073695e-05, + "loss": 0.9009, + "step": 14257 + }, + { + "epoch": 1.36, + "grad_norm": 0.27268929681673754, + "learning_rate": 5.0361201597430773e-05, + "loss": 1.1361, + "step": 14258 + }, + { + "epoch": 1.36, + "grad_norm": 0.2959757609695759, + "learning_rate": 5.0347469306366735e-05, + "loss": 1.1494, + "step": 14259 + }, + { + "epoch": 1.36, + "grad_norm": 0.32147292358339974, + "learning_rate": 5.033373825788848e-05, + "loss": 0.9683, + "step": 14260 + }, + { + "epoch": 1.36, + "grad_norm": 0.286961076020842, + "learning_rate": 5.032000845233967e-05, + "loss": 0.9599, + "step": 14261 + }, + { + "epoch": 1.36, + "grad_norm": 0.31172384823765764, + "learning_rate": 5.0306279890063846e-05, + "loss": 1.0947, + "step": 14262 + }, + { + "epoch": 1.36, + "grad_norm": 0.30683027051153683, + "learning_rate": 5.029255257140459e-05, + "loss": 1.0116, + "step": 14263 + }, + { + "epoch": 1.36, + "grad_norm": 0.31801020764049326, + "learning_rate": 5.027882649670549e-05, + "loss": 0.9709, + "step": 14264 + }, + { + "epoch": 1.36, + "grad_norm": 0.3120122882376585, + "learning_rate": 5.0265101666309954e-05, + "loss": 1.0159, + "step": 14265 + }, + { + "epoch": 1.36, + "grad_norm": 0.32864177418942, + "learning_rate": 5.025137808056153e-05, + "loss": 1.0031, + "step": 14266 + }, + { + "epoch": 1.36, + "grad_norm": 0.29315605966465397, + "learning_rate": 5.023765573980361e-05, + "loss": 0.9781, + "step": 14267 + }, + { + "epoch": 1.37, + "grad_norm": 0.298165683393389, + "learning_rate": 5.022393464437969e-05, + "loss": 0.9469, + "step": 14268 + }, + { + "epoch": 1.37, + "grad_norm": 0.32549044792968745, + "learning_rate": 5.021021479463304e-05, + "loss": 1.082, + "step": 14269 + }, + { + "epoch": 1.37, + "grad_norm": 0.2979315681565218, + "learning_rate": 5.019649619090709e-05, + "loss": 0.9416, + "step": 14270 + }, + { + "epoch": 1.37, + "grad_norm": 0.3195259240921478, + "learning_rate": 5.0182778833545096e-05, + "loss": 0.9421, + "step": 14271 + }, + { + "epoch": 1.37, + "grad_norm": 0.2821172917669962, + "learning_rate": 5.016906272289036e-05, + "loss": 1.0271, + "step": 14272 + }, + { + "epoch": 1.37, + "grad_norm": 0.296024378818198, + "learning_rate": 5.015534785928615e-05, + "loss": 1.051, + "step": 14273 + }, + { + "epoch": 1.37, + "grad_norm": 0.31726514378154996, + "learning_rate": 5.014163424307572e-05, + "loss": 0.9901, + "step": 14274 + }, + { + "epoch": 1.37, + "grad_norm": 0.3532752375777321, + "learning_rate": 5.01279218746022e-05, + "loss": 1.0175, + "step": 14275 + }, + { + "epoch": 1.37, + "grad_norm": 0.30835002635885916, + "learning_rate": 5.011421075420881e-05, + "loss": 1.0523, + "step": 14276 + }, + { + "epoch": 1.37, + "grad_norm": 0.2747370370737288, + "learning_rate": 5.010050088223861e-05, + "loss": 1.0415, + "step": 14277 + }, + { + "epoch": 1.37, + "grad_norm": 0.3233701647063424, + "learning_rate": 5.0086792259034786e-05, + "loss": 0.9971, + "step": 14278 + }, + { + "epoch": 1.37, + "grad_norm": 0.2981921250392676, + "learning_rate": 5.007308488494027e-05, + "loss": 0.9336, + "step": 14279 + }, + { + "epoch": 1.37, + "grad_norm": 0.33692661205543634, + "learning_rate": 5.005937876029825e-05, + "loss": 1.0199, + "step": 14280 + }, + { + "epoch": 1.37, + "grad_norm": 0.2867706634108502, + "learning_rate": 5.0045673885451625e-05, + "loss": 0.9987, + "step": 14281 + }, + { + "epoch": 1.37, + "grad_norm": 0.3375826477882829, + "learning_rate": 5.0031970260743456e-05, + "loss": 0.9277, + "step": 14282 + }, + { + "epoch": 1.37, + "grad_norm": 0.252597634319411, + "learning_rate": 5.00182678865166e-05, + "loss": 0.9212, + "step": 14283 + }, + { + "epoch": 1.37, + "grad_norm": 0.31675910068353397, + "learning_rate": 5.000456676311398e-05, + "loss": 1.0134, + "step": 14284 + }, + { + "epoch": 1.37, + "grad_norm": 0.3134020609582148, + "learning_rate": 4.999086689087856e-05, + "loss": 1.025, + "step": 14285 + }, + { + "epoch": 1.37, + "grad_norm": 0.32636197484194895, + "learning_rate": 4.9977168270153073e-05, + "loss": 0.9892, + "step": 14286 + }, + { + "epoch": 1.37, + "grad_norm": 0.3354014638519744, + "learning_rate": 4.9963470901280395e-05, + "loss": 0.9474, + "step": 14287 + }, + { + "epoch": 1.37, + "grad_norm": 0.2971171557057585, + "learning_rate": 4.994977478460331e-05, + "loss": 0.9588, + "step": 14288 + }, + { + "epoch": 1.37, + "grad_norm": 0.344780309044234, + "learning_rate": 4.99360799204646e-05, + "loss": 1.0124, + "step": 14289 + }, + { + "epoch": 1.37, + "grad_norm": 0.28922634051860857, + "learning_rate": 4.99223863092069e-05, + "loss": 1.109, + "step": 14290 + }, + { + "epoch": 1.37, + "grad_norm": 0.2599554424721504, + "learning_rate": 4.990869395117301e-05, + "loss": 0.9833, + "step": 14291 + }, + { + "epoch": 1.37, + "grad_norm": 0.3195114409556665, + "learning_rate": 4.98950028467055e-05, + "loss": 1.0138, + "step": 14292 + }, + { + "epoch": 1.37, + "grad_norm": 0.3182703773472303, + "learning_rate": 4.9881312996147025e-05, + "loss": 0.9229, + "step": 14293 + }, + { + "epoch": 1.37, + "grad_norm": 0.31062567814321684, + "learning_rate": 4.9867624399840184e-05, + "loss": 1.0759, + "step": 14294 + }, + { + "epoch": 1.37, + "grad_norm": 0.3288841099642912, + "learning_rate": 4.98539370581276e-05, + "loss": 1.0368, + "step": 14295 + }, + { + "epoch": 1.37, + "grad_norm": 0.29716486643649803, + "learning_rate": 4.984025097135172e-05, + "loss": 1.0888, + "step": 14296 + }, + { + "epoch": 1.37, + "grad_norm": 0.30393455067341213, + "learning_rate": 4.9826566139855116e-05, + "loss": 0.8331, + "step": 14297 + }, + { + "epoch": 1.37, + "grad_norm": 0.28185619251051386, + "learning_rate": 4.981288256398019e-05, + "loss": 1.0495, + "step": 14298 + }, + { + "epoch": 1.37, + "grad_norm": 0.30810837904605876, + "learning_rate": 4.9799200244069456e-05, + "loss": 0.9974, + "step": 14299 + }, + { + "epoch": 1.37, + "grad_norm": 0.3428037070646373, + "learning_rate": 4.978551918046521e-05, + "loss": 1.0281, + "step": 14300 + }, + { + "epoch": 1.37, + "grad_norm": 0.2876114025774918, + "learning_rate": 4.977183937350999e-05, + "loss": 1.1125, + "step": 14301 + }, + { + "epoch": 1.37, + "grad_norm": 0.31416144522510536, + "learning_rate": 4.975816082354601e-05, + "loss": 0.9014, + "step": 14302 + }, + { + "epoch": 1.37, + "grad_norm": 0.2629120164181076, + "learning_rate": 4.974448353091566e-05, + "loss": 0.9278, + "step": 14303 + }, + { + "epoch": 1.37, + "grad_norm": 0.26883976736711596, + "learning_rate": 4.9730807495961176e-05, + "loss": 1.0141, + "step": 14304 + }, + { + "epoch": 1.37, + "grad_norm": 0.35099505311384094, + "learning_rate": 4.971713271902481e-05, + "loss": 0.9126, + "step": 14305 + }, + { + "epoch": 1.37, + "grad_norm": 0.3299384559369083, + "learning_rate": 4.970345920044881e-05, + "loss": 1.0763, + "step": 14306 + }, + { + "epoch": 1.37, + "grad_norm": 0.2873304685232035, + "learning_rate": 4.968978694057539e-05, + "loss": 0.9993, + "step": 14307 + }, + { + "epoch": 1.37, + "grad_norm": 0.28135346154291335, + "learning_rate": 4.9676115939746624e-05, + "loss": 1.1175, + "step": 14308 + }, + { + "epoch": 1.37, + "grad_norm": 0.2874157523198055, + "learning_rate": 4.966244619830469e-05, + "loss": 1.0277, + "step": 14309 + }, + { + "epoch": 1.37, + "grad_norm": 0.3072685121167781, + "learning_rate": 4.964877771659172e-05, + "loss": 1.0622, + "step": 14310 + }, + { + "epoch": 1.37, + "grad_norm": 0.318196465158434, + "learning_rate": 4.9635110494949685e-05, + "loss": 1.0808, + "step": 14311 + }, + { + "epoch": 1.37, + "grad_norm": 0.2931927260134576, + "learning_rate": 4.9621444533720704e-05, + "loss": 1.0304, + "step": 14312 + }, + { + "epoch": 1.37, + "grad_norm": 0.2936705647595417, + "learning_rate": 4.960777983324667e-05, + "loss": 1.1138, + "step": 14313 + }, + { + "epoch": 1.37, + "grad_norm": 0.30007108236162505, + "learning_rate": 4.959411639386968e-05, + "loss": 1.1124, + "step": 14314 + }, + { + "epoch": 1.37, + "grad_norm": 0.3216705135923202, + "learning_rate": 4.958045421593157e-05, + "loss": 0.9792, + "step": 14315 + }, + { + "epoch": 1.37, + "grad_norm": 0.30146500296074535, + "learning_rate": 4.956679329977432e-05, + "loss": 1.1219, + "step": 14316 + }, + { + "epoch": 1.37, + "grad_norm": 0.3086153047070812, + "learning_rate": 4.955313364573972e-05, + "loss": 1.1023, + "step": 14317 + }, + { + "epoch": 1.37, + "grad_norm": 0.34283992031195, + "learning_rate": 4.95394752541697e-05, + "loss": 1.0002, + "step": 14318 + }, + { + "epoch": 1.37, + "grad_norm": 0.3019841026931377, + "learning_rate": 4.952581812540599e-05, + "loss": 0.9635, + "step": 14319 + }, + { + "epoch": 1.37, + "grad_norm": 0.289524357005229, + "learning_rate": 4.95121622597904e-05, + "loss": 0.9252, + "step": 14320 + }, + { + "epoch": 1.37, + "grad_norm": 0.36382774730200323, + "learning_rate": 4.949850765766467e-05, + "loss": 1.0399, + "step": 14321 + }, + { + "epoch": 1.37, + "grad_norm": 0.2710439729408476, + "learning_rate": 4.948485431937058e-05, + "loss": 0.965, + "step": 14322 + }, + { + "epoch": 1.37, + "grad_norm": 0.3264153592139775, + "learning_rate": 4.94712022452497e-05, + "loss": 0.973, + "step": 14323 + }, + { + "epoch": 1.37, + "grad_norm": 0.28478101178365606, + "learning_rate": 4.94575514356438e-05, + "loss": 0.9803, + "step": 14324 + }, + { + "epoch": 1.37, + "grad_norm": 0.34505840573851493, + "learning_rate": 4.944390189089439e-05, + "loss": 1.0761, + "step": 14325 + }, + { + "epoch": 1.37, + "grad_norm": 0.29250904894128893, + "learning_rate": 4.9430253611343114e-05, + "loss": 1.0494, + "step": 14326 + }, + { + "epoch": 1.37, + "grad_norm": 0.28344919501205057, + "learning_rate": 4.941660659733152e-05, + "loss": 1.013, + "step": 14327 + }, + { + "epoch": 1.37, + "grad_norm": 0.31113162192057014, + "learning_rate": 4.9402960849201174e-05, + "loss": 1.0424, + "step": 14328 + }, + { + "epoch": 1.37, + "grad_norm": 0.3770464295890631, + "learning_rate": 4.938931636729349e-05, + "loss": 1.141, + "step": 14329 + }, + { + "epoch": 1.37, + "grad_norm": 0.3161690613701955, + "learning_rate": 4.9375673151949974e-05, + "loss": 1.108, + "step": 14330 + }, + { + "epoch": 1.37, + "grad_norm": 0.23510219380434752, + "learning_rate": 4.936203120351209e-05, + "loss": 1.0492, + "step": 14331 + }, + { + "epoch": 1.37, + "grad_norm": 0.32878446783183024, + "learning_rate": 4.934839052232116e-05, + "loss": 1.0163, + "step": 14332 + }, + { + "epoch": 1.37, + "grad_norm": 0.27988604740344963, + "learning_rate": 4.93347511087186e-05, + "loss": 1.0329, + "step": 14333 + }, + { + "epoch": 1.37, + "grad_norm": 0.3214084152991425, + "learning_rate": 4.9321112963045735e-05, + "loss": 1.0453, + "step": 14334 + }, + { + "epoch": 1.37, + "grad_norm": 0.28391090375632716, + "learning_rate": 4.93074760856439e-05, + "loss": 1.0506, + "step": 14335 + }, + { + "epoch": 1.37, + "grad_norm": 0.35477134515064196, + "learning_rate": 4.92938404768543e-05, + "loss": 1.1154, + "step": 14336 + }, + { + "epoch": 1.37, + "grad_norm": 0.27958581389123344, + "learning_rate": 4.9280206137018244e-05, + "loss": 1.0915, + "step": 14337 + }, + { + "epoch": 1.37, + "grad_norm": 0.2963980230956407, + "learning_rate": 4.926657306647687e-05, + "loss": 1.0921, + "step": 14338 + }, + { + "epoch": 1.37, + "grad_norm": 0.302452293136036, + "learning_rate": 4.925294126557143e-05, + "loss": 1.0928, + "step": 14339 + }, + { + "epoch": 1.37, + "grad_norm": 0.2777374225514974, + "learning_rate": 4.923931073464295e-05, + "loss": 1.1365, + "step": 14340 + }, + { + "epoch": 1.37, + "grad_norm": 0.3090676035015049, + "learning_rate": 4.9225681474032705e-05, + "loss": 1.0716, + "step": 14341 + }, + { + "epoch": 1.37, + "grad_norm": 0.33484280184705423, + "learning_rate": 4.921205348408164e-05, + "loss": 1.0425, + "step": 14342 + }, + { + "epoch": 1.37, + "grad_norm": 0.3113267275555978, + "learning_rate": 4.91984267651309e-05, + "loss": 0.8705, + "step": 14343 + }, + { + "epoch": 1.37, + "grad_norm": 0.3368940525028179, + "learning_rate": 4.918480131752142e-05, + "loss": 1.0353, + "step": 14344 + }, + { + "epoch": 1.37, + "grad_norm": 0.3274568833565112, + "learning_rate": 4.917117714159423e-05, + "loss": 0.9456, + "step": 14345 + }, + { + "epoch": 1.37, + "grad_norm": 0.3237204853791619, + "learning_rate": 4.915755423769029e-05, + "loss": 0.9599, + "step": 14346 + }, + { + "epoch": 1.37, + "grad_norm": 0.33455612344322, + "learning_rate": 4.91439326061505e-05, + "loss": 0.9935, + "step": 14347 + }, + { + "epoch": 1.37, + "grad_norm": 0.31027040326210403, + "learning_rate": 4.9130312247315744e-05, + "loss": 0.9447, + "step": 14348 + }, + { + "epoch": 1.37, + "grad_norm": 0.3118189459546958, + "learning_rate": 4.91166931615269e-05, + "loss": 1.0977, + "step": 14349 + }, + { + "epoch": 1.37, + "grad_norm": 0.314370157510389, + "learning_rate": 4.9103075349124815e-05, + "loss": 1.0515, + "step": 14350 + }, + { + "epoch": 1.37, + "grad_norm": 0.3035596733218033, + "learning_rate": 4.908945881045023e-05, + "loss": 1.0223, + "step": 14351 + }, + { + "epoch": 1.37, + "grad_norm": 0.27049758548958164, + "learning_rate": 4.907584354584397e-05, + "loss": 1.0216, + "step": 14352 + }, + { + "epoch": 1.37, + "grad_norm": 0.29915308200265756, + "learning_rate": 4.906222955564669e-05, + "loss": 0.9886, + "step": 14353 + }, + { + "epoch": 1.37, + "grad_norm": 0.32491188283620004, + "learning_rate": 4.9048616840199126e-05, + "loss": 0.9914, + "step": 14354 + }, + { + "epoch": 1.37, + "grad_norm": 0.3431616805747881, + "learning_rate": 4.903500539984196e-05, + "loss": 0.9725, + "step": 14355 + }, + { + "epoch": 1.37, + "grad_norm": 0.3438529974822854, + "learning_rate": 4.902139523491584e-05, + "loss": 1.0924, + "step": 14356 + }, + { + "epoch": 1.37, + "grad_norm": 0.3028123556633976, + "learning_rate": 4.9007786345761306e-05, + "loss": 1.1167, + "step": 14357 + }, + { + "epoch": 1.37, + "grad_norm": 0.268675093776302, + "learning_rate": 4.8994178732719006e-05, + "loss": 1.1207, + "step": 14358 + }, + { + "epoch": 1.37, + "grad_norm": 0.27898814017017803, + "learning_rate": 4.89805723961294e-05, + "loss": 1.0222, + "step": 14359 + }, + { + "epoch": 1.37, + "grad_norm": 0.2894580893433054, + "learning_rate": 4.8966967336333084e-05, + "loss": 1.0976, + "step": 14360 + }, + { + "epoch": 1.37, + "grad_norm": 0.2995637020991784, + "learning_rate": 4.8953363553670394e-05, + "loss": 0.9738, + "step": 14361 + }, + { + "epoch": 1.37, + "grad_norm": 0.3037780349610045, + "learning_rate": 4.893976104848194e-05, + "loss": 0.9383, + "step": 14362 + }, + { + "epoch": 1.37, + "grad_norm": 0.3370343005605129, + "learning_rate": 4.892615982110802e-05, + "loss": 1.1047, + "step": 14363 + }, + { + "epoch": 1.37, + "grad_norm": 0.2747056126790451, + "learning_rate": 4.8912559871889086e-05, + "loss": 1.0323, + "step": 14364 + }, + { + "epoch": 1.37, + "grad_norm": 0.2977717591271861, + "learning_rate": 4.8898961201165406e-05, + "loss": 0.9124, + "step": 14365 + }, + { + "epoch": 1.37, + "grad_norm": 0.25455055971986157, + "learning_rate": 4.888536380927735e-05, + "loss": 0.9343, + "step": 14366 + }, + { + "epoch": 1.37, + "grad_norm": 0.296825241430041, + "learning_rate": 4.8871767696565186e-05, + "loss": 0.9485, + "step": 14367 + }, + { + "epoch": 1.37, + "grad_norm": 0.29119359462374605, + "learning_rate": 4.88581728633692e-05, + "loss": 1.0024, + "step": 14368 + }, + { + "epoch": 1.37, + "grad_norm": 0.3073758100680126, + "learning_rate": 4.8844579310029535e-05, + "loss": 0.9095, + "step": 14369 + }, + { + "epoch": 1.37, + "grad_norm": 0.30193868405134305, + "learning_rate": 4.883098703688643e-05, + "loss": 1.0307, + "step": 14370 + }, + { + "epoch": 1.37, + "grad_norm": 0.3205892122541124, + "learning_rate": 4.8817396044280084e-05, + "loss": 1.0594, + "step": 14371 + }, + { + "epoch": 1.37, + "grad_norm": 0.3176260093117226, + "learning_rate": 4.8803806332550514e-05, + "loss": 1.116, + "step": 14372 + }, + { + "epoch": 1.38, + "grad_norm": 0.3000911886323585, + "learning_rate": 4.879021790203792e-05, + "loss": 1.1437, + "step": 14373 + }, + { + "epoch": 1.38, + "grad_norm": 0.26544564950264204, + "learning_rate": 4.8776630753082206e-05, + "loss": 1.0442, + "step": 14374 + }, + { + "epoch": 1.38, + "grad_norm": 0.306524590681851, + "learning_rate": 4.87630448860236e-05, + "loss": 0.8704, + "step": 14375 + }, + { + "epoch": 1.38, + "grad_norm": 0.32734116654465895, + "learning_rate": 4.874946030120196e-05, + "loss": 1.0404, + "step": 14376 + }, + { + "epoch": 1.38, + "grad_norm": 0.32074689021662417, + "learning_rate": 4.873587699895731e-05, + "loss": 1.0524, + "step": 14377 + }, + { + "epoch": 1.38, + "grad_norm": 0.3324224354432822, + "learning_rate": 4.8722294979629514e-05, + "loss": 0.9716, + "step": 14378 + }, + { + "epoch": 1.38, + "grad_norm": 0.2826701567185387, + "learning_rate": 4.870871424355856e-05, + "loss": 0.9796, + "step": 14379 + }, + { + "epoch": 1.38, + "grad_norm": 0.34988046016223845, + "learning_rate": 4.8695134791084226e-05, + "loss": 1.0022, + "step": 14380 + }, + { + "epoch": 1.38, + "grad_norm": 0.3304190784277097, + "learning_rate": 4.86815566225464e-05, + "loss": 1.07, + "step": 14381 + }, + { + "epoch": 1.38, + "grad_norm": 0.25555242301919423, + "learning_rate": 4.866797973828485e-05, + "loss": 1.05, + "step": 14382 + }, + { + "epoch": 1.38, + "grad_norm": 0.31141349908436566, + "learning_rate": 4.865440413863942e-05, + "loss": 0.9404, + "step": 14383 + }, + { + "epoch": 1.38, + "grad_norm": 0.3271686726131398, + "learning_rate": 4.8640829823949764e-05, + "loss": 1.118, + "step": 14384 + }, + { + "epoch": 1.38, + "grad_norm": 0.2764394871899079, + "learning_rate": 4.8627256794555645e-05, + "loss": 0.9971, + "step": 14385 + }, + { + "epoch": 1.38, + "grad_norm": 0.2830430709081524, + "learning_rate": 4.861368505079668e-05, + "loss": 1.0695, + "step": 14386 + }, + { + "epoch": 1.38, + "grad_norm": 0.3179831782875412, + "learning_rate": 4.860011459301254e-05, + "loss": 1.008, + "step": 14387 + }, + { + "epoch": 1.38, + "grad_norm": 0.3086317451361631, + "learning_rate": 4.858654542154284e-05, + "loss": 0.9545, + "step": 14388 + }, + { + "epoch": 1.38, + "grad_norm": 0.30045667507904456, + "learning_rate": 4.8572977536727185e-05, + "loss": 0.9668, + "step": 14389 + }, + { + "epoch": 1.38, + "grad_norm": 0.3391360001649396, + "learning_rate": 4.855941093890505e-05, + "loss": 1.0324, + "step": 14390 + }, + { + "epoch": 1.38, + "grad_norm": 0.3624250606446975, + "learning_rate": 4.854584562841599e-05, + "loss": 1.0263, + "step": 14391 + }, + { + "epoch": 1.38, + "grad_norm": 0.31955381382554093, + "learning_rate": 4.853228160559952e-05, + "loss": 1.0658, + "step": 14392 + }, + { + "epoch": 1.38, + "grad_norm": 0.32412400367653466, + "learning_rate": 4.8518718870795e-05, + "loss": 1.0195, + "step": 14393 + }, + { + "epoch": 1.38, + "grad_norm": 0.3310450212455749, + "learning_rate": 4.8505157424341904e-05, + "loss": 1.0384, + "step": 14394 + }, + { + "epoch": 1.38, + "grad_norm": 0.32868137884981874, + "learning_rate": 4.8491597266579605e-05, + "loss": 1.0998, + "step": 14395 + }, + { + "epoch": 1.38, + "grad_norm": 0.2986130178823071, + "learning_rate": 4.847803839784749e-05, + "loss": 1.0327, + "step": 14396 + }, + { + "epoch": 1.38, + "grad_norm": 0.2919306785985397, + "learning_rate": 4.8464480818484815e-05, + "loss": 0.9224, + "step": 14397 + }, + { + "epoch": 1.38, + "grad_norm": 0.31505795943267945, + "learning_rate": 4.845092452883093e-05, + "loss": 0.8344, + "step": 14398 + }, + { + "epoch": 1.38, + "grad_norm": 0.28684085282809035, + "learning_rate": 4.843736952922502e-05, + "loss": 0.9871, + "step": 14399 + }, + { + "epoch": 1.38, + "grad_norm": 0.2897869056131507, + "learning_rate": 4.842381582000638e-05, + "loss": 1.0058, + "step": 14400 + }, + { + "epoch": 1.38, + "grad_norm": 0.35566638322148836, + "learning_rate": 4.841026340151408e-05, + "loss": 1.1288, + "step": 14401 + }, + { + "epoch": 1.38, + "grad_norm": 0.27265274443678506, + "learning_rate": 4.839671227408745e-05, + "loss": 0.9711, + "step": 14402 + }, + { + "epoch": 1.38, + "grad_norm": 0.2975769461931277, + "learning_rate": 4.8383162438065475e-05, + "loss": 1.0308, + "step": 14403 + }, + { + "epoch": 1.38, + "grad_norm": 0.27564872352593617, + "learning_rate": 4.8369613893787344e-05, + "loss": 1.0336, + "step": 14404 + }, + { + "epoch": 1.38, + "grad_norm": 0.30580100622447964, + "learning_rate": 4.835606664159205e-05, + "loss": 1.0116, + "step": 14405 + }, + { + "epoch": 1.38, + "grad_norm": 0.27912348829470673, + "learning_rate": 4.834252068181867e-05, + "loss": 0.9693, + "step": 14406 + }, + { + "epoch": 1.38, + "grad_norm": 0.3082188096153147, + "learning_rate": 4.8328976014806135e-05, + "loss": 1.0614, + "step": 14407 + }, + { + "epoch": 1.38, + "grad_norm": 0.31015205639066984, + "learning_rate": 4.8315432640893455e-05, + "loss": 1.1628, + "step": 14408 + }, + { + "epoch": 1.38, + "grad_norm": 0.3290205061142567, + "learning_rate": 4.830189056041955e-05, + "loss": 0.9113, + "step": 14409 + }, + { + "epoch": 1.38, + "grad_norm": 0.292446035010594, + "learning_rate": 4.828834977372336e-05, + "loss": 1.0696, + "step": 14410 + }, + { + "epoch": 1.38, + "grad_norm": 0.33660615753680984, + "learning_rate": 4.827481028114369e-05, + "loss": 0.9225, + "step": 14411 + }, + { + "epoch": 1.38, + "grad_norm": 0.3321709746551842, + "learning_rate": 4.8261272083019395e-05, + "loss": 0.989, + "step": 14412 + }, + { + "epoch": 1.38, + "grad_norm": 0.3661872566842863, + "learning_rate": 4.8247735179689323e-05, + "loss": 1.0998, + "step": 14413 + }, + { + "epoch": 1.38, + "grad_norm": 0.27464108760403055, + "learning_rate": 4.8234199571492164e-05, + "loss": 1.1881, + "step": 14414 + }, + { + "epoch": 1.38, + "grad_norm": 0.35732635733409784, + "learning_rate": 4.82206652587667e-05, + "loss": 1.152, + "step": 14415 + }, + { + "epoch": 1.38, + "grad_norm": 0.31832185050710327, + "learning_rate": 4.820713224185162e-05, + "loss": 1.0355, + "step": 14416 + }, + { + "epoch": 1.38, + "grad_norm": 0.307608881417958, + "learning_rate": 4.819360052108565e-05, + "loss": 0.9928, + "step": 14417 + }, + { + "epoch": 1.38, + "grad_norm": 0.3097970850750387, + "learning_rate": 4.818007009680734e-05, + "loss": 1.0518, + "step": 14418 + }, + { + "epoch": 1.38, + "grad_norm": 0.3375389591761821, + "learning_rate": 4.816654096935539e-05, + "loss": 1.0571, + "step": 14419 + }, + { + "epoch": 1.38, + "grad_norm": 0.34579838648736483, + "learning_rate": 4.8153013139068295e-05, + "loss": 1.1063, + "step": 14420 + }, + { + "epoch": 1.38, + "grad_norm": 0.3376099575558734, + "learning_rate": 4.813948660628466e-05, + "loss": 1.0561, + "step": 14421 + }, + { + "epoch": 1.38, + "grad_norm": 0.3100453283633308, + "learning_rate": 4.812596137134289e-05, + "loss": 1.0214, + "step": 14422 + }, + { + "epoch": 1.38, + "grad_norm": 0.26743512170707145, + "learning_rate": 4.8112437434581616e-05, + "loss": 1.0353, + "step": 14423 + }, + { + "epoch": 1.38, + "grad_norm": 0.2939696124053134, + "learning_rate": 4.809891479633917e-05, + "loss": 1.1279, + "step": 14424 + }, + { + "epoch": 1.38, + "grad_norm": 0.2964672282060028, + "learning_rate": 4.8085393456954034e-05, + "loss": 0.9138, + "step": 14425 + }, + { + "epoch": 1.38, + "grad_norm": 0.29695987265636975, + "learning_rate": 4.807187341676451e-05, + "loss": 1.0574, + "step": 14426 + }, + { + "epoch": 1.38, + "grad_norm": 0.3200646473227317, + "learning_rate": 4.805835467610903e-05, + "loss": 1.0463, + "step": 14427 + }, + { + "epoch": 1.38, + "grad_norm": 0.29906777109265836, + "learning_rate": 4.80448372353258e-05, + "loss": 1.0359, + "step": 14428 + }, + { + "epoch": 1.38, + "grad_norm": 0.2863316437838671, + "learning_rate": 4.8031321094753245e-05, + "loss": 1.0485, + "step": 14429 + }, + { + "epoch": 1.38, + "grad_norm": 0.2874715094330051, + "learning_rate": 4.80178062547295e-05, + "loss": 1.0651, + "step": 14430 + }, + { + "epoch": 1.38, + "grad_norm": 0.33848318766113716, + "learning_rate": 4.800429271559282e-05, + "loss": 1.0225, + "step": 14431 + }, + { + "epoch": 1.38, + "grad_norm": 0.2853776567981412, + "learning_rate": 4.799078047768144e-05, + "loss": 0.9495, + "step": 14432 + }, + { + "epoch": 1.38, + "grad_norm": 0.2967995332945404, + "learning_rate": 4.7977269541333424e-05, + "loss": 1.0274, + "step": 14433 + }, + { + "epoch": 1.38, + "grad_norm": 0.2885409605285511, + "learning_rate": 4.796375990688696e-05, + "loss": 1.0631, + "step": 14434 + }, + { + "epoch": 1.38, + "grad_norm": 0.32360185818709697, + "learning_rate": 4.795025157468005e-05, + "loss": 1.0434, + "step": 14435 + }, + { + "epoch": 1.38, + "grad_norm": 0.3369450133813596, + "learning_rate": 4.7936744545050873e-05, + "loss": 0.9776, + "step": 14436 + }, + { + "epoch": 1.38, + "grad_norm": 0.3201783021579938, + "learning_rate": 4.792323881833735e-05, + "loss": 1.1172, + "step": 14437 + }, + { + "epoch": 1.38, + "grad_norm": 0.2658211787406327, + "learning_rate": 4.790973439487754e-05, + "loss": 1.0286, + "step": 14438 + }, + { + "epoch": 1.38, + "grad_norm": 0.3045647365769467, + "learning_rate": 4.789623127500932e-05, + "loss": 1.1183, + "step": 14439 + }, + { + "epoch": 1.38, + "grad_norm": 0.34979281792642597, + "learning_rate": 4.7882729459070696e-05, + "loss": 1.0175, + "step": 14440 + }, + { + "epoch": 1.38, + "grad_norm": 0.2820347216938124, + "learning_rate": 4.786922894739949e-05, + "loss": 1.0496, + "step": 14441 + }, + { + "epoch": 1.38, + "grad_norm": 0.3200980361502817, + "learning_rate": 4.7855729740333585e-05, + "loss": 1.095, + "step": 14442 + }, + { + "epoch": 1.38, + "grad_norm": 0.3356014576645321, + "learning_rate": 4.784223183821083e-05, + "loss": 1.0978, + "step": 14443 + }, + { + "epoch": 1.38, + "grad_norm": 0.32779938157542, + "learning_rate": 4.782873524136904e-05, + "loss": 1.0303, + "step": 14444 + }, + { + "epoch": 1.38, + "grad_norm": 0.30277653490758405, + "learning_rate": 4.781523995014588e-05, + "loss": 1.0721, + "step": 14445 + }, + { + "epoch": 1.38, + "grad_norm": 0.3197362608577243, + "learning_rate": 4.7801745964879194e-05, + "loss": 0.9771, + "step": 14446 + }, + { + "epoch": 1.38, + "grad_norm": 0.3195798163634146, + "learning_rate": 4.778825328590658e-05, + "loss": 1.0462, + "step": 14447 + }, + { + "epoch": 1.38, + "grad_norm": 0.3169183646459535, + "learning_rate": 4.7774761913565734e-05, + "loss": 0.9266, + "step": 14448 + }, + { + "epoch": 1.38, + "grad_norm": 0.2732806580448995, + "learning_rate": 4.77612718481943e-05, + "loss": 0.8659, + "step": 14449 + }, + { + "epoch": 1.38, + "grad_norm": 0.3104417280578355, + "learning_rate": 4.774778309012991e-05, + "loss": 0.9267, + "step": 14450 + }, + { + "epoch": 1.38, + "grad_norm": 0.2955682006785168, + "learning_rate": 4.7734295639710033e-05, + "loss": 0.9678, + "step": 14451 + }, + { + "epoch": 1.38, + "grad_norm": 0.3241841113226169, + "learning_rate": 4.772080949727228e-05, + "loss": 0.9178, + "step": 14452 + }, + { + "epoch": 1.38, + "grad_norm": 0.3145310236471491, + "learning_rate": 4.770732466315415e-05, + "loss": 0.9927, + "step": 14453 + }, + { + "epoch": 1.38, + "grad_norm": 0.31546819599300635, + "learning_rate": 4.769384113769304e-05, + "loss": 0.9228, + "step": 14454 + }, + { + "epoch": 1.38, + "grad_norm": 0.28123038800796984, + "learning_rate": 4.7680358921226444e-05, + "loss": 1.025, + "step": 14455 + }, + { + "epoch": 1.38, + "grad_norm": 0.31868376590856967, + "learning_rate": 4.766687801409173e-05, + "loss": 1.0467, + "step": 14456 + }, + { + "epoch": 1.38, + "grad_norm": 0.3062934906372535, + "learning_rate": 4.7653398416626336e-05, + "loss": 1.1389, + "step": 14457 + }, + { + "epoch": 1.38, + "grad_norm": 0.3177409820326631, + "learning_rate": 4.76399201291675e-05, + "loss": 1.0615, + "step": 14458 + }, + { + "epoch": 1.38, + "grad_norm": 0.36397044811607604, + "learning_rate": 4.762644315205261e-05, + "loss": 0.9403, + "step": 14459 + }, + { + "epoch": 1.38, + "grad_norm": 0.3430715263048402, + "learning_rate": 4.7612967485618865e-05, + "loss": 1.0287, + "step": 14460 + }, + { + "epoch": 1.38, + "grad_norm": 0.27921220157938487, + "learning_rate": 4.7599493130203565e-05, + "loss": 0.9567, + "step": 14461 + }, + { + "epoch": 1.38, + "grad_norm": 0.3245564156796946, + "learning_rate": 4.75860200861438e-05, + "loss": 0.9801, + "step": 14462 + }, + { + "epoch": 1.38, + "grad_norm": 0.26101414456525535, + "learning_rate": 4.75725483537769e-05, + "loss": 0.9329, + "step": 14463 + }, + { + "epoch": 1.38, + "grad_norm": 0.32098501399231855, + "learning_rate": 4.75590779334399e-05, + "loss": 1.0472, + "step": 14464 + }, + { + "epoch": 1.38, + "grad_norm": 0.27241196649762467, + "learning_rate": 4.7545608825469965e-05, + "loss": 0.9777, + "step": 14465 + }, + { + "epoch": 1.38, + "grad_norm": 0.27854110912896757, + "learning_rate": 4.7532141030204094e-05, + "loss": 1.0316, + "step": 14466 + }, + { + "epoch": 1.38, + "grad_norm": 0.3130285151450529, + "learning_rate": 4.751867454797941e-05, + "loss": 1.1194, + "step": 14467 + }, + { + "epoch": 1.38, + "grad_norm": 0.3541262046588002, + "learning_rate": 4.750520937913284e-05, + "loss": 1.0096, + "step": 14468 + }, + { + "epoch": 1.38, + "grad_norm": 0.2901325261445251, + "learning_rate": 4.7491745524001396e-05, + "loss": 0.9576, + "step": 14469 + }, + { + "epoch": 1.38, + "grad_norm": 0.33473887282728204, + "learning_rate": 4.7478282982922016e-05, + "loss": 1.067, + "step": 14470 + }, + { + "epoch": 1.38, + "grad_norm": 0.32525820708912423, + "learning_rate": 4.7464821756231656e-05, + "loss": 1.0339, + "step": 14471 + }, + { + "epoch": 1.38, + "grad_norm": 0.3743311660150218, + "learning_rate": 4.745136184426712e-05, + "loss": 0.945, + "step": 14472 + }, + { + "epoch": 1.38, + "grad_norm": 0.323533813354742, + "learning_rate": 4.743790324736527e-05, + "loss": 1.0739, + "step": 14473 + }, + { + "epoch": 1.38, + "grad_norm": 0.3129326335513027, + "learning_rate": 4.7424445965862965e-05, + "loss": 1.0919, + "step": 14474 + }, + { + "epoch": 1.38, + "grad_norm": 0.3218187925510785, + "learning_rate": 4.74109900000969e-05, + "loss": 1.0196, + "step": 14475 + }, + { + "epoch": 1.38, + "grad_norm": 0.2812211839909355, + "learning_rate": 4.7397535350403866e-05, + "loss": 0.9966, + "step": 14476 + }, + { + "epoch": 1.39, + "grad_norm": 0.2907701069447105, + "learning_rate": 4.7384082017120565e-05, + "loss": 1.0492, + "step": 14477 + }, + { + "epoch": 1.39, + "grad_norm": 0.32654187238671767, + "learning_rate": 4.737063000058371e-05, + "loss": 1.0354, + "step": 14478 + }, + { + "epoch": 1.39, + "grad_norm": 0.3216752726771083, + "learning_rate": 4.735717930112989e-05, + "loss": 1.0125, + "step": 14479 + }, + { + "epoch": 1.39, + "grad_norm": 0.38458498293791643, + "learning_rate": 4.734372991909578e-05, + "loss": 1.0148, + "step": 14480 + }, + { + "epoch": 1.39, + "grad_norm": 0.32091460379709535, + "learning_rate": 4.733028185481788e-05, + "loss": 0.9886, + "step": 14481 + }, + { + "epoch": 1.39, + "grad_norm": 0.27383193374335574, + "learning_rate": 4.731683510863282e-05, + "loss": 0.9988, + "step": 14482 + }, + { + "epoch": 1.39, + "grad_norm": 0.24989635859742168, + "learning_rate": 4.7303389680876994e-05, + "loss": 0.967, + "step": 14483 + }, + { + "epoch": 1.39, + "grad_norm": 0.30736567102489126, + "learning_rate": 4.728994557188704e-05, + "loss": 1.0467, + "step": 14484 + }, + { + "epoch": 1.39, + "grad_norm": 0.2945634026036099, + "learning_rate": 4.7276502781999275e-05, + "loss": 1.028, + "step": 14485 + }, + { + "epoch": 1.39, + "grad_norm": 0.3394716295800157, + "learning_rate": 4.7263061311550215e-05, + "loss": 1.0655, + "step": 14486 + }, + { + "epoch": 1.39, + "grad_norm": 0.2781397642250206, + "learning_rate": 4.724962116087615e-05, + "loss": 1.0307, + "step": 14487 + }, + { + "epoch": 1.39, + "grad_norm": 0.3544732807704542, + "learning_rate": 4.723618233031349e-05, + "loss": 0.9617, + "step": 14488 + }, + { + "epoch": 1.39, + "grad_norm": 0.3060275787529381, + "learning_rate": 4.7222744820198474e-05, + "loss": 1.0333, + "step": 14489 + }, + { + "epoch": 1.39, + "grad_norm": 0.27583549914846855, + "learning_rate": 4.72093086308675e-05, + "loss": 1.1355, + "step": 14490 + }, + { + "epoch": 1.39, + "grad_norm": 0.2755774147944322, + "learning_rate": 4.7195873762656715e-05, + "loss": 0.9521, + "step": 14491 + }, + { + "epoch": 1.39, + "grad_norm": 0.3114626397404572, + "learning_rate": 4.7182440215902424e-05, + "loss": 0.8806, + "step": 14492 + }, + { + "epoch": 1.39, + "grad_norm": 0.30211377048075383, + "learning_rate": 4.7169007990940715e-05, + "loss": 1.015, + "step": 14493 + }, + { + "epoch": 1.39, + "grad_norm": 0.28961599161238605, + "learning_rate": 4.7155577088107795e-05, + "loss": 1.05, + "step": 14494 + }, + { + "epoch": 1.39, + "grad_norm": 0.31110158719105024, + "learning_rate": 4.71421475077398e-05, + "loss": 1.0809, + "step": 14495 + }, + { + "epoch": 1.39, + "grad_norm": 0.29844288704939287, + "learning_rate": 4.712871925017274e-05, + "loss": 1.0499, + "step": 14496 + }, + { + "epoch": 1.39, + "grad_norm": 0.2963299630924041, + "learning_rate": 4.711529231574272e-05, + "loss": 0.9547, + "step": 14497 + }, + { + "epoch": 1.39, + "grad_norm": 0.3248837655246823, + "learning_rate": 4.7101866704785735e-05, + "loss": 1.0681, + "step": 14498 + }, + { + "epoch": 1.39, + "grad_norm": 0.2780033066539115, + "learning_rate": 4.7088442417637824e-05, + "loss": 0.961, + "step": 14499 + }, + { + "epoch": 1.39, + "grad_norm": 0.2663609739349043, + "learning_rate": 4.707501945463486e-05, + "loss": 1.0575, + "step": 14500 + }, + { + "epoch": 1.39, + "grad_norm": 0.29287471421796035, + "learning_rate": 4.706159781611283e-05, + "loss": 1.1105, + "step": 14501 + }, + { + "epoch": 1.39, + "grad_norm": 0.3729330667075555, + "learning_rate": 4.7048177502407555e-05, + "loss": 0.8903, + "step": 14502 + }, + { + "epoch": 1.39, + "grad_norm": 0.28291399821199564, + "learning_rate": 4.703475851385491e-05, + "loss": 1.0124, + "step": 14503 + }, + { + "epoch": 1.39, + "grad_norm": 0.3314178234527135, + "learning_rate": 4.702134085079073e-05, + "loss": 1.1006, + "step": 14504 + }, + { + "epoch": 1.39, + "grad_norm": 0.3314705581268673, + "learning_rate": 4.700792451355082e-05, + "loss": 0.9283, + "step": 14505 + }, + { + "epoch": 1.39, + "grad_norm": 0.3329449190195897, + "learning_rate": 4.6994509502470864e-05, + "loss": 0.9897, + "step": 14506 + }, + { + "epoch": 1.39, + "grad_norm": 0.3325196625819837, + "learning_rate": 4.6981095817886655e-05, + "loss": 0.9477, + "step": 14507 + }, + { + "epoch": 1.39, + "grad_norm": 0.2809637785971729, + "learning_rate": 4.696768346013382e-05, + "loss": 0.9285, + "step": 14508 + }, + { + "epoch": 1.39, + "grad_norm": 0.289544087682919, + "learning_rate": 4.695427242954806e-05, + "loss": 0.9545, + "step": 14509 + }, + { + "epoch": 1.39, + "grad_norm": 0.2691401795317508, + "learning_rate": 4.694086272646491e-05, + "loss": 1.0702, + "step": 14510 + }, + { + "epoch": 1.39, + "grad_norm": 0.33559211728582616, + "learning_rate": 4.692745435122008e-05, + "loss": 0.9896, + "step": 14511 + }, + { + "epoch": 1.39, + "grad_norm": 0.29942911764981295, + "learning_rate": 4.691404730414902e-05, + "loss": 1.0462, + "step": 14512 + }, + { + "epoch": 1.39, + "grad_norm": 0.27230548457547965, + "learning_rate": 4.690064158558733e-05, + "loss": 0.9745, + "step": 14513 + }, + { + "epoch": 1.39, + "grad_norm": 0.29274035367214074, + "learning_rate": 4.688723719587043e-05, + "loss": 1.0558, + "step": 14514 + }, + { + "epoch": 1.39, + "grad_norm": 0.27684066262467494, + "learning_rate": 4.6873834135333786e-05, + "loss": 0.9306, + "step": 14515 + }, + { + "epoch": 1.39, + "grad_norm": 0.3229578577017911, + "learning_rate": 4.686043240431284e-05, + "loss": 0.9825, + "step": 14516 + }, + { + "epoch": 1.39, + "grad_norm": 0.2831181586165476, + "learning_rate": 4.6847032003143e-05, + "loss": 0.851, + "step": 14517 + }, + { + "epoch": 1.39, + "grad_norm": 0.33047973582500834, + "learning_rate": 4.6833632932159554e-05, + "loss": 0.961, + "step": 14518 + }, + { + "epoch": 1.39, + "grad_norm": 0.29526512286402473, + "learning_rate": 4.682023519169786e-05, + "loss": 1.0299, + "step": 14519 + }, + { + "epoch": 1.39, + "grad_norm": 0.30174293181315615, + "learning_rate": 4.680683878209324e-05, + "loss": 1.0425, + "step": 14520 + }, + { + "epoch": 1.39, + "grad_norm": 0.2852224961897108, + "learning_rate": 4.679344370368087e-05, + "loss": 1.0279, + "step": 14521 + }, + { + "epoch": 1.39, + "grad_norm": 0.3099969127554914, + "learning_rate": 4.678004995679605e-05, + "loss": 1.0559, + "step": 14522 + }, + { + "epoch": 1.39, + "grad_norm": 0.32342806603302, + "learning_rate": 4.676665754177385e-05, + "loss": 1.0145, + "step": 14523 + }, + { + "epoch": 1.39, + "grad_norm": 0.3295888615037902, + "learning_rate": 4.675326645894958e-05, + "loss": 1.0312, + "step": 14524 + }, + { + "epoch": 1.39, + "grad_norm": 0.2900097621947336, + "learning_rate": 4.6739876708658234e-05, + "loss": 1.0071, + "step": 14525 + }, + { + "epoch": 1.39, + "grad_norm": 0.3121053017201456, + "learning_rate": 4.6726488291234994e-05, + "loss": 0.9355, + "step": 14526 + }, + { + "epoch": 1.39, + "grad_norm": 0.3654169631231735, + "learning_rate": 4.671310120701483e-05, + "loss": 1.0599, + "step": 14527 + }, + { + "epoch": 1.39, + "grad_norm": 0.310313005212213, + "learning_rate": 4.6699715456332824e-05, + "loss": 1.0408, + "step": 14528 + }, + { + "epoch": 1.39, + "grad_norm": 0.3184057320698385, + "learning_rate": 4.668633103952391e-05, + "loss": 1.0488, + "step": 14529 + }, + { + "epoch": 1.39, + "grad_norm": 0.31258838720155485, + "learning_rate": 4.6672947956923065e-05, + "loss": 0.9321, + "step": 14530 + }, + { + "epoch": 1.39, + "grad_norm": 0.3225596731702545, + "learning_rate": 4.6659566208865216e-05, + "loss": 0.9205, + "step": 14531 + }, + { + "epoch": 1.39, + "grad_norm": 0.2687077388304932, + "learning_rate": 4.6646185795685274e-05, + "loss": 1.0177, + "step": 14532 + }, + { + "epoch": 1.39, + "grad_norm": 0.29372148650348534, + "learning_rate": 4.663280671771803e-05, + "loss": 1.1015, + "step": 14533 + }, + { + "epoch": 1.39, + "grad_norm": 0.31796885995605395, + "learning_rate": 4.661942897529834e-05, + "loss": 0.972, + "step": 14534 + }, + { + "epoch": 1.39, + "grad_norm": 0.25564579923878467, + "learning_rate": 4.6606052568761024e-05, + "loss": 1.0007, + "step": 14535 + }, + { + "epoch": 1.39, + "grad_norm": 0.3059572230787575, + "learning_rate": 4.659267749844076e-05, + "loss": 0.9895, + "step": 14536 + }, + { + "epoch": 1.39, + "grad_norm": 0.3023299431981442, + "learning_rate": 4.657930376467231e-05, + "loss": 1.0752, + "step": 14537 + }, + { + "epoch": 1.39, + "grad_norm": 0.28156768323581954, + "learning_rate": 4.656593136779036e-05, + "loss": 0.9514, + "step": 14538 + }, + { + "epoch": 1.39, + "grad_norm": 0.28663842527157823, + "learning_rate": 4.655256030812959e-05, + "loss": 1.1362, + "step": 14539 + }, + { + "epoch": 1.39, + "grad_norm": 0.3164069283356414, + "learning_rate": 4.653919058602455e-05, + "loss": 1.0103, + "step": 14540 + }, + { + "epoch": 1.39, + "grad_norm": 0.3259885989516009, + "learning_rate": 4.652582220180991e-05, + "loss": 1.0072, + "step": 14541 + }, + { + "epoch": 1.39, + "grad_norm": 0.3066031640825305, + "learning_rate": 4.6512455155820136e-05, + "loss": 0.99, + "step": 14542 + }, + { + "epoch": 1.39, + "grad_norm": 0.2641908178563616, + "learning_rate": 4.649908944838982e-05, + "loss": 0.9837, + "step": 14543 + }, + { + "epoch": 1.39, + "grad_norm": 0.27606574152316776, + "learning_rate": 4.648572507985335e-05, + "loss": 1.0206, + "step": 14544 + }, + { + "epoch": 1.39, + "grad_norm": 0.35069542433948503, + "learning_rate": 4.647236205054532e-05, + "loss": 1.0838, + "step": 14545 + }, + { + "epoch": 1.39, + "grad_norm": 0.32242503381269405, + "learning_rate": 4.645900036080002e-05, + "loss": 1.0834, + "step": 14546 + }, + { + "epoch": 1.39, + "grad_norm": 0.3301660669302035, + "learning_rate": 4.6445640010951917e-05, + "loss": 1.0042, + "step": 14547 + }, + { + "epoch": 1.39, + "grad_norm": 0.2801240727014673, + "learning_rate": 4.643228100133531e-05, + "loss": 1.1063, + "step": 14548 + }, + { + "epoch": 1.39, + "grad_norm": 0.33786084199468713, + "learning_rate": 4.641892333228457e-05, + "loss": 1.06, + "step": 14549 + }, + { + "epoch": 1.39, + "grad_norm": 0.3582621005735129, + "learning_rate": 4.6405567004133864e-05, + "loss": 0.8787, + "step": 14550 + }, + { + "epoch": 1.39, + "grad_norm": 0.3097559797306662, + "learning_rate": 4.639221201721762e-05, + "loss": 0.8257, + "step": 14551 + }, + { + "epoch": 1.39, + "grad_norm": 0.3231517008521326, + "learning_rate": 4.63788583718699e-05, + "loss": 0.9679, + "step": 14552 + }, + { + "epoch": 1.39, + "grad_norm": 0.31927163300397904, + "learning_rate": 4.6365506068425e-05, + "loss": 1.0253, + "step": 14553 + }, + { + "epoch": 1.39, + "grad_norm": 0.2899991160646858, + "learning_rate": 4.635215510721699e-05, + "loss": 0.9223, + "step": 14554 + }, + { + "epoch": 1.39, + "grad_norm": 0.26916829426665295, + "learning_rate": 4.633880548857999e-05, + "loss": 0.8426, + "step": 14555 + }, + { + "epoch": 1.39, + "grad_norm": 0.362918310923639, + "learning_rate": 4.632545721284817e-05, + "loss": 1.0941, + "step": 14556 + }, + { + "epoch": 1.39, + "grad_norm": 0.3351483896066608, + "learning_rate": 4.631211028035547e-05, + "loss": 1.0818, + "step": 14557 + }, + { + "epoch": 1.39, + "grad_norm": 0.3462515614196478, + "learning_rate": 4.629876469143595e-05, + "loss": 1.0985, + "step": 14558 + }, + { + "epoch": 1.39, + "grad_norm": 0.29932336847106256, + "learning_rate": 4.628542044642359e-05, + "loss": 1.0252, + "step": 14559 + }, + { + "epoch": 1.39, + "grad_norm": 0.36066652263556875, + "learning_rate": 4.627207754565238e-05, + "loss": 0.9753, + "step": 14560 + }, + { + "epoch": 1.39, + "grad_norm": 0.3138537354629763, + "learning_rate": 4.625873598945617e-05, + "loss": 1.0649, + "step": 14561 + }, + { + "epoch": 1.39, + "grad_norm": 0.32834361615779534, + "learning_rate": 4.6245395778168886e-05, + "loss": 0.9845, + "step": 14562 + }, + { + "epoch": 1.39, + "grad_norm": 0.3040702041926141, + "learning_rate": 4.623205691212433e-05, + "loss": 1.0856, + "step": 14563 + }, + { + "epoch": 1.39, + "grad_norm": 0.2711535045871623, + "learning_rate": 4.621871939165634e-05, + "loss": 1.202, + "step": 14564 + }, + { + "epoch": 1.39, + "grad_norm": 0.28361129626873804, + "learning_rate": 4.620538321709871e-05, + "loss": 1.0391, + "step": 14565 + }, + { + "epoch": 1.39, + "grad_norm": 0.34669134029978227, + "learning_rate": 4.6192048388785194e-05, + "loss": 1.0307, + "step": 14566 + }, + { + "epoch": 1.39, + "grad_norm": 0.3413611963346785, + "learning_rate": 4.617871490704945e-05, + "loss": 1.1053, + "step": 14567 + }, + { + "epoch": 1.39, + "grad_norm": 0.27098625135456134, + "learning_rate": 4.616538277222523e-05, + "loss": 1.0656, + "step": 14568 + }, + { + "epoch": 1.39, + "grad_norm": 0.2829250269416554, + "learning_rate": 4.615205198464611e-05, + "loss": 1.0098, + "step": 14569 + }, + { + "epoch": 1.39, + "grad_norm": 0.30686112658291437, + "learning_rate": 4.613872254464575e-05, + "loss": 1.0504, + "step": 14570 + }, + { + "epoch": 1.39, + "grad_norm": 0.2929446720648362, + "learning_rate": 4.612539445255765e-05, + "loss": 0.9494, + "step": 14571 + }, + { + "epoch": 1.39, + "grad_norm": 0.32108940185964, + "learning_rate": 4.611206770871548e-05, + "loss": 0.9504, + "step": 14572 + }, + { + "epoch": 1.39, + "grad_norm": 0.35913484614990676, + "learning_rate": 4.609874231345265e-05, + "loss": 1.0777, + "step": 14573 + }, + { + "epoch": 1.39, + "grad_norm": 0.3093344653187777, + "learning_rate": 4.6085418267102706e-05, + "loss": 1.0562, + "step": 14574 + }, + { + "epoch": 1.39, + "grad_norm": 0.3282702858118168, + "learning_rate": 4.607209556999902e-05, + "loss": 1.0741, + "step": 14575 + }, + { + "epoch": 1.39, + "grad_norm": 0.31059534713234765, + "learning_rate": 4.605877422247502e-05, + "loss": 0.9249, + "step": 14576 + }, + { + "epoch": 1.39, + "grad_norm": 0.3321196695929585, + "learning_rate": 4.604545422486411e-05, + "loss": 1.1065, + "step": 14577 + }, + { + "epoch": 1.39, + "grad_norm": 0.3059299868015661, + "learning_rate": 4.6032135577499655e-05, + "loss": 1.074, + "step": 14578 + }, + { + "epoch": 1.39, + "grad_norm": 0.3232725679638278, + "learning_rate": 4.601881828071488e-05, + "loss": 1.0453, + "step": 14579 + }, + { + "epoch": 1.39, + "grad_norm": 0.31521605265285724, + "learning_rate": 4.600550233484311e-05, + "loss": 1.0716, + "step": 14580 + }, + { + "epoch": 1.39, + "grad_norm": 0.2786443801089079, + "learning_rate": 4.599218774021761e-05, + "loss": 1.0527, + "step": 14581 + }, + { + "epoch": 1.4, + "grad_norm": 0.3157605337477032, + "learning_rate": 4.597887449717152e-05, + "loss": 1.0466, + "step": 14582 + }, + { + "epoch": 1.4, + "grad_norm": 0.31293683336560724, + "learning_rate": 4.596556260603808e-05, + "loss": 1.0613, + "step": 14583 + }, + { + "epoch": 1.4, + "grad_norm": 0.3124238060566854, + "learning_rate": 4.5952252067150325e-05, + "loss": 0.9623, + "step": 14584 + }, + { + "epoch": 1.4, + "grad_norm": 0.3186697012761131, + "learning_rate": 4.5938942880841505e-05, + "loss": 0.9889, + "step": 14585 + }, + { + "epoch": 1.4, + "grad_norm": 0.29529543377179285, + "learning_rate": 4.5925635047444574e-05, + "loss": 1.1031, + "step": 14586 + }, + { + "epoch": 1.4, + "grad_norm": 0.32255578272882907, + "learning_rate": 4.591232856729265e-05, + "loss": 0.9806, + "step": 14587 + }, + { + "epoch": 1.4, + "grad_norm": 0.2945305059139064, + "learning_rate": 4.589902344071866e-05, + "loss": 0.9506, + "step": 14588 + }, + { + "epoch": 1.4, + "grad_norm": 0.3414819377709196, + "learning_rate": 4.588571966805565e-05, + "loss": 0.8946, + "step": 14589 + }, + { + "epoch": 1.4, + "grad_norm": 0.32896136159342537, + "learning_rate": 4.587241724963647e-05, + "loss": 1.0246, + "step": 14590 + }, + { + "epoch": 1.4, + "grad_norm": 0.3319194091874105, + "learning_rate": 4.585911618579407e-05, + "loss": 1.0447, + "step": 14591 + }, + { + "epoch": 1.4, + "grad_norm": 0.30414068871671424, + "learning_rate": 4.584581647686131e-05, + "loss": 1.072, + "step": 14592 + }, + { + "epoch": 1.4, + "grad_norm": 0.2737723755427151, + "learning_rate": 4.5832518123171074e-05, + "loss": 1.0746, + "step": 14593 + }, + { + "epoch": 1.4, + "grad_norm": 0.3171226058862531, + "learning_rate": 4.581922112505607e-05, + "loss": 1.0314, + "step": 14594 + }, + { + "epoch": 1.4, + "grad_norm": 0.32223991788141987, + "learning_rate": 4.580592548284914e-05, + "loss": 1.1334, + "step": 14595 + }, + { + "epoch": 1.4, + "grad_norm": 0.3022847623787543, + "learning_rate": 4.579263119688296e-05, + "loss": 1.0369, + "step": 14596 + }, + { + "epoch": 1.4, + "grad_norm": 0.3474345547000413, + "learning_rate": 4.5779338267490255e-05, + "loss": 1.061, + "step": 14597 + }, + { + "epoch": 1.4, + "grad_norm": 0.3561983735184284, + "learning_rate": 4.5766046695003673e-05, + "loss": 0.9969, + "step": 14598 + }, + { + "epoch": 1.4, + "grad_norm": 0.32010018463537226, + "learning_rate": 4.57527564797559e-05, + "loss": 0.9642, + "step": 14599 + }, + { + "epoch": 1.4, + "grad_norm": 0.3048514761302618, + "learning_rate": 4.5739467622079456e-05, + "loss": 1.0145, + "step": 14600 + }, + { + "epoch": 1.4, + "grad_norm": 0.40855175817808637, + "learning_rate": 4.572618012230693e-05, + "loss": 0.9601, + "step": 14601 + }, + { + "epoch": 1.4, + "grad_norm": 0.32592922673312585, + "learning_rate": 4.57128939807709e-05, + "loss": 1.0278, + "step": 14602 + }, + { + "epoch": 1.4, + "grad_norm": 0.34684775951713204, + "learning_rate": 4.569960919780377e-05, + "loss": 1.0904, + "step": 14603 + }, + { + "epoch": 1.4, + "grad_norm": 0.30248729163940646, + "learning_rate": 4.5686325773738094e-05, + "loss": 0.9238, + "step": 14604 + }, + { + "epoch": 1.4, + "grad_norm": 0.29106678140277076, + "learning_rate": 4.567304370890617e-05, + "loss": 1.0611, + "step": 14605 + }, + { + "epoch": 1.4, + "grad_norm": 0.2902016167129869, + "learning_rate": 4.5659763003640546e-05, + "loss": 1.0618, + "step": 14606 + }, + { + "epoch": 1.4, + "grad_norm": 0.3280570377842861, + "learning_rate": 4.564648365827346e-05, + "loss": 1.04, + "step": 14607 + }, + { + "epoch": 1.4, + "grad_norm": 0.2588458315060987, + "learning_rate": 4.563320567313732e-05, + "loss": 1.0216, + "step": 14608 + }, + { + "epoch": 1.4, + "grad_norm": 0.30518018057929713, + "learning_rate": 4.561992904856435e-05, + "loss": 0.8892, + "step": 14609 + }, + { + "epoch": 1.4, + "grad_norm": 0.32211516147088803, + "learning_rate": 4.560665378488687e-05, + "loss": 1.0572, + "step": 14610 + }, + { + "epoch": 1.4, + "grad_norm": 0.27436603849517954, + "learning_rate": 4.559337988243697e-05, + "loss": 1.0375, + "step": 14611 + }, + { + "epoch": 1.4, + "grad_norm": 0.3170720083840489, + "learning_rate": 4.558010734154703e-05, + "loss": 0.982, + "step": 14612 + }, + { + "epoch": 1.4, + "grad_norm": 0.26251258315164244, + "learning_rate": 4.5566836162549055e-05, + "loss": 0.8984, + "step": 14613 + }, + { + "epoch": 1.4, + "grad_norm": 0.2777054345760726, + "learning_rate": 4.5553566345775245e-05, + "loss": 1.0987, + "step": 14614 + }, + { + "epoch": 1.4, + "grad_norm": 0.28472439806759225, + "learning_rate": 4.554029789155763e-05, + "loss": 1.1313, + "step": 14615 + }, + { + "epoch": 1.4, + "grad_norm": 0.29938172982547295, + "learning_rate": 4.552703080022832e-05, + "loss": 0.9757, + "step": 14616 + }, + { + "epoch": 1.4, + "grad_norm": 0.2999584478917376, + "learning_rate": 4.551376507211926e-05, + "loss": 1.0364, + "step": 14617 + }, + { + "epoch": 1.4, + "grad_norm": 0.3433993826807952, + "learning_rate": 4.5500500707562477e-05, + "loss": 1.0684, + "step": 14618 + }, + { + "epoch": 1.4, + "grad_norm": 0.3078774325250403, + "learning_rate": 4.548723770688992e-05, + "loss": 0.9488, + "step": 14619 + }, + { + "epoch": 1.4, + "grad_norm": 0.30877410694794355, + "learning_rate": 4.5473976070433486e-05, + "loss": 1.0408, + "step": 14620 + }, + { + "epoch": 1.4, + "grad_norm": 0.31746894555954874, + "learning_rate": 4.546071579852512e-05, + "loss": 1.0299, + "step": 14621 + }, + { + "epoch": 1.4, + "grad_norm": 0.3295027293551967, + "learning_rate": 4.5447456891496574e-05, + "loss": 1.1356, + "step": 14622 + }, + { + "epoch": 1.4, + "grad_norm": 0.3628879239554858, + "learning_rate": 4.543419934967974e-05, + "loss": 1.0506, + "step": 14623 + }, + { + "epoch": 1.4, + "grad_norm": 0.24330852161010927, + "learning_rate": 4.542094317340633e-05, + "loss": 1.0591, + "step": 14624 + }, + { + "epoch": 1.4, + "grad_norm": 0.3371795873403627, + "learning_rate": 4.54076883630081e-05, + "loss": 1.0708, + "step": 14625 + }, + { + "epoch": 1.4, + "grad_norm": 0.33448426395647074, + "learning_rate": 4.5394434918816794e-05, + "loss": 1.0282, + "step": 14626 + }, + { + "epoch": 1.4, + "grad_norm": 0.2984205223586194, + "learning_rate": 4.5381182841164116e-05, + "loss": 1.0039, + "step": 14627 + }, + { + "epoch": 1.4, + "grad_norm": 0.2678252171156085, + "learning_rate": 4.536793213038162e-05, + "loss": 1.0172, + "step": 14628 + }, + { + "epoch": 1.4, + "grad_norm": 0.309518593126307, + "learning_rate": 4.5354682786800996e-05, + "loss": 1.051, + "step": 14629 + }, + { + "epoch": 1.4, + "grad_norm": 0.3223707504113723, + "learning_rate": 4.534143481075374e-05, + "loss": 0.9824, + "step": 14630 + }, + { + "epoch": 1.4, + "grad_norm": 0.26678381903048737, + "learning_rate": 4.5328188202571464e-05, + "loss": 0.9335, + "step": 14631 + }, + { + "epoch": 1.4, + "grad_norm": 0.3183410725268524, + "learning_rate": 4.531494296258556e-05, + "loss": 1.0788, + "step": 14632 + }, + { + "epoch": 1.4, + "grad_norm": 0.307314081646174, + "learning_rate": 4.530169909112766e-05, + "loss": 1.0561, + "step": 14633 + }, + { + "epoch": 1.4, + "grad_norm": 0.3079636852912537, + "learning_rate": 4.5288456588529074e-05, + "loss": 0.9909, + "step": 14634 + }, + { + "epoch": 1.4, + "grad_norm": 0.33540020653161345, + "learning_rate": 4.527521545512129e-05, + "loss": 1.1203, + "step": 14635 + }, + { + "epoch": 1.4, + "grad_norm": 0.33358134886535756, + "learning_rate": 4.5261975691235595e-05, + "loss": 0.9587, + "step": 14636 + }, + { + "epoch": 1.4, + "grad_norm": 0.31518434301103476, + "learning_rate": 4.524873729720335e-05, + "loss": 1.024, + "step": 14637 + }, + { + "epoch": 1.4, + "grad_norm": 0.30467606081618026, + "learning_rate": 4.5235500273355866e-05, + "loss": 1.0174, + "step": 14638 + }, + { + "epoch": 1.4, + "grad_norm": 0.33519824564186074, + "learning_rate": 4.5222264620024455e-05, + "loss": 1.0662, + "step": 14639 + }, + { + "epoch": 1.4, + "grad_norm": 0.3044388575053393, + "learning_rate": 4.520903033754025e-05, + "loss": 1.0522, + "step": 14640 + }, + { + "epoch": 1.4, + "grad_norm": 0.279646407067448, + "learning_rate": 4.51957974262345e-05, + "loss": 1.0256, + "step": 14641 + }, + { + "epoch": 1.4, + "grad_norm": 0.3200731191003678, + "learning_rate": 4.51825658864384e-05, + "loss": 0.9474, + "step": 14642 + }, + { + "epoch": 1.4, + "grad_norm": 0.3511744302244567, + "learning_rate": 4.5169335718482996e-05, + "loss": 1.086, + "step": 14643 + }, + { + "epoch": 1.4, + "grad_norm": 0.32363222828041555, + "learning_rate": 4.5156106922699474e-05, + "loss": 0.994, + "step": 14644 + }, + { + "epoch": 1.4, + "grad_norm": 0.31111487400318977, + "learning_rate": 4.5142879499418764e-05, + "loss": 0.9527, + "step": 14645 + }, + { + "epoch": 1.4, + "grad_norm": 0.356327555248549, + "learning_rate": 4.5129653448972044e-05, + "loss": 1.0285, + "step": 14646 + }, + { + "epoch": 1.4, + "grad_norm": 0.34285637136450736, + "learning_rate": 4.51164287716902e-05, + "loss": 1.0799, + "step": 14647 + }, + { + "epoch": 1.4, + "grad_norm": 0.33954427414255856, + "learning_rate": 4.510320546790425e-05, + "loss": 1.0202, + "step": 14648 + }, + { + "epoch": 1.4, + "grad_norm": 0.2934047308213413, + "learning_rate": 4.5089983537945045e-05, + "loss": 1.1775, + "step": 14649 + }, + { + "epoch": 1.4, + "grad_norm": 0.2937614263876069, + "learning_rate": 4.5076762982143564e-05, + "loss": 1.0378, + "step": 14650 + }, + { + "epoch": 1.4, + "grad_norm": 0.2941961066686118, + "learning_rate": 4.506354380083056e-05, + "loss": 1.1458, + "step": 14651 + }, + { + "epoch": 1.4, + "grad_norm": 0.3237341866684905, + "learning_rate": 4.5050325994336896e-05, + "loss": 1.0701, + "step": 14652 + }, + { + "epoch": 1.4, + "grad_norm": 0.3509826369329139, + "learning_rate": 4.503710956299335e-05, + "loss": 1.0082, + "step": 14653 + }, + { + "epoch": 1.4, + "grad_norm": 0.32776922266222325, + "learning_rate": 4.502389450713074e-05, + "loss": 1.01, + "step": 14654 + }, + { + "epoch": 1.4, + "grad_norm": 0.2832138313586942, + "learning_rate": 4.501068082707967e-05, + "loss": 0.9666, + "step": 14655 + }, + { + "epoch": 1.4, + "grad_norm": 0.3080189429093457, + "learning_rate": 4.4997468523170906e-05, + "loss": 1.0305, + "step": 14656 + }, + { + "epoch": 1.4, + "grad_norm": 0.35296084015744, + "learning_rate": 4.498425759573503e-05, + "loss": 1.1025, + "step": 14657 + }, + { + "epoch": 1.4, + "grad_norm": 0.2901635342943118, + "learning_rate": 4.497104804510268e-05, + "loss": 1.02, + "step": 14658 + }, + { + "epoch": 1.4, + "grad_norm": 0.31474820809761683, + "learning_rate": 4.4957839871604436e-05, + "loss": 1.0008, + "step": 14659 + }, + { + "epoch": 1.4, + "grad_norm": 0.31820005077706587, + "learning_rate": 4.4944633075570884e-05, + "loss": 1.0672, + "step": 14660 + }, + { + "epoch": 1.4, + "grad_norm": 0.29737287163795995, + "learning_rate": 4.4931427657332447e-05, + "loss": 1.1314, + "step": 14661 + }, + { + "epoch": 1.4, + "grad_norm": 0.29756642475597045, + "learning_rate": 4.491822361721964e-05, + "loss": 1.0908, + "step": 14662 + }, + { + "epoch": 1.4, + "grad_norm": 0.3054496291276089, + "learning_rate": 4.490502095556296e-05, + "loss": 1.1198, + "step": 14663 + }, + { + "epoch": 1.4, + "grad_norm": 0.36793204597508533, + "learning_rate": 4.4891819672692704e-05, + "loss": 0.9445, + "step": 14664 + }, + { + "epoch": 1.4, + "grad_norm": 0.319247215945473, + "learning_rate": 4.4878619768939335e-05, + "loss": 1.0146, + "step": 14665 + }, + { + "epoch": 1.4, + "grad_norm": 0.3395774682236291, + "learning_rate": 4.486542124463308e-05, + "loss": 1.0829, + "step": 14666 + }, + { + "epoch": 1.4, + "grad_norm": 0.28010672554536575, + "learning_rate": 4.485222410010437e-05, + "loss": 0.9912, + "step": 14667 + }, + { + "epoch": 1.4, + "grad_norm": 0.2988522869488326, + "learning_rate": 4.4839028335683384e-05, + "loss": 1.0245, + "step": 14668 + }, + { + "epoch": 1.4, + "grad_norm": 0.3024615480891808, + "learning_rate": 4.482583395170041e-05, + "loss": 0.9939, + "step": 14669 + }, + { + "epoch": 1.4, + "grad_norm": 0.3132820186014763, + "learning_rate": 4.481264094848559e-05, + "loss": 0.9536, + "step": 14670 + }, + { + "epoch": 1.4, + "grad_norm": 0.31374088670366534, + "learning_rate": 4.4799449326369144e-05, + "loss": 1.0653, + "step": 14671 + }, + { + "epoch": 1.4, + "grad_norm": 0.32195743827523265, + "learning_rate": 4.4786259085681106e-05, + "loss": 1.0254, + "step": 14672 + }, + { + "epoch": 1.4, + "grad_norm": 0.3102449754086541, + "learning_rate": 4.47730702267517e-05, + "loss": 0.9055, + "step": 14673 + }, + { + "epoch": 1.4, + "grad_norm": 0.3326035694991885, + "learning_rate": 4.4759882749910885e-05, + "loss": 1.0652, + "step": 14674 + }, + { + "epoch": 1.4, + "grad_norm": 0.28958262970889065, + "learning_rate": 4.4746696655488764e-05, + "loss": 1.0833, + "step": 14675 + }, + { + "epoch": 1.4, + "grad_norm": 0.2954111534348282, + "learning_rate": 4.473351194381524e-05, + "loss": 0.9445, + "step": 14676 + }, + { + "epoch": 1.4, + "grad_norm": 0.31955236512850277, + "learning_rate": 4.4720328615220354e-05, + "loss": 1.071, + "step": 14677 + }, + { + "epoch": 1.4, + "grad_norm": 0.3067861362262012, + "learning_rate": 4.470714667003395e-05, + "loss": 1.108, + "step": 14678 + }, + { + "epoch": 1.4, + "grad_norm": 0.3148874271895208, + "learning_rate": 4.469396610858595e-05, + "loss": 1.1366, + "step": 14679 + }, + { + "epoch": 1.4, + "grad_norm": 0.33692830818104097, + "learning_rate": 4.468078693120621e-05, + "loss": 1.0005, + "step": 14680 + }, + { + "epoch": 1.4, + "grad_norm": 0.28344139846518396, + "learning_rate": 4.466760913822457e-05, + "loss": 1.0129, + "step": 14681 + }, + { + "epoch": 1.4, + "grad_norm": 0.36178024158089517, + "learning_rate": 4.465443272997076e-05, + "loss": 1.1196, + "step": 14682 + }, + { + "epoch": 1.4, + "grad_norm": 0.33583839265668164, + "learning_rate": 4.4641257706774555e-05, + "loss": 1.0155, + "step": 14683 + }, + { + "epoch": 1.4, + "grad_norm": 0.3424674232883683, + "learning_rate": 4.462808406896569e-05, + "loss": 0.9573, + "step": 14684 + }, + { + "epoch": 1.4, + "grad_norm": 0.3509068876532994, + "learning_rate": 4.4614911816873806e-05, + "loss": 1.0054, + "step": 14685 + }, + { + "epoch": 1.41, + "grad_norm": 0.3099782846680592, + "learning_rate": 4.460174095082854e-05, + "loss": 0.9715, + "step": 14686 + }, + { + "epoch": 1.41, + "grad_norm": 0.3422323071292702, + "learning_rate": 4.458857147115953e-05, + "loss": 0.8871, + "step": 14687 + }, + { + "epoch": 1.41, + "grad_norm": 0.308242012183329, + "learning_rate": 4.457540337819639e-05, + "loss": 1.0123, + "step": 14688 + }, + { + "epoch": 1.41, + "grad_norm": 0.2992832703621901, + "learning_rate": 4.456223667226857e-05, + "loss": 1.0614, + "step": 14689 + }, + { + "epoch": 1.41, + "grad_norm": 0.2936363628327573, + "learning_rate": 4.454907135370566e-05, + "loss": 0.9898, + "step": 14690 + }, + { + "epoch": 1.41, + "grad_norm": 0.300223756055059, + "learning_rate": 4.453590742283705e-05, + "loss": 1.087, + "step": 14691 + }, + { + "epoch": 1.41, + "grad_norm": 0.2871743404520794, + "learning_rate": 4.452274487999225e-05, + "loss": 0.9588, + "step": 14692 + }, + { + "epoch": 1.41, + "grad_norm": 0.327580587717383, + "learning_rate": 4.450958372550056e-05, + "loss": 1.0961, + "step": 14693 + }, + { + "epoch": 1.41, + "grad_norm": 0.30425385372095964, + "learning_rate": 4.4496423959691466e-05, + "loss": 1.0586, + "step": 14694 + }, + { + "epoch": 1.41, + "grad_norm": 0.30716769360591545, + "learning_rate": 4.4483265582894226e-05, + "loss": 0.9509, + "step": 14695 + }, + { + "epoch": 1.41, + "grad_norm": 0.3294702402826662, + "learning_rate": 4.447010859543819e-05, + "loss": 1.0504, + "step": 14696 + }, + { + "epoch": 1.41, + "grad_norm": 0.337198360544144, + "learning_rate": 4.445695299765254e-05, + "loss": 0.9363, + "step": 14697 + }, + { + "epoch": 1.41, + "grad_norm": 0.29332938825979776, + "learning_rate": 4.444379878986659e-05, + "loss": 0.955, + "step": 14698 + }, + { + "epoch": 1.41, + "grad_norm": 0.3425606363150429, + "learning_rate": 4.443064597240943e-05, + "loss": 1.0277, + "step": 14699 + }, + { + "epoch": 1.41, + "grad_norm": 0.31444304343150714, + "learning_rate": 4.4417494545610327e-05, + "loss": 1.153, + "step": 14700 + }, + { + "epoch": 1.41, + "grad_norm": 0.3267488394025441, + "learning_rate": 4.440434450979834e-05, + "loss": 1.0706, + "step": 14701 + }, + { + "epoch": 1.41, + "grad_norm": 0.36045824215873407, + "learning_rate": 4.439119586530259e-05, + "loss": 1.0329, + "step": 14702 + }, + { + "epoch": 1.41, + "grad_norm": 0.3574462560771493, + "learning_rate": 4.43780486124521e-05, + "loss": 1.0689, + "step": 14703 + }, + { + "epoch": 1.41, + "grad_norm": 0.3472231120223318, + "learning_rate": 4.4364902751575876e-05, + "loss": 1.0879, + "step": 14704 + }, + { + "epoch": 1.41, + "grad_norm": 0.32090873168687567, + "learning_rate": 4.4351758283002974e-05, + "loss": 0.9853, + "step": 14705 + }, + { + "epoch": 1.41, + "grad_norm": 0.3301379392643205, + "learning_rate": 4.433861520706226e-05, + "loss": 1.1566, + "step": 14706 + }, + { + "epoch": 1.41, + "grad_norm": 0.2933725709357391, + "learning_rate": 4.4325473524082683e-05, + "loss": 1.0, + "step": 14707 + }, + { + "epoch": 1.41, + "grad_norm": 0.32387775767935834, + "learning_rate": 4.431233323439311e-05, + "loss": 1.1118, + "step": 14708 + }, + { + "epoch": 1.41, + "grad_norm": 0.3212986025387452, + "learning_rate": 4.429919433832245e-05, + "loss": 1.0164, + "step": 14709 + }, + { + "epoch": 1.41, + "grad_norm": 0.2811861685438298, + "learning_rate": 4.4286056836199405e-05, + "loss": 0.9893, + "step": 14710 + }, + { + "epoch": 1.41, + "grad_norm": 0.3089876637279171, + "learning_rate": 4.427292072835285e-05, + "loss": 1.0199, + "step": 14711 + }, + { + "epoch": 1.41, + "grad_norm": 0.2769161964878103, + "learning_rate": 4.4259786015111435e-05, + "loss": 1.0937, + "step": 14712 + }, + { + "epoch": 1.41, + "grad_norm": 0.33214935279149566, + "learning_rate": 4.4246652696803915e-05, + "loss": 0.9715, + "step": 14713 + }, + { + "epoch": 1.41, + "grad_norm": 0.3262295297313104, + "learning_rate": 4.423352077375894e-05, + "loss": 1.0417, + "step": 14714 + }, + { + "epoch": 1.41, + "grad_norm": 0.3480655611739005, + "learning_rate": 4.42203902463052e-05, + "loss": 1.0451, + "step": 14715 + }, + { + "epoch": 1.41, + "grad_norm": 0.31708759805851155, + "learning_rate": 4.42072611147712e-05, + "loss": 0.982, + "step": 14716 + }, + { + "epoch": 1.41, + "grad_norm": 0.28836081686779524, + "learning_rate": 4.4194133379485615e-05, + "loss": 0.9975, + "step": 14717 + }, + { + "epoch": 1.41, + "grad_norm": 0.3041112903508868, + "learning_rate": 4.418100704077686e-05, + "loss": 0.9973, + "step": 14718 + }, + { + "epoch": 1.41, + "grad_norm": 0.34739616955237246, + "learning_rate": 4.416788209897349e-05, + "loss": 1.0332, + "step": 14719 + }, + { + "epoch": 1.41, + "grad_norm": 0.31489966931631547, + "learning_rate": 4.415475855440396e-05, + "loss": 1.0424, + "step": 14720 + }, + { + "epoch": 1.41, + "grad_norm": 0.3063810063921962, + "learning_rate": 4.414163640739673e-05, + "loss": 0.856, + "step": 14721 + }, + { + "epoch": 1.41, + "grad_norm": 0.3191670074639801, + "learning_rate": 4.412851565828011e-05, + "loss": 0.9025, + "step": 14722 + }, + { + "epoch": 1.41, + "grad_norm": 0.2934186880641808, + "learning_rate": 4.4115396307382514e-05, + "loss": 1.0611, + "step": 14723 + }, + { + "epoch": 1.41, + "grad_norm": 0.33508793256188807, + "learning_rate": 4.410227835503228e-05, + "loss": 1.0906, + "step": 14724 + }, + { + "epoch": 1.41, + "grad_norm": 0.32238892639268246, + "learning_rate": 4.408916180155762e-05, + "loss": 0.9862, + "step": 14725 + }, + { + "epoch": 1.41, + "grad_norm": 0.3020361826859757, + "learning_rate": 4.407604664728686e-05, + "loss": 1.0394, + "step": 14726 + }, + { + "epoch": 1.41, + "grad_norm": 0.2954527870910262, + "learning_rate": 4.4062932892548116e-05, + "loss": 1.1103, + "step": 14727 + }, + { + "epoch": 1.41, + "grad_norm": 0.32549379184637134, + "learning_rate": 4.4049820537669695e-05, + "loss": 1.0422, + "step": 14728 + }, + { + "epoch": 1.41, + "grad_norm": 0.3030787942499782, + "learning_rate": 4.403670958297965e-05, + "loss": 1.1207, + "step": 14729 + }, + { + "epoch": 1.41, + "grad_norm": 0.312878871264811, + "learning_rate": 4.4023600028806144e-05, + "loss": 1.0048, + "step": 14730 + }, + { + "epoch": 1.41, + "grad_norm": 0.30360591239176904, + "learning_rate": 4.40104918754772e-05, + "loss": 1.1425, + "step": 14731 + }, + { + "epoch": 1.41, + "grad_norm": 0.29380138431310937, + "learning_rate": 4.399738512332092e-05, + "loss": 1.13, + "step": 14732 + }, + { + "epoch": 1.41, + "grad_norm": 0.34093861693534583, + "learning_rate": 4.3984279772665204e-05, + "loss": 0.9353, + "step": 14733 + }, + { + "epoch": 1.41, + "grad_norm": 0.2721546100548205, + "learning_rate": 4.3971175823838165e-05, + "loss": 1.0474, + "step": 14734 + }, + { + "epoch": 1.41, + "grad_norm": 0.3913010088075184, + "learning_rate": 4.395807327716761e-05, + "loss": 0.9488, + "step": 14735 + }, + { + "epoch": 1.41, + "grad_norm": 0.34663881064012847, + "learning_rate": 4.394497213298154e-05, + "loss": 1.0742, + "step": 14736 + }, + { + "epoch": 1.41, + "grad_norm": 0.3078994110197533, + "learning_rate": 4.393187239160774e-05, + "loss": 1.0495, + "step": 14737 + }, + { + "epoch": 1.41, + "grad_norm": 0.3088893193818887, + "learning_rate": 4.3918774053374113e-05, + "loss": 0.9633, + "step": 14738 + }, + { + "epoch": 1.41, + "grad_norm": 0.35607049392197604, + "learning_rate": 4.3905677118608356e-05, + "loss": 1.0862, + "step": 14739 + }, + { + "epoch": 1.41, + "grad_norm": 0.36304479501962605, + "learning_rate": 4.389258158763829e-05, + "loss": 0.9807, + "step": 14740 + }, + { + "epoch": 1.41, + "grad_norm": 0.3076464479723, + "learning_rate": 4.387948746079163e-05, + "loss": 0.9552, + "step": 14741 + }, + { + "epoch": 1.41, + "grad_norm": 0.30417154034091626, + "learning_rate": 4.386639473839612e-05, + "loss": 0.9599, + "step": 14742 + }, + { + "epoch": 1.41, + "grad_norm": 0.3399648408383189, + "learning_rate": 4.3853303420779314e-05, + "loss": 1.0611, + "step": 14743 + }, + { + "epoch": 1.41, + "grad_norm": 0.3028526006635591, + "learning_rate": 4.3840213508268865e-05, + "loss": 1.0315, + "step": 14744 + }, + { + "epoch": 1.41, + "grad_norm": 0.29689125422043505, + "learning_rate": 4.382712500119241e-05, + "loss": 1.1065, + "step": 14745 + }, + { + "epoch": 1.41, + "grad_norm": 0.30517702105535655, + "learning_rate": 4.381403789987743e-05, + "loss": 0.9546, + "step": 14746 + }, + { + "epoch": 1.41, + "grad_norm": 0.3373072323050672, + "learning_rate": 4.380095220465146e-05, + "loss": 1.0672, + "step": 14747 + }, + { + "epoch": 1.41, + "grad_norm": 0.30879492384857404, + "learning_rate": 4.378786791584197e-05, + "loss": 0.9952, + "step": 14748 + }, + { + "epoch": 1.41, + "grad_norm": 0.30411714693184755, + "learning_rate": 4.3774785033776454e-05, + "loss": 1.0026, + "step": 14749 + }, + { + "epoch": 1.41, + "grad_norm": 0.2843415643833972, + "learning_rate": 4.3761703558782255e-05, + "loss": 1.008, + "step": 14750 + }, + { + "epoch": 1.41, + "grad_norm": 0.3210974137936761, + "learning_rate": 4.374862349118679e-05, + "loss": 0.9299, + "step": 14751 + }, + { + "epoch": 1.41, + "grad_norm": 0.3035369040211916, + "learning_rate": 4.3735544831317344e-05, + "loss": 0.8705, + "step": 14752 + }, + { + "epoch": 1.41, + "grad_norm": 0.3388271442700402, + "learning_rate": 4.372246757950128e-05, + "loss": 0.9108, + "step": 14753 + }, + { + "epoch": 1.41, + "grad_norm": 0.3069118785174167, + "learning_rate": 4.3709391736065764e-05, + "loss": 1.0646, + "step": 14754 + }, + { + "epoch": 1.41, + "grad_norm": 0.2962368870778398, + "learning_rate": 4.3696317301338175e-05, + "loss": 1.0044, + "step": 14755 + }, + { + "epoch": 1.41, + "grad_norm": 0.33129896584226154, + "learning_rate": 4.36832442756456e-05, + "loss": 0.9676, + "step": 14756 + }, + { + "epoch": 1.41, + "grad_norm": 0.31694007842816896, + "learning_rate": 4.367017265931526e-05, + "loss": 1.1172, + "step": 14757 + }, + { + "epoch": 1.41, + "grad_norm": 0.3213298952624508, + "learning_rate": 4.3657102452674205e-05, + "loss": 1.0929, + "step": 14758 + }, + { + "epoch": 1.41, + "grad_norm": 0.33125273077364276, + "learning_rate": 4.364403365604962e-05, + "loss": 1.103, + "step": 14759 + }, + { + "epoch": 1.41, + "grad_norm": 0.2667479002494273, + "learning_rate": 4.363096626976844e-05, + "loss": 0.9664, + "step": 14760 + }, + { + "epoch": 1.41, + "grad_norm": 0.3417318058487445, + "learning_rate": 4.361790029415782e-05, + "loss": 0.9567, + "step": 14761 + }, + { + "epoch": 1.41, + "grad_norm": 0.28434597559465236, + "learning_rate": 4.360483572954465e-05, + "loss": 1.0733, + "step": 14762 + }, + { + "epoch": 1.41, + "grad_norm": 0.2891289189140426, + "learning_rate": 4.359177257625594e-05, + "loss": 1.0149, + "step": 14763 + }, + { + "epoch": 1.41, + "grad_norm": 0.30068958878937047, + "learning_rate": 4.357871083461853e-05, + "loss": 1.0421, + "step": 14764 + }, + { + "epoch": 1.41, + "grad_norm": 0.30619313680399535, + "learning_rate": 4.3565650504959354e-05, + "loss": 1.0518, + "step": 14765 + }, + { + "epoch": 1.41, + "grad_norm": 0.2937433024657005, + "learning_rate": 4.355259158760527e-05, + "loss": 1.0759, + "step": 14766 + }, + { + "epoch": 1.41, + "grad_norm": 0.2704202906792794, + "learning_rate": 4.3539534082883026e-05, + "loss": 1.0041, + "step": 14767 + }, + { + "epoch": 1.41, + "grad_norm": 0.28664608983966505, + "learning_rate": 4.352647799111942e-05, + "loss": 1.0191, + "step": 14768 + }, + { + "epoch": 1.41, + "grad_norm": 0.36414034015436914, + "learning_rate": 4.35134233126412e-05, + "loss": 1.007, + "step": 14769 + }, + { + "epoch": 1.41, + "grad_norm": 0.3211470895343949, + "learning_rate": 4.35003700477751e-05, + "loss": 0.86, + "step": 14770 + }, + { + "epoch": 1.41, + "grad_norm": 0.32419090207174145, + "learning_rate": 4.34873181968477e-05, + "loss": 1.0025, + "step": 14771 + }, + { + "epoch": 1.41, + "grad_norm": 0.2935075779661925, + "learning_rate": 4.3474267760185725e-05, + "loss": 1.0737, + "step": 14772 + }, + { + "epoch": 1.41, + "grad_norm": 0.3054210079700227, + "learning_rate": 4.34612187381157e-05, + "loss": 1.0296, + "step": 14773 + }, + { + "epoch": 1.41, + "grad_norm": 0.3082110373244746, + "learning_rate": 4.344817113096419e-05, + "loss": 1.052, + "step": 14774 + }, + { + "epoch": 1.41, + "grad_norm": 0.3000827790848623, + "learning_rate": 4.343512493905775e-05, + "loss": 0.9064, + "step": 14775 + }, + { + "epoch": 1.41, + "grad_norm": 0.30631723671521943, + "learning_rate": 4.342208016272289e-05, + "loss": 1.0011, + "step": 14776 + }, + { + "epoch": 1.41, + "grad_norm": 0.32072977295738253, + "learning_rate": 4.3409036802285995e-05, + "loss": 1.0579, + "step": 14777 + }, + { + "epoch": 1.41, + "grad_norm": 0.3351715717998838, + "learning_rate": 4.3395994858073554e-05, + "loss": 1.1475, + "step": 14778 + }, + { + "epoch": 1.41, + "grad_norm": 0.3142626199212628, + "learning_rate": 4.338295433041188e-05, + "loss": 0.9864, + "step": 14779 + }, + { + "epoch": 1.41, + "grad_norm": 0.29818321220797495, + "learning_rate": 4.33699152196274e-05, + "loss": 1.0851, + "step": 14780 + }, + { + "epoch": 1.41, + "grad_norm": 0.3037210319212375, + "learning_rate": 4.3356877526046304e-05, + "loss": 1.0357, + "step": 14781 + }, + { + "epoch": 1.41, + "grad_norm": 0.3222482843837731, + "learning_rate": 4.334384124999502e-05, + "loss": 1.0351, + "step": 14782 + }, + { + "epoch": 1.41, + "grad_norm": 0.3286423136398438, + "learning_rate": 4.333080639179968e-05, + "loss": 0.9469, + "step": 14783 + }, + { + "epoch": 1.41, + "grad_norm": 0.28869178229153103, + "learning_rate": 4.331777295178656e-05, + "loss": 1.0286, + "step": 14784 + }, + { + "epoch": 1.41, + "grad_norm": 0.28107854386306225, + "learning_rate": 4.330474093028176e-05, + "loss": 1.1022, + "step": 14785 + }, + { + "epoch": 1.41, + "grad_norm": 0.3179380414831774, + "learning_rate": 4.3291710327611444e-05, + "loss": 1.1696, + "step": 14786 + }, + { + "epoch": 1.41, + "grad_norm": 0.2652541207175407, + "learning_rate": 4.327868114410176e-05, + "loss": 0.9211, + "step": 14787 + }, + { + "epoch": 1.41, + "grad_norm": 0.3358572533029838, + "learning_rate": 4.32656533800787e-05, + "loss": 1.0721, + "step": 14788 + }, + { + "epoch": 1.41, + "grad_norm": 0.3608447776380041, + "learning_rate": 4.325262703586831e-05, + "loss": 1.0897, + "step": 14789 + }, + { + "epoch": 1.41, + "grad_norm": 0.27539339874172003, + "learning_rate": 4.3239602111796596e-05, + "loss": 1.0656, + "step": 14790 + }, + { + "epoch": 1.42, + "grad_norm": 0.2711581148768477, + "learning_rate": 4.322657860818955e-05, + "loss": 1.0264, + "step": 14791 + }, + { + "epoch": 1.42, + "grad_norm": 0.31641936204001037, + "learning_rate": 4.3213556525373014e-05, + "loss": 1.1006, + "step": 14792 + }, + { + "epoch": 1.42, + "grad_norm": 0.29388973538820673, + "learning_rate": 4.3200535863672953e-05, + "loss": 1.0045, + "step": 14793 + }, + { + "epoch": 1.42, + "grad_norm": 0.2977047378974026, + "learning_rate": 4.31875166234151e-05, + "loss": 1.0881, + "step": 14794 + }, + { + "epoch": 1.42, + "grad_norm": 0.3099577749942426, + "learning_rate": 4.3174498804925434e-05, + "loss": 1.0927, + "step": 14795 + }, + { + "epoch": 1.42, + "grad_norm": 0.28556523619916013, + "learning_rate": 4.316148240852961e-05, + "loss": 0.9548, + "step": 14796 + }, + { + "epoch": 1.42, + "grad_norm": 0.31160369244910124, + "learning_rate": 4.314846743455344e-05, + "loss": 1.1486, + "step": 14797 + }, + { + "epoch": 1.42, + "grad_norm": 0.2815597270424023, + "learning_rate": 4.313545388332256e-05, + "loss": 1.0504, + "step": 14798 + }, + { + "epoch": 1.42, + "grad_norm": 0.3004493391144746, + "learning_rate": 4.3122441755162735e-05, + "loss": 1.1677, + "step": 14799 + }, + { + "epoch": 1.42, + "grad_norm": 0.3341359672546735, + "learning_rate": 4.3109431050399506e-05, + "loss": 1.0349, + "step": 14800 + }, + { + "epoch": 1.42, + "grad_norm": 0.2903302808584768, + "learning_rate": 4.309642176935853e-05, + "loss": 1.1032, + "step": 14801 + }, + { + "epoch": 1.42, + "grad_norm": 0.33857333965377145, + "learning_rate": 4.3083413912365354e-05, + "loss": 0.9921, + "step": 14802 + }, + { + "epoch": 1.42, + "grad_norm": 0.3184308673453194, + "learning_rate": 4.3070407479745556e-05, + "loss": 1.0627, + "step": 14803 + }, + { + "epoch": 1.42, + "grad_norm": 0.2827523569338063, + "learning_rate": 4.305740247182455e-05, + "loss": 1.0648, + "step": 14804 + }, + { + "epoch": 1.42, + "grad_norm": 0.32745839879000715, + "learning_rate": 4.304439888892786e-05, + "loss": 1.0912, + "step": 14805 + }, + { + "epoch": 1.42, + "grad_norm": 0.30850872510340305, + "learning_rate": 4.303139673138087e-05, + "loss": 1.0455, + "step": 14806 + }, + { + "epoch": 1.42, + "grad_norm": 0.308043200217309, + "learning_rate": 4.301839599950896e-05, + "loss": 0.9567, + "step": 14807 + }, + { + "epoch": 1.42, + "grad_norm": 0.3213394894014138, + "learning_rate": 4.300539669363751e-05, + "loss": 1.0319, + "step": 14808 + }, + { + "epoch": 1.42, + "grad_norm": 0.32764315425319684, + "learning_rate": 4.299239881409183e-05, + "loss": 1.0524, + "step": 14809 + }, + { + "epoch": 1.42, + "grad_norm": 0.3266623148888753, + "learning_rate": 4.297940236119723e-05, + "loss": 1.0446, + "step": 14810 + }, + { + "epoch": 1.42, + "grad_norm": 0.28013051303720227, + "learning_rate": 4.296640733527889e-05, + "loss": 1.0636, + "step": 14811 + }, + { + "epoch": 1.42, + "grad_norm": 0.32571992609435757, + "learning_rate": 4.2953413736662084e-05, + "loss": 0.9999, + "step": 14812 + }, + { + "epoch": 1.42, + "grad_norm": 0.2933116098869987, + "learning_rate": 4.294042156567191e-05, + "loss": 1.0094, + "step": 14813 + }, + { + "epoch": 1.42, + "grad_norm": 0.30719148541493324, + "learning_rate": 4.292743082263359e-05, + "loss": 0.9266, + "step": 14814 + }, + { + "epoch": 1.42, + "grad_norm": 0.30413740838389125, + "learning_rate": 4.291444150787211e-05, + "loss": 1.0166, + "step": 14815 + }, + { + "epoch": 1.42, + "grad_norm": 0.24766375680327563, + "learning_rate": 4.290145362171267e-05, + "loss": 0.9811, + "step": 14816 + }, + { + "epoch": 1.42, + "grad_norm": 0.2801659530804317, + "learning_rate": 4.2888467164480216e-05, + "loss": 0.9039, + "step": 14817 + }, + { + "epoch": 1.42, + "grad_norm": 0.29862549135986854, + "learning_rate": 4.2875482136499804e-05, + "loss": 1.0811, + "step": 14818 + }, + { + "epoch": 1.42, + "grad_norm": 0.32683976550329263, + "learning_rate": 4.2862498538096294e-05, + "loss": 1.0579, + "step": 14819 + }, + { + "epoch": 1.42, + "grad_norm": 0.2793849256985264, + "learning_rate": 4.284951636959472e-05, + "loss": 1.0754, + "step": 14820 + }, + { + "epoch": 1.42, + "grad_norm": 0.3270940944033948, + "learning_rate": 4.283653563131985e-05, + "loss": 1.0272, + "step": 14821 + }, + { + "epoch": 1.42, + "grad_norm": 0.3605401410202449, + "learning_rate": 4.282355632359666e-05, + "loss": 1.0102, + "step": 14822 + }, + { + "epoch": 1.42, + "grad_norm": 0.26215916629528496, + "learning_rate": 4.281057844674988e-05, + "loss": 0.9282, + "step": 14823 + }, + { + "epoch": 1.42, + "grad_norm": 0.3059900408884878, + "learning_rate": 4.279760200110435e-05, + "loss": 1.0772, + "step": 14824 + }, + { + "epoch": 1.42, + "grad_norm": 0.29788070343175754, + "learning_rate": 4.278462698698475e-05, + "loss": 1.0222, + "step": 14825 + }, + { + "epoch": 1.42, + "grad_norm": 0.3324491830907724, + "learning_rate": 4.2771653404715805e-05, + "loss": 0.9804, + "step": 14826 + }, + { + "epoch": 1.42, + "grad_norm": 0.29779525744995106, + "learning_rate": 4.275868125462225e-05, + "loss": 1.0841, + "step": 14827 + }, + { + "epoch": 1.42, + "grad_norm": 0.29025318284974555, + "learning_rate": 4.2745710537028626e-05, + "loss": 0.9668, + "step": 14828 + }, + { + "epoch": 1.42, + "grad_norm": 0.262020289019119, + "learning_rate": 4.2732741252259575e-05, + "loss": 1.0512, + "step": 14829 + }, + { + "epoch": 1.42, + "grad_norm": 0.310370473688601, + "learning_rate": 4.2719773400639676e-05, + "loss": 1.0438, + "step": 14830 + }, + { + "epoch": 1.42, + "grad_norm": 0.31710063595598054, + "learning_rate": 4.2706806982493484e-05, + "loss": 1.0626, + "step": 14831 + }, + { + "epoch": 1.42, + "grad_norm": 0.2931185431942299, + "learning_rate": 4.2693841998145414e-05, + "loss": 0.9955, + "step": 14832 + }, + { + "epoch": 1.42, + "grad_norm": 0.36110650092037067, + "learning_rate": 4.2680878447919994e-05, + "loss": 1.0401, + "step": 14833 + }, + { + "epoch": 1.42, + "grad_norm": 0.3166965743211788, + "learning_rate": 4.266791633214159e-05, + "loss": 1.105, + "step": 14834 + }, + { + "epoch": 1.42, + "grad_norm": 0.27705078056065074, + "learning_rate": 4.26549556511346e-05, + "loss": 0.925, + "step": 14835 + }, + { + "epoch": 1.42, + "grad_norm": 0.2988352518692726, + "learning_rate": 4.264199640522339e-05, + "loss": 0.9849, + "step": 14836 + }, + { + "epoch": 1.42, + "grad_norm": 0.2713431227213562, + "learning_rate": 4.2629038594732304e-05, + "loss": 1.055, + "step": 14837 + }, + { + "epoch": 1.42, + "grad_norm": 0.3106084866467521, + "learning_rate": 4.2616082219985553e-05, + "loss": 0.996, + "step": 14838 + }, + { + "epoch": 1.42, + "grad_norm": 0.31577401123479115, + "learning_rate": 4.260312728130744e-05, + "loss": 1.0855, + "step": 14839 + }, + { + "epoch": 1.42, + "grad_norm": 0.3296371469849248, + "learning_rate": 4.25901737790221e-05, + "loss": 1.1414, + "step": 14840 + }, + { + "epoch": 1.42, + "grad_norm": 0.3409181304738181, + "learning_rate": 4.2577221713453785e-05, + "loss": 1.0608, + "step": 14841 + }, + { + "epoch": 1.42, + "grad_norm": 0.3107260901345532, + "learning_rate": 4.256427108492651e-05, + "loss": 1.1139, + "step": 14842 + }, + { + "epoch": 1.42, + "grad_norm": 0.3553364822167293, + "learning_rate": 4.255132189376452e-05, + "loss": 1.0359, + "step": 14843 + }, + { + "epoch": 1.42, + "grad_norm": 0.29952252822546677, + "learning_rate": 4.253837414029176e-05, + "loss": 1.0119, + "step": 14844 + }, + { + "epoch": 1.42, + "grad_norm": 0.3331340339535103, + "learning_rate": 4.2525427824832354e-05, + "loss": 0.9828, + "step": 14845 + }, + { + "epoch": 1.42, + "grad_norm": 0.3391908676942946, + "learning_rate": 4.2512482947710186e-05, + "loss": 1.0306, + "step": 14846 + }, + { + "epoch": 1.42, + "grad_norm": 0.32190096539753915, + "learning_rate": 4.2499539509249274e-05, + "loss": 1.0067, + "step": 14847 + }, + { + "epoch": 1.42, + "grad_norm": 0.31919714268833044, + "learning_rate": 4.248659750977355e-05, + "loss": 1.047, + "step": 14848 + }, + { + "epoch": 1.42, + "grad_norm": 0.3149634656982471, + "learning_rate": 4.2473656949606834e-05, + "loss": 1.0764, + "step": 14849 + }, + { + "epoch": 1.42, + "grad_norm": 0.305487115520958, + "learning_rate": 4.2460717829073014e-05, + "loss": 1.046, + "step": 14850 + }, + { + "epoch": 1.42, + "grad_norm": 0.3419828262834496, + "learning_rate": 4.244778014849588e-05, + "loss": 1.0118, + "step": 14851 + }, + { + "epoch": 1.42, + "grad_norm": 0.3293149202404157, + "learning_rate": 4.243484390819927e-05, + "loss": 1.003, + "step": 14852 + }, + { + "epoch": 1.42, + "grad_norm": 0.28045235258886536, + "learning_rate": 4.242190910850683e-05, + "loss": 1.0714, + "step": 14853 + }, + { + "epoch": 1.42, + "grad_norm": 0.30242802654453926, + "learning_rate": 4.240897574974233e-05, + "loss": 1.118, + "step": 14854 + }, + { + "epoch": 1.42, + "grad_norm": 0.3198733516088864, + "learning_rate": 4.2396043832229335e-05, + "loss": 1.0012, + "step": 14855 + }, + { + "epoch": 1.42, + "grad_norm": 0.35006285094063816, + "learning_rate": 4.238311335629163e-05, + "loss": 1.049, + "step": 14856 + }, + { + "epoch": 1.42, + "grad_norm": 0.3093909726019152, + "learning_rate": 4.237018432225267e-05, + "loss": 1.0731, + "step": 14857 + }, + { + "epoch": 1.42, + "grad_norm": 0.29039266013172865, + "learning_rate": 4.235725673043611e-05, + "loss": 0.969, + "step": 14858 + }, + { + "epoch": 1.42, + "grad_norm": 0.30509615151913116, + "learning_rate": 4.2344330581165395e-05, + "loss": 1.0553, + "step": 14859 + }, + { + "epoch": 1.42, + "grad_norm": 0.3340502209325899, + "learning_rate": 4.2331405874764074e-05, + "loss": 1.0396, + "step": 14860 + }, + { + "epoch": 1.42, + "grad_norm": 0.30728906308463266, + "learning_rate": 4.2318482611555534e-05, + "loss": 1.0365, + "step": 14861 + }, + { + "epoch": 1.42, + "grad_norm": 0.30878437681681686, + "learning_rate": 4.2305560791863205e-05, + "loss": 1.0901, + "step": 14862 + }, + { + "epoch": 1.42, + "grad_norm": 0.31758503353533324, + "learning_rate": 4.22926404160105e-05, + "loss": 1.0388, + "step": 14863 + }, + { + "epoch": 1.42, + "grad_norm": 0.3155305504839294, + "learning_rate": 4.227972148432075e-05, + "loss": 1.0906, + "step": 14864 + }, + { + "epoch": 1.42, + "grad_norm": 0.3315071483148331, + "learning_rate": 4.226680399711722e-05, + "loss": 1.0997, + "step": 14865 + }, + { + "epoch": 1.42, + "grad_norm": 0.29918479404558895, + "learning_rate": 4.2253887954723235e-05, + "loss": 1.1037, + "step": 14866 + }, + { + "epoch": 1.42, + "grad_norm": 0.2975503317657487, + "learning_rate": 4.224097335746197e-05, + "loss": 1.0274, + "step": 14867 + }, + { + "epoch": 1.42, + "grad_norm": 0.30041841564839666, + "learning_rate": 4.222806020565664e-05, + "loss": 1.0517, + "step": 14868 + }, + { + "epoch": 1.42, + "grad_norm": 0.2932949257231135, + "learning_rate": 4.221514849963041e-05, + "loss": 1.0672, + "step": 14869 + }, + { + "epoch": 1.42, + "grad_norm": 0.29911669394736046, + "learning_rate": 4.220223823970644e-05, + "loss": 1.0885, + "step": 14870 + }, + { + "epoch": 1.42, + "grad_norm": 0.2651855726167262, + "learning_rate": 4.218932942620775e-05, + "loss": 1.0781, + "step": 14871 + }, + { + "epoch": 1.42, + "grad_norm": 0.3947468603503694, + "learning_rate": 4.217642205945742e-05, + "loss": 1.0099, + "step": 14872 + }, + { + "epoch": 1.42, + "grad_norm": 0.2758247265406958, + "learning_rate": 4.2163516139778516e-05, + "loss": 1.0152, + "step": 14873 + }, + { + "epoch": 1.42, + "grad_norm": 0.36960860479847873, + "learning_rate": 4.2150611667493925e-05, + "loss": 1.0175, + "step": 14874 + }, + { + "epoch": 1.42, + "grad_norm": 0.323958608147933, + "learning_rate": 4.213770864292667e-05, + "loss": 1.0214, + "step": 14875 + }, + { + "epoch": 1.42, + "grad_norm": 0.31670436375263683, + "learning_rate": 4.212480706639954e-05, + "loss": 1.0002, + "step": 14876 + }, + { + "epoch": 1.42, + "grad_norm": 0.31191714630754996, + "learning_rate": 4.211190693823557e-05, + "loss": 1.0076, + "step": 14877 + }, + { + "epoch": 1.42, + "grad_norm": 0.322162019513223, + "learning_rate": 4.209900825875748e-05, + "loss": 1.1067, + "step": 14878 + }, + { + "epoch": 1.42, + "grad_norm": 0.3125843013285155, + "learning_rate": 4.208611102828812e-05, + "loss": 1.1661, + "step": 14879 + }, + { + "epoch": 1.42, + "grad_norm": 0.29006128331460407, + "learning_rate": 4.20732152471502e-05, + "loss": 0.9778, + "step": 14880 + }, + { + "epoch": 1.42, + "grad_norm": 0.2826674815197641, + "learning_rate": 4.2060320915666505e-05, + "loss": 0.9669, + "step": 14881 + }, + { + "epoch": 1.42, + "grad_norm": 0.3659360248048912, + "learning_rate": 4.204742803415963e-05, + "loss": 1.1975, + "step": 14882 + }, + { + "epoch": 1.42, + "grad_norm": 0.3012656033272527, + "learning_rate": 4.203453660295236e-05, + "loss": 1.0781, + "step": 14883 + }, + { + "epoch": 1.42, + "grad_norm": 0.2995886875485569, + "learning_rate": 4.202164662236719e-05, + "loss": 1.0321, + "step": 14884 + }, + { + "epoch": 1.42, + "grad_norm": 0.31655561349555894, + "learning_rate": 4.2008758092726806e-05, + "loss": 0.9708, + "step": 14885 + }, + { + "epoch": 1.42, + "grad_norm": 0.29545667539838877, + "learning_rate": 4.199587101435365e-05, + "loss": 0.9364, + "step": 14886 + }, + { + "epoch": 1.42, + "grad_norm": 0.2834822643802017, + "learning_rate": 4.198298538757032e-05, + "loss": 0.97, + "step": 14887 + }, + { + "epoch": 1.42, + "grad_norm": 0.2974090943179033, + "learning_rate": 4.197010121269921e-05, + "loss": 1.0074, + "step": 14888 + }, + { + "epoch": 1.42, + "grad_norm": 0.27150445564089654, + "learning_rate": 4.195721849006278e-05, + "loss": 1.0056, + "step": 14889 + }, + { + "epoch": 1.42, + "grad_norm": 0.3008057459255813, + "learning_rate": 4.1944337219983434e-05, + "loss": 1.0831, + "step": 14890 + }, + { + "epoch": 1.42, + "grad_norm": 0.29494738079822663, + "learning_rate": 4.193145740278357e-05, + "loss": 0.894, + "step": 14891 + }, + { + "epoch": 1.42, + "grad_norm": 0.32156351792635174, + "learning_rate": 4.1918579038785454e-05, + "loss": 1.0325, + "step": 14892 + }, + { + "epoch": 1.42, + "grad_norm": 0.3328457597803948, + "learning_rate": 4.19057021283114e-05, + "loss": 0.9842, + "step": 14893 + }, + { + "epoch": 1.42, + "grad_norm": 0.32924138971012806, + "learning_rate": 4.189282667168368e-05, + "loss": 1.0047, + "step": 14894 + }, + { + "epoch": 1.43, + "grad_norm": 0.32697547616340134, + "learning_rate": 4.1879952669224476e-05, + "loss": 1.0154, + "step": 14895 + }, + { + "epoch": 1.43, + "grad_norm": 0.32406195205272886, + "learning_rate": 4.186708012125596e-05, + "loss": 0.9932, + "step": 14896 + }, + { + "epoch": 1.43, + "grad_norm": 0.34291975548011977, + "learning_rate": 4.185420902810032e-05, + "loss": 1.0895, + "step": 14897 + }, + { + "epoch": 1.43, + "grad_norm": 0.3322686472680487, + "learning_rate": 4.184133939007967e-05, + "loss": 1.0896, + "step": 14898 + }, + { + "epoch": 1.43, + "grad_norm": 0.33566323377148466, + "learning_rate": 4.1828471207516027e-05, + "loss": 1.0004, + "step": 14899 + }, + { + "epoch": 1.43, + "grad_norm": 0.31333992381256703, + "learning_rate": 4.181560448073147e-05, + "loss": 1.027, + "step": 14900 + }, + { + "epoch": 1.43, + "grad_norm": 0.33566917068338403, + "learning_rate": 4.180273921004796e-05, + "loss": 1.0172, + "step": 14901 + }, + { + "epoch": 1.43, + "grad_norm": 0.3246966417774535, + "learning_rate": 4.1789875395787504e-05, + "loss": 0.9485, + "step": 14902 + }, + { + "epoch": 1.43, + "grad_norm": 0.3067533624187742, + "learning_rate": 4.177701303827193e-05, + "loss": 1.1362, + "step": 14903 + }, + { + "epoch": 1.43, + "grad_norm": 0.32722077004704025, + "learning_rate": 4.176415213782326e-05, + "loss": 0.9638, + "step": 14904 + }, + { + "epoch": 1.43, + "grad_norm": 0.2835158555799552, + "learning_rate": 4.175129269476326e-05, + "loss": 0.9936, + "step": 14905 + }, + { + "epoch": 1.43, + "grad_norm": 0.29718923253368534, + "learning_rate": 4.1738434709413796e-05, + "loss": 0.9914, + "step": 14906 + }, + { + "epoch": 1.43, + "grad_norm": 0.318428074035626, + "learning_rate": 4.172557818209659e-05, + "loss": 1.0903, + "step": 14907 + }, + { + "epoch": 1.43, + "grad_norm": 0.3087860075480544, + "learning_rate": 4.1712723113133414e-05, + "loss": 0.9664, + "step": 14908 + }, + { + "epoch": 1.43, + "grad_norm": 0.3374404821314716, + "learning_rate": 4.1699869502846e-05, + "loss": 1.0585, + "step": 14909 + }, + { + "epoch": 1.43, + "grad_norm": 0.331209872885539, + "learning_rate": 4.168701735155596e-05, + "loss": 0.933, + "step": 14910 + }, + { + "epoch": 1.43, + "grad_norm": 0.3570996162378252, + "learning_rate": 4.167416665958496e-05, + "loss": 1.0356, + "step": 14911 + }, + { + "epoch": 1.43, + "grad_norm": 0.28868405542657144, + "learning_rate": 4.16613174272546e-05, + "loss": 1.0485, + "step": 14912 + }, + { + "epoch": 1.43, + "grad_norm": 0.31731136234689483, + "learning_rate": 4.1648469654886465e-05, + "loss": 1.0112, + "step": 14913 + }, + { + "epoch": 1.43, + "grad_norm": 0.3012659926674615, + "learning_rate": 4.163562334280202e-05, + "loss": 1.0401, + "step": 14914 + }, + { + "epoch": 1.43, + "grad_norm": 0.3046892407879478, + "learning_rate": 4.1622778491322813e-05, + "loss": 1.0567, + "step": 14915 + }, + { + "epoch": 1.43, + "grad_norm": 0.3092652941320658, + "learning_rate": 4.1609935100770194e-05, + "loss": 1.0586, + "step": 14916 + }, + { + "epoch": 1.43, + "grad_norm": 0.3491666932226508, + "learning_rate": 4.1597093171465716e-05, + "loss": 1.0359, + "step": 14917 + }, + { + "epoch": 1.43, + "grad_norm": 0.2977603664313724, + "learning_rate": 4.1584252703730656e-05, + "loss": 1.0358, + "step": 14918 + }, + { + "epoch": 1.43, + "grad_norm": 0.3022373367511126, + "learning_rate": 4.1571413697886405e-05, + "loss": 1.024, + "step": 14919 + }, + { + "epoch": 1.43, + "grad_norm": 0.3494527396279521, + "learning_rate": 4.155857615425423e-05, + "loss": 1.0653, + "step": 14920 + }, + { + "epoch": 1.43, + "grad_norm": 0.2724065285434603, + "learning_rate": 4.154574007315544e-05, + "loss": 1.0344, + "step": 14921 + }, + { + "epoch": 1.43, + "grad_norm": 0.29833594241639255, + "learning_rate": 4.153290545491121e-05, + "loss": 1.07, + "step": 14922 + }, + { + "epoch": 1.43, + "grad_norm": 0.2966950730559013, + "learning_rate": 4.152007229984277e-05, + "loss": 0.9459, + "step": 14923 + }, + { + "epoch": 1.43, + "grad_norm": 0.35218868994282304, + "learning_rate": 4.150724060827127e-05, + "loss": 1.0473, + "step": 14924 + }, + { + "epoch": 1.43, + "grad_norm": 0.3135255974518991, + "learning_rate": 4.149441038051787e-05, + "loss": 1.036, + "step": 14925 + }, + { + "epoch": 1.43, + "grad_norm": 0.30805969595930566, + "learning_rate": 4.148158161690359e-05, + "loss": 1.0452, + "step": 14926 + }, + { + "epoch": 1.43, + "grad_norm": 0.2831438429753198, + "learning_rate": 4.146875431774952e-05, + "loss": 1.0294, + "step": 14927 + }, + { + "epoch": 1.43, + "grad_norm": 0.29362629905710574, + "learning_rate": 4.145592848337665e-05, + "loss": 0.9086, + "step": 14928 + }, + { + "epoch": 1.43, + "grad_norm": 0.30316773592735036, + "learning_rate": 4.144310411410596e-05, + "loss": 1.0856, + "step": 14929 + }, + { + "epoch": 1.43, + "grad_norm": 0.32940488754205643, + "learning_rate": 4.143028121025838e-05, + "loss": 1.0203, + "step": 14930 + }, + { + "epoch": 1.43, + "grad_norm": 0.3125556751076664, + "learning_rate": 4.1417459772154864e-05, + "loss": 1.1021, + "step": 14931 + }, + { + "epoch": 1.43, + "grad_norm": 0.3128759371497385, + "learning_rate": 4.14046398001162e-05, + "loss": 1.0957, + "step": 14932 + }, + { + "epoch": 1.43, + "grad_norm": 0.2914435826217678, + "learning_rate": 4.139182129446325e-05, + "loss": 1.0063, + "step": 14933 + }, + { + "epoch": 1.43, + "grad_norm": 0.3194718267764711, + "learning_rate": 4.137900425551684e-05, + "loss": 1.0651, + "step": 14934 + }, + { + "epoch": 1.43, + "grad_norm": 0.30547125049926904, + "learning_rate": 4.136618868359765e-05, + "loss": 1.0763, + "step": 14935 + }, + { + "epoch": 1.43, + "grad_norm": 0.34815121320940545, + "learning_rate": 4.135337457902648e-05, + "loss": 1.0749, + "step": 14936 + }, + { + "epoch": 1.43, + "grad_norm": 0.3263178064468196, + "learning_rate": 4.1340561942123904e-05, + "loss": 1.0895, + "step": 14937 + }, + { + "epoch": 1.43, + "grad_norm": 0.3146675088943855, + "learning_rate": 4.1327750773210696e-05, + "loss": 0.9218, + "step": 14938 + }, + { + "epoch": 1.43, + "grad_norm": 0.2681589428041059, + "learning_rate": 4.131494107260736e-05, + "loss": 1.0588, + "step": 14939 + }, + { + "epoch": 1.43, + "grad_norm": 0.28071870598959336, + "learning_rate": 4.130213284063454e-05, + "loss": 0.9766, + "step": 14940 + }, + { + "epoch": 1.43, + "grad_norm": 0.28249838963279245, + "learning_rate": 4.128932607761271e-05, + "loss": 0.9911, + "step": 14941 + }, + { + "epoch": 1.43, + "grad_norm": 0.305630563214577, + "learning_rate": 4.127652078386243e-05, + "loss": 0.9756, + "step": 14942 + }, + { + "epoch": 1.43, + "grad_norm": 0.3121744598141387, + "learning_rate": 4.1263716959704036e-05, + "loss": 1.0671, + "step": 14943 + }, + { + "epoch": 1.43, + "grad_norm": 0.2939483079061041, + "learning_rate": 4.125091460545813e-05, + "loss": 1.0629, + "step": 14944 + }, + { + "epoch": 1.43, + "grad_norm": 0.29585551181658276, + "learning_rate": 4.123811372144496e-05, + "loss": 0.9291, + "step": 14945 + }, + { + "epoch": 1.43, + "grad_norm": 0.293309313017358, + "learning_rate": 4.1225314307984966e-05, + "loss": 1.0296, + "step": 14946 + }, + { + "epoch": 1.43, + "grad_norm": 0.3568636791277681, + "learning_rate": 4.121251636539838e-05, + "loss": 1.0659, + "step": 14947 + }, + { + "epoch": 1.43, + "grad_norm": 0.3046875643931903, + "learning_rate": 4.119971989400556e-05, + "loss": 0.9935, + "step": 14948 + }, + { + "epoch": 1.43, + "grad_norm": 0.3138792143082215, + "learning_rate": 4.118692489412667e-05, + "loss": 1.065, + "step": 14949 + }, + { + "epoch": 1.43, + "grad_norm": 0.296459128895389, + "learning_rate": 4.1174131366081935e-05, + "loss": 0.9592, + "step": 14950 + }, + { + "epoch": 1.43, + "grad_norm": 0.337883515964346, + "learning_rate": 4.116133931019154e-05, + "loss": 1.115, + "step": 14951 + }, + { + "epoch": 1.43, + "grad_norm": 0.28349972264058804, + "learning_rate": 4.114854872677565e-05, + "loss": 1.0132, + "step": 14952 + }, + { + "epoch": 1.43, + "grad_norm": 0.32973485157637195, + "learning_rate": 4.113575961615427e-05, + "loss": 0.9859, + "step": 14953 + }, + { + "epoch": 1.43, + "grad_norm": 0.28892806635726226, + "learning_rate": 4.1122971978647495e-05, + "loss": 1.0661, + "step": 14954 + }, + { + "epoch": 1.43, + "grad_norm": 0.2998709487713856, + "learning_rate": 4.1110185814575394e-05, + "loss": 0.949, + "step": 14955 + }, + { + "epoch": 1.43, + "grad_norm": 0.31296787888084654, + "learning_rate": 4.109740112425787e-05, + "loss": 1.0369, + "step": 14956 + }, + { + "epoch": 1.43, + "grad_norm": 0.3182156802939596, + "learning_rate": 4.10846179080149e-05, + "loss": 0.9748, + "step": 14957 + }, + { + "epoch": 1.43, + "grad_norm": 0.3401305584231448, + "learning_rate": 4.107183616616639e-05, + "loss": 1.0291, + "step": 14958 + }, + { + "epoch": 1.43, + "grad_norm": 0.31110026279021497, + "learning_rate": 4.1059055899032264e-05, + "loss": 1.0699, + "step": 14959 + }, + { + "epoch": 1.43, + "grad_norm": 0.3611225309264027, + "learning_rate": 4.104627710693225e-05, + "loss": 1.0204, + "step": 14960 + }, + { + "epoch": 1.43, + "grad_norm": 0.2846757898732917, + "learning_rate": 4.103349979018626e-05, + "loss": 1.0673, + "step": 14961 + }, + { + "epoch": 1.43, + "grad_norm": 0.29616453766370077, + "learning_rate": 4.102072394911395e-05, + "loss": 1.1495, + "step": 14962 + }, + { + "epoch": 1.43, + "grad_norm": 0.31446495843614997, + "learning_rate": 4.1007949584035124e-05, + "loss": 1.016, + "step": 14963 + }, + { + "epoch": 1.43, + "grad_norm": 0.26327159233243863, + "learning_rate": 4.099517669526937e-05, + "loss": 0.8801, + "step": 14964 + }, + { + "epoch": 1.43, + "grad_norm": 0.3910738630911374, + "learning_rate": 4.098240528313646e-05, + "loss": 1.0469, + "step": 14965 + }, + { + "epoch": 1.43, + "grad_norm": 0.32298835093841133, + "learning_rate": 4.096963534795593e-05, + "loss": 1.0019, + "step": 14966 + }, + { + "epoch": 1.43, + "grad_norm": 0.2726475578398932, + "learning_rate": 4.0956866890047396e-05, + "loss": 0.9668, + "step": 14967 + }, + { + "epoch": 1.43, + "grad_norm": 0.283220823425486, + "learning_rate": 4.094409990973035e-05, + "loss": 1.0081, + "step": 14968 + }, + { + "epoch": 1.43, + "grad_norm": 0.33948229163954835, + "learning_rate": 4.093133440732435e-05, + "loss": 1.0563, + "step": 14969 + }, + { + "epoch": 1.43, + "grad_norm": 0.3275434014556378, + "learning_rate": 4.091857038314879e-05, + "loss": 1.0604, + "step": 14970 + }, + { + "epoch": 1.43, + "grad_norm": 0.28062068454510997, + "learning_rate": 4.090580783752315e-05, + "loss": 1.0247, + "step": 14971 + }, + { + "epoch": 1.43, + "grad_norm": 0.2995736006844543, + "learning_rate": 4.0893046770766794e-05, + "loss": 1.0234, + "step": 14972 + }, + { + "epoch": 1.43, + "grad_norm": 0.32399460136477215, + "learning_rate": 4.088028718319914e-05, + "loss": 0.9999, + "step": 14973 + }, + { + "epoch": 1.43, + "grad_norm": 0.31938080776876704, + "learning_rate": 4.086752907513941e-05, + "loss": 0.9245, + "step": 14974 + }, + { + "epoch": 1.43, + "grad_norm": 0.34683013794569256, + "learning_rate": 4.085477244690693e-05, + "loss": 1.0233, + "step": 14975 + }, + { + "epoch": 1.43, + "grad_norm": 0.3547243322197581, + "learning_rate": 4.0842017298820965e-05, + "loss": 0.9734, + "step": 14976 + }, + { + "epoch": 1.43, + "grad_norm": 0.29302750596705474, + "learning_rate": 4.0829263631200676e-05, + "loss": 0.9792, + "step": 14977 + }, + { + "epoch": 1.43, + "grad_norm": 0.3166871259963163, + "learning_rate": 4.081651144436525e-05, + "loss": 0.9784, + "step": 14978 + }, + { + "epoch": 1.43, + "grad_norm": 0.30823434481244066, + "learning_rate": 4.0803760738633826e-05, + "loss": 1.0357, + "step": 14979 + }, + { + "epoch": 1.43, + "grad_norm": 0.3080176336244288, + "learning_rate": 4.0791011514325514e-05, + "loss": 1.0643, + "step": 14980 + }, + { + "epoch": 1.43, + "grad_norm": 0.3076076251619121, + "learning_rate": 4.077826377175932e-05, + "loss": 1.029, + "step": 14981 + }, + { + "epoch": 1.43, + "grad_norm": 0.289036244100653, + "learning_rate": 4.0765517511254336e-05, + "loss": 0.9669, + "step": 14982 + }, + { + "epoch": 1.43, + "grad_norm": 0.32415808388452544, + "learning_rate": 4.075277273312946e-05, + "loss": 1.0742, + "step": 14983 + }, + { + "epoch": 1.43, + "grad_norm": 0.2720279575818121, + "learning_rate": 4.0740029437703685e-05, + "loss": 1.0363, + "step": 14984 + }, + { + "epoch": 1.43, + "grad_norm": 0.3747136275903749, + "learning_rate": 4.072728762529592e-05, + "loss": 1.0109, + "step": 14985 + }, + { + "epoch": 1.43, + "grad_norm": 0.31177584202318487, + "learning_rate": 4.071454729622507e-05, + "loss": 1.0802, + "step": 14986 + }, + { + "epoch": 1.43, + "grad_norm": 0.2758453205390317, + "learning_rate": 4.07018084508099e-05, + "loss": 1.0878, + "step": 14987 + }, + { + "epoch": 1.43, + "grad_norm": 0.34049674483376086, + "learning_rate": 4.068907108936927e-05, + "loss": 1.097, + "step": 14988 + }, + { + "epoch": 1.43, + "grad_norm": 0.3414499093558633, + "learning_rate": 4.0676335212221873e-05, + "loss": 1.0395, + "step": 14989 + }, + { + "epoch": 1.43, + "grad_norm": 0.30452205151680234, + "learning_rate": 4.066360081968651e-05, + "loss": 0.9718, + "step": 14990 + }, + { + "epoch": 1.43, + "grad_norm": 0.34871406805647, + "learning_rate": 4.065086791208176e-05, + "loss": 1.0395, + "step": 14991 + }, + { + "epoch": 1.43, + "grad_norm": 0.33521059947189685, + "learning_rate": 4.063813648972641e-05, + "loss": 1.0469, + "step": 14992 + }, + { + "epoch": 1.43, + "grad_norm": 0.33468607130484546, + "learning_rate": 4.062540655293896e-05, + "loss": 0.9801, + "step": 14993 + }, + { + "epoch": 1.43, + "grad_norm": 0.3201297532652176, + "learning_rate": 4.061267810203804e-05, + "loss": 1.0541, + "step": 14994 + }, + { + "epoch": 1.43, + "grad_norm": 0.30731588571970314, + "learning_rate": 4.0599951137342195e-05, + "loss": 0.9114, + "step": 14995 + }, + { + "epoch": 1.43, + "grad_norm": 0.3173000757175976, + "learning_rate": 4.058722565916988e-05, + "loss": 0.8631, + "step": 14996 + }, + { + "epoch": 1.43, + "grad_norm": 0.316422799105344, + "learning_rate": 4.05745016678396e-05, + "loss": 1.0625, + "step": 14997 + }, + { + "epoch": 1.43, + "grad_norm": 0.32781273512405434, + "learning_rate": 4.056177916366971e-05, + "loss": 1.0072, + "step": 14998 + }, + { + "epoch": 1.43, + "grad_norm": 0.3728126851456133, + "learning_rate": 4.054905814697872e-05, + "loss": 1.0278, + "step": 14999 + }, + { + "epoch": 1.44, + "grad_norm": 0.29582647100598874, + "learning_rate": 4.053633861808488e-05, + "loss": 1.0718, + "step": 15000 + }, + { + "epoch": 1.44, + "grad_norm": 0.30847763685147295, + "learning_rate": 4.052362057730656e-05, + "loss": 1.0255, + "step": 15001 + }, + { + "epoch": 1.44, + "grad_norm": 0.32520541370536515, + "learning_rate": 4.0510904024961996e-05, + "loss": 1.057, + "step": 15002 + }, + { + "epoch": 1.44, + "grad_norm": 0.27839706599436775, + "learning_rate": 4.049818896136948e-05, + "loss": 1.0408, + "step": 15003 + }, + { + "epoch": 1.44, + "grad_norm": 0.3317442456338203, + "learning_rate": 4.048547538684709e-05, + "loss": 1.1462, + "step": 15004 + }, + { + "epoch": 1.44, + "grad_norm": 0.3243248834731339, + "learning_rate": 4.047276330171318e-05, + "loss": 1.0503, + "step": 15005 + }, + { + "epoch": 1.44, + "grad_norm": 0.32196576936389437, + "learning_rate": 4.046005270628572e-05, + "loss": 1.0173, + "step": 15006 + }, + { + "epoch": 1.44, + "grad_norm": 0.27842082486729597, + "learning_rate": 4.0447343600882915e-05, + "loss": 1.0294, + "step": 15007 + }, + { + "epoch": 1.44, + "grad_norm": 0.32870138290235246, + "learning_rate": 4.043463598582272e-05, + "loss": 1.0051, + "step": 15008 + }, + { + "epoch": 1.44, + "grad_norm": 0.3112731401690991, + "learning_rate": 4.042192986142322e-05, + "loss": 1.0302, + "step": 15009 + }, + { + "epoch": 1.44, + "grad_norm": 0.3070637091667967, + "learning_rate": 4.040922522800236e-05, + "loss": 0.9669, + "step": 15010 + }, + { + "epoch": 1.44, + "grad_norm": 0.309427451307841, + "learning_rate": 4.039652208587806e-05, + "loss": 1.101, + "step": 15011 + }, + { + "epoch": 1.44, + "grad_norm": 0.3035321482754023, + "learning_rate": 4.038382043536828e-05, + "loss": 1.0576, + "step": 15012 + }, + { + "epoch": 1.44, + "grad_norm": 0.3095272345169022, + "learning_rate": 4.0371120276790884e-05, + "loss": 1.0206, + "step": 15013 + }, + { + "epoch": 1.44, + "grad_norm": 0.29739044355062516, + "learning_rate": 4.035842161046364e-05, + "loss": 1.0509, + "step": 15014 + }, + { + "epoch": 1.44, + "grad_norm": 0.3332713567136474, + "learning_rate": 4.034572443670438e-05, + "loss": 1.0672, + "step": 15015 + }, + { + "epoch": 1.44, + "grad_norm": 0.3040317713921412, + "learning_rate": 4.0333028755830895e-05, + "loss": 0.9948, + "step": 15016 + }, + { + "epoch": 1.44, + "grad_norm": 0.3076442813485731, + "learning_rate": 4.0320334568160835e-05, + "loss": 1.1281, + "step": 15017 + }, + { + "epoch": 1.44, + "grad_norm": 0.3186382816850027, + "learning_rate": 4.03076418740119e-05, + "loss": 1.036, + "step": 15018 + }, + { + "epoch": 1.44, + "grad_norm": 0.328507939431143, + "learning_rate": 4.0294950673701745e-05, + "loss": 1.0537, + "step": 15019 + }, + { + "epoch": 1.44, + "grad_norm": 0.2844812515516222, + "learning_rate": 4.028226096754801e-05, + "loss": 1.0016, + "step": 15020 + }, + { + "epoch": 1.44, + "grad_norm": 0.2843200371747749, + "learning_rate": 4.026957275586819e-05, + "loss": 0.911, + "step": 15021 + }, + { + "epoch": 1.44, + "grad_norm": 0.31902978227183915, + "learning_rate": 4.025688603897989e-05, + "loss": 0.9538, + "step": 15022 + }, + { + "epoch": 1.44, + "grad_norm": 0.29518505113221977, + "learning_rate": 4.024420081720052e-05, + "loss": 0.9343, + "step": 15023 + }, + { + "epoch": 1.44, + "grad_norm": 0.3156845942522147, + "learning_rate": 4.023151709084761e-05, + "loss": 1.1685, + "step": 15024 + }, + { + "epoch": 1.44, + "grad_norm": 0.3466136468966329, + "learning_rate": 4.02188348602385e-05, + "loss": 1.005, + "step": 15025 + }, + { + "epoch": 1.44, + "grad_norm": 0.3200400107505025, + "learning_rate": 4.020615412569068e-05, + "loss": 0.9117, + "step": 15026 + }, + { + "epoch": 1.44, + "grad_norm": 0.3399697747576903, + "learning_rate": 4.019347488752139e-05, + "loss": 1.0811, + "step": 15027 + }, + { + "epoch": 1.44, + "grad_norm": 0.3076543975212716, + "learning_rate": 4.018079714604801e-05, + "loss": 1.1005, + "step": 15028 + }, + { + "epoch": 1.44, + "grad_norm": 0.31574116581872114, + "learning_rate": 4.0168120901587745e-05, + "loss": 0.9652, + "step": 15029 + }, + { + "epoch": 1.44, + "grad_norm": 0.2809907969896399, + "learning_rate": 4.01554461544579e-05, + "loss": 1.0085, + "step": 15030 + }, + { + "epoch": 1.44, + "grad_norm": 0.301995512065998, + "learning_rate": 4.014277290497558e-05, + "loss": 0.9909, + "step": 15031 + }, + { + "epoch": 1.44, + "grad_norm": 0.3394229488918653, + "learning_rate": 4.013010115345799e-05, + "loss": 1.0702, + "step": 15032 + }, + { + "epoch": 1.44, + "grad_norm": 0.3486216758849748, + "learning_rate": 4.011743090022225e-05, + "loss": 0.9535, + "step": 15033 + }, + { + "epoch": 1.44, + "grad_norm": 0.34234464403549636, + "learning_rate": 4.0104762145585474e-05, + "loss": 1.0434, + "step": 15034 + }, + { + "epoch": 1.44, + "grad_norm": 0.30025075450057676, + "learning_rate": 4.009209488986463e-05, + "loss": 0.9308, + "step": 15035 + }, + { + "epoch": 1.44, + "grad_norm": 0.3428818556721368, + "learning_rate": 4.0079429133376756e-05, + "loss": 1.0575, + "step": 15036 + }, + { + "epoch": 1.44, + "grad_norm": 0.30520997792830207, + "learning_rate": 4.0066764876438864e-05, + "loss": 1.01, + "step": 15037 + }, + { + "epoch": 1.44, + "grad_norm": 0.31146445832419023, + "learning_rate": 4.005410211936782e-05, + "loss": 1.0435, + "step": 15038 + }, + { + "epoch": 1.44, + "grad_norm": 0.28622208540880034, + "learning_rate": 4.004144086248054e-05, + "loss": 1.1252, + "step": 15039 + }, + { + "epoch": 1.44, + "grad_norm": 0.3327350877852854, + "learning_rate": 4.0028781106093885e-05, + "loss": 0.9869, + "step": 15040 + }, + { + "epoch": 1.44, + "grad_norm": 0.3111840147317802, + "learning_rate": 4.0016122850524707e-05, + "loss": 0.9713, + "step": 15041 + }, + { + "epoch": 1.44, + "grad_norm": 0.2791223847990123, + "learning_rate": 4.000346609608971e-05, + "loss": 0.9627, + "step": 15042 + }, + { + "epoch": 1.44, + "grad_norm": 0.29296642730119876, + "learning_rate": 3.999081084310573e-05, + "loss": 0.981, + "step": 15043 + }, + { + "epoch": 1.44, + "grad_norm": 0.30053267254220895, + "learning_rate": 3.997815709188938e-05, + "loss": 1.0199, + "step": 15044 + }, + { + "epoch": 1.44, + "grad_norm": 0.3537320549083488, + "learning_rate": 3.9965504842757375e-05, + "loss": 1.019, + "step": 15045 + }, + { + "epoch": 1.44, + "grad_norm": 0.3066083907212984, + "learning_rate": 3.995285409602634e-05, + "loss": 1.0299, + "step": 15046 + }, + { + "epoch": 1.44, + "grad_norm": 0.3450531539344218, + "learning_rate": 3.994020485201291e-05, + "loss": 1.1244, + "step": 15047 + }, + { + "epoch": 1.44, + "grad_norm": 0.2995148841365339, + "learning_rate": 3.9927557111033554e-05, + "loss": 1.0429, + "step": 15048 + }, + { + "epoch": 1.44, + "grad_norm": 0.3203830992285029, + "learning_rate": 3.991491087340488e-05, + "loss": 0.968, + "step": 15049 + }, + { + "epoch": 1.44, + "grad_norm": 0.32248340411799614, + "learning_rate": 3.990226613944328e-05, + "loss": 1.1102, + "step": 15050 + }, + { + "epoch": 1.44, + "grad_norm": 0.41224988687176145, + "learning_rate": 3.988962290946527e-05, + "loss": 1.0677, + "step": 15051 + }, + { + "epoch": 1.44, + "grad_norm": 0.3023479702722668, + "learning_rate": 3.987698118378717e-05, + "loss": 1.0138, + "step": 15052 + }, + { + "epoch": 1.44, + "grad_norm": 0.291984797224112, + "learning_rate": 3.9864340962725464e-05, + "loss": 0.9318, + "step": 15053 + }, + { + "epoch": 1.44, + "grad_norm": 0.28485237606248665, + "learning_rate": 3.985170224659639e-05, + "loss": 1.0544, + "step": 15054 + }, + { + "epoch": 1.44, + "grad_norm": 0.328611389328802, + "learning_rate": 3.9839065035716297e-05, + "loss": 0.9629, + "step": 15055 + }, + { + "epoch": 1.44, + "grad_norm": 0.310317668844201, + "learning_rate": 3.9826429330401384e-05, + "loss": 1.0405, + "step": 15056 + }, + { + "epoch": 1.44, + "grad_norm": 0.33188739599964073, + "learning_rate": 3.9813795130967876e-05, + "loss": 1.0048, + "step": 15057 + }, + { + "epoch": 1.44, + "grad_norm": 0.2992099424720532, + "learning_rate": 3.980116243773203e-05, + "loss": 1.0212, + "step": 15058 + }, + { + "epoch": 1.44, + "grad_norm": 0.345639154137502, + "learning_rate": 3.978853125100988e-05, + "loss": 0.9935, + "step": 15059 + }, + { + "epoch": 1.44, + "grad_norm": 0.30769752743137896, + "learning_rate": 3.977590157111758e-05, + "loss": 1.1013, + "step": 15060 + }, + { + "epoch": 1.44, + "grad_norm": 0.3174585132048583, + "learning_rate": 3.9763273398371195e-05, + "loss": 1.376, + "step": 15061 + }, + { + "epoch": 1.44, + "grad_norm": 0.30483374152897924, + "learning_rate": 3.975064673308677e-05, + "loss": 0.9777, + "step": 15062 + }, + { + "epoch": 1.44, + "grad_norm": 0.32441446024155923, + "learning_rate": 3.9738021575580255e-05, + "loss": 0.9906, + "step": 15063 + }, + { + "epoch": 1.44, + "grad_norm": 0.288973513634259, + "learning_rate": 3.972539792616766e-05, + "loss": 1.0029, + "step": 15064 + }, + { + "epoch": 1.44, + "grad_norm": 0.33123843631216776, + "learning_rate": 3.971277578516479e-05, + "loss": 1.0728, + "step": 15065 + }, + { + "epoch": 1.44, + "grad_norm": 0.30618304448051775, + "learning_rate": 3.9700155152887665e-05, + "loss": 0.9794, + "step": 15066 + }, + { + "epoch": 1.44, + "grad_norm": 0.32336066104543204, + "learning_rate": 3.9687536029652016e-05, + "loss": 1.0282, + "step": 15067 + }, + { + "epoch": 1.44, + "grad_norm": 0.32813435949492964, + "learning_rate": 3.967491841577372e-05, + "loss": 1.0572, + "step": 15068 + }, + { + "epoch": 1.44, + "grad_norm": 0.28512576270956025, + "learning_rate": 3.9662302311568466e-05, + "loss": 0.9863, + "step": 15069 + }, + { + "epoch": 1.44, + "grad_norm": 0.32419186706716985, + "learning_rate": 3.964968771735206e-05, + "loss": 1.033, + "step": 15070 + }, + { + "epoch": 1.44, + "grad_norm": 0.329641135847273, + "learning_rate": 3.963707463344011e-05, + "loss": 0.9856, + "step": 15071 + }, + { + "epoch": 1.44, + "grad_norm": 0.284183192037694, + "learning_rate": 3.9624463060148296e-05, + "loss": 1.0283, + "step": 15072 + }, + { + "epoch": 1.44, + "grad_norm": 0.3134646009784638, + "learning_rate": 3.961185299779224e-05, + "loss": 1.0014, + "step": 15073 + }, + { + "epoch": 1.44, + "grad_norm": 0.3248698959788762, + "learning_rate": 3.9599244446687554e-05, + "loss": 1.1051, + "step": 15074 + }, + { + "epoch": 1.44, + "grad_norm": 0.26940020763864014, + "learning_rate": 3.958663740714971e-05, + "loss": 0.9597, + "step": 15075 + }, + { + "epoch": 1.44, + "grad_norm": 0.3244247880181964, + "learning_rate": 3.9574031879494254e-05, + "loss": 0.9854, + "step": 15076 + }, + { + "epoch": 1.44, + "grad_norm": 0.27524784731023294, + "learning_rate": 3.95614278640366e-05, + "loss": 0.9034, + "step": 15077 + }, + { + "epoch": 1.44, + "grad_norm": 0.33777925842964923, + "learning_rate": 3.954882536109219e-05, + "loss": 1.0701, + "step": 15078 + }, + { + "epoch": 1.44, + "grad_norm": 0.31615572345006937, + "learning_rate": 3.9536224370976424e-05, + "loss": 1.017, + "step": 15079 + }, + { + "epoch": 1.44, + "grad_norm": 0.3107119668050168, + "learning_rate": 3.9523624894004686e-05, + "loss": 1.0248, + "step": 15080 + }, + { + "epoch": 1.44, + "grad_norm": 0.330992443230918, + "learning_rate": 3.951102693049219e-05, + "loss": 1.1297, + "step": 15081 + }, + { + "epoch": 1.44, + "grad_norm": 0.3248906340442192, + "learning_rate": 3.949843048075428e-05, + "loss": 1.129, + "step": 15082 + }, + { + "epoch": 1.44, + "grad_norm": 0.3248010560575294, + "learning_rate": 3.9485835545106186e-05, + "loss": 1.0335, + "step": 15083 + }, + { + "epoch": 1.44, + "grad_norm": 0.2959847571369666, + "learning_rate": 3.9473242123863066e-05, + "loss": 1.0685, + "step": 15084 + }, + { + "epoch": 1.44, + "grad_norm": 0.3043441856607569, + "learning_rate": 3.946065021734013e-05, + "loss": 1.0137, + "step": 15085 + }, + { + "epoch": 1.44, + "grad_norm": 0.2932229446842522, + "learning_rate": 3.9448059825852415e-05, + "loss": 0.964, + "step": 15086 + }, + { + "epoch": 1.44, + "grad_norm": 0.29600888002994935, + "learning_rate": 3.9435470949715116e-05, + "loss": 1.1609, + "step": 15087 + }, + { + "epoch": 1.44, + "grad_norm": 0.3393567729474331, + "learning_rate": 3.9422883589243184e-05, + "loss": 1.0352, + "step": 15088 + }, + { + "epoch": 1.44, + "grad_norm": 0.3026600030684016, + "learning_rate": 3.94102977447517e-05, + "loss": 1.0927, + "step": 15089 + }, + { + "epoch": 1.44, + "grad_norm": 0.35304302993295256, + "learning_rate": 3.9397713416555556e-05, + "loss": 1.0672, + "step": 15090 + }, + { + "epoch": 1.44, + "grad_norm": 0.33979428435667236, + "learning_rate": 3.938513060496977e-05, + "loss": 1.0247, + "step": 15091 + }, + { + "epoch": 1.44, + "grad_norm": 0.3071486798849866, + "learning_rate": 3.937254931030913e-05, + "loss": 0.9714, + "step": 15092 + }, + { + "epoch": 1.44, + "grad_norm": 0.33100642070092606, + "learning_rate": 3.935996953288855e-05, + "loss": 1.0073, + "step": 15093 + }, + { + "epoch": 1.44, + "grad_norm": 0.3158796044975933, + "learning_rate": 3.9347391273022847e-05, + "loss": 1.0966, + "step": 15094 + }, + { + "epoch": 1.44, + "grad_norm": 0.316127131737099, + "learning_rate": 3.933481453102684e-05, + "loss": 1.0846, + "step": 15095 + }, + { + "epoch": 1.44, + "grad_norm": 0.2825465132233702, + "learning_rate": 3.932223930721518e-05, + "loss": 0.99, + "step": 15096 + }, + { + "epoch": 1.44, + "grad_norm": 0.3187266512164236, + "learning_rate": 3.930966560190262e-05, + "loss": 1.1076, + "step": 15097 + }, + { + "epoch": 1.44, + "grad_norm": 0.32597125699555707, + "learning_rate": 3.9297093415403864e-05, + "loss": 1.0781, + "step": 15098 + }, + { + "epoch": 1.44, + "grad_norm": 0.26856931972544007, + "learning_rate": 3.928452274803345e-05, + "loss": 1.0644, + "step": 15099 + }, + { + "epoch": 1.44, + "grad_norm": 0.34071560820959057, + "learning_rate": 3.9271953600106015e-05, + "loss": 0.977, + "step": 15100 + }, + { + "epoch": 1.44, + "grad_norm": 0.25878940246719456, + "learning_rate": 3.9259385971936105e-05, + "loss": 1.0232, + "step": 15101 + }, + { + "epoch": 1.44, + "grad_norm": 0.29333124478187156, + "learning_rate": 3.924681986383827e-05, + "loss": 0.9753, + "step": 15102 + }, + { + "epoch": 1.44, + "grad_norm": 0.3094109062986934, + "learning_rate": 3.923425527612692e-05, + "loss": 1.0507, + "step": 15103 + }, + { + "epoch": 1.45, + "grad_norm": 0.2789440350557895, + "learning_rate": 3.922169220911655e-05, + "loss": 0.93, + "step": 15104 + }, + { + "epoch": 1.45, + "grad_norm": 0.3442599363693229, + "learning_rate": 3.9209130663121494e-05, + "loss": 1.0105, + "step": 15105 + }, + { + "epoch": 1.45, + "grad_norm": 0.3105453785037502, + "learning_rate": 3.9196570638456154e-05, + "loss": 0.9782, + "step": 15106 + }, + { + "epoch": 1.45, + "grad_norm": 0.31534108585807813, + "learning_rate": 3.918401213543486e-05, + "loss": 0.9752, + "step": 15107 + }, + { + "epoch": 1.45, + "grad_norm": 0.31299336699799085, + "learning_rate": 3.9171455154371896e-05, + "loss": 0.8639, + "step": 15108 + }, + { + "epoch": 1.45, + "grad_norm": 0.30430514652805807, + "learning_rate": 3.915889969558148e-05, + "loss": 1.0417, + "step": 15109 + }, + { + "epoch": 1.45, + "grad_norm": 0.31056987713968304, + "learning_rate": 3.9146345759377856e-05, + "loss": 0.9316, + "step": 15110 + }, + { + "epoch": 1.45, + "grad_norm": 0.3238686280825479, + "learning_rate": 3.913379334607514e-05, + "loss": 1.0394, + "step": 15111 + }, + { + "epoch": 1.45, + "grad_norm": 0.2895154184232506, + "learning_rate": 3.912124245598754e-05, + "loss": 1.0699, + "step": 15112 + }, + { + "epoch": 1.45, + "grad_norm": 0.3366332283120311, + "learning_rate": 3.9108693089429035e-05, + "loss": 0.966, + "step": 15113 + }, + { + "epoch": 1.45, + "grad_norm": 0.3085708240233232, + "learning_rate": 3.909614524671383e-05, + "loss": 0.9794, + "step": 15114 + }, + { + "epoch": 1.45, + "grad_norm": 0.3363755588089227, + "learning_rate": 3.908359892815583e-05, + "loss": 1.037, + "step": 15115 + }, + { + "epoch": 1.45, + "grad_norm": 0.33413933118870665, + "learning_rate": 3.9071054134069087e-05, + "loss": 1.0233, + "step": 15116 + }, + { + "epoch": 1.45, + "grad_norm": 0.3084842356520352, + "learning_rate": 3.905851086476746e-05, + "loss": 0.98, + "step": 15117 + }, + { + "epoch": 1.45, + "grad_norm": 0.353293098714956, + "learning_rate": 3.904596912056491e-05, + "loss": 1.0585, + "step": 15118 + }, + { + "epoch": 1.45, + "grad_norm": 0.3238965319321103, + "learning_rate": 3.9033428901775326e-05, + "loss": 1.1981, + "step": 15119 + }, + { + "epoch": 1.45, + "grad_norm": 0.30111048285000785, + "learning_rate": 3.902089020871246e-05, + "loss": 1.0881, + "step": 15120 + }, + { + "epoch": 1.45, + "grad_norm": 0.3023317607177684, + "learning_rate": 3.900835304169015e-05, + "loss": 0.9802, + "step": 15121 + }, + { + "epoch": 1.45, + "grad_norm": 0.3198000150455116, + "learning_rate": 3.899581740102213e-05, + "loss": 0.9753, + "step": 15122 + }, + { + "epoch": 1.45, + "grad_norm": 0.2852043210356904, + "learning_rate": 3.898328328702215e-05, + "loss": 1.0915, + "step": 15123 + }, + { + "epoch": 1.45, + "grad_norm": 0.2920408600686087, + "learning_rate": 3.897075070000382e-05, + "loss": 1.0526, + "step": 15124 + }, + { + "epoch": 1.45, + "grad_norm": 0.3304816921865988, + "learning_rate": 3.8958219640280846e-05, + "loss": 0.9828, + "step": 15125 + }, + { + "epoch": 1.45, + "grad_norm": 0.2928248759271021, + "learning_rate": 3.894569010816672e-05, + "loss": 1.0942, + "step": 15126 + }, + { + "epoch": 1.45, + "grad_norm": 0.3227795874430012, + "learning_rate": 3.893316210397514e-05, + "loss": 1.1719, + "step": 15127 + }, + { + "epoch": 1.45, + "grad_norm": 0.2771727131123157, + "learning_rate": 3.892063562801953e-05, + "loss": 1.0165, + "step": 15128 + }, + { + "epoch": 1.45, + "grad_norm": 0.3049237094748946, + "learning_rate": 3.890811068061343e-05, + "loss": 0.8954, + "step": 15129 + }, + { + "epoch": 1.45, + "grad_norm": 0.30325838330258215, + "learning_rate": 3.889558726207021e-05, + "loss": 1.0204, + "step": 15130 + }, + { + "epoch": 1.45, + "grad_norm": 0.294837262782324, + "learning_rate": 3.8883065372703365e-05, + "loss": 0.9633, + "step": 15131 + }, + { + "epoch": 1.45, + "grad_norm": 0.2984969715504458, + "learning_rate": 3.887054501282619e-05, + "loss": 0.9335, + "step": 15132 + }, + { + "epoch": 1.45, + "grad_norm": 0.3315312067858976, + "learning_rate": 3.885802618275204e-05, + "loss": 1.0004, + "step": 15133 + }, + { + "epoch": 1.45, + "grad_norm": 0.3367417718388073, + "learning_rate": 3.88455088827942e-05, + "loss": 0.9223, + "step": 15134 + }, + { + "epoch": 1.45, + "grad_norm": 0.3253825189885928, + "learning_rate": 3.883299311326598e-05, + "loss": 1.0735, + "step": 15135 + }, + { + "epoch": 1.45, + "grad_norm": 0.28691490730960784, + "learning_rate": 3.882047887448051e-05, + "loss": 1.0294, + "step": 15136 + }, + { + "epoch": 1.45, + "grad_norm": 0.26655841538197816, + "learning_rate": 3.880796616675105e-05, + "loss": 0.9848, + "step": 15137 + }, + { + "epoch": 1.45, + "grad_norm": 0.3825957072776052, + "learning_rate": 3.879545499039066e-05, + "loss": 1.1045, + "step": 15138 + }, + { + "epoch": 1.45, + "grad_norm": 0.3336318395656129, + "learning_rate": 3.878294534571246e-05, + "loss": 1.0997, + "step": 15139 + }, + { + "epoch": 1.45, + "grad_norm": 0.3155460276526394, + "learning_rate": 3.877043723302953e-05, + "loss": 0.9978, + "step": 15140 + }, + { + "epoch": 1.45, + "grad_norm": 0.3191074378959126, + "learning_rate": 3.875793065265494e-05, + "loss": 0.9541, + "step": 15141 + }, + { + "epoch": 1.45, + "grad_norm": 0.2913251039902597, + "learning_rate": 3.874542560490158e-05, + "loss": 1.0218, + "step": 15142 + }, + { + "epoch": 1.45, + "grad_norm": 0.32020648140754127, + "learning_rate": 3.873292209008244e-05, + "loss": 0.9305, + "step": 15143 + }, + { + "epoch": 1.45, + "grad_norm": 0.3262935711278876, + "learning_rate": 3.872042010851046e-05, + "loss": 1.1177, + "step": 15144 + }, + { + "epoch": 1.45, + "grad_norm": 0.2974331954652642, + "learning_rate": 3.870791966049845e-05, + "loss": 1.0142, + "step": 15145 + }, + { + "epoch": 1.45, + "grad_norm": 0.28629468002341546, + "learning_rate": 3.8695420746359304e-05, + "loss": 0.9217, + "step": 15146 + }, + { + "epoch": 1.45, + "grad_norm": 0.3242489708025684, + "learning_rate": 3.868292336640571e-05, + "loss": 1.0626, + "step": 15147 + }, + { + "epoch": 1.45, + "grad_norm": 0.2966595275287551, + "learning_rate": 3.867042752095057e-05, + "loss": 0.9561, + "step": 15148 + }, + { + "epoch": 1.45, + "grad_norm": 0.3327206868464094, + "learning_rate": 3.865793321030649e-05, + "loss": 0.9426, + "step": 15149 + }, + { + "epoch": 1.45, + "grad_norm": 0.31642786080184654, + "learning_rate": 3.864544043478622e-05, + "loss": 0.8471, + "step": 15150 + }, + { + "epoch": 1.45, + "grad_norm": 0.3019825694139279, + "learning_rate": 3.8632949194702315e-05, + "loss": 0.9825, + "step": 15151 + }, + { + "epoch": 1.45, + "grad_norm": 0.3661423888690602, + "learning_rate": 3.862045949036748e-05, + "loss": 0.9619, + "step": 15152 + }, + { + "epoch": 1.45, + "grad_norm": 0.2761055995009033, + "learning_rate": 3.860797132209417e-05, + "loss": 0.9897, + "step": 15153 + }, + { + "epoch": 1.45, + "grad_norm": 0.30638684949229716, + "learning_rate": 3.8595484690194974e-05, + "loss": 1.0278, + "step": 15154 + }, + { + "epoch": 1.45, + "grad_norm": 0.3400329852257071, + "learning_rate": 3.858299959498235e-05, + "loss": 0.8853, + "step": 15155 + }, + { + "epoch": 1.45, + "grad_norm": 0.30079307285560486, + "learning_rate": 3.857051603676881e-05, + "loss": 1.0552, + "step": 15156 + }, + { + "epoch": 1.45, + "grad_norm": 0.2950822382398543, + "learning_rate": 3.8558034015866674e-05, + "loss": 1.1183, + "step": 15157 + }, + { + "epoch": 1.45, + "grad_norm": 0.32048144395129036, + "learning_rate": 3.854555353258839e-05, + "loss": 1.0469, + "step": 15158 + }, + { + "epoch": 1.45, + "grad_norm": 0.33113240094414065, + "learning_rate": 3.85330745872462e-05, + "loss": 1.0138, + "step": 15159 + }, + { + "epoch": 1.45, + "grad_norm": 0.3236219652166974, + "learning_rate": 3.852059718015246e-05, + "loss": 1.0486, + "step": 15160 + }, + { + "epoch": 1.45, + "grad_norm": 0.3155575462514209, + "learning_rate": 3.850812131161942e-05, + "loss": 0.9945, + "step": 15161 + }, + { + "epoch": 1.45, + "grad_norm": 0.29770074279911984, + "learning_rate": 3.849564698195932e-05, + "loss": 1.0133, + "step": 15162 + }, + { + "epoch": 1.45, + "grad_norm": 0.33615042288995844, + "learning_rate": 3.848317419148427e-05, + "loss": 0.9593, + "step": 15163 + }, + { + "epoch": 1.45, + "grad_norm": 0.3080558562612871, + "learning_rate": 3.847070294050645e-05, + "loss": 0.9594, + "step": 15164 + }, + { + "epoch": 1.45, + "grad_norm": 0.27746012233660633, + "learning_rate": 3.845823322933799e-05, + "loss": 1.0741, + "step": 15165 + }, + { + "epoch": 1.45, + "grad_norm": 0.3404347864775315, + "learning_rate": 3.84457650582909e-05, + "loss": 0.9733, + "step": 15166 + }, + { + "epoch": 1.45, + "grad_norm": 0.3428201968361631, + "learning_rate": 3.843329842767721e-05, + "loss": 0.9812, + "step": 15167 + }, + { + "epoch": 1.45, + "grad_norm": 0.3126861800739002, + "learning_rate": 3.842083333780892e-05, + "loss": 0.989, + "step": 15168 + }, + { + "epoch": 1.45, + "grad_norm": 0.2938931079633564, + "learning_rate": 3.840836978899802e-05, + "loss": 1.0038, + "step": 15169 + }, + { + "epoch": 1.45, + "grad_norm": 0.3519179734559377, + "learning_rate": 3.839590778155634e-05, + "loss": 0.9615, + "step": 15170 + }, + { + "epoch": 1.45, + "grad_norm": 0.33801357823219796, + "learning_rate": 3.8383447315795815e-05, + "loss": 0.9121, + "step": 15171 + }, + { + "epoch": 1.45, + "grad_norm": 0.295327593638852, + "learning_rate": 3.837098839202821e-05, + "loss": 1.0447, + "step": 15172 + }, + { + "epoch": 1.45, + "grad_norm": 0.3373388647528763, + "learning_rate": 3.835853101056539e-05, + "loss": 1.0992, + "step": 15173 + }, + { + "epoch": 1.45, + "grad_norm": 0.35043986919795544, + "learning_rate": 3.8346075171718996e-05, + "loss": 1.0668, + "step": 15174 + }, + { + "epoch": 1.45, + "grad_norm": 0.26589546651084606, + "learning_rate": 3.83336208758009e-05, + "loss": 0.9357, + "step": 15175 + }, + { + "epoch": 1.45, + "grad_norm": 0.3167683181650761, + "learning_rate": 3.832116812312265e-05, + "loss": 1.0189, + "step": 15176 + }, + { + "epoch": 1.45, + "grad_norm": 0.28506293825130763, + "learning_rate": 3.8308716913995976e-05, + "loss": 0.8871, + "step": 15177 + }, + { + "epoch": 1.45, + "grad_norm": 0.3198020010896085, + "learning_rate": 3.8296267248732395e-05, + "loss": 0.9945, + "step": 15178 + }, + { + "epoch": 1.45, + "grad_norm": 0.2866851592836278, + "learning_rate": 3.828381912764354e-05, + "loss": 0.9818, + "step": 15179 + }, + { + "epoch": 1.45, + "grad_norm": 0.2863288788481053, + "learning_rate": 3.8271372551040866e-05, + "loss": 1.0163, + "step": 15180 + }, + { + "epoch": 1.45, + "grad_norm": 0.30419980602533647, + "learning_rate": 3.825892751923589e-05, + "loss": 0.8941, + "step": 15181 + }, + { + "epoch": 1.45, + "grad_norm": 0.3211236612136609, + "learning_rate": 3.824648403254004e-05, + "loss": 0.9604, + "step": 15182 + }, + { + "epoch": 1.45, + "grad_norm": 0.3114354765920371, + "learning_rate": 3.823404209126475e-05, + "loss": 0.949, + "step": 15183 + }, + { + "epoch": 1.45, + "grad_norm": 0.29777343090682473, + "learning_rate": 3.8221601695721424e-05, + "loss": 0.9807, + "step": 15184 + }, + { + "epoch": 1.45, + "grad_norm": 0.3038434921269321, + "learning_rate": 3.820916284622128e-05, + "loss": 1.0961, + "step": 15185 + }, + { + "epoch": 1.45, + "grad_norm": 0.308848118743579, + "learning_rate": 3.819672554307572e-05, + "loss": 1.0217, + "step": 15186 + }, + { + "epoch": 1.45, + "grad_norm": 0.2936211975080419, + "learning_rate": 3.8184289786595875e-05, + "loss": 0.9137, + "step": 15187 + }, + { + "epoch": 1.45, + "grad_norm": 0.3195456817884376, + "learning_rate": 3.81718555770931e-05, + "loss": 1.0597, + "step": 15188 + }, + { + "epoch": 1.45, + "grad_norm": 0.32535608307693875, + "learning_rate": 3.815942291487845e-05, + "loss": 1.0253, + "step": 15189 + }, + { + "epoch": 1.45, + "grad_norm": 0.33797232280069484, + "learning_rate": 3.814699180026316e-05, + "loss": 1.101, + "step": 15190 + }, + { + "epoch": 1.45, + "grad_norm": 0.3025008419792423, + "learning_rate": 3.8134562233558224e-05, + "loss": 0.9266, + "step": 15191 + }, + { + "epoch": 1.45, + "grad_norm": 0.33465620715410654, + "learning_rate": 3.8122134215074776e-05, + "loss": 0.9717, + "step": 15192 + }, + { + "epoch": 1.45, + "grad_norm": 0.2993936669759409, + "learning_rate": 3.810970774512379e-05, + "loss": 1.0412, + "step": 15193 + }, + { + "epoch": 1.45, + "grad_norm": 0.2680841081905593, + "learning_rate": 3.8097282824016266e-05, + "loss": 1.0718, + "step": 15194 + }, + { + "epoch": 1.45, + "grad_norm": 0.36917472146989083, + "learning_rate": 3.808485945206314e-05, + "loss": 1.0236, + "step": 15195 + }, + { + "epoch": 1.45, + "grad_norm": 0.3160328344372286, + "learning_rate": 3.8072437629575344e-05, + "loss": 1.0513, + "step": 15196 + }, + { + "epoch": 1.45, + "grad_norm": 0.3225779624469126, + "learning_rate": 3.8060017356863686e-05, + "loss": 1.0311, + "step": 15197 + }, + { + "epoch": 1.45, + "grad_norm": 0.28978394668957685, + "learning_rate": 3.804759863423906e-05, + "loss": 0.9688, + "step": 15198 + }, + { + "epoch": 1.45, + "grad_norm": 0.32039434023222174, + "learning_rate": 3.803518146201217e-05, + "loss": 1.0232, + "step": 15199 + }, + { + "epoch": 1.45, + "grad_norm": 0.3183332468287704, + "learning_rate": 3.802276584049382e-05, + "loss": 0.9634, + "step": 15200 + }, + { + "epoch": 1.45, + "grad_norm": 0.30793147997516657, + "learning_rate": 3.80103517699947e-05, + "loss": 1.0116, + "step": 15201 + }, + { + "epoch": 1.45, + "grad_norm": 0.3483070058142213, + "learning_rate": 3.799793925082552e-05, + "loss": 1.0308, + "step": 15202 + }, + { + "epoch": 1.45, + "grad_norm": 0.31054020783558645, + "learning_rate": 3.7985528283296845e-05, + "loss": 1.0936, + "step": 15203 + }, + { + "epoch": 1.45, + "grad_norm": 0.3091220116325475, + "learning_rate": 3.7973118867719294e-05, + "loss": 1.094, + "step": 15204 + }, + { + "epoch": 1.45, + "grad_norm": 0.3352413559578128, + "learning_rate": 3.796071100440347e-05, + "loss": 1.0675, + "step": 15205 + }, + { + "epoch": 1.45, + "grad_norm": 0.32840486005662806, + "learning_rate": 3.79483046936598e-05, + "loss": 0.957, + "step": 15206 + }, + { + "epoch": 1.45, + "grad_norm": 0.2781804592234992, + "learning_rate": 3.793589993579884e-05, + "loss": 0.9772, + "step": 15207 + }, + { + "epoch": 1.45, + "grad_norm": 0.330696479722861, + "learning_rate": 3.792349673113093e-05, + "loss": 0.9412, + "step": 15208 + }, + { + "epoch": 1.46, + "grad_norm": 0.2578854836917818, + "learning_rate": 3.791109507996659e-05, + "loss": 1.0095, + "step": 15209 + }, + { + "epoch": 1.46, + "grad_norm": 0.32018751069317436, + "learning_rate": 3.789869498261607e-05, + "loss": 1.1481, + "step": 15210 + }, + { + "epoch": 1.46, + "grad_norm": 0.3160379232720498, + "learning_rate": 3.788629643938978e-05, + "loss": 0.9635, + "step": 15211 + }, + { + "epoch": 1.46, + "grad_norm": 0.26478091893423, + "learning_rate": 3.787389945059793e-05, + "loss": 0.9698, + "step": 15212 + }, + { + "epoch": 1.46, + "grad_norm": 0.28044473526348995, + "learning_rate": 3.786150401655082e-05, + "loss": 0.9081, + "step": 15213 + }, + { + "epoch": 1.46, + "grad_norm": 0.31302384900778013, + "learning_rate": 3.7849110137558595e-05, + "loss": 1.0518, + "step": 15214 + }, + { + "epoch": 1.46, + "grad_norm": 0.3528347863215077, + "learning_rate": 3.7836717813931445e-05, + "loss": 1.0977, + "step": 15215 + }, + { + "epoch": 1.46, + "grad_norm": 0.30400033076739885, + "learning_rate": 3.78243270459795e-05, + "loss": 1.0608, + "step": 15216 + }, + { + "epoch": 1.46, + "grad_norm": 0.3200739911930644, + "learning_rate": 3.781193783401289e-05, + "loss": 0.9611, + "step": 15217 + }, + { + "epoch": 1.46, + "grad_norm": 0.327548919758267, + "learning_rate": 3.779955017834157e-05, + "loss": 0.9921, + "step": 15218 + }, + { + "epoch": 1.46, + "grad_norm": 0.31279760581781757, + "learning_rate": 3.778716407927564e-05, + "loss": 0.9826, + "step": 15219 + }, + { + "epoch": 1.46, + "grad_norm": 0.3337990333858575, + "learning_rate": 3.7774779537125e-05, + "loss": 1.0643, + "step": 15220 + }, + { + "epoch": 1.46, + "grad_norm": 0.3019032061592828, + "learning_rate": 3.7762396552199595e-05, + "loss": 1.1332, + "step": 15221 + }, + { + "epoch": 1.46, + "grad_norm": 0.2797472582574869, + "learning_rate": 3.7750015124809346e-05, + "loss": 0.9464, + "step": 15222 + }, + { + "epoch": 1.46, + "grad_norm": 0.3165143133897372, + "learning_rate": 3.773763525526412e-05, + "loss": 1.0918, + "step": 15223 + }, + { + "epoch": 1.46, + "grad_norm": 0.30950433660184323, + "learning_rate": 3.7725256943873664e-05, + "loss": 1.1254, + "step": 15224 + }, + { + "epoch": 1.46, + "grad_norm": 0.31345564413777455, + "learning_rate": 3.77128801909478e-05, + "loss": 1.0238, + "step": 15225 + }, + { + "epoch": 1.46, + "grad_norm": 0.31589245044002107, + "learning_rate": 3.770050499679628e-05, + "loss": 1.0493, + "step": 15226 + }, + { + "epoch": 1.46, + "grad_norm": 0.32535709564127513, + "learning_rate": 3.768813136172874e-05, + "loss": 1.1555, + "step": 15227 + }, + { + "epoch": 1.46, + "grad_norm": 0.3221116963448012, + "learning_rate": 3.767575928605488e-05, + "loss": 0.953, + "step": 15228 + }, + { + "epoch": 1.46, + "grad_norm": 0.3073762286798339, + "learning_rate": 3.766338877008432e-05, + "loss": 1.0463, + "step": 15229 + }, + { + "epoch": 1.46, + "grad_norm": 0.28784551409029574, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.9911, + "step": 15230 + }, + { + "epoch": 1.46, + "grad_norm": 0.30952149936686973, + "learning_rate": 3.763865241849137e-05, + "loss": 0.9386, + "step": 15231 + }, + { + "epoch": 1.46, + "grad_norm": 0.31422465442620084, + "learning_rate": 3.762628658348805e-05, + "loss": 1.13, + "step": 15232 + }, + { + "epoch": 1.46, + "grad_norm": 0.2839978347039779, + "learning_rate": 3.7613922309426065e-05, + "loss": 1.0266, + "step": 15233 + }, + { + "epoch": 1.46, + "grad_norm": 0.3564101427808188, + "learning_rate": 3.760155959661491e-05, + "loss": 1.0156, + "step": 15234 + }, + { + "epoch": 1.46, + "grad_norm": 0.3011433838687494, + "learning_rate": 3.758919844536388e-05, + "loss": 1.0588, + "step": 15235 + }, + { + "epoch": 1.46, + "grad_norm": 0.31176718110217005, + "learning_rate": 3.757683885598245e-05, + "loss": 1.0664, + "step": 15236 + }, + { + "epoch": 1.46, + "grad_norm": 0.4109862351613479, + "learning_rate": 3.756448082877982e-05, + "loss": 1.0492, + "step": 15237 + }, + { + "epoch": 1.46, + "grad_norm": 0.28432056478749007, + "learning_rate": 3.755212436406535e-05, + "loss": 0.9758, + "step": 15238 + }, + { + "epoch": 1.46, + "grad_norm": 0.3246849581619725, + "learning_rate": 3.753976946214815e-05, + "loss": 0.9634, + "step": 15239 + }, + { + "epoch": 1.46, + "grad_norm": 0.38698959764402596, + "learning_rate": 3.752741612333753e-05, + "loss": 0.9831, + "step": 15240 + }, + { + "epoch": 1.46, + "grad_norm": 0.28535162018212906, + "learning_rate": 3.751506434794255e-05, + "loss": 0.9296, + "step": 15241 + }, + { + "epoch": 1.46, + "grad_norm": 0.27970104299166815, + "learning_rate": 3.7502714136272356e-05, + "loss": 0.9839, + "step": 15242 + }, + { + "epoch": 1.46, + "grad_norm": 0.28659030433907207, + "learning_rate": 3.7490365488636e-05, + "loss": 0.9212, + "step": 15243 + }, + { + "epoch": 1.46, + "grad_norm": 0.2852180492801495, + "learning_rate": 3.747801840534259e-05, + "loss": 0.9505, + "step": 15244 + }, + { + "epoch": 1.46, + "grad_norm": 0.2868397895801632, + "learning_rate": 3.746567288670102e-05, + "loss": 0.9792, + "step": 15245 + }, + { + "epoch": 1.46, + "grad_norm": 0.3429506110479355, + "learning_rate": 3.745332893302028e-05, + "loss": 1.0346, + "step": 15246 + }, + { + "epoch": 1.46, + "grad_norm": 0.31144138907571955, + "learning_rate": 3.744098654460934e-05, + "loss": 1.1207, + "step": 15247 + }, + { + "epoch": 1.46, + "grad_norm": 0.3211530504666596, + "learning_rate": 3.742864572177699e-05, + "loss": 0.9736, + "step": 15248 + }, + { + "epoch": 1.46, + "grad_norm": 0.30168084145860835, + "learning_rate": 3.74163064648321e-05, + "loss": 1.1834, + "step": 15249 + }, + { + "epoch": 1.46, + "grad_norm": 0.28001913798436145, + "learning_rate": 3.740396877408348e-05, + "loss": 0.8896, + "step": 15250 + }, + { + "epoch": 1.46, + "grad_norm": 0.3362068408650217, + "learning_rate": 3.73916326498399e-05, + "loss": 1.0526, + "step": 15251 + }, + { + "epoch": 1.46, + "grad_norm": 0.29517713566228765, + "learning_rate": 3.7379298092410045e-05, + "loss": 1.0048, + "step": 15252 + }, + { + "epoch": 1.46, + "grad_norm": 0.2786434926638676, + "learning_rate": 3.736696510210263e-05, + "loss": 1.0277, + "step": 15253 + }, + { + "epoch": 1.46, + "grad_norm": 0.33714149505803626, + "learning_rate": 3.735463367922626e-05, + "loss": 1.01, + "step": 15254 + }, + { + "epoch": 1.46, + "grad_norm": 0.33783506057786866, + "learning_rate": 3.734230382408954e-05, + "loss": 0.9559, + "step": 15255 + }, + { + "epoch": 1.46, + "grad_norm": 0.3184732184163937, + "learning_rate": 3.732997553700106e-05, + "loss": 0.9791, + "step": 15256 + }, + { + "epoch": 1.46, + "grad_norm": 0.29226928161114574, + "learning_rate": 3.731764881826935e-05, + "loss": 1.0278, + "step": 15257 + }, + { + "epoch": 1.46, + "grad_norm": 0.28734216548339475, + "learning_rate": 3.730532366820284e-05, + "loss": 0.9605, + "step": 15258 + }, + { + "epoch": 1.46, + "grad_norm": 0.3082168846471344, + "learning_rate": 3.7293000087110044e-05, + "loss": 1.2378, + "step": 15259 + }, + { + "epoch": 1.46, + "grad_norm": 0.31007152440209507, + "learning_rate": 3.7280678075299295e-05, + "loss": 1.0239, + "step": 15260 + }, + { + "epoch": 1.46, + "grad_norm": 0.2841066726622524, + "learning_rate": 3.726835763307903e-05, + "loss": 0.8897, + "step": 15261 + }, + { + "epoch": 1.46, + "grad_norm": 0.33942788061256635, + "learning_rate": 3.725603876075747e-05, + "loss": 0.9645, + "step": 15262 + }, + { + "epoch": 1.46, + "grad_norm": 0.2532411839800551, + "learning_rate": 3.7243721458643046e-05, + "loss": 0.9543, + "step": 15263 + }, + { + "epoch": 1.46, + "grad_norm": 0.3408644162308252, + "learning_rate": 3.72314057270439e-05, + "loss": 1.0757, + "step": 15264 + }, + { + "epoch": 1.46, + "grad_norm": 0.3105233322569978, + "learning_rate": 3.7219091566268305e-05, + "loss": 1.0147, + "step": 15265 + }, + { + "epoch": 1.46, + "grad_norm": 0.2986311057817003, + "learning_rate": 3.720677897662437e-05, + "loss": 1.0734, + "step": 15266 + }, + { + "epoch": 1.46, + "grad_norm": 0.29193944933167687, + "learning_rate": 3.719446795842024e-05, + "loss": 1.1241, + "step": 15267 + }, + { + "epoch": 1.46, + "grad_norm": 0.3183557030632783, + "learning_rate": 3.718215851196407e-05, + "loss": 0.867, + "step": 15268 + }, + { + "epoch": 1.46, + "grad_norm": 0.3148794606982273, + "learning_rate": 3.716985063756383e-05, + "loss": 1.0393, + "step": 15269 + }, + { + "epoch": 1.46, + "grad_norm": 0.30152581611618795, + "learning_rate": 3.7157544335527563e-05, + "loss": 0.9754, + "step": 15270 + }, + { + "epoch": 1.46, + "grad_norm": 0.3124559464981585, + "learning_rate": 3.714523960616323e-05, + "loss": 1.066, + "step": 15271 + }, + { + "epoch": 1.46, + "grad_norm": 0.32500421799077883, + "learning_rate": 3.7132936449778835e-05, + "loss": 0.9479, + "step": 15272 + }, + { + "epoch": 1.46, + "grad_norm": 0.3018308988096337, + "learning_rate": 3.712063486668217e-05, + "loss": 0.8932, + "step": 15273 + }, + { + "epoch": 1.46, + "grad_norm": 0.2469699409778249, + "learning_rate": 3.710833485718116e-05, + "loss": 0.957, + "step": 15274 + }, + { + "epoch": 1.46, + "grad_norm": 0.3026890000869845, + "learning_rate": 3.709603642158358e-05, + "loss": 1.1076, + "step": 15275 + }, + { + "epoch": 1.46, + "grad_norm": 0.31536353898258185, + "learning_rate": 3.7083739560197205e-05, + "loss": 1.0229, + "step": 15276 + }, + { + "epoch": 1.46, + "grad_norm": 0.31374824565585324, + "learning_rate": 3.70714442733298e-05, + "loss": 1.139, + "step": 15277 + }, + { + "epoch": 1.46, + "grad_norm": 0.31159253523687724, + "learning_rate": 3.705915056128909e-05, + "loss": 0.9909, + "step": 15278 + }, + { + "epoch": 1.46, + "grad_norm": 0.3127760578708144, + "learning_rate": 3.704685842438265e-05, + "loss": 1.0446, + "step": 15279 + }, + { + "epoch": 1.46, + "grad_norm": 0.3196812633684687, + "learning_rate": 3.703456786291818e-05, + "loss": 1.005, + "step": 15280 + }, + { + "epoch": 1.46, + "grad_norm": 0.30178710953243965, + "learning_rate": 3.702227887720319e-05, + "loss": 1.0642, + "step": 15281 + }, + { + "epoch": 1.46, + "grad_norm": 0.3557284839687944, + "learning_rate": 3.700999146754525e-05, + "loss": 0.906, + "step": 15282 + }, + { + "epoch": 1.46, + "grad_norm": 0.2670067133435032, + "learning_rate": 3.6997705634251854e-05, + "loss": 1.0295, + "step": 15283 + }, + { + "epoch": 1.46, + "grad_norm": 0.3161344169373289, + "learning_rate": 3.6985421377630514e-05, + "loss": 0.9609, + "step": 15284 + }, + { + "epoch": 1.46, + "grad_norm": 0.31751701234245183, + "learning_rate": 3.697313869798857e-05, + "loss": 1.0663, + "step": 15285 + }, + { + "epoch": 1.46, + "grad_norm": 0.3071432582432496, + "learning_rate": 3.6960857595633436e-05, + "loss": 1.0351, + "step": 15286 + }, + { + "epoch": 1.46, + "grad_norm": 0.2862975496748051, + "learning_rate": 3.6948578070872494e-05, + "loss": 1.0039, + "step": 15287 + }, + { + "epoch": 1.46, + "grad_norm": 0.3326789416411448, + "learning_rate": 3.693630012401299e-05, + "loss": 0.9701, + "step": 15288 + }, + { + "epoch": 1.46, + "grad_norm": 0.3096063115390158, + "learning_rate": 3.69240237553622e-05, + "loss": 1.033, + "step": 15289 + }, + { + "epoch": 1.46, + "grad_norm": 0.3104907000383338, + "learning_rate": 3.6911748965227354e-05, + "loss": 0.9766, + "step": 15290 + }, + { + "epoch": 1.46, + "grad_norm": 0.3158933504089654, + "learning_rate": 3.689947575391568e-05, + "loss": 1.0593, + "step": 15291 + }, + { + "epoch": 1.46, + "grad_norm": 0.32751286192443263, + "learning_rate": 3.688720412173425e-05, + "loss": 1.0598, + "step": 15292 + }, + { + "epoch": 1.46, + "grad_norm": 0.327362853968184, + "learning_rate": 3.687493406899023e-05, + "loss": 1.0401, + "step": 15293 + }, + { + "epoch": 1.46, + "grad_norm": 0.2984391307140166, + "learning_rate": 3.686266559599063e-05, + "loss": 1.0831, + "step": 15294 + }, + { + "epoch": 1.46, + "grad_norm": 0.3153679888846156, + "learning_rate": 3.6850398703042533e-05, + "loss": 1.0303, + "step": 15295 + }, + { + "epoch": 1.46, + "grad_norm": 0.27570413710285757, + "learning_rate": 3.683813339045282e-05, + "loss": 1.1554, + "step": 15296 + }, + { + "epoch": 1.46, + "grad_norm": 0.3029557943082633, + "learning_rate": 3.68258696585286e-05, + "loss": 0.943, + "step": 15297 + }, + { + "epoch": 1.46, + "grad_norm": 0.3650621697638689, + "learning_rate": 3.6813607507576655e-05, + "loss": 1.1079, + "step": 15298 + }, + { + "epoch": 1.46, + "grad_norm": 0.2776796888890911, + "learning_rate": 3.6801346937903925e-05, + "loss": 1.0343, + "step": 15299 + }, + { + "epoch": 1.46, + "grad_norm": 0.3343369818182857, + "learning_rate": 3.678908794981717e-05, + "loss": 1.0566, + "step": 15300 + }, + { + "epoch": 1.46, + "grad_norm": 0.3350137529053737, + "learning_rate": 3.677683054362325e-05, + "loss": 1.0112, + "step": 15301 + }, + { + "epoch": 1.46, + "grad_norm": 0.31600907885572477, + "learning_rate": 3.676457471962885e-05, + "loss": 0.9829, + "step": 15302 + }, + { + "epoch": 1.46, + "grad_norm": 0.3327955962507957, + "learning_rate": 3.675232047814071e-05, + "loss": 1.0404, + "step": 15303 + }, + { + "epoch": 1.46, + "grad_norm": 0.3272378962518136, + "learning_rate": 3.6740067819465495e-05, + "loss": 1.1022, + "step": 15304 + }, + { + "epoch": 1.46, + "grad_norm": 0.2920470324132137, + "learning_rate": 3.672781674390988e-05, + "loss": 1.0176, + "step": 15305 + }, + { + "epoch": 1.46, + "grad_norm": 0.3436440170108816, + "learning_rate": 3.671556725178038e-05, + "loss": 1.0226, + "step": 15306 + }, + { + "epoch": 1.46, + "grad_norm": 0.3220106758285306, + "learning_rate": 3.670331934338358e-05, + "loss": 1.0336, + "step": 15307 + }, + { + "epoch": 1.46, + "grad_norm": 0.3072775286662113, + "learning_rate": 3.6691073019026025e-05, + "loss": 1.1354, + "step": 15308 + }, + { + "epoch": 1.46, + "grad_norm": 0.28300334205117667, + "learning_rate": 3.667882827901413e-05, + "loss": 1.0182, + "step": 15309 + }, + { + "epoch": 1.46, + "grad_norm": 0.3186865158920503, + "learning_rate": 3.666658512365435e-05, + "loss": 1.0163, + "step": 15310 + }, + { + "epoch": 1.46, + "grad_norm": 0.2861697692933782, + "learning_rate": 3.665434355325308e-05, + "loss": 0.8629, + "step": 15311 + }, + { + "epoch": 1.46, + "grad_norm": 0.30888426645867867, + "learning_rate": 3.66421035681167e-05, + "loss": 0.9982, + "step": 15312 + }, + { + "epoch": 1.47, + "grad_norm": 0.3728796679482685, + "learning_rate": 3.662986516855148e-05, + "loss": 1.0911, + "step": 15313 + }, + { + "epoch": 1.47, + "grad_norm": 0.2929829048030513, + "learning_rate": 3.661762835486373e-05, + "loss": 1.0576, + "step": 15314 + }, + { + "epoch": 1.47, + "grad_norm": 0.31008215460486305, + "learning_rate": 3.6605393127359645e-05, + "loss": 1.1217, + "step": 15315 + }, + { + "epoch": 1.47, + "grad_norm": 0.31753815028283017, + "learning_rate": 3.659315948634543e-05, + "loss": 0.9031, + "step": 15316 + }, + { + "epoch": 1.47, + "grad_norm": 0.3229074876480997, + "learning_rate": 3.658092743212724e-05, + "loss": 1.0765, + "step": 15317 + }, + { + "epoch": 1.47, + "grad_norm": 0.29291979967198983, + "learning_rate": 3.656869696501125e-05, + "loss": 1.012, + "step": 15318 + }, + { + "epoch": 1.47, + "grad_norm": 0.32049820259785866, + "learning_rate": 3.655646808530344e-05, + "loss": 1.0431, + "step": 15319 + }, + { + "epoch": 1.47, + "grad_norm": 0.31456910074934963, + "learning_rate": 3.654424079330993e-05, + "loss": 0.9623, + "step": 15320 + }, + { + "epoch": 1.47, + "grad_norm": 0.3263014654075256, + "learning_rate": 3.653201508933662e-05, + "loss": 1.0416, + "step": 15321 + }, + { + "epoch": 1.47, + "grad_norm": 0.3226199099710918, + "learning_rate": 3.651979097368958e-05, + "loss": 1.0049, + "step": 15322 + }, + { + "epoch": 1.47, + "grad_norm": 0.32459464255472875, + "learning_rate": 3.6507568446674587e-05, + "loss": 1.0822, + "step": 15323 + }, + { + "epoch": 1.47, + "grad_norm": 0.2932808214306845, + "learning_rate": 3.6495347508597665e-05, + "loss": 1.0644, + "step": 15324 + }, + { + "epoch": 1.47, + "grad_norm": 0.30844664880896866, + "learning_rate": 3.648312815976455e-05, + "loss": 1.0361, + "step": 15325 + }, + { + "epoch": 1.47, + "grad_norm": 0.3745397561421422, + "learning_rate": 3.6470910400481105e-05, + "loss": 0.9026, + "step": 15326 + }, + { + "epoch": 1.47, + "grad_norm": 0.2987788935435836, + "learning_rate": 3.6458694231053025e-05, + "loss": 1.0459, + "step": 15327 + }, + { + "epoch": 1.47, + "grad_norm": 0.311595933396317, + "learning_rate": 3.6446479651786046e-05, + "loss": 1.0607, + "step": 15328 + }, + { + "epoch": 1.47, + "grad_norm": 0.28777094069831205, + "learning_rate": 3.643426666298589e-05, + "loss": 1.1459, + "step": 15329 + }, + { + "epoch": 1.47, + "grad_norm": 0.3444156727507926, + "learning_rate": 3.6422055264958135e-05, + "loss": 1.0095, + "step": 15330 + }, + { + "epoch": 1.47, + "grad_norm": 0.2697981022648838, + "learning_rate": 3.640984545800841e-05, + "loss": 1.0532, + "step": 15331 + }, + { + "epoch": 1.47, + "grad_norm": 0.2938270082200472, + "learning_rate": 3.639763724244225e-05, + "loss": 1.0303, + "step": 15332 + }, + { + "epoch": 1.47, + "grad_norm": 0.31477116016126155, + "learning_rate": 3.638543061856523e-05, + "loss": 0.9771, + "step": 15333 + }, + { + "epoch": 1.47, + "grad_norm": 0.3452296582256543, + "learning_rate": 3.6373225586682756e-05, + "loss": 1.0652, + "step": 15334 + }, + { + "epoch": 1.47, + "grad_norm": 0.3144490428564328, + "learning_rate": 3.636102214710034e-05, + "loss": 1.1083, + "step": 15335 + }, + { + "epoch": 1.47, + "grad_norm": 0.3311813622917311, + "learning_rate": 3.63488203001233e-05, + "loss": 1.0071, + "step": 15336 + }, + { + "epoch": 1.47, + "grad_norm": 0.29597556075422343, + "learning_rate": 3.633662004605703e-05, + "loss": 0.9723, + "step": 15337 + }, + { + "epoch": 1.47, + "grad_norm": 0.3170032781397048, + "learning_rate": 3.632442138520687e-05, + "loss": 1.057, + "step": 15338 + }, + { + "epoch": 1.47, + "grad_norm": 0.2896259295989772, + "learning_rate": 3.631222431787811e-05, + "loss": 0.936, + "step": 15339 + }, + { + "epoch": 1.47, + "grad_norm": 0.31579892524580083, + "learning_rate": 3.6300028844375925e-05, + "loss": 1.0903, + "step": 15340 + }, + { + "epoch": 1.47, + "grad_norm": 0.2868877319273006, + "learning_rate": 3.6287834965005583e-05, + "loss": 0.9925, + "step": 15341 + }, + { + "epoch": 1.47, + "grad_norm": 0.28682936739658943, + "learning_rate": 3.6275642680072186e-05, + "loss": 1.1198, + "step": 15342 + }, + { + "epoch": 1.47, + "grad_norm": 0.324997812243447, + "learning_rate": 3.626345198988088e-05, + "loss": 0.9787, + "step": 15343 + }, + { + "epoch": 1.47, + "grad_norm": 0.27306635486499975, + "learning_rate": 3.625126289473675e-05, + "loss": 0.9936, + "step": 15344 + }, + { + "epoch": 1.47, + "grad_norm": 0.28512299370112937, + "learning_rate": 3.6239075394944855e-05, + "loss": 1.0819, + "step": 15345 + }, + { + "epoch": 1.47, + "grad_norm": 0.3378601390741971, + "learning_rate": 3.622688949081013e-05, + "loss": 1.0109, + "step": 15346 + }, + { + "epoch": 1.47, + "grad_norm": 0.3346224348406575, + "learning_rate": 3.621470518263762e-05, + "loss": 1.114, + "step": 15347 + }, + { + "epoch": 1.47, + "grad_norm": 0.29017522330062684, + "learning_rate": 3.620252247073217e-05, + "loss": 1.1138, + "step": 15348 + }, + { + "epoch": 1.47, + "grad_norm": 0.32987079322320495, + "learning_rate": 3.6190341355398684e-05, + "loss": 0.9385, + "step": 15349 + }, + { + "epoch": 1.47, + "grad_norm": 0.3462608461909594, + "learning_rate": 3.617816183694201e-05, + "loss": 0.9402, + "step": 15350 + }, + { + "epoch": 1.47, + "grad_norm": 0.33680708001611875, + "learning_rate": 3.6165983915666976e-05, + "loss": 1.034, + "step": 15351 + }, + { + "epoch": 1.47, + "grad_norm": 0.3570016005616118, + "learning_rate": 3.6153807591878284e-05, + "loss": 1.0868, + "step": 15352 + }, + { + "epoch": 1.47, + "grad_norm": 0.31084842084685715, + "learning_rate": 3.6141632865880685e-05, + "loss": 0.998, + "step": 15353 + }, + { + "epoch": 1.47, + "grad_norm": 0.3207929569724231, + "learning_rate": 3.6129459737978887e-05, + "loss": 1.0085, + "step": 15354 + }, + { + "epoch": 1.47, + "grad_norm": 0.34285798861012556, + "learning_rate": 3.6117288208477476e-05, + "loss": 1.0893, + "step": 15355 + }, + { + "epoch": 1.47, + "grad_norm": 0.3148870843836842, + "learning_rate": 3.6105118277681094e-05, + "loss": 0.9819, + "step": 15356 + }, + { + "epoch": 1.47, + "grad_norm": 0.3246809550139308, + "learning_rate": 3.609294994589423e-05, + "loss": 0.9839, + "step": 15357 + }, + { + "epoch": 1.47, + "grad_norm": 0.2982629442310415, + "learning_rate": 3.608078321342151e-05, + "loss": 1.0398, + "step": 15358 + }, + { + "epoch": 1.47, + "grad_norm": 0.318417119358598, + "learning_rate": 3.606861808056734e-05, + "loss": 0.9566, + "step": 15359 + }, + { + "epoch": 1.47, + "grad_norm": 0.27733504491770594, + "learning_rate": 3.6056454547636206e-05, + "loss": 0.8901, + "step": 15360 + }, + { + "epoch": 1.47, + "grad_norm": 0.3184193602576306, + "learning_rate": 3.604429261493246e-05, + "loss": 1.0544, + "step": 15361 + }, + { + "epoch": 1.47, + "grad_norm": 0.3293149920079781, + "learning_rate": 3.603213228276052e-05, + "loss": 0.9506, + "step": 15362 + }, + { + "epoch": 1.47, + "grad_norm": 0.274336003428795, + "learning_rate": 3.601997355142463e-05, + "loss": 0.8945, + "step": 15363 + }, + { + "epoch": 1.47, + "grad_norm": 0.3379598060504814, + "learning_rate": 3.600781642122912e-05, + "loss": 0.9555, + "step": 15364 + }, + { + "epoch": 1.47, + "grad_norm": 0.3009648436310444, + "learning_rate": 3.599566089247821e-05, + "loss": 0.9258, + "step": 15365 + }, + { + "epoch": 1.47, + "grad_norm": 0.2765965576883077, + "learning_rate": 3.5983506965476155e-05, + "loss": 0.8895, + "step": 15366 + }, + { + "epoch": 1.47, + "grad_norm": 0.32982090347299065, + "learning_rate": 3.5971354640527047e-05, + "loss": 1.0024, + "step": 15367 + }, + { + "epoch": 1.47, + "grad_norm": 0.2940602855204122, + "learning_rate": 3.595920391793505e-05, + "loss": 1.0676, + "step": 15368 + }, + { + "epoch": 1.47, + "grad_norm": 0.27914458781698476, + "learning_rate": 3.594705479800419e-05, + "loss": 0.9105, + "step": 15369 + }, + { + "epoch": 1.47, + "grad_norm": 0.3172894828889384, + "learning_rate": 3.593490728103853e-05, + "loss": 0.9912, + "step": 15370 + }, + { + "epoch": 1.47, + "grad_norm": 0.3173103542618863, + "learning_rate": 3.59227613673421e-05, + "loss": 1.0129, + "step": 15371 + }, + { + "epoch": 1.47, + "grad_norm": 0.28508835209565314, + "learning_rate": 3.591061705721882e-05, + "loss": 0.9663, + "step": 15372 + }, + { + "epoch": 1.47, + "grad_norm": 0.31076971272126913, + "learning_rate": 3.5898474350972666e-05, + "loss": 0.9847, + "step": 15373 + }, + { + "epoch": 1.47, + "grad_norm": 0.3153039658804845, + "learning_rate": 3.588633324890745e-05, + "loss": 0.9445, + "step": 15374 + }, + { + "epoch": 1.47, + "grad_norm": 0.346538138481851, + "learning_rate": 3.587419375132707e-05, + "loss": 0.9874, + "step": 15375 + }, + { + "epoch": 1.47, + "grad_norm": 0.30519434021943326, + "learning_rate": 3.586205585853525e-05, + "loss": 0.9625, + "step": 15376 + }, + { + "epoch": 1.47, + "grad_norm": 0.33241585954538155, + "learning_rate": 3.58499195708358e-05, + "loss": 0.99, + "step": 15377 + }, + { + "epoch": 1.47, + "grad_norm": 0.30618399234519844, + "learning_rate": 3.583778488853242e-05, + "loss": 0.9348, + "step": 15378 + }, + { + "epoch": 1.47, + "grad_norm": 0.2959710352076148, + "learning_rate": 3.582565181192885e-05, + "loss": 1.0838, + "step": 15379 + }, + { + "epoch": 1.47, + "grad_norm": 0.2991839944206267, + "learning_rate": 3.581352034132862e-05, + "loss": 1.0723, + "step": 15380 + }, + { + "epoch": 1.47, + "grad_norm": 0.3302766884268446, + "learning_rate": 3.5801390477035436e-05, + "loss": 1.0843, + "step": 15381 + }, + { + "epoch": 1.47, + "grad_norm": 0.3229187130110488, + "learning_rate": 3.578926221935276e-05, + "loss": 1.0437, + "step": 15382 + }, + { + "epoch": 1.47, + "grad_norm": 0.31789923202216586, + "learning_rate": 3.5777135568584184e-05, + "loss": 0.9253, + "step": 15383 + }, + { + "epoch": 1.47, + "grad_norm": 0.3628666713175283, + "learning_rate": 3.576501052503308e-05, + "loss": 1.0193, + "step": 15384 + }, + { + "epoch": 1.47, + "grad_norm": 0.30459034344528313, + "learning_rate": 3.575288708900304e-05, + "loss": 1.0666, + "step": 15385 + }, + { + "epoch": 1.47, + "grad_norm": 0.3273943205901331, + "learning_rate": 3.574076526079734e-05, + "loss": 0.9937, + "step": 15386 + }, + { + "epoch": 1.47, + "grad_norm": 0.288158202881949, + "learning_rate": 3.57286450407194e-05, + "loss": 0.8945, + "step": 15387 + }, + { + "epoch": 1.47, + "grad_norm": 0.37285170390909833, + "learning_rate": 3.571652642907248e-05, + "loss": 0.9319, + "step": 15388 + }, + { + "epoch": 1.47, + "grad_norm": 0.3311202082081296, + "learning_rate": 3.57044094261599e-05, + "loss": 1.0011, + "step": 15389 + }, + { + "epoch": 1.47, + "grad_norm": 0.36553109360167524, + "learning_rate": 3.56922940322849e-05, + "loss": 0.9991, + "step": 15390 + }, + { + "epoch": 1.47, + "grad_norm": 0.2967320053547132, + "learning_rate": 3.568018024775063e-05, + "loss": 1.059, + "step": 15391 + }, + { + "epoch": 1.47, + "grad_norm": 0.28834135985176435, + "learning_rate": 3.5668068072860274e-05, + "loss": 0.9928, + "step": 15392 + }, + { + "epoch": 1.47, + "grad_norm": 0.2721173502696967, + "learning_rate": 3.5655957507916945e-05, + "loss": 1.0167, + "step": 15393 + }, + { + "epoch": 1.47, + "grad_norm": 0.3304883072418698, + "learning_rate": 3.5643848553223744e-05, + "loss": 1.0549, + "step": 15394 + }, + { + "epoch": 1.47, + "grad_norm": 0.29258285424377295, + "learning_rate": 3.563174120908366e-05, + "loss": 1.0268, + "step": 15395 + }, + { + "epoch": 1.47, + "grad_norm": 0.31591581972696037, + "learning_rate": 3.561963547579973e-05, + "loss": 1.106, + "step": 15396 + }, + { + "epoch": 1.47, + "grad_norm": 0.3937336239120012, + "learning_rate": 3.560753135367486e-05, + "loss": 1.1392, + "step": 15397 + }, + { + "epoch": 1.47, + "grad_norm": 0.32575763447729683, + "learning_rate": 3.559542884301198e-05, + "loss": 1.0362, + "step": 15398 + }, + { + "epoch": 1.47, + "grad_norm": 0.31146704491013183, + "learning_rate": 3.5583327944113964e-05, + "loss": 0.9544, + "step": 15399 + }, + { + "epoch": 1.47, + "grad_norm": 0.3360145278051506, + "learning_rate": 3.5571228657283695e-05, + "loss": 1.0661, + "step": 15400 + }, + { + "epoch": 1.47, + "grad_norm": 0.2883226172676515, + "learning_rate": 3.5559130982823874e-05, + "loss": 1.0315, + "step": 15401 + }, + { + "epoch": 1.47, + "grad_norm": 0.29971210235162227, + "learning_rate": 3.5547034921037345e-05, + "loss": 1.0952, + "step": 15402 + }, + { + "epoch": 1.47, + "grad_norm": 0.3143069513771481, + "learning_rate": 3.5534940472226743e-05, + "loss": 0.935, + "step": 15403 + }, + { + "epoch": 1.47, + "grad_norm": 0.3224536209477878, + "learning_rate": 3.552284763669476e-05, + "loss": 1.0954, + "step": 15404 + }, + { + "epoch": 1.47, + "grad_norm": 0.3105353114706034, + "learning_rate": 3.5510756414744037e-05, + "loss": 0.9644, + "step": 15405 + }, + { + "epoch": 1.47, + "grad_norm": 0.31823669864705684, + "learning_rate": 3.549866680667721e-05, + "loss": 1.0781, + "step": 15406 + }, + { + "epoch": 1.47, + "grad_norm": 0.31846973238093773, + "learning_rate": 3.548657881279674e-05, + "loss": 1.062, + "step": 15407 + }, + { + "epoch": 1.47, + "grad_norm": 0.3333260145003385, + "learning_rate": 3.5474492433405216e-05, + "loss": 1.0165, + "step": 15408 + }, + { + "epoch": 1.47, + "grad_norm": 0.3139281570812774, + "learning_rate": 3.5462407668805045e-05, + "loss": 1.0887, + "step": 15409 + }, + { + "epoch": 1.47, + "grad_norm": 0.3210731776389564, + "learning_rate": 3.5450324519298675e-05, + "loss": 1.0947, + "step": 15410 + }, + { + "epoch": 1.47, + "grad_norm": 0.28653743460073855, + "learning_rate": 3.54382429851885e-05, + "loss": 0.9578, + "step": 15411 + }, + { + "epoch": 1.47, + "grad_norm": 0.3397988848269122, + "learning_rate": 3.542616306677691e-05, + "loss": 1.012, + "step": 15412 + }, + { + "epoch": 1.47, + "grad_norm": 0.32982962896799695, + "learning_rate": 3.541408476436613e-05, + "loss": 1.0003, + "step": 15413 + }, + { + "epoch": 1.47, + "grad_norm": 0.34435343373421945, + "learning_rate": 3.540200807825848e-05, + "loss": 1.0374, + "step": 15414 + }, + { + "epoch": 1.47, + "grad_norm": 0.33528498246298305, + "learning_rate": 3.53899330087562e-05, + "loss": 0.9884, + "step": 15415 + }, + { + "epoch": 1.47, + "grad_norm": 0.3588664161347083, + "learning_rate": 3.5377859556161433e-05, + "loss": 1.0572, + "step": 15416 + }, + { + "epoch": 1.47, + "grad_norm": 0.295870887389693, + "learning_rate": 3.536578772077637e-05, + "loss": 1.1185, + "step": 15417 + }, + { + "epoch": 1.48, + "grad_norm": 0.29997626870636485, + "learning_rate": 3.535371750290302e-05, + "loss": 1.0037, + "step": 15418 + }, + { + "epoch": 1.48, + "grad_norm": 0.32448294776881714, + "learning_rate": 3.534164890284358e-05, + "loss": 1.0483, + "step": 15419 + }, + { + "epoch": 1.48, + "grad_norm": 0.3623096603136965, + "learning_rate": 3.532958192089999e-05, + "loss": 0.9689, + "step": 15420 + }, + { + "epoch": 1.48, + "grad_norm": 0.2744888211067063, + "learning_rate": 3.531751655737429e-05, + "loss": 1.0272, + "step": 15421 + }, + { + "epoch": 1.48, + "grad_norm": 0.29897628385862757, + "learning_rate": 3.530545281256834e-05, + "loss": 0.9656, + "step": 15422 + }, + { + "epoch": 1.48, + "grad_norm": 0.30607506229902576, + "learning_rate": 3.529339068678414e-05, + "loss": 1.0472, + "step": 15423 + }, + { + "epoch": 1.48, + "grad_norm": 0.34449714150536664, + "learning_rate": 3.528133018032347e-05, + "loss": 1.0326, + "step": 15424 + }, + { + "epoch": 1.48, + "grad_norm": 0.33395579076181703, + "learning_rate": 3.526927129348818e-05, + "loss": 1.0778, + "step": 15425 + }, + { + "epoch": 1.48, + "grad_norm": 0.28432300126077215, + "learning_rate": 3.525721402658007e-05, + "loss": 0.9729, + "step": 15426 + }, + { + "epoch": 1.48, + "grad_norm": 0.2720447579515147, + "learning_rate": 3.524515837990089e-05, + "loss": 0.9856, + "step": 15427 + }, + { + "epoch": 1.48, + "grad_norm": 0.31856187941576036, + "learning_rate": 3.523310435375229e-05, + "loss": 0.9949, + "step": 15428 + }, + { + "epoch": 1.48, + "grad_norm": 0.3301325630484232, + "learning_rate": 3.522105194843599e-05, + "loss": 0.9758, + "step": 15429 + }, + { + "epoch": 1.48, + "grad_norm": 0.2642170490603906, + "learning_rate": 3.520900116425355e-05, + "loss": 0.943, + "step": 15430 + }, + { + "epoch": 1.48, + "grad_norm": 0.2828236205150105, + "learning_rate": 3.5196952001506564e-05, + "loss": 1.0171, + "step": 15431 + }, + { + "epoch": 1.48, + "grad_norm": 0.2797785711907991, + "learning_rate": 3.518490446049659e-05, + "loss": 1.0516, + "step": 15432 + }, + { + "epoch": 1.48, + "grad_norm": 0.2827804280111508, + "learning_rate": 3.5172858541525146e-05, + "loss": 0.9703, + "step": 15433 + }, + { + "epoch": 1.48, + "grad_norm": 0.34191249828873854, + "learning_rate": 3.516081424489364e-05, + "loss": 1.0058, + "step": 15434 + }, + { + "epoch": 1.48, + "grad_norm": 0.3845985325194781, + "learning_rate": 3.5148771570903485e-05, + "loss": 1.0561, + "step": 15435 + }, + { + "epoch": 1.48, + "grad_norm": 0.30183473750552725, + "learning_rate": 3.513673051985612e-05, + "loss": 1.1281, + "step": 15436 + }, + { + "epoch": 1.48, + "grad_norm": 0.3221929260072393, + "learning_rate": 3.512469109205281e-05, + "loss": 0.9464, + "step": 15437 + }, + { + "epoch": 1.48, + "grad_norm": 0.31986464180788476, + "learning_rate": 3.511265328779487e-05, + "loss": 0.9909, + "step": 15438 + }, + { + "epoch": 1.48, + "grad_norm": 0.37045954685374727, + "learning_rate": 3.510061710738357e-05, + "loss": 1.1461, + "step": 15439 + }, + { + "epoch": 1.48, + "grad_norm": 0.31943611931236493, + "learning_rate": 3.508858255112015e-05, + "loss": 0.9378, + "step": 15440 + }, + { + "epoch": 1.48, + "grad_norm": 0.31523034038814346, + "learning_rate": 3.507654961930571e-05, + "loss": 1.1026, + "step": 15441 + }, + { + "epoch": 1.48, + "grad_norm": 0.32085957473004667, + "learning_rate": 3.506451831224144e-05, + "loss": 0.9964, + "step": 15442 + }, + { + "epoch": 1.48, + "grad_norm": 0.35164387525139923, + "learning_rate": 3.50524886302284e-05, + "loss": 0.9927, + "step": 15443 + }, + { + "epoch": 1.48, + "grad_norm": 0.3259575377360314, + "learning_rate": 3.504046057356767e-05, + "loss": 0.9773, + "step": 15444 + }, + { + "epoch": 1.48, + "grad_norm": 0.33060007432231925, + "learning_rate": 3.5028434142560176e-05, + "loss": 0.9981, + "step": 15445 + }, + { + "epoch": 1.48, + "grad_norm": 0.2848281182676272, + "learning_rate": 3.5016409337507004e-05, + "loss": 1.0237, + "step": 15446 + }, + { + "epoch": 1.48, + "grad_norm": 0.3195792063761986, + "learning_rate": 3.500438615870901e-05, + "loss": 1.0322, + "step": 15447 + }, + { + "epoch": 1.48, + "grad_norm": 0.29548580765815324, + "learning_rate": 3.499236460646713e-05, + "loss": 0.9517, + "step": 15448 + }, + { + "epoch": 1.48, + "grad_norm": 0.36456351291832556, + "learning_rate": 3.4980344681082145e-05, + "loss": 0.9307, + "step": 15449 + }, + { + "epoch": 1.48, + "grad_norm": 0.30681637708505055, + "learning_rate": 3.496832638285494e-05, + "loss": 0.8606, + "step": 15450 + }, + { + "epoch": 1.48, + "grad_norm": 0.3014702611302928, + "learning_rate": 3.49563097120862e-05, + "loss": 0.9627, + "step": 15451 + }, + { + "epoch": 1.48, + "grad_norm": 0.3269330756855203, + "learning_rate": 3.4944294669076685e-05, + "loss": 1.0406, + "step": 15452 + }, + { + "epoch": 1.48, + "grad_norm": 0.34504211822745817, + "learning_rate": 3.493228125412709e-05, + "loss": 1.2263, + "step": 15453 + }, + { + "epoch": 1.48, + "grad_norm": 0.2719889115089771, + "learning_rate": 3.492026946753807e-05, + "loss": 1.035, + "step": 15454 + }, + { + "epoch": 1.48, + "grad_norm": 0.25087361895659704, + "learning_rate": 3.490825930961018e-05, + "loss": 0.9708, + "step": 15455 + }, + { + "epoch": 1.48, + "grad_norm": 0.3411560705957845, + "learning_rate": 3.489625078064401e-05, + "loss": 0.9825, + "step": 15456 + }, + { + "epoch": 1.48, + "grad_norm": 0.31804920057930464, + "learning_rate": 3.4884243880940115e-05, + "loss": 1.0314, + "step": 15457 + }, + { + "epoch": 1.48, + "grad_norm": 0.2957566070429235, + "learning_rate": 3.487223861079891e-05, + "loss": 1.0328, + "step": 15458 + }, + { + "epoch": 1.48, + "grad_norm": 0.31619665786102735, + "learning_rate": 3.4860234970520866e-05, + "loss": 1.0479, + "step": 15459 + }, + { + "epoch": 1.48, + "grad_norm": 0.3088703049264611, + "learning_rate": 3.484823296040638e-05, + "loss": 0.8916, + "step": 15460 + }, + { + "epoch": 1.48, + "grad_norm": 0.34904620756132926, + "learning_rate": 3.483623258075583e-05, + "loss": 0.935, + "step": 15461 + }, + { + "epoch": 1.48, + "grad_norm": 0.3294739882827856, + "learning_rate": 3.48242338318695e-05, + "loss": 1.0085, + "step": 15462 + }, + { + "epoch": 1.48, + "grad_norm": 0.3186022412187682, + "learning_rate": 3.481223671404771e-05, + "loss": 1.0377, + "step": 15463 + }, + { + "epoch": 1.48, + "grad_norm": 0.27143311315674745, + "learning_rate": 3.4800241227590616e-05, + "loss": 0.9229, + "step": 15464 + }, + { + "epoch": 1.48, + "grad_norm": 0.27811079734700106, + "learning_rate": 3.4788247372798474e-05, + "loss": 0.8946, + "step": 15465 + }, + { + "epoch": 1.48, + "grad_norm": 0.32318776002330496, + "learning_rate": 3.477625514997142e-05, + "loss": 0.986, + "step": 15466 + }, + { + "epoch": 1.48, + "grad_norm": 0.3256645396157712, + "learning_rate": 3.476426455940961e-05, + "loss": 1.0963, + "step": 15467 + }, + { + "epoch": 1.48, + "grad_norm": 0.33269708461200254, + "learning_rate": 3.4752275601413045e-05, + "loss": 1.0314, + "step": 15468 + }, + { + "epoch": 1.48, + "grad_norm": 0.31773969243488065, + "learning_rate": 3.474028827628183e-05, + "loss": 1.0369, + "step": 15469 + }, + { + "epoch": 1.48, + "grad_norm": 0.28738714192254095, + "learning_rate": 3.4728302584315876e-05, + "loss": 1.1017, + "step": 15470 + }, + { + "epoch": 1.48, + "grad_norm": 0.30299728269681486, + "learning_rate": 3.4716318525815164e-05, + "loss": 0.9849, + "step": 15471 + }, + { + "epoch": 1.48, + "grad_norm": 0.3176642409054917, + "learning_rate": 3.470433610107962e-05, + "loss": 0.9037, + "step": 15472 + }, + { + "epoch": 1.48, + "grad_norm": 0.27289147466361996, + "learning_rate": 3.469235531040913e-05, + "loss": 0.9105, + "step": 15473 + }, + { + "epoch": 1.48, + "grad_norm": 0.30374445567282005, + "learning_rate": 3.468037615410346e-05, + "loss": 1.0233, + "step": 15474 + }, + { + "epoch": 1.48, + "grad_norm": 0.2713336868533708, + "learning_rate": 3.466839863246243e-05, + "loss": 1.1327, + "step": 15475 + }, + { + "epoch": 1.48, + "grad_norm": 0.3347632389484768, + "learning_rate": 3.465642274578581e-05, + "loss": 1.0945, + "step": 15476 + }, + { + "epoch": 1.48, + "grad_norm": 0.3293501819587789, + "learning_rate": 3.464444849437325e-05, + "loss": 1.1185, + "step": 15477 + }, + { + "epoch": 1.48, + "grad_norm": 0.2712499517223504, + "learning_rate": 3.4632475878524476e-05, + "loss": 1.0188, + "step": 15478 + }, + { + "epoch": 1.48, + "grad_norm": 0.30557691516309854, + "learning_rate": 3.462050489853901e-05, + "loss": 1.1314, + "step": 15479 + }, + { + "epoch": 1.48, + "grad_norm": 0.326795483911817, + "learning_rate": 3.460853555471656e-05, + "loss": 1.0182, + "step": 15480 + }, + { + "epoch": 1.48, + "grad_norm": 0.3566590119420042, + "learning_rate": 3.459656784735658e-05, + "loss": 1.0503, + "step": 15481 + }, + { + "epoch": 1.48, + "grad_norm": 0.32433653403926105, + "learning_rate": 3.458460177675862e-05, + "loss": 1.1172, + "step": 15482 + }, + { + "epoch": 1.48, + "grad_norm": 0.3443207933098826, + "learning_rate": 3.457263734322207e-05, + "loss": 1.0012, + "step": 15483 + }, + { + "epoch": 1.48, + "grad_norm": 0.25144396917987655, + "learning_rate": 3.4560674547046435e-05, + "loss": 1.125, + "step": 15484 + }, + { + "epoch": 1.48, + "grad_norm": 0.3317698524739587, + "learning_rate": 3.454871338853102e-05, + "loss": 0.9053, + "step": 15485 + }, + { + "epoch": 1.48, + "grad_norm": 0.3227879226923285, + "learning_rate": 3.453675386797517e-05, + "loss": 1.0218, + "step": 15486 + }, + { + "epoch": 1.48, + "grad_norm": 0.27265386576610884, + "learning_rate": 3.4524795985678205e-05, + "loss": 0.974, + "step": 15487 + }, + { + "epoch": 1.48, + "grad_norm": 0.3419308865598115, + "learning_rate": 3.451283974193941e-05, + "loss": 1.037, + "step": 15488 + }, + { + "epoch": 1.48, + "grad_norm": 0.31851655397417133, + "learning_rate": 3.450088513705791e-05, + "loss": 0.9484, + "step": 15489 + }, + { + "epoch": 1.48, + "grad_norm": 0.3056504824568811, + "learning_rate": 3.4488932171332975e-05, + "loss": 1.0211, + "step": 15490 + }, + { + "epoch": 1.48, + "grad_norm": 0.3113841481143988, + "learning_rate": 3.447698084506363e-05, + "loss": 0.9919, + "step": 15491 + }, + { + "epoch": 1.48, + "grad_norm": 0.3249694233221407, + "learning_rate": 3.4465031158549044e-05, + "loss": 0.9487, + "step": 15492 + }, + { + "epoch": 1.48, + "grad_norm": 0.3264256024031184, + "learning_rate": 3.4453083112088226e-05, + "loss": 0.9594, + "step": 15493 + }, + { + "epoch": 1.48, + "grad_norm": 0.2800561365120821, + "learning_rate": 3.4441136705980226e-05, + "loss": 1.0122, + "step": 15494 + }, + { + "epoch": 1.48, + "grad_norm": 0.34077566169599616, + "learning_rate": 3.442919194052395e-05, + "loss": 0.9622, + "step": 15495 + }, + { + "epoch": 1.48, + "grad_norm": 0.28844411848897805, + "learning_rate": 3.4417248816018366e-05, + "loss": 0.9397, + "step": 15496 + }, + { + "epoch": 1.48, + "grad_norm": 0.3212824463935046, + "learning_rate": 3.440530733276238e-05, + "loss": 1.0597, + "step": 15497 + }, + { + "epoch": 1.48, + "grad_norm": 0.34193291073083554, + "learning_rate": 3.439336749105476e-05, + "loss": 1.0394, + "step": 15498 + }, + { + "epoch": 1.48, + "grad_norm": 0.30050109489456117, + "learning_rate": 3.438142929119436e-05, + "loss": 1.0569, + "step": 15499 + }, + { + "epoch": 1.48, + "grad_norm": 0.32601787779005403, + "learning_rate": 3.436949273347992e-05, + "loss": 1.1012, + "step": 15500 + }, + { + "epoch": 1.48, + "grad_norm": 0.3038921046318048, + "learning_rate": 3.435755781821022e-05, + "loss": 1.0848, + "step": 15501 + }, + { + "epoch": 1.48, + "grad_norm": 0.3241663463605188, + "learning_rate": 3.434562454568385e-05, + "loss": 1.0678, + "step": 15502 + }, + { + "epoch": 1.48, + "grad_norm": 0.33231708108683095, + "learning_rate": 3.433369291619952e-05, + "loss": 0.9739, + "step": 15503 + }, + { + "epoch": 1.48, + "grad_norm": 0.3292334922553417, + "learning_rate": 3.432176293005576e-05, + "loss": 1.0144, + "step": 15504 + }, + { + "epoch": 1.48, + "grad_norm": 0.29428405687538484, + "learning_rate": 3.4309834587551195e-05, + "loss": 1.0057, + "step": 15505 + }, + { + "epoch": 1.48, + "grad_norm": 0.26659870191322316, + "learning_rate": 3.4297907888984234e-05, + "loss": 1.0128, + "step": 15506 + }, + { + "epoch": 1.48, + "grad_norm": 0.3472336850720002, + "learning_rate": 3.4285982834653494e-05, + "loss": 1.1028, + "step": 15507 + }, + { + "epoch": 1.48, + "grad_norm": 0.321059135134157, + "learning_rate": 3.42740594248573e-05, + "loss": 0.9187, + "step": 15508 + }, + { + "epoch": 1.48, + "grad_norm": 0.28125154189815704, + "learning_rate": 3.42621376598941e-05, + "loss": 1.0371, + "step": 15509 + }, + { + "epoch": 1.48, + "grad_norm": 0.3001262576919145, + "learning_rate": 3.425021754006219e-05, + "loss": 1.1606, + "step": 15510 + }, + { + "epoch": 1.48, + "grad_norm": 0.31168455866215655, + "learning_rate": 3.423829906565995e-05, + "loss": 0.8541, + "step": 15511 + }, + { + "epoch": 1.48, + "grad_norm": 0.32615541391774516, + "learning_rate": 3.422638223698557e-05, + "loss": 0.9765, + "step": 15512 + }, + { + "epoch": 1.48, + "grad_norm": 0.3341072575448335, + "learning_rate": 3.4214467054337305e-05, + "loss": 1.0392, + "step": 15513 + }, + { + "epoch": 1.48, + "grad_norm": 0.28520099176835645, + "learning_rate": 3.420255351801335e-05, + "loss": 1.0529, + "step": 15514 + }, + { + "epoch": 1.48, + "grad_norm": 0.24542995423515226, + "learning_rate": 3.419064162831187e-05, + "loss": 1.1144, + "step": 15515 + }, + { + "epoch": 1.48, + "grad_norm": 0.33826587089173227, + "learning_rate": 3.4178731385530925e-05, + "loss": 0.9995, + "step": 15516 + }, + { + "epoch": 1.48, + "grad_norm": 0.3166916579921224, + "learning_rate": 3.416682278996859e-05, + "loss": 1.1725, + "step": 15517 + }, + { + "epoch": 1.48, + "grad_norm": 0.2768942718853644, + "learning_rate": 3.415491584192292e-05, + "loss": 1.1043, + "step": 15518 + }, + { + "epoch": 1.48, + "grad_norm": 0.2893453637548575, + "learning_rate": 3.414301054169182e-05, + "loss": 0.9864, + "step": 15519 + }, + { + "epoch": 1.48, + "grad_norm": 0.30630614903714304, + "learning_rate": 3.4131106889573285e-05, + "loss": 1.0026, + "step": 15520 + }, + { + "epoch": 1.48, + "grad_norm": 0.30058463261433316, + "learning_rate": 3.411920488586519e-05, + "loss": 1.0891, + "step": 15521 + }, + { + "epoch": 1.49, + "grad_norm": 0.2990516231356582, + "learning_rate": 3.4107304530865434e-05, + "loss": 1.0829, + "step": 15522 + }, + { + "epoch": 1.49, + "grad_norm": 0.3637852876555222, + "learning_rate": 3.409540582487176e-05, + "loss": 1.1285, + "step": 15523 + }, + { + "epoch": 1.49, + "grad_norm": 0.2771200654529449, + "learning_rate": 3.4083508768182e-05, + "loss": 1.1274, + "step": 15524 + }, + { + "epoch": 1.49, + "grad_norm": 0.28364163827274286, + "learning_rate": 3.4071613361093835e-05, + "loss": 1.02, + "step": 15525 + }, + { + "epoch": 1.49, + "grad_norm": 0.30312905197042206, + "learning_rate": 3.405971960390498e-05, + "loss": 1.0177, + "step": 15526 + }, + { + "epoch": 1.49, + "grad_norm": 0.2804155030421263, + "learning_rate": 3.404782749691309e-05, + "loss": 0.9728, + "step": 15527 + }, + { + "epoch": 1.49, + "grad_norm": 0.30852558495562693, + "learning_rate": 3.403593704041579e-05, + "loss": 0.9627, + "step": 15528 + }, + { + "epoch": 1.49, + "grad_norm": 0.34803456649331815, + "learning_rate": 3.402404823471059e-05, + "loss": 1.1003, + "step": 15529 + }, + { + "epoch": 1.49, + "grad_norm": 0.3035621836373041, + "learning_rate": 3.401216108009508e-05, + "loss": 0.9844, + "step": 15530 + }, + { + "epoch": 1.49, + "grad_norm": 0.3025381819096346, + "learning_rate": 3.400027557686668e-05, + "loss": 1.0132, + "step": 15531 + }, + { + "epoch": 1.49, + "grad_norm": 0.3545638441210665, + "learning_rate": 3.39883917253229e-05, + "loss": 0.9927, + "step": 15532 + }, + { + "epoch": 1.49, + "grad_norm": 0.2769401504436889, + "learning_rate": 3.3976509525761035e-05, + "loss": 1.0612, + "step": 15533 + }, + { + "epoch": 1.49, + "grad_norm": 0.3268942073802195, + "learning_rate": 3.3964628978478576e-05, + "loss": 1.0448, + "step": 15534 + }, + { + "epoch": 1.49, + "grad_norm": 0.30454088177752636, + "learning_rate": 3.3952750083772745e-05, + "loss": 0.9919, + "step": 15535 + }, + { + "epoch": 1.49, + "grad_norm": 0.3144583747890368, + "learning_rate": 3.39408728419409e-05, + "loss": 1.0541, + "step": 15536 + }, + { + "epoch": 1.49, + "grad_norm": 0.32313598354913864, + "learning_rate": 3.392899725328018e-05, + "loss": 0.9877, + "step": 15537 + }, + { + "epoch": 1.49, + "grad_norm": 0.3244461516543057, + "learning_rate": 3.391712331808783e-05, + "loss": 1.1037, + "step": 15538 + }, + { + "epoch": 1.49, + "grad_norm": 0.3511324896464833, + "learning_rate": 3.390525103666104e-05, + "loss": 0.9031, + "step": 15539 + }, + { + "epoch": 1.49, + "grad_norm": 0.36181449903905316, + "learning_rate": 3.389338040929684e-05, + "loss": 1.0199, + "step": 15540 + }, + { + "epoch": 1.49, + "grad_norm": 0.31746378479536125, + "learning_rate": 3.388151143629235e-05, + "loss": 0.9516, + "step": 15541 + }, + { + "epoch": 1.49, + "grad_norm": 0.2727265556695842, + "learning_rate": 3.386964411794458e-05, + "loss": 0.9778, + "step": 15542 + }, + { + "epoch": 1.49, + "grad_norm": 0.3528622276919614, + "learning_rate": 3.385777845455056e-05, + "loss": 1.0413, + "step": 15543 + }, + { + "epoch": 1.49, + "grad_norm": 0.32725248837030635, + "learning_rate": 3.384591444640718e-05, + "loss": 1.0588, + "step": 15544 + }, + { + "epoch": 1.49, + "grad_norm": 0.3190600250957635, + "learning_rate": 3.3834052093811397e-05, + "loss": 1.103, + "step": 15545 + }, + { + "epoch": 1.49, + "grad_norm": 0.32565840144816244, + "learning_rate": 3.382219139706001e-05, + "loss": 1.0075, + "step": 15546 + }, + { + "epoch": 1.49, + "grad_norm": 0.2945152114167367, + "learning_rate": 3.3810332356449867e-05, + "loss": 1.1098, + "step": 15547 + }, + { + "epoch": 1.49, + "grad_norm": 0.311445770420042, + "learning_rate": 3.3798474972277774e-05, + "loss": 1.0333, + "step": 15548 + }, + { + "epoch": 1.49, + "grad_norm": 0.32007438614935324, + "learning_rate": 3.378661924484047e-05, + "loss": 1.0764, + "step": 15549 + }, + { + "epoch": 1.49, + "grad_norm": 0.3414794391829299, + "learning_rate": 3.377476517443461e-05, + "loss": 1.0865, + "step": 15550 + }, + { + "epoch": 1.49, + "grad_norm": 0.34146793635682754, + "learning_rate": 3.3762912761356894e-05, + "loss": 0.9722, + "step": 15551 + }, + { + "epoch": 1.49, + "grad_norm": 0.371382736097041, + "learning_rate": 3.3751062005903886e-05, + "loss": 1.035, + "step": 15552 + }, + { + "epoch": 1.49, + "grad_norm": 0.3074193485591952, + "learning_rate": 3.373921290837218e-05, + "loss": 1.008, + "step": 15553 + }, + { + "epoch": 1.49, + "grad_norm": 0.2981492541896228, + "learning_rate": 3.372736546905833e-05, + "loss": 1.0797, + "step": 15554 + }, + { + "epoch": 1.49, + "grad_norm": 0.3065615251239551, + "learning_rate": 3.3715519688258835e-05, + "loss": 1.0251, + "step": 15555 + }, + { + "epoch": 1.49, + "grad_norm": 0.30149773960224346, + "learning_rate": 3.3703675566270086e-05, + "loss": 1.1946, + "step": 15556 + }, + { + "epoch": 1.49, + "grad_norm": 0.3368432614243509, + "learning_rate": 3.369183310338855e-05, + "loss": 1.0328, + "step": 15557 + }, + { + "epoch": 1.49, + "grad_norm": 0.3326607931958618, + "learning_rate": 3.3679992299910535e-05, + "loss": 0.9773, + "step": 15558 + }, + { + "epoch": 1.49, + "grad_norm": 0.30015503683688044, + "learning_rate": 3.3668153156132386e-05, + "loss": 1.107, + "step": 15559 + }, + { + "epoch": 1.49, + "grad_norm": 0.3016709417820529, + "learning_rate": 3.3656315672350406e-05, + "loss": 0.929, + "step": 15560 + }, + { + "epoch": 1.49, + "grad_norm": 0.3046871028640248, + "learning_rate": 3.364447984886081e-05, + "loss": 1.0111, + "step": 15561 + }, + { + "epoch": 1.49, + "grad_norm": 0.320152824366245, + "learning_rate": 3.363264568595985e-05, + "loss": 1.1055, + "step": 15562 + }, + { + "epoch": 1.49, + "grad_norm": 0.28433427274190815, + "learning_rate": 3.36208131839436e-05, + "loss": 1.0757, + "step": 15563 + }, + { + "epoch": 1.49, + "grad_norm": 0.29903746539093845, + "learning_rate": 3.3608982343108266e-05, + "loss": 1.0006, + "step": 15564 + }, + { + "epoch": 1.49, + "grad_norm": 0.31767679722723985, + "learning_rate": 3.359715316374983e-05, + "loss": 0.9598, + "step": 15565 + }, + { + "epoch": 1.49, + "grad_norm": 0.2822611139733951, + "learning_rate": 3.35853256461644e-05, + "loss": 1.0522, + "step": 15566 + }, + { + "epoch": 1.49, + "grad_norm": 0.31809208065749894, + "learning_rate": 3.3573499790647865e-05, + "loss": 0.9834, + "step": 15567 + }, + { + "epoch": 1.49, + "grad_norm": 0.31244768680648394, + "learning_rate": 3.356167559749632e-05, + "loss": 1.0675, + "step": 15568 + }, + { + "epoch": 1.49, + "grad_norm": 0.3431534893772137, + "learning_rate": 3.3549853067005565e-05, + "loss": 1.1167, + "step": 15569 + }, + { + "epoch": 1.49, + "grad_norm": 0.3219457752893069, + "learning_rate": 3.353803219947153e-05, + "loss": 1.0415, + "step": 15570 + }, + { + "epoch": 1.49, + "grad_norm": 0.2838094497179779, + "learning_rate": 3.352621299518998e-05, + "loss": 1.032, + "step": 15571 + }, + { + "epoch": 1.49, + "grad_norm": 0.32594953771213026, + "learning_rate": 3.351439545445676e-05, + "loss": 1.0339, + "step": 15572 + }, + { + "epoch": 1.49, + "grad_norm": 0.33118979945032195, + "learning_rate": 3.350257957756754e-05, + "loss": 0.9898, + "step": 15573 + }, + { + "epoch": 1.49, + "grad_norm": 0.26891565721710087, + "learning_rate": 3.3490765364818065e-05, + "loss": 0.9845, + "step": 15574 + }, + { + "epoch": 1.49, + "grad_norm": 0.28155890755473356, + "learning_rate": 3.347895281650398e-05, + "loss": 1.0916, + "step": 15575 + }, + { + "epoch": 1.49, + "grad_norm": 0.34958238217671866, + "learning_rate": 3.3467141932920945e-05, + "loss": 1.0904, + "step": 15576 + }, + { + "epoch": 1.49, + "grad_norm": 0.3243259038212818, + "learning_rate": 3.345533271436446e-05, + "loss": 1.0164, + "step": 15577 + }, + { + "epoch": 1.49, + "grad_norm": 0.2978635432531241, + "learning_rate": 3.3443525161130096e-05, + "loss": 1.0074, + "step": 15578 + }, + { + "epoch": 1.49, + "grad_norm": 0.26654678562634704, + "learning_rate": 3.343171927351338e-05, + "loss": 1.0674, + "step": 15579 + }, + { + "epoch": 1.49, + "grad_norm": 0.27981869431448314, + "learning_rate": 3.3419915051809695e-05, + "loss": 0.9767, + "step": 15580 + }, + { + "epoch": 1.49, + "grad_norm": 0.2590451249014118, + "learning_rate": 3.340811249631447e-05, + "loss": 1.0347, + "step": 15581 + }, + { + "epoch": 1.49, + "grad_norm": 0.31221391088486034, + "learning_rate": 3.339631160732309e-05, + "loss": 0.9983, + "step": 15582 + }, + { + "epoch": 1.49, + "grad_norm": 0.29776710956985364, + "learning_rate": 3.3384512385130904e-05, + "loss": 0.9284, + "step": 15583 + }, + { + "epoch": 1.49, + "grad_norm": 0.3063865551743891, + "learning_rate": 3.337271483003312e-05, + "loss": 0.9745, + "step": 15584 + }, + { + "epoch": 1.49, + "grad_norm": 0.32296415617070456, + "learning_rate": 3.336091894232506e-05, + "loss": 0.9988, + "step": 15585 + }, + { + "epoch": 1.49, + "grad_norm": 0.3088453337711893, + "learning_rate": 3.334912472230185e-05, + "loss": 1.0385, + "step": 15586 + }, + { + "epoch": 1.49, + "grad_norm": 0.28944948494619965, + "learning_rate": 3.3337332170258695e-05, + "loss": 1.0064, + "step": 15587 + }, + { + "epoch": 1.49, + "grad_norm": 0.3167528163507865, + "learning_rate": 3.332554128649068e-05, + "loss": 1.0157, + "step": 15588 + }, + { + "epoch": 1.49, + "grad_norm": 0.31477362419672134, + "learning_rate": 3.3313752071292935e-05, + "loss": 1.0353, + "step": 15589 + }, + { + "epoch": 1.49, + "grad_norm": 0.3187441167993366, + "learning_rate": 3.330196452496043e-05, + "loss": 1.0607, + "step": 15590 + }, + { + "epoch": 1.49, + "grad_norm": 0.38232846137741905, + "learning_rate": 3.329017864778821e-05, + "loss": 1.0424, + "step": 15591 + }, + { + "epoch": 1.49, + "grad_norm": 0.3127217976558388, + "learning_rate": 3.3278394440071157e-05, + "loss": 0.9815, + "step": 15592 + }, + { + "epoch": 1.49, + "grad_norm": 0.29574806378712243, + "learning_rate": 3.326661190210426e-05, + "loss": 0.994, + "step": 15593 + }, + { + "epoch": 1.49, + "grad_norm": 0.30480196812560073, + "learning_rate": 3.325483103418226e-05, + "loss": 1.0889, + "step": 15594 + }, + { + "epoch": 1.49, + "grad_norm": 0.3320129468282981, + "learning_rate": 3.324305183660014e-05, + "loss": 0.9619, + "step": 15595 + }, + { + "epoch": 1.49, + "grad_norm": 0.27099996728082926, + "learning_rate": 3.323127430965256e-05, + "loss": 1.166, + "step": 15596 + }, + { + "epoch": 1.49, + "grad_norm": 0.3093458698629176, + "learning_rate": 3.321949845363435e-05, + "loss": 1.0675, + "step": 15597 + }, + { + "epoch": 1.49, + "grad_norm": 0.30604707135130593, + "learning_rate": 3.3207724268840115e-05, + "loss": 0.9455, + "step": 15598 + }, + { + "epoch": 1.49, + "grad_norm": 0.33255781869646267, + "learning_rate": 3.319595175556456e-05, + "loss": 1.0697, + "step": 15599 + }, + { + "epoch": 1.49, + "grad_norm": 0.3034583629724767, + "learning_rate": 3.3184180914102336e-05, + "loss": 1.0361, + "step": 15600 + }, + { + "epoch": 1.49, + "grad_norm": 0.33193094304073884, + "learning_rate": 3.3172411744747937e-05, + "loss": 1.0935, + "step": 15601 + }, + { + "epoch": 1.49, + "grad_norm": 0.30998951334374014, + "learning_rate": 3.3160644247795934e-05, + "loss": 1.1618, + "step": 15602 + }, + { + "epoch": 1.49, + "grad_norm": 0.30189435896873695, + "learning_rate": 3.314887842354082e-05, + "loss": 0.9468, + "step": 15603 + }, + { + "epoch": 1.49, + "grad_norm": 0.347326229968042, + "learning_rate": 3.313711427227707e-05, + "loss": 0.964, + "step": 15604 + }, + { + "epoch": 1.49, + "grad_norm": 0.29818348048412896, + "learning_rate": 3.312535179429902e-05, + "loss": 0.9874, + "step": 15605 + }, + { + "epoch": 1.49, + "grad_norm": 0.3027985349041424, + "learning_rate": 3.3113590989901114e-05, + "loss": 1.052, + "step": 15606 + }, + { + "epoch": 1.49, + "grad_norm": 0.3668509101372782, + "learning_rate": 3.3101831859377585e-05, + "loss": 1.0717, + "step": 15607 + }, + { + "epoch": 1.49, + "grad_norm": 0.3205864614553636, + "learning_rate": 3.309007440302276e-05, + "loss": 1.0846, + "step": 15608 + }, + { + "epoch": 1.49, + "grad_norm": 0.30403407017162576, + "learning_rate": 3.307831862113088e-05, + "loss": 1.0655, + "step": 15609 + }, + { + "epoch": 1.49, + "grad_norm": 0.3022408480930254, + "learning_rate": 3.3066564513996155e-05, + "loss": 1.0295, + "step": 15610 + }, + { + "epoch": 1.49, + "grad_norm": 0.30270769159454275, + "learning_rate": 3.30548120819127e-05, + "loss": 0.9296, + "step": 15611 + }, + { + "epoch": 1.49, + "grad_norm": 0.35684397201514584, + "learning_rate": 3.304306132517468e-05, + "loss": 0.9487, + "step": 15612 + }, + { + "epoch": 1.49, + "grad_norm": 0.28408226491924565, + "learning_rate": 3.3031312244076084e-05, + "loss": 0.9896, + "step": 15613 + }, + { + "epoch": 1.49, + "grad_norm": 0.3145681618798358, + "learning_rate": 3.3019564838911e-05, + "loss": 0.9984, + "step": 15614 + }, + { + "epoch": 1.49, + "grad_norm": 0.2980676387020889, + "learning_rate": 3.3007819109973396e-05, + "loss": 1.0959, + "step": 15615 + }, + { + "epoch": 1.49, + "grad_norm": 0.253937514673457, + "learning_rate": 3.299607505755726e-05, + "loss": 0.9629, + "step": 15616 + }, + { + "epoch": 1.49, + "grad_norm": 0.3075123010836788, + "learning_rate": 3.298433268195642e-05, + "loss": 1.0755, + "step": 15617 + }, + { + "epoch": 1.49, + "grad_norm": 0.3487714493776339, + "learning_rate": 3.297259198346481e-05, + "loss": 0.9441, + "step": 15618 + }, + { + "epoch": 1.49, + "grad_norm": 0.32549151273989985, + "learning_rate": 3.296085296237617e-05, + "loss": 0.9207, + "step": 15619 + }, + { + "epoch": 1.49, + "grad_norm": 0.3578589376620059, + "learning_rate": 3.2949115618984316e-05, + "loss": 1.1552, + "step": 15620 + }, + { + "epoch": 1.49, + "grad_norm": 0.299681468017544, + "learning_rate": 3.2937379953582994e-05, + "loss": 1.006, + "step": 15621 + }, + { + "epoch": 1.49, + "grad_norm": 0.30407609378805134, + "learning_rate": 3.29256459664659e-05, + "loss": 0.9784, + "step": 15622 + }, + { + "epoch": 1.49, + "grad_norm": 0.28053278067243903, + "learning_rate": 3.2913913657926654e-05, + "loss": 1.0098, + "step": 15623 + }, + { + "epoch": 1.49, + "grad_norm": 0.2964228280230065, + "learning_rate": 3.290218302825888e-05, + "loss": 1.0802, + "step": 15624 + }, + { + "epoch": 1.49, + "grad_norm": 0.32427324123679274, + "learning_rate": 3.2890454077756175e-05, + "loss": 0.9426, + "step": 15625 + }, + { + "epoch": 1.49, + "grad_norm": 0.34010825172976156, + "learning_rate": 3.2878726806712e-05, + "loss": 0.908, + "step": 15626 + }, + { + "epoch": 1.5, + "grad_norm": 0.31405381508608177, + "learning_rate": 3.2867001215419904e-05, + "loss": 1.0129, + "step": 15627 + }, + { + "epoch": 1.5, + "grad_norm": 0.3099198377188622, + "learning_rate": 3.2855277304173225e-05, + "loss": 1.0787, + "step": 15628 + }, + { + "epoch": 1.5, + "grad_norm": 0.3197590968864668, + "learning_rate": 3.28435550732655e-05, + "loss": 1.026, + "step": 15629 + }, + { + "epoch": 1.5, + "grad_norm": 0.2939090767477563, + "learning_rate": 3.283183452298998e-05, + "loss": 1.0725, + "step": 15630 + }, + { + "epoch": 1.5, + "grad_norm": 0.29618987403934405, + "learning_rate": 3.282011565364005e-05, + "loss": 1.0522, + "step": 15631 + }, + { + "epoch": 1.5, + "grad_norm": 0.29760831694813555, + "learning_rate": 3.2808398465508915e-05, + "loss": 1.0838, + "step": 15632 + }, + { + "epoch": 1.5, + "grad_norm": 0.3330776919310747, + "learning_rate": 3.279668295888987e-05, + "loss": 1.1215, + "step": 15633 + }, + { + "epoch": 1.5, + "grad_norm": 0.3104188033322878, + "learning_rate": 3.2784969134076026e-05, + "loss": 1.041, + "step": 15634 + }, + { + "epoch": 1.5, + "grad_norm": 0.33353078696359084, + "learning_rate": 3.277325699136059e-05, + "loss": 1.0367, + "step": 15635 + }, + { + "epoch": 1.5, + "grad_norm": 0.30757060664907854, + "learning_rate": 3.2761546531036634e-05, + "loss": 0.888, + "step": 15636 + }, + { + "epoch": 1.5, + "grad_norm": 0.31614253058402375, + "learning_rate": 3.2749837753397274e-05, + "loss": 0.9051, + "step": 15637 + }, + { + "epoch": 1.5, + "grad_norm": 0.278307401754329, + "learning_rate": 3.2738130658735456e-05, + "loss": 1.0672, + "step": 15638 + }, + { + "epoch": 1.5, + "grad_norm": 0.29873984011175253, + "learning_rate": 3.272642524734422e-05, + "loss": 1.1087, + "step": 15639 + }, + { + "epoch": 1.5, + "grad_norm": 0.295723605618044, + "learning_rate": 3.271472151951645e-05, + "loss": 1.0804, + "step": 15640 + }, + { + "epoch": 1.5, + "grad_norm": 0.3262763877780443, + "learning_rate": 3.2703019475545046e-05, + "loss": 0.9831, + "step": 15641 + }, + { + "epoch": 1.5, + "grad_norm": 0.3509902762417111, + "learning_rate": 3.269131911572288e-05, + "loss": 1.0484, + "step": 15642 + }, + { + "epoch": 1.5, + "grad_norm": 0.32372191452723953, + "learning_rate": 3.267962044034278e-05, + "loss": 1.0487, + "step": 15643 + }, + { + "epoch": 1.5, + "grad_norm": 0.33072215740744904, + "learning_rate": 3.266792344969747e-05, + "loss": 1.1413, + "step": 15644 + }, + { + "epoch": 1.5, + "grad_norm": 0.3185486048362475, + "learning_rate": 3.265622814407969e-05, + "loss": 0.9661, + "step": 15645 + }, + { + "epoch": 1.5, + "grad_norm": 0.34247096685170736, + "learning_rate": 3.264453452378216e-05, + "loss": 1.0397, + "step": 15646 + }, + { + "epoch": 1.5, + "grad_norm": 0.36360343193948724, + "learning_rate": 3.2632842589097434e-05, + "loss": 1.0754, + "step": 15647 + }, + { + "epoch": 1.5, + "grad_norm": 0.317626203030418, + "learning_rate": 3.2621152340318174e-05, + "loss": 1.0258, + "step": 15648 + }, + { + "epoch": 1.5, + "grad_norm": 0.3565561323335305, + "learning_rate": 3.260946377773692e-05, + "loss": 0.9879, + "step": 15649 + }, + { + "epoch": 1.5, + "grad_norm": 0.3423159105668498, + "learning_rate": 3.2597776901646226e-05, + "loss": 0.9547, + "step": 15650 + }, + { + "epoch": 1.5, + "grad_norm": 0.3301420822740786, + "learning_rate": 3.2586091712338485e-05, + "loss": 0.9789, + "step": 15651 + }, + { + "epoch": 1.5, + "grad_norm": 0.30671074864526593, + "learning_rate": 3.257440821010621e-05, + "loss": 1.0303, + "step": 15652 + }, + { + "epoch": 1.5, + "grad_norm": 0.3144873464415325, + "learning_rate": 3.2562726395241704e-05, + "loss": 1.1344, + "step": 15653 + }, + { + "epoch": 1.5, + "grad_norm": 0.36270384577467535, + "learning_rate": 3.255104626803739e-05, + "loss": 1.0232, + "step": 15654 + }, + { + "epoch": 1.5, + "grad_norm": 0.34213154605101703, + "learning_rate": 3.2539367828785464e-05, + "loss": 1.0543, + "step": 15655 + }, + { + "epoch": 1.5, + "grad_norm": 0.316206795126857, + "learning_rate": 3.2527691077778325e-05, + "loss": 1.1127, + "step": 15656 + }, + { + "epoch": 1.5, + "grad_norm": 0.3237235361883939, + "learning_rate": 3.251601601530808e-05, + "loss": 1.0641, + "step": 15657 + }, + { + "epoch": 1.5, + "grad_norm": 0.32160649707711164, + "learning_rate": 3.2504342641666985e-05, + "loss": 1.0076, + "step": 15658 + }, + { + "epoch": 1.5, + "grad_norm": 0.30567100420995824, + "learning_rate": 3.24926709571471e-05, + "loss": 0.906, + "step": 15659 + }, + { + "epoch": 1.5, + "grad_norm": 0.3345031250076558, + "learning_rate": 3.2481000962040556e-05, + "loss": 1.065, + "step": 15660 + }, + { + "epoch": 1.5, + "grad_norm": 0.3290010141792721, + "learning_rate": 3.246933265663943e-05, + "loss": 1.0003, + "step": 15661 + }, + { + "epoch": 1.5, + "grad_norm": 0.3264166914719706, + "learning_rate": 3.245766604123566e-05, + "loss": 1.0932, + "step": 15662 + }, + { + "epoch": 1.5, + "grad_norm": 0.33528700502140213, + "learning_rate": 3.244600111612124e-05, + "loss": 1.1825, + "step": 15663 + }, + { + "epoch": 1.5, + "grad_norm": 0.2994595848277191, + "learning_rate": 3.243433788158811e-05, + "loss": 1.0018, + "step": 15664 + }, + { + "epoch": 1.5, + "grad_norm": 0.3184630210346875, + "learning_rate": 3.2422676337928156e-05, + "loss": 1.0304, + "step": 15665 + }, + { + "epoch": 1.5, + "grad_norm": 0.29594903550955487, + "learning_rate": 3.2411016485433175e-05, + "loss": 1.0588, + "step": 15666 + }, + { + "epoch": 1.5, + "grad_norm": 0.2968699917482149, + "learning_rate": 3.239935832439502e-05, + "loss": 0.9964, + "step": 15667 + }, + { + "epoch": 1.5, + "grad_norm": 0.31554106595592074, + "learning_rate": 3.2387701855105365e-05, + "loss": 0.947, + "step": 15668 + }, + { + "epoch": 1.5, + "grad_norm": 0.3393088088393385, + "learning_rate": 3.2376047077855973e-05, + "loss": 1.0153, + "step": 15669 + }, + { + "epoch": 1.5, + "grad_norm": 0.30322102563035047, + "learning_rate": 3.2364393992938505e-05, + "loss": 1.1216, + "step": 15670 + }, + { + "epoch": 1.5, + "grad_norm": 0.28947926550540753, + "learning_rate": 3.2352742600644616e-05, + "loss": 1.0092, + "step": 15671 + }, + { + "epoch": 1.5, + "grad_norm": 0.37282735692105773, + "learning_rate": 3.234109290126582e-05, + "loss": 1.0265, + "step": 15672 + }, + { + "epoch": 1.5, + "grad_norm": 0.3092338098201103, + "learning_rate": 3.232944489509373e-05, + "loss": 1.111, + "step": 15673 + }, + { + "epoch": 1.5, + "grad_norm": 0.37507226990934245, + "learning_rate": 3.231779858241978e-05, + "loss": 0.9355, + "step": 15674 + }, + { + "epoch": 1.5, + "grad_norm": 0.3331578672123962, + "learning_rate": 3.2306153963535455e-05, + "loss": 0.9297, + "step": 15675 + }, + { + "epoch": 1.5, + "grad_norm": 0.32488180353738455, + "learning_rate": 3.2294511038732164e-05, + "loss": 0.9398, + "step": 15676 + }, + { + "epoch": 1.5, + "grad_norm": 0.31445393264279176, + "learning_rate": 3.228286980830133e-05, + "loss": 0.9797, + "step": 15677 + }, + { + "epoch": 1.5, + "grad_norm": 0.34634680593242323, + "learning_rate": 3.2271230272534205e-05, + "loss": 0.9951, + "step": 15678 + }, + { + "epoch": 1.5, + "eval_loss": 1.1253540515899658, + "eval_runtime": 4227.4598, + "eval_samples_per_second": 19.78, + "eval_steps_per_second": 2.473, + "step": 15678 + }, + { + "epoch": 1.5, + "grad_norm": 0.35655998841372444, + "learning_rate": 3.225959243172214e-05, + "loss": 1.0246, + "step": 15679 + }, + { + "epoch": 1.5, + "grad_norm": 0.29439311998884515, + "learning_rate": 3.224795628615631e-05, + "loss": 0.9545, + "step": 15680 + }, + { + "epoch": 1.5, + "grad_norm": 0.2986321612483612, + "learning_rate": 3.223632183612796e-05, + "loss": 1.0411, + "step": 15681 + }, + { + "epoch": 1.5, + "grad_norm": 0.32868373124055644, + "learning_rate": 3.222468908192824e-05, + "loss": 1.138, + "step": 15682 + }, + { + "epoch": 1.5, + "grad_norm": 0.31032151565496763, + "learning_rate": 3.221305802384831e-05, + "loss": 1.0452, + "step": 15683 + }, + { + "epoch": 1.5, + "grad_norm": 0.27215645674496824, + "learning_rate": 3.2201428662179156e-05, + "loss": 0.9939, + "step": 15684 + }, + { + "epoch": 1.5, + "grad_norm": 0.3663405477342002, + "learning_rate": 3.218980099721187e-05, + "loss": 1.1486, + "step": 15685 + }, + { + "epoch": 1.5, + "grad_norm": 0.37277117525152825, + "learning_rate": 3.2178175029237464e-05, + "loss": 1.0356, + "step": 15686 + }, + { + "epoch": 1.5, + "grad_norm": 0.31443031483740796, + "learning_rate": 3.216655075854683e-05, + "loss": 1.0969, + "step": 15687 + }, + { + "epoch": 1.5, + "grad_norm": 0.3777426687521151, + "learning_rate": 3.215492818543091e-05, + "loss": 1.0406, + "step": 15688 + }, + { + "epoch": 1.5, + "grad_norm": 0.2861768801107155, + "learning_rate": 3.2143307310180504e-05, + "loss": 1.0039, + "step": 15689 + }, + { + "epoch": 1.5, + "grad_norm": 0.3325779605860082, + "learning_rate": 3.2131688133086525e-05, + "loss": 1.0516, + "step": 15690 + }, + { + "epoch": 1.5, + "grad_norm": 0.34598250169292566, + "learning_rate": 3.212007065443968e-05, + "loss": 1.0775, + "step": 15691 + }, + { + "epoch": 1.5, + "grad_norm": 0.31126150707607136, + "learning_rate": 3.210845487453077e-05, + "loss": 1.0667, + "step": 15692 + }, + { + "epoch": 1.5, + "grad_norm": 0.3468709681579207, + "learning_rate": 3.209684079365039e-05, + "loss": 0.8854, + "step": 15693 + }, + { + "epoch": 1.5, + "grad_norm": 0.317412615502718, + "learning_rate": 3.208522841208929e-05, + "loss": 1.0, + "step": 15694 + }, + { + "epoch": 1.5, + "grad_norm": 0.3025088087919243, + "learning_rate": 3.207361773013801e-05, + "loss": 0.9918, + "step": 15695 + }, + { + "epoch": 1.5, + "grad_norm": 0.31557415836435604, + "learning_rate": 3.2062008748087126e-05, + "loss": 1.0316, + "step": 15696 + }, + { + "epoch": 1.5, + "grad_norm": 0.31027137481444145, + "learning_rate": 3.205040146622718e-05, + "loss": 0.9678, + "step": 15697 + }, + { + "epoch": 1.5, + "grad_norm": 0.27555457969246777, + "learning_rate": 3.203879588484866e-05, + "loss": 0.982, + "step": 15698 + }, + { + "epoch": 1.5, + "grad_norm": 0.2940594084965663, + "learning_rate": 3.202719200424196e-05, + "loss": 1.0588, + "step": 15699 + }, + { + "epoch": 1.5, + "grad_norm": 0.3277805917762819, + "learning_rate": 3.2015589824697524e-05, + "loss": 0.8998, + "step": 15700 + }, + { + "epoch": 1.5, + "grad_norm": 0.2918598575132349, + "learning_rate": 3.2003989346505645e-05, + "loss": 1.0208, + "step": 15701 + }, + { + "epoch": 1.5, + "grad_norm": 0.30905220875402295, + "learning_rate": 3.199239056995669e-05, + "loss": 1.0345, + "step": 15702 + }, + { + "epoch": 1.5, + "grad_norm": 0.33993402118461236, + "learning_rate": 3.1980793495340874e-05, + "loss": 1.0707, + "step": 15703 + }, + { + "epoch": 1.5, + "grad_norm": 0.3037913826592507, + "learning_rate": 3.19691981229485e-05, + "loss": 1.0129, + "step": 15704 + }, + { + "epoch": 1.5, + "grad_norm": 0.2922453602059969, + "learning_rate": 3.1957604453069654e-05, + "loss": 1.0041, + "step": 15705 + }, + { + "epoch": 1.5, + "grad_norm": 0.32891556775007896, + "learning_rate": 3.194601248599453e-05, + "loss": 1.0865, + "step": 15706 + }, + { + "epoch": 1.5, + "grad_norm": 0.29805467839785754, + "learning_rate": 3.193442222201324e-05, + "loss": 1.0788, + "step": 15707 + }, + { + "epoch": 1.5, + "grad_norm": 0.32281700513763695, + "learning_rate": 3.192283366141578e-05, + "loss": 0.9937, + "step": 15708 + }, + { + "epoch": 1.5, + "grad_norm": 0.34293813291960207, + "learning_rate": 3.19112468044922e-05, + "loss": 1.0649, + "step": 15709 + }, + { + "epoch": 1.5, + "grad_norm": 0.3537466605176951, + "learning_rate": 3.189966165153245e-05, + "loss": 0.9039, + "step": 15710 + }, + { + "epoch": 1.5, + "grad_norm": 0.31729548080790504, + "learning_rate": 3.188807820282651e-05, + "loss": 1.0928, + "step": 15711 + }, + { + "epoch": 1.5, + "grad_norm": 0.2946554761849703, + "learning_rate": 3.1876496458664175e-05, + "loss": 1.0944, + "step": 15712 + }, + { + "epoch": 1.5, + "grad_norm": 0.2724628854694174, + "learning_rate": 3.186491641933537e-05, + "loss": 1.0305, + "step": 15713 + }, + { + "epoch": 1.5, + "grad_norm": 0.33654654818989577, + "learning_rate": 3.1853338085129826e-05, + "loss": 1.0026, + "step": 15714 + }, + { + "epoch": 1.5, + "grad_norm": 0.2913485164676789, + "learning_rate": 3.184176145633735e-05, + "loss": 0.9378, + "step": 15715 + }, + { + "epoch": 1.5, + "grad_norm": 0.34059796058264846, + "learning_rate": 3.1830186533247566e-05, + "loss": 1.053, + "step": 15716 + }, + { + "epoch": 1.5, + "grad_norm": 0.34862748842933844, + "learning_rate": 3.181861331615026e-05, + "loss": 1.0508, + "step": 15717 + }, + { + "epoch": 1.5, + "grad_norm": 0.31838357326077926, + "learning_rate": 3.1807041805334994e-05, + "loss": 0.9999, + "step": 15718 + }, + { + "epoch": 1.5, + "grad_norm": 0.3071139806786451, + "learning_rate": 3.1795472001091386e-05, + "loss": 0.9563, + "step": 15719 + }, + { + "epoch": 1.5, + "grad_norm": 0.36198317990821544, + "learning_rate": 3.1783903903708925e-05, + "loss": 0.9517, + "step": 15720 + }, + { + "epoch": 1.5, + "grad_norm": 0.3262212685415591, + "learning_rate": 3.177233751347717e-05, + "loss": 1.0264, + "step": 15721 + }, + { + "epoch": 1.5, + "grad_norm": 0.312241795386285, + "learning_rate": 3.1760772830685516e-05, + "loss": 0.9928, + "step": 15722 + }, + { + "epoch": 1.5, + "grad_norm": 0.3308589118601134, + "learning_rate": 3.1749209855623415e-05, + "loss": 0.9492, + "step": 15723 + }, + { + "epoch": 1.5, + "grad_norm": 0.35218861588337963, + "learning_rate": 3.173764858858023e-05, + "loss": 1.0205, + "step": 15724 + }, + { + "epoch": 1.5, + "grad_norm": 0.32589994506795694, + "learning_rate": 3.172608902984532e-05, + "loss": 1.0755, + "step": 15725 + }, + { + "epoch": 1.5, + "grad_norm": 0.35839239116467425, + "learning_rate": 3.171453117970791e-05, + "loss": 0.8869, + "step": 15726 + }, + { + "epoch": 1.5, + "grad_norm": 0.3344398141652277, + "learning_rate": 3.170297503845727e-05, + "loss": 1.1267, + "step": 15727 + }, + { + "epoch": 1.5, + "grad_norm": 0.3034831253599235, + "learning_rate": 3.169142060638264e-05, + "loss": 0.9735, + "step": 15728 + }, + { + "epoch": 1.5, + "grad_norm": 0.30930618644539354, + "learning_rate": 3.167986788377311e-05, + "loss": 0.986, + "step": 15729 + }, + { + "epoch": 1.5, + "grad_norm": 0.3205500159191851, + "learning_rate": 3.166831687091781e-05, + "loss": 1.0177, + "step": 15730 + }, + { + "epoch": 1.5, + "grad_norm": 0.34752996494789096, + "learning_rate": 3.165676756810584e-05, + "loss": 1.1042, + "step": 15731 + }, + { + "epoch": 1.51, + "grad_norm": 0.31604147702411756, + "learning_rate": 3.164521997562624e-05, + "loss": 1.0709, + "step": 15732 + }, + { + "epoch": 1.51, + "grad_norm": 0.3764084641822102, + "learning_rate": 3.163367409376794e-05, + "loss": 1.0768, + "step": 15733 + }, + { + "epoch": 1.51, + "grad_norm": 0.3238198208896186, + "learning_rate": 3.162212992281994e-05, + "loss": 1.0174, + "step": 15734 + }, + { + "epoch": 1.51, + "grad_norm": 0.3309851592435171, + "learning_rate": 3.1610587463071086e-05, + "loss": 1.0259, + "step": 15735 + }, + { + "epoch": 1.51, + "grad_norm": 0.3169481489065765, + "learning_rate": 3.1599046714810256e-05, + "loss": 1.0164, + "step": 15736 + }, + { + "epoch": 1.51, + "grad_norm": 0.3262159620302953, + "learning_rate": 3.1587507678326266e-05, + "loss": 1.0582, + "step": 15737 + }, + { + "epoch": 1.51, + "grad_norm": 0.35334949244286334, + "learning_rate": 3.157597035390794e-05, + "loss": 1.0177, + "step": 15738 + }, + { + "epoch": 1.51, + "grad_norm": 0.31062901876601684, + "learning_rate": 3.156443474184391e-05, + "loss": 1.0966, + "step": 15739 + }, + { + "epoch": 1.51, + "grad_norm": 0.305550412540135, + "learning_rate": 3.155290084242295e-05, + "loss": 1.0902, + "step": 15740 + }, + { + "epoch": 1.51, + "grad_norm": 0.34926449405510007, + "learning_rate": 3.1541368655933636e-05, + "loss": 1.0081, + "step": 15741 + }, + { + "epoch": 1.51, + "grad_norm": 0.2887113914354914, + "learning_rate": 3.1529838182664615e-05, + "loss": 1.0757, + "step": 15742 + }, + { + "epoch": 1.51, + "grad_norm": 0.3364060897391225, + "learning_rate": 3.151830942290438e-05, + "loss": 1.0986, + "step": 15743 + }, + { + "epoch": 1.51, + "grad_norm": 0.29051343337834323, + "learning_rate": 3.1506782376941546e-05, + "loss": 1.0558, + "step": 15744 + }, + { + "epoch": 1.51, + "grad_norm": 0.3158044536037236, + "learning_rate": 3.149525704506448e-05, + "loss": 1.0165, + "step": 15745 + }, + { + "epoch": 1.51, + "grad_norm": 0.33063513653598764, + "learning_rate": 3.148373342756172e-05, + "loss": 1.0402, + "step": 15746 + }, + { + "epoch": 1.51, + "grad_norm": 0.3563231957267381, + "learning_rate": 3.147221152472155e-05, + "loss": 1.0776, + "step": 15747 + }, + { + "epoch": 1.51, + "grad_norm": 0.26067550212959484, + "learning_rate": 3.1460691336832345e-05, + "loss": 0.9467, + "step": 15748 + }, + { + "epoch": 1.51, + "grad_norm": 0.3148262723897057, + "learning_rate": 3.1449172864182465e-05, + "loss": 0.9773, + "step": 15749 + }, + { + "epoch": 1.51, + "grad_norm": 0.30193310862110795, + "learning_rate": 3.1437656107060043e-05, + "loss": 0.9394, + "step": 15750 + }, + { + "epoch": 1.51, + "grad_norm": 0.3369143062432768, + "learning_rate": 3.1426141065753426e-05, + "loss": 1.0158, + "step": 15751 + }, + { + "epoch": 1.51, + "grad_norm": 0.3140892006062649, + "learning_rate": 3.14146277405507e-05, + "loss": 1.0742, + "step": 15752 + }, + { + "epoch": 1.51, + "grad_norm": 0.3541053523546636, + "learning_rate": 3.140311613174005e-05, + "loss": 0.9711, + "step": 15753 + }, + { + "epoch": 1.51, + "grad_norm": 0.3090141738638482, + "learning_rate": 3.1391606239609496e-05, + "loss": 1.0839, + "step": 15754 + }, + { + "epoch": 1.51, + "grad_norm": 0.33428768169810796, + "learning_rate": 3.138009806444715e-05, + "loss": 0.9737, + "step": 15755 + }, + { + "epoch": 1.51, + "grad_norm": 0.29750358036811175, + "learning_rate": 3.1368591606540946e-05, + "loss": 1.0559, + "step": 15756 + }, + { + "epoch": 1.51, + "grad_norm": 0.37468528407851354, + "learning_rate": 3.135708686617885e-05, + "loss": 1.0937, + "step": 15757 + }, + { + "epoch": 1.51, + "grad_norm": 0.2554661780822934, + "learning_rate": 3.134558384364882e-05, + "loss": 0.9837, + "step": 15758 + }, + { + "epoch": 1.51, + "grad_norm": 0.3207993003420538, + "learning_rate": 3.133408253923873e-05, + "loss": 1.0029, + "step": 15759 + }, + { + "epoch": 1.51, + "grad_norm": 0.3159967552495714, + "learning_rate": 3.132258295323633e-05, + "loss": 1.108, + "step": 15760 + }, + { + "epoch": 1.51, + "grad_norm": 0.3140170209943366, + "learning_rate": 3.1311085085929505e-05, + "loss": 0.9884, + "step": 15761 + }, + { + "epoch": 1.51, + "grad_norm": 0.3158482046983106, + "learning_rate": 3.1299588937605904e-05, + "loss": 0.9035, + "step": 15762 + }, + { + "epoch": 1.51, + "grad_norm": 0.3113980685183037, + "learning_rate": 3.1288094508553275e-05, + "loss": 1.0025, + "step": 15763 + }, + { + "epoch": 1.51, + "grad_norm": 0.3386210431177634, + "learning_rate": 3.127660179905926e-05, + "loss": 0.9544, + "step": 15764 + }, + { + "epoch": 1.51, + "grad_norm": 0.3219298073241051, + "learning_rate": 3.1265110809411505e-05, + "loss": 1.0366, + "step": 15765 + }, + { + "epoch": 1.51, + "grad_norm": 0.3366547615084803, + "learning_rate": 3.1253621539897526e-05, + "loss": 1.0351, + "step": 15766 + }, + { + "epoch": 1.51, + "grad_norm": 0.3242751902501021, + "learning_rate": 3.1242133990804864e-05, + "loss": 1.015, + "step": 15767 + }, + { + "epoch": 1.51, + "grad_norm": 0.3250325538301571, + "learning_rate": 3.123064816242105e-05, + "loss": 1.0735, + "step": 15768 + }, + { + "epoch": 1.51, + "grad_norm": 0.30257423082574053, + "learning_rate": 3.121916405503345e-05, + "loss": 1.0395, + "step": 15769 + }, + { + "epoch": 1.51, + "grad_norm": 0.30876900778950295, + "learning_rate": 3.12076816689295e-05, + "loss": 1.0746, + "step": 15770 + }, + { + "epoch": 1.51, + "grad_norm": 0.3195277687972633, + "learning_rate": 3.119620100439654e-05, + "loss": 0.9355, + "step": 15771 + }, + { + "epoch": 1.51, + "grad_norm": 0.30708272663140906, + "learning_rate": 3.1184722061721936e-05, + "loss": 0.9628, + "step": 15772 + }, + { + "epoch": 1.51, + "grad_norm": 0.2971844259444463, + "learning_rate": 3.1173244841192874e-05, + "loss": 1.0702, + "step": 15773 + }, + { + "epoch": 1.51, + "grad_norm": 0.3689344756866829, + "learning_rate": 3.116176934309665e-05, + "loss": 1.0468, + "step": 15774 + }, + { + "epoch": 1.51, + "grad_norm": 0.2572755556010819, + "learning_rate": 3.1150295567720376e-05, + "loss": 0.9456, + "step": 15775 + }, + { + "epoch": 1.51, + "grad_norm": 0.28222194594104055, + "learning_rate": 3.113882351535127e-05, + "loss": 1.036, + "step": 15776 + }, + { + "epoch": 1.51, + "grad_norm": 0.2935725533548113, + "learning_rate": 3.1127353186276295e-05, + "loss": 0.9937, + "step": 15777 + }, + { + "epoch": 1.51, + "grad_norm": 0.3080827956999134, + "learning_rate": 3.111588458078267e-05, + "loss": 0.9813, + "step": 15778 + }, + { + "epoch": 1.51, + "grad_norm": 0.32904833712739706, + "learning_rate": 3.11044176991573e-05, + "loss": 1.1185, + "step": 15779 + }, + { + "epoch": 1.51, + "grad_norm": 0.3038309722308524, + "learning_rate": 3.10929525416872e-05, + "loss": 1.0617, + "step": 15780 + }, + { + "epoch": 1.51, + "grad_norm": 0.27392277582107577, + "learning_rate": 3.1081489108659244e-05, + "loss": 0.8981, + "step": 15781 + }, + { + "epoch": 1.51, + "grad_norm": 0.3339892687635368, + "learning_rate": 3.1070027400360354e-05, + "loss": 0.9297, + "step": 15782 + }, + { + "epoch": 1.51, + "grad_norm": 0.31281345692524637, + "learning_rate": 3.105856741707733e-05, + "loss": 0.9799, + "step": 15783 + }, + { + "epoch": 1.51, + "grad_norm": 0.35379924404531266, + "learning_rate": 3.1047109159096986e-05, + "loss": 1.0579, + "step": 15784 + }, + { + "epoch": 1.51, + "grad_norm": 0.30780416755798656, + "learning_rate": 3.103565262670606e-05, + "loss": 1.1459, + "step": 15785 + }, + { + "epoch": 1.51, + "grad_norm": 0.3266588256575852, + "learning_rate": 3.1024197820191305e-05, + "loss": 1.1123, + "step": 15786 + }, + { + "epoch": 1.51, + "grad_norm": 0.3065335613703641, + "learning_rate": 3.101274473983933e-05, + "loss": 1.0488, + "step": 15787 + }, + { + "epoch": 1.51, + "grad_norm": 0.2726343362887972, + "learning_rate": 3.100129338593676e-05, + "loss": 0.8757, + "step": 15788 + }, + { + "epoch": 1.51, + "grad_norm": 0.3091021882750087, + "learning_rate": 3.098984375877022e-05, + "loss": 0.9424, + "step": 15789 + }, + { + "epoch": 1.51, + "grad_norm": 0.28931684778082273, + "learning_rate": 3.097839585862619e-05, + "loss": 0.9152, + "step": 15790 + }, + { + "epoch": 1.51, + "grad_norm": 0.3361701573150555, + "learning_rate": 3.096694968579117e-05, + "loss": 1.0585, + "step": 15791 + }, + { + "epoch": 1.51, + "grad_norm": 0.3159588816305968, + "learning_rate": 3.0955505240551617e-05, + "loss": 0.9983, + "step": 15792 + }, + { + "epoch": 1.51, + "grad_norm": 0.32404574640575307, + "learning_rate": 3.0944062523193986e-05, + "loss": 1.054, + "step": 15793 + }, + { + "epoch": 1.51, + "grad_norm": 0.3071121142812244, + "learning_rate": 3.093262153400455e-05, + "loss": 0.9291, + "step": 15794 + }, + { + "epoch": 1.51, + "grad_norm": 0.30681266068246277, + "learning_rate": 3.092118227326969e-05, + "loss": 1.0432, + "step": 15795 + }, + { + "epoch": 1.51, + "grad_norm": 0.3046864289994884, + "learning_rate": 3.090974474127564e-05, + "loss": 0.9895, + "step": 15796 + }, + { + "epoch": 1.51, + "grad_norm": 0.35208571547858114, + "learning_rate": 3.089830893830864e-05, + "loss": 1.008, + "step": 15797 + }, + { + "epoch": 1.51, + "grad_norm": 0.3366848686031366, + "learning_rate": 3.088687486465489e-05, + "loss": 0.9705, + "step": 15798 + }, + { + "epoch": 1.51, + "grad_norm": 0.36113008620126386, + "learning_rate": 3.087544252060057e-05, + "loss": 1.0415, + "step": 15799 + }, + { + "epoch": 1.51, + "grad_norm": 0.3079976096072359, + "learning_rate": 3.0864011906431713e-05, + "loss": 0.9668, + "step": 15800 + }, + { + "epoch": 1.51, + "grad_norm": 0.3406259636981817, + "learning_rate": 3.085258302243444e-05, + "loss": 0.9512, + "step": 15801 + }, + { + "epoch": 1.51, + "grad_norm": 0.33160050567109134, + "learning_rate": 3.084115586889469e-05, + "loss": 1.1311, + "step": 15802 + }, + { + "epoch": 1.51, + "grad_norm": 0.3051057454471924, + "learning_rate": 3.082973044609852e-05, + "loss": 0.9929, + "step": 15803 + }, + { + "epoch": 1.51, + "grad_norm": 0.2964097289396579, + "learning_rate": 3.081830675433175e-05, + "loss": 1.088, + "step": 15804 + }, + { + "epoch": 1.51, + "grad_norm": 0.3008340007055918, + "learning_rate": 3.08068847938804e-05, + "loss": 0.9817, + "step": 15805 + }, + { + "epoch": 1.51, + "grad_norm": 0.3503311719614495, + "learning_rate": 3.0795464565030216e-05, + "loss": 1.1323, + "step": 15806 + }, + { + "epoch": 1.51, + "grad_norm": 0.3653858747279629, + "learning_rate": 3.0784046068067054e-05, + "loss": 0.9794, + "step": 15807 + }, + { + "epoch": 1.51, + "grad_norm": 0.33096690573692905, + "learning_rate": 3.07726293032766e-05, + "loss": 1.1062, + "step": 15808 + }, + { + "epoch": 1.51, + "grad_norm": 0.3090473950496848, + "learning_rate": 3.0761214270944614e-05, + "loss": 1.0716, + "step": 15809 + }, + { + "epoch": 1.51, + "grad_norm": 0.2963210002878919, + "learning_rate": 3.074980097135679e-05, + "loss": 0.9851, + "step": 15810 + }, + { + "epoch": 1.51, + "grad_norm": 0.31295955139721315, + "learning_rate": 3.073838940479867e-05, + "loss": 0.9236, + "step": 15811 + }, + { + "epoch": 1.51, + "grad_norm": 0.32487966663308043, + "learning_rate": 3.07269795715559e-05, + "loss": 1.0505, + "step": 15812 + }, + { + "epoch": 1.51, + "grad_norm": 0.3105796643237684, + "learning_rate": 3.071557147191401e-05, + "loss": 1.0084, + "step": 15813 + }, + { + "epoch": 1.51, + "grad_norm": 0.3113749818589614, + "learning_rate": 3.07041651061585e-05, + "loss": 0.9712, + "step": 15814 + }, + { + "epoch": 1.51, + "grad_norm": 0.33590786133275863, + "learning_rate": 3.069276047457479e-05, + "loss": 1.0877, + "step": 15815 + }, + { + "epoch": 1.51, + "grad_norm": 0.2870334329762812, + "learning_rate": 3.068135757744833e-05, + "loss": 0.9744, + "step": 15816 + }, + { + "epoch": 1.51, + "grad_norm": 0.3418167299424146, + "learning_rate": 3.0669956415064436e-05, + "loss": 1.0451, + "step": 15817 + }, + { + "epoch": 1.51, + "grad_norm": 0.3185227289293188, + "learning_rate": 3.0658556987708455e-05, + "loss": 1.0544, + "step": 15818 + }, + { + "epoch": 1.51, + "grad_norm": 0.3243359190011509, + "learning_rate": 3.064715929566566e-05, + "loss": 1.0419, + "step": 15819 + }, + { + "epoch": 1.51, + "grad_norm": 0.2768451657163076, + "learning_rate": 3.0635763339221335e-05, + "loss": 1.1064, + "step": 15820 + }, + { + "epoch": 1.51, + "grad_norm": 0.33858252443407516, + "learning_rate": 3.062436911866058e-05, + "loss": 0.9315, + "step": 15821 + }, + { + "epoch": 1.51, + "grad_norm": 0.29374972044921144, + "learning_rate": 3.061297663426863e-05, + "loss": 1.1298, + "step": 15822 + }, + { + "epoch": 1.51, + "grad_norm": 0.29108510352376027, + "learning_rate": 3.0601585886330506e-05, + "loss": 1.0046, + "step": 15823 + }, + { + "epoch": 1.51, + "grad_norm": 0.3286984863990125, + "learning_rate": 3.059019687513135e-05, + "loss": 1.0569, + "step": 15824 + }, + { + "epoch": 1.51, + "grad_norm": 0.29519987029118894, + "learning_rate": 3.057880960095607e-05, + "loss": 1.0298, + "step": 15825 + }, + { + "epoch": 1.51, + "grad_norm": 0.2892363453404336, + "learning_rate": 3.056742406408978e-05, + "loss": 0.9406, + "step": 15826 + }, + { + "epoch": 1.51, + "grad_norm": 0.3529727186291773, + "learning_rate": 3.055604026481731e-05, + "loss": 0.9807, + "step": 15827 + }, + { + "epoch": 1.51, + "grad_norm": 0.30439229438930676, + "learning_rate": 3.0544658203423606e-05, + "loss": 1.1531, + "step": 15828 + }, + { + "epoch": 1.51, + "grad_norm": 0.34190713297627173, + "learning_rate": 3.053327788019343e-05, + "loss": 1.0305, + "step": 15829 + }, + { + "epoch": 1.51, + "grad_norm": 0.31076901313792715, + "learning_rate": 3.052189929541166e-05, + "loss": 1.093, + "step": 15830 + }, + { + "epoch": 1.51, + "grad_norm": 0.29738707455484303, + "learning_rate": 3.0510522449363012e-05, + "loss": 1.0366, + "step": 15831 + }, + { + "epoch": 1.51, + "grad_norm": 0.31140624218341817, + "learning_rate": 3.0499147342332244e-05, + "loss": 0.993, + "step": 15832 + }, + { + "epoch": 1.51, + "grad_norm": 0.3092506336402174, + "learning_rate": 3.0487773974603972e-05, + "loss": 1.0036, + "step": 15833 + }, + { + "epoch": 1.51, + "grad_norm": 0.3577644523015519, + "learning_rate": 3.047640234646283e-05, + "loss": 1.0397, + "step": 15834 + }, + { + "epoch": 1.51, + "grad_norm": 0.2869728399286538, + "learning_rate": 3.0465032458193455e-05, + "loss": 0.9494, + "step": 15835 + }, + { + "epoch": 1.52, + "grad_norm": 0.35096428749880937, + "learning_rate": 3.045366431008031e-05, + "loss": 1.0507, + "step": 15836 + }, + { + "epoch": 1.52, + "grad_norm": 0.30247969277417347, + "learning_rate": 3.0442297902407957e-05, + "loss": 1.1117, + "step": 15837 + }, + { + "epoch": 1.52, + "grad_norm": 0.31150909314187164, + "learning_rate": 3.0430933235460747e-05, + "loss": 0.9795, + "step": 15838 + }, + { + "epoch": 1.52, + "grad_norm": 0.32468377260726733, + "learning_rate": 3.0419570309523214e-05, + "loss": 1.1235, + "step": 15839 + }, + { + "epoch": 1.52, + "grad_norm": 0.36872213900294315, + "learning_rate": 3.040820912487964e-05, + "loss": 1.0328, + "step": 15840 + }, + { + "epoch": 1.52, + "grad_norm": 0.32081486854263136, + "learning_rate": 3.0396849681814398e-05, + "loss": 1.1088, + "step": 15841 + }, + { + "epoch": 1.52, + "grad_norm": 0.33853140248617836, + "learning_rate": 3.03854919806117e-05, + "loss": 0.939, + "step": 15842 + }, + { + "epoch": 1.52, + "grad_norm": 0.33030537655277425, + "learning_rate": 3.0374136021555853e-05, + "loss": 1.0112, + "step": 15843 + }, + { + "epoch": 1.52, + "grad_norm": 0.3001650185161196, + "learning_rate": 3.0362781804930983e-05, + "loss": 1.0845, + "step": 15844 + }, + { + "epoch": 1.52, + "grad_norm": 0.26445696603336233, + "learning_rate": 3.0351429331021263e-05, + "loss": 1.0961, + "step": 15845 + }, + { + "epoch": 1.52, + "grad_norm": 0.30752741713329607, + "learning_rate": 3.034007860011079e-05, + "loss": 1.0357, + "step": 15846 + }, + { + "epoch": 1.52, + "grad_norm": 0.28070634091334257, + "learning_rate": 3.0328729612483665e-05, + "loss": 1.0227, + "step": 15847 + }, + { + "epoch": 1.52, + "grad_norm": 0.2772117132672523, + "learning_rate": 3.031738236842383e-05, + "loss": 1.0474, + "step": 15848 + }, + { + "epoch": 1.52, + "grad_norm": 0.3070016372084432, + "learning_rate": 3.0306036868215303e-05, + "loss": 1.12, + "step": 15849 + }, + { + "epoch": 1.52, + "grad_norm": 0.36925050441326746, + "learning_rate": 3.0294693112142037e-05, + "loss": 0.7741, + "step": 15850 + }, + { + "epoch": 1.52, + "grad_norm": 0.2924738136026552, + "learning_rate": 3.0283351100487857e-05, + "loss": 0.9516, + "step": 15851 + }, + { + "epoch": 1.52, + "grad_norm": 0.30335695687373654, + "learning_rate": 3.0272010833536614e-05, + "loss": 1.0628, + "step": 15852 + }, + { + "epoch": 1.52, + "grad_norm": 0.34253555906964855, + "learning_rate": 3.0260672311572135e-05, + "loss": 1.0225, + "step": 15853 + }, + { + "epoch": 1.52, + "grad_norm": 0.31741689706664483, + "learning_rate": 3.0249335534878187e-05, + "loss": 1.054, + "step": 15854 + }, + { + "epoch": 1.52, + "grad_norm": 0.2995168316040568, + "learning_rate": 3.023800050373843e-05, + "loss": 1.0603, + "step": 15855 + }, + { + "epoch": 1.52, + "grad_norm": 0.2966101546683351, + "learning_rate": 3.0226667218436576e-05, + "loss": 1.034, + "step": 15856 + }, + { + "epoch": 1.52, + "grad_norm": 0.2942219058266894, + "learning_rate": 3.0215335679256196e-05, + "loss": 1.0814, + "step": 15857 + }, + { + "epoch": 1.52, + "grad_norm": 0.318738479314566, + "learning_rate": 3.0204005886480902e-05, + "loss": 1.0364, + "step": 15858 + }, + { + "epoch": 1.52, + "grad_norm": 0.34099719462736827, + "learning_rate": 3.0192677840394225e-05, + "loss": 0.9259, + "step": 15859 + }, + { + "epoch": 1.52, + "grad_norm": 0.3193270425131773, + "learning_rate": 3.018135154127969e-05, + "loss": 1.1471, + "step": 15860 + }, + { + "epoch": 1.52, + "grad_norm": 0.31860362400387293, + "learning_rate": 3.0170026989420685e-05, + "loss": 0.993, + "step": 15861 + }, + { + "epoch": 1.52, + "grad_norm": 0.2966074618398691, + "learning_rate": 3.015870418510066e-05, + "loss": 1.0195, + "step": 15862 + }, + { + "epoch": 1.52, + "grad_norm": 0.3370089431712762, + "learning_rate": 3.0147383128602934e-05, + "loss": 1.0162, + "step": 15863 + }, + { + "epoch": 1.52, + "grad_norm": 0.33135604651285605, + "learning_rate": 3.013606382021088e-05, + "loss": 1.0232, + "step": 15864 + }, + { + "epoch": 1.52, + "grad_norm": 0.3200767813703859, + "learning_rate": 3.012474626020767e-05, + "loss": 0.8365, + "step": 15865 + }, + { + "epoch": 1.52, + "grad_norm": 0.3058883010358135, + "learning_rate": 3.0113430448876654e-05, + "loss": 1.127, + "step": 15866 + }, + { + "epoch": 1.52, + "grad_norm": 0.27114633987990044, + "learning_rate": 3.0102116386500946e-05, + "loss": 0.9353, + "step": 15867 + }, + { + "epoch": 1.52, + "grad_norm": 0.3212307264306363, + "learning_rate": 3.009080407336372e-05, + "loss": 0.9714, + "step": 15868 + }, + { + "epoch": 1.52, + "grad_norm": 0.2849652357960353, + "learning_rate": 3.0079493509748046e-05, + "loss": 0.9206, + "step": 15869 + }, + { + "epoch": 1.52, + "grad_norm": 0.296785826504847, + "learning_rate": 3.0068184695936975e-05, + "loss": 1.0432, + "step": 15870 + }, + { + "epoch": 1.52, + "grad_norm": 0.3019634578111783, + "learning_rate": 3.0056877632213575e-05, + "loss": 1.0426, + "step": 15871 + }, + { + "epoch": 1.52, + "grad_norm": 0.27664585113558315, + "learning_rate": 3.0045572318860727e-05, + "loss": 1.0164, + "step": 15872 + }, + { + "epoch": 1.52, + "grad_norm": 0.3568692231258155, + "learning_rate": 3.003426875616141e-05, + "loss": 0.9689, + "step": 15873 + }, + { + "epoch": 1.52, + "grad_norm": 0.2923631075130909, + "learning_rate": 3.0022966944398477e-05, + "loss": 1.0419, + "step": 15874 + }, + { + "epoch": 1.52, + "grad_norm": 0.3646034852398289, + "learning_rate": 3.0011666883854818e-05, + "loss": 1.1506, + "step": 15875 + }, + { + "epoch": 1.52, + "grad_norm": 0.29122502095075004, + "learning_rate": 3.000036857481313e-05, + "loss": 1.0053, + "step": 15876 + }, + { + "epoch": 1.52, + "grad_norm": 0.28923295850053343, + "learning_rate": 2.9989072017556263e-05, + "loss": 1.0763, + "step": 15877 + }, + { + "epoch": 1.52, + "grad_norm": 0.3088366023412008, + "learning_rate": 2.997777721236683e-05, + "loss": 1.0253, + "step": 15878 + }, + { + "epoch": 1.52, + "grad_norm": 0.33782498155762536, + "learning_rate": 2.996648415952753e-05, + "loss": 1.1636, + "step": 15879 + }, + { + "epoch": 1.52, + "grad_norm": 0.33775978469878565, + "learning_rate": 2.995519285932098e-05, + "loss": 0.9402, + "step": 15880 + }, + { + "epoch": 1.52, + "grad_norm": 0.29280843807888207, + "learning_rate": 2.9943903312029775e-05, + "loss": 1.0073, + "step": 15881 + }, + { + "epoch": 1.52, + "grad_norm": 0.3200414074101436, + "learning_rate": 2.993261551793639e-05, + "loss": 1.0266, + "step": 15882 + }, + { + "epoch": 1.52, + "grad_norm": 0.2515077576053623, + "learning_rate": 2.9921329477323378e-05, + "loss": 1.0794, + "step": 15883 + }, + { + "epoch": 1.52, + "grad_norm": 0.2880805125965138, + "learning_rate": 2.99100451904731e-05, + "loss": 1.1015, + "step": 15884 + }, + { + "epoch": 1.52, + "grad_norm": 0.35692842061065744, + "learning_rate": 2.9898762657668024e-05, + "loss": 1.0555, + "step": 15885 + }, + { + "epoch": 1.52, + "grad_norm": 0.34699544071803307, + "learning_rate": 2.9887481879190403e-05, + "loss": 1.0852, + "step": 15886 + }, + { + "epoch": 1.52, + "grad_norm": 0.3457813371814968, + "learning_rate": 2.9876202855322678e-05, + "loss": 0.9687, + "step": 15887 + }, + { + "epoch": 1.52, + "grad_norm": 0.29313244681474243, + "learning_rate": 2.9864925586347015e-05, + "loss": 1.0141, + "step": 15888 + }, + { + "epoch": 1.52, + "grad_norm": 0.31673592280676344, + "learning_rate": 2.985365007254569e-05, + "loss": 0.9677, + "step": 15889 + }, + { + "epoch": 1.52, + "grad_norm": 0.3313612258818296, + "learning_rate": 2.9842376314200836e-05, + "loss": 1.0124, + "step": 15890 + }, + { + "epoch": 1.52, + "grad_norm": 0.3385912984412232, + "learning_rate": 2.9831104311594616e-05, + "loss": 1.0037, + "step": 15891 + }, + { + "epoch": 1.52, + "grad_norm": 0.2630695965600685, + "learning_rate": 2.98198340650091e-05, + "loss": 0.9885, + "step": 15892 + }, + { + "epoch": 1.52, + "grad_norm": 0.2996855737903242, + "learning_rate": 2.9808565574726365e-05, + "loss": 0.9698, + "step": 15893 + }, + { + "epoch": 1.52, + "grad_norm": 0.34010407272789944, + "learning_rate": 2.9797298841028376e-05, + "loss": 1.0513, + "step": 15894 + }, + { + "epoch": 1.52, + "grad_norm": 0.31057141364675217, + "learning_rate": 2.9786033864197094e-05, + "loss": 1.0656, + "step": 15895 + }, + { + "epoch": 1.52, + "grad_norm": 0.3168481681530269, + "learning_rate": 2.977477064451447e-05, + "loss": 0.9789, + "step": 15896 + }, + { + "epoch": 1.52, + "grad_norm": 0.3008120185096565, + "learning_rate": 2.976350918226233e-05, + "loss": 0.9406, + "step": 15897 + }, + { + "epoch": 1.52, + "grad_norm": 0.4709248817744865, + "learning_rate": 2.9752249477722537e-05, + "loss": 1.1528, + "step": 15898 + }, + { + "epoch": 1.52, + "grad_norm": 0.36405573469310815, + "learning_rate": 2.9740991531176788e-05, + "loss": 1.0024, + "step": 15899 + }, + { + "epoch": 1.52, + "grad_norm": 0.32261890617133443, + "learning_rate": 2.972973534290694e-05, + "loss": 1.1733, + "step": 15900 + }, + { + "epoch": 1.52, + "grad_norm": 0.30511064307999797, + "learning_rate": 2.97184809131946e-05, + "loss": 1.0167, + "step": 15901 + }, + { + "epoch": 1.52, + "grad_norm": 0.2988018589922023, + "learning_rate": 2.970722824232147e-05, + "loss": 1.0047, + "step": 15902 + }, + { + "epoch": 1.52, + "grad_norm": 0.28218233225927103, + "learning_rate": 2.9695977330569114e-05, + "loss": 0.9857, + "step": 15903 + }, + { + "epoch": 1.52, + "grad_norm": 0.3205017465937416, + "learning_rate": 2.9684728178219134e-05, + "loss": 1.0656, + "step": 15904 + }, + { + "epoch": 1.52, + "grad_norm": 0.3076056008808317, + "learning_rate": 2.9673480785552987e-05, + "loss": 0.987, + "step": 15905 + }, + { + "epoch": 1.52, + "grad_norm": 0.2902915056640693, + "learning_rate": 2.9662235152852192e-05, + "loss": 1.0697, + "step": 15906 + }, + { + "epoch": 1.52, + "grad_norm": 0.2953103886026386, + "learning_rate": 2.9650991280398155e-05, + "loss": 0.9231, + "step": 15907 + }, + { + "epoch": 1.52, + "grad_norm": 0.26474115728885944, + "learning_rate": 2.963974916847232e-05, + "loss": 0.9575, + "step": 15908 + }, + { + "epoch": 1.52, + "grad_norm": 0.3226270237851715, + "learning_rate": 2.9628508817355936e-05, + "loss": 0.9673, + "step": 15909 + }, + { + "epoch": 1.52, + "grad_norm": 0.30979109609494465, + "learning_rate": 2.9617270227330385e-05, + "loss": 0.9874, + "step": 15910 + }, + { + "epoch": 1.52, + "grad_norm": 0.28561629268531324, + "learning_rate": 2.9606033398676847e-05, + "loss": 1.0349, + "step": 15911 + }, + { + "epoch": 1.52, + "grad_norm": 0.2757406831651905, + "learning_rate": 2.9594798331676565e-05, + "loss": 0.8487, + "step": 15912 + }, + { + "epoch": 1.52, + "grad_norm": 0.31778558401352064, + "learning_rate": 2.9583565026610706e-05, + "loss": 0.9839, + "step": 15913 + }, + { + "epoch": 1.52, + "grad_norm": 0.36247967293888433, + "learning_rate": 2.9572333483760416e-05, + "loss": 0.8927, + "step": 15914 + }, + { + "epoch": 1.52, + "grad_norm": 0.312064482242307, + "learning_rate": 2.9561103703406712e-05, + "loss": 1.0856, + "step": 15915 + }, + { + "epoch": 1.52, + "grad_norm": 0.3360093990766369, + "learning_rate": 2.9549875685830653e-05, + "loss": 0.963, + "step": 15916 + }, + { + "epoch": 1.52, + "grad_norm": 0.34697287592212606, + "learning_rate": 2.953864943131326e-05, + "loss": 1.0659, + "step": 15917 + }, + { + "epoch": 1.52, + "grad_norm": 0.3401463970339158, + "learning_rate": 2.9527424940135428e-05, + "loss": 1.0972, + "step": 15918 + }, + { + "epoch": 1.52, + "grad_norm": 0.2747262609065412, + "learning_rate": 2.951620221257807e-05, + "loss": 0.9503, + "step": 15919 + }, + { + "epoch": 1.52, + "grad_norm": 0.3011243906726051, + "learning_rate": 2.950498124892205e-05, + "loss": 0.9239, + "step": 15920 + }, + { + "epoch": 1.52, + "grad_norm": 0.3202704240363976, + "learning_rate": 2.9493762049448216e-05, + "loss": 0.9197, + "step": 15921 + }, + { + "epoch": 1.52, + "grad_norm": 0.2982475151375704, + "learning_rate": 2.9482544614437258e-05, + "loss": 1.0216, + "step": 15922 + }, + { + "epoch": 1.52, + "grad_norm": 0.32451494305192274, + "learning_rate": 2.9471328944169974e-05, + "loss": 1.0071, + "step": 15923 + }, + { + "epoch": 1.52, + "grad_norm": 0.30826949053652464, + "learning_rate": 2.946011503892697e-05, + "loss": 1.0604, + "step": 15924 + }, + { + "epoch": 1.52, + "grad_norm": 0.3333354583564712, + "learning_rate": 2.9448902898988963e-05, + "loss": 0.9457, + "step": 15925 + }, + { + "epoch": 1.52, + "grad_norm": 0.28991107131725974, + "learning_rate": 2.943769252463644e-05, + "loss": 1.0185, + "step": 15926 + }, + { + "epoch": 1.52, + "grad_norm": 0.3288320252227043, + "learning_rate": 2.942648391615006e-05, + "loss": 0.9412, + "step": 15927 + }, + { + "epoch": 1.52, + "grad_norm": 0.3197567827136486, + "learning_rate": 2.9415277073810244e-05, + "loss": 1.0717, + "step": 15928 + }, + { + "epoch": 1.52, + "grad_norm": 0.30932767256243326, + "learning_rate": 2.940407199789752e-05, + "loss": 1.0185, + "step": 15929 + }, + { + "epoch": 1.52, + "grad_norm": 0.3216519606749968, + "learning_rate": 2.939286868869222e-05, + "loss": 1.0242, + "step": 15930 + }, + { + "epoch": 1.52, + "grad_norm": 0.30750047269249725, + "learning_rate": 2.9381667146474802e-05, + "loss": 1.0696, + "step": 15931 + }, + { + "epoch": 1.52, + "grad_norm": 0.3397651800746093, + "learning_rate": 2.9370467371525502e-05, + "loss": 1.0909, + "step": 15932 + }, + { + "epoch": 1.52, + "grad_norm": 0.25536313100004765, + "learning_rate": 2.935926936412464e-05, + "loss": 0.8511, + "step": 15933 + }, + { + "epoch": 1.52, + "grad_norm": 0.30164055623837244, + "learning_rate": 2.934807312455248e-05, + "loss": 0.918, + "step": 15934 + }, + { + "epoch": 1.52, + "grad_norm": 0.30839016919055695, + "learning_rate": 2.9336878653089206e-05, + "loss": 0.9742, + "step": 15935 + }, + { + "epoch": 1.52, + "grad_norm": 0.33846330149283, + "learning_rate": 2.9325685950014923e-05, + "loss": 0.9159, + "step": 15936 + }, + { + "epoch": 1.52, + "grad_norm": 0.2931328642258853, + "learning_rate": 2.9314495015609765e-05, + "loss": 0.9485, + "step": 15937 + }, + { + "epoch": 1.52, + "grad_norm": 0.30598996202537326, + "learning_rate": 2.9303305850153838e-05, + "loss": 0.9802, + "step": 15938 + }, + { + "epoch": 1.52, + "grad_norm": 0.27957746252216503, + "learning_rate": 2.9292118453927077e-05, + "loss": 1.0054, + "step": 15939 + }, + { + "epoch": 1.52, + "grad_norm": 0.3732575259731392, + "learning_rate": 2.9280932827209483e-05, + "loss": 1.1073, + "step": 15940 + }, + { + "epoch": 1.53, + "grad_norm": 0.27460228511145857, + "learning_rate": 2.9269748970280998e-05, + "loss": 0.9737, + "step": 15941 + }, + { + "epoch": 1.53, + "grad_norm": 0.3260329515664336, + "learning_rate": 2.925856688342152e-05, + "loss": 1.0944, + "step": 15942 + }, + { + "epoch": 1.53, + "grad_norm": 0.33820638670752246, + "learning_rate": 2.924738656691084e-05, + "loss": 1.2718, + "step": 15943 + }, + { + "epoch": 1.53, + "grad_norm": 0.2915125583949025, + "learning_rate": 2.923620802102881e-05, + "loss": 1.0651, + "step": 15944 + }, + { + "epoch": 1.53, + "grad_norm": 0.27026555425506227, + "learning_rate": 2.9225031246055113e-05, + "loss": 0.9991, + "step": 15945 + }, + { + "epoch": 1.53, + "grad_norm": 0.33650194312064313, + "learning_rate": 2.921385624226952e-05, + "loss": 1.019, + "step": 15946 + }, + { + "epoch": 1.53, + "grad_norm": 0.3047847307591634, + "learning_rate": 2.9202683009951605e-05, + "loss": 0.9755, + "step": 15947 + }, + { + "epoch": 1.53, + "grad_norm": 0.32935424612921826, + "learning_rate": 2.91915115493811e-05, + "loss": 0.9016, + "step": 15948 + }, + { + "epoch": 1.53, + "grad_norm": 0.3445882979804249, + "learning_rate": 2.9180341860837492e-05, + "loss": 1.006, + "step": 15949 + }, + { + "epoch": 1.53, + "grad_norm": 0.3166055902551408, + "learning_rate": 2.916917394460036e-05, + "loss": 1.0117, + "step": 15950 + }, + { + "epoch": 1.53, + "grad_norm": 0.2841813962461922, + "learning_rate": 2.9158007800949137e-05, + "loss": 1.1019, + "step": 15951 + }, + { + "epoch": 1.53, + "grad_norm": 0.34342597219514814, + "learning_rate": 2.9146843430163295e-05, + "loss": 1.0398, + "step": 15952 + }, + { + "epoch": 1.53, + "grad_norm": 0.3231275623105714, + "learning_rate": 2.9135680832522227e-05, + "loss": 0.9766, + "step": 15953 + }, + { + "epoch": 1.53, + "grad_norm": 0.3250363764145123, + "learning_rate": 2.912452000830531e-05, + "loss": 1.0684, + "step": 15954 + }, + { + "epoch": 1.53, + "grad_norm": 0.27103364408886443, + "learning_rate": 2.91133609577918e-05, + "loss": 1.0154, + "step": 15955 + }, + { + "epoch": 1.53, + "grad_norm": 0.31140959883021263, + "learning_rate": 2.9102203681260975e-05, + "loss": 0.9708, + "step": 15956 + }, + { + "epoch": 1.53, + "grad_norm": 0.3584367961503395, + "learning_rate": 2.9091048178992097e-05, + "loss": 1.05, + "step": 15957 + }, + { + "epoch": 1.53, + "grad_norm": 0.32867676679309216, + "learning_rate": 2.9079894451264266e-05, + "loss": 1.1739, + "step": 15958 + }, + { + "epoch": 1.53, + "grad_norm": 0.2928241866323608, + "learning_rate": 2.9068742498356683e-05, + "loss": 1.0266, + "step": 15959 + }, + { + "epoch": 1.53, + "grad_norm": 0.32475092775922987, + "learning_rate": 2.9057592320548332e-05, + "loss": 0.99, + "step": 15960 + }, + { + "epoch": 1.53, + "grad_norm": 0.33741804146093674, + "learning_rate": 2.9046443918118383e-05, + "loss": 1.0847, + "step": 15961 + }, + { + "epoch": 1.53, + "grad_norm": 0.33702142094417314, + "learning_rate": 2.9035297291345742e-05, + "loss": 0.9161, + "step": 15962 + }, + { + "epoch": 1.53, + "grad_norm": 0.30505615320101803, + "learning_rate": 2.9024152440509413e-05, + "loss": 1.017, + "step": 15963 + }, + { + "epoch": 1.53, + "grad_norm": 0.3172992236042882, + "learning_rate": 2.9013009365888244e-05, + "loss": 1.0868, + "step": 15964 + }, + { + "epoch": 1.53, + "grad_norm": 0.2989234122628649, + "learning_rate": 2.9001868067761162e-05, + "loss": 1.0764, + "step": 15965 + }, + { + "epoch": 1.53, + "grad_norm": 0.3817974443922435, + "learning_rate": 2.899072854640691e-05, + "loss": 1.0229, + "step": 15966 + }, + { + "epoch": 1.53, + "grad_norm": 0.3096351146457015, + "learning_rate": 2.8979590802104316e-05, + "loss": 1.0477, + "step": 15967 + }, + { + "epoch": 1.53, + "grad_norm": 0.3009629521630225, + "learning_rate": 2.8968454835132097e-05, + "loss": 1.1122, + "step": 15968 + }, + { + "epoch": 1.53, + "grad_norm": 0.34529877341242593, + "learning_rate": 2.895732064576896e-05, + "loss": 0.9316, + "step": 15969 + }, + { + "epoch": 1.53, + "grad_norm": 0.4204657024630804, + "learning_rate": 2.8946188234293502e-05, + "loss": 1.0297, + "step": 15970 + }, + { + "epoch": 1.53, + "grad_norm": 0.31138366972107623, + "learning_rate": 2.893505760098436e-05, + "loss": 1.1352, + "step": 15971 + }, + { + "epoch": 1.53, + "grad_norm": 0.34973990369434893, + "learning_rate": 2.8923928746120043e-05, + "loss": 1.0679, + "step": 15972 + }, + { + "epoch": 1.53, + "grad_norm": 0.321937134678643, + "learning_rate": 2.891280166997907e-05, + "loss": 0.9373, + "step": 15973 + }, + { + "epoch": 1.53, + "grad_norm": 0.31394680672671976, + "learning_rate": 2.8901676372839915e-05, + "loss": 0.916, + "step": 15974 + }, + { + "epoch": 1.53, + "grad_norm": 0.3035802528293775, + "learning_rate": 2.8890552854981023e-05, + "loss": 1.0499, + "step": 15975 + }, + { + "epoch": 1.53, + "grad_norm": 0.32391363108936144, + "learning_rate": 2.887943111668071e-05, + "loss": 1.0447, + "step": 15976 + }, + { + "epoch": 1.53, + "grad_norm": 0.3351976452391942, + "learning_rate": 2.886831115821732e-05, + "loss": 1.01, + "step": 15977 + }, + { + "epoch": 1.53, + "grad_norm": 0.3108519944000598, + "learning_rate": 2.8857192979869184e-05, + "loss": 1.0102, + "step": 15978 + }, + { + "epoch": 1.53, + "grad_norm": 0.2692143060879353, + "learning_rate": 2.8846076581914483e-05, + "loss": 0.9389, + "step": 15979 + }, + { + "epoch": 1.53, + "grad_norm": 0.3303962503430621, + "learning_rate": 2.8834961964631425e-05, + "loss": 1.0122, + "step": 15980 + }, + { + "epoch": 1.53, + "grad_norm": 0.3015008948603912, + "learning_rate": 2.8823849128298174e-05, + "loss": 1.0474, + "step": 15981 + }, + { + "epoch": 1.53, + "grad_norm": 0.3216817746254628, + "learning_rate": 2.881273807319286e-05, + "loss": 1.0371, + "step": 15982 + }, + { + "epoch": 1.53, + "grad_norm": 0.29272919473366626, + "learning_rate": 2.8801628799593483e-05, + "loss": 1.0456, + "step": 15983 + }, + { + "epoch": 1.53, + "grad_norm": 0.3489986885485165, + "learning_rate": 2.8790521307778128e-05, + "loss": 1.1617, + "step": 15984 + }, + { + "epoch": 1.53, + "grad_norm": 0.298453400256656, + "learning_rate": 2.8779415598024694e-05, + "loss": 1.0005, + "step": 15985 + }, + { + "epoch": 1.53, + "grad_norm": 0.30572848503920996, + "learning_rate": 2.876831167061117e-05, + "loss": 1.1145, + "step": 15986 + }, + { + "epoch": 1.53, + "grad_norm": 0.27351502113811815, + "learning_rate": 2.8757209525815364e-05, + "loss": 1.1464, + "step": 15987 + }, + { + "epoch": 1.53, + "grad_norm": 0.3146116337141539, + "learning_rate": 2.8746109163915224e-05, + "loss": 1.0383, + "step": 15988 + }, + { + "epoch": 1.53, + "grad_norm": 0.2924855506846452, + "learning_rate": 2.873501058518845e-05, + "loss": 0.978, + "step": 15989 + }, + { + "epoch": 1.53, + "grad_norm": 0.373362069301196, + "learning_rate": 2.872391378991286e-05, + "loss": 1.1298, + "step": 15990 + }, + { + "epoch": 1.53, + "grad_norm": 0.30029424087360984, + "learning_rate": 2.871281877836608e-05, + "loss": 0.9258, + "step": 15991 + }, + { + "epoch": 1.53, + "grad_norm": 0.35705338837664646, + "learning_rate": 2.8701725550825853e-05, + "loss": 0.9411, + "step": 15992 + }, + { + "epoch": 1.53, + "grad_norm": 0.2710659398688357, + "learning_rate": 2.8690634107569724e-05, + "loss": 1.044, + "step": 15993 + }, + { + "epoch": 1.53, + "grad_norm": 0.335890864593455, + "learning_rate": 2.8679544448875284e-05, + "loss": 0.9376, + "step": 15994 + }, + { + "epoch": 1.53, + "grad_norm": 0.3151412466921922, + "learning_rate": 2.8668456575020074e-05, + "loss": 0.9488, + "step": 15995 + }, + { + "epoch": 1.53, + "grad_norm": 0.28723137396542603, + "learning_rate": 2.8657370486281597e-05, + "loss": 0.9471, + "step": 15996 + }, + { + "epoch": 1.53, + "grad_norm": 0.3477183524147287, + "learning_rate": 2.864628618293723e-05, + "loss": 1.0625, + "step": 15997 + }, + { + "epoch": 1.53, + "grad_norm": 0.36742309853435196, + "learning_rate": 2.8635203665264398e-05, + "loss": 1.0049, + "step": 15998 + }, + { + "epoch": 1.53, + "grad_norm": 0.30064002001925194, + "learning_rate": 2.8624122933540476e-05, + "loss": 0.9624, + "step": 15999 + }, + { + "epoch": 1.53, + "grad_norm": 0.29627146636180796, + "learning_rate": 2.86130439880427e-05, + "loss": 0.946, + "step": 16000 + }, + { + "epoch": 1.53, + "grad_norm": 0.34276010335417856, + "learning_rate": 2.8601966829048366e-05, + "loss": 0.9907, + "step": 16001 + }, + { + "epoch": 1.53, + "grad_norm": 0.3310369709772608, + "learning_rate": 2.859089145683468e-05, + "loss": 0.995, + "step": 16002 + }, + { + "epoch": 1.53, + "grad_norm": 0.31028388059465994, + "learning_rate": 2.8579817871678848e-05, + "loss": 0.9801, + "step": 16003 + }, + { + "epoch": 1.53, + "grad_norm": 0.30007971292260793, + "learning_rate": 2.856874607385793e-05, + "loss": 0.9815, + "step": 16004 + }, + { + "epoch": 1.53, + "grad_norm": 0.33120897283418915, + "learning_rate": 2.8557676063649075e-05, + "loss": 0.9826, + "step": 16005 + }, + { + "epoch": 1.53, + "grad_norm": 0.3030728283904543, + "learning_rate": 2.8546607841329232e-05, + "loss": 1.0798, + "step": 16006 + }, + { + "epoch": 1.53, + "grad_norm": 0.322750658349667, + "learning_rate": 2.853554140717547e-05, + "loss": 1.0457, + "step": 16007 + }, + { + "epoch": 1.53, + "grad_norm": 0.3345497758498206, + "learning_rate": 2.8524476761464634e-05, + "loss": 1.0104, + "step": 16008 + }, + { + "epoch": 1.53, + "grad_norm": 0.3100092634780141, + "learning_rate": 2.851341390447375e-05, + "loss": 0.9478, + "step": 16009 + }, + { + "epoch": 1.53, + "grad_norm": 0.3256308551666823, + "learning_rate": 2.8502352836479583e-05, + "loss": 1.0319, + "step": 16010 + }, + { + "epoch": 1.53, + "grad_norm": 0.32089300448124514, + "learning_rate": 2.8491293557759002e-05, + "loss": 0.9808, + "step": 16011 + }, + { + "epoch": 1.53, + "grad_norm": 0.3052082815141222, + "learning_rate": 2.8480236068588707e-05, + "loss": 1.0511, + "step": 16012 + }, + { + "epoch": 1.53, + "grad_norm": 0.30372902112522004, + "learning_rate": 2.8469180369245484e-05, + "loss": 0.9508, + "step": 16013 + }, + { + "epoch": 1.53, + "grad_norm": 0.33481091398453294, + "learning_rate": 2.8458126460005917e-05, + "loss": 0.9866, + "step": 16014 + }, + { + "epoch": 1.53, + "grad_norm": 0.33603273832390934, + "learning_rate": 2.844707434114676e-05, + "loss": 1.1119, + "step": 16015 + }, + { + "epoch": 1.53, + "grad_norm": 0.3422951417673504, + "learning_rate": 2.8436024012944506e-05, + "loss": 1.0273, + "step": 16016 + }, + { + "epoch": 1.53, + "grad_norm": 0.30826672100596847, + "learning_rate": 2.842497547567575e-05, + "loss": 1.0501, + "step": 16017 + }, + { + "epoch": 1.53, + "grad_norm": 0.3554962489210678, + "learning_rate": 2.8413928729616944e-05, + "loss": 1.0063, + "step": 16018 + }, + { + "epoch": 1.53, + "grad_norm": 0.2829986431358654, + "learning_rate": 2.8402883775044565e-05, + "loss": 1.0219, + "step": 16019 + }, + { + "epoch": 1.53, + "grad_norm": 0.3086857657862541, + "learning_rate": 2.8391840612235042e-05, + "loss": 0.9032, + "step": 16020 + }, + { + "epoch": 1.53, + "grad_norm": 0.3048440703471129, + "learning_rate": 2.8380799241464683e-05, + "loss": 0.9993, + "step": 16021 + }, + { + "epoch": 1.53, + "grad_norm": 0.34067067169677784, + "learning_rate": 2.836975966300983e-05, + "loss": 1.0338, + "step": 16022 + }, + { + "epoch": 1.53, + "grad_norm": 0.3109765525958761, + "learning_rate": 2.8358721877146765e-05, + "loss": 1.0415, + "step": 16023 + }, + { + "epoch": 1.53, + "grad_norm": 0.30694427897049553, + "learning_rate": 2.8347685884151732e-05, + "loss": 0.9965, + "step": 16024 + }, + { + "epoch": 1.53, + "grad_norm": 0.3444946649828595, + "learning_rate": 2.8336651684300864e-05, + "loss": 1.0438, + "step": 16025 + }, + { + "epoch": 1.53, + "grad_norm": 0.3778437519623611, + "learning_rate": 2.8325619277870362e-05, + "loss": 0.9219, + "step": 16026 + }, + { + "epoch": 1.53, + "grad_norm": 0.33063784246289424, + "learning_rate": 2.8314588665136256e-05, + "loss": 1.0192, + "step": 16027 + }, + { + "epoch": 1.53, + "grad_norm": 0.3254950450304844, + "learning_rate": 2.8303559846374605e-05, + "loss": 0.9738, + "step": 16028 + }, + { + "epoch": 1.53, + "grad_norm": 0.31068699561023916, + "learning_rate": 2.829253282186144e-05, + "loss": 0.9948, + "step": 16029 + }, + { + "epoch": 1.53, + "grad_norm": 0.33211414127970423, + "learning_rate": 2.8281507591872735e-05, + "loss": 1.0038, + "step": 16030 + }, + { + "epoch": 1.53, + "grad_norm": 0.33724060226397556, + "learning_rate": 2.827048415668435e-05, + "loss": 1.0097, + "step": 16031 + }, + { + "epoch": 1.53, + "grad_norm": 0.3057990850726954, + "learning_rate": 2.825946251657221e-05, + "loss": 1.0873, + "step": 16032 + }, + { + "epoch": 1.53, + "grad_norm": 0.314085786267021, + "learning_rate": 2.824844267181207e-05, + "loss": 1.0184, + "step": 16033 + }, + { + "epoch": 1.53, + "grad_norm": 0.41742305116695866, + "learning_rate": 2.8237424622679785e-05, + "loss": 0.999, + "step": 16034 + }, + { + "epoch": 1.53, + "grad_norm": 0.359973964732567, + "learning_rate": 2.8226408369450984e-05, + "loss": 1.0047, + "step": 16035 + }, + { + "epoch": 1.53, + "grad_norm": 0.27747159803621924, + "learning_rate": 2.821539391240149e-05, + "loss": 0.9129, + "step": 16036 + }, + { + "epoch": 1.53, + "grad_norm": 0.2990525354156681, + "learning_rate": 2.8204381251806834e-05, + "loss": 1.0824, + "step": 16037 + }, + { + "epoch": 1.53, + "grad_norm": 0.269317518024419, + "learning_rate": 2.8193370387942663e-05, + "loss": 0.8655, + "step": 16038 + }, + { + "epoch": 1.53, + "grad_norm": 0.2709630136473673, + "learning_rate": 2.8182361321084562e-05, + "loss": 0.955, + "step": 16039 + }, + { + "epoch": 1.53, + "grad_norm": 0.31069383649754656, + "learning_rate": 2.8171354051507982e-05, + "loss": 0.94, + "step": 16040 + }, + { + "epoch": 1.53, + "grad_norm": 0.3686642758796067, + "learning_rate": 2.8160348579488392e-05, + "loss": 1.0107, + "step": 16041 + }, + { + "epoch": 1.53, + "grad_norm": 0.2882260222233732, + "learning_rate": 2.8149344905301233e-05, + "loss": 1.0211, + "step": 16042 + }, + { + "epoch": 1.53, + "grad_norm": 0.32516092732864843, + "learning_rate": 2.8138343029221914e-05, + "loss": 1.0311, + "step": 16043 + }, + { + "epoch": 1.53, + "grad_norm": 0.3387112980959798, + "learning_rate": 2.8127342951525682e-05, + "loss": 1.0153, + "step": 16044 + }, + { + "epoch": 1.54, + "grad_norm": 0.3034673281894372, + "learning_rate": 2.8116344672487904e-05, + "loss": 1.0734, + "step": 16045 + }, + { + "epoch": 1.54, + "grad_norm": 0.34470797765143224, + "learning_rate": 2.8105348192383742e-05, + "loss": 0.9148, + "step": 16046 + }, + { + "epoch": 1.54, + "grad_norm": 0.31146684538621805, + "learning_rate": 2.809435351148846e-05, + "loss": 0.9994, + "step": 16047 + }, + { + "epoch": 1.54, + "grad_norm": 0.3138406557182258, + "learning_rate": 2.8083360630077106e-05, + "loss": 1.0786, + "step": 16048 + }, + { + "epoch": 1.54, + "grad_norm": 0.3029619141301558, + "learning_rate": 2.8072369548424917e-05, + "loss": 1.0175, + "step": 16049 + }, + { + "epoch": 1.54, + "grad_norm": 0.32746678947585955, + "learning_rate": 2.8061380266806848e-05, + "loss": 0.9824, + "step": 16050 + }, + { + "epoch": 1.54, + "grad_norm": 0.3572876897551119, + "learning_rate": 2.8050392785497982e-05, + "loss": 1.038, + "step": 16051 + }, + { + "epoch": 1.54, + "grad_norm": 0.2973505376019782, + "learning_rate": 2.803940710477322e-05, + "loss": 1.0141, + "step": 16052 + }, + { + "epoch": 1.54, + "grad_norm": 0.3138851204491961, + "learning_rate": 2.8028423224907563e-05, + "loss": 0.9635, + "step": 16053 + }, + { + "epoch": 1.54, + "grad_norm": 0.3509386558285649, + "learning_rate": 2.8017441146175806e-05, + "loss": 1.0249, + "step": 16054 + }, + { + "epoch": 1.54, + "grad_norm": 0.33218671511392767, + "learning_rate": 2.8006460868852825e-05, + "loss": 1.0156, + "step": 16055 + }, + { + "epoch": 1.54, + "grad_norm": 0.3118633253931291, + "learning_rate": 2.7995482393213413e-05, + "loss": 1.0512, + "step": 16056 + }, + { + "epoch": 1.54, + "grad_norm": 0.3389922877025385, + "learning_rate": 2.798450571953234e-05, + "loss": 1.056, + "step": 16057 + }, + { + "epoch": 1.54, + "grad_norm": 0.35211008375961433, + "learning_rate": 2.7973530848084228e-05, + "loss": 1.1353, + "step": 16058 + }, + { + "epoch": 1.54, + "grad_norm": 0.30648048879369866, + "learning_rate": 2.7962557779143783e-05, + "loss": 1.0381, + "step": 16059 + }, + { + "epoch": 1.54, + "grad_norm": 0.2860981157236097, + "learning_rate": 2.7951586512985627e-05, + "loss": 0.9838, + "step": 16060 + }, + { + "epoch": 1.54, + "grad_norm": 0.2583221536809254, + "learning_rate": 2.7940617049884275e-05, + "loss": 1.0334, + "step": 16061 + }, + { + "epoch": 1.54, + "grad_norm": 0.29919808764747113, + "learning_rate": 2.7929649390114277e-05, + "loss": 0.9753, + "step": 16062 + }, + { + "epoch": 1.54, + "grad_norm": 0.3027170900826791, + "learning_rate": 2.7918683533950086e-05, + "loss": 1.0012, + "step": 16063 + }, + { + "epoch": 1.54, + "grad_norm": 0.3223459061714809, + "learning_rate": 2.7907719481666172e-05, + "loss": 0.9973, + "step": 16064 + }, + { + "epoch": 1.54, + "grad_norm": 0.29642220125248164, + "learning_rate": 2.7896757233536853e-05, + "loss": 0.977, + "step": 16065 + }, + { + "epoch": 1.54, + "grad_norm": 0.33007467607995344, + "learning_rate": 2.7885796789836538e-05, + "loss": 1.0093, + "step": 16066 + }, + { + "epoch": 1.54, + "grad_norm": 0.33438219172239975, + "learning_rate": 2.7874838150839444e-05, + "loss": 0.9933, + "step": 16067 + }, + { + "epoch": 1.54, + "grad_norm": 0.3233699277700796, + "learning_rate": 2.7863881316819884e-05, + "loss": 0.9332, + "step": 16068 + }, + { + "epoch": 1.54, + "grad_norm": 0.2986776410956821, + "learning_rate": 2.785292628805197e-05, + "loss": 0.9919, + "step": 16069 + }, + { + "epoch": 1.54, + "grad_norm": 0.26876826389939523, + "learning_rate": 2.784197306480999e-05, + "loss": 1.0139, + "step": 16070 + }, + { + "epoch": 1.54, + "grad_norm": 0.3332730017616259, + "learning_rate": 2.7831021647367938e-05, + "loss": 1.0135, + "step": 16071 + }, + { + "epoch": 1.54, + "grad_norm": 0.27175589574432435, + "learning_rate": 2.7820072035999966e-05, + "loss": 0.9492, + "step": 16072 + }, + { + "epoch": 1.54, + "grad_norm": 0.3165451745670303, + "learning_rate": 2.780912423098002e-05, + "loss": 1.0621, + "step": 16073 + }, + { + "epoch": 1.54, + "grad_norm": 0.33145540468922213, + "learning_rate": 2.779817823258214e-05, + "loss": 1.1081, + "step": 16074 + }, + { + "epoch": 1.54, + "grad_norm": 0.3243518038643485, + "learning_rate": 2.7787234041080166e-05, + "loss": 1.0665, + "step": 16075 + }, + { + "epoch": 1.54, + "grad_norm": 0.2512592004232836, + "learning_rate": 2.777629165674811e-05, + "loss": 0.9981, + "step": 16076 + }, + { + "epoch": 1.54, + "grad_norm": 0.2805503507933718, + "learning_rate": 2.7765351079859712e-05, + "loss": 0.934, + "step": 16077 + }, + { + "epoch": 1.54, + "grad_norm": 0.3003983891042576, + "learning_rate": 2.7754412310688838e-05, + "loss": 0.9237, + "step": 16078 + }, + { + "epoch": 1.54, + "grad_norm": 0.31546637462898136, + "learning_rate": 2.7743475349509163e-05, + "loss": 1.1092, + "step": 16079 + }, + { + "epoch": 1.54, + "grad_norm": 0.30474834118667604, + "learning_rate": 2.7732540196594425e-05, + "loss": 1.0171, + "step": 16080 + }, + { + "epoch": 1.54, + "grad_norm": 0.3343351070689, + "learning_rate": 2.772160685221832e-05, + "loss": 1.0031, + "step": 16081 + }, + { + "epoch": 1.54, + "grad_norm": 0.28140154504685966, + "learning_rate": 2.77106753166544e-05, + "loss": 1.0587, + "step": 16082 + }, + { + "epoch": 1.54, + "grad_norm": 0.2693808744788165, + "learning_rate": 2.7699745590176262e-05, + "loss": 0.8558, + "step": 16083 + }, + { + "epoch": 1.54, + "grad_norm": 0.3214368389425974, + "learning_rate": 2.7688817673057433e-05, + "loss": 1.0251, + "step": 16084 + }, + { + "epoch": 1.54, + "grad_norm": 0.3316725582184951, + "learning_rate": 2.7677891565571424e-05, + "loss": 1.0729, + "step": 16085 + }, + { + "epoch": 1.54, + "grad_norm": 0.27745406689442437, + "learning_rate": 2.7666967267991583e-05, + "loss": 1.012, + "step": 16086 + }, + { + "epoch": 1.54, + "grad_norm": 0.2733598623947671, + "learning_rate": 2.7656044780591396e-05, + "loss": 0.9317, + "step": 16087 + }, + { + "epoch": 1.54, + "grad_norm": 0.322592918084245, + "learning_rate": 2.7645124103644103e-05, + "loss": 0.9795, + "step": 16088 + }, + { + "epoch": 1.54, + "grad_norm": 0.27599179379122457, + "learning_rate": 2.763420523742307e-05, + "loss": 1.0294, + "step": 16089 + }, + { + "epoch": 1.54, + "grad_norm": 0.3355775687409924, + "learning_rate": 2.7623288182201524e-05, + "loss": 1.0672, + "step": 16090 + }, + { + "epoch": 1.54, + "grad_norm": 0.2990456664076618, + "learning_rate": 2.7612372938252707e-05, + "loss": 1.0422, + "step": 16091 + }, + { + "epoch": 1.54, + "grad_norm": 0.2773670278239475, + "learning_rate": 2.7601459505849724e-05, + "loss": 1.0161, + "step": 16092 + }, + { + "epoch": 1.54, + "grad_norm": 0.30936638735934335, + "learning_rate": 2.759054788526575e-05, + "loss": 1.0051, + "step": 16093 + }, + { + "epoch": 1.54, + "grad_norm": 0.3092991768338473, + "learning_rate": 2.7579638076773782e-05, + "loss": 1.0181, + "step": 16094 + }, + { + "epoch": 1.54, + "grad_norm": 0.2674256497772369, + "learning_rate": 2.7568730080646933e-05, + "loss": 1.0667, + "step": 16095 + }, + { + "epoch": 1.54, + "grad_norm": 0.35349046342215673, + "learning_rate": 2.7557823897158065e-05, + "loss": 1.0219, + "step": 16096 + }, + { + "epoch": 1.54, + "grad_norm": 0.3259533178462659, + "learning_rate": 2.7546919526580238e-05, + "loss": 0.9799, + "step": 16097 + }, + { + "epoch": 1.54, + "grad_norm": 0.2797511659656376, + "learning_rate": 2.753601696918626e-05, + "loss": 1.046, + "step": 16098 + }, + { + "epoch": 1.54, + "grad_norm": 0.32647904422168844, + "learning_rate": 2.7525116225249038e-05, + "loss": 0.9169, + "step": 16099 + }, + { + "epoch": 1.54, + "grad_norm": 0.290896439548623, + "learning_rate": 2.751421729504129e-05, + "loss": 0.98, + "step": 16100 + }, + { + "epoch": 1.54, + "grad_norm": 0.3582837673749542, + "learning_rate": 2.750332017883581e-05, + "loss": 0.7857, + "step": 16101 + }, + { + "epoch": 1.54, + "grad_norm": 0.3335001445266211, + "learning_rate": 2.749242487690531e-05, + "loss": 0.9695, + "step": 16102 + }, + { + "epoch": 1.54, + "grad_norm": 0.3526987043570115, + "learning_rate": 2.7481531389522462e-05, + "loss": 1.072, + "step": 16103 + }, + { + "epoch": 1.54, + "grad_norm": 0.28394033302390487, + "learning_rate": 2.7470639716959855e-05, + "loss": 0.9385, + "step": 16104 + }, + { + "epoch": 1.54, + "grad_norm": 0.3475120058368341, + "learning_rate": 2.745974985949007e-05, + "loss": 1.0744, + "step": 16105 + }, + { + "epoch": 1.54, + "grad_norm": 0.28731623833667064, + "learning_rate": 2.7448861817385663e-05, + "loss": 1.0009, + "step": 16106 + }, + { + "epoch": 1.54, + "grad_norm": 0.2834261144642317, + "learning_rate": 2.7437975590919053e-05, + "loss": 0.9834, + "step": 16107 + }, + { + "epoch": 1.54, + "grad_norm": 0.32956816726972626, + "learning_rate": 2.742709118036273e-05, + "loss": 1.014, + "step": 16108 + }, + { + "epoch": 1.54, + "grad_norm": 0.314542634786329, + "learning_rate": 2.7416208585989013e-05, + "loss": 1.1795, + "step": 16109 + }, + { + "epoch": 1.54, + "grad_norm": 0.3488704223387836, + "learning_rate": 2.7405327808070347e-05, + "loss": 0.9258, + "step": 16110 + }, + { + "epoch": 1.54, + "grad_norm": 0.28757435197447806, + "learning_rate": 2.739444884687895e-05, + "loss": 0.9906, + "step": 16111 + }, + { + "epoch": 1.54, + "grad_norm": 0.3055393081208791, + "learning_rate": 2.738357170268713e-05, + "loss": 1.0861, + "step": 16112 + }, + { + "epoch": 1.54, + "grad_norm": 0.2937847885971275, + "learning_rate": 2.7372696375767036e-05, + "loss": 0.9705, + "step": 16113 + }, + { + "epoch": 1.54, + "grad_norm": 0.321618578630583, + "learning_rate": 2.7361822866390885e-05, + "loss": 1.0373, + "step": 16114 + }, + { + "epoch": 1.54, + "grad_norm": 0.3147461132406757, + "learning_rate": 2.735095117483074e-05, + "loss": 0.9759, + "step": 16115 + }, + { + "epoch": 1.54, + "grad_norm": 0.32258956581812026, + "learning_rate": 2.734008130135871e-05, + "loss": 0.9565, + "step": 16116 + }, + { + "epoch": 1.54, + "grad_norm": 0.3354621989891288, + "learning_rate": 2.7329213246246797e-05, + "loss": 1.0161, + "step": 16117 + }, + { + "epoch": 1.54, + "grad_norm": 0.3122365599068253, + "learning_rate": 2.7318347009767033e-05, + "loss": 1.0629, + "step": 16118 + }, + { + "epoch": 1.54, + "grad_norm": 0.29304166788841735, + "learning_rate": 2.730748259219128e-05, + "loss": 0.9578, + "step": 16119 + }, + { + "epoch": 1.54, + "grad_norm": 0.28893445039916993, + "learning_rate": 2.7296619993791496e-05, + "loss": 1.0236, + "step": 16120 + }, + { + "epoch": 1.54, + "grad_norm": 0.34598661411474807, + "learning_rate": 2.7285759214839458e-05, + "loss": 1.0906, + "step": 16121 + }, + { + "epoch": 1.54, + "grad_norm": 0.3173075061654457, + "learning_rate": 2.7274900255606983e-05, + "loss": 1.0145, + "step": 16122 + }, + { + "epoch": 1.54, + "grad_norm": 0.3086794480518638, + "learning_rate": 2.7264043116365847e-05, + "loss": 1.0335, + "step": 16123 + }, + { + "epoch": 1.54, + "grad_norm": 0.32370008057901767, + "learning_rate": 2.725318779738777e-05, + "loss": 0.9466, + "step": 16124 + }, + { + "epoch": 1.54, + "grad_norm": 0.32171205744164916, + "learning_rate": 2.724233429894436e-05, + "loss": 1.0579, + "step": 16125 + }, + { + "epoch": 1.54, + "grad_norm": 0.33450539779830624, + "learning_rate": 2.7231482621307258e-05, + "loss": 1.005, + "step": 16126 + }, + { + "epoch": 1.54, + "grad_norm": 0.29261808500357167, + "learning_rate": 2.7220632764748076e-05, + "loss": 1.0046, + "step": 16127 + }, + { + "epoch": 1.54, + "grad_norm": 0.3127735226677196, + "learning_rate": 2.7209784729538257e-05, + "loss": 1.058, + "step": 16128 + }, + { + "epoch": 1.54, + "grad_norm": 0.36583729622231237, + "learning_rate": 2.7198938515949356e-05, + "loss": 0.9598, + "step": 16129 + }, + { + "epoch": 1.54, + "grad_norm": 0.3047286669177891, + "learning_rate": 2.7188094124252715e-05, + "loss": 1.0631, + "step": 16130 + }, + { + "epoch": 1.54, + "grad_norm": 0.3023850677417078, + "learning_rate": 2.717725155471984e-05, + "loss": 1.0822, + "step": 16131 + }, + { + "epoch": 1.54, + "grad_norm": 0.3295611312620764, + "learning_rate": 2.7166410807621977e-05, + "loss": 1.152, + "step": 16132 + }, + { + "epoch": 1.54, + "grad_norm": 0.33756577176887786, + "learning_rate": 2.715557188323049e-05, + "loss": 1.049, + "step": 16133 + }, + { + "epoch": 1.54, + "grad_norm": 0.31639001244932335, + "learning_rate": 2.714473478181657e-05, + "loss": 1.0003, + "step": 16134 + }, + { + "epoch": 1.54, + "grad_norm": 0.2953950820942572, + "learning_rate": 2.7133899503651484e-05, + "loss": 0.9523, + "step": 16135 + }, + { + "epoch": 1.54, + "grad_norm": 0.296853514302945, + "learning_rate": 2.7123066049006285e-05, + "loss": 1.0998, + "step": 16136 + }, + { + "epoch": 1.54, + "grad_norm": 0.2912969934124045, + "learning_rate": 2.7112234418152228e-05, + "loss": 1.1765, + "step": 16137 + }, + { + "epoch": 1.54, + "grad_norm": 0.3093373091994, + "learning_rate": 2.7101404611360282e-05, + "loss": 1.0185, + "step": 16138 + }, + { + "epoch": 1.54, + "grad_norm": 0.3130543485145577, + "learning_rate": 2.709057662890153e-05, + "loss": 0.9911, + "step": 16139 + }, + { + "epoch": 1.54, + "grad_norm": 0.3104814181606734, + "learning_rate": 2.7079750471046895e-05, + "loss": 1.0821, + "step": 16140 + }, + { + "epoch": 1.54, + "grad_norm": 0.305476823403084, + "learning_rate": 2.706892613806733e-05, + "loss": 1.0557, + "step": 16141 + }, + { + "epoch": 1.54, + "grad_norm": 0.2989937054341642, + "learning_rate": 2.7058103630233746e-05, + "loss": 0.9903, + "step": 16142 + }, + { + "epoch": 1.54, + "grad_norm": 0.32305456270856975, + "learning_rate": 2.704728294781693e-05, + "loss": 0.9071, + "step": 16143 + }, + { + "epoch": 1.54, + "grad_norm": 0.3364324717744716, + "learning_rate": 2.703646409108772e-05, + "loss": 1.0411, + "step": 16144 + }, + { + "epoch": 1.54, + "grad_norm": 0.31856814796714256, + "learning_rate": 2.702564706031685e-05, + "loss": 1.008, + "step": 16145 + }, + { + "epoch": 1.54, + "grad_norm": 0.36408059766885326, + "learning_rate": 2.7014831855775048e-05, + "loss": 0.9997, + "step": 16146 + }, + { + "epoch": 1.54, + "grad_norm": 0.3846100972858552, + "learning_rate": 2.7004018477732918e-05, + "loss": 0.9998, + "step": 16147 + }, + { + "epoch": 1.54, + "grad_norm": 0.3256279114666956, + "learning_rate": 2.6993206926461135e-05, + "loss": 0.9877, + "step": 16148 + }, + { + "epoch": 1.54, + "grad_norm": 0.3061751716778186, + "learning_rate": 2.6982397202230203e-05, + "loss": 1.0538, + "step": 16149 + }, + { + "epoch": 1.55, + "grad_norm": 0.3230845335512633, + "learning_rate": 2.6971589305310675e-05, + "loss": 1.0338, + "step": 16150 + }, + { + "epoch": 1.55, + "grad_norm": 0.30982904754644125, + "learning_rate": 2.696078323597301e-05, + "loss": 1.0395, + "step": 16151 + }, + { + "epoch": 1.55, + "grad_norm": 0.3319955742828269, + "learning_rate": 2.694997899448769e-05, + "loss": 1.0619, + "step": 16152 + }, + { + "epoch": 1.55, + "grad_norm": 0.3185260237595713, + "learning_rate": 2.6939176581125014e-05, + "loss": 0.9788, + "step": 16153 + }, + { + "epoch": 1.55, + "grad_norm": 0.31189698663859194, + "learning_rate": 2.6928375996155398e-05, + "loss": 1.0832, + "step": 16154 + }, + { + "epoch": 1.55, + "grad_norm": 0.3277694041059853, + "learning_rate": 2.6917577239849066e-05, + "loss": 0.9024, + "step": 16155 + }, + { + "epoch": 1.55, + "grad_norm": 0.29952141844477853, + "learning_rate": 2.690678031247632e-05, + "loss": 1.0259, + "step": 16156 + }, + { + "epoch": 1.55, + "grad_norm": 0.29957352708630763, + "learning_rate": 2.6895985214307285e-05, + "loss": 1.0225, + "step": 16157 + }, + { + "epoch": 1.55, + "grad_norm": 0.3185913673948386, + "learning_rate": 2.688519194561221e-05, + "loss": 0.9439, + "step": 16158 + }, + { + "epoch": 1.55, + "grad_norm": 0.3202177893152143, + "learning_rate": 2.6874400506661134e-05, + "loss": 1.0269, + "step": 16159 + }, + { + "epoch": 1.55, + "grad_norm": 0.3187629066720692, + "learning_rate": 2.6863610897724178e-05, + "loss": 1.123, + "step": 16160 + }, + { + "epoch": 1.55, + "grad_norm": 0.3129029398018375, + "learning_rate": 2.6852823119071292e-05, + "loss": 0.9811, + "step": 16161 + }, + { + "epoch": 1.55, + "grad_norm": 0.3187493050197724, + "learning_rate": 2.6842037170972468e-05, + "loss": 1.0657, + "step": 16162 + }, + { + "epoch": 1.55, + "grad_norm": 0.308581015625846, + "learning_rate": 2.6831253053697657e-05, + "loss": 1.0793, + "step": 16163 + }, + { + "epoch": 1.55, + "grad_norm": 0.3370333607443117, + "learning_rate": 2.6820470767516746e-05, + "loss": 1.0886, + "step": 16164 + }, + { + "epoch": 1.55, + "grad_norm": 0.2955116301017712, + "learning_rate": 2.680969031269951e-05, + "loss": 1.0171, + "step": 16165 + }, + { + "epoch": 1.55, + "grad_norm": 0.3339182395229228, + "learning_rate": 2.6798911689515782e-05, + "loss": 1.1336, + "step": 16166 + }, + { + "epoch": 1.55, + "grad_norm": 0.2917783311757464, + "learning_rate": 2.6788134898235318e-05, + "loss": 1.0277, + "step": 16167 + }, + { + "epoch": 1.55, + "grad_norm": 0.3445148908056027, + "learning_rate": 2.6777359939127767e-05, + "loss": 0.9831, + "step": 16168 + }, + { + "epoch": 1.55, + "grad_norm": 0.3142385710306386, + "learning_rate": 2.6766586812462823e-05, + "loss": 1.0276, + "step": 16169 + }, + { + "epoch": 1.55, + "grad_norm": 0.3492661616766815, + "learning_rate": 2.675581551851001e-05, + "loss": 1.0175, + "step": 16170 + }, + { + "epoch": 1.55, + "grad_norm": 0.30986205852540366, + "learning_rate": 2.6745046057539014e-05, + "loss": 1.0847, + "step": 16171 + }, + { + "epoch": 1.55, + "grad_norm": 0.3251054736683284, + "learning_rate": 2.6734278429819248e-05, + "loss": 1.0089, + "step": 16172 + }, + { + "epoch": 1.55, + "grad_norm": 0.27518651880340256, + "learning_rate": 2.672351263562023e-05, + "loss": 0.9022, + "step": 16173 + }, + { + "epoch": 1.55, + "grad_norm": 0.3335187908899782, + "learning_rate": 2.6712748675211342e-05, + "loss": 1.0781, + "step": 16174 + }, + { + "epoch": 1.55, + "grad_norm": 0.3621948021593458, + "learning_rate": 2.6701986548862003e-05, + "loss": 1.0156, + "step": 16175 + }, + { + "epoch": 1.55, + "grad_norm": 0.2889429079253542, + "learning_rate": 2.6691226256841485e-05, + "loss": 0.9424, + "step": 16176 + }, + { + "epoch": 1.55, + "grad_norm": 0.3195583823830509, + "learning_rate": 2.6680467799419107e-05, + "loss": 1.135, + "step": 16177 + }, + { + "epoch": 1.55, + "grad_norm": 0.3012953974844471, + "learning_rate": 2.6669711176864098e-05, + "loss": 1.1361, + "step": 16178 + }, + { + "epoch": 1.55, + "grad_norm": 0.3392300870835931, + "learning_rate": 2.6658956389445688e-05, + "loss": 1.0247, + "step": 16179 + }, + { + "epoch": 1.55, + "grad_norm": 0.3258572684348612, + "learning_rate": 2.664820343743295e-05, + "loss": 1.0047, + "step": 16180 + }, + { + "epoch": 1.55, + "grad_norm": 0.2702066722884415, + "learning_rate": 2.6637452321095058e-05, + "loss": 1.0202, + "step": 16181 + }, + { + "epoch": 1.55, + "grad_norm": 0.28923231424019286, + "learning_rate": 2.6626703040700996e-05, + "loss": 1.0105, + "step": 16182 + }, + { + "epoch": 1.55, + "grad_norm": 0.30553676778343847, + "learning_rate": 2.6615955596519803e-05, + "loss": 1.048, + "step": 16183 + }, + { + "epoch": 1.55, + "grad_norm": 0.29206873844605424, + "learning_rate": 2.6605209988820444e-05, + "loss": 1.0128, + "step": 16184 + }, + { + "epoch": 1.55, + "grad_norm": 0.280607544844976, + "learning_rate": 2.659446621787186e-05, + "loss": 0.9652, + "step": 16185 + }, + { + "epoch": 1.55, + "grad_norm": 0.3283589186096802, + "learning_rate": 2.6583724283942855e-05, + "loss": 0.9598, + "step": 16186 + }, + { + "epoch": 1.55, + "grad_norm": 0.3176941440893366, + "learning_rate": 2.657298418730231e-05, + "loss": 1.11, + "step": 16187 + }, + { + "epoch": 1.55, + "grad_norm": 0.2926226077993872, + "learning_rate": 2.6562245928219e-05, + "loss": 0.995, + "step": 16188 + }, + { + "epoch": 1.55, + "grad_norm": 0.3007904261186584, + "learning_rate": 2.6551509506961602e-05, + "loss": 1.0172, + "step": 16189 + }, + { + "epoch": 1.55, + "grad_norm": 0.29163216523666796, + "learning_rate": 2.6540774923798883e-05, + "loss": 0.9939, + "step": 16190 + }, + { + "epoch": 1.55, + "grad_norm": 0.29644747830295953, + "learning_rate": 2.653004217899937e-05, + "loss": 1.0492, + "step": 16191 + }, + { + "epoch": 1.55, + "grad_norm": 0.31643422168779983, + "learning_rate": 2.651931127283179e-05, + "loss": 0.9643, + "step": 16192 + }, + { + "epoch": 1.55, + "grad_norm": 0.277213489868308, + "learning_rate": 2.6508582205564593e-05, + "loss": 0.9385, + "step": 16193 + }, + { + "epoch": 1.55, + "grad_norm": 0.28205920145440655, + "learning_rate": 2.649785497746635e-05, + "loss": 1.0096, + "step": 16194 + }, + { + "epoch": 1.55, + "grad_norm": 0.3206280813191778, + "learning_rate": 2.6487129588805448e-05, + "loss": 0.9944, + "step": 16195 + }, + { + "epoch": 1.55, + "grad_norm": 0.3236573887281505, + "learning_rate": 2.6476406039850355e-05, + "loss": 1.0025, + "step": 16196 + }, + { + "epoch": 1.55, + "grad_norm": 0.3329654579667973, + "learning_rate": 2.6465684330869354e-05, + "loss": 1.0827, + "step": 16197 + }, + { + "epoch": 1.55, + "grad_norm": 0.2904036454015087, + "learning_rate": 2.645496446213087e-05, + "loss": 0.9671, + "step": 16198 + }, + { + "epoch": 1.55, + "grad_norm": 0.3072662011296309, + "learning_rate": 2.6444246433903096e-05, + "loss": 1.0319, + "step": 16199 + }, + { + "epoch": 1.55, + "grad_norm": 0.3011465415264961, + "learning_rate": 2.6433530246454318e-05, + "loss": 1.0522, + "step": 16200 + }, + { + "epoch": 1.55, + "grad_norm": 0.34371232070750146, + "learning_rate": 2.6422815900052644e-05, + "loss": 1.0037, + "step": 16201 + }, + { + "epoch": 1.55, + "grad_norm": 0.30912384410686294, + "learning_rate": 2.641210339496627e-05, + "loss": 1.0813, + "step": 16202 + }, + { + "epoch": 1.55, + "grad_norm": 0.3665811978149081, + "learning_rate": 2.6401392731463236e-05, + "loss": 0.9769, + "step": 16203 + }, + { + "epoch": 1.55, + "grad_norm": 0.3086380768270777, + "learning_rate": 2.63906839098116e-05, + "loss": 0.9082, + "step": 16204 + }, + { + "epoch": 1.55, + "grad_norm": 0.37015468571643206, + "learning_rate": 2.637997693027936e-05, + "loss": 1.0142, + "step": 16205 + }, + { + "epoch": 1.55, + "grad_norm": 0.3121585338178743, + "learning_rate": 2.63692717931345e-05, + "loss": 1.0253, + "step": 16206 + }, + { + "epoch": 1.55, + "grad_norm": 0.311410351822817, + "learning_rate": 2.6358568498644855e-05, + "loss": 1.0525, + "step": 16207 + }, + { + "epoch": 1.55, + "grad_norm": 0.34695581102247636, + "learning_rate": 2.6347867047078313e-05, + "loss": 1.0455, + "step": 16208 + }, + { + "epoch": 1.55, + "grad_norm": 0.33533718007775, + "learning_rate": 2.6337167438702726e-05, + "loss": 1.1569, + "step": 16209 + }, + { + "epoch": 1.55, + "grad_norm": 0.34385263449958775, + "learning_rate": 2.632646967378578e-05, + "loss": 1.0091, + "step": 16210 + }, + { + "epoch": 1.55, + "grad_norm": 0.27067728180966494, + "learning_rate": 2.631577375259523e-05, + "loss": 0.9951, + "step": 16211 + }, + { + "epoch": 1.55, + "grad_norm": 0.31672049278273146, + "learning_rate": 2.630507967539876e-05, + "loss": 0.9729, + "step": 16212 + }, + { + "epoch": 1.55, + "grad_norm": 0.3123434125568876, + "learning_rate": 2.629438744246401e-05, + "loss": 1.0163, + "step": 16213 + }, + { + "epoch": 1.55, + "grad_norm": 0.3375881643867771, + "learning_rate": 2.628369705405851e-05, + "loss": 1.0414, + "step": 16214 + }, + { + "epoch": 1.55, + "grad_norm": 0.2854105376862624, + "learning_rate": 2.627300851044985e-05, + "loss": 0.9576, + "step": 16215 + }, + { + "epoch": 1.55, + "grad_norm": 0.33453443942234723, + "learning_rate": 2.6262321811905456e-05, + "loss": 0.9638, + "step": 16216 + }, + { + "epoch": 1.55, + "grad_norm": 0.30002674980874566, + "learning_rate": 2.6251636958692828e-05, + "loss": 1.0389, + "step": 16217 + }, + { + "epoch": 1.55, + "grad_norm": 0.30661104388142213, + "learning_rate": 2.6240953951079283e-05, + "loss": 0.9876, + "step": 16218 + }, + { + "epoch": 1.55, + "grad_norm": 0.3643265723949053, + "learning_rate": 2.623027278933228e-05, + "loss": 0.956, + "step": 16219 + }, + { + "epoch": 1.55, + "grad_norm": 0.347251445590381, + "learning_rate": 2.621959347371903e-05, + "loss": 1.0472, + "step": 16220 + }, + { + "epoch": 1.55, + "grad_norm": 0.3180148451883508, + "learning_rate": 2.6208916004506845e-05, + "loss": 1.0387, + "step": 16221 + }, + { + "epoch": 1.55, + "grad_norm": 0.3534899101142165, + "learning_rate": 2.619824038196289e-05, + "loss": 0.9441, + "step": 16222 + }, + { + "epoch": 1.55, + "grad_norm": 0.3110147029775816, + "learning_rate": 2.6187566606354387e-05, + "loss": 1.1275, + "step": 16223 + }, + { + "epoch": 1.55, + "grad_norm": 0.2907410239496143, + "learning_rate": 2.617689467794835e-05, + "loss": 1.0695, + "step": 16224 + }, + { + "epoch": 1.55, + "grad_norm": 0.3354911822190969, + "learning_rate": 2.616622459701198e-05, + "loss": 0.9401, + "step": 16225 + }, + { + "epoch": 1.55, + "grad_norm": 0.333207191698641, + "learning_rate": 2.615555636381222e-05, + "loss": 1.1625, + "step": 16226 + }, + { + "epoch": 1.55, + "grad_norm": 0.3284709177323467, + "learning_rate": 2.614488997861607e-05, + "loss": 0.9617, + "step": 16227 + }, + { + "epoch": 1.55, + "grad_norm": 0.2883747566476326, + "learning_rate": 2.6134225441690487e-05, + "loss": 1.0057, + "step": 16228 + }, + { + "epoch": 1.55, + "grad_norm": 0.3140560923780357, + "learning_rate": 2.6123562753302312e-05, + "loss": 1.0478, + "step": 16229 + }, + { + "epoch": 1.55, + "grad_norm": 0.3403813298428298, + "learning_rate": 2.6112901913718436e-05, + "loss": 1.1439, + "step": 16230 + }, + { + "epoch": 1.55, + "grad_norm": 0.32966314806629676, + "learning_rate": 2.6102242923205567e-05, + "loss": 1.0178, + "step": 16231 + }, + { + "epoch": 1.55, + "grad_norm": 0.29012725316360405, + "learning_rate": 2.609158578203057e-05, + "loss": 1.0689, + "step": 16232 + }, + { + "epoch": 1.55, + "grad_norm": 0.3652402085555326, + "learning_rate": 2.6080930490460064e-05, + "loss": 1.1218, + "step": 16233 + }, + { + "epoch": 1.55, + "grad_norm": 0.2691664166595972, + "learning_rate": 2.607027704876075e-05, + "loss": 1.0715, + "step": 16234 + }, + { + "epoch": 1.55, + "grad_norm": 0.30781086071759045, + "learning_rate": 2.6059625457199188e-05, + "loss": 1.0203, + "step": 16235 + }, + { + "epoch": 1.55, + "grad_norm": 0.3146184840742917, + "learning_rate": 2.604897571604201e-05, + "loss": 0.9994, + "step": 16236 + }, + { + "epoch": 1.55, + "grad_norm": 0.27199412025745595, + "learning_rate": 2.6038327825555653e-05, + "loss": 1.0632, + "step": 16237 + }, + { + "epoch": 1.55, + "grad_norm": 0.28419289826812894, + "learning_rate": 2.602768178600662e-05, + "loss": 0.8863, + "step": 16238 + }, + { + "epoch": 1.55, + "grad_norm": 0.3081262677155966, + "learning_rate": 2.601703759766134e-05, + "loss": 1.1481, + "step": 16239 + }, + { + "epoch": 1.55, + "grad_norm": 0.2981938391417738, + "learning_rate": 2.600639526078623e-05, + "loss": 1.033, + "step": 16240 + }, + { + "epoch": 1.55, + "grad_norm": 0.2729417506560446, + "learning_rate": 2.5995754775647552e-05, + "loss": 1.1245, + "step": 16241 + }, + { + "epoch": 1.55, + "grad_norm": 0.29423633376751734, + "learning_rate": 2.5985116142511647e-05, + "loss": 1.0273, + "step": 16242 + }, + { + "epoch": 1.55, + "grad_norm": 0.32630167917821046, + "learning_rate": 2.59744793616447e-05, + "loss": 0.9783, + "step": 16243 + }, + { + "epoch": 1.55, + "grad_norm": 0.3145286781431872, + "learning_rate": 2.5963844433312922e-05, + "loss": 1.0731, + "step": 16244 + }, + { + "epoch": 1.55, + "grad_norm": 0.30613397506897627, + "learning_rate": 2.595321135778247e-05, + "loss": 1.0477, + "step": 16245 + }, + { + "epoch": 1.55, + "grad_norm": 0.31303457850010713, + "learning_rate": 2.5942580135319473e-05, + "loss": 0.9132, + "step": 16246 + }, + { + "epoch": 1.55, + "grad_norm": 0.29917003593183206, + "learning_rate": 2.593195076618993e-05, + "loss": 1.1066, + "step": 16247 + }, + { + "epoch": 1.55, + "grad_norm": 0.3231367027640212, + "learning_rate": 2.5921323250659856e-05, + "loss": 1.0099, + "step": 16248 + }, + { + "epoch": 1.55, + "grad_norm": 0.35736995179620346, + "learning_rate": 2.5910697588995257e-05, + "loss": 1.097, + "step": 16249 + }, + { + "epoch": 1.55, + "grad_norm": 0.305799001561102, + "learning_rate": 2.5900073781461988e-05, + "loss": 1.0301, + "step": 16250 + }, + { + "epoch": 1.55, + "grad_norm": 0.2883150433300857, + "learning_rate": 2.588945182832596e-05, + "loss": 1.0345, + "step": 16251 + }, + { + "epoch": 1.55, + "grad_norm": 0.326710429268152, + "learning_rate": 2.5878831729852937e-05, + "loss": 0.9362, + "step": 16252 + }, + { + "epoch": 1.55, + "grad_norm": 0.2854512610215256, + "learning_rate": 2.5868213486308777e-05, + "loss": 0.9733, + "step": 16253 + }, + { + "epoch": 1.56, + "grad_norm": 0.3221232983967012, + "learning_rate": 2.585759709795914e-05, + "loss": 0.9425, + "step": 16254 + }, + { + "epoch": 1.56, + "grad_norm": 0.32784633632962373, + "learning_rate": 2.5846982565069766e-05, + "loss": 0.983, + "step": 16255 + }, + { + "epoch": 1.56, + "grad_norm": 0.312163826769028, + "learning_rate": 2.583636988790622e-05, + "loss": 0.8363, + "step": 16256 + }, + { + "epoch": 1.56, + "grad_norm": 0.29372971534802444, + "learning_rate": 2.582575906673417e-05, + "loss": 0.9768, + "step": 16257 + }, + { + "epoch": 1.56, + "grad_norm": 0.31083232354985196, + "learning_rate": 2.5815150101819042e-05, + "loss": 1.0491, + "step": 16258 + }, + { + "epoch": 1.56, + "grad_norm": 0.2897065494854459, + "learning_rate": 2.5804542993426473e-05, + "loss": 1.0734, + "step": 16259 + }, + { + "epoch": 1.56, + "grad_norm": 0.32406907061750967, + "learning_rate": 2.5793937741821818e-05, + "loss": 1.015, + "step": 16260 + }, + { + "epoch": 1.56, + "grad_norm": 0.2920174355899901, + "learning_rate": 2.5783334347270526e-05, + "loss": 1.1353, + "step": 16261 + }, + { + "epoch": 1.56, + "grad_norm": 0.3075255214935607, + "learning_rate": 2.5772732810037915e-05, + "loss": 1.0329, + "step": 16262 + }, + { + "epoch": 1.56, + "grad_norm": 0.27227650130907566, + "learning_rate": 2.5762133130389344e-05, + "loss": 0.8997, + "step": 16263 + }, + { + "epoch": 1.56, + "grad_norm": 0.2962758768411589, + "learning_rate": 2.5751535308590025e-05, + "loss": 1.0101, + "step": 16264 + }, + { + "epoch": 1.56, + "grad_norm": 0.305908174762293, + "learning_rate": 2.5740939344905192e-05, + "loss": 0.9183, + "step": 16265 + }, + { + "epoch": 1.56, + "grad_norm": 0.342875157511692, + "learning_rate": 2.5730345239600016e-05, + "loss": 1.049, + "step": 16266 + }, + { + "epoch": 1.56, + "grad_norm": 0.2782572031083594, + "learning_rate": 2.571975299293966e-05, + "loss": 0.9096, + "step": 16267 + }, + { + "epoch": 1.56, + "grad_norm": 0.28381479339610705, + "learning_rate": 2.570916260518915e-05, + "loss": 0.8623, + "step": 16268 + }, + { + "epoch": 1.56, + "grad_norm": 0.3032386873465054, + "learning_rate": 2.5698574076613524e-05, + "loss": 0.9572, + "step": 16269 + }, + { + "epoch": 1.56, + "grad_norm": 0.3476781337784424, + "learning_rate": 2.568798740747781e-05, + "loss": 0.9727, + "step": 16270 + }, + { + "epoch": 1.56, + "grad_norm": 0.3004830618895389, + "learning_rate": 2.5677402598046885e-05, + "loss": 0.9686, + "step": 16271 + }, + { + "epoch": 1.56, + "grad_norm": 0.3082404702849511, + "learning_rate": 2.5666819648585672e-05, + "loss": 0.9954, + "step": 16272 + }, + { + "epoch": 1.56, + "grad_norm": 0.29824712500683725, + "learning_rate": 2.5656238559359014e-05, + "loss": 0.9272, + "step": 16273 + }, + { + "epoch": 1.56, + "grad_norm": 0.3259167579900139, + "learning_rate": 2.5645659330631744e-05, + "loss": 1.0166, + "step": 16274 + }, + { + "epoch": 1.56, + "grad_norm": 0.3085926360747677, + "learning_rate": 2.563508196266854e-05, + "loss": 1.0725, + "step": 16275 + }, + { + "epoch": 1.56, + "grad_norm": 0.33769152771227007, + "learning_rate": 2.5624506455734186e-05, + "loss": 0.9871, + "step": 16276 + }, + { + "epoch": 1.56, + "grad_norm": 0.3374803090093981, + "learning_rate": 2.5613932810093268e-05, + "loss": 1.0381, + "step": 16277 + }, + { + "epoch": 1.56, + "grad_norm": 0.35401154480299807, + "learning_rate": 2.5603361026010453e-05, + "loss": 1.0007, + "step": 16278 + }, + { + "epoch": 1.56, + "grad_norm": 0.2803020677859373, + "learning_rate": 2.5592791103750235e-05, + "loss": 1.0861, + "step": 16279 + }, + { + "epoch": 1.56, + "grad_norm": 0.3229761156040626, + "learning_rate": 2.5582223043577236e-05, + "loss": 0.9707, + "step": 16280 + }, + { + "epoch": 1.56, + "grad_norm": 0.3279705644711631, + "learning_rate": 2.5571656845755843e-05, + "loss": 1.1181, + "step": 16281 + }, + { + "epoch": 1.56, + "grad_norm": 0.3135047008854237, + "learning_rate": 2.5561092510550555e-05, + "loss": 1.0685, + "step": 16282 + }, + { + "epoch": 1.56, + "grad_norm": 0.30066090740789153, + "learning_rate": 2.5550530038225673e-05, + "loss": 1.0425, + "step": 16283 + }, + { + "epoch": 1.56, + "grad_norm": 0.27666217497442036, + "learning_rate": 2.5539969429045594e-05, + "loss": 0.965, + "step": 16284 + }, + { + "epoch": 1.56, + "grad_norm": 0.36176248450731296, + "learning_rate": 2.5529410683274523e-05, + "loss": 1.0929, + "step": 16285 + }, + { + "epoch": 1.56, + "grad_norm": 0.2879883181356551, + "learning_rate": 2.55188538011768e-05, + "loss": 1.0263, + "step": 16286 + }, + { + "epoch": 1.56, + "grad_norm": 0.306330082344432, + "learning_rate": 2.550829878301656e-05, + "loss": 1.0707, + "step": 16287 + }, + { + "epoch": 1.56, + "grad_norm": 0.3421608684221668, + "learning_rate": 2.5497745629057978e-05, + "loss": 1.1147, + "step": 16288 + }, + { + "epoch": 1.56, + "grad_norm": 0.3371361399716282, + "learning_rate": 2.5487194339565103e-05, + "loss": 0.9448, + "step": 16289 + }, + { + "epoch": 1.56, + "grad_norm": 0.3021755830632525, + "learning_rate": 2.5476644914802017e-05, + "loss": 1.0323, + "step": 16290 + }, + { + "epoch": 1.56, + "grad_norm": 0.32214967609529044, + "learning_rate": 2.546609735503277e-05, + "loss": 1.0957, + "step": 16291 + }, + { + "epoch": 1.56, + "grad_norm": 0.349251197288962, + "learning_rate": 2.545555166052124e-05, + "loss": 1.0009, + "step": 16292 + }, + { + "epoch": 1.56, + "grad_norm": 0.2877939160006927, + "learning_rate": 2.5445007831531377e-05, + "loss": 0.9825, + "step": 16293 + }, + { + "epoch": 1.56, + "grad_norm": 0.32730894918016323, + "learning_rate": 2.543446586832705e-05, + "loss": 1.0183, + "step": 16294 + }, + { + "epoch": 1.56, + "grad_norm": 0.3108202048280946, + "learning_rate": 2.5423925771172118e-05, + "loss": 1.1213, + "step": 16295 + }, + { + "epoch": 1.56, + "grad_norm": 0.3531216467205498, + "learning_rate": 2.541338754033027e-05, + "loss": 1.096, + "step": 16296 + }, + { + "epoch": 1.56, + "grad_norm": 0.30567331386503893, + "learning_rate": 2.540285117606531e-05, + "loss": 1.0168, + "step": 16297 + }, + { + "epoch": 1.56, + "grad_norm": 0.33321163744237375, + "learning_rate": 2.539231667864086e-05, + "loss": 1.0478, + "step": 16298 + }, + { + "epoch": 1.56, + "grad_norm": 0.254743839263242, + "learning_rate": 2.5381784048320568e-05, + "loss": 0.9628, + "step": 16299 + }, + { + "epoch": 1.56, + "grad_norm": 0.3331445780543616, + "learning_rate": 2.5371253285368023e-05, + "loss": 0.9948, + "step": 16300 + }, + { + "epoch": 1.56, + "grad_norm": 0.3263656514179077, + "learning_rate": 2.53607243900468e-05, + "loss": 0.9329, + "step": 16301 + }, + { + "epoch": 1.56, + "grad_norm": 0.31631340536146296, + "learning_rate": 2.535019736262032e-05, + "loss": 0.9752, + "step": 16302 + }, + { + "epoch": 1.56, + "grad_norm": 0.31982830786160216, + "learning_rate": 2.5339672203352106e-05, + "loss": 1.0598, + "step": 16303 + }, + { + "epoch": 1.56, + "grad_norm": 0.3378597423332257, + "learning_rate": 2.5329148912505486e-05, + "loss": 1.0126, + "step": 16304 + }, + { + "epoch": 1.56, + "grad_norm": 0.32062110810612077, + "learning_rate": 2.5318627490343872e-05, + "loss": 1.0993, + "step": 16305 + }, + { + "epoch": 1.56, + "grad_norm": 0.32970176523555705, + "learning_rate": 2.5308107937130475e-05, + "loss": 1.1589, + "step": 16306 + }, + { + "epoch": 1.56, + "grad_norm": 0.34339188002751814, + "learning_rate": 2.529759025312869e-05, + "loss": 0.9276, + "step": 16307 + }, + { + "epoch": 1.56, + "grad_norm": 0.31401355127199415, + "learning_rate": 2.528707443860162e-05, + "loss": 1.0724, + "step": 16308 + }, + { + "epoch": 1.56, + "grad_norm": 0.3189302400680238, + "learning_rate": 2.5276560493812495e-05, + "loss": 0.9572, + "step": 16309 + }, + { + "epoch": 1.56, + "grad_norm": 0.29745900442363415, + "learning_rate": 2.526604841902438e-05, + "loss": 1.0001, + "step": 16310 + }, + { + "epoch": 1.56, + "grad_norm": 0.32029594215843404, + "learning_rate": 2.5255538214500373e-05, + "loss": 1.1217, + "step": 16311 + }, + { + "epoch": 1.56, + "grad_norm": 0.32549690008938714, + "learning_rate": 2.5245029880503524e-05, + "loss": 0.9385, + "step": 16312 + }, + { + "epoch": 1.56, + "grad_norm": 0.2970182886124677, + "learning_rate": 2.523452341729676e-05, + "loss": 0.9238, + "step": 16313 + }, + { + "epoch": 1.56, + "grad_norm": 0.3077311012871637, + "learning_rate": 2.5224018825143038e-05, + "loss": 0.973, + "step": 16314 + }, + { + "epoch": 1.56, + "grad_norm": 0.3223145253060788, + "learning_rate": 2.5213516104305235e-05, + "loss": 0.9704, + "step": 16315 + }, + { + "epoch": 1.56, + "grad_norm": 0.3556847853643339, + "learning_rate": 2.5203015255046215e-05, + "loss": 1.0115, + "step": 16316 + }, + { + "epoch": 1.56, + "grad_norm": 0.3086566694695708, + "learning_rate": 2.519251627762872e-05, + "loss": 0.9283, + "step": 16317 + }, + { + "epoch": 1.56, + "grad_norm": 0.30912492513114426, + "learning_rate": 2.5182019172315564e-05, + "loss": 0.9561, + "step": 16318 + }, + { + "epoch": 1.56, + "grad_norm": 0.336905655495767, + "learning_rate": 2.5171523939369324e-05, + "loss": 1.038, + "step": 16319 + }, + { + "epoch": 1.56, + "grad_norm": 0.3249533651441686, + "learning_rate": 2.516103057905279e-05, + "loss": 1.0414, + "step": 16320 + }, + { + "epoch": 1.56, + "grad_norm": 0.3235212260714399, + "learning_rate": 2.5150539091628466e-05, + "loss": 0.9901, + "step": 16321 + }, + { + "epoch": 1.56, + "grad_norm": 0.34259719834641794, + "learning_rate": 2.5140049477358984e-05, + "loss": 0.9915, + "step": 16322 + }, + { + "epoch": 1.56, + "grad_norm": 0.32976386292253135, + "learning_rate": 2.512956173650677e-05, + "loss": 1.0233, + "step": 16323 + }, + { + "epoch": 1.56, + "grad_norm": 0.30257972583169085, + "learning_rate": 2.5119075869334362e-05, + "loss": 1.0008, + "step": 16324 + }, + { + "epoch": 1.56, + "grad_norm": 0.3259887539668984, + "learning_rate": 2.5108591876104115e-05, + "loss": 1.1019, + "step": 16325 + }, + { + "epoch": 1.56, + "grad_norm": 0.34938344136067545, + "learning_rate": 2.5098109757078415e-05, + "loss": 0.9849, + "step": 16326 + }, + { + "epoch": 1.56, + "grad_norm": 0.3491568885215946, + "learning_rate": 2.5087629512519605e-05, + "loss": 0.9459, + "step": 16327 + }, + { + "epoch": 1.56, + "grad_norm": 0.33245281468630133, + "learning_rate": 2.507715114268997e-05, + "loss": 1.0407, + "step": 16328 + }, + { + "epoch": 1.56, + "grad_norm": 0.3336404110531226, + "learning_rate": 2.5066674647851683e-05, + "loss": 0.9901, + "step": 16329 + }, + { + "epoch": 1.56, + "grad_norm": 0.29620029804871295, + "learning_rate": 2.5056200028266952e-05, + "loss": 1.0194, + "step": 16330 + }, + { + "epoch": 1.56, + "grad_norm": 0.32190849912910596, + "learning_rate": 2.504572728419795e-05, + "loss": 0.9969, + "step": 16331 + }, + { + "epoch": 1.56, + "grad_norm": 0.3306330719994334, + "learning_rate": 2.5035256415906706e-05, + "loss": 1.1306, + "step": 16332 + }, + { + "epoch": 1.56, + "grad_norm": 0.388397934008338, + "learning_rate": 2.502478742365528e-05, + "loss": 0.9498, + "step": 16333 + }, + { + "epoch": 1.56, + "grad_norm": 0.29814076553579955, + "learning_rate": 2.5014320307705674e-05, + "loss": 1.0222, + "step": 16334 + }, + { + "epoch": 1.56, + "grad_norm": 0.29015240564329725, + "learning_rate": 2.5003855068319848e-05, + "loss": 1.0934, + "step": 16335 + }, + { + "epoch": 1.56, + "grad_norm": 0.31644856529384263, + "learning_rate": 2.499339170575966e-05, + "loss": 1.0176, + "step": 16336 + }, + { + "epoch": 1.56, + "grad_norm": 0.33408517583614655, + "learning_rate": 2.4982930220287015e-05, + "loss": 1.0097, + "step": 16337 + }, + { + "epoch": 1.56, + "grad_norm": 0.31310216984257083, + "learning_rate": 2.4972470612163657e-05, + "loss": 1.1124, + "step": 16338 + }, + { + "epoch": 1.56, + "grad_norm": 0.33666856103785325, + "learning_rate": 2.4962012881651396e-05, + "loss": 1.0587, + "step": 16339 + }, + { + "epoch": 1.56, + "grad_norm": 0.33701835380257755, + "learning_rate": 2.4951557029011874e-05, + "loss": 1.1543, + "step": 16340 + }, + { + "epoch": 1.56, + "grad_norm": 0.3172755121244004, + "learning_rate": 2.4941103054506854e-05, + "loss": 0.9998, + "step": 16341 + }, + { + "epoch": 1.56, + "grad_norm": 0.3014479914176879, + "learning_rate": 2.4930650958397883e-05, + "loss": 1.0007, + "step": 16342 + }, + { + "epoch": 1.56, + "grad_norm": 0.24946681372030238, + "learning_rate": 2.4920200740946576e-05, + "loss": 0.8923, + "step": 16343 + }, + { + "epoch": 1.56, + "grad_norm": 0.2875477782426157, + "learning_rate": 2.4909752402414398e-05, + "loss": 1.008, + "step": 16344 + }, + { + "epoch": 1.56, + "grad_norm": 0.31628032107524867, + "learning_rate": 2.489930594306289e-05, + "loss": 1.0157, + "step": 16345 + }, + { + "epoch": 1.56, + "grad_norm": 0.31248261066633665, + "learning_rate": 2.4888861363153394e-05, + "loss": 1.0665, + "step": 16346 + }, + { + "epoch": 1.56, + "grad_norm": 0.3253790124009655, + "learning_rate": 2.48784186629474e-05, + "loss": 1.0442, + "step": 16347 + }, + { + "epoch": 1.56, + "grad_norm": 0.3182227395292858, + "learning_rate": 2.486797784270617e-05, + "loss": 1.0579, + "step": 16348 + }, + { + "epoch": 1.56, + "grad_norm": 0.31731640794975213, + "learning_rate": 2.4857538902691036e-05, + "loss": 1.0048, + "step": 16349 + }, + { + "epoch": 1.56, + "grad_norm": 0.3868128366665495, + "learning_rate": 2.484710184316319e-05, + "loss": 0.9443, + "step": 16350 + }, + { + "epoch": 1.56, + "grad_norm": 0.3186496938579689, + "learning_rate": 2.4836666664383857e-05, + "loss": 0.9807, + "step": 16351 + }, + { + "epoch": 1.56, + "grad_norm": 0.31290799750604875, + "learning_rate": 2.4826233366614203e-05, + "loss": 1.0349, + "step": 16352 + }, + { + "epoch": 1.56, + "grad_norm": 0.3260619603338549, + "learning_rate": 2.4815801950115268e-05, + "loss": 1.1068, + "step": 16353 + }, + { + "epoch": 1.56, + "grad_norm": 0.3258820985245674, + "learning_rate": 2.4805372415148152e-05, + "loss": 1.0305, + "step": 16354 + }, + { + "epoch": 1.56, + "grad_norm": 0.2990190063235327, + "learning_rate": 2.4794944761973847e-05, + "loss": 0.9625, + "step": 16355 + }, + { + "epoch": 1.56, + "grad_norm": 0.2562014851118758, + "learning_rate": 2.4784518990853343e-05, + "loss": 1.0479, + "step": 16356 + }, + { + "epoch": 1.56, + "grad_norm": 0.36438921519324663, + "learning_rate": 2.477409510204749e-05, + "loss": 0.9093, + "step": 16357 + }, + { + "epoch": 1.56, + "grad_norm": 0.3491100278896006, + "learning_rate": 2.476367309581722e-05, + "loss": 1.0199, + "step": 16358 + }, + { + "epoch": 1.57, + "grad_norm": 0.30642082647592633, + "learning_rate": 2.475325297242328e-05, + "loss": 0.9988, + "step": 16359 + }, + { + "epoch": 1.57, + "grad_norm": 0.3169029137758554, + "learning_rate": 2.4742834732126473e-05, + "loss": 1.0584, + "step": 16360 + }, + { + "epoch": 1.57, + "grad_norm": 0.30205205746685293, + "learning_rate": 2.4732418375187528e-05, + "loss": 0.9436, + "step": 16361 + }, + { + "epoch": 1.57, + "grad_norm": 0.3264155977655207, + "learning_rate": 2.4722003901867153e-05, + "loss": 0.929, + "step": 16362 + }, + { + "epoch": 1.57, + "grad_norm": 0.2691580794782444, + "learning_rate": 2.4711591312425898e-05, + "loss": 0.9011, + "step": 16363 + }, + { + "epoch": 1.57, + "grad_norm": 0.332283670407174, + "learning_rate": 2.470118060712443e-05, + "loss": 0.9591, + "step": 16364 + }, + { + "epoch": 1.57, + "grad_norm": 0.2884821429560407, + "learning_rate": 2.46907717862232e-05, + "loss": 1.0817, + "step": 16365 + }, + { + "epoch": 1.57, + "grad_norm": 0.3181959663484106, + "learning_rate": 2.4680364849982773e-05, + "loss": 1.0161, + "step": 16366 + }, + { + "epoch": 1.57, + "grad_norm": 0.33180727609685795, + "learning_rate": 2.466995979866349e-05, + "loss": 1.0894, + "step": 16367 + }, + { + "epoch": 1.57, + "grad_norm": 0.2988756706770018, + "learning_rate": 2.465955663252587e-05, + "loss": 0.9593, + "step": 16368 + }, + { + "epoch": 1.57, + "grad_norm": 0.3169914016910384, + "learning_rate": 2.4649155351830156e-05, + "loss": 1.0631, + "step": 16369 + }, + { + "epoch": 1.57, + "grad_norm": 0.33017298025522296, + "learning_rate": 2.4638755956836724e-05, + "loss": 1.0364, + "step": 16370 + }, + { + "epoch": 1.57, + "grad_norm": 0.31821142057435436, + "learning_rate": 2.4628358447805755e-05, + "loss": 0.9142, + "step": 16371 + }, + { + "epoch": 1.57, + "grad_norm": 0.35334902156767223, + "learning_rate": 2.4617962824997487e-05, + "loss": 1.0385, + "step": 16372 + }, + { + "epoch": 1.57, + "grad_norm": 0.3592865193806289, + "learning_rate": 2.46075690886721e-05, + "loss": 1.0871, + "step": 16373 + }, + { + "epoch": 1.57, + "grad_norm": 0.33834311556797114, + "learning_rate": 2.459717723908965e-05, + "loss": 0.9858, + "step": 16374 + }, + { + "epoch": 1.57, + "grad_norm": 0.32859991222294516, + "learning_rate": 2.4586787276510227e-05, + "loss": 0.975, + "step": 16375 + }, + { + "epoch": 1.57, + "grad_norm": 0.3186903144899079, + "learning_rate": 2.4576399201193857e-05, + "loss": 0.9958, + "step": 16376 + }, + { + "epoch": 1.57, + "grad_norm": 0.31862965424795536, + "learning_rate": 2.4566013013400512e-05, + "loss": 1.1206, + "step": 16377 + }, + { + "epoch": 1.57, + "grad_norm": 0.3132686604163879, + "learning_rate": 2.455562871339009e-05, + "loss": 0.9504, + "step": 16378 + }, + { + "epoch": 1.57, + "grad_norm": 0.26982747995402795, + "learning_rate": 2.4545246301422488e-05, + "loss": 1.0079, + "step": 16379 + }, + { + "epoch": 1.57, + "grad_norm": 0.35591615851365777, + "learning_rate": 2.4534865777757465e-05, + "loss": 1.0167, + "step": 16380 + }, + { + "epoch": 1.57, + "grad_norm": 0.2967037741350534, + "learning_rate": 2.4524487142654917e-05, + "loss": 0.9575, + "step": 16381 + }, + { + "epoch": 1.57, + "grad_norm": 0.334560386401181, + "learning_rate": 2.451411039637448e-05, + "loss": 1.0469, + "step": 16382 + }, + { + "epoch": 1.57, + "grad_norm": 0.3340457086612969, + "learning_rate": 2.4503735539175898e-05, + "loss": 1.113, + "step": 16383 + }, + { + "epoch": 1.57, + "grad_norm": 0.31797467316500516, + "learning_rate": 2.4493362571318757e-05, + "loss": 1.0334, + "step": 16384 + }, + { + "epoch": 1.57, + "grad_norm": 0.3244431684949749, + "learning_rate": 2.44829914930627e-05, + "loss": 1.0799, + "step": 16385 + }, + { + "epoch": 1.57, + "grad_norm": 0.3350344894988579, + "learning_rate": 2.4472622304667214e-05, + "loss": 0.9095, + "step": 16386 + }, + { + "epoch": 1.57, + "grad_norm": 0.32842641712639564, + "learning_rate": 2.446225500639182e-05, + "loss": 1.1026, + "step": 16387 + }, + { + "epoch": 1.57, + "grad_norm": 0.29926054162260785, + "learning_rate": 2.445188959849597e-05, + "loss": 1.1026, + "step": 16388 + }, + { + "epoch": 1.57, + "grad_norm": 0.31382678948373915, + "learning_rate": 2.4441526081239087e-05, + "loss": 0.9272, + "step": 16389 + }, + { + "epoch": 1.57, + "grad_norm": 0.3259399502885661, + "learning_rate": 2.4431164454880473e-05, + "loss": 0.9961, + "step": 16390 + }, + { + "epoch": 1.57, + "grad_norm": 0.3470773576710722, + "learning_rate": 2.442080471967949e-05, + "loss": 1.098, + "step": 16391 + }, + { + "epoch": 1.57, + "grad_norm": 0.28653954598859926, + "learning_rate": 2.4410446875895343e-05, + "loss": 1.0664, + "step": 16392 + }, + { + "epoch": 1.57, + "grad_norm": 0.28760476722767386, + "learning_rate": 2.4400090923787278e-05, + "loss": 1.0592, + "step": 16393 + }, + { + "epoch": 1.57, + "grad_norm": 0.26754463395894595, + "learning_rate": 2.4389736863614433e-05, + "loss": 1.0187, + "step": 16394 + }, + { + "epoch": 1.57, + "grad_norm": 0.3149949951528966, + "learning_rate": 2.4379384695635988e-05, + "loss": 0.9535, + "step": 16395 + }, + { + "epoch": 1.57, + "grad_norm": 0.2930768936123896, + "learning_rate": 2.4369034420110926e-05, + "loss": 0.9664, + "step": 16396 + }, + { + "epoch": 1.57, + "grad_norm": 0.2913720754979205, + "learning_rate": 2.4358686037298306e-05, + "loss": 1.0048, + "step": 16397 + }, + { + "epoch": 1.57, + "grad_norm": 0.32541082469709887, + "learning_rate": 2.4348339547457145e-05, + "loss": 0.885, + "step": 16398 + }, + { + "epoch": 1.57, + "grad_norm": 0.3507458160382085, + "learning_rate": 2.4337994950846298e-05, + "loss": 1.0048, + "step": 16399 + }, + { + "epoch": 1.57, + "grad_norm": 0.2700426381756711, + "learning_rate": 2.432765224772472e-05, + "loss": 1.0543, + "step": 16400 + }, + { + "epoch": 1.57, + "grad_norm": 0.3514025719445595, + "learning_rate": 2.4317311438351133e-05, + "loss": 1.0197, + "step": 16401 + }, + { + "epoch": 1.57, + "grad_norm": 0.31712102806623377, + "learning_rate": 2.430697252298445e-05, + "loss": 1.0613, + "step": 16402 + }, + { + "epoch": 1.57, + "grad_norm": 0.3446318799068381, + "learning_rate": 2.4296635501883324e-05, + "loss": 0.9966, + "step": 16403 + }, + { + "epoch": 1.57, + "grad_norm": 0.2889858160425799, + "learning_rate": 2.4286300375306504e-05, + "loss": 1.0091, + "step": 16404 + }, + { + "epoch": 1.57, + "grad_norm": 0.32830224170171357, + "learning_rate": 2.4275967143512568e-05, + "loss": 1.0598, + "step": 16405 + }, + { + "epoch": 1.57, + "grad_norm": 0.28095192962301324, + "learning_rate": 2.426563580676018e-05, + "loss": 1.0875, + "step": 16406 + }, + { + "epoch": 1.57, + "grad_norm": 0.277132070119135, + "learning_rate": 2.4255306365307795e-05, + "loss": 0.9799, + "step": 16407 + }, + { + "epoch": 1.57, + "grad_norm": 0.3638379568317833, + "learning_rate": 2.424497881941402e-05, + "loss": 1.0099, + "step": 16408 + }, + { + "epoch": 1.57, + "grad_norm": 0.301546308784015, + "learning_rate": 2.423465316933724e-05, + "loss": 1.0319, + "step": 16409 + }, + { + "epoch": 1.57, + "grad_norm": 0.30701193927540454, + "learning_rate": 2.422432941533591e-05, + "loss": 0.928, + "step": 16410 + }, + { + "epoch": 1.57, + "grad_norm": 0.28948609650352547, + "learning_rate": 2.4214007557668327e-05, + "loss": 1.04, + "step": 16411 + }, + { + "epoch": 1.57, + "grad_norm": 0.2700328212047961, + "learning_rate": 2.420368759659286e-05, + "loss": 0.973, + "step": 16412 + }, + { + "epoch": 1.57, + "grad_norm": 0.35022282318032233, + "learning_rate": 2.4193369532367725e-05, + "loss": 1.0815, + "step": 16413 + }, + { + "epoch": 1.57, + "grad_norm": 0.33365594122543435, + "learning_rate": 2.418305336525116e-05, + "loss": 0.9561, + "step": 16414 + }, + { + "epoch": 1.57, + "grad_norm": 0.33787017541622816, + "learning_rate": 2.4172739095501328e-05, + "loss": 1.0455, + "step": 16415 + }, + { + "epoch": 1.57, + "grad_norm": 0.28600574680776697, + "learning_rate": 2.4162426723376364e-05, + "loss": 1.0078, + "step": 16416 + }, + { + "epoch": 1.57, + "grad_norm": 0.33978679346422, + "learning_rate": 2.415211624913436e-05, + "loss": 1.0114, + "step": 16417 + }, + { + "epoch": 1.57, + "grad_norm": 0.2975200281381104, + "learning_rate": 2.414180767303328e-05, + "loss": 0.9385, + "step": 16418 + }, + { + "epoch": 1.57, + "grad_norm": 0.3634473912322298, + "learning_rate": 2.413150099533118e-05, + "loss": 0.9906, + "step": 16419 + }, + { + "epoch": 1.57, + "grad_norm": 0.30006506214494655, + "learning_rate": 2.41211962162859e-05, + "loss": 0.9771, + "step": 16420 + }, + { + "epoch": 1.57, + "grad_norm": 0.30342159891830855, + "learning_rate": 2.4110893336155394e-05, + "loss": 0.9809, + "step": 16421 + }, + { + "epoch": 1.57, + "grad_norm": 0.32559582195655645, + "learning_rate": 2.410059235519746e-05, + "loss": 1.0813, + "step": 16422 + }, + { + "epoch": 1.57, + "grad_norm": 0.305032164597246, + "learning_rate": 2.4090293273669938e-05, + "loss": 1.014, + "step": 16423 + }, + { + "epoch": 1.57, + "grad_norm": 0.30915278787456085, + "learning_rate": 2.4079996091830513e-05, + "loss": 0.9407, + "step": 16424 + }, + { + "epoch": 1.57, + "grad_norm": 0.27933338702801763, + "learning_rate": 2.4069700809936923e-05, + "loss": 1.1147, + "step": 16425 + }, + { + "epoch": 1.57, + "grad_norm": 0.30221178908182605, + "learning_rate": 2.405940742824676e-05, + "loss": 1.0438, + "step": 16426 + }, + { + "epoch": 1.57, + "grad_norm": 0.3119144778720823, + "learning_rate": 2.4049115947017686e-05, + "loss": 1.0695, + "step": 16427 + }, + { + "epoch": 1.57, + "grad_norm": 0.35309979478816234, + "learning_rate": 2.403882636650716e-05, + "loss": 1.0317, + "step": 16428 + }, + { + "epoch": 1.57, + "grad_norm": 0.3464405108431666, + "learning_rate": 2.40285386869728e-05, + "loss": 1.0843, + "step": 16429 + }, + { + "epoch": 1.57, + "grad_norm": 0.28268420385878107, + "learning_rate": 2.4018252908671977e-05, + "loss": 0.8548, + "step": 16430 + }, + { + "epoch": 1.57, + "grad_norm": 0.33839880573552766, + "learning_rate": 2.4007969031862154e-05, + "loss": 1.0617, + "step": 16431 + }, + { + "epoch": 1.57, + "grad_norm": 0.28361234969823307, + "learning_rate": 2.3997687056800644e-05, + "loss": 1.1212, + "step": 16432 + }, + { + "epoch": 1.57, + "grad_norm": 0.32723548170111905, + "learning_rate": 2.3987406983744776e-05, + "loss": 1.1294, + "step": 16433 + }, + { + "epoch": 1.57, + "grad_norm": 0.30013468271803717, + "learning_rate": 2.3977128812951843e-05, + "loss": 1.0103, + "step": 16434 + }, + { + "epoch": 1.57, + "grad_norm": 0.30571150402955816, + "learning_rate": 2.396685254467902e-05, + "loss": 0.9737, + "step": 16435 + }, + { + "epoch": 1.57, + "grad_norm": 0.2885055160922447, + "learning_rate": 2.3956578179183498e-05, + "loss": 0.938, + "step": 16436 + }, + { + "epoch": 1.57, + "grad_norm": 0.31793359943653837, + "learning_rate": 2.3946305716722395e-05, + "loss": 1.0827, + "step": 16437 + }, + { + "epoch": 1.57, + "grad_norm": 0.30354906319289165, + "learning_rate": 2.3936035157552816e-05, + "loss": 1.045, + "step": 16438 + }, + { + "epoch": 1.57, + "grad_norm": 0.3393736207142222, + "learning_rate": 2.3925766501931735e-05, + "loss": 1.0003, + "step": 16439 + }, + { + "epoch": 1.57, + "grad_norm": 0.34556195202457823, + "learning_rate": 2.391549975011619e-05, + "loss": 0.9586, + "step": 16440 + }, + { + "epoch": 1.57, + "grad_norm": 0.2906738736096322, + "learning_rate": 2.3905234902363015e-05, + "loss": 1.0499, + "step": 16441 + }, + { + "epoch": 1.57, + "grad_norm": 0.2967333407089034, + "learning_rate": 2.3894971958929223e-05, + "loss": 1.0867, + "step": 16442 + }, + { + "epoch": 1.57, + "grad_norm": 0.32873982945402386, + "learning_rate": 2.388471092007156e-05, + "loss": 0.9282, + "step": 16443 + }, + { + "epoch": 1.57, + "grad_norm": 0.31048892874546147, + "learning_rate": 2.387445178604687e-05, + "loss": 1.0776, + "step": 16444 + }, + { + "epoch": 1.57, + "grad_norm": 0.27526775347931637, + "learning_rate": 2.386419455711184e-05, + "loss": 1.085, + "step": 16445 + }, + { + "epoch": 1.57, + "grad_norm": 0.34742599738529617, + "learning_rate": 2.3853939233523215e-05, + "loss": 0.9213, + "step": 16446 + }, + { + "epoch": 1.57, + "grad_norm": 0.3544211911205773, + "learning_rate": 2.3843685815537598e-05, + "loss": 1.0143, + "step": 16447 + }, + { + "epoch": 1.57, + "grad_norm": 0.2973093662750861, + "learning_rate": 2.3833434303411594e-05, + "loss": 1.0563, + "step": 16448 + }, + { + "epoch": 1.57, + "grad_norm": 0.3311223438255358, + "learning_rate": 2.382318469740178e-05, + "loss": 0.9342, + "step": 16449 + }, + { + "epoch": 1.57, + "grad_norm": 0.31930667696746867, + "learning_rate": 2.3812936997764658e-05, + "loss": 1.0472, + "step": 16450 + }, + { + "epoch": 1.57, + "grad_norm": 0.31747820413098754, + "learning_rate": 2.3802691204756643e-05, + "loss": 1.0577, + "step": 16451 + }, + { + "epoch": 1.57, + "grad_norm": 0.3950338067681003, + "learning_rate": 2.3792447318634203e-05, + "loss": 1.1009, + "step": 16452 + }, + { + "epoch": 1.57, + "grad_norm": 0.2990173265115576, + "learning_rate": 2.378220533965364e-05, + "loss": 1.0613, + "step": 16453 + }, + { + "epoch": 1.57, + "grad_norm": 0.3186221621138088, + "learning_rate": 2.3771965268071284e-05, + "loss": 0.9613, + "step": 16454 + }, + { + "epoch": 1.57, + "grad_norm": 0.3311440461071414, + "learning_rate": 2.3761727104143396e-05, + "loss": 1.0399, + "step": 16455 + }, + { + "epoch": 1.57, + "grad_norm": 0.36987638153277663, + "learning_rate": 2.3751490848126247e-05, + "loss": 1.0997, + "step": 16456 + }, + { + "epoch": 1.57, + "grad_norm": 0.2959259805818635, + "learning_rate": 2.374125650027591e-05, + "loss": 1.0383, + "step": 16457 + }, + { + "epoch": 1.57, + "grad_norm": 0.3382679436944268, + "learning_rate": 2.3731024060848573e-05, + "loss": 1.1026, + "step": 16458 + }, + { + "epoch": 1.57, + "grad_norm": 0.296107830922121, + "learning_rate": 2.3720793530100316e-05, + "loss": 1.0165, + "step": 16459 + }, + { + "epoch": 1.57, + "grad_norm": 0.34076347659684847, + "learning_rate": 2.3710564908287115e-05, + "loss": 1.0304, + "step": 16460 + }, + { + "epoch": 1.57, + "grad_norm": 0.30742987656208126, + "learning_rate": 2.3700338195664995e-05, + "loss": 1.0152, + "step": 16461 + }, + { + "epoch": 1.57, + "grad_norm": 0.34978299917330963, + "learning_rate": 2.369011339248981e-05, + "loss": 1.0272, + "step": 16462 + }, + { + "epoch": 1.58, + "grad_norm": 0.3248828882876703, + "learning_rate": 2.3679890499017554e-05, + "loss": 1.0288, + "step": 16463 + }, + { + "epoch": 1.58, + "grad_norm": 0.3249840271111718, + "learning_rate": 2.3669669515503977e-05, + "loss": 1.1088, + "step": 16464 + }, + { + "epoch": 1.58, + "grad_norm": 0.32228412332269685, + "learning_rate": 2.365945044220491e-05, + "loss": 1.0691, + "step": 16465 + }, + { + "epoch": 1.58, + "grad_norm": 0.3288090309930325, + "learning_rate": 2.364923327937606e-05, + "loss": 1.0843, + "step": 16466 + }, + { + "epoch": 1.58, + "grad_norm": 0.32405428290333965, + "learning_rate": 2.363901802727315e-05, + "loss": 1.0857, + "step": 16467 + }, + { + "epoch": 1.58, + "grad_norm": 0.3561903934226895, + "learning_rate": 2.3628804686151752e-05, + "loss": 0.9437, + "step": 16468 + }, + { + "epoch": 1.58, + "grad_norm": 0.29239950958097155, + "learning_rate": 2.3618593256267574e-05, + "loss": 1.0677, + "step": 16469 + }, + { + "epoch": 1.58, + "grad_norm": 0.34435253305821095, + "learning_rate": 2.360838373787607e-05, + "loss": 1.0591, + "step": 16470 + }, + { + "epoch": 1.58, + "grad_norm": 0.35722403478276343, + "learning_rate": 2.3598176131232808e-05, + "loss": 1.0486, + "step": 16471 + }, + { + "epoch": 1.58, + "grad_norm": 0.33118191956095605, + "learning_rate": 2.3587970436593178e-05, + "loss": 1.091, + "step": 16472 + }, + { + "epoch": 1.58, + "grad_norm": 0.3406747330627544, + "learning_rate": 2.357776665421263e-05, + "loss": 1.0145, + "step": 16473 + }, + { + "epoch": 1.58, + "grad_norm": 0.3018410265447693, + "learning_rate": 2.3567564784346474e-05, + "loss": 0.9771, + "step": 16474 + }, + { + "epoch": 1.58, + "grad_norm": 0.3464008920954256, + "learning_rate": 2.355736482725005e-05, + "loss": 0.9999, + "step": 16475 + }, + { + "epoch": 1.58, + "grad_norm": 0.312570940032271, + "learning_rate": 2.354716678317861e-05, + "loss": 0.9613, + "step": 16476 + }, + { + "epoch": 1.58, + "grad_norm": 0.3447687008896599, + "learning_rate": 2.35369706523874e-05, + "loss": 0.98, + "step": 16477 + }, + { + "epoch": 1.58, + "grad_norm": 0.3170033369802506, + "learning_rate": 2.3526776435131526e-05, + "loss": 1.0932, + "step": 16478 + }, + { + "epoch": 1.58, + "grad_norm": 0.3107313956024323, + "learning_rate": 2.3516584131666143e-05, + "loss": 1.0026, + "step": 16479 + }, + { + "epoch": 1.58, + "grad_norm": 0.36169511906807994, + "learning_rate": 2.3506393742246325e-05, + "loss": 0.9364, + "step": 16480 + }, + { + "epoch": 1.58, + "grad_norm": 0.35911183834991595, + "learning_rate": 2.3496205267127047e-05, + "loss": 1.0845, + "step": 16481 + }, + { + "epoch": 1.58, + "grad_norm": 0.31568873335023484, + "learning_rate": 2.348601870656333e-05, + "loss": 0.9472, + "step": 16482 + }, + { + "epoch": 1.58, + "grad_norm": 0.2747773099073729, + "learning_rate": 2.3475834060810063e-05, + "loss": 1.0064, + "step": 16483 + }, + { + "epoch": 1.58, + "grad_norm": 0.27693126277832164, + "learning_rate": 2.3465651330122184e-05, + "loss": 0.9753, + "step": 16484 + }, + { + "epoch": 1.58, + "grad_norm": 0.2957592092332828, + "learning_rate": 2.3455470514754452e-05, + "loss": 1.0342, + "step": 16485 + }, + { + "epoch": 1.58, + "grad_norm": 0.30160230720070963, + "learning_rate": 2.3445291614961694e-05, + "loss": 1.0596, + "step": 16486 + }, + { + "epoch": 1.58, + "grad_norm": 0.36265160505561134, + "learning_rate": 2.343511463099861e-05, + "loss": 0.9561, + "step": 16487 + }, + { + "epoch": 1.58, + "grad_norm": 0.3314212764338663, + "learning_rate": 2.342493956311993e-05, + "loss": 1.1052, + "step": 16488 + }, + { + "epoch": 1.58, + "grad_norm": 0.309395465183211, + "learning_rate": 2.3414766411580192e-05, + "loss": 1.0812, + "step": 16489 + }, + { + "epoch": 1.58, + "grad_norm": 0.26229233615416747, + "learning_rate": 2.3404595176634124e-05, + "loss": 1.1023, + "step": 16490 + }, + { + "epoch": 1.58, + "grad_norm": 0.30127218867177735, + "learning_rate": 2.3394425858536172e-05, + "loss": 1.0605, + "step": 16491 + }, + { + "epoch": 1.58, + "grad_norm": 0.3138279067797818, + "learning_rate": 2.3384258457540887e-05, + "loss": 1.0389, + "step": 16492 + }, + { + "epoch": 1.58, + "grad_norm": 0.3157623722262219, + "learning_rate": 2.3374092973902652e-05, + "loss": 1.0054, + "step": 16493 + }, + { + "epoch": 1.58, + "grad_norm": 0.29971087806578667, + "learning_rate": 2.336392940787593e-05, + "loss": 0.9634, + "step": 16494 + }, + { + "epoch": 1.58, + "grad_norm": 0.3191386822670371, + "learning_rate": 2.3353767759715007e-05, + "loss": 1.0368, + "step": 16495 + }, + { + "epoch": 1.58, + "grad_norm": 0.316684199201163, + "learning_rate": 2.3343608029674213e-05, + "loss": 1.0173, + "step": 16496 + }, + { + "epoch": 1.58, + "grad_norm": 0.308996379572238, + "learning_rate": 2.333345021800781e-05, + "loss": 0.9704, + "step": 16497 + }, + { + "epoch": 1.58, + "grad_norm": 0.319786982526145, + "learning_rate": 2.3323294324970036e-05, + "loss": 0.9567, + "step": 16498 + }, + { + "epoch": 1.58, + "grad_norm": 0.32707217818462114, + "learning_rate": 2.3313140350814966e-05, + "loss": 0.981, + "step": 16499 + }, + { + "epoch": 1.58, + "grad_norm": 0.29352430764516996, + "learning_rate": 2.3302988295796767e-05, + "loss": 1.0617, + "step": 16500 + }, + { + "epoch": 1.58, + "grad_norm": 0.306373678412686, + "learning_rate": 2.3292838160169504e-05, + "loss": 0.9892, + "step": 16501 + }, + { + "epoch": 1.58, + "grad_norm": 0.39544992598473305, + "learning_rate": 2.3282689944187165e-05, + "loss": 1.0514, + "step": 16502 + }, + { + "epoch": 1.58, + "grad_norm": 0.33015331348913773, + "learning_rate": 2.3272543648103708e-05, + "loss": 1.0068, + "step": 16503 + }, + { + "epoch": 1.58, + "grad_norm": 0.34190016190841493, + "learning_rate": 2.3262399272173075e-05, + "loss": 1.095, + "step": 16504 + }, + { + "epoch": 1.58, + "grad_norm": 0.3288636863173682, + "learning_rate": 2.3252256816649154e-05, + "loss": 1.0529, + "step": 16505 + }, + { + "epoch": 1.58, + "grad_norm": 0.33397949435503443, + "learning_rate": 2.324211628178571e-05, + "loss": 1.0765, + "step": 16506 + }, + { + "epoch": 1.58, + "grad_norm": 0.32290625294184194, + "learning_rate": 2.323197766783658e-05, + "loss": 1.1076, + "step": 16507 + }, + { + "epoch": 1.58, + "grad_norm": 0.3500813782408877, + "learning_rate": 2.322184097505542e-05, + "loss": 0.9841, + "step": 16508 + }, + { + "epoch": 1.58, + "grad_norm": 0.34003336017622426, + "learning_rate": 2.321170620369595e-05, + "loss": 1.0539, + "step": 16509 + }, + { + "epoch": 1.58, + "grad_norm": 0.33859294732029244, + "learning_rate": 2.3201573354011797e-05, + "loss": 1.1033, + "step": 16510 + }, + { + "epoch": 1.58, + "grad_norm": 0.2983068129981831, + "learning_rate": 2.319144242625655e-05, + "loss": 1.0724, + "step": 16511 + }, + { + "epoch": 1.58, + "grad_norm": 0.2760903933142405, + "learning_rate": 2.3181313420683714e-05, + "loss": 0.8828, + "step": 16512 + }, + { + "epoch": 1.58, + "grad_norm": 0.3158173015213168, + "learning_rate": 2.317118633754681e-05, + "loss": 1.0648, + "step": 16513 + }, + { + "epoch": 1.58, + "grad_norm": 0.29996812715557397, + "learning_rate": 2.3161061177099218e-05, + "loss": 1.0024, + "step": 16514 + }, + { + "epoch": 1.58, + "grad_norm": 0.3245649960016301, + "learning_rate": 2.3150937939594373e-05, + "loss": 0.9793, + "step": 16515 + }, + { + "epoch": 1.58, + "grad_norm": 0.3127695113978958, + "learning_rate": 2.31408166252856e-05, + "loss": 1.0814, + "step": 16516 + }, + { + "epoch": 1.58, + "grad_norm": 0.3189037222570758, + "learning_rate": 2.3130697234426225e-05, + "loss": 1.1517, + "step": 16517 + }, + { + "epoch": 1.58, + "grad_norm": 0.31360493024481284, + "learning_rate": 2.3120579767269436e-05, + "loss": 1.0157, + "step": 16518 + }, + { + "epoch": 1.58, + "grad_norm": 0.34766864485035603, + "learning_rate": 2.311046422406845e-05, + "loss": 0.9352, + "step": 16519 + }, + { + "epoch": 1.58, + "grad_norm": 0.36383350853738244, + "learning_rate": 2.3100350605076447e-05, + "loss": 0.9579, + "step": 16520 + }, + { + "epoch": 1.58, + "grad_norm": 0.3043172512902788, + "learning_rate": 2.309023891054648e-05, + "loss": 0.9935, + "step": 16521 + }, + { + "epoch": 1.58, + "grad_norm": 0.33910037760681283, + "learning_rate": 2.3080129140731643e-05, + "loss": 1.0786, + "step": 16522 + }, + { + "epoch": 1.58, + "grad_norm": 0.31306205756743954, + "learning_rate": 2.307002129588486e-05, + "loss": 1.0319, + "step": 16523 + }, + { + "epoch": 1.58, + "grad_norm": 0.3484476016602531, + "learning_rate": 2.30599153762592e-05, + "loss": 0.9862, + "step": 16524 + }, + { + "epoch": 1.58, + "grad_norm": 0.29663390659442956, + "learning_rate": 2.3049811382107477e-05, + "loss": 0.9436, + "step": 16525 + }, + { + "epoch": 1.58, + "grad_norm": 0.3180086612686387, + "learning_rate": 2.3039709313682623e-05, + "loss": 1.0814, + "step": 16526 + }, + { + "epoch": 1.58, + "grad_norm": 0.32689798548747, + "learning_rate": 2.3029609171237375e-05, + "loss": 1.009, + "step": 16527 + }, + { + "epoch": 1.58, + "grad_norm": 0.32380874945160737, + "learning_rate": 2.301951095502457e-05, + "loss": 1.0739, + "step": 16528 + }, + { + "epoch": 1.58, + "grad_norm": 0.35388877627417137, + "learning_rate": 2.3009414665296826e-05, + "loss": 1.0878, + "step": 16529 + }, + { + "epoch": 1.58, + "grad_norm": 0.3503117963291621, + "learning_rate": 2.2999320302306916e-05, + "loss": 1.0511, + "step": 16530 + }, + { + "epoch": 1.58, + "grad_norm": 0.30503192313141547, + "learning_rate": 2.2989227866307382e-05, + "loss": 0.9851, + "step": 16531 + }, + { + "epoch": 1.58, + "grad_norm": 0.29083619537517874, + "learning_rate": 2.297913735755085e-05, + "loss": 0.9947, + "step": 16532 + }, + { + "epoch": 1.58, + "grad_norm": 0.2777212732018254, + "learning_rate": 2.2969048776289793e-05, + "loss": 1.0212, + "step": 16533 + }, + { + "epoch": 1.58, + "grad_norm": 0.2987084908716022, + "learning_rate": 2.295896212277672e-05, + "loss": 0.9888, + "step": 16534 + }, + { + "epoch": 1.58, + "grad_norm": 0.2520946489969831, + "learning_rate": 2.2948877397264012e-05, + "loss": 0.9899, + "step": 16535 + }, + { + "epoch": 1.58, + "grad_norm": 0.3155961238987829, + "learning_rate": 2.2938794600004067e-05, + "loss": 1.0164, + "step": 16536 + }, + { + "epoch": 1.58, + "grad_norm": 0.3294012664505795, + "learning_rate": 2.292871373124923e-05, + "loss": 1.0398, + "step": 16537 + }, + { + "epoch": 1.58, + "grad_norm": 0.3251759297250868, + "learning_rate": 2.291863479125178e-05, + "loss": 0.859, + "step": 16538 + }, + { + "epoch": 1.58, + "grad_norm": 0.3402142901713329, + "learning_rate": 2.2908557780263927e-05, + "loss": 1.1359, + "step": 16539 + }, + { + "epoch": 1.58, + "grad_norm": 0.29626371528401857, + "learning_rate": 2.2898482698537858e-05, + "loss": 1.0556, + "step": 16540 + }, + { + "epoch": 1.58, + "grad_norm": 0.30441381844125187, + "learning_rate": 2.2888409546325738e-05, + "loss": 1.102, + "step": 16541 + }, + { + "epoch": 1.58, + "grad_norm": 0.4109911243450082, + "learning_rate": 2.2878338323879624e-05, + "loss": 0.972, + "step": 16542 + }, + { + "epoch": 1.58, + "grad_norm": 0.2847085012874215, + "learning_rate": 2.286826903145154e-05, + "loss": 1.0878, + "step": 16543 + }, + { + "epoch": 1.58, + "grad_norm": 0.3031090802913995, + "learning_rate": 2.2858201669293512e-05, + "loss": 1.1156, + "step": 16544 + }, + { + "epoch": 1.58, + "grad_norm": 0.29871338513722573, + "learning_rate": 2.2848136237657493e-05, + "loss": 0.9567, + "step": 16545 + }, + { + "epoch": 1.58, + "grad_norm": 0.34415770372970095, + "learning_rate": 2.2838072736795323e-05, + "loss": 0.9478, + "step": 16546 + }, + { + "epoch": 1.58, + "grad_norm": 0.319039400576664, + "learning_rate": 2.2828011166958907e-05, + "loss": 0.9808, + "step": 16547 + }, + { + "epoch": 1.58, + "grad_norm": 0.3120476362539826, + "learning_rate": 2.2817951528399983e-05, + "loss": 0.9536, + "step": 16548 + }, + { + "epoch": 1.58, + "grad_norm": 0.3286010624823061, + "learning_rate": 2.2807893821370363e-05, + "loss": 1.0813, + "step": 16549 + }, + { + "epoch": 1.58, + "grad_norm": 0.3174235693154982, + "learning_rate": 2.2797838046121657e-05, + "loss": 1.1018, + "step": 16550 + }, + { + "epoch": 1.58, + "grad_norm": 0.2796811704273479, + "learning_rate": 2.2787784202905626e-05, + "loss": 0.9534, + "step": 16551 + }, + { + "epoch": 1.58, + "grad_norm": 0.32664757794111693, + "learning_rate": 2.2777732291973796e-05, + "loss": 0.8941, + "step": 16552 + }, + { + "epoch": 1.58, + "grad_norm": 0.3005296723423557, + "learning_rate": 2.2767682313577767e-05, + "loss": 1.1233, + "step": 16553 + }, + { + "epoch": 1.58, + "grad_norm": 0.3037121582389583, + "learning_rate": 2.2757634267969007e-05, + "loss": 0.9944, + "step": 16554 + }, + { + "epoch": 1.58, + "grad_norm": 0.30351608606835906, + "learning_rate": 2.2747588155399025e-05, + "loss": 1.0815, + "step": 16555 + }, + { + "epoch": 1.58, + "grad_norm": 0.2723996226954624, + "learning_rate": 2.2737543976119167e-05, + "loss": 0.9666, + "step": 16556 + }, + { + "epoch": 1.58, + "grad_norm": 0.2790580919074516, + "learning_rate": 2.272750173038083e-05, + "loss": 0.9789, + "step": 16557 + }, + { + "epoch": 1.58, + "grad_norm": 0.30646170093004843, + "learning_rate": 2.2717461418435327e-05, + "loss": 0.9723, + "step": 16558 + }, + { + "epoch": 1.58, + "grad_norm": 0.3150311205391646, + "learning_rate": 2.2707423040533947e-05, + "loss": 0.9649, + "step": 16559 + }, + { + "epoch": 1.58, + "grad_norm": 0.2909294485453375, + "learning_rate": 2.269738659692786e-05, + "loss": 1.0484, + "step": 16560 + }, + { + "epoch": 1.58, + "grad_norm": 0.2864965393657759, + "learning_rate": 2.268735208786825e-05, + "loss": 1.0581, + "step": 16561 + }, + { + "epoch": 1.58, + "grad_norm": 0.31049761720411734, + "learning_rate": 2.2677319513606277e-05, + "loss": 1.049, + "step": 16562 + }, + { + "epoch": 1.58, + "grad_norm": 0.3141238745944132, + "learning_rate": 2.2667288874392944e-05, + "loss": 0.9678, + "step": 16563 + }, + { + "epoch": 1.58, + "grad_norm": 0.4393166993009678, + "learning_rate": 2.265726017047931e-05, + "loss": 1.0163, + "step": 16564 + }, + { + "epoch": 1.58, + "grad_norm": 0.3721821831476502, + "learning_rate": 2.264723340211635e-05, + "loss": 0.9714, + "step": 16565 + }, + { + "epoch": 1.58, + "grad_norm": 0.33381094247619786, + "learning_rate": 2.2637208569555023e-05, + "loss": 1.0506, + "step": 16566 + }, + { + "epoch": 1.58, + "grad_norm": 0.3063980936107499, + "learning_rate": 2.2627185673046137e-05, + "loss": 1.0508, + "step": 16567 + }, + { + "epoch": 1.59, + "grad_norm": 0.28657358397121174, + "learning_rate": 2.261716471284059e-05, + "loss": 1.0288, + "step": 16568 + }, + { + "epoch": 1.59, + "grad_norm": 0.3253477391463858, + "learning_rate": 2.260714568918909e-05, + "loss": 1.0873, + "step": 16569 + }, + { + "epoch": 1.59, + "grad_norm": 0.3188439293178237, + "learning_rate": 2.2597128602342423e-05, + "loss": 1.0567, + "step": 16570 + }, + { + "epoch": 1.59, + "grad_norm": 0.30937731619213116, + "learning_rate": 2.2587113452551247e-05, + "loss": 1.0772, + "step": 16571 + }, + { + "epoch": 1.59, + "grad_norm": 0.28915405667412125, + "learning_rate": 2.257710024006624e-05, + "loss": 1.0019, + "step": 16572 + }, + { + "epoch": 1.59, + "grad_norm": 0.2937569257853697, + "learning_rate": 2.256708896513793e-05, + "loss": 1.0953, + "step": 16573 + }, + { + "epoch": 1.59, + "grad_norm": 0.2992644414427269, + "learning_rate": 2.255707962801692e-05, + "loss": 0.9832, + "step": 16574 + }, + { + "epoch": 1.59, + "grad_norm": 0.31717179945706675, + "learning_rate": 2.2547072228953625e-05, + "loss": 1.0199, + "step": 16575 + }, + { + "epoch": 1.59, + "grad_norm": 0.31975059795133104, + "learning_rate": 2.2537066768198556e-05, + "loss": 0.8904, + "step": 16576 + }, + { + "epoch": 1.59, + "grad_norm": 0.28736425595573817, + "learning_rate": 2.2527063246002022e-05, + "loss": 0.9217, + "step": 16577 + }, + { + "epoch": 1.59, + "grad_norm": 0.29358241462676954, + "learning_rate": 2.251706166261447e-05, + "loss": 1.0311, + "step": 16578 + }, + { + "epoch": 1.59, + "grad_norm": 0.3049897849140691, + "learning_rate": 2.250706201828612e-05, + "loss": 1.0499, + "step": 16579 + }, + { + "epoch": 1.59, + "grad_norm": 0.2990763710177706, + "learning_rate": 2.249706431326728e-05, + "loss": 1.0453, + "step": 16580 + }, + { + "epoch": 1.59, + "grad_norm": 0.3453888094424672, + "learning_rate": 2.2487068547808075e-05, + "loss": 0.9622, + "step": 16581 + }, + { + "epoch": 1.59, + "grad_norm": 0.33002853747143956, + "learning_rate": 2.24770747221587e-05, + "loss": 1.1131, + "step": 16582 + }, + { + "epoch": 1.59, + "grad_norm": 0.3313873523095158, + "learning_rate": 2.2467082836569286e-05, + "loss": 0.9659, + "step": 16583 + }, + { + "epoch": 1.59, + "grad_norm": 0.3237380136859333, + "learning_rate": 2.245709289128981e-05, + "loss": 1.051, + "step": 16584 + }, + { + "epoch": 1.59, + "grad_norm": 0.31084983660727733, + "learning_rate": 2.2447104886570326e-05, + "loss": 1.0602, + "step": 16585 + }, + { + "epoch": 1.59, + "grad_norm": 0.29431679285854334, + "learning_rate": 2.2437118822660785e-05, + "loss": 0.9252, + "step": 16586 + }, + { + "epoch": 1.59, + "grad_norm": 0.34051961956201104, + "learning_rate": 2.2427134699811113e-05, + "loss": 1.0674, + "step": 16587 + }, + { + "epoch": 1.59, + "grad_norm": 0.35023273522155757, + "learning_rate": 2.2417152518271134e-05, + "loss": 1.0763, + "step": 16588 + }, + { + "epoch": 1.59, + "grad_norm": 0.29154528207586755, + "learning_rate": 2.240717227829069e-05, + "loss": 1.1053, + "step": 16589 + }, + { + "epoch": 1.59, + "grad_norm": 0.34810289288466634, + "learning_rate": 2.239719398011947e-05, + "loss": 0.9305, + "step": 16590 + }, + { + "epoch": 1.59, + "grad_norm": 0.30957628403176185, + "learning_rate": 2.2387217624007296e-05, + "loss": 1.071, + "step": 16591 + }, + { + "epoch": 1.59, + "grad_norm": 0.30541062174319533, + "learning_rate": 2.2377243210203746e-05, + "loss": 1.0576, + "step": 16592 + }, + { + "epoch": 1.59, + "grad_norm": 0.3283088262373048, + "learning_rate": 2.2367270738958502e-05, + "loss": 0.9447, + "step": 16593 + }, + { + "epoch": 1.59, + "grad_norm": 0.31929116290243015, + "learning_rate": 2.235730021052107e-05, + "loss": 1.0151, + "step": 16594 + }, + { + "epoch": 1.59, + "grad_norm": 0.3213114280781385, + "learning_rate": 2.2347331625141032e-05, + "loss": 0.9908, + "step": 16595 + }, + { + "epoch": 1.59, + "grad_norm": 0.319548435086839, + "learning_rate": 2.233736498306779e-05, + "loss": 1.1401, + "step": 16596 + }, + { + "epoch": 1.59, + "grad_norm": 0.30551369831157055, + "learning_rate": 2.2327400284550794e-05, + "loss": 1.0077, + "step": 16597 + }, + { + "epoch": 1.59, + "grad_norm": 0.31347304183294833, + "learning_rate": 2.2317437529839426e-05, + "loss": 1.0012, + "step": 16598 + }, + { + "epoch": 1.59, + "grad_norm": 0.29479126390104293, + "learning_rate": 2.2307476719183028e-05, + "loss": 0.9506, + "step": 16599 + }, + { + "epoch": 1.59, + "grad_norm": 0.3473359694237999, + "learning_rate": 2.2297517852830828e-05, + "loss": 1.0427, + "step": 16600 + }, + { + "epoch": 1.59, + "grad_norm": 0.3199948894647322, + "learning_rate": 2.2287560931032114e-05, + "loss": 1.0051, + "step": 16601 + }, + { + "epoch": 1.59, + "grad_norm": 0.3140778801948873, + "learning_rate": 2.2277605954035996e-05, + "loss": 1.0329, + "step": 16602 + }, + { + "epoch": 1.59, + "grad_norm": 0.2470043308461956, + "learning_rate": 2.2267652922091632e-05, + "loss": 1.0574, + "step": 16603 + }, + { + "epoch": 1.59, + "grad_norm": 0.28658296770777225, + "learning_rate": 2.225770183544811e-05, + "loss": 1.0753, + "step": 16604 + }, + { + "epoch": 1.59, + "grad_norm": 0.3013285772413749, + "learning_rate": 2.224775269435445e-05, + "loss": 0.9802, + "step": 16605 + }, + { + "epoch": 1.59, + "grad_norm": 0.34694705894911854, + "learning_rate": 2.2237805499059682e-05, + "loss": 1.1162, + "step": 16606 + }, + { + "epoch": 1.59, + "grad_norm": 0.2966919532500109, + "learning_rate": 2.222786024981267e-05, + "loss": 1.0524, + "step": 16607 + }, + { + "epoch": 1.59, + "grad_norm": 0.29529595210596243, + "learning_rate": 2.221791694686236e-05, + "loss": 1.0503, + "step": 16608 + }, + { + "epoch": 1.59, + "grad_norm": 0.2879979744073126, + "learning_rate": 2.2207975590457543e-05, + "loss": 1.0897, + "step": 16609 + }, + { + "epoch": 1.59, + "grad_norm": 0.31186738048830775, + "learning_rate": 2.219803618084705e-05, + "loss": 1.0301, + "step": 16610 + }, + { + "epoch": 1.59, + "grad_norm": 0.32386205393370715, + "learning_rate": 2.2188098718279538e-05, + "loss": 1.1253, + "step": 16611 + }, + { + "epoch": 1.59, + "grad_norm": 0.31573269781964314, + "learning_rate": 2.2178163203003822e-05, + "loss": 1.0859, + "step": 16612 + }, + { + "epoch": 1.59, + "grad_norm": 0.309630778450211, + "learning_rate": 2.2168229635268444e-05, + "loss": 1.1539, + "step": 16613 + }, + { + "epoch": 1.59, + "grad_norm": 0.3134267195846851, + "learning_rate": 2.2158298015322076e-05, + "loss": 0.9671, + "step": 16614 + }, + { + "epoch": 1.59, + "grad_norm": 0.3281755591276653, + "learning_rate": 2.2148368343413183e-05, + "loss": 1.0985, + "step": 16615 + }, + { + "epoch": 1.59, + "grad_norm": 0.33615106585880455, + "learning_rate": 2.2138440619790336e-05, + "loss": 1.044, + "step": 16616 + }, + { + "epoch": 1.59, + "grad_norm": 0.30643894010226674, + "learning_rate": 2.2128514844701907e-05, + "loss": 1.0868, + "step": 16617 + }, + { + "epoch": 1.59, + "grad_norm": 0.3373406834635836, + "learning_rate": 2.211859101839634e-05, + "loss": 1.0161, + "step": 16618 + }, + { + "epoch": 1.59, + "grad_norm": 0.28898460557950717, + "learning_rate": 2.2108669141121983e-05, + "loss": 0.8847, + "step": 16619 + }, + { + "epoch": 1.59, + "grad_norm": 0.27045053049190404, + "learning_rate": 2.2098749213127157e-05, + "loss": 0.9604, + "step": 16620 + }, + { + "epoch": 1.59, + "grad_norm": 0.3393048849981757, + "learning_rate": 2.208883123466007e-05, + "loss": 1.0678, + "step": 16621 + }, + { + "epoch": 1.59, + "grad_norm": 0.30383650896260384, + "learning_rate": 2.2078915205968943e-05, + "loss": 0.9786, + "step": 16622 + }, + { + "epoch": 1.59, + "grad_norm": 0.2672145348613689, + "learning_rate": 2.2069001127301958e-05, + "loss": 1.0198, + "step": 16623 + }, + { + "epoch": 1.59, + "grad_norm": 0.3243939434150309, + "learning_rate": 2.2059088998907175e-05, + "loss": 1.1318, + "step": 16624 + }, + { + "epoch": 1.59, + "grad_norm": 0.3195703820914604, + "learning_rate": 2.204917882103268e-05, + "loss": 1.1496, + "step": 16625 + }, + { + "epoch": 1.59, + "grad_norm": 0.3390345340698075, + "learning_rate": 2.2039270593926464e-05, + "loss": 0.9592, + "step": 16626 + }, + { + "epoch": 1.59, + "grad_norm": 0.3378779849873555, + "learning_rate": 2.202936431783653e-05, + "loss": 1.0365, + "step": 16627 + }, + { + "epoch": 1.59, + "grad_norm": 0.2654855702598797, + "learning_rate": 2.2019459993010727e-05, + "loss": 1.0386, + "step": 16628 + }, + { + "epoch": 1.59, + "grad_norm": 0.34482482122685776, + "learning_rate": 2.2009557619696974e-05, + "loss": 0.9768, + "step": 16629 + }, + { + "epoch": 1.59, + "grad_norm": 0.3914166406485813, + "learning_rate": 2.199965719814303e-05, + "loss": 1.058, + "step": 16630 + }, + { + "epoch": 1.59, + "grad_norm": 0.2842046331245498, + "learning_rate": 2.1989758728596687e-05, + "loss": 0.8966, + "step": 16631 + }, + { + "epoch": 1.59, + "grad_norm": 0.33459351791683256, + "learning_rate": 2.1979862211305656e-05, + "loss": 1.07, + "step": 16632 + }, + { + "epoch": 1.59, + "grad_norm": 0.27410328541355156, + "learning_rate": 2.196996764651764e-05, + "loss": 1.0045, + "step": 16633 + }, + { + "epoch": 1.59, + "grad_norm": 0.3095100430272306, + "learning_rate": 2.196007503448019e-05, + "loss": 0.9488, + "step": 16634 + }, + { + "epoch": 1.59, + "grad_norm": 0.28859255832669806, + "learning_rate": 2.1950184375440942e-05, + "loss": 1.0054, + "step": 16635 + }, + { + "epoch": 1.59, + "grad_norm": 0.2841873979592766, + "learning_rate": 2.194029566964735e-05, + "loss": 1.0065, + "step": 16636 + }, + { + "epoch": 1.59, + "grad_norm": 0.3180107070935772, + "learning_rate": 2.1930408917346956e-05, + "loss": 1.0154, + "step": 16637 + }, + { + "epoch": 1.59, + "grad_norm": 0.2891808633155986, + "learning_rate": 2.1920524118787077e-05, + "loss": 0.9277, + "step": 16638 + }, + { + "epoch": 1.59, + "grad_norm": 0.3136249894981432, + "learning_rate": 2.1910641274215206e-05, + "loss": 1.0989, + "step": 16639 + }, + { + "epoch": 1.59, + "grad_norm": 0.3071270989227352, + "learning_rate": 2.1900760383878594e-05, + "loss": 1.0186, + "step": 16640 + }, + { + "epoch": 1.59, + "grad_norm": 0.2659217501810846, + "learning_rate": 2.189088144802457e-05, + "loss": 1.0072, + "step": 16641 + }, + { + "epoch": 1.59, + "grad_norm": 0.334302328311976, + "learning_rate": 2.188100446690029e-05, + "loss": 1.0722, + "step": 16642 + }, + { + "epoch": 1.59, + "grad_norm": 0.34781376860124774, + "learning_rate": 2.1871129440752968e-05, + "loss": 0.9156, + "step": 16643 + }, + { + "epoch": 1.59, + "grad_norm": 0.30432214233361576, + "learning_rate": 2.1861256369829763e-05, + "loss": 1.0699, + "step": 16644 + }, + { + "epoch": 1.59, + "grad_norm": 0.3420130668744218, + "learning_rate": 2.185138525437771e-05, + "loss": 0.931, + "step": 16645 + }, + { + "epoch": 1.59, + "grad_norm": 0.31427819001993723, + "learning_rate": 2.1841516094643844e-05, + "loss": 0.9796, + "step": 16646 + }, + { + "epoch": 1.59, + "grad_norm": 0.3452887143716011, + "learning_rate": 2.1831648890875157e-05, + "loss": 1.0711, + "step": 16647 + }, + { + "epoch": 1.59, + "grad_norm": 0.33035484119220615, + "learning_rate": 2.1821783643318615e-05, + "loss": 1.074, + "step": 16648 + }, + { + "epoch": 1.59, + "grad_norm": 0.33050296103740984, + "learning_rate": 2.1811920352221038e-05, + "loss": 1.039, + "step": 16649 + }, + { + "epoch": 1.59, + "grad_norm": 0.3459578063539777, + "learning_rate": 2.1802059017829336e-05, + "loss": 0.9875, + "step": 16650 + }, + { + "epoch": 1.59, + "grad_norm": 0.30764190524290363, + "learning_rate": 2.1792199640390188e-05, + "loss": 1.1335, + "step": 16651 + }, + { + "epoch": 1.59, + "grad_norm": 0.3117411681438231, + "learning_rate": 2.178234222015044e-05, + "loss": 0.9163, + "step": 16652 + }, + { + "epoch": 1.59, + "grad_norm": 0.3263286412371126, + "learning_rate": 2.1772486757356726e-05, + "loss": 0.9715, + "step": 16653 + }, + { + "epoch": 1.59, + "grad_norm": 0.31525088623924064, + "learning_rate": 2.1762633252255716e-05, + "loss": 0.9098, + "step": 16654 + }, + { + "epoch": 1.59, + "grad_norm": 0.2886947493601534, + "learning_rate": 2.175278170509395e-05, + "loss": 0.9988, + "step": 16655 + }, + { + "epoch": 1.59, + "grad_norm": 0.37183616979691186, + "learning_rate": 2.174293211611802e-05, + "loss": 1.0853, + "step": 16656 + }, + { + "epoch": 1.59, + "grad_norm": 0.2835560254079156, + "learning_rate": 2.1733084485574383e-05, + "loss": 1.0079, + "step": 16657 + }, + { + "epoch": 1.59, + "grad_norm": 0.3644587428925115, + "learning_rate": 2.1723238813709478e-05, + "loss": 1.0121, + "step": 16658 + }, + { + "epoch": 1.59, + "grad_norm": 0.2909831261747957, + "learning_rate": 2.171339510076973e-05, + "loss": 1.0461, + "step": 16659 + }, + { + "epoch": 1.59, + "grad_norm": 0.3239536539646879, + "learning_rate": 2.1703553347001505e-05, + "loss": 1.0601, + "step": 16660 + }, + { + "epoch": 1.59, + "grad_norm": 0.319733937003227, + "learning_rate": 2.1693713552651018e-05, + "loss": 0.9687, + "step": 16661 + }, + { + "epoch": 1.59, + "grad_norm": 0.32512097141961255, + "learning_rate": 2.1683875717964607e-05, + "loss": 0.9892, + "step": 16662 + }, + { + "epoch": 1.59, + "grad_norm": 0.2753101191381352, + "learning_rate": 2.1674039843188386e-05, + "loss": 1.1179, + "step": 16663 + }, + { + "epoch": 1.59, + "grad_norm": 0.3720794281609692, + "learning_rate": 2.1664205928568548e-05, + "loss": 0.8949, + "step": 16664 + }, + { + "epoch": 1.59, + "grad_norm": 0.3340827849732531, + "learning_rate": 2.1654373974351195e-05, + "loss": 1.0835, + "step": 16665 + }, + { + "epoch": 1.59, + "grad_norm": 0.3115682474374399, + "learning_rate": 2.1644543980782394e-05, + "loss": 1.0243, + "step": 16666 + }, + { + "epoch": 1.59, + "grad_norm": 0.29422597403958495, + "learning_rate": 2.1634715948108097e-05, + "loss": 1.0692, + "step": 16667 + }, + { + "epoch": 1.59, + "grad_norm": 0.33344398957445653, + "learning_rate": 2.1624889876574282e-05, + "loss": 0.9927, + "step": 16668 + }, + { + "epoch": 1.59, + "grad_norm": 0.3303544713388326, + "learning_rate": 2.161506576642689e-05, + "loss": 0.9678, + "step": 16669 + }, + { + "epoch": 1.59, + "grad_norm": 0.31562858608457073, + "learning_rate": 2.160524361791171e-05, + "loss": 1.1433, + "step": 16670 + }, + { + "epoch": 1.59, + "grad_norm": 0.3118211402701126, + "learning_rate": 2.1595423431274597e-05, + "loss": 0.982, + "step": 16671 + }, + { + "epoch": 1.6, + "grad_norm": 0.30916701472565733, + "learning_rate": 2.158560520676125e-05, + "loss": 1.1187, + "step": 16672 + }, + { + "epoch": 1.6, + "grad_norm": 0.28829865575942243, + "learning_rate": 2.157578894461746e-05, + "loss": 1.1655, + "step": 16673 + }, + { + "epoch": 1.6, + "grad_norm": 0.33114476734864906, + "learning_rate": 2.1565974645088806e-05, + "loss": 0.987, + "step": 16674 + }, + { + "epoch": 1.6, + "grad_norm": 0.29250007259229754, + "learning_rate": 2.1556162308420957e-05, + "loss": 1.0216, + "step": 16675 + }, + { + "epoch": 1.6, + "grad_norm": 0.3148231982288371, + "learning_rate": 2.1546351934859432e-05, + "loss": 1.045, + "step": 16676 + }, + { + "epoch": 1.6, + "grad_norm": 0.3026350203782963, + "learning_rate": 2.153654352464978e-05, + "loss": 1.1303, + "step": 16677 + }, + { + "epoch": 1.6, + "grad_norm": 0.2949464919005034, + "learning_rate": 2.1526737078037396e-05, + "loss": 1.0871, + "step": 16678 + }, + { + "epoch": 1.6, + "grad_norm": 0.29206275050937297, + "learning_rate": 2.1516932595267747e-05, + "loss": 1.0895, + "step": 16679 + }, + { + "epoch": 1.6, + "grad_norm": 0.30230287262319167, + "learning_rate": 2.150713007658619e-05, + "loss": 1.0101, + "step": 16680 + }, + { + "epoch": 1.6, + "grad_norm": 0.30070581368081956, + "learning_rate": 2.1497329522238053e-05, + "loss": 1.0594, + "step": 16681 + }, + { + "epoch": 1.6, + "grad_norm": 0.33971709284043916, + "learning_rate": 2.148753093246856e-05, + "loss": 0.948, + "step": 16682 + }, + { + "epoch": 1.6, + "grad_norm": 0.287671426113771, + "learning_rate": 2.1477734307522968e-05, + "loss": 0.9663, + "step": 16683 + }, + { + "epoch": 1.6, + "grad_norm": 0.30337539788136614, + "learning_rate": 2.1467939647646408e-05, + "loss": 1.035, + "step": 16684 + }, + { + "epoch": 1.6, + "grad_norm": 0.30879164012769394, + "learning_rate": 2.1458146953084003e-05, + "loss": 1.108, + "step": 16685 + }, + { + "epoch": 1.6, + "grad_norm": 0.34302696828754936, + "learning_rate": 2.1448356224080834e-05, + "loss": 1.0534, + "step": 16686 + }, + { + "epoch": 1.6, + "grad_norm": 0.29017478519808454, + "learning_rate": 2.1438567460881955e-05, + "loss": 1.0598, + "step": 16687 + }, + { + "epoch": 1.6, + "grad_norm": 0.3051271616430812, + "learning_rate": 2.1428780663732262e-05, + "loss": 1.0575, + "step": 16688 + }, + { + "epoch": 1.6, + "grad_norm": 0.30183031817190803, + "learning_rate": 2.1418995832876708e-05, + "loss": 0.9738, + "step": 16689 + }, + { + "epoch": 1.6, + "grad_norm": 0.32400484724141515, + "learning_rate": 2.140921296856021e-05, + "loss": 1.0846, + "step": 16690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3140822007998573, + "learning_rate": 2.1399432071027515e-05, + "loss": 1.1, + "step": 16691 + }, + { + "epoch": 1.6, + "grad_norm": 0.29884360845028884, + "learning_rate": 2.1389653140523434e-05, + "loss": 0.9043, + "step": 16692 + }, + { + "epoch": 1.6, + "grad_norm": 0.2854966361700575, + "learning_rate": 2.137987617729269e-05, + "loss": 0.9993, + "step": 16693 + }, + { + "epoch": 1.6, + "grad_norm": 0.3179765117128633, + "learning_rate": 2.137010118157998e-05, + "loss": 0.8905, + "step": 16694 + }, + { + "epoch": 1.6, + "grad_norm": 0.34497298495969664, + "learning_rate": 2.1360328153629882e-05, + "loss": 1.013, + "step": 16695 + }, + { + "epoch": 1.6, + "grad_norm": 0.30146639870847347, + "learning_rate": 2.135055709368702e-05, + "loss": 0.9305, + "step": 16696 + }, + { + "epoch": 1.6, + "grad_norm": 0.2842331858539669, + "learning_rate": 2.1340788001995872e-05, + "loss": 1.0085, + "step": 16697 + }, + { + "epoch": 1.6, + "grad_norm": 0.3022345258358604, + "learning_rate": 2.1331020878800956e-05, + "loss": 1.0274, + "step": 16698 + }, + { + "epoch": 1.6, + "grad_norm": 0.31762588159842825, + "learning_rate": 2.1321255724346634e-05, + "loss": 1.0466, + "step": 16699 + }, + { + "epoch": 1.6, + "grad_norm": 0.28403021168432757, + "learning_rate": 2.1311492538877396e-05, + "loss": 0.9704, + "step": 16700 + }, + { + "epoch": 1.6, + "grad_norm": 0.314095358690901, + "learning_rate": 2.1301731322637473e-05, + "loss": 0.9287, + "step": 16701 + }, + { + "epoch": 1.6, + "grad_norm": 0.31943626566180783, + "learning_rate": 2.1291972075871223e-05, + "loss": 1.0463, + "step": 16702 + }, + { + "epoch": 1.6, + "grad_norm": 0.29662390227818536, + "learning_rate": 2.128221479882281e-05, + "loss": 0.9341, + "step": 16703 + }, + { + "epoch": 1.6, + "grad_norm": 0.3439931994363826, + "learning_rate": 2.127245949173643e-05, + "loss": 1.0465, + "step": 16704 + }, + { + "epoch": 1.6, + "grad_norm": 0.3072128832874735, + "learning_rate": 2.126270615485627e-05, + "loss": 1.0642, + "step": 16705 + }, + { + "epoch": 1.6, + "grad_norm": 0.3006723290315059, + "learning_rate": 2.1252954788426337e-05, + "loss": 1.0049, + "step": 16706 + }, + { + "epoch": 1.6, + "grad_norm": 0.30483218647912796, + "learning_rate": 2.1243205392690712e-05, + "loss": 0.9227, + "step": 16707 + }, + { + "epoch": 1.6, + "grad_norm": 0.30325984344448514, + "learning_rate": 2.1233457967893366e-05, + "loss": 1.045, + "step": 16708 + }, + { + "epoch": 1.6, + "grad_norm": 0.29186639151859617, + "learning_rate": 2.122371251427826e-05, + "loss": 1.0284, + "step": 16709 + }, + { + "epoch": 1.6, + "grad_norm": 0.30711845717922276, + "learning_rate": 2.1213969032089232e-05, + "loss": 1.0349, + "step": 16710 + }, + { + "epoch": 1.6, + "grad_norm": 0.31519496187487983, + "learning_rate": 2.1204227521570163e-05, + "loss": 1.0042, + "step": 16711 + }, + { + "epoch": 1.6, + "grad_norm": 0.27029572121786866, + "learning_rate": 2.1194487982964784e-05, + "loss": 0.9407, + "step": 16712 + }, + { + "epoch": 1.6, + "grad_norm": 0.3620356052474033, + "learning_rate": 2.118475041651692e-05, + "loss": 1.1242, + "step": 16713 + }, + { + "epoch": 1.6, + "grad_norm": 0.28243444447244404, + "learning_rate": 2.1175014822470172e-05, + "loss": 0.9924, + "step": 16714 + }, + { + "epoch": 1.6, + "grad_norm": 0.31368405378207154, + "learning_rate": 2.1165281201068254e-05, + "loss": 1.0121, + "step": 16715 + }, + { + "epoch": 1.6, + "grad_norm": 0.2899014452278226, + "learning_rate": 2.1155549552554687e-05, + "loss": 0.9791, + "step": 16716 + }, + { + "epoch": 1.6, + "grad_norm": 0.30652300237004615, + "learning_rate": 2.114581987717308e-05, + "loss": 0.8942, + "step": 16717 + }, + { + "epoch": 1.6, + "grad_norm": 0.3508043685631578, + "learning_rate": 2.1136092175166855e-05, + "loss": 0.9388, + "step": 16718 + }, + { + "epoch": 1.6, + "grad_norm": 0.32708223934423875, + "learning_rate": 2.112636644677949e-05, + "loss": 1.0008, + "step": 16719 + }, + { + "epoch": 1.6, + "grad_norm": 0.3225875324863135, + "learning_rate": 2.1116642692254375e-05, + "loss": 1.0322, + "step": 16720 + }, + { + "epoch": 1.6, + "grad_norm": 0.2974118511178529, + "learning_rate": 2.1106920911834872e-05, + "loss": 1.0022, + "step": 16721 + }, + { + "epoch": 1.6, + "grad_norm": 0.25775303862962995, + "learning_rate": 2.1097201105764242e-05, + "loss": 1.1435, + "step": 16722 + }, + { + "epoch": 1.6, + "grad_norm": 0.3191358336956244, + "learning_rate": 2.1087483274285756e-05, + "loss": 0.9669, + "step": 16723 + }, + { + "epoch": 1.6, + "grad_norm": 0.3073323699044901, + "learning_rate": 2.107776741764258e-05, + "loss": 1.0176, + "step": 16724 + }, + { + "epoch": 1.6, + "grad_norm": 0.3324779081121651, + "learning_rate": 2.106805353607787e-05, + "loss": 0.935, + "step": 16725 + }, + { + "epoch": 1.6, + "grad_norm": 0.2722091458728582, + "learning_rate": 2.1058341629834733e-05, + "loss": 1.0581, + "step": 16726 + }, + { + "epoch": 1.6, + "grad_norm": 0.34163668385996854, + "learning_rate": 2.104863169915623e-05, + "loss": 0.9613, + "step": 16727 + }, + { + "epoch": 1.6, + "grad_norm": 0.33083087963104074, + "learning_rate": 2.1038923744285312e-05, + "loss": 1.0097, + "step": 16728 + }, + { + "epoch": 1.6, + "grad_norm": 0.33421002879305106, + "learning_rate": 2.102921776546496e-05, + "loss": 0.9812, + "step": 16729 + }, + { + "epoch": 1.6, + "grad_norm": 0.3384063435802197, + "learning_rate": 2.101951376293808e-05, + "loss": 0.8959, + "step": 16730 + }, + { + "epoch": 1.6, + "grad_norm": 0.3757627137265119, + "learning_rate": 2.1009811736947484e-05, + "loss": 1.0932, + "step": 16731 + }, + { + "epoch": 1.6, + "grad_norm": 0.2989417362930991, + "learning_rate": 2.1000111687736024e-05, + "loss": 1.0192, + "step": 16732 + }, + { + "epoch": 1.6, + "grad_norm": 0.3499850216827928, + "learning_rate": 2.0990413615546355e-05, + "loss": 1.0687, + "step": 16733 + }, + { + "epoch": 1.6, + "grad_norm": 0.3371305431221311, + "learning_rate": 2.0980717520621297e-05, + "loss": 1.0099, + "step": 16734 + }, + { + "epoch": 1.6, + "grad_norm": 0.32500618283595534, + "learning_rate": 2.0971023403203427e-05, + "loss": 0.9345, + "step": 16735 + }, + { + "epoch": 1.6, + "grad_norm": 0.3302246425558807, + "learning_rate": 2.096133126353538e-05, + "loss": 0.9636, + "step": 16736 + }, + { + "epoch": 1.6, + "grad_norm": 0.2882427980975201, + "learning_rate": 2.0951641101859676e-05, + "loss": 1.1103, + "step": 16737 + }, + { + "epoch": 1.6, + "grad_norm": 0.32088307252169496, + "learning_rate": 2.094195291841885e-05, + "loss": 1.1437, + "step": 16738 + }, + { + "epoch": 1.6, + "grad_norm": 0.33216014633189056, + "learning_rate": 2.0932266713455316e-05, + "loss": 1.0746, + "step": 16739 + }, + { + "epoch": 1.6, + "grad_norm": 0.28558835784243913, + "learning_rate": 2.0922582487211494e-05, + "loss": 1.1129, + "step": 16740 + }, + { + "epoch": 1.6, + "grad_norm": 0.32313603763135906, + "learning_rate": 2.0912900239929757e-05, + "loss": 1.0459, + "step": 16741 + }, + { + "epoch": 1.6, + "grad_norm": 0.2880241676560057, + "learning_rate": 2.0903219971852405e-05, + "loss": 1.1087, + "step": 16742 + }, + { + "epoch": 1.6, + "grad_norm": 0.3236152694658199, + "learning_rate": 2.0893541683221672e-05, + "loss": 1.0949, + "step": 16743 + }, + { + "epoch": 1.6, + "grad_norm": 0.3208989197498561, + "learning_rate": 2.08838653742798e-05, + "loss": 0.8852, + "step": 16744 + }, + { + "epoch": 1.6, + "grad_norm": 0.32867230869420366, + "learning_rate": 2.0874191045268886e-05, + "loss": 1.0308, + "step": 16745 + }, + { + "epoch": 1.6, + "grad_norm": 0.30848061851695435, + "learning_rate": 2.0864518696431068e-05, + "loss": 1.0072, + "step": 16746 + }, + { + "epoch": 1.6, + "grad_norm": 0.32571440462234974, + "learning_rate": 2.0854848328008413e-05, + "loss": 0.9624, + "step": 16747 + }, + { + "epoch": 1.6, + "grad_norm": 0.2590576735393619, + "learning_rate": 2.0845179940242944e-05, + "loss": 0.9728, + "step": 16748 + }, + { + "epoch": 1.6, + "grad_norm": 0.2907495913069128, + "learning_rate": 2.0835513533376572e-05, + "loss": 0.9636, + "step": 16749 + }, + { + "epoch": 1.6, + "grad_norm": 0.2900562869671305, + "learning_rate": 2.082584910765122e-05, + "loss": 1.072, + "step": 16750 + }, + { + "epoch": 1.6, + "grad_norm": 0.34009469067779763, + "learning_rate": 2.0816186663308778e-05, + "loss": 0.93, + "step": 16751 + }, + { + "epoch": 1.6, + "grad_norm": 0.32134297630565967, + "learning_rate": 2.0806526200591004e-05, + "loss": 1.0179, + "step": 16752 + }, + { + "epoch": 1.6, + "grad_norm": 0.35002644765475666, + "learning_rate": 2.0796867719739688e-05, + "loss": 1.0731, + "step": 16753 + }, + { + "epoch": 1.6, + "grad_norm": 0.3376642199875619, + "learning_rate": 2.0787211220996527e-05, + "loss": 1.1023, + "step": 16754 + }, + { + "epoch": 1.6, + "grad_norm": 0.316869174102132, + "learning_rate": 2.0777556704603217e-05, + "loss": 0.9085, + "step": 16755 + }, + { + "epoch": 1.6, + "grad_norm": 0.3654033693882466, + "learning_rate": 2.0767904170801323e-05, + "loss": 1.165, + "step": 16756 + }, + { + "epoch": 1.6, + "grad_norm": 0.3217395811937246, + "learning_rate": 2.0758253619832435e-05, + "loss": 1.0852, + "step": 16757 + }, + { + "epoch": 1.6, + "grad_norm": 0.32781253411254174, + "learning_rate": 2.074860505193803e-05, + "loss": 1.1375, + "step": 16758 + }, + { + "epoch": 1.6, + "grad_norm": 0.31251744367437795, + "learning_rate": 2.0738958467359625e-05, + "loss": 1.1491, + "step": 16759 + }, + { + "epoch": 1.6, + "grad_norm": 0.3437374259098806, + "learning_rate": 2.072931386633854e-05, + "loss": 0.9439, + "step": 16760 + }, + { + "epoch": 1.6, + "grad_norm": 0.37458902475713934, + "learning_rate": 2.0719671249116247e-05, + "loss": 1.0637, + "step": 16761 + }, + { + "epoch": 1.6, + "grad_norm": 0.2961938082017746, + "learning_rate": 2.0710030615933986e-05, + "loss": 1.094, + "step": 16762 + }, + { + "epoch": 1.6, + "grad_norm": 0.43438199559636703, + "learning_rate": 2.070039196703306e-05, + "loss": 0.9793, + "step": 16763 + }, + { + "epoch": 1.6, + "grad_norm": 0.31292448012314567, + "learning_rate": 2.0690755302654642e-05, + "loss": 1.0567, + "step": 16764 + }, + { + "epoch": 1.6, + "grad_norm": 0.2890178139155901, + "learning_rate": 2.068112062303994e-05, + "loss": 0.8935, + "step": 16765 + }, + { + "epoch": 1.6, + "grad_norm": 0.29793826236591564, + "learning_rate": 2.0671487928430023e-05, + "loss": 0.9636, + "step": 16766 + }, + { + "epoch": 1.6, + "grad_norm": 0.27209830902543514, + "learning_rate": 2.066185721906596e-05, + "loss": 1.0662, + "step": 16767 + }, + { + "epoch": 1.6, + "grad_norm": 0.3277273592294273, + "learning_rate": 2.0652228495188795e-05, + "loss": 1.1562, + "step": 16768 + }, + { + "epoch": 1.6, + "grad_norm": 0.32233722385286745, + "learning_rate": 2.0642601757039516e-05, + "loss": 1.0498, + "step": 16769 + }, + { + "epoch": 1.6, + "grad_norm": 0.3145395551789763, + "learning_rate": 2.0632977004858955e-05, + "loss": 1.0603, + "step": 16770 + }, + { + "epoch": 1.6, + "grad_norm": 0.3486446030732091, + "learning_rate": 2.062335423888804e-05, + "loss": 1.0272, + "step": 16771 + }, + { + "epoch": 1.6, + "grad_norm": 0.2773820720256362, + "learning_rate": 2.061373345936759e-05, + "loss": 0.9167, + "step": 16772 + }, + { + "epoch": 1.6, + "grad_norm": 0.29862311676468833, + "learning_rate": 2.0604114666538334e-05, + "loss": 0.9849, + "step": 16773 + }, + { + "epoch": 1.6, + "grad_norm": 0.34885106689062717, + "learning_rate": 2.0594497860641005e-05, + "loss": 0.9939, + "step": 16774 + }, + { + "epoch": 1.6, + "grad_norm": 0.341675917007617, + "learning_rate": 2.058488304191627e-05, + "loss": 1.0485, + "step": 16775 + }, + { + "epoch": 1.6, + "grad_norm": 0.29893033603635916, + "learning_rate": 2.0575270210604768e-05, + "loss": 0.9536, + "step": 16776 + }, + { + "epoch": 1.61, + "grad_norm": 0.2973417494976247, + "learning_rate": 2.0565659366947022e-05, + "loss": 0.883, + "step": 16777 + }, + { + "epoch": 1.61, + "grad_norm": 0.32278887860083605, + "learning_rate": 2.0556050511183612e-05, + "loss": 0.9839, + "step": 16778 + }, + { + "epoch": 1.61, + "grad_norm": 0.3774413246388554, + "learning_rate": 2.0546443643554923e-05, + "loss": 1.0278, + "step": 16779 + }, + { + "epoch": 1.61, + "grad_norm": 0.30859208465791865, + "learning_rate": 2.0536838764301423e-05, + "loss": 1.0247, + "step": 16780 + }, + { + "epoch": 1.61, + "grad_norm": 0.29432913585657794, + "learning_rate": 2.0527235873663475e-05, + "loss": 0.9834, + "step": 16781 + }, + { + "epoch": 1.61, + "grad_norm": 0.2983824116246595, + "learning_rate": 2.0517634971881417e-05, + "loss": 1.0421, + "step": 16782 + }, + { + "epoch": 1.61, + "grad_norm": 0.28573885352833145, + "learning_rate": 2.050803605919548e-05, + "loss": 0.9745, + "step": 16783 + }, + { + "epoch": 1.61, + "grad_norm": 0.3592496456120595, + "learning_rate": 2.0498439135845914e-05, + "loss": 1.0164, + "step": 16784 + }, + { + "epoch": 1.61, + "grad_norm": 0.3129540951984324, + "learning_rate": 2.0488844202072855e-05, + "loss": 1.1221, + "step": 16785 + }, + { + "epoch": 1.61, + "grad_norm": 0.3541402363056382, + "learning_rate": 2.0479251258116474e-05, + "loss": 1.0267, + "step": 16786 + }, + { + "epoch": 1.61, + "grad_norm": 0.3382373617863707, + "learning_rate": 2.0469660304216744e-05, + "loss": 0.978, + "step": 16787 + }, + { + "epoch": 1.61, + "grad_norm": 0.30218782992070853, + "learning_rate": 2.0460071340613796e-05, + "loss": 1.0383, + "step": 16788 + }, + { + "epoch": 1.61, + "grad_norm": 0.3297384416664677, + "learning_rate": 2.0450484367547528e-05, + "loss": 1.0108, + "step": 16789 + }, + { + "epoch": 1.61, + "grad_norm": 0.2834210000497093, + "learning_rate": 2.0440899385257916e-05, + "loss": 0.8356, + "step": 16790 + }, + { + "epoch": 1.61, + "grad_norm": 0.3633632541329241, + "learning_rate": 2.0431316393984777e-05, + "loss": 1.086, + "step": 16791 + }, + { + "epoch": 1.61, + "grad_norm": 0.3068130128337737, + "learning_rate": 2.0421735393967943e-05, + "loss": 1.1157, + "step": 16792 + }, + { + "epoch": 1.61, + "grad_norm": 0.3155466210518406, + "learning_rate": 2.0412156385447222e-05, + "loss": 1.0812, + "step": 16793 + }, + { + "epoch": 1.61, + "grad_norm": 0.3354729506245039, + "learning_rate": 2.0402579368662257e-05, + "loss": 1.1187, + "step": 16794 + }, + { + "epoch": 1.61, + "grad_norm": 0.31741701587446375, + "learning_rate": 2.039300434385282e-05, + "loss": 1.0144, + "step": 16795 + }, + { + "epoch": 1.61, + "grad_norm": 0.2909899778138651, + "learning_rate": 2.038343131125845e-05, + "loss": 1.0016, + "step": 16796 + }, + { + "epoch": 1.61, + "grad_norm": 0.291324541550361, + "learning_rate": 2.0373860271118772e-05, + "loss": 1.0321, + "step": 16797 + }, + { + "epoch": 1.61, + "grad_norm": 0.2876850020110775, + "learning_rate": 2.036429122367326e-05, + "loss": 0.8835, + "step": 16798 + }, + { + "epoch": 1.61, + "grad_norm": 0.31958812011341015, + "learning_rate": 2.0354724169161444e-05, + "loss": 1.0927, + "step": 16799 + }, + { + "epoch": 1.61, + "grad_norm": 0.2990763951243673, + "learning_rate": 2.0345159107822677e-05, + "loss": 1.118, + "step": 16800 + }, + { + "epoch": 1.61, + "grad_norm": 0.3096527120603492, + "learning_rate": 2.0335596039896364e-05, + "loss": 1.074, + "step": 16801 + }, + { + "epoch": 1.61, + "grad_norm": 0.346305988260934, + "learning_rate": 2.0326034965621834e-05, + "loss": 0.9821, + "step": 16802 + }, + { + "epoch": 1.61, + "grad_norm": 0.2540486902542284, + "learning_rate": 2.0316475885238374e-05, + "loss": 0.8967, + "step": 16803 + }, + { + "epoch": 1.61, + "grad_norm": 0.25465034959807187, + "learning_rate": 2.0306918798985152e-05, + "loss": 1.0411, + "step": 16804 + }, + { + "epoch": 1.61, + "grad_norm": 0.30458863661340113, + "learning_rate": 2.0297363707101402e-05, + "loss": 1.0266, + "step": 16805 + }, + { + "epoch": 1.61, + "grad_norm": 0.34447183052209635, + "learning_rate": 2.0287810609826198e-05, + "loss": 0.9892, + "step": 16806 + }, + { + "epoch": 1.61, + "grad_norm": 0.30884305513030674, + "learning_rate": 2.0278259507398624e-05, + "loss": 1.0602, + "step": 16807 + }, + { + "epoch": 1.61, + "grad_norm": 0.3501218917370684, + "learning_rate": 2.0268710400057712e-05, + "loss": 0.9816, + "step": 16808 + }, + { + "epoch": 1.61, + "grad_norm": 0.33578325238619877, + "learning_rate": 2.0259163288042447e-05, + "loss": 1.0884, + "step": 16809 + }, + { + "epoch": 1.61, + "grad_norm": 0.3229020891631856, + "learning_rate": 2.0249618171591724e-05, + "loss": 0.9808, + "step": 16810 + }, + { + "epoch": 1.61, + "grad_norm": 0.27301935121797466, + "learning_rate": 2.024007505094442e-05, + "loss": 1.015, + "step": 16811 + }, + { + "epoch": 1.61, + "grad_norm": 0.29637446973290965, + "learning_rate": 2.02305339263394e-05, + "loss": 0.9568, + "step": 16812 + }, + { + "epoch": 1.61, + "grad_norm": 0.31020421351517574, + "learning_rate": 2.022099479801537e-05, + "loss": 0.9703, + "step": 16813 + }, + { + "epoch": 1.61, + "grad_norm": 0.31460228480156793, + "learning_rate": 2.0211457666211075e-05, + "loss": 0.9716, + "step": 16814 + }, + { + "epoch": 1.61, + "grad_norm": 0.26217997824163886, + "learning_rate": 2.0201922531165207e-05, + "loss": 1.0698, + "step": 16815 + }, + { + "epoch": 1.61, + "grad_norm": 0.32945300957975887, + "learning_rate": 2.0192389393116407e-05, + "loss": 0.8574, + "step": 16816 + }, + { + "epoch": 1.61, + "grad_norm": 0.32793977832422033, + "learning_rate": 2.0182858252303195e-05, + "loss": 1.015, + "step": 16817 + }, + { + "epoch": 1.61, + "grad_norm": 0.2983644538589317, + "learning_rate": 2.0173329108964132e-05, + "loss": 0.9845, + "step": 16818 + }, + { + "epoch": 1.61, + "grad_norm": 0.36085845490954177, + "learning_rate": 2.016380196333766e-05, + "loss": 1.0395, + "step": 16819 + }, + { + "epoch": 1.61, + "grad_norm": 0.33169764933392965, + "learning_rate": 2.0154276815662244e-05, + "loss": 1.1049, + "step": 16820 + }, + { + "epoch": 1.61, + "grad_norm": 0.30013511564506656, + "learning_rate": 2.0144753666176175e-05, + "loss": 1.0004, + "step": 16821 + }, + { + "epoch": 1.61, + "grad_norm": 0.3100869024930035, + "learning_rate": 2.0135232515117897e-05, + "loss": 1.0677, + "step": 16822 + }, + { + "epoch": 1.61, + "grad_norm": 0.35174370612702244, + "learning_rate": 2.0125713362725585e-05, + "loss": 0.9677, + "step": 16823 + }, + { + "epoch": 1.61, + "grad_norm": 0.33518912168410786, + "learning_rate": 2.0116196209237516e-05, + "loss": 0.9554, + "step": 16824 + }, + { + "epoch": 1.61, + "grad_norm": 0.2916095823756253, + "learning_rate": 2.0106681054891817e-05, + "loss": 1.0754, + "step": 16825 + }, + { + "epoch": 1.61, + "grad_norm": 0.26886143687013175, + "learning_rate": 2.0097167899926673e-05, + "loss": 0.9209, + "step": 16826 + }, + { + "epoch": 1.61, + "grad_norm": 0.3093350753965334, + "learning_rate": 2.0087656744580084e-05, + "loss": 0.9459, + "step": 16827 + }, + { + "epoch": 1.61, + "grad_norm": 0.31480344760041057, + "learning_rate": 2.0078147589090112e-05, + "loss": 0.9913, + "step": 16828 + }, + { + "epoch": 1.61, + "grad_norm": 0.29417415690531007, + "learning_rate": 2.006864043369472e-05, + "loss": 0.9913, + "step": 16829 + }, + { + "epoch": 1.61, + "grad_norm": 0.3275660324775199, + "learning_rate": 2.005913527863187e-05, + "loss": 1.0666, + "step": 16830 + }, + { + "epoch": 1.61, + "grad_norm": 0.32045437861000603, + "learning_rate": 2.0049632124139373e-05, + "loss": 1.077, + "step": 16831 + }, + { + "epoch": 1.61, + "grad_norm": 0.34247387702202475, + "learning_rate": 2.0040130970455074e-05, + "loss": 0.9796, + "step": 16832 + }, + { + "epoch": 1.61, + "grad_norm": 0.2857506882035686, + "learning_rate": 2.003063181781678e-05, + "loss": 0.9767, + "step": 16833 + }, + { + "epoch": 1.61, + "grad_norm": 0.32338539648413805, + "learning_rate": 2.002113466646215e-05, + "loss": 1.0709, + "step": 16834 + }, + { + "epoch": 1.61, + "grad_norm": 0.3308615832704575, + "learning_rate": 2.0011639516628898e-05, + "loss": 1.0914, + "step": 16835 + }, + { + "epoch": 1.61, + "grad_norm": 0.3525995973625897, + "learning_rate": 2.0002146368554642e-05, + "loss": 1.0068, + "step": 16836 + }, + { + "epoch": 1.61, + "grad_norm": 0.3750879720156884, + "learning_rate": 1.9992655222476964e-05, + "loss": 1.0894, + "step": 16837 + }, + { + "epoch": 1.61, + "grad_norm": 0.32497533585630006, + "learning_rate": 1.9983166078633354e-05, + "loss": 1.0204, + "step": 16838 + }, + { + "epoch": 1.61, + "grad_norm": 0.3132177078709715, + "learning_rate": 1.997367893726132e-05, + "loss": 0.9636, + "step": 16839 + }, + { + "epoch": 1.61, + "grad_norm": 0.30115251149723826, + "learning_rate": 1.9964193798598253e-05, + "loss": 1.1313, + "step": 16840 + }, + { + "epoch": 1.61, + "grad_norm": 0.30210363309647187, + "learning_rate": 1.9954710662881525e-05, + "loss": 0.9785, + "step": 16841 + }, + { + "epoch": 1.61, + "grad_norm": 0.3235931552310994, + "learning_rate": 1.9945229530348476e-05, + "loss": 1.0498, + "step": 16842 + }, + { + "epoch": 1.61, + "grad_norm": 0.2903274354474711, + "learning_rate": 1.9935750401236386e-05, + "loss": 0.9631, + "step": 16843 + }, + { + "epoch": 1.61, + "grad_norm": 0.31049567410024403, + "learning_rate": 1.9926273275782447e-05, + "loss": 0.9719, + "step": 16844 + }, + { + "epoch": 1.61, + "grad_norm": 0.29877051042145925, + "learning_rate": 1.991679815422386e-05, + "loss": 1.0422, + "step": 16845 + }, + { + "epoch": 1.61, + "grad_norm": 0.28520813169621273, + "learning_rate": 1.9907325036797707e-05, + "loss": 1.0812, + "step": 16846 + }, + { + "epoch": 1.61, + "grad_norm": 0.2974553493509437, + "learning_rate": 1.9897853923741118e-05, + "loss": 1.1102, + "step": 16847 + }, + { + "epoch": 1.61, + "grad_norm": 0.3234750172381007, + "learning_rate": 1.9888384815291005e-05, + "loss": 0.9301, + "step": 16848 + }, + { + "epoch": 1.61, + "grad_norm": 0.3080001347218969, + "learning_rate": 1.9878917711684475e-05, + "loss": 1.0147, + "step": 16849 + }, + { + "epoch": 1.61, + "grad_norm": 0.34328186467331023, + "learning_rate": 1.9869452613158357e-05, + "loss": 0.9665, + "step": 16850 + }, + { + "epoch": 1.61, + "grad_norm": 0.32173651562672295, + "learning_rate": 1.9859989519949575e-05, + "loss": 0.9766, + "step": 16851 + }, + { + "epoch": 1.61, + "grad_norm": 0.32640509827306974, + "learning_rate": 1.9850528432294892e-05, + "loss": 1.1956, + "step": 16852 + }, + { + "epoch": 1.61, + "grad_norm": 0.2951998888178708, + "learning_rate": 1.9841069350431107e-05, + "loss": 0.9955, + "step": 16853 + }, + { + "epoch": 1.61, + "grad_norm": 0.3232894193664376, + "learning_rate": 1.9831612274594967e-05, + "loss": 1.0832, + "step": 16854 + }, + { + "epoch": 1.61, + "grad_norm": 0.31259813481740195, + "learning_rate": 1.9822157205023085e-05, + "loss": 1.0026, + "step": 16855 + }, + { + "epoch": 1.61, + "grad_norm": 0.32869554317652205, + "learning_rate": 1.981270414195211e-05, + "loss": 1.0219, + "step": 16856 + }, + { + "epoch": 1.61, + "grad_norm": 0.2869469231218165, + "learning_rate": 1.980325308561861e-05, + "loss": 1.0108, + "step": 16857 + }, + { + "epoch": 1.61, + "grad_norm": 0.26701120839523795, + "learning_rate": 1.9793804036259123e-05, + "loss": 1.0536, + "step": 16858 + }, + { + "epoch": 1.61, + "grad_norm": 0.3285106425779339, + "learning_rate": 1.9784356994110077e-05, + "loss": 1.0211, + "step": 16859 + }, + { + "epoch": 1.61, + "grad_norm": 0.32627898721174103, + "learning_rate": 1.9774911959407937e-05, + "loss": 1.0456, + "step": 16860 + }, + { + "epoch": 1.61, + "grad_norm": 0.32244867723937, + "learning_rate": 1.976546893238902e-05, + "loss": 0.9911, + "step": 16861 + }, + { + "epoch": 1.61, + "grad_norm": 0.3288490367603777, + "learning_rate": 1.9756027913289676e-05, + "loss": 1.085, + "step": 16862 + }, + { + "epoch": 1.61, + "grad_norm": 0.30818142597558335, + "learning_rate": 1.9746588902346152e-05, + "loss": 1.0307, + "step": 16863 + }, + { + "epoch": 1.61, + "grad_norm": 0.3410581237961392, + "learning_rate": 1.9737151899794715e-05, + "loss": 1.0818, + "step": 16864 + }, + { + "epoch": 1.61, + "grad_norm": 0.31419835137855434, + "learning_rate": 1.9727716905871462e-05, + "loss": 0.9394, + "step": 16865 + }, + { + "epoch": 1.61, + "grad_norm": 0.31597021496694133, + "learning_rate": 1.9718283920812564e-05, + "loss": 1.0292, + "step": 16866 + }, + { + "epoch": 1.61, + "grad_norm": 0.29968049967947025, + "learning_rate": 1.970885294485405e-05, + "loss": 0.9002, + "step": 16867 + }, + { + "epoch": 1.61, + "grad_norm": 0.3329118851863104, + "learning_rate": 1.9699423978231947e-05, + "loss": 0.9609, + "step": 16868 + }, + { + "epoch": 1.61, + "grad_norm": 0.2824075700327243, + "learning_rate": 1.968999702118223e-05, + "loss": 1.0103, + "step": 16869 + }, + { + "epoch": 1.61, + "grad_norm": 0.33430719974971235, + "learning_rate": 1.968057207394083e-05, + "loss": 0.9128, + "step": 16870 + }, + { + "epoch": 1.61, + "grad_norm": 0.2733463490214131, + "learning_rate": 1.9671149136743572e-05, + "loss": 0.9667, + "step": 16871 + }, + { + "epoch": 1.61, + "grad_norm": 0.3441420170750598, + "learning_rate": 1.9661728209826312e-05, + "loss": 0.9415, + "step": 16872 + }, + { + "epoch": 1.61, + "grad_norm": 0.2903420626158203, + "learning_rate": 1.9652309293424776e-05, + "loss": 0.9224, + "step": 16873 + }, + { + "epoch": 1.61, + "grad_norm": 0.35847680007496235, + "learning_rate": 1.964289238777469e-05, + "loss": 0.9728, + "step": 16874 + }, + { + "epoch": 1.61, + "grad_norm": 0.3014153650554044, + "learning_rate": 1.963347749311173e-05, + "loss": 1.0067, + "step": 16875 + }, + { + "epoch": 1.61, + "grad_norm": 0.340689728423836, + "learning_rate": 1.9624064609671522e-05, + "loss": 1.1091, + "step": 16876 + }, + { + "epoch": 1.61, + "grad_norm": 0.34682719843501236, + "learning_rate": 1.961465373768959e-05, + "loss": 1.0779, + "step": 16877 + }, + { + "epoch": 1.61, + "grad_norm": 0.33510763683682376, + "learning_rate": 1.9605244877401474e-05, + "loss": 1.1507, + "step": 16878 + }, + { + "epoch": 1.61, + "grad_norm": 0.30754363396412376, + "learning_rate": 1.9595838029042656e-05, + "loss": 1.0108, + "step": 16879 + }, + { + "epoch": 1.61, + "grad_norm": 0.322748567123215, + "learning_rate": 1.9586433192848487e-05, + "loss": 0.9166, + "step": 16880 + }, + { + "epoch": 1.62, + "grad_norm": 0.29245599232923, + "learning_rate": 1.9577030369054404e-05, + "loss": 0.9806, + "step": 16881 + }, + { + "epoch": 1.62, + "grad_norm": 0.3458739377770743, + "learning_rate": 1.9567629557895616e-05, + "loss": 0.9283, + "step": 16882 + }, + { + "epoch": 1.62, + "grad_norm": 0.3348967392909263, + "learning_rate": 1.955823075960751e-05, + "loss": 0.9111, + "step": 16883 + }, + { + "epoch": 1.62, + "grad_norm": 0.2884623513176458, + "learning_rate": 1.9548833974425207e-05, + "loss": 1.1005, + "step": 16884 + }, + { + "epoch": 1.62, + "grad_norm": 0.31441286766966325, + "learning_rate": 1.953943920258392e-05, + "loss": 0.918, + "step": 16885 + }, + { + "epoch": 1.62, + "grad_norm": 0.3101847739280723, + "learning_rate": 1.9530046444318705e-05, + "loss": 0.9885, + "step": 16886 + }, + { + "epoch": 1.62, + "grad_norm": 0.28323536759146, + "learning_rate": 1.9520655699864687e-05, + "loss": 1.0458, + "step": 16887 + }, + { + "epoch": 1.62, + "grad_norm": 0.3221743057204547, + "learning_rate": 1.95112669694568e-05, + "loss": 1.0098, + "step": 16888 + }, + { + "epoch": 1.62, + "grad_norm": 0.352929819701682, + "learning_rate": 1.950188025333005e-05, + "loss": 1.0831, + "step": 16889 + }, + { + "epoch": 1.62, + "grad_norm": 0.31275759101477774, + "learning_rate": 1.9492495551719335e-05, + "loss": 1.1159, + "step": 16890 + }, + { + "epoch": 1.62, + "grad_norm": 0.31564622585025415, + "learning_rate": 1.9483112864859555e-05, + "loss": 1.0094, + "step": 16891 + }, + { + "epoch": 1.62, + "grad_norm": 0.2986354873606361, + "learning_rate": 1.947373219298544e-05, + "loss": 1.0335, + "step": 16892 + }, + { + "epoch": 1.62, + "grad_norm": 0.3045205692659056, + "learning_rate": 1.946435353633178e-05, + "loss": 1.052, + "step": 16893 + }, + { + "epoch": 1.62, + "grad_norm": 0.2863891928863833, + "learning_rate": 1.9454976895133326e-05, + "loss": 0.8859, + "step": 16894 + }, + { + "epoch": 1.62, + "grad_norm": 0.3382791035694023, + "learning_rate": 1.9445602269624663e-05, + "loss": 0.9216, + "step": 16895 + }, + { + "epoch": 1.62, + "grad_norm": 0.3254304276572563, + "learning_rate": 1.9436229660040427e-05, + "loss": 1.0211, + "step": 16896 + }, + { + "epoch": 1.62, + "grad_norm": 0.35528037626349845, + "learning_rate": 1.9426859066615187e-05, + "loss": 1.07, + "step": 16897 + }, + { + "epoch": 1.62, + "grad_norm": 0.3595944710888877, + "learning_rate": 1.9417490489583455e-05, + "loss": 1.1099, + "step": 16898 + }, + { + "epoch": 1.62, + "grad_norm": 0.2973078219359796, + "learning_rate": 1.9408123929179643e-05, + "loss": 1.0843, + "step": 16899 + }, + { + "epoch": 1.62, + "grad_norm": 0.32583319583835213, + "learning_rate": 1.939875938563821e-05, + "loss": 0.9635, + "step": 16900 + }, + { + "epoch": 1.62, + "grad_norm": 0.33544027599083953, + "learning_rate": 1.9389396859193444e-05, + "loss": 1.1198, + "step": 16901 + }, + { + "epoch": 1.62, + "grad_norm": 0.26328884215609166, + "learning_rate": 1.938003635007969e-05, + "loss": 1.1064, + "step": 16902 + }, + { + "epoch": 1.62, + "grad_norm": 0.3112203965085122, + "learning_rate": 1.9370677858531194e-05, + "loss": 0.9393, + "step": 16903 + }, + { + "epoch": 1.62, + "grad_norm": 0.32894964235936874, + "learning_rate": 1.9361321384782183e-05, + "loss": 0.965, + "step": 16904 + }, + { + "epoch": 1.62, + "grad_norm": 0.3214919614387117, + "learning_rate": 1.9351966929066757e-05, + "loss": 1.0823, + "step": 16905 + }, + { + "epoch": 1.62, + "grad_norm": 0.3644099345235849, + "learning_rate": 1.934261449161907e-05, + "loss": 1.0169, + "step": 16906 + }, + { + "epoch": 1.62, + "grad_norm": 0.3325368309205661, + "learning_rate": 1.9333264072673128e-05, + "loss": 0.8948, + "step": 16907 + }, + { + "epoch": 1.62, + "grad_norm": 0.3275548153482294, + "learning_rate": 1.9323915672462968e-05, + "loss": 1.0185, + "step": 16908 + }, + { + "epoch": 1.62, + "grad_norm": 0.32631297724934955, + "learning_rate": 1.9314569291222472e-05, + "loss": 0.942, + "step": 16909 + }, + { + "epoch": 1.62, + "grad_norm": 0.3614088819786124, + "learning_rate": 1.9305224929185638e-05, + "loss": 0.8758, + "step": 16910 + }, + { + "epoch": 1.62, + "grad_norm": 0.29365257743257667, + "learning_rate": 1.9295882586586244e-05, + "loss": 1.0373, + "step": 16911 + }, + { + "epoch": 1.62, + "grad_norm": 0.30397039956931887, + "learning_rate": 1.9286542263658135e-05, + "loss": 1.0399, + "step": 16912 + }, + { + "epoch": 1.62, + "grad_norm": 0.34003582253073933, + "learning_rate": 1.9277203960634994e-05, + "loss": 1.0294, + "step": 16913 + }, + { + "epoch": 1.62, + "grad_norm": 0.3380079053894546, + "learning_rate": 1.9267867677750572e-05, + "loss": 1.0554, + "step": 16914 + }, + { + "epoch": 1.62, + "grad_norm": 0.3136075614913414, + "learning_rate": 1.925853341523851e-05, + "loss": 0.9604, + "step": 16915 + }, + { + "epoch": 1.62, + "grad_norm": 0.29825293841632017, + "learning_rate": 1.924920117333239e-05, + "loss": 0.9785, + "step": 16916 + }, + { + "epoch": 1.62, + "grad_norm": 0.3232431645991683, + "learning_rate": 1.923987095226574e-05, + "loss": 0.9395, + "step": 16917 + }, + { + "epoch": 1.62, + "grad_norm": 0.3335140885181981, + "learning_rate": 1.9230542752272085e-05, + "loss": 1.0986, + "step": 16918 + }, + { + "epoch": 1.62, + "grad_norm": 0.30331039385786374, + "learning_rate": 1.9221216573584876e-05, + "loss": 1.0406, + "step": 16919 + }, + { + "epoch": 1.62, + "grad_norm": 0.28067153731209743, + "learning_rate": 1.921189241643747e-05, + "loss": 1.0665, + "step": 16920 + }, + { + "epoch": 1.62, + "grad_norm": 0.3128553502565437, + "learning_rate": 1.9202570281063258e-05, + "loss": 0.9292, + "step": 16921 + }, + { + "epoch": 1.62, + "grad_norm": 0.2979510186232973, + "learning_rate": 1.9193250167695476e-05, + "loss": 1.1016, + "step": 16922 + }, + { + "epoch": 1.62, + "grad_norm": 0.3176058203680617, + "learning_rate": 1.918393207656739e-05, + "loss": 1.0689, + "step": 16923 + }, + { + "epoch": 1.62, + "grad_norm": 0.31198767597466925, + "learning_rate": 1.9174616007912204e-05, + "loss": 0.9637, + "step": 16924 + }, + { + "epoch": 1.62, + "grad_norm": 0.31353475857427743, + "learning_rate": 1.9165301961963067e-05, + "loss": 1.0814, + "step": 16925 + }, + { + "epoch": 1.62, + "grad_norm": 0.3219404035172954, + "learning_rate": 1.9155989938953024e-05, + "loss": 1.0787, + "step": 16926 + }, + { + "epoch": 1.62, + "grad_norm": 0.290020795052471, + "learning_rate": 1.9146679939115176e-05, + "loss": 1.0559, + "step": 16927 + }, + { + "epoch": 1.62, + "grad_norm": 0.2803241067185356, + "learning_rate": 1.913737196268245e-05, + "loss": 0.9904, + "step": 16928 + }, + { + "epoch": 1.62, + "grad_norm": 0.3260949651929496, + "learning_rate": 1.9128066009887803e-05, + "loss": 0.9603, + "step": 16929 + }, + { + "epoch": 1.62, + "grad_norm": 0.380606688142758, + "learning_rate": 1.9118762080964136e-05, + "loss": 1.0762, + "step": 16930 + }, + { + "epoch": 1.62, + "grad_norm": 0.33826417942378983, + "learning_rate": 1.91094601761443e-05, + "loss": 0.9513, + "step": 16931 + }, + { + "epoch": 1.62, + "grad_norm": 0.3089257147246083, + "learning_rate": 1.910016029566104e-05, + "loss": 0.9797, + "step": 16932 + }, + { + "epoch": 1.62, + "grad_norm": 0.3301037176658933, + "learning_rate": 1.9090862439747136e-05, + "loss": 0.9322, + "step": 16933 + }, + { + "epoch": 1.62, + "grad_norm": 0.2712297920451885, + "learning_rate": 1.9081566608635227e-05, + "loss": 1.0066, + "step": 16934 + }, + { + "epoch": 1.62, + "grad_norm": 0.27603405678595655, + "learning_rate": 1.9072272802557968e-05, + "loss": 0.928, + "step": 16935 + }, + { + "epoch": 1.62, + "grad_norm": 0.2769594512230032, + "learning_rate": 1.906298102174794e-05, + "loss": 0.9601, + "step": 16936 + }, + { + "epoch": 1.62, + "grad_norm": 0.35082436570030995, + "learning_rate": 1.9053691266437702e-05, + "loss": 1.1144, + "step": 16937 + }, + { + "epoch": 1.62, + "grad_norm": 0.3394780128882192, + "learning_rate": 1.904440353685969e-05, + "loss": 0.9762, + "step": 16938 + }, + { + "epoch": 1.62, + "grad_norm": 0.3025589496911696, + "learning_rate": 1.9035117833246364e-05, + "loss": 0.9747, + "step": 16939 + }, + { + "epoch": 1.62, + "grad_norm": 0.3292983076539834, + "learning_rate": 1.9025834155830115e-05, + "loss": 0.8831, + "step": 16940 + }, + { + "epoch": 1.62, + "grad_norm": 0.3072617169524354, + "learning_rate": 1.9016552504843233e-05, + "loss": 0.943, + "step": 16941 + }, + { + "epoch": 1.62, + "grad_norm": 0.3405150111127615, + "learning_rate": 1.9007272880518057e-05, + "loss": 1.0923, + "step": 16942 + }, + { + "epoch": 1.62, + "grad_norm": 0.3073361192165714, + "learning_rate": 1.899799528308671e-05, + "loss": 0.996, + "step": 16943 + }, + { + "epoch": 1.62, + "grad_norm": 0.26837097540365223, + "learning_rate": 1.89887197127815e-05, + "loss": 0.9407, + "step": 16944 + }, + { + "epoch": 1.62, + "grad_norm": 0.30579837924350484, + "learning_rate": 1.897944616983447e-05, + "loss": 1.0636, + "step": 16945 + }, + { + "epoch": 1.62, + "grad_norm": 0.31217506123314825, + "learning_rate": 1.8970174654477746e-05, + "loss": 1.069, + "step": 16946 + }, + { + "epoch": 1.62, + "grad_norm": 0.2934022257532515, + "learning_rate": 1.89609051669433e-05, + "loss": 1.052, + "step": 16947 + }, + { + "epoch": 1.62, + "grad_norm": 0.3486418148367129, + "learning_rate": 1.8951637707463156e-05, + "loss": 0.9951, + "step": 16948 + }, + { + "epoch": 1.62, + "grad_norm": 0.3140485936071772, + "learning_rate": 1.894237227626918e-05, + "loss": 1.0333, + "step": 16949 + }, + { + "epoch": 1.62, + "grad_norm": 0.29860850542207157, + "learning_rate": 1.8933108873593297e-05, + "loss": 1.0275, + "step": 16950 + }, + { + "epoch": 1.62, + "grad_norm": 0.316643564373999, + "learning_rate": 1.8923847499667302e-05, + "loss": 1.1119, + "step": 16951 + }, + { + "epoch": 1.62, + "grad_norm": 0.36510450087708984, + "learning_rate": 1.8914588154723013e-05, + "loss": 1.063, + "step": 16952 + }, + { + "epoch": 1.62, + "grad_norm": 0.27493109333227383, + "learning_rate": 1.8905330838992086e-05, + "loss": 0.9609, + "step": 16953 + }, + { + "epoch": 1.62, + "grad_norm": 0.2953974561783024, + "learning_rate": 1.889607555270624e-05, + "loss": 1.1412, + "step": 16954 + }, + { + "epoch": 1.62, + "grad_norm": 0.307945225595946, + "learning_rate": 1.888682229609706e-05, + "loss": 1.0747, + "step": 16955 + }, + { + "epoch": 1.62, + "grad_norm": 0.3394967436196446, + "learning_rate": 1.8877571069396126e-05, + "loss": 1.0245, + "step": 16956 + }, + { + "epoch": 1.62, + "grad_norm": 0.3137729870114311, + "learning_rate": 1.886832187283496e-05, + "loss": 1.0247, + "step": 16957 + }, + { + "epoch": 1.62, + "grad_norm": 0.339541860379853, + "learning_rate": 1.8859074706645052e-05, + "loss": 1.0517, + "step": 16958 + }, + { + "epoch": 1.62, + "grad_norm": 0.31121749218380185, + "learning_rate": 1.8849829571057764e-05, + "loss": 1.0803, + "step": 16959 + }, + { + "epoch": 1.62, + "grad_norm": 0.29592936315391977, + "learning_rate": 1.88405864663045e-05, + "loss": 1.0915, + "step": 16960 + }, + { + "epoch": 1.62, + "grad_norm": 0.32159986796407014, + "learning_rate": 1.8831345392616583e-05, + "loss": 0.999, + "step": 16961 + }, + { + "epoch": 1.62, + "grad_norm": 0.3335534674045699, + "learning_rate": 1.8822106350225244e-05, + "loss": 0.988, + "step": 16962 + }, + { + "epoch": 1.62, + "grad_norm": 0.3454687540309944, + "learning_rate": 1.8812869339361717e-05, + "loss": 0.9678, + "step": 16963 + }, + { + "epoch": 1.62, + "grad_norm": 0.3112738846741404, + "learning_rate": 1.880363436025715e-05, + "loss": 1.0334, + "step": 16964 + }, + { + "epoch": 1.62, + "grad_norm": 0.33906738010170123, + "learning_rate": 1.8794401413142682e-05, + "loss": 0.9176, + "step": 16965 + }, + { + "epoch": 1.62, + "grad_norm": 0.32484103440254297, + "learning_rate": 1.878517049824934e-05, + "loss": 1.0805, + "step": 16966 + }, + { + "epoch": 1.62, + "grad_norm": 0.3564128356796072, + "learning_rate": 1.877594161580817e-05, + "loss": 0.9239, + "step": 16967 + }, + { + "epoch": 1.62, + "grad_norm": 0.3066708457109732, + "learning_rate": 1.8766714766050085e-05, + "loss": 0.9881, + "step": 16968 + }, + { + "epoch": 1.62, + "grad_norm": 0.33252691642763293, + "learning_rate": 1.875748994920604e-05, + "loss": 0.9982, + "step": 16969 + }, + { + "epoch": 1.62, + "grad_norm": 0.3281948380249302, + "learning_rate": 1.8748267165506817e-05, + "loss": 1.0242, + "step": 16970 + }, + { + "epoch": 1.62, + "grad_norm": 0.3191382866682618, + "learning_rate": 1.8739046415183313e-05, + "loss": 0.9924, + "step": 16971 + }, + { + "epoch": 1.62, + "grad_norm": 0.29338642706432505, + "learning_rate": 1.872982769846624e-05, + "loss": 1.0757, + "step": 16972 + }, + { + "epoch": 1.62, + "grad_norm": 0.31952833401668157, + "learning_rate": 1.872061101558631e-05, + "loss": 0.9899, + "step": 16973 + }, + { + "epoch": 1.62, + "grad_norm": 0.32788228910927264, + "learning_rate": 1.8711396366774157e-05, + "loss": 0.9723, + "step": 16974 + }, + { + "epoch": 1.62, + "grad_norm": 0.28682960697334425, + "learning_rate": 1.870218375226043e-05, + "loss": 0.9467, + "step": 16975 + }, + { + "epoch": 1.62, + "grad_norm": 0.3573606824240437, + "learning_rate": 1.8692973172275607e-05, + "loss": 1.0464, + "step": 16976 + }, + { + "epoch": 1.62, + "grad_norm": 0.3177053959292174, + "learning_rate": 1.8683764627050238e-05, + "loss": 0.9649, + "step": 16977 + }, + { + "epoch": 1.62, + "grad_norm": 0.30965869403107904, + "learning_rate": 1.8674558116814765e-05, + "loss": 1.076, + "step": 16978 + }, + { + "epoch": 1.62, + "grad_norm": 0.3329932350695768, + "learning_rate": 1.866535364179959e-05, + "loss": 0.9988, + "step": 16979 + }, + { + "epoch": 1.62, + "grad_norm": 0.2899708581680402, + "learning_rate": 1.865615120223507e-05, + "loss": 1.0024, + "step": 16980 + }, + { + "epoch": 1.62, + "grad_norm": 0.30415705733818565, + "learning_rate": 1.864695079835147e-05, + "loss": 0.9922, + "step": 16981 + }, + { + "epoch": 1.62, + "grad_norm": 0.3487245509835739, + "learning_rate": 1.8637752430379085e-05, + "loss": 0.9494, + "step": 16982 + }, + { + "epoch": 1.62, + "grad_norm": 0.3105212259813366, + "learning_rate": 1.8628556098548044e-05, + "loss": 1.0476, + "step": 16983 + }, + { + "epoch": 1.62, + "grad_norm": 0.30465803475539066, + "learning_rate": 1.8619361803088542e-05, + "loss": 1.0235, + "step": 16984 + }, + { + "epoch": 1.62, + "grad_norm": 0.29916873849573694, + "learning_rate": 1.8610169544230638e-05, + "loss": 1.0231, + "step": 16985 + }, + { + "epoch": 1.63, + "grad_norm": 0.33720098840359525, + "learning_rate": 1.8600979322204438e-05, + "loss": 1.0356, + "step": 16986 + }, + { + "epoch": 1.63, + "grad_norm": 0.3150300656519562, + "learning_rate": 1.8591791137239845e-05, + "loss": 1.1567, + "step": 16987 + }, + { + "epoch": 1.63, + "grad_norm": 0.31979635038165904, + "learning_rate": 1.8582604989566876e-05, + "loss": 1.036, + "step": 16988 + }, + { + "epoch": 1.63, + "grad_norm": 0.30344726509543674, + "learning_rate": 1.857342087941536e-05, + "loss": 1.0097, + "step": 16989 + }, + { + "epoch": 1.63, + "grad_norm": 0.3143350158703734, + "learning_rate": 1.8564238807015155e-05, + "loss": 0.9939, + "step": 16990 + }, + { + "epoch": 1.63, + "grad_norm": 0.3071041205338204, + "learning_rate": 1.8555058772596068e-05, + "loss": 1.1426, + "step": 16991 + }, + { + "epoch": 1.63, + "grad_norm": 0.3482996347560852, + "learning_rate": 1.8545880776387835e-05, + "loss": 1.0889, + "step": 16992 + }, + { + "epoch": 1.63, + "grad_norm": 0.27826078766952184, + "learning_rate": 1.853670481862011e-05, + "loss": 0.9568, + "step": 16993 + }, + { + "epoch": 1.63, + "grad_norm": 0.3282777701902149, + "learning_rate": 1.852753089952256e-05, + "loss": 1.0229, + "step": 16994 + }, + { + "epoch": 1.63, + "grad_norm": 0.31439052988532196, + "learning_rate": 1.8518359019324738e-05, + "loss": 1.0511, + "step": 16995 + }, + { + "epoch": 1.63, + "grad_norm": 0.3411339331305132, + "learning_rate": 1.8509189178256182e-05, + "loss": 1.1597, + "step": 16996 + }, + { + "epoch": 1.63, + "grad_norm": 0.33956393370757376, + "learning_rate": 1.8500021376546382e-05, + "loss": 0.9385, + "step": 16997 + }, + { + "epoch": 1.63, + "grad_norm": 0.3807382139626251, + "learning_rate": 1.8490855614424808e-05, + "loss": 1.0241, + "step": 16998 + }, + { + "epoch": 1.63, + "grad_norm": 0.3359474848753567, + "learning_rate": 1.8481691892120757e-05, + "loss": 1.0989, + "step": 16999 + }, + { + "epoch": 1.63, + "grad_norm": 0.3272181370084019, + "learning_rate": 1.8472530209863602e-05, + "loss": 1.1479, + "step": 17000 + }, + { + "epoch": 1.63, + "grad_norm": 0.34959970722085815, + "learning_rate": 1.8463370567882643e-05, + "loss": 0.9651, + "step": 17001 + }, + { + "epoch": 1.63, + "grad_norm": 0.300924297926539, + "learning_rate": 1.845421296640706e-05, + "loss": 1.0158, + "step": 17002 + }, + { + "epoch": 1.63, + "grad_norm": 0.3133326714513339, + "learning_rate": 1.8445057405666067e-05, + "loss": 1.0054, + "step": 17003 + }, + { + "epoch": 1.63, + "grad_norm": 0.3561541750843562, + "learning_rate": 1.8435903885888716e-05, + "loss": 1.0497, + "step": 17004 + }, + { + "epoch": 1.63, + "grad_norm": 0.2892316766967573, + "learning_rate": 1.842675240730418e-05, + "loss": 1.1217, + "step": 17005 + }, + { + "epoch": 1.63, + "grad_norm": 0.3218622365493191, + "learning_rate": 1.841760297014141e-05, + "loss": 0.9823, + "step": 17006 + }, + { + "epoch": 1.63, + "grad_norm": 0.3013832833603239, + "learning_rate": 1.8408455574629434e-05, + "loss": 0.9941, + "step": 17007 + }, + { + "epoch": 1.63, + "grad_norm": 0.35741238164535577, + "learning_rate": 1.8399310220997102e-05, + "loss": 1.0085, + "step": 17008 + }, + { + "epoch": 1.63, + "grad_norm": 0.347441109135565, + "learning_rate": 1.839016690947334e-05, + "loss": 1.0716, + "step": 17009 + }, + { + "epoch": 1.63, + "grad_norm": 0.3151530634625425, + "learning_rate": 1.838102564028691e-05, + "loss": 1.0098, + "step": 17010 + }, + { + "epoch": 1.63, + "grad_norm": 0.30625525577059703, + "learning_rate": 1.8371886413666617e-05, + "loss": 1.0975, + "step": 17011 + }, + { + "epoch": 1.63, + "grad_norm": 0.32090703901749273, + "learning_rate": 1.8362749229841157e-05, + "loss": 1.0508, + "step": 17012 + }, + { + "epoch": 1.63, + "grad_norm": 0.3183555096250065, + "learning_rate": 1.835361408903924e-05, + "loss": 1.0326, + "step": 17013 + }, + { + "epoch": 1.63, + "grad_norm": 0.32599994933462684, + "learning_rate": 1.834448099148942e-05, + "loss": 0.9739, + "step": 17014 + }, + { + "epoch": 1.63, + "grad_norm": 0.37367007815294395, + "learning_rate": 1.83353499374203e-05, + "loss": 1.1021, + "step": 17015 + }, + { + "epoch": 1.63, + "grad_norm": 0.30361307020138534, + "learning_rate": 1.8326220927060355e-05, + "loss": 0.93, + "step": 17016 + }, + { + "epoch": 1.63, + "grad_norm": 0.30568267580101743, + "learning_rate": 1.8317093960638055e-05, + "loss": 0.9806, + "step": 17017 + }, + { + "epoch": 1.63, + "grad_norm": 0.3050899826499216, + "learning_rate": 1.830796903838181e-05, + "loss": 0.9552, + "step": 17018 + }, + { + "epoch": 1.63, + "grad_norm": 0.3297780748148927, + "learning_rate": 1.8298846160520022e-05, + "loss": 1.1032, + "step": 17019 + }, + { + "epoch": 1.63, + "grad_norm": 0.33676686628204666, + "learning_rate": 1.8289725327280914e-05, + "loss": 1.0614, + "step": 17020 + }, + { + "epoch": 1.63, + "grad_norm": 0.37438138415811184, + "learning_rate": 1.8280606538892785e-05, + "loss": 0.9444, + "step": 17021 + }, + { + "epoch": 1.63, + "grad_norm": 0.28194937875657466, + "learning_rate": 1.8271489795583874e-05, + "loss": 0.9202, + "step": 17022 + }, + { + "epoch": 1.63, + "grad_norm": 0.3156946373528751, + "learning_rate": 1.8262375097582263e-05, + "loss": 0.9467, + "step": 17023 + }, + { + "epoch": 1.63, + "grad_norm": 0.2874580084483674, + "learning_rate": 1.825326244511608e-05, + "loss": 0.9214, + "step": 17024 + }, + { + "epoch": 1.63, + "grad_norm": 0.3752165951325836, + "learning_rate": 1.824415183841338e-05, + "loss": 1.0491, + "step": 17025 + }, + { + "epoch": 1.63, + "grad_norm": 0.3227319803576955, + "learning_rate": 1.8235043277702203e-05, + "loss": 1.0547, + "step": 17026 + }, + { + "epoch": 1.63, + "grad_norm": 0.3412290620341622, + "learning_rate": 1.8225936763210417e-05, + "loss": 1.0307, + "step": 17027 + }, + { + "epoch": 1.63, + "grad_norm": 0.2888276691704394, + "learning_rate": 1.8216832295165987e-05, + "loss": 1.1343, + "step": 17028 + }, + { + "epoch": 1.63, + "grad_norm": 0.34568660075493207, + "learning_rate": 1.8207729873796696e-05, + "loss": 0.9196, + "step": 17029 + }, + { + "epoch": 1.63, + "grad_norm": 0.28085312621698977, + "learning_rate": 1.81986294993304e-05, + "loss": 1.0532, + "step": 17030 + }, + { + "epoch": 1.63, + "grad_norm": 0.35838748268019194, + "learning_rate": 1.8189531171994757e-05, + "loss": 0.952, + "step": 17031 + }, + { + "epoch": 1.63, + "grad_norm": 0.31230354407642397, + "learning_rate": 1.8180434892017562e-05, + "loss": 1.0396, + "step": 17032 + }, + { + "epoch": 1.63, + "grad_norm": 0.3074617117565263, + "learning_rate": 1.817134065962638e-05, + "loss": 0.9425, + "step": 17033 + }, + { + "epoch": 1.63, + "grad_norm": 0.33745488244386684, + "learning_rate": 1.816224847504885e-05, + "loss": 0.9834, + "step": 17034 + }, + { + "epoch": 1.63, + "grad_norm": 0.34820675159738457, + "learning_rate": 1.8153158338512465e-05, + "loss": 1.0436, + "step": 17035 + }, + { + "epoch": 1.63, + "grad_norm": 0.29323238150295733, + "learning_rate": 1.814407025024476e-05, + "loss": 1.0376, + "step": 17036 + }, + { + "epoch": 1.63, + "grad_norm": 0.3070851626205139, + "learning_rate": 1.8134984210473106e-05, + "loss": 0.97, + "step": 17037 + }, + { + "epoch": 1.63, + "grad_norm": 0.2946193266014478, + "learning_rate": 1.812590021942493e-05, + "loss": 1.0339, + "step": 17038 + }, + { + "epoch": 1.63, + "grad_norm": 0.3014456431178478, + "learning_rate": 1.811681827732755e-05, + "loss": 0.9552, + "step": 17039 + }, + { + "epoch": 1.63, + "grad_norm": 0.344391797286783, + "learning_rate": 1.810773838440829e-05, + "loss": 1.0719, + "step": 17040 + }, + { + "epoch": 1.63, + "grad_norm": 0.34285205611255315, + "learning_rate": 1.809866054089431e-05, + "loss": 0.9763, + "step": 17041 + }, + { + "epoch": 1.63, + "grad_norm": 0.29524457546554833, + "learning_rate": 1.8089584747012823e-05, + "loss": 1.0294, + "step": 17042 + }, + { + "epoch": 1.63, + "grad_norm": 0.3532602658457792, + "learning_rate": 1.8080511002990986e-05, + "loss": 1.0178, + "step": 17043 + }, + { + "epoch": 1.63, + "grad_norm": 0.2995541032826581, + "learning_rate": 1.8071439309055815e-05, + "loss": 1.0354, + "step": 17044 + }, + { + "epoch": 1.63, + "grad_norm": 0.32394046642633717, + "learning_rate": 1.8062369665434376e-05, + "loss": 0.9438, + "step": 17045 + }, + { + "epoch": 1.63, + "grad_norm": 0.335417244639758, + "learning_rate": 1.8053302072353616e-05, + "loss": 1.0084, + "step": 17046 + }, + { + "epoch": 1.63, + "grad_norm": 0.3431857289405035, + "learning_rate": 1.8044236530040514e-05, + "loss": 1.0555, + "step": 17047 + }, + { + "epoch": 1.63, + "grad_norm": 0.35367630106923703, + "learning_rate": 1.8035173038721863e-05, + "loss": 0.9565, + "step": 17048 + }, + { + "epoch": 1.63, + "grad_norm": 0.3531561729098898, + "learning_rate": 1.802611159862455e-05, + "loss": 0.9563, + "step": 17049 + }, + { + "epoch": 1.63, + "grad_norm": 0.3099599806583361, + "learning_rate": 1.8017052209975293e-05, + "loss": 0.9902, + "step": 17050 + }, + { + "epoch": 1.63, + "grad_norm": 0.2898051825389753, + "learning_rate": 1.800799487300082e-05, + "loss": 0.9045, + "step": 17051 + }, + { + "epoch": 1.63, + "grad_norm": 0.3042622196333819, + "learning_rate": 1.799893958792781e-05, + "loss": 1.0357, + "step": 17052 + }, + { + "epoch": 1.63, + "grad_norm": 0.3312213220967892, + "learning_rate": 1.79898863549829e-05, + "loss": 1.0737, + "step": 17053 + }, + { + "epoch": 1.63, + "grad_norm": 0.3156932546681177, + "learning_rate": 1.7980835174392596e-05, + "loss": 1.0319, + "step": 17054 + }, + { + "epoch": 1.63, + "grad_norm": 0.31713614463827655, + "learning_rate": 1.7971786046383465e-05, + "loss": 1.043, + "step": 17055 + }, + { + "epoch": 1.63, + "grad_norm": 0.31623546603693037, + "learning_rate": 1.7962738971181913e-05, + "loss": 1.1077, + "step": 17056 + }, + { + "epoch": 1.63, + "grad_norm": 0.32986418432462056, + "learning_rate": 1.7953693949014404e-05, + "loss": 0.9248, + "step": 17057 + }, + { + "epoch": 1.63, + "grad_norm": 0.34425841341663466, + "learning_rate": 1.7944650980107225e-05, + "loss": 1.0877, + "step": 17058 + }, + { + "epoch": 1.63, + "grad_norm": 0.3252357813589762, + "learning_rate": 1.793561006468677e-05, + "loss": 0.9435, + "step": 17059 + }, + { + "epoch": 1.63, + "grad_norm": 0.29958145114055507, + "learning_rate": 1.7926571202979224e-05, + "loss": 1.0311, + "step": 17060 + }, + { + "epoch": 1.63, + "grad_norm": 0.3435298097391128, + "learning_rate": 1.7917534395210834e-05, + "loss": 1.0087, + "step": 17061 + }, + { + "epoch": 1.63, + "grad_norm": 0.3050015371147638, + "learning_rate": 1.7908499641607712e-05, + "loss": 1.0077, + "step": 17062 + }, + { + "epoch": 1.63, + "grad_norm": 0.31674939638792315, + "learning_rate": 1.7899466942395983e-05, + "loss": 1.0941, + "step": 17063 + }, + { + "epoch": 1.63, + "grad_norm": 0.3012341874513499, + "learning_rate": 1.7890436297801716e-05, + "loss": 1.0164, + "step": 17064 + }, + { + "epoch": 1.63, + "grad_norm": 0.3167549009185218, + "learning_rate": 1.7881407708050857e-05, + "loss": 1.0576, + "step": 17065 + }, + { + "epoch": 1.63, + "grad_norm": 0.3171445648055221, + "learning_rate": 1.787238117336938e-05, + "loss": 1.1425, + "step": 17066 + }, + { + "epoch": 1.63, + "grad_norm": 0.34961728360348726, + "learning_rate": 1.786335669398317e-05, + "loss": 1.0489, + "step": 17067 + }, + { + "epoch": 1.63, + "grad_norm": 0.27179044454602763, + "learning_rate": 1.785433427011811e-05, + "loss": 1.103, + "step": 17068 + }, + { + "epoch": 1.63, + "grad_norm": 0.31597192451260836, + "learning_rate": 1.784531390199995e-05, + "loss": 0.9814, + "step": 17069 + }, + { + "epoch": 1.63, + "grad_norm": 0.2850482156010418, + "learning_rate": 1.7836295589854456e-05, + "loss": 1.0705, + "step": 17070 + }, + { + "epoch": 1.63, + "grad_norm": 0.3331480677820594, + "learning_rate": 1.7827279333907276e-05, + "loss": 1.0116, + "step": 17071 + }, + { + "epoch": 1.63, + "grad_norm": 0.28917035638752225, + "learning_rate": 1.7818265134384084e-05, + "loss": 0.9241, + "step": 17072 + }, + { + "epoch": 1.63, + "grad_norm": 0.33306593398928175, + "learning_rate": 1.780925299151044e-05, + "loss": 0.9742, + "step": 17073 + }, + { + "epoch": 1.63, + "grad_norm": 0.34642438795933617, + "learning_rate": 1.7800242905511934e-05, + "loss": 0.9731, + "step": 17074 + }, + { + "epoch": 1.63, + "grad_norm": 0.3608760910288961, + "learning_rate": 1.7791234876613984e-05, + "loss": 1.0057, + "step": 17075 + }, + { + "epoch": 1.63, + "grad_norm": 0.34337901034519897, + "learning_rate": 1.7782228905042064e-05, + "loss": 0.8325, + "step": 17076 + }, + { + "epoch": 1.63, + "grad_norm": 0.30135675216741964, + "learning_rate": 1.777322499102152e-05, + "loss": 1.0779, + "step": 17077 + }, + { + "epoch": 1.63, + "grad_norm": 0.3076893800579344, + "learning_rate": 1.7764223134777692e-05, + "loss": 1.0855, + "step": 17078 + }, + { + "epoch": 1.63, + "grad_norm": 0.31242799097235574, + "learning_rate": 1.7755223336535865e-05, + "loss": 0.992, + "step": 17079 + }, + { + "epoch": 1.63, + "grad_norm": 0.27119309559423344, + "learning_rate": 1.7746225596521294e-05, + "loss": 0.9502, + "step": 17080 + }, + { + "epoch": 1.63, + "grad_norm": 0.34005869486478435, + "learning_rate": 1.7737229914959087e-05, + "loss": 1.0189, + "step": 17081 + }, + { + "epoch": 1.63, + "grad_norm": 0.3254139652656149, + "learning_rate": 1.772823629207441e-05, + "loss": 0.9828, + "step": 17082 + }, + { + "epoch": 1.63, + "grad_norm": 0.30905320441859574, + "learning_rate": 1.7719244728092342e-05, + "loss": 0.9881, + "step": 17083 + }, + { + "epoch": 1.63, + "grad_norm": 0.3244318682194523, + "learning_rate": 1.771025522323787e-05, + "loss": 1.0224, + "step": 17084 + }, + { + "epoch": 1.63, + "grad_norm": 0.2726630681359227, + "learning_rate": 1.7701267777735975e-05, + "loss": 1.0774, + "step": 17085 + }, + { + "epoch": 1.63, + "grad_norm": 0.36569918814750363, + "learning_rate": 1.7692282391811577e-05, + "loss": 0.9858, + "step": 17086 + }, + { + "epoch": 1.63, + "grad_norm": 0.2908742058454405, + "learning_rate": 1.7683299065689575e-05, + "loss": 0.8995, + "step": 17087 + }, + { + "epoch": 1.63, + "grad_norm": 0.30846042156847836, + "learning_rate": 1.767431779959471e-05, + "loss": 1.0669, + "step": 17088 + }, + { + "epoch": 1.63, + "grad_norm": 0.3202311480212603, + "learning_rate": 1.7665338593751813e-05, + "loss": 1.0614, + "step": 17089 + }, + { + "epoch": 1.64, + "grad_norm": 0.34181964286003513, + "learning_rate": 1.7656361448385526e-05, + "loss": 0.9748, + "step": 17090 + }, + { + "epoch": 1.64, + "grad_norm": 0.3298340101922306, + "learning_rate": 1.7647386363720587e-05, + "loss": 1.0523, + "step": 17091 + }, + { + "epoch": 1.64, + "grad_norm": 0.31925636221094283, + "learning_rate": 1.7638413339981497e-05, + "loss": 0.9695, + "step": 17092 + }, + { + "epoch": 1.64, + "grad_norm": 0.3097957273436585, + "learning_rate": 1.7629442377392934e-05, + "loss": 0.9444, + "step": 17093 + }, + { + "epoch": 1.64, + "grad_norm": 0.29125059668984726, + "learning_rate": 1.7620473476179322e-05, + "loss": 1.0493, + "step": 17094 + }, + { + "epoch": 1.64, + "grad_norm": 0.3580648048404283, + "learning_rate": 1.7611506636565157e-05, + "loss": 0.9369, + "step": 17095 + }, + { + "epoch": 1.64, + "grad_norm": 0.3435239605586353, + "learning_rate": 1.760254185877478e-05, + "loss": 0.9064, + "step": 17096 + }, + { + "epoch": 1.64, + "grad_norm": 0.30134060653177613, + "learning_rate": 1.7593579143032612e-05, + "loss": 1.0626, + "step": 17097 + }, + { + "epoch": 1.64, + "grad_norm": 0.3117460802751422, + "learning_rate": 1.75846184895629e-05, + "loss": 0.9653, + "step": 17098 + }, + { + "epoch": 1.64, + "grad_norm": 0.3078265384088596, + "learning_rate": 1.757565989858989e-05, + "loss": 1.2103, + "step": 17099 + }, + { + "epoch": 1.64, + "grad_norm": 0.32312099908926795, + "learning_rate": 1.75667033703378e-05, + "loss": 1.0685, + "step": 17100 + }, + { + "epoch": 1.64, + "grad_norm": 0.315611416563454, + "learning_rate": 1.7557748905030802e-05, + "loss": 1.0668, + "step": 17101 + }, + { + "epoch": 1.64, + "grad_norm": 0.26287052311809916, + "learning_rate": 1.7548796502892906e-05, + "loss": 0.9923, + "step": 17102 + }, + { + "epoch": 1.64, + "grad_norm": 0.28755870620404556, + "learning_rate": 1.753984616414821e-05, + "loss": 0.8947, + "step": 17103 + }, + { + "epoch": 1.64, + "grad_norm": 0.3163429398621177, + "learning_rate": 1.7530897889020713e-05, + "loss": 0.9907, + "step": 17104 + }, + { + "epoch": 1.64, + "grad_norm": 0.3125232803653689, + "learning_rate": 1.7521951677734295e-05, + "loss": 0.938, + "step": 17105 + }, + { + "epoch": 1.64, + "grad_norm": 0.32585848941406326, + "learning_rate": 1.751300753051287e-05, + "loss": 1.005, + "step": 17106 + }, + { + "epoch": 1.64, + "grad_norm": 0.2921658466301723, + "learning_rate": 1.7504065447580275e-05, + "loss": 0.9554, + "step": 17107 + }, + { + "epoch": 1.64, + "grad_norm": 0.3469842419616409, + "learning_rate": 1.7495125429160318e-05, + "loss": 1.0102, + "step": 17108 + }, + { + "epoch": 1.64, + "grad_norm": 0.2919530215682396, + "learning_rate": 1.7486187475476666e-05, + "loss": 1.0019, + "step": 17109 + }, + { + "epoch": 1.64, + "grad_norm": 0.2958253081378012, + "learning_rate": 1.7477251586753064e-05, + "loss": 1.1708, + "step": 17110 + }, + { + "epoch": 1.64, + "grad_norm": 0.28850889468174223, + "learning_rate": 1.746831776321308e-05, + "loss": 1.0407, + "step": 17111 + }, + { + "epoch": 1.64, + "grad_norm": 0.3063611132273975, + "learning_rate": 1.7459386005080313e-05, + "loss": 0.9982, + "step": 17112 + }, + { + "epoch": 1.64, + "grad_norm": 0.2979481650873024, + "learning_rate": 1.745045631257828e-05, + "loss": 1.0464, + "step": 17113 + }, + { + "epoch": 1.64, + "grad_norm": 0.30896708986037547, + "learning_rate": 1.7441528685930485e-05, + "loss": 1.0322, + "step": 17114 + }, + { + "epoch": 1.64, + "grad_norm": 0.34753049252692836, + "learning_rate": 1.74326031253603e-05, + "loss": 1.0152, + "step": 17115 + }, + { + "epoch": 1.64, + "grad_norm": 0.3577974396678184, + "learning_rate": 1.742367963109114e-05, + "loss": 0.9737, + "step": 17116 + }, + { + "epoch": 1.64, + "grad_norm": 0.29027954504263015, + "learning_rate": 1.741475820334626e-05, + "loss": 0.9705, + "step": 17117 + }, + { + "epoch": 1.64, + "grad_norm": 0.3792090199138504, + "learning_rate": 1.7405838842348986e-05, + "loss": 1.0136, + "step": 17118 + }, + { + "epoch": 1.64, + "grad_norm": 0.322411112255275, + "learning_rate": 1.7396921548322452e-05, + "loss": 1.0862, + "step": 17119 + }, + { + "epoch": 1.64, + "grad_norm": 0.2839737302378354, + "learning_rate": 1.7388006321489935e-05, + "loss": 0.9441, + "step": 17120 + }, + { + "epoch": 1.64, + "grad_norm": 0.3555911921798039, + "learning_rate": 1.7379093162074444e-05, + "loss": 1.0481, + "step": 17121 + }, + { + "epoch": 1.64, + "grad_norm": 0.3462172610773083, + "learning_rate": 1.737018207029909e-05, + "loss": 1.0195, + "step": 17122 + }, + { + "epoch": 1.64, + "grad_norm": 0.2758484031265217, + "learning_rate": 1.736127304638685e-05, + "loss": 0.9905, + "step": 17123 + }, + { + "epoch": 1.64, + "grad_norm": 0.3182763761346745, + "learning_rate": 1.735236609056068e-05, + "loss": 0.9429, + "step": 17124 + }, + { + "epoch": 1.64, + "grad_norm": 0.3064891198003043, + "learning_rate": 1.734346120304351e-05, + "loss": 0.9555, + "step": 17125 + }, + { + "epoch": 1.64, + "grad_norm": 0.31175676384284273, + "learning_rate": 1.7334558384058152e-05, + "loss": 1.0341, + "step": 17126 + }, + { + "epoch": 1.64, + "grad_norm": 0.3177668052718184, + "learning_rate": 1.732565763382743e-05, + "loss": 0.9014, + "step": 17127 + }, + { + "epoch": 1.64, + "grad_norm": 0.32317733850483826, + "learning_rate": 1.7316758952574074e-05, + "loss": 0.8719, + "step": 17128 + }, + { + "epoch": 1.64, + "grad_norm": 0.354654595202803, + "learning_rate": 1.730786234052082e-05, + "loss": 1.0077, + "step": 17129 + }, + { + "epoch": 1.64, + "grad_norm": 0.3417472679918441, + "learning_rate": 1.7298967797890265e-05, + "loss": 1.1059, + "step": 17130 + }, + { + "epoch": 1.64, + "grad_norm": 0.3323054413724692, + "learning_rate": 1.729007532490503e-05, + "loss": 0.9834, + "step": 17131 + }, + { + "epoch": 1.64, + "grad_norm": 0.32868357482545757, + "learning_rate": 1.728118492178762e-05, + "loss": 1.0043, + "step": 17132 + }, + { + "epoch": 1.64, + "grad_norm": 0.3127387552332104, + "learning_rate": 1.727229658876054e-05, + "loss": 0.9241, + "step": 17133 + }, + { + "epoch": 1.64, + "grad_norm": 0.28324170447021674, + "learning_rate": 1.7263410326046236e-05, + "loss": 1.1183, + "step": 17134 + }, + { + "epoch": 1.64, + "grad_norm": 0.28536076450136394, + "learning_rate": 1.72545261338671e-05, + "loss": 0.9925, + "step": 17135 + }, + { + "epoch": 1.64, + "grad_norm": 0.3098147440585222, + "learning_rate": 1.7245644012445438e-05, + "loss": 0.9884, + "step": 17136 + }, + { + "epoch": 1.64, + "grad_norm": 0.3043377868417015, + "learning_rate": 1.7236763962003565e-05, + "loss": 1.0186, + "step": 17137 + }, + { + "epoch": 1.64, + "grad_norm": 0.3202842556024104, + "learning_rate": 1.722788598276367e-05, + "loss": 1.0257, + "step": 17138 + }, + { + "epoch": 1.64, + "grad_norm": 0.31237919825478694, + "learning_rate": 1.7219010074947938e-05, + "loss": 1.023, + "step": 17139 + }, + { + "epoch": 1.64, + "grad_norm": 0.33801338579988877, + "learning_rate": 1.7210136238778508e-05, + "loss": 0.9751, + "step": 17140 + }, + { + "epoch": 1.64, + "grad_norm": 0.29592395356335677, + "learning_rate": 1.7201264474477475e-05, + "loss": 0.9101, + "step": 17141 + }, + { + "epoch": 1.64, + "grad_norm": 0.32035173017116264, + "learning_rate": 1.7192394782266818e-05, + "loss": 1.0213, + "step": 17142 + }, + { + "epoch": 1.64, + "grad_norm": 0.27638086261333283, + "learning_rate": 1.718352716236854e-05, + "loss": 1.0856, + "step": 17143 + }, + { + "epoch": 1.64, + "grad_norm": 0.3275220583668711, + "learning_rate": 1.7174661615004518e-05, + "loss": 1.0473, + "step": 17144 + }, + { + "epoch": 1.64, + "grad_norm": 0.30659790737435894, + "learning_rate": 1.7165798140396648e-05, + "loss": 0.9674, + "step": 17145 + }, + { + "epoch": 1.64, + "grad_norm": 0.3140783964865994, + "learning_rate": 1.7156936738766738e-05, + "loss": 1.1384, + "step": 17146 + }, + { + "epoch": 1.64, + "grad_norm": 0.33605872381955254, + "learning_rate": 1.7148077410336573e-05, + "loss": 0.9408, + "step": 17147 + }, + { + "epoch": 1.64, + "grad_norm": 0.33487660233024313, + "learning_rate": 1.713922015532782e-05, + "loss": 1.0887, + "step": 17148 + }, + { + "epoch": 1.64, + "grad_norm": 0.30725507998887125, + "learning_rate": 1.7130364973962155e-05, + "loss": 1.1118, + "step": 17149 + }, + { + "epoch": 1.64, + "grad_norm": 0.29380689197228543, + "learning_rate": 1.7121511866461215e-05, + "loss": 1.1267, + "step": 17150 + }, + { + "epoch": 1.64, + "grad_norm": 0.3074766729407517, + "learning_rate": 1.7112660833046512e-05, + "loss": 1.0607, + "step": 17151 + }, + { + "epoch": 1.64, + "grad_norm": 0.32389494972957356, + "learning_rate": 1.710381187393958e-05, + "loss": 1.0254, + "step": 17152 + }, + { + "epoch": 1.64, + "grad_norm": 0.3316097220768074, + "learning_rate": 1.7094964989361807e-05, + "loss": 0.9607, + "step": 17153 + }, + { + "epoch": 1.64, + "grad_norm": 0.3056084657380874, + "learning_rate": 1.708612017953468e-05, + "loss": 0.8583, + "step": 17154 + }, + { + "epoch": 1.64, + "grad_norm": 0.30673676489597385, + "learning_rate": 1.7077277444679484e-05, + "loss": 0.9665, + "step": 17155 + }, + { + "epoch": 1.64, + "grad_norm": 0.3289552843366403, + "learning_rate": 1.706843678501756e-05, + "loss": 1.1119, + "step": 17156 + }, + { + "epoch": 1.64, + "grad_norm": 0.3396265742669103, + "learning_rate": 1.7059598200770112e-05, + "loss": 1.057, + "step": 17157 + }, + { + "epoch": 1.64, + "grad_norm": 0.2963750469519453, + "learning_rate": 1.7050761692158356e-05, + "loss": 0.9579, + "step": 17158 + }, + { + "epoch": 1.64, + "grad_norm": 0.2671168750520538, + "learning_rate": 1.70419272594034e-05, + "loss": 1.0432, + "step": 17159 + }, + { + "epoch": 1.64, + "grad_norm": 0.3355002726148698, + "learning_rate": 1.7033094902726354e-05, + "loss": 1.0269, + "step": 17160 + }, + { + "epoch": 1.64, + "grad_norm": 0.333328470047879, + "learning_rate": 1.7024264622348253e-05, + "loss": 1.1065, + "step": 17161 + }, + { + "epoch": 1.64, + "grad_norm": 0.3071409434061426, + "learning_rate": 1.7015436418490094e-05, + "loss": 1.0881, + "step": 17162 + }, + { + "epoch": 1.64, + "grad_norm": 0.3059180751790747, + "learning_rate": 1.7006610291372773e-05, + "loss": 0.9479, + "step": 17163 + }, + { + "epoch": 1.64, + "grad_norm": 0.37424820444668605, + "learning_rate": 1.6997786241217216e-05, + "loss": 1.0074, + "step": 17164 + }, + { + "epoch": 1.64, + "grad_norm": 0.3288645813408185, + "learning_rate": 1.6988964268244202e-05, + "loss": 1.0667, + "step": 17165 + }, + { + "epoch": 1.64, + "grad_norm": 0.32869472488986873, + "learning_rate": 1.698014437267451e-05, + "loss": 1.0254, + "step": 17166 + }, + { + "epoch": 1.64, + "grad_norm": 0.33932559851314825, + "learning_rate": 1.6971326554728893e-05, + "loss": 0.9896, + "step": 17167 + }, + { + "epoch": 1.64, + "grad_norm": 0.30288348152120825, + "learning_rate": 1.6962510814628018e-05, + "loss": 1.0515, + "step": 17168 + }, + { + "epoch": 1.64, + "grad_norm": 0.3158463250029187, + "learning_rate": 1.6953697152592506e-05, + "loss": 0.9691, + "step": 17169 + }, + { + "epoch": 1.64, + "grad_norm": 0.306858827940392, + "learning_rate": 1.6944885568842906e-05, + "loss": 1.0389, + "step": 17170 + }, + { + "epoch": 1.64, + "grad_norm": 0.3165541579696138, + "learning_rate": 1.693607606359976e-05, + "loss": 1.1087, + "step": 17171 + }, + { + "epoch": 1.64, + "grad_norm": 0.32352731591720885, + "learning_rate": 1.6927268637083484e-05, + "loss": 0.9573, + "step": 17172 + }, + { + "epoch": 1.64, + "grad_norm": 0.3105070230825433, + "learning_rate": 1.691846328951453e-05, + "loss": 1.0566, + "step": 17173 + }, + { + "epoch": 1.64, + "grad_norm": 0.2969366946947264, + "learning_rate": 1.6909660021113237e-05, + "loss": 1.0913, + "step": 17174 + }, + { + "epoch": 1.64, + "grad_norm": 0.3268432219529488, + "learning_rate": 1.6900858832099954e-05, + "loss": 1.0238, + "step": 17175 + }, + { + "epoch": 1.64, + "grad_norm": 0.33844920485343716, + "learning_rate": 1.6892059722694875e-05, + "loss": 1.0801, + "step": 17176 + }, + { + "epoch": 1.64, + "grad_norm": 0.28621941594487427, + "learning_rate": 1.6883262693118263e-05, + "loss": 1.0306, + "step": 17177 + }, + { + "epoch": 1.64, + "grad_norm": 0.307562683805903, + "learning_rate": 1.6874467743590216e-05, + "loss": 0.8621, + "step": 17178 + }, + { + "epoch": 1.64, + "grad_norm": 0.31272594012043864, + "learning_rate": 1.6865674874330873e-05, + "loss": 1.106, + "step": 17179 + }, + { + "epoch": 1.64, + "grad_norm": 0.3520520747785393, + "learning_rate": 1.6856884085560222e-05, + "loss": 1.0482, + "step": 17180 + }, + { + "epoch": 1.64, + "grad_norm": 0.2836902110225293, + "learning_rate": 1.684809537749834e-05, + "loss": 1.0492, + "step": 17181 + }, + { + "epoch": 1.64, + "grad_norm": 0.321168412668058, + "learning_rate": 1.6839308750365114e-05, + "loss": 0.9851, + "step": 17182 + }, + { + "epoch": 1.64, + "grad_norm": 0.2959074008185144, + "learning_rate": 1.683052420438048e-05, + "loss": 0.9029, + "step": 17183 + }, + { + "epoch": 1.64, + "grad_norm": 0.31055358514896075, + "learning_rate": 1.6821741739764218e-05, + "loss": 1.0806, + "step": 17184 + }, + { + "epoch": 1.64, + "grad_norm": 0.3575720299531552, + "learning_rate": 1.6812961356736135e-05, + "loss": 1.0532, + "step": 17185 + }, + { + "epoch": 1.64, + "grad_norm": 0.30655790666611515, + "learning_rate": 1.680418305551601e-05, + "loss": 1.0592, + "step": 17186 + }, + { + "epoch": 1.64, + "grad_norm": 0.30064821300986067, + "learning_rate": 1.6795406836323467e-05, + "loss": 1.0576, + "step": 17187 + }, + { + "epoch": 1.64, + "grad_norm": 0.3541933354962236, + "learning_rate": 1.6786632699378157e-05, + "loss": 1.0078, + "step": 17188 + }, + { + "epoch": 1.64, + "grad_norm": 0.3321053419730896, + "learning_rate": 1.6777860644899656e-05, + "loss": 0.9564, + "step": 17189 + }, + { + "epoch": 1.64, + "grad_norm": 0.3012747817505155, + "learning_rate": 1.6769090673107536e-05, + "loss": 1.0204, + "step": 17190 + }, + { + "epoch": 1.64, + "grad_norm": 0.29013280302770134, + "learning_rate": 1.6760322784221195e-05, + "loss": 1.0325, + "step": 17191 + }, + { + "epoch": 1.64, + "grad_norm": 0.31475463739601023, + "learning_rate": 1.6751556978460114e-05, + "loss": 0.9613, + "step": 17192 + }, + { + "epoch": 1.64, + "grad_norm": 0.28685184204131436, + "learning_rate": 1.674279325604361e-05, + "loss": 1.0231, + "step": 17193 + }, + { + "epoch": 1.64, + "grad_norm": 0.33863476701578765, + "learning_rate": 1.6734031617191036e-05, + "loss": 1.1269, + "step": 17194 + }, + { + "epoch": 1.65, + "grad_norm": 0.30779878323353127, + "learning_rate": 1.6725272062121655e-05, + "loss": 0.9178, + "step": 17195 + }, + { + "epoch": 1.65, + "grad_norm": 0.32628550506495635, + "learning_rate": 1.6716514591054698e-05, + "loss": 1.0442, + "step": 17196 + }, + { + "epoch": 1.65, + "grad_norm": 0.32735427285402846, + "learning_rate": 1.6707759204209283e-05, + "loss": 1.0168, + "step": 17197 + }, + { + "epoch": 1.65, + "grad_norm": 0.3474066844181251, + "learning_rate": 1.6699005901804565e-05, + "loss": 1.0577, + "step": 17198 + }, + { + "epoch": 1.65, + "grad_norm": 0.3414777634124281, + "learning_rate": 1.669025468405955e-05, + "loss": 1.067, + "step": 17199 + }, + { + "epoch": 1.65, + "grad_norm": 0.3238034109643111, + "learning_rate": 1.6681505551193266e-05, + "loss": 1.1315, + "step": 17200 + }, + { + "epoch": 1.65, + "grad_norm": 0.2861852963493291, + "learning_rate": 1.667275850342468e-05, + "loss": 1.0893, + "step": 17201 + }, + { + "epoch": 1.65, + "grad_norm": 0.3499358132702966, + "learning_rate": 1.66640135409727e-05, + "loss": 0.9671, + "step": 17202 + }, + { + "epoch": 1.65, + "grad_norm": 0.32131408647563836, + "learning_rate": 1.6655270664056132e-05, + "loss": 1.01, + "step": 17203 + }, + { + "epoch": 1.65, + "grad_norm": 0.30743693452243737, + "learning_rate": 1.6646529872893812e-05, + "loss": 1.0638, + "step": 17204 + }, + { + "epoch": 1.65, + "grad_norm": 0.34242879347929117, + "learning_rate": 1.6637791167704454e-05, + "loss": 0.9986, + "step": 17205 + }, + { + "epoch": 1.65, + "grad_norm": 0.3105142587490805, + "learning_rate": 1.6629054548706758e-05, + "loss": 1.0679, + "step": 17206 + }, + { + "epoch": 1.65, + "grad_norm": 0.34069269731583585, + "learning_rate": 1.662032001611937e-05, + "loss": 1.0618, + "step": 17207 + }, + { + "epoch": 1.65, + "grad_norm": 0.36068263420600305, + "learning_rate": 1.6611587570160902e-05, + "loss": 1.0294, + "step": 17208 + }, + { + "epoch": 1.65, + "grad_norm": 0.31437999470154876, + "learning_rate": 1.660285721104984e-05, + "loss": 1.1279, + "step": 17209 + }, + { + "epoch": 1.65, + "grad_norm": 0.3099637136409322, + "learning_rate": 1.6594128939004683e-05, + "loss": 1.0424, + "step": 17210 + }, + { + "epoch": 1.65, + "grad_norm": 0.3065704904091915, + "learning_rate": 1.6585402754243907e-05, + "loss": 0.9902, + "step": 17211 + }, + { + "epoch": 1.65, + "grad_norm": 0.28887425089095603, + "learning_rate": 1.657667865698581e-05, + "loss": 1.0265, + "step": 17212 + }, + { + "epoch": 1.65, + "grad_norm": 0.30576259994514304, + "learning_rate": 1.656795664744879e-05, + "loss": 1.1169, + "step": 17213 + }, + { + "epoch": 1.65, + "grad_norm": 0.32014499809566743, + "learning_rate": 1.6559236725851034e-05, + "loss": 1.0516, + "step": 17214 + }, + { + "epoch": 1.65, + "grad_norm": 0.32241166332187593, + "learning_rate": 1.6550518892410884e-05, + "loss": 1.0189, + "step": 17215 + }, + { + "epoch": 1.65, + "grad_norm": 0.31193887113736196, + "learning_rate": 1.6541803147346412e-05, + "loss": 0.9391, + "step": 17216 + }, + { + "epoch": 1.65, + "grad_norm": 0.3211896102291243, + "learning_rate": 1.6533089490875785e-05, + "loss": 1.0498, + "step": 17217 + }, + { + "epoch": 1.65, + "grad_norm": 0.34052225551437576, + "learning_rate": 1.652437792321704e-05, + "loss": 1.0106, + "step": 17218 + }, + { + "epoch": 1.65, + "grad_norm": 0.3137476603540321, + "learning_rate": 1.6515668444588218e-05, + "loss": 0.9756, + "step": 17219 + }, + { + "epoch": 1.65, + "grad_norm": 0.3300213342141381, + "learning_rate": 1.6506961055207225e-05, + "loss": 1.0833, + "step": 17220 + }, + { + "epoch": 1.65, + "grad_norm": 0.33350370606732727, + "learning_rate": 1.6498255755292026e-05, + "loss": 0.9992, + "step": 17221 + }, + { + "epoch": 1.65, + "grad_norm": 0.30624786673008697, + "learning_rate": 1.6489552545060437e-05, + "loss": 1.0501, + "step": 17222 + }, + { + "epoch": 1.65, + "grad_norm": 0.3022111106427225, + "learning_rate": 1.6480851424730315e-05, + "loss": 1.0239, + "step": 17223 + }, + { + "epoch": 1.65, + "grad_norm": 0.3260053214404138, + "learning_rate": 1.6472152394519348e-05, + "loss": 0.9429, + "step": 17224 + }, + { + "epoch": 1.65, + "grad_norm": 0.3046501664531063, + "learning_rate": 1.6463455454645293e-05, + "loss": 0.987, + "step": 17225 + }, + { + "epoch": 1.65, + "grad_norm": 0.3151791863906537, + "learning_rate": 1.6454760605325737e-05, + "loss": 0.9566, + "step": 17226 + }, + { + "epoch": 1.65, + "grad_norm": 0.3205127767899489, + "learning_rate": 1.64460678467783e-05, + "loss": 1.0379, + "step": 17227 + }, + { + "epoch": 1.65, + "grad_norm": 0.3184204522395816, + "learning_rate": 1.6437377179220527e-05, + "loss": 1.0026, + "step": 17228 + }, + { + "epoch": 1.65, + "grad_norm": 0.3201540984372184, + "learning_rate": 1.6428688602869935e-05, + "loss": 1.0387, + "step": 17229 + }, + { + "epoch": 1.65, + "grad_norm": 0.28704363397709237, + "learning_rate": 1.642000211794391e-05, + "loss": 1.1268, + "step": 17230 + }, + { + "epoch": 1.65, + "grad_norm": 0.2839312066430913, + "learning_rate": 1.6411317724659858e-05, + "loss": 1.0781, + "step": 17231 + }, + { + "epoch": 1.65, + "grad_norm": 0.33483655057147244, + "learning_rate": 1.6402635423235137e-05, + "loss": 1.0739, + "step": 17232 + }, + { + "epoch": 1.65, + "grad_norm": 0.3392080989861084, + "learning_rate": 1.639395521388698e-05, + "loss": 1.114, + "step": 17233 + }, + { + "epoch": 1.65, + "grad_norm": 0.3166858562878593, + "learning_rate": 1.638527709683264e-05, + "loss": 0.9299, + "step": 17234 + }, + { + "epoch": 1.65, + "grad_norm": 0.29425119201388505, + "learning_rate": 1.637660107228929e-05, + "loss": 1.0243, + "step": 17235 + }, + { + "epoch": 1.65, + "grad_norm": 0.2965965837119095, + "learning_rate": 1.6367927140474072e-05, + "loss": 1.0049, + "step": 17236 + }, + { + "epoch": 1.65, + "grad_norm": 0.28405996936079636, + "learning_rate": 1.6359255301604016e-05, + "loss": 0.9917, + "step": 17237 + }, + { + "epoch": 1.65, + "grad_norm": 0.31007257766086005, + "learning_rate": 1.635058555589619e-05, + "loss": 1.0846, + "step": 17238 + }, + { + "epoch": 1.65, + "grad_norm": 0.34324340532760755, + "learning_rate": 1.6341917903567504e-05, + "loss": 1.0687, + "step": 17239 + }, + { + "epoch": 1.65, + "grad_norm": 0.30736600259712377, + "learning_rate": 1.6333252344834936e-05, + "loss": 0.9914, + "step": 17240 + }, + { + "epoch": 1.65, + "grad_norm": 0.294730292577579, + "learning_rate": 1.6324588879915247e-05, + "loss": 1.0861, + "step": 17241 + }, + { + "epoch": 1.65, + "grad_norm": 0.4134176209454361, + "learning_rate": 1.6315927509025362e-05, + "loss": 1.1723, + "step": 17242 + }, + { + "epoch": 1.65, + "grad_norm": 0.3263582816635457, + "learning_rate": 1.6307268232381966e-05, + "loss": 0.9473, + "step": 17243 + }, + { + "epoch": 1.65, + "grad_norm": 0.2634632820682845, + "learning_rate": 1.6298611050201807e-05, + "loss": 0.9501, + "step": 17244 + }, + { + "epoch": 1.65, + "grad_norm": 0.2851004682360689, + "learning_rate": 1.6289955962701475e-05, + "loss": 0.9829, + "step": 17245 + }, + { + "epoch": 1.65, + "grad_norm": 0.32139184035879675, + "learning_rate": 1.6281302970097634e-05, + "loss": 1.0235, + "step": 17246 + }, + { + "epoch": 1.65, + "grad_norm": 0.24980629598272655, + "learning_rate": 1.6272652072606786e-05, + "loss": 1.0591, + "step": 17247 + }, + { + "epoch": 1.65, + "grad_norm": 0.3301286722580597, + "learning_rate": 1.6264003270445427e-05, + "loss": 1.0123, + "step": 17248 + }, + { + "epoch": 1.65, + "grad_norm": 0.3184539760962757, + "learning_rate": 1.6255356563830016e-05, + "loss": 1.133, + "step": 17249 + }, + { + "epoch": 1.65, + "grad_norm": 0.31275348665732255, + "learning_rate": 1.6246711952976967e-05, + "loss": 0.9854, + "step": 17250 + }, + { + "epoch": 1.65, + "grad_norm": 0.31569964902713094, + "learning_rate": 1.6238069438102565e-05, + "loss": 0.9942, + "step": 17251 + }, + { + "epoch": 1.65, + "grad_norm": 0.30776545519951465, + "learning_rate": 1.6229429019423116e-05, + "loss": 1.0811, + "step": 17252 + }, + { + "epoch": 1.65, + "grad_norm": 0.28576794618861034, + "learning_rate": 1.6220790697154873e-05, + "loss": 0.9703, + "step": 17253 + }, + { + "epoch": 1.65, + "grad_norm": 0.2789586746961189, + "learning_rate": 1.6212154471513973e-05, + "loss": 1.0329, + "step": 17254 + }, + { + "epoch": 1.65, + "grad_norm": 0.2854400828957201, + "learning_rate": 1.620352034271657e-05, + "loss": 1.1623, + "step": 17255 + }, + { + "epoch": 1.65, + "grad_norm": 0.3498801447905086, + "learning_rate": 1.619488831097874e-05, + "loss": 1.0688, + "step": 17256 + }, + { + "epoch": 1.65, + "grad_norm": 0.32884089984334985, + "learning_rate": 1.6186258376516516e-05, + "loss": 0.9875, + "step": 17257 + }, + { + "epoch": 1.65, + "grad_norm": 0.31799155250582506, + "learning_rate": 1.617763053954583e-05, + "loss": 1.0355, + "step": 17258 + }, + { + "epoch": 1.65, + "grad_norm": 0.27073160598136453, + "learning_rate": 1.616900480028264e-05, + "loss": 1.086, + "step": 17259 + }, + { + "epoch": 1.65, + "grad_norm": 0.3251834661164695, + "learning_rate": 1.616038115894276e-05, + "loss": 0.9057, + "step": 17260 + }, + { + "epoch": 1.65, + "grad_norm": 0.3162095246465272, + "learning_rate": 1.6151759615742047e-05, + "loss": 0.9181, + "step": 17261 + }, + { + "epoch": 1.65, + "grad_norm": 0.35683228654563953, + "learning_rate": 1.6143140170896243e-05, + "loss": 1.0371, + "step": 17262 + }, + { + "epoch": 1.65, + "grad_norm": 0.3164407116971294, + "learning_rate": 1.613452282462108e-05, + "loss": 1.1204, + "step": 17263 + }, + { + "epoch": 1.65, + "grad_norm": 0.3092905339106458, + "learning_rate": 1.612590757713216e-05, + "loss": 1.0132, + "step": 17264 + }, + { + "epoch": 1.65, + "grad_norm": 0.32264715397528815, + "learning_rate": 1.6117294428645146e-05, + "loss": 1.1002, + "step": 17265 + }, + { + "epoch": 1.65, + "grad_norm": 0.2996140765456563, + "learning_rate": 1.6108683379375544e-05, + "loss": 0.977, + "step": 17266 + }, + { + "epoch": 1.65, + "grad_norm": 0.33246402075115883, + "learning_rate": 1.6100074429538848e-05, + "loss": 1.0374, + "step": 17267 + }, + { + "epoch": 1.65, + "grad_norm": 0.3225496304413993, + "learning_rate": 1.609146757935053e-05, + "loss": 1.0538, + "step": 17268 + }, + { + "epoch": 1.65, + "grad_norm": 0.3142637501131633, + "learning_rate": 1.608286282902599e-05, + "loss": 0.8721, + "step": 17269 + }, + { + "epoch": 1.65, + "grad_norm": 0.28056308632559507, + "learning_rate": 1.607426017878052e-05, + "loss": 1.1231, + "step": 17270 + }, + { + "epoch": 1.65, + "grad_norm": 0.32038248370011685, + "learning_rate": 1.6065659628829432e-05, + "loss": 0.9436, + "step": 17271 + }, + { + "epoch": 1.65, + "grad_norm": 0.35381904809238934, + "learning_rate": 1.6057061179387988e-05, + "loss": 0.9883, + "step": 17272 + }, + { + "epoch": 1.65, + "grad_norm": 0.34916603383324984, + "learning_rate": 1.604846483067133e-05, + "loss": 0.9692, + "step": 17273 + }, + { + "epoch": 1.65, + "grad_norm": 0.30844672998995204, + "learning_rate": 1.6039870582894623e-05, + "loss": 1.0315, + "step": 17274 + }, + { + "epoch": 1.65, + "grad_norm": 0.3572654718892533, + "learning_rate": 1.6031278436272868e-05, + "loss": 1.0027, + "step": 17275 + }, + { + "epoch": 1.65, + "grad_norm": 0.32402484234859735, + "learning_rate": 1.602268839102119e-05, + "loss": 0.9875, + "step": 17276 + }, + { + "epoch": 1.65, + "grad_norm": 0.32718774443227616, + "learning_rate": 1.6014100447354484e-05, + "loss": 0.885, + "step": 17277 + }, + { + "epoch": 1.65, + "grad_norm": 0.3361518153751234, + "learning_rate": 1.6005514605487736e-05, + "loss": 0.9384, + "step": 17278 + }, + { + "epoch": 1.65, + "grad_norm": 0.32953632243435127, + "learning_rate": 1.599693086563574e-05, + "loss": 1.1162, + "step": 17279 + }, + { + "epoch": 1.65, + "grad_norm": 0.3468481907247419, + "learning_rate": 1.5988349228013366e-05, + "loss": 1.0739, + "step": 17280 + }, + { + "epoch": 1.65, + "grad_norm": 0.31799868822822747, + "learning_rate": 1.5979769692835333e-05, + "loss": 1.0344, + "step": 17281 + }, + { + "epoch": 1.65, + "grad_norm": 0.39181331718879203, + "learning_rate": 1.597119226031637e-05, + "loss": 1.1456, + "step": 17282 + }, + { + "epoch": 1.65, + "grad_norm": 0.30507781412087875, + "learning_rate": 1.5962616930671128e-05, + "loss": 1.0254, + "step": 17283 + }, + { + "epoch": 1.65, + "grad_norm": 0.3157704763292254, + "learning_rate": 1.595404370411423e-05, + "loss": 1.047, + "step": 17284 + }, + { + "epoch": 1.65, + "grad_norm": 0.2904034954657744, + "learning_rate": 1.5945472580860187e-05, + "loss": 1.0093, + "step": 17285 + }, + { + "epoch": 1.65, + "grad_norm": 0.3005626460136862, + "learning_rate": 1.593690356112355e-05, + "loss": 1.0089, + "step": 17286 + }, + { + "epoch": 1.65, + "grad_norm": 0.3124258766936041, + "learning_rate": 1.5928336645118703e-05, + "loss": 1.0294, + "step": 17287 + }, + { + "epoch": 1.65, + "grad_norm": 0.31631092195729055, + "learning_rate": 1.5919771833060072e-05, + "loss": 0.9615, + "step": 17288 + }, + { + "epoch": 1.65, + "grad_norm": 0.3276233108995428, + "learning_rate": 1.5911209125161995e-05, + "loss": 1.0715, + "step": 17289 + }, + { + "epoch": 1.65, + "grad_norm": 0.35424571792928494, + "learning_rate": 1.5902648521638773e-05, + "loss": 0.9362, + "step": 17290 + }, + { + "epoch": 1.65, + "grad_norm": 0.36436164955227107, + "learning_rate": 1.589409002270461e-05, + "loss": 1.0355, + "step": 17291 + }, + { + "epoch": 1.65, + "grad_norm": 0.35330151205095534, + "learning_rate": 1.58855336285737e-05, + "loss": 1.0781, + "step": 17292 + }, + { + "epoch": 1.65, + "grad_norm": 0.3249267058522883, + "learning_rate": 1.5876979339460207e-05, + "loss": 1.1326, + "step": 17293 + }, + { + "epoch": 1.65, + "grad_norm": 0.30967522170859096, + "learning_rate": 1.5868427155578135e-05, + "loss": 1.0664, + "step": 17294 + }, + { + "epoch": 1.65, + "grad_norm": 0.30673991762088015, + "learning_rate": 1.585987707714156e-05, + "loss": 1.0754, + "step": 17295 + }, + { + "epoch": 1.65, + "grad_norm": 0.32337111374851957, + "learning_rate": 1.585132910436443e-05, + "loss": 1.1162, + "step": 17296 + }, + { + "epoch": 1.65, + "grad_norm": 0.32574522688663515, + "learning_rate": 1.5842783237460713e-05, + "loss": 1.03, + "step": 17297 + }, + { + "epoch": 1.65, + "grad_norm": 0.3287756225736176, + "learning_rate": 1.5834239476644208e-05, + "loss": 1.0214, + "step": 17298 + }, + { + "epoch": 1.66, + "grad_norm": 0.33474402829751443, + "learning_rate": 1.582569782212877e-05, + "loss": 0.991, + "step": 17299 + }, + { + "epoch": 1.66, + "grad_norm": 0.3225454039618413, + "learning_rate": 1.581715827412813e-05, + "loss": 1.0607, + "step": 17300 + }, + { + "epoch": 1.66, + "grad_norm": 0.3083464503810653, + "learning_rate": 1.5808620832856036e-05, + "loss": 0.8593, + "step": 17301 + }, + { + "epoch": 1.66, + "grad_norm": 0.3105927089570744, + "learning_rate": 1.5800085498526073e-05, + "loss": 1.0554, + "step": 17302 + }, + { + "epoch": 1.66, + "grad_norm": 0.3692881283648538, + "learning_rate": 1.579155227135193e-05, + "loss": 1.0166, + "step": 17303 + }, + { + "epoch": 1.66, + "grad_norm": 0.3507023988043067, + "learning_rate": 1.5783021151547095e-05, + "loss": 1.1218, + "step": 17304 + }, + { + "epoch": 1.66, + "grad_norm": 0.28018069237680193, + "learning_rate": 1.5774492139325103e-05, + "loss": 1.0967, + "step": 17305 + }, + { + "epoch": 1.66, + "grad_norm": 0.297154201242998, + "learning_rate": 1.576596523489936e-05, + "loss": 1.0686, + "step": 17306 + }, + { + "epoch": 1.66, + "grad_norm": 0.3131509781621094, + "learning_rate": 1.5757440438483294e-05, + "loss": 0.9057, + "step": 17307 + }, + { + "epoch": 1.66, + "grad_norm": 0.3251733894749431, + "learning_rate": 1.5748917750290205e-05, + "loss": 1.0855, + "step": 17308 + }, + { + "epoch": 1.66, + "grad_norm": 0.303322043269195, + "learning_rate": 1.5740397170533395e-05, + "loss": 0.8808, + "step": 17309 + }, + { + "epoch": 1.66, + "grad_norm": 0.3235813031557602, + "learning_rate": 1.5731878699426107e-05, + "loss": 1.0996, + "step": 17310 + }, + { + "epoch": 1.66, + "grad_norm": 0.30015404253607997, + "learning_rate": 1.5723362337181535e-05, + "loss": 1.0674, + "step": 17311 + }, + { + "epoch": 1.66, + "grad_norm": 0.3548729382793615, + "learning_rate": 1.571484808401277e-05, + "loss": 1.0854, + "step": 17312 + }, + { + "epoch": 1.66, + "grad_norm": 0.3400148594362455, + "learning_rate": 1.5706335940132888e-05, + "loss": 1.0904, + "step": 17313 + }, + { + "epoch": 1.66, + "grad_norm": 0.2618198476946843, + "learning_rate": 1.569782590575496e-05, + "loss": 0.8956, + "step": 17314 + }, + { + "epoch": 1.66, + "grad_norm": 0.2770769914786039, + "learning_rate": 1.56893179810919e-05, + "loss": 0.9751, + "step": 17315 + }, + { + "epoch": 1.66, + "grad_norm": 0.3676985480181428, + "learning_rate": 1.568081216635664e-05, + "loss": 1.0018, + "step": 17316 + }, + { + "epoch": 1.66, + "grad_norm": 0.33196236590667183, + "learning_rate": 1.567230846176204e-05, + "loss": 1.0259, + "step": 17317 + }, + { + "epoch": 1.66, + "grad_norm": 0.31498154870302497, + "learning_rate": 1.5663806867520957e-05, + "loss": 1.0091, + "step": 17318 + }, + { + "epoch": 1.66, + "grad_norm": 0.28004518795191913, + "learning_rate": 1.5655307383846083e-05, + "loss": 0.9587, + "step": 17319 + }, + { + "epoch": 1.66, + "grad_norm": 0.3051467994478118, + "learning_rate": 1.5646810010950176e-05, + "loss": 0.9283, + "step": 17320 + }, + { + "epoch": 1.66, + "grad_norm": 0.29831939883228753, + "learning_rate": 1.5638314749045825e-05, + "loss": 1.1306, + "step": 17321 + }, + { + "epoch": 1.66, + "grad_norm": 0.3466599318413504, + "learning_rate": 1.5629821598345685e-05, + "loss": 1.0641, + "step": 17322 + }, + { + "epoch": 1.66, + "grad_norm": 0.27919483849019966, + "learning_rate": 1.5621330559062275e-05, + "loss": 1.0457, + "step": 17323 + }, + { + "epoch": 1.66, + "grad_norm": 0.3344093985795658, + "learning_rate": 1.5612841631408116e-05, + "loss": 1.0629, + "step": 17324 + }, + { + "epoch": 1.66, + "grad_norm": 0.3026850788806729, + "learning_rate": 1.560435481559561e-05, + "loss": 1.1619, + "step": 17325 + }, + { + "epoch": 1.66, + "grad_norm": 0.33769633030286117, + "learning_rate": 1.5595870111837198e-05, + "loss": 0.9734, + "step": 17326 + }, + { + "epoch": 1.66, + "grad_norm": 0.2924038638883622, + "learning_rate": 1.5587387520345154e-05, + "loss": 0.9274, + "step": 17327 + }, + { + "epoch": 1.66, + "grad_norm": 0.27222460766716805, + "learning_rate": 1.5578907041331803e-05, + "loss": 1.0234, + "step": 17328 + }, + { + "epoch": 1.66, + "grad_norm": 0.3485124293214331, + "learning_rate": 1.557042867500932e-05, + "loss": 1.0706, + "step": 17329 + }, + { + "epoch": 1.66, + "grad_norm": 0.3058237110619497, + "learning_rate": 1.5561952421589975e-05, + "loss": 0.9782, + "step": 17330 + }, + { + "epoch": 1.66, + "grad_norm": 0.3863221663217281, + "learning_rate": 1.5553478281285804e-05, + "loss": 0.9551, + "step": 17331 + }, + { + "epoch": 1.66, + "grad_norm": 0.27309543575557155, + "learning_rate": 1.5545006254308948e-05, + "loss": 0.9734, + "step": 17332 + }, + { + "epoch": 1.66, + "grad_norm": 0.278164737480813, + "learning_rate": 1.5536536340871355e-05, + "loss": 0.9896, + "step": 17333 + }, + { + "epoch": 1.66, + "grad_norm": 0.3271594933313493, + "learning_rate": 1.5528068541185027e-05, + "loss": 1.1069, + "step": 17334 + }, + { + "epoch": 1.66, + "grad_norm": 0.35245894913727455, + "learning_rate": 1.551960285546189e-05, + "loss": 1.0106, + "step": 17335 + }, + { + "epoch": 1.66, + "grad_norm": 0.29150515801741766, + "learning_rate": 1.5511139283913766e-05, + "loss": 1.0112, + "step": 17336 + }, + { + "epoch": 1.66, + "grad_norm": 0.28767345621313184, + "learning_rate": 1.5502677826752475e-05, + "loss": 0.8244, + "step": 17337 + }, + { + "epoch": 1.66, + "grad_norm": 0.32042736996448207, + "learning_rate": 1.5494218484189783e-05, + "loss": 0.9589, + "step": 17338 + }, + { + "epoch": 1.66, + "grad_norm": 0.31982107583783387, + "learning_rate": 1.5485761256437402e-05, + "loss": 1.0821, + "step": 17339 + }, + { + "epoch": 1.66, + "grad_norm": 0.3016685247511034, + "learning_rate": 1.5477306143706947e-05, + "loss": 0.9749, + "step": 17340 + }, + { + "epoch": 1.66, + "grad_norm": 0.3012342995084219, + "learning_rate": 1.5468853146210038e-05, + "loss": 1.0435, + "step": 17341 + }, + { + "epoch": 1.66, + "grad_norm": 0.3063419543674852, + "learning_rate": 1.5460402264158193e-05, + "loss": 1.1258, + "step": 17342 + }, + { + "epoch": 1.66, + "grad_norm": 0.35664263240013155, + "learning_rate": 1.5451953497762905e-05, + "loss": 0.9106, + "step": 17343 + }, + { + "epoch": 1.66, + "grad_norm": 0.3124263482243161, + "learning_rate": 1.5443506847235624e-05, + "loss": 1.1489, + "step": 17344 + }, + { + "epoch": 1.66, + "grad_norm": 0.2924606997713858, + "learning_rate": 1.543506231278774e-05, + "loss": 0.9726, + "step": 17345 + }, + { + "epoch": 1.66, + "grad_norm": 0.294924642801625, + "learning_rate": 1.5426619894630545e-05, + "loss": 1.1098, + "step": 17346 + }, + { + "epoch": 1.66, + "grad_norm": 0.2865799904833445, + "learning_rate": 1.541817959297537e-05, + "loss": 0.9715, + "step": 17347 + }, + { + "epoch": 1.66, + "grad_norm": 0.3298192395467313, + "learning_rate": 1.5409741408033384e-05, + "loss": 1.039, + "step": 17348 + }, + { + "epoch": 1.66, + "grad_norm": 0.27648200613037544, + "learning_rate": 1.5401305340015805e-05, + "loss": 0.9492, + "step": 17349 + }, + { + "epoch": 1.66, + "grad_norm": 0.33571992469833867, + "learning_rate": 1.539287138913368e-05, + "loss": 1.0532, + "step": 17350 + }, + { + "epoch": 1.66, + "grad_norm": 0.3244081557284884, + "learning_rate": 1.538443955559816e-05, + "loss": 1.004, + "step": 17351 + }, + { + "epoch": 1.66, + "grad_norm": 0.3197512707062865, + "learning_rate": 1.537600983962021e-05, + "loss": 1.0748, + "step": 17352 + }, + { + "epoch": 1.66, + "grad_norm": 0.3248854898567877, + "learning_rate": 1.5367582241410816e-05, + "loss": 1.0691, + "step": 17353 + }, + { + "epoch": 1.66, + "grad_norm": 0.3202063327060419, + "learning_rate": 1.535915676118084e-05, + "loss": 0.9896, + "step": 17354 + }, + { + "epoch": 1.66, + "grad_norm": 0.32570085217543965, + "learning_rate": 1.5350733399141158e-05, + "loss": 1.0573, + "step": 17355 + }, + { + "epoch": 1.66, + "grad_norm": 0.3146763618403506, + "learning_rate": 1.534231215550258e-05, + "loss": 1.0528, + "step": 17356 + }, + { + "epoch": 1.66, + "grad_norm": 0.33047898128992304, + "learning_rate": 1.5333893030475842e-05, + "loss": 1.0765, + "step": 17357 + }, + { + "epoch": 1.66, + "grad_norm": 0.3152751379561106, + "learning_rate": 1.5325476024271667e-05, + "loss": 1.0961, + "step": 17358 + }, + { + "epoch": 1.66, + "grad_norm": 0.332385829118546, + "learning_rate": 1.531706113710065e-05, + "loss": 1.0169, + "step": 17359 + }, + { + "epoch": 1.66, + "grad_norm": 0.32910350969629076, + "learning_rate": 1.5308648369173417e-05, + "loss": 1.0524, + "step": 17360 + }, + { + "epoch": 1.66, + "grad_norm": 0.29204290187082416, + "learning_rate": 1.530023772070047e-05, + "loss": 1.0223, + "step": 17361 + }, + { + "epoch": 1.66, + "grad_norm": 0.3276957045699913, + "learning_rate": 1.5291829191892327e-05, + "loss": 0.9853, + "step": 17362 + }, + { + "epoch": 1.66, + "grad_norm": 0.28250450120574616, + "learning_rate": 1.5283422782959344e-05, + "loss": 0.9315, + "step": 17363 + }, + { + "epoch": 1.66, + "grad_norm": 0.31986034127515156, + "learning_rate": 1.5275018494111992e-05, + "loss": 0.9904, + "step": 17364 + }, + { + "epoch": 1.66, + "grad_norm": 0.3389590532505694, + "learning_rate": 1.5266616325560533e-05, + "loss": 0.9552, + "step": 17365 + }, + { + "epoch": 1.66, + "grad_norm": 0.3144008253845427, + "learning_rate": 1.5258216277515269e-05, + "loss": 1.0264, + "step": 17366 + }, + { + "epoch": 1.66, + "grad_norm": 0.3302116266881156, + "learning_rate": 1.5249818350186385e-05, + "loss": 1.1001, + "step": 17367 + }, + { + "epoch": 1.66, + "grad_norm": 0.2586864026591455, + "learning_rate": 1.524142254378408e-05, + "loss": 1.0446, + "step": 17368 + }, + { + "epoch": 1.66, + "grad_norm": 0.317291637913826, + "learning_rate": 1.5233028858518427e-05, + "loss": 0.9893, + "step": 17369 + }, + { + "epoch": 1.66, + "grad_norm": 0.3342958752807293, + "learning_rate": 1.5224637294599497e-05, + "loss": 1.0253, + "step": 17370 + }, + { + "epoch": 1.66, + "grad_norm": 0.2992521163201816, + "learning_rate": 1.5216247852237298e-05, + "loss": 1.0475, + "step": 17371 + }, + { + "epoch": 1.66, + "grad_norm": 0.3572923825842893, + "learning_rate": 1.5207860531641804e-05, + "loss": 1.0608, + "step": 17372 + }, + { + "epoch": 1.66, + "grad_norm": 0.2731154872116241, + "learning_rate": 1.519947533302286e-05, + "loss": 0.9999, + "step": 17373 + }, + { + "epoch": 1.66, + "grad_norm": 0.29422350889250354, + "learning_rate": 1.5191092256590357e-05, + "loss": 1.0193, + "step": 17374 + }, + { + "epoch": 1.66, + "grad_norm": 0.32033001518757187, + "learning_rate": 1.5182711302554076e-05, + "loss": 0.9772, + "step": 17375 + }, + { + "epoch": 1.66, + "grad_norm": 0.2985849924679597, + "learning_rate": 1.5174332471123742e-05, + "loss": 1.0868, + "step": 17376 + }, + { + "epoch": 1.66, + "grad_norm": 0.3501152986078158, + "learning_rate": 1.5165955762509043e-05, + "loss": 0.9658, + "step": 17377 + }, + { + "epoch": 1.66, + "grad_norm": 0.3101533031039218, + "learning_rate": 1.515758117691961e-05, + "loss": 1.0563, + "step": 17378 + }, + { + "epoch": 1.66, + "grad_norm": 0.30169349358186137, + "learning_rate": 1.5149208714565066e-05, + "loss": 1.0959, + "step": 17379 + }, + { + "epoch": 1.66, + "grad_norm": 0.2977711701264271, + "learning_rate": 1.5140838375654865e-05, + "loss": 0.9802, + "step": 17380 + }, + { + "epoch": 1.66, + "grad_norm": 0.3115613006094451, + "learning_rate": 1.5132470160398549e-05, + "loss": 0.9877, + "step": 17381 + }, + { + "epoch": 1.66, + "grad_norm": 0.34744665216640536, + "learning_rate": 1.5124104069005474e-05, + "loss": 1.0357, + "step": 17382 + }, + { + "epoch": 1.66, + "grad_norm": 0.3493305941150002, + "learning_rate": 1.511574010168504e-05, + "loss": 0.9195, + "step": 17383 + }, + { + "epoch": 1.66, + "grad_norm": 0.3121941642552152, + "learning_rate": 1.5107378258646554e-05, + "loss": 0.9638, + "step": 17384 + }, + { + "epoch": 1.66, + "grad_norm": 0.35624344929582374, + "learning_rate": 1.5099018540099308e-05, + "loss": 1.0397, + "step": 17385 + }, + { + "epoch": 1.66, + "grad_norm": 0.3168992298807674, + "learning_rate": 1.5090660946252466e-05, + "loss": 0.9235, + "step": 17386 + }, + { + "epoch": 1.66, + "grad_norm": 0.30485221981272087, + "learning_rate": 1.5082305477315205e-05, + "loss": 1.0309, + "step": 17387 + }, + { + "epoch": 1.66, + "grad_norm": 0.3203845127378914, + "learning_rate": 1.5073952133496604e-05, + "loss": 1.0907, + "step": 17388 + }, + { + "epoch": 1.66, + "grad_norm": 0.33597831659522515, + "learning_rate": 1.506560091500575e-05, + "loss": 1.0704, + "step": 17389 + }, + { + "epoch": 1.66, + "grad_norm": 0.3764302812553428, + "learning_rate": 1.5057251822051555e-05, + "loss": 1.0669, + "step": 17390 + }, + { + "epoch": 1.66, + "grad_norm": 0.30188704513146974, + "learning_rate": 1.504890485484307e-05, + "loss": 1.0339, + "step": 17391 + }, + { + "epoch": 1.66, + "grad_norm": 0.3360537931158151, + "learning_rate": 1.504056001358911e-05, + "loss": 1.0412, + "step": 17392 + }, + { + "epoch": 1.66, + "grad_norm": 0.325609514915759, + "learning_rate": 1.5032217298498552e-05, + "loss": 1.0745, + "step": 17393 + }, + { + "epoch": 1.66, + "grad_norm": 0.3870428954652664, + "learning_rate": 1.5023876709780138e-05, + "loss": 0.9806, + "step": 17394 + }, + { + "epoch": 1.66, + "grad_norm": 0.32115487180949076, + "learning_rate": 1.5015538247642613e-05, + "loss": 1.0174, + "step": 17395 + }, + { + "epoch": 1.66, + "grad_norm": 0.30237385599634037, + "learning_rate": 1.5007201912294678e-05, + "loss": 1.1054, + "step": 17396 + }, + { + "epoch": 1.66, + "grad_norm": 0.3415330512358321, + "learning_rate": 1.4998867703944897e-05, + "loss": 1.0907, + "step": 17397 + }, + { + "epoch": 1.66, + "grad_norm": 0.344794922577873, + "learning_rate": 1.4990535622801882e-05, + "loss": 1.0388, + "step": 17398 + }, + { + "epoch": 1.66, + "grad_norm": 0.30669625785388993, + "learning_rate": 1.4982205669074134e-05, + "loss": 1.0927, + "step": 17399 + }, + { + "epoch": 1.66, + "grad_norm": 0.30086642899696603, + "learning_rate": 1.4973877842970152e-05, + "loss": 1.1083, + "step": 17400 + }, + { + "epoch": 1.66, + "grad_norm": 0.29789805430985106, + "learning_rate": 1.4965552144698291e-05, + "loss": 1.1347, + "step": 17401 + }, + { + "epoch": 1.66, + "grad_norm": 0.3186461317992837, + "learning_rate": 1.4957228574466942e-05, + "loss": 0.9206, + "step": 17402 + }, + { + "epoch": 1.66, + "grad_norm": 0.2864686581107256, + "learning_rate": 1.494890713248438e-05, + "loss": 1.0278, + "step": 17403 + }, + { + "epoch": 1.67, + "grad_norm": 0.32345638958998063, + "learning_rate": 1.4940587818958874e-05, + "loss": 0.9234, + "step": 17404 + }, + { + "epoch": 1.67, + "grad_norm": 0.34124217621929126, + "learning_rate": 1.4932270634098611e-05, + "loss": 1.0212, + "step": 17405 + }, + { + "epoch": 1.67, + "grad_norm": 0.314826991307951, + "learning_rate": 1.4923955578111759e-05, + "loss": 0.9253, + "step": 17406 + }, + { + "epoch": 1.67, + "grad_norm": 0.326407531638917, + "learning_rate": 1.4915642651206363e-05, + "loss": 1.0885, + "step": 17407 + }, + { + "epoch": 1.67, + "grad_norm": 0.27875571473145394, + "learning_rate": 1.4907331853590512e-05, + "loss": 0.9952, + "step": 17408 + }, + { + "epoch": 1.67, + "grad_norm": 0.3220064649685888, + "learning_rate": 1.4899023185472127e-05, + "loss": 1.0424, + "step": 17409 + }, + { + "epoch": 1.67, + "grad_norm": 0.29140652902203185, + "learning_rate": 1.48907166470592e-05, + "loss": 1.0106, + "step": 17410 + }, + { + "epoch": 1.67, + "grad_norm": 0.3162524706526793, + "learning_rate": 1.488241223855954e-05, + "loss": 1.0682, + "step": 17411 + }, + { + "epoch": 1.67, + "grad_norm": 0.29932476517776957, + "learning_rate": 1.4874109960181048e-05, + "loss": 1.0832, + "step": 17412 + }, + { + "epoch": 1.67, + "grad_norm": 0.3286363158963139, + "learning_rate": 1.4865809812131426e-05, + "loss": 1.0234, + "step": 17413 + }, + { + "epoch": 1.67, + "grad_norm": 0.29453232594437306, + "learning_rate": 1.485751179461844e-05, + "loss": 1.0518, + "step": 17414 + }, + { + "epoch": 1.67, + "grad_norm": 0.3206391873285117, + "learning_rate": 1.4849215907849723e-05, + "loss": 0.9982, + "step": 17415 + }, + { + "epoch": 1.67, + "grad_norm": 0.31278978493871534, + "learning_rate": 1.4840922152032877e-05, + "loss": 1.0537, + "step": 17416 + }, + { + "epoch": 1.67, + "grad_norm": 0.3113751411614062, + "learning_rate": 1.4832630527375479e-05, + "loss": 1.057, + "step": 17417 + }, + { + "epoch": 1.67, + "grad_norm": 0.3396257330036846, + "learning_rate": 1.4824341034085055e-05, + "loss": 0.9534, + "step": 17418 + }, + { + "epoch": 1.67, + "grad_norm": 0.30707877186214244, + "learning_rate": 1.4816053672369e-05, + "loss": 1.0025, + "step": 17419 + }, + { + "epoch": 1.67, + "grad_norm": 0.332151367265177, + "learning_rate": 1.4807768442434744e-05, + "loss": 1.0699, + "step": 17420 + }, + { + "epoch": 1.67, + "grad_norm": 0.30820438113516724, + "learning_rate": 1.4799485344489638e-05, + "loss": 1.0431, + "step": 17421 + }, + { + "epoch": 1.67, + "grad_norm": 0.29839251071185047, + "learning_rate": 1.479120437874094e-05, + "loss": 1.0505, + "step": 17422 + }, + { + "epoch": 1.67, + "grad_norm": 0.2854305556350824, + "learning_rate": 1.4782925545395931e-05, + "loss": 0.9423, + "step": 17423 + }, + { + "epoch": 1.67, + "grad_norm": 0.3273210487889984, + "learning_rate": 1.477464884466172e-05, + "loss": 1.0836, + "step": 17424 + }, + { + "epoch": 1.67, + "grad_norm": 0.3403224618777856, + "learning_rate": 1.4766374276745532e-05, + "loss": 0.8792, + "step": 17425 + }, + { + "epoch": 1.67, + "grad_norm": 0.3245821929768175, + "learning_rate": 1.4758101841854365e-05, + "loss": 1.0548, + "step": 17426 + }, + { + "epoch": 1.67, + "grad_norm": 0.3278980197030739, + "learning_rate": 1.4749831540195291e-05, + "loss": 1.017, + "step": 17427 + }, + { + "epoch": 1.67, + "grad_norm": 0.3164122356851469, + "learning_rate": 1.4741563371975242e-05, + "loss": 1.0405, + "step": 17428 + }, + { + "epoch": 1.67, + "grad_norm": 0.24388656899469177, + "learning_rate": 1.4733297337401187e-05, + "loss": 1.0122, + "step": 17429 + }, + { + "epoch": 1.67, + "grad_norm": 0.30900296382867753, + "learning_rate": 1.4725033436679914e-05, + "loss": 1.0471, + "step": 17430 + }, + { + "epoch": 1.67, + "grad_norm": 0.28785931302068357, + "learning_rate": 1.4716771670018281e-05, + "loss": 1.051, + "step": 17431 + }, + { + "epoch": 1.67, + "grad_norm": 0.27788434948138474, + "learning_rate": 1.4708512037623034e-05, + "loss": 0.9962, + "step": 17432 + }, + { + "epoch": 1.67, + "grad_norm": 0.3232412177615238, + "learning_rate": 1.4700254539700908e-05, + "loss": 1.0375, + "step": 17433 + }, + { + "epoch": 1.67, + "grad_norm": 0.31642608663519195, + "learning_rate": 1.4691999176458483e-05, + "loss": 1.0105, + "step": 17434 + }, + { + "epoch": 1.67, + "grad_norm": 0.27517625866488654, + "learning_rate": 1.4683745948102422e-05, + "loss": 1.0053, + "step": 17435 + }, + { + "epoch": 1.67, + "grad_norm": 0.2771132099009318, + "learning_rate": 1.4675494854839222e-05, + "loss": 0.9431, + "step": 17436 + }, + { + "epoch": 1.67, + "grad_norm": 0.31471837577976924, + "learning_rate": 1.4667245896875382e-05, + "loss": 1.0459, + "step": 17437 + }, + { + "epoch": 1.67, + "grad_norm": 0.3227988574150817, + "learning_rate": 1.4658999074417345e-05, + "loss": 1.0312, + "step": 17438 + }, + { + "epoch": 1.67, + "grad_norm": 0.2832877241701537, + "learning_rate": 1.4650754387671506e-05, + "loss": 1.0951, + "step": 17439 + }, + { + "epoch": 1.67, + "grad_norm": 0.27946217871561246, + "learning_rate": 1.4642511836844163e-05, + "loss": 1.0026, + "step": 17440 + }, + { + "epoch": 1.67, + "grad_norm": 0.28878472850361453, + "learning_rate": 1.463427142214161e-05, + "loss": 0.9714, + "step": 17441 + }, + { + "epoch": 1.67, + "grad_norm": 0.3078339400995846, + "learning_rate": 1.4626033143770079e-05, + "loss": 0.9838, + "step": 17442 + }, + { + "epoch": 1.67, + "grad_norm": 0.3536979630485215, + "learning_rate": 1.4617797001935707e-05, + "loss": 1.0939, + "step": 17443 + }, + { + "epoch": 1.67, + "grad_norm": 0.3119958529635826, + "learning_rate": 1.4609562996844616e-05, + "loss": 1.1641, + "step": 17444 + }, + { + "epoch": 1.67, + "grad_norm": 0.29563646100480523, + "learning_rate": 1.4601331128702889e-05, + "loss": 1.0216, + "step": 17445 + }, + { + "epoch": 1.67, + "grad_norm": 0.3291684453458526, + "learning_rate": 1.4593101397716535e-05, + "loss": 0.9606, + "step": 17446 + }, + { + "epoch": 1.67, + "grad_norm": 0.2692587595424595, + "learning_rate": 1.4584873804091481e-05, + "loss": 0.9693, + "step": 17447 + }, + { + "epoch": 1.67, + "grad_norm": 0.3194020066525342, + "learning_rate": 1.4576648348033661e-05, + "loss": 0.9785, + "step": 17448 + }, + { + "epoch": 1.67, + "grad_norm": 0.32329689455568505, + "learning_rate": 1.4568425029748877e-05, + "loss": 1.0257, + "step": 17449 + }, + { + "epoch": 1.67, + "grad_norm": 0.31801786053440706, + "learning_rate": 1.4560203849442978e-05, + "loss": 1.0577, + "step": 17450 + }, + { + "epoch": 1.67, + "grad_norm": 0.28866666996652857, + "learning_rate": 1.4551984807321617e-05, + "loss": 1.0553, + "step": 17451 + }, + { + "epoch": 1.67, + "grad_norm": 0.3043074877040972, + "learning_rate": 1.45437679035906e-05, + "loss": 1.1246, + "step": 17452 + }, + { + "epoch": 1.67, + "grad_norm": 0.35040644331295273, + "learning_rate": 1.453555313845546e-05, + "loss": 1.0424, + "step": 17453 + }, + { + "epoch": 1.67, + "grad_norm": 0.2945064556879527, + "learning_rate": 1.4527340512121846e-05, + "loss": 0.8895, + "step": 17454 + }, + { + "epoch": 1.67, + "grad_norm": 0.32342090672137785, + "learning_rate": 1.4519130024795224e-05, + "loss": 1.0167, + "step": 17455 + }, + { + "epoch": 1.67, + "grad_norm": 0.31959815930096713, + "learning_rate": 1.451092167668111e-05, + "loss": 0.9363, + "step": 17456 + }, + { + "epoch": 1.67, + "grad_norm": 0.3729188171993666, + "learning_rate": 1.4502715467984918e-05, + "loss": 1.0046, + "step": 17457 + }, + { + "epoch": 1.67, + "grad_norm": 0.30041060894728855, + "learning_rate": 1.449451139891198e-05, + "loss": 1.1555, + "step": 17458 + }, + { + "epoch": 1.67, + "grad_norm": 0.3335269552145815, + "learning_rate": 1.4486309469667647e-05, + "loss": 1.0056, + "step": 17459 + }, + { + "epoch": 1.67, + "grad_norm": 0.32112393637252656, + "learning_rate": 1.4478109680457153e-05, + "loss": 1.0365, + "step": 17460 + }, + { + "epoch": 1.67, + "grad_norm": 0.26008571627436533, + "learning_rate": 1.4469912031485732e-05, + "loss": 0.8506, + "step": 17461 + }, + { + "epoch": 1.67, + "grad_norm": 0.3364121660307569, + "learning_rate": 1.4461716522958502e-05, + "loss": 1.0288, + "step": 17462 + }, + { + "epoch": 1.67, + "grad_norm": 0.33824542221112336, + "learning_rate": 1.4453523155080595e-05, + "loss": 1.0057, + "step": 17463 + }, + { + "epoch": 1.67, + "grad_norm": 0.35380880249803504, + "learning_rate": 1.4445331928057016e-05, + "loss": 1.0373, + "step": 17464 + }, + { + "epoch": 1.67, + "grad_norm": 0.2999445917216118, + "learning_rate": 1.4437142842092766e-05, + "loss": 0.8697, + "step": 17465 + }, + { + "epoch": 1.67, + "grad_norm": 0.3345236861079351, + "learning_rate": 1.4428955897392794e-05, + "loss": 1.0731, + "step": 17466 + }, + { + "epoch": 1.67, + "grad_norm": 0.32218611816344767, + "learning_rate": 1.4420771094162e-05, + "loss": 1.0583, + "step": 17467 + }, + { + "epoch": 1.67, + "grad_norm": 0.3315477236798198, + "learning_rate": 1.4412588432605178e-05, + "loss": 1.0823, + "step": 17468 + }, + { + "epoch": 1.67, + "grad_norm": 0.33578760491019327, + "learning_rate": 1.4404407912927143e-05, + "loss": 0.9831, + "step": 17469 + }, + { + "epoch": 1.67, + "grad_norm": 0.3239090995534745, + "learning_rate": 1.4396229535332562e-05, + "loss": 1.0876, + "step": 17470 + }, + { + "epoch": 1.67, + "grad_norm": 0.31739226223954276, + "learning_rate": 1.4388053300026172e-05, + "loss": 1.0122, + "step": 17471 + }, + { + "epoch": 1.67, + "grad_norm": 0.287659486263477, + "learning_rate": 1.43798792072125e-05, + "loss": 1.0875, + "step": 17472 + }, + { + "epoch": 1.67, + "grad_norm": 0.3111414803716315, + "learning_rate": 1.4371707257096201e-05, + "loss": 1.0396, + "step": 17473 + }, + { + "epoch": 1.67, + "grad_norm": 0.3443672368638686, + "learning_rate": 1.4363537449881726e-05, + "loss": 1.0828, + "step": 17474 + }, + { + "epoch": 1.67, + "grad_norm": 0.277486891413935, + "learning_rate": 1.4355369785773576e-05, + "loss": 0.9809, + "step": 17475 + }, + { + "epoch": 1.67, + "grad_norm": 0.278999190251525, + "learning_rate": 1.43472042649761e-05, + "loss": 1.0453, + "step": 17476 + }, + { + "epoch": 1.67, + "grad_norm": 0.3304289697786955, + "learning_rate": 1.4339040887693655e-05, + "loss": 1.0693, + "step": 17477 + }, + { + "epoch": 1.67, + "grad_norm": 0.2654730019075425, + "learning_rate": 1.4330879654130558e-05, + "loss": 0.9698, + "step": 17478 + }, + { + "epoch": 1.67, + "grad_norm": 0.33005786212152993, + "learning_rate": 1.4322720564491066e-05, + "loss": 1.0213, + "step": 17479 + }, + { + "epoch": 1.67, + "grad_norm": 0.2803098008376071, + "learning_rate": 1.4314563618979305e-05, + "loss": 1.0936, + "step": 17480 + }, + { + "epoch": 1.67, + "grad_norm": 0.30674976472615406, + "learning_rate": 1.4306408817799444e-05, + "loss": 0.877, + "step": 17481 + }, + { + "epoch": 1.67, + "grad_norm": 0.2696229449057849, + "learning_rate": 1.4298256161155577e-05, + "loss": 0.9724, + "step": 17482 + }, + { + "epoch": 1.67, + "grad_norm": 0.2674004953781008, + "learning_rate": 1.4290105649251695e-05, + "loss": 1.0332, + "step": 17483 + }, + { + "epoch": 1.67, + "grad_norm": 0.3052868486120438, + "learning_rate": 1.4281957282291814e-05, + "loss": 0.9592, + "step": 17484 + }, + { + "epoch": 1.67, + "grad_norm": 0.3211124789237402, + "learning_rate": 1.4273811060479769e-05, + "loss": 0.9837, + "step": 17485 + }, + { + "epoch": 1.67, + "grad_norm": 0.30264819961180683, + "learning_rate": 1.4265666984019522e-05, + "loss": 1.0075, + "step": 17486 + }, + { + "epoch": 1.67, + "grad_norm": 0.3021266171091263, + "learning_rate": 1.425752505311483e-05, + "loss": 1.0565, + "step": 17487 + }, + { + "epoch": 1.67, + "grad_norm": 0.34777657366293646, + "learning_rate": 1.4249385267969473e-05, + "loss": 1.0447, + "step": 17488 + }, + { + "epoch": 1.67, + "grad_norm": 0.30234187453155403, + "learning_rate": 1.4241247628787125e-05, + "loss": 1.0675, + "step": 17489 + }, + { + "epoch": 1.67, + "grad_norm": 0.3300057309570085, + "learning_rate": 1.4233112135771466e-05, + "loss": 1.0382, + "step": 17490 + }, + { + "epoch": 1.67, + "grad_norm": 0.33873992723546986, + "learning_rate": 1.4224978789126065e-05, + "loss": 1.069, + "step": 17491 + }, + { + "epoch": 1.67, + "grad_norm": 0.3132275766195238, + "learning_rate": 1.4216847589054472e-05, + "loss": 0.8591, + "step": 17492 + }, + { + "epoch": 1.67, + "grad_norm": 0.3020478662444418, + "learning_rate": 1.420871853576018e-05, + "loss": 1.0417, + "step": 17493 + }, + { + "epoch": 1.67, + "grad_norm": 0.33769212278018707, + "learning_rate": 1.4200591629446657e-05, + "loss": 1.015, + "step": 17494 + }, + { + "epoch": 1.67, + "grad_norm": 0.3229281326422278, + "learning_rate": 1.4192466870317223e-05, + "loss": 1.0022, + "step": 17495 + }, + { + "epoch": 1.67, + "grad_norm": 0.35070628333720444, + "learning_rate": 1.4184344258575255e-05, + "loss": 1.0084, + "step": 17496 + }, + { + "epoch": 1.67, + "grad_norm": 0.3806104425473119, + "learning_rate": 1.4176223794423981e-05, + "loss": 1.0204, + "step": 17497 + }, + { + "epoch": 1.67, + "grad_norm": 0.299173235055894, + "learning_rate": 1.416810547806664e-05, + "loss": 0.9819, + "step": 17498 + }, + { + "epoch": 1.67, + "grad_norm": 0.28396693846965776, + "learning_rate": 1.4159989309706411e-05, + "loss": 1.0264, + "step": 17499 + }, + { + "epoch": 1.67, + "grad_norm": 0.25784146306500727, + "learning_rate": 1.4151875289546423e-05, + "loss": 0.9627, + "step": 17500 + }, + { + "epoch": 1.67, + "grad_norm": 0.30378841571006715, + "learning_rate": 1.414376341778968e-05, + "loss": 0.9717, + "step": 17501 + }, + { + "epoch": 1.67, + "grad_norm": 0.2934335118482125, + "learning_rate": 1.4135653694639217e-05, + "loss": 1.0059, + "step": 17502 + }, + { + "epoch": 1.67, + "grad_norm": 0.32322245632853436, + "learning_rate": 1.4127546120298007e-05, + "loss": 1.0459, + "step": 17503 + }, + { + "epoch": 1.67, + "grad_norm": 0.3621842014780602, + "learning_rate": 1.41194406949689e-05, + "loss": 0.9863, + "step": 17504 + }, + { + "epoch": 1.67, + "grad_norm": 0.3106384357075395, + "learning_rate": 1.4111337418854775e-05, + "loss": 0.9076, + "step": 17505 + }, + { + "epoch": 1.67, + "grad_norm": 0.2945800300743261, + "learning_rate": 1.4103236292158395e-05, + "loss": 0.9377, + "step": 17506 + }, + { + "epoch": 1.67, + "grad_norm": 0.32585775195910577, + "learning_rate": 1.409513731508254e-05, + "loss": 0.9732, + "step": 17507 + }, + { + "epoch": 1.68, + "grad_norm": 0.35144696645165674, + "learning_rate": 1.4087040487829839e-05, + "loss": 1.0904, + "step": 17508 + }, + { + "epoch": 1.68, + "grad_norm": 0.3258117909397663, + "learning_rate": 1.4078945810602973e-05, + "loss": 1.0712, + "step": 17509 + }, + { + "epoch": 1.68, + "grad_norm": 0.3026318549827225, + "learning_rate": 1.4070853283604457e-05, + "loss": 1.0854, + "step": 17510 + }, + { + "epoch": 1.68, + "grad_norm": 0.31425430364922297, + "learning_rate": 1.4062762907036864e-05, + "loss": 0.982, + "step": 17511 + }, + { + "epoch": 1.68, + "grad_norm": 0.36078961037594065, + "learning_rate": 1.4054674681102598e-05, + "loss": 0.9991, + "step": 17512 + }, + { + "epoch": 1.68, + "grad_norm": 0.3203877558358694, + "learning_rate": 1.4046588606004163e-05, + "loss": 1.0131, + "step": 17513 + }, + { + "epoch": 1.68, + "grad_norm": 0.2881342021579295, + "learning_rate": 1.4038504681943831e-05, + "loss": 0.9239, + "step": 17514 + }, + { + "epoch": 1.68, + "grad_norm": 0.3239009244986449, + "learning_rate": 1.4030422909123975e-05, + "loss": 0.9111, + "step": 17515 + }, + { + "epoch": 1.68, + "grad_norm": 0.2889281598587997, + "learning_rate": 1.4022343287746798e-05, + "loss": 1.0595, + "step": 17516 + }, + { + "epoch": 1.68, + "grad_norm": 0.3411590109474246, + "learning_rate": 1.4014265818014527e-05, + "loss": 0.9877, + "step": 17517 + }, + { + "epoch": 1.68, + "grad_norm": 0.31492834927933433, + "learning_rate": 1.400619050012928e-05, + "loss": 1.0581, + "step": 17518 + }, + { + "epoch": 1.68, + "grad_norm": 0.3088691466153341, + "learning_rate": 1.399811733429316e-05, + "loss": 1.114, + "step": 17519 + }, + { + "epoch": 1.68, + "grad_norm": 0.3035267716076317, + "learning_rate": 1.3990046320708206e-05, + "loss": 1.0332, + "step": 17520 + }, + { + "epoch": 1.68, + "grad_norm": 0.30725897087079374, + "learning_rate": 1.3981977459576412e-05, + "loss": 0.9827, + "step": 17521 + }, + { + "epoch": 1.68, + "grad_norm": 0.3034873420325804, + "learning_rate": 1.3973910751099672e-05, + "loss": 1.0783, + "step": 17522 + }, + { + "epoch": 1.68, + "grad_norm": 0.3493802829015187, + "learning_rate": 1.3965846195479882e-05, + "loss": 1.0315, + "step": 17523 + }, + { + "epoch": 1.68, + "grad_norm": 0.32625903347074575, + "learning_rate": 1.3957783792918888e-05, + "loss": 0.9152, + "step": 17524 + }, + { + "epoch": 1.68, + "grad_norm": 0.2979037758155255, + "learning_rate": 1.3949723543618398e-05, + "loss": 1.1072, + "step": 17525 + }, + { + "epoch": 1.68, + "grad_norm": 0.30178855135386684, + "learning_rate": 1.3941665447780173e-05, + "loss": 1.0729, + "step": 17526 + }, + { + "epoch": 1.68, + "grad_norm": 0.27231613070954175, + "learning_rate": 1.3933609505605838e-05, + "loss": 1.0359, + "step": 17527 + }, + { + "epoch": 1.68, + "grad_norm": 0.3165090433099864, + "learning_rate": 1.3925555717297045e-05, + "loss": 0.9607, + "step": 17528 + }, + { + "epoch": 1.68, + "grad_norm": 0.31118124233903843, + "learning_rate": 1.39175040830553e-05, + "loss": 0.9937, + "step": 17529 + }, + { + "epoch": 1.68, + "grad_norm": 0.30351656385964765, + "learning_rate": 1.3909454603082128e-05, + "loss": 1.0231, + "step": 17530 + }, + { + "epoch": 1.68, + "grad_norm": 0.3155629388557388, + "learning_rate": 1.390140727757895e-05, + "loss": 0.9047, + "step": 17531 + }, + { + "epoch": 1.68, + "grad_norm": 0.31591351431648296, + "learning_rate": 1.3893362106747176e-05, + "loss": 0.9799, + "step": 17532 + }, + { + "epoch": 1.68, + "grad_norm": 0.37765380361574324, + "learning_rate": 1.3885319090788096e-05, + "loss": 1.0771, + "step": 17533 + }, + { + "epoch": 1.68, + "grad_norm": 0.3157645962137812, + "learning_rate": 1.3877278229903068e-05, + "loss": 1.045, + "step": 17534 + }, + { + "epoch": 1.68, + "grad_norm": 0.2931473124901867, + "learning_rate": 1.3869239524293255e-05, + "loss": 1.0497, + "step": 17535 + }, + { + "epoch": 1.68, + "grad_norm": 0.3466477811093972, + "learning_rate": 1.3861202974159881e-05, + "loss": 1.043, + "step": 17536 + }, + { + "epoch": 1.68, + "grad_norm": 0.2738955538183313, + "learning_rate": 1.3853168579704012e-05, + "loss": 0.9049, + "step": 17537 + }, + { + "epoch": 1.68, + "grad_norm": 0.3174937958916137, + "learning_rate": 1.3845136341126764e-05, + "loss": 1.0346, + "step": 17538 + }, + { + "epoch": 1.68, + "grad_norm": 0.31249041225598706, + "learning_rate": 1.3837106258629074e-05, + "loss": 0.9382, + "step": 17539 + }, + { + "epoch": 1.68, + "grad_norm": 0.293811162982976, + "learning_rate": 1.3829078332412004e-05, + "loss": 0.9434, + "step": 17540 + }, + { + "epoch": 1.68, + "grad_norm": 0.31769015361514036, + "learning_rate": 1.3821052562676385e-05, + "loss": 1.1118, + "step": 17541 + }, + { + "epoch": 1.68, + "grad_norm": 0.287600727505577, + "learning_rate": 1.3813028949623097e-05, + "loss": 1.0796, + "step": 17542 + }, + { + "epoch": 1.68, + "grad_norm": 0.2894254803213153, + "learning_rate": 1.3805007493452914e-05, + "loss": 0.941, + "step": 17543 + }, + { + "epoch": 1.68, + "grad_norm": 0.3232889854260773, + "learning_rate": 1.379698819436659e-05, + "loss": 0.9947, + "step": 17544 + }, + { + "epoch": 1.68, + "grad_norm": 0.32446903035721797, + "learning_rate": 1.3788971052564836e-05, + "loss": 1.0535, + "step": 17545 + }, + { + "epoch": 1.68, + "grad_norm": 0.3100441843824035, + "learning_rate": 1.3780956068248208e-05, + "loss": 1.0175, + "step": 17546 + }, + { + "epoch": 1.68, + "grad_norm": 0.2799356068507369, + "learning_rate": 1.3772943241617398e-05, + "loss": 1.023, + "step": 17547 + }, + { + "epoch": 1.68, + "grad_norm": 0.30037631625299466, + "learning_rate": 1.3764932572872846e-05, + "loss": 0.9468, + "step": 17548 + }, + { + "epoch": 1.68, + "grad_norm": 0.32830176599169286, + "learning_rate": 1.3756924062215083e-05, + "loss": 1.0252, + "step": 17549 + }, + { + "epoch": 1.68, + "grad_norm": 0.31586718199401215, + "learning_rate": 1.3748917709844467e-05, + "loss": 1.0305, + "step": 17550 + }, + { + "epoch": 1.68, + "grad_norm": 0.34982504946799114, + "learning_rate": 1.3740913515961417e-05, + "loss": 1.0823, + "step": 17551 + }, + { + "epoch": 1.68, + "grad_norm": 0.3235677570630502, + "learning_rate": 1.3732911480766186e-05, + "loss": 0.9736, + "step": 17552 + }, + { + "epoch": 1.68, + "grad_norm": 0.3425032291596828, + "learning_rate": 1.3724911604459079e-05, + "loss": 0.9185, + "step": 17553 + }, + { + "epoch": 1.68, + "grad_norm": 0.3116793635139905, + "learning_rate": 1.3716913887240268e-05, + "loss": 1.0557, + "step": 17554 + }, + { + "epoch": 1.68, + "grad_norm": 0.3197163008391024, + "learning_rate": 1.370891832930995e-05, + "loss": 1.1551, + "step": 17555 + }, + { + "epoch": 1.68, + "grad_norm": 0.263178810351029, + "learning_rate": 1.3700924930868142e-05, + "loss": 1.1498, + "step": 17556 + }, + { + "epoch": 1.68, + "grad_norm": 0.3319492938965182, + "learning_rate": 1.3692933692114962e-05, + "loss": 0.9885, + "step": 17557 + }, + { + "epoch": 1.68, + "grad_norm": 0.3366673782849636, + "learning_rate": 1.3684944613250328e-05, + "loss": 1.0612, + "step": 17558 + }, + { + "epoch": 1.68, + "grad_norm": 0.28604599531833974, + "learning_rate": 1.3676957694474201e-05, + "loss": 1.0754, + "step": 17559 + }, + { + "epoch": 1.68, + "grad_norm": 0.34347503015939057, + "learning_rate": 1.3668972935986468e-05, + "loss": 1.0423, + "step": 17560 + }, + { + "epoch": 1.68, + "grad_norm": 0.3214382503519984, + "learning_rate": 1.3660990337986967e-05, + "loss": 1.0625, + "step": 17561 + }, + { + "epoch": 1.68, + "grad_norm": 0.3146717377632213, + "learning_rate": 1.3653009900675418e-05, + "loss": 0.9294, + "step": 17562 + }, + { + "epoch": 1.68, + "grad_norm": 0.27619011917919206, + "learning_rate": 1.3645031624251558e-05, + "loss": 0.9877, + "step": 17563 + }, + { + "epoch": 1.68, + "grad_norm": 0.33672206681169226, + "learning_rate": 1.3637055508915086e-05, + "loss": 0.9938, + "step": 17564 + }, + { + "epoch": 1.68, + "grad_norm": 0.31042528758259974, + "learning_rate": 1.3629081554865542e-05, + "loss": 1.1178, + "step": 17565 + }, + { + "epoch": 1.68, + "grad_norm": 0.34793933988113573, + "learning_rate": 1.3621109762302519e-05, + "loss": 1.1225, + "step": 17566 + }, + { + "epoch": 1.68, + "grad_norm": 0.34156231153430533, + "learning_rate": 1.3613140131425516e-05, + "loss": 0.9569, + "step": 17567 + }, + { + "epoch": 1.68, + "grad_norm": 0.31078919758768886, + "learning_rate": 1.3605172662433997e-05, + "loss": 0.9437, + "step": 17568 + }, + { + "epoch": 1.68, + "grad_norm": 0.2894968685158787, + "learning_rate": 1.3597207355527309e-05, + "loss": 1.0673, + "step": 17569 + }, + { + "epoch": 1.68, + "grad_norm": 0.31582559096385976, + "learning_rate": 1.3589244210904816e-05, + "loss": 0.8515, + "step": 17570 + }, + { + "epoch": 1.68, + "grad_norm": 0.33293176232361243, + "learning_rate": 1.3581283228765784e-05, + "loss": 0.9367, + "step": 17571 + }, + { + "epoch": 1.68, + "grad_norm": 0.28906007475947554, + "learning_rate": 1.3573324409309473e-05, + "loss": 1.0718, + "step": 17572 + }, + { + "epoch": 1.68, + "grad_norm": 0.32083027911055495, + "learning_rate": 1.3565367752734992e-05, + "loss": 1.0106, + "step": 17573 + }, + { + "epoch": 1.68, + "grad_norm": 0.3640905302996117, + "learning_rate": 1.3557413259241547e-05, + "loss": 1.1368, + "step": 17574 + }, + { + "epoch": 1.68, + "grad_norm": 0.33065955740411884, + "learning_rate": 1.3549460929028136e-05, + "loss": 0.8753, + "step": 17575 + }, + { + "epoch": 1.68, + "grad_norm": 0.33542684128221456, + "learning_rate": 1.354151076229383e-05, + "loss": 1.0133, + "step": 17576 + }, + { + "epoch": 1.68, + "grad_norm": 0.32599230652361844, + "learning_rate": 1.3533562759237516e-05, + "loss": 0.9678, + "step": 17577 + }, + { + "epoch": 1.68, + "grad_norm": 0.39930402678943683, + "learning_rate": 1.352561692005817e-05, + "loss": 0.999, + "step": 17578 + }, + { + "epoch": 1.68, + "grad_norm": 0.2920921510986315, + "learning_rate": 1.3517673244954587e-05, + "loss": 1.0155, + "step": 17579 + }, + { + "epoch": 1.68, + "grad_norm": 0.33952991478853817, + "learning_rate": 1.3509731734125575e-05, + "loss": 1.0135, + "step": 17580 + }, + { + "epoch": 1.68, + "grad_norm": 0.3237071393239722, + "learning_rate": 1.3501792387769896e-05, + "loss": 1.0825, + "step": 17581 + }, + { + "epoch": 1.68, + "grad_norm": 0.32744591536030226, + "learning_rate": 1.3493855206086236e-05, + "loss": 0.9294, + "step": 17582 + }, + { + "epoch": 1.68, + "grad_norm": 0.3251326052350188, + "learning_rate": 1.3485920189273194e-05, + "loss": 1.0096, + "step": 17583 + }, + { + "epoch": 1.68, + "grad_norm": 0.35034705898452956, + "learning_rate": 1.3477987337529375e-05, + "loss": 0.9791, + "step": 17584 + }, + { + "epoch": 1.68, + "grad_norm": 0.30354834681890525, + "learning_rate": 1.347005665105332e-05, + "loss": 1.0107, + "step": 17585 + }, + { + "epoch": 1.68, + "grad_norm": 0.35676584923705623, + "learning_rate": 1.3462128130043462e-05, + "loss": 1.0824, + "step": 17586 + }, + { + "epoch": 1.68, + "grad_norm": 0.2676715221904577, + "learning_rate": 1.3454201774698228e-05, + "loss": 1.0498, + "step": 17587 + }, + { + "epoch": 1.68, + "grad_norm": 0.3572432988134186, + "learning_rate": 1.3446277585215994e-05, + "loss": 1.01, + "step": 17588 + }, + { + "epoch": 1.68, + "grad_norm": 0.30139837059354063, + "learning_rate": 1.343835556179508e-05, + "loss": 0.971, + "step": 17589 + }, + { + "epoch": 1.68, + "grad_norm": 0.2585424022065951, + "learning_rate": 1.3430435704633693e-05, + "loss": 1.1251, + "step": 17590 + }, + { + "epoch": 1.68, + "grad_norm": 0.3105330899647823, + "learning_rate": 1.3422518013930085e-05, + "loss": 1.0094, + "step": 17591 + }, + { + "epoch": 1.68, + "grad_norm": 0.32833084755655323, + "learning_rate": 1.3414602489882355e-05, + "loss": 0.9781, + "step": 17592 + }, + { + "epoch": 1.68, + "grad_norm": 0.3480348624523742, + "learning_rate": 1.3406689132688633e-05, + "loss": 1.131, + "step": 17593 + }, + { + "epoch": 1.68, + "grad_norm": 0.3033797039719529, + "learning_rate": 1.3398777942546891e-05, + "loss": 0.8906, + "step": 17594 + }, + { + "epoch": 1.68, + "grad_norm": 0.34046708868714937, + "learning_rate": 1.339086891965521e-05, + "loss": 1.011, + "step": 17595 + }, + { + "epoch": 1.68, + "grad_norm": 0.3071797257816816, + "learning_rate": 1.3382962064211447e-05, + "loss": 1.1265, + "step": 17596 + }, + { + "epoch": 1.68, + "grad_norm": 0.34518886277979166, + "learning_rate": 1.3375057376413502e-05, + "loss": 0.9147, + "step": 17597 + }, + { + "epoch": 1.68, + "grad_norm": 0.3200381988803872, + "learning_rate": 1.3367154856459174e-05, + "loss": 0.9058, + "step": 17598 + }, + { + "epoch": 1.68, + "grad_norm": 0.3204101847029496, + "learning_rate": 1.335925450454627e-05, + "loss": 1.0234, + "step": 17599 + }, + { + "epoch": 1.68, + "grad_norm": 0.30934609467224505, + "learning_rate": 1.3351356320872422e-05, + "loss": 1.011, + "step": 17600 + }, + { + "epoch": 1.68, + "grad_norm": 0.2983156628949313, + "learning_rate": 1.334346030563538e-05, + "loss": 0.9584, + "step": 17601 + }, + { + "epoch": 1.68, + "grad_norm": 0.3008112262454905, + "learning_rate": 1.3335566459032689e-05, + "loss": 1.0337, + "step": 17602 + }, + { + "epoch": 1.68, + "grad_norm": 0.309929487892796, + "learning_rate": 1.3327674781261945e-05, + "loss": 1.0009, + "step": 17603 + }, + { + "epoch": 1.68, + "grad_norm": 0.37643034221577176, + "learning_rate": 1.331978527252058e-05, + "loss": 0.9633, + "step": 17604 + }, + { + "epoch": 1.68, + "grad_norm": 0.28207206725671274, + "learning_rate": 1.3311897933006057e-05, + "loss": 1.0346, + "step": 17605 + }, + { + "epoch": 1.68, + "grad_norm": 0.3131411727432425, + "learning_rate": 1.3304012762915807e-05, + "loss": 1.1111, + "step": 17606 + }, + { + "epoch": 1.68, + "grad_norm": 0.28508842286533587, + "learning_rate": 1.3296129762447084e-05, + "loss": 0.9163, + "step": 17607 + }, + { + "epoch": 1.68, + "grad_norm": 0.3586881320189459, + "learning_rate": 1.328824893179722e-05, + "loss": 0.9834, + "step": 17608 + }, + { + "epoch": 1.68, + "grad_norm": 0.29172087143875175, + "learning_rate": 1.3280370271163422e-05, + "loss": 0.9775, + "step": 17609 + }, + { + "epoch": 1.68, + "grad_norm": 0.30018765323860697, + "learning_rate": 1.3272493780742878e-05, + "loss": 0.9323, + "step": 17610 + }, + { + "epoch": 1.68, + "grad_norm": 0.3107691449332591, + "learning_rate": 1.3264619460732652e-05, + "loss": 1.0404, + "step": 17611 + }, + { + "epoch": 1.68, + "grad_norm": 0.3258125230159753, + "learning_rate": 1.3256747311329875e-05, + "loss": 0.9584, + "step": 17612 + }, + { + "epoch": 1.69, + "grad_norm": 0.28322802850938544, + "learning_rate": 1.3248877332731479e-05, + "loss": 1.0275, + "step": 17613 + }, + { + "epoch": 1.69, + "grad_norm": 0.3141363762650755, + "learning_rate": 1.3241009525134451e-05, + "loss": 1.1189, + "step": 17614 + }, + { + "epoch": 1.69, + "grad_norm": 0.3198525383605891, + "learning_rate": 1.3233143888735689e-05, + "loss": 1.0332, + "step": 17615 + }, + { + "epoch": 1.69, + "grad_norm": 0.3136122084739105, + "learning_rate": 1.3225280423732046e-05, + "loss": 0.9787, + "step": 17616 + }, + { + "epoch": 1.69, + "grad_norm": 0.3031603493050634, + "learning_rate": 1.3217419130320275e-05, + "loss": 1.134, + "step": 17617 + }, + { + "epoch": 1.69, + "grad_norm": 0.3058136221639915, + "learning_rate": 1.3209560008697163e-05, + "loss": 0.9505, + "step": 17618 + }, + { + "epoch": 1.69, + "grad_norm": 0.29203124393138913, + "learning_rate": 1.3201703059059322e-05, + "loss": 0.9889, + "step": 17619 + }, + { + "epoch": 1.69, + "grad_norm": 0.3483666774510711, + "learning_rate": 1.3193848281603449e-05, + "loss": 1.1108, + "step": 17620 + }, + { + "epoch": 1.69, + "grad_norm": 0.31154124407808126, + "learning_rate": 1.318599567652603e-05, + "loss": 1.0863, + "step": 17621 + }, + { + "epoch": 1.69, + "grad_norm": 0.2914492984507073, + "learning_rate": 1.3178145244023665e-05, + "loss": 1.1228, + "step": 17622 + }, + { + "epoch": 1.69, + "grad_norm": 0.35019192809457916, + "learning_rate": 1.3170296984292774e-05, + "loss": 1.3322, + "step": 17623 + }, + { + "epoch": 1.69, + "grad_norm": 0.3151271215107481, + "learning_rate": 1.3162450897529777e-05, + "loss": 1.1499, + "step": 17624 + }, + { + "epoch": 1.69, + "grad_norm": 0.33693579743277613, + "learning_rate": 1.3154606983931006e-05, + "loss": 0.9992, + "step": 17625 + }, + { + "epoch": 1.69, + "grad_norm": 0.3212484216066483, + "learning_rate": 1.3146765243692782e-05, + "loss": 1.1508, + "step": 17626 + }, + { + "epoch": 1.69, + "grad_norm": 0.2924708522394135, + "learning_rate": 1.3138925677011338e-05, + "loss": 1.0773, + "step": 17627 + }, + { + "epoch": 1.69, + "grad_norm": 0.3000552446500743, + "learning_rate": 1.3131088284082893e-05, + "loss": 1.0078, + "step": 17628 + }, + { + "epoch": 1.69, + "grad_norm": 0.29762253899844904, + "learning_rate": 1.3123253065103535e-05, + "loss": 1.0115, + "step": 17629 + }, + { + "epoch": 1.69, + "grad_norm": 0.3279477824955888, + "learning_rate": 1.3115420020269364e-05, + "loss": 0.9715, + "step": 17630 + }, + { + "epoch": 1.69, + "grad_norm": 0.3073756796628143, + "learning_rate": 1.3107589149776445e-05, + "loss": 0.9642, + "step": 17631 + }, + { + "epoch": 1.69, + "grad_norm": 0.34365386553481403, + "learning_rate": 1.3099760453820687e-05, + "loss": 1.0092, + "step": 17632 + }, + { + "epoch": 1.69, + "grad_norm": 0.33759689969721873, + "learning_rate": 1.3091933932598055e-05, + "loss": 1.0594, + "step": 17633 + }, + { + "epoch": 1.69, + "grad_norm": 0.3059639971756752, + "learning_rate": 1.308410958630436e-05, + "loss": 0.9535, + "step": 17634 + }, + { + "epoch": 1.69, + "grad_norm": 0.30952884255222385, + "learning_rate": 1.307628741513549e-05, + "loss": 1.0624, + "step": 17635 + }, + { + "epoch": 1.69, + "grad_norm": 0.28980613195377924, + "learning_rate": 1.3068467419287145e-05, + "loss": 0.995, + "step": 17636 + }, + { + "epoch": 1.69, + "grad_norm": 0.35660062125448727, + "learning_rate": 1.3060649598955043e-05, + "loss": 1.0309, + "step": 17637 + }, + { + "epoch": 1.69, + "grad_norm": 0.33559359980681824, + "learning_rate": 1.3052833954334808e-05, + "loss": 1.0055, + "step": 17638 + }, + { + "epoch": 1.69, + "grad_norm": 0.32787842864228206, + "learning_rate": 1.3045020485622072e-05, + "loss": 1.0261, + "step": 17639 + }, + { + "epoch": 1.69, + "grad_norm": 0.3438827987615655, + "learning_rate": 1.3037209193012323e-05, + "loss": 0.9238, + "step": 17640 + }, + { + "epoch": 1.69, + "grad_norm": 0.3287275267943713, + "learning_rate": 1.302940007670107e-05, + "loss": 1.0058, + "step": 17641 + }, + { + "epoch": 1.69, + "grad_norm": 0.3365645327570122, + "learning_rate": 1.3021593136883736e-05, + "loss": 1.0635, + "step": 17642 + }, + { + "epoch": 1.69, + "grad_norm": 0.3328774347060007, + "learning_rate": 1.301378837375572e-05, + "loss": 0.9598, + "step": 17643 + }, + { + "epoch": 1.69, + "grad_norm": 0.2964572390954562, + "learning_rate": 1.3005985787512299e-05, + "loss": 0.9995, + "step": 17644 + }, + { + "epoch": 1.69, + "grad_norm": 0.32052077408867924, + "learning_rate": 1.299818537834876e-05, + "loss": 1.025, + "step": 17645 + }, + { + "epoch": 1.69, + "grad_norm": 0.33414855575142083, + "learning_rate": 1.2990387146460326e-05, + "loss": 1.0287, + "step": 17646 + }, + { + "epoch": 1.69, + "grad_norm": 0.4626702366906077, + "learning_rate": 1.2982591092042117e-05, + "loss": 1.0646, + "step": 17647 + }, + { + "epoch": 1.69, + "grad_norm": 0.3085761984215263, + "learning_rate": 1.2974797215289258e-05, + "loss": 1.0863, + "step": 17648 + }, + { + "epoch": 1.69, + "grad_norm": 0.324545153884794, + "learning_rate": 1.296700551639679e-05, + "loss": 1.0072, + "step": 17649 + }, + { + "epoch": 1.69, + "grad_norm": 0.25475043333923386, + "learning_rate": 1.2959215995559736e-05, + "loss": 0.9045, + "step": 17650 + }, + { + "epoch": 1.69, + "grad_norm": 0.31453274775471723, + "learning_rate": 1.2951428652972975e-05, + "loss": 1.0419, + "step": 17651 + }, + { + "epoch": 1.69, + "grad_norm": 0.3460844565651267, + "learning_rate": 1.2943643488831437e-05, + "loss": 0.9972, + "step": 17652 + }, + { + "epoch": 1.69, + "grad_norm": 0.3472817434332906, + "learning_rate": 1.2935860503329922e-05, + "loss": 1.0798, + "step": 17653 + }, + { + "epoch": 1.69, + "grad_norm": 0.3222914346624616, + "learning_rate": 1.2928079696663232e-05, + "loss": 1.0518, + "step": 17654 + }, + { + "epoch": 1.69, + "grad_norm": 0.3206660923624734, + "learning_rate": 1.2920301069026031e-05, + "loss": 1.0111, + "step": 17655 + }, + { + "epoch": 1.69, + "grad_norm": 0.2964059878542902, + "learning_rate": 1.2912524620613054e-05, + "loss": 0.852, + "step": 17656 + }, + { + "epoch": 1.69, + "grad_norm": 0.3657719029280144, + "learning_rate": 1.2904750351618878e-05, + "loss": 0.9304, + "step": 17657 + }, + { + "epoch": 1.69, + "grad_norm": 0.3238008567948505, + "learning_rate": 1.2896978262238068e-05, + "loss": 1.0982, + "step": 17658 + }, + { + "epoch": 1.69, + "grad_norm": 0.36989417636051974, + "learning_rate": 1.2889208352665106e-05, + "loss": 1.0993, + "step": 17659 + }, + { + "epoch": 1.69, + "grad_norm": 0.32501723427551205, + "learning_rate": 1.2881440623094466e-05, + "loss": 0.9408, + "step": 17660 + }, + { + "epoch": 1.69, + "grad_norm": 0.32800997944051385, + "learning_rate": 1.2873675073720493e-05, + "loss": 1.0997, + "step": 17661 + }, + { + "epoch": 1.69, + "grad_norm": 0.3117793495408782, + "learning_rate": 1.2865911704737599e-05, + "loss": 1.0525, + "step": 17662 + }, + { + "epoch": 1.69, + "grad_norm": 0.36302629037515277, + "learning_rate": 1.2858150516339995e-05, + "loss": 1.0474, + "step": 17663 + }, + { + "epoch": 1.69, + "grad_norm": 0.3122492739220237, + "learning_rate": 1.2850391508721971e-05, + "loss": 1.096, + "step": 17664 + }, + { + "epoch": 1.69, + "grad_norm": 0.326665222700803, + "learning_rate": 1.2842634682077647e-05, + "loss": 0.9213, + "step": 17665 + }, + { + "epoch": 1.69, + "grad_norm": 0.30031958532579145, + "learning_rate": 1.2834880036601171e-05, + "loss": 0.9924, + "step": 17666 + }, + { + "epoch": 1.69, + "grad_norm": 0.30717890874229237, + "learning_rate": 1.282712757248662e-05, + "loss": 1.0364, + "step": 17667 + }, + { + "epoch": 1.69, + "grad_norm": 0.3115681621369651, + "learning_rate": 1.281937728992797e-05, + "loss": 0.9945, + "step": 17668 + }, + { + "epoch": 1.69, + "grad_norm": 0.3456332879946246, + "learning_rate": 1.281162918911919e-05, + "loss": 1.0475, + "step": 17669 + }, + { + "epoch": 1.69, + "grad_norm": 0.3182410459236312, + "learning_rate": 1.280388327025418e-05, + "loss": 1.0905, + "step": 17670 + }, + { + "epoch": 1.69, + "grad_norm": 0.2902488992802202, + "learning_rate": 1.279613953352683e-05, + "loss": 0.9478, + "step": 17671 + }, + { + "epoch": 1.69, + "grad_norm": 0.3737620876275147, + "learning_rate": 1.2788397979130862e-05, + "loss": 1.0944, + "step": 17672 + }, + { + "epoch": 1.69, + "grad_norm": 0.315785801064412, + "learning_rate": 1.2780658607260066e-05, + "loss": 0.9454, + "step": 17673 + }, + { + "epoch": 1.69, + "grad_norm": 0.3467080349475676, + "learning_rate": 1.2772921418108085e-05, + "loss": 0.9836, + "step": 17674 + }, + { + "epoch": 1.69, + "grad_norm": 0.3337184616057167, + "learning_rate": 1.2765186411868557e-05, + "loss": 0.978, + "step": 17675 + }, + { + "epoch": 1.69, + "grad_norm": 0.33375276821857963, + "learning_rate": 1.275745358873508e-05, + "loss": 1.0694, + "step": 17676 + }, + { + "epoch": 1.69, + "grad_norm": 0.32720021041006936, + "learning_rate": 1.2749722948901166e-05, + "loss": 1.0555, + "step": 17677 + }, + { + "epoch": 1.69, + "grad_norm": 0.3515393938339399, + "learning_rate": 1.274199449256025e-05, + "loss": 1.004, + "step": 17678 + }, + { + "epoch": 1.69, + "grad_norm": 0.29932736845957725, + "learning_rate": 1.2734268219905788e-05, + "loss": 0.9526, + "step": 17679 + }, + { + "epoch": 1.69, + "grad_norm": 0.31665490151817344, + "learning_rate": 1.272654413113108e-05, + "loss": 1.0253, + "step": 17680 + }, + { + "epoch": 1.69, + "grad_norm": 0.30810930522161517, + "learning_rate": 1.2718822226429484e-05, + "loss": 1.0307, + "step": 17681 + }, + { + "epoch": 1.69, + "grad_norm": 0.30414275928618945, + "learning_rate": 1.2711102505994165e-05, + "loss": 0.9912, + "step": 17682 + }, + { + "epoch": 1.69, + "grad_norm": 0.3279966020406129, + "learning_rate": 1.2703384970018407e-05, + "loss": 1.0214, + "step": 17683 + }, + { + "epoch": 1.69, + "grad_norm": 0.31993590227132784, + "learning_rate": 1.2695669618695294e-05, + "loss": 1.0879, + "step": 17684 + }, + { + "epoch": 1.69, + "grad_norm": 0.31147867603488705, + "learning_rate": 1.268795645221793e-05, + "loss": 1.0633, + "step": 17685 + }, + { + "epoch": 1.69, + "grad_norm": 0.30902127489706027, + "learning_rate": 1.2680245470779307e-05, + "loss": 0.9579, + "step": 17686 + }, + { + "epoch": 1.69, + "grad_norm": 0.27081525177922466, + "learning_rate": 1.2672536674572422e-05, + "loss": 1.0467, + "step": 17687 + }, + { + "epoch": 1.69, + "grad_norm": 0.33096275475192816, + "learning_rate": 1.266483006379019e-05, + "loss": 1.021, + "step": 17688 + }, + { + "epoch": 1.69, + "grad_norm": 0.29454818867181615, + "learning_rate": 1.26571256386255e-05, + "loss": 1.0636, + "step": 17689 + }, + { + "epoch": 1.69, + "grad_norm": 0.2794259128130045, + "learning_rate": 1.264942339927111e-05, + "loss": 1.0657, + "step": 17690 + }, + { + "epoch": 1.69, + "grad_norm": 0.9692882987405552, + "learning_rate": 1.2641723345919786e-05, + "loss": 1.4177, + "step": 17691 + }, + { + "epoch": 1.69, + "grad_norm": 0.3424760081875002, + "learning_rate": 1.2634025478764277e-05, + "loss": 1.0592, + "step": 17692 + }, + { + "epoch": 1.69, + "grad_norm": 0.30428594327408065, + "learning_rate": 1.262632979799715e-05, + "loss": 1.0783, + "step": 17693 + }, + { + "epoch": 1.69, + "grad_norm": 0.3008776408434387, + "learning_rate": 1.261863630381106e-05, + "loss": 0.8895, + "step": 17694 + }, + { + "epoch": 1.69, + "grad_norm": 0.3252361504535194, + "learning_rate": 1.2610944996398476e-05, + "loss": 1.0162, + "step": 17695 + }, + { + "epoch": 1.69, + "grad_norm": 0.3067283829176, + "learning_rate": 1.2603255875951959e-05, + "loss": 1.0747, + "step": 17696 + }, + { + "epoch": 1.69, + "grad_norm": 0.34671594865024863, + "learning_rate": 1.259556894266386e-05, + "loss": 1.006, + "step": 17697 + }, + { + "epoch": 1.69, + "grad_norm": 0.32522232301248377, + "learning_rate": 1.2587884196726607e-05, + "loss": 1.0281, + "step": 17698 + }, + { + "epoch": 1.69, + "grad_norm": 0.2783448725388084, + "learning_rate": 1.2580201638332467e-05, + "loss": 0.9501, + "step": 17699 + }, + { + "epoch": 1.69, + "grad_norm": 0.3014682448734001, + "learning_rate": 1.2572521267673742e-05, + "loss": 1.1178, + "step": 17700 + }, + { + "epoch": 1.69, + "grad_norm": 0.3035460036311199, + "learning_rate": 1.256484308494259e-05, + "loss": 1.0303, + "step": 17701 + }, + { + "epoch": 1.69, + "grad_norm": 0.3738283161575054, + "learning_rate": 1.25571670903312e-05, + "loss": 1.049, + "step": 17702 + }, + { + "epoch": 1.69, + "grad_norm": 0.34923857219676907, + "learning_rate": 1.2549493284031665e-05, + "loss": 0.9874, + "step": 17703 + }, + { + "epoch": 1.69, + "grad_norm": 0.31287545463928595, + "learning_rate": 1.2541821666236031e-05, + "loss": 0.991, + "step": 17704 + }, + { + "epoch": 1.69, + "grad_norm": 0.327175018085624, + "learning_rate": 1.2534152237136253e-05, + "loss": 1.1745, + "step": 17705 + }, + { + "epoch": 1.69, + "grad_norm": 0.3052979164554808, + "learning_rate": 1.2526484996924315e-05, + "loss": 0.9743, + "step": 17706 + }, + { + "epoch": 1.69, + "grad_norm": 0.34088448254449144, + "learning_rate": 1.2518819945792037e-05, + "loss": 1.0196, + "step": 17707 + }, + { + "epoch": 1.69, + "grad_norm": 0.28467732405302393, + "learning_rate": 1.251115708393128e-05, + "loss": 1.0658, + "step": 17708 + }, + { + "epoch": 1.69, + "grad_norm": 0.31407190795481377, + "learning_rate": 1.2503496411533789e-05, + "loss": 0.9137, + "step": 17709 + }, + { + "epoch": 1.69, + "grad_norm": 0.2833121200738783, + "learning_rate": 1.2495837928791321e-05, + "loss": 0.9986, + "step": 17710 + }, + { + "epoch": 1.69, + "grad_norm": 0.2592670649308681, + "learning_rate": 1.2488181635895479e-05, + "loss": 1.0758, + "step": 17711 + }, + { + "epoch": 1.69, + "grad_norm": 0.3022968042801004, + "learning_rate": 1.2480527533037889e-05, + "loss": 1.0812, + "step": 17712 + }, + { + "epoch": 1.69, + "grad_norm": 0.2928023768215186, + "learning_rate": 1.2472875620410118e-05, + "loss": 1.0327, + "step": 17713 + }, + { + "epoch": 1.69, + "grad_norm": 0.3545927458096583, + "learning_rate": 1.2465225898203624e-05, + "loss": 0.9797, + "step": 17714 + }, + { + "epoch": 1.69, + "grad_norm": 0.36694255897071026, + "learning_rate": 1.24575783666099e-05, + "loss": 1.0067, + "step": 17715 + }, + { + "epoch": 1.69, + "grad_norm": 0.35257391902137747, + "learning_rate": 1.2449933025820237e-05, + "loss": 0.9809, + "step": 17716 + }, + { + "epoch": 1.7, + "grad_norm": 0.30179787219933435, + "learning_rate": 1.244228987602607e-05, + "loss": 0.9815, + "step": 17717 + }, + { + "epoch": 1.7, + "grad_norm": 0.35908201870501394, + "learning_rate": 1.2434648917418613e-05, + "loss": 0.9969, + "step": 17718 + }, + { + "epoch": 1.7, + "grad_norm": 0.30443670841994364, + "learning_rate": 1.2427010150189111e-05, + "loss": 1.0542, + "step": 17719 + }, + { + "epoch": 1.7, + "grad_norm": 0.2792050746247292, + "learning_rate": 1.2419373574528704e-05, + "loss": 1.0672, + "step": 17720 + }, + { + "epoch": 1.7, + "grad_norm": 0.32014381453242396, + "learning_rate": 1.2411739190628536e-05, + "loss": 1.0236, + "step": 17721 + }, + { + "epoch": 1.7, + "grad_norm": 0.3192949293788855, + "learning_rate": 1.2404106998679587e-05, + "loss": 1.0865, + "step": 17722 + }, + { + "epoch": 1.7, + "grad_norm": 0.34400067861299183, + "learning_rate": 1.2396476998872975e-05, + "loss": 1.0383, + "step": 17723 + }, + { + "epoch": 1.7, + "grad_norm": 0.3334620649387309, + "learning_rate": 1.2388849191399544e-05, + "loss": 1.0986, + "step": 17724 + }, + { + "epoch": 1.7, + "grad_norm": 0.3511750447408061, + "learning_rate": 1.2381223576450251e-05, + "loss": 1.0005, + "step": 17725 + }, + { + "epoch": 1.7, + "grad_norm": 0.3126514990631652, + "learning_rate": 1.2373600154215892e-05, + "loss": 1.0583, + "step": 17726 + }, + { + "epoch": 1.7, + "grad_norm": 0.2919450186827245, + "learning_rate": 1.2365978924887267e-05, + "loss": 0.9651, + "step": 17727 + }, + { + "epoch": 1.7, + "grad_norm": 0.32726781620651296, + "learning_rate": 1.235835988865508e-05, + "loss": 1.0659, + "step": 17728 + }, + { + "epoch": 1.7, + "grad_norm": 0.33959317339840955, + "learning_rate": 1.2350743045710022e-05, + "loss": 0.8496, + "step": 17729 + }, + { + "epoch": 1.7, + "grad_norm": 0.3631028510811403, + "learning_rate": 1.2343128396242698e-05, + "loss": 1.0136, + "step": 17730 + }, + { + "epoch": 1.7, + "grad_norm": 0.2912587023661965, + "learning_rate": 1.2335515940443698e-05, + "loss": 1.0133, + "step": 17731 + }, + { + "epoch": 1.7, + "grad_norm": 0.29383643124138414, + "learning_rate": 1.2327905678503492e-05, + "loss": 1.183, + "step": 17732 + }, + { + "epoch": 1.7, + "grad_norm": 0.33376482066018504, + "learning_rate": 1.2320297610612541e-05, + "loss": 0.9313, + "step": 17733 + }, + { + "epoch": 1.7, + "grad_norm": 0.2590262195334323, + "learning_rate": 1.2312691736961268e-05, + "loss": 1.0427, + "step": 17734 + }, + { + "epoch": 1.7, + "grad_norm": 0.30585182576825715, + "learning_rate": 1.2305088057739977e-05, + "loss": 1.0182, + "step": 17735 + }, + { + "epoch": 1.7, + "grad_norm": 0.3512994033393, + "learning_rate": 1.229748657313896e-05, + "loss": 1.0599, + "step": 17736 + }, + { + "epoch": 1.7, + "grad_norm": 0.30136393044099646, + "learning_rate": 1.2289887283348477e-05, + "loss": 1.082, + "step": 17737 + }, + { + "epoch": 1.7, + "grad_norm": 0.26908916541708294, + "learning_rate": 1.2282290188558698e-05, + "loss": 0.8871, + "step": 17738 + }, + { + "epoch": 1.7, + "grad_norm": 0.33378111488391343, + "learning_rate": 1.2274695288959714e-05, + "loss": 1.1026, + "step": 17739 + }, + { + "epoch": 1.7, + "grad_norm": 0.30466321487812165, + "learning_rate": 1.2267102584741642e-05, + "loss": 1.0056, + "step": 17740 + }, + { + "epoch": 1.7, + "grad_norm": 0.32028272636581917, + "learning_rate": 1.2259512076094448e-05, + "loss": 1.0345, + "step": 17741 + }, + { + "epoch": 1.7, + "grad_norm": 0.30555405749522896, + "learning_rate": 1.2251923763208117e-05, + "loss": 1.034, + "step": 17742 + }, + { + "epoch": 1.7, + "grad_norm": 0.28897310062097614, + "learning_rate": 1.2244337646272508e-05, + "loss": 1.0222, + "step": 17743 + }, + { + "epoch": 1.7, + "grad_norm": 0.3347236778423393, + "learning_rate": 1.2236753725477535e-05, + "loss": 0.9311, + "step": 17744 + }, + { + "epoch": 1.7, + "grad_norm": 0.3095665772661041, + "learning_rate": 1.2229172001012944e-05, + "loss": 1.081, + "step": 17745 + }, + { + "epoch": 1.7, + "grad_norm": 0.32179125701045835, + "learning_rate": 1.2221592473068499e-05, + "loss": 0.9598, + "step": 17746 + }, + { + "epoch": 1.7, + "grad_norm": 0.309380077961645, + "learning_rate": 1.2214015141833857e-05, + "loss": 1.0369, + "step": 17747 + }, + { + "epoch": 1.7, + "grad_norm": 0.32262205964512425, + "learning_rate": 1.2206440007498654e-05, + "loss": 1.1613, + "step": 17748 + }, + { + "epoch": 1.7, + "grad_norm": 0.2794689693908111, + "learning_rate": 1.2198867070252463e-05, + "loss": 1.0816, + "step": 17749 + }, + { + "epoch": 1.7, + "grad_norm": 0.35699703162369006, + "learning_rate": 1.219129633028483e-05, + "loss": 1.0826, + "step": 17750 + }, + { + "epoch": 1.7, + "grad_norm": 0.30058824702038944, + "learning_rate": 1.218372778778516e-05, + "loss": 0.979, + "step": 17751 + }, + { + "epoch": 1.7, + "grad_norm": 0.30434997166364525, + "learning_rate": 1.2176161442942902e-05, + "loss": 0.9971, + "step": 17752 + }, + { + "epoch": 1.7, + "grad_norm": 0.2877369655829467, + "learning_rate": 1.2168597295947426e-05, + "loss": 0.9084, + "step": 17753 + }, + { + "epoch": 1.7, + "grad_norm": 0.3011606025873177, + "learning_rate": 1.216103534698797e-05, + "loss": 1.1145, + "step": 17754 + }, + { + "epoch": 1.7, + "grad_norm": 0.328607961922337, + "learning_rate": 1.2153475596253839e-05, + "loss": 0.9905, + "step": 17755 + }, + { + "epoch": 1.7, + "grad_norm": 0.341317673567557, + "learning_rate": 1.2145918043934146e-05, + "loss": 1.0326, + "step": 17756 + }, + { + "epoch": 1.7, + "grad_norm": 0.30645602012186474, + "learning_rate": 1.2138362690218107e-05, + "loss": 0.8936, + "step": 17757 + }, + { + "epoch": 1.7, + "grad_norm": 0.337370464830551, + "learning_rate": 1.213080953529474e-05, + "loss": 1.0297, + "step": 17758 + }, + { + "epoch": 1.7, + "grad_norm": 0.3473685681547159, + "learning_rate": 1.2123258579353114e-05, + "loss": 1.0005, + "step": 17759 + }, + { + "epoch": 1.7, + "grad_norm": 0.28946701121488677, + "learning_rate": 1.2115709822582145e-05, + "loss": 1.0194, + "step": 17760 + }, + { + "epoch": 1.7, + "grad_norm": 0.2961614237252617, + "learning_rate": 1.210816326517079e-05, + "loss": 1.0226, + "step": 17761 + }, + { + "epoch": 1.7, + "grad_norm": 0.2699254021041901, + "learning_rate": 1.2100618907307871e-05, + "loss": 0.9661, + "step": 17762 + }, + { + "epoch": 1.7, + "grad_norm": 0.2832981808677624, + "learning_rate": 1.2093076749182197e-05, + "loss": 0.8768, + "step": 17763 + }, + { + "epoch": 1.7, + "grad_norm": 0.32175950583008345, + "learning_rate": 1.2085536790982532e-05, + "loss": 1.0212, + "step": 17764 + }, + { + "epoch": 1.7, + "grad_norm": 0.27545511724300425, + "learning_rate": 1.2077999032897592e-05, + "loss": 0.9891, + "step": 17765 + }, + { + "epoch": 1.7, + "grad_norm": 0.2932110623984763, + "learning_rate": 1.2070463475115945e-05, + "loss": 0.9512, + "step": 17766 + }, + { + "epoch": 1.7, + "grad_norm": 0.3150900444946918, + "learning_rate": 1.2062930117826243e-05, + "loss": 0.9905, + "step": 17767 + }, + { + "epoch": 1.7, + "grad_norm": 0.2817493885039847, + "learning_rate": 1.2055398961216957e-05, + "loss": 1.057, + "step": 17768 + }, + { + "epoch": 1.7, + "grad_norm": 0.3017367380760184, + "learning_rate": 1.204787000547658e-05, + "loss": 0.9743, + "step": 17769 + }, + { + "epoch": 1.7, + "grad_norm": 0.3424921977913688, + "learning_rate": 1.204034325079354e-05, + "loss": 1.008, + "step": 17770 + }, + { + "epoch": 1.7, + "grad_norm": 0.35144095526406594, + "learning_rate": 1.2032818697356207e-05, + "loss": 0.9891, + "step": 17771 + }, + { + "epoch": 1.7, + "grad_norm": 0.3126287804083134, + "learning_rate": 1.2025296345352843e-05, + "loss": 1.1216, + "step": 17772 + }, + { + "epoch": 1.7, + "grad_norm": 0.32078245577291453, + "learning_rate": 1.2017776194971741e-05, + "loss": 1.071, + "step": 17773 + }, + { + "epoch": 1.7, + "grad_norm": 0.31242390886166754, + "learning_rate": 1.2010258246401096e-05, + "loss": 1.0, + "step": 17774 + }, + { + "epoch": 1.7, + "grad_norm": 0.3215103539077436, + "learning_rate": 1.2002742499829022e-05, + "loss": 1.0944, + "step": 17775 + }, + { + "epoch": 1.7, + "grad_norm": 0.2905339298360969, + "learning_rate": 1.1995228955443639e-05, + "loss": 0.9536, + "step": 17776 + }, + { + "epoch": 1.7, + "grad_norm": 0.31315932092475723, + "learning_rate": 1.1987717613432924e-05, + "loss": 1.0001, + "step": 17777 + }, + { + "epoch": 1.7, + "grad_norm": 0.30054845624287085, + "learning_rate": 1.1980208473984922e-05, + "loss": 0.9958, + "step": 17778 + }, + { + "epoch": 1.7, + "grad_norm": 0.32716869750067096, + "learning_rate": 1.1972701537287512e-05, + "loss": 1.0234, + "step": 17779 + }, + { + "epoch": 1.7, + "grad_norm": 0.3261967204020199, + "learning_rate": 1.1965196803528577e-05, + "loss": 0.9819, + "step": 17780 + }, + { + "epoch": 1.7, + "grad_norm": 0.3097083139402055, + "learning_rate": 1.1957694272895914e-05, + "loss": 1.0904, + "step": 17781 + }, + { + "epoch": 1.7, + "grad_norm": 0.2744578671643849, + "learning_rate": 1.1950193945577293e-05, + "loss": 1.0569, + "step": 17782 + }, + { + "epoch": 1.7, + "grad_norm": 0.30469294468933683, + "learning_rate": 1.1942695821760364e-05, + "loss": 1.0927, + "step": 17783 + }, + { + "epoch": 1.7, + "grad_norm": 0.34180301452780143, + "learning_rate": 1.1935199901632865e-05, + "loss": 1.1384, + "step": 17784 + }, + { + "epoch": 1.7, + "grad_norm": 0.28248758593251033, + "learning_rate": 1.1927706185382304e-05, + "loss": 0.9795, + "step": 17785 + }, + { + "epoch": 1.7, + "grad_norm": 0.240964736941893, + "learning_rate": 1.1920214673196272e-05, + "loss": 0.9609, + "step": 17786 + }, + { + "epoch": 1.7, + "grad_norm": 0.26223003077979146, + "learning_rate": 1.1912725365262212e-05, + "loss": 1.1035, + "step": 17787 + }, + { + "epoch": 1.7, + "grad_norm": 0.32874586305555153, + "learning_rate": 1.190523826176757e-05, + "loss": 1.0465, + "step": 17788 + }, + { + "epoch": 1.7, + "grad_norm": 0.3127118718142191, + "learning_rate": 1.1897753362899689e-05, + "loss": 1.0194, + "step": 17789 + }, + { + "epoch": 1.7, + "grad_norm": 0.29765726585269436, + "learning_rate": 1.1890270668845894e-05, + "loss": 1.0159, + "step": 17790 + }, + { + "epoch": 1.7, + "grad_norm": 0.3531411248600686, + "learning_rate": 1.1882790179793456e-05, + "loss": 1.1063, + "step": 17791 + }, + { + "epoch": 1.7, + "grad_norm": 0.3028771355851253, + "learning_rate": 1.1875311895929598e-05, + "loss": 0.873, + "step": 17792 + }, + { + "epoch": 1.7, + "grad_norm": 0.3124043267330201, + "learning_rate": 1.186783581744142e-05, + "loss": 1.0499, + "step": 17793 + }, + { + "epoch": 1.7, + "grad_norm": 0.3583184732817829, + "learning_rate": 1.186036194451604e-05, + "loss": 1.0131, + "step": 17794 + }, + { + "epoch": 1.7, + "grad_norm": 0.3071969252374364, + "learning_rate": 1.1852890277340512e-05, + "loss": 0.9862, + "step": 17795 + }, + { + "epoch": 1.7, + "grad_norm": 0.3329620954771214, + "learning_rate": 1.1845420816101792e-05, + "loss": 0.9578, + "step": 17796 + }, + { + "epoch": 1.7, + "grad_norm": 0.3158726989624777, + "learning_rate": 1.1837953560986815e-05, + "loss": 0.9776, + "step": 17797 + }, + { + "epoch": 1.7, + "grad_norm": 0.32371080666976737, + "learning_rate": 1.1830488512182448e-05, + "loss": 0.9134, + "step": 17798 + }, + { + "epoch": 1.7, + "grad_norm": 0.32612316170550776, + "learning_rate": 1.1823025669875554e-05, + "loss": 0.976, + "step": 17799 + }, + { + "epoch": 1.7, + "grad_norm": 0.3104936364104526, + "learning_rate": 1.1815565034252828e-05, + "loss": 0.98, + "step": 17800 + }, + { + "epoch": 1.7, + "grad_norm": 0.2903118650340204, + "learning_rate": 1.1808106605501034e-05, + "loss": 1.1053, + "step": 17801 + }, + { + "epoch": 1.7, + "grad_norm": 0.26427692141277154, + "learning_rate": 1.1800650383806776e-05, + "loss": 1.0356, + "step": 17802 + }, + { + "epoch": 1.7, + "grad_norm": 0.32566036971890383, + "learning_rate": 1.1793196369356696e-05, + "loss": 0.9568, + "step": 17803 + }, + { + "epoch": 1.7, + "grad_norm": 0.2692317079458073, + "learning_rate": 1.1785744562337276e-05, + "loss": 1.0871, + "step": 17804 + }, + { + "epoch": 1.7, + "grad_norm": 0.34033321802073363, + "learning_rate": 1.1778294962935077e-05, + "loss": 1.0727, + "step": 17805 + }, + { + "epoch": 1.7, + "grad_norm": 0.3302453343313747, + "learning_rate": 1.1770847571336474e-05, + "loss": 1.0121, + "step": 17806 + }, + { + "epoch": 1.7, + "grad_norm": 0.33603360474898547, + "learning_rate": 1.1763402387727873e-05, + "loss": 1.0754, + "step": 17807 + }, + { + "epoch": 1.7, + "grad_norm": 0.32853640150588687, + "learning_rate": 1.1755959412295569e-05, + "loss": 1.0992, + "step": 17808 + }, + { + "epoch": 1.7, + "grad_norm": 0.3456988197093645, + "learning_rate": 1.1748518645225859e-05, + "loss": 1.0268, + "step": 17809 + }, + { + "epoch": 1.7, + "grad_norm": 0.3329893191289446, + "learning_rate": 1.1741080086704881e-05, + "loss": 1.1522, + "step": 17810 + }, + { + "epoch": 1.7, + "grad_norm": 0.3248946717447016, + "learning_rate": 1.1733643736918898e-05, + "loss": 1.13, + "step": 17811 + }, + { + "epoch": 1.7, + "grad_norm": 0.3168387203949139, + "learning_rate": 1.1726209596053928e-05, + "loss": 1.0398, + "step": 17812 + }, + { + "epoch": 1.7, + "grad_norm": 0.3212207950008923, + "learning_rate": 1.1718777664296066e-05, + "loss": 1.0281, + "step": 17813 + }, + { + "epoch": 1.7, + "grad_norm": 0.29696618206019565, + "learning_rate": 1.1711347941831253e-05, + "loss": 0.9821, + "step": 17814 + }, + { + "epoch": 1.7, + "grad_norm": 0.32440451235839124, + "learning_rate": 1.1703920428845439e-05, + "loss": 1.0034, + "step": 17815 + }, + { + "epoch": 1.7, + "grad_norm": 0.33155628344344135, + "learning_rate": 1.1696495125524532e-05, + "loss": 1.0501, + "step": 17816 + }, + { + "epoch": 1.7, + "grad_norm": 0.3033949277991693, + "learning_rate": 1.1689072032054315e-05, + "loss": 1.0796, + "step": 17817 + }, + { + "epoch": 1.7, + "grad_norm": 0.28287634147614343, + "learning_rate": 1.1681651148620565e-05, + "loss": 1.0169, + "step": 17818 + }, + { + "epoch": 1.7, + "grad_norm": 0.34401781601311143, + "learning_rate": 1.1674232475409009e-05, + "loss": 0.8724, + "step": 17819 + }, + { + "epoch": 1.7, + "grad_norm": 0.3278431898685156, + "learning_rate": 1.1666816012605309e-05, + "loss": 1.0079, + "step": 17820 + }, + { + "epoch": 1.7, + "grad_norm": 0.331868420341758, + "learning_rate": 1.165940176039504e-05, + "loss": 0.9233, + "step": 17821 + }, + { + "epoch": 1.71, + "grad_norm": 0.3160563931802113, + "learning_rate": 1.1651989718963774e-05, + "loss": 1.0448, + "step": 17822 + }, + { + "epoch": 1.71, + "grad_norm": 0.3010417896711723, + "learning_rate": 1.1644579888496976e-05, + "loss": 1.0408, + "step": 17823 + }, + { + "epoch": 1.71, + "grad_norm": 0.3403982595860137, + "learning_rate": 1.1637172269180085e-05, + "loss": 1.0954, + "step": 17824 + }, + { + "epoch": 1.71, + "grad_norm": 0.3046948050133378, + "learning_rate": 1.162976686119851e-05, + "loss": 0.9649, + "step": 17825 + }, + { + "epoch": 1.71, + "grad_norm": 0.3246829354095784, + "learning_rate": 1.1622363664737569e-05, + "loss": 1.0072, + "step": 17826 + }, + { + "epoch": 1.71, + "grad_norm": 0.31717651966726684, + "learning_rate": 1.16149626799825e-05, + "loss": 1.0644, + "step": 17827 + }, + { + "epoch": 1.71, + "grad_norm": 0.3607264622595277, + "learning_rate": 1.1607563907118557e-05, + "loss": 1.0101, + "step": 17828 + }, + { + "epoch": 1.71, + "grad_norm": 0.2978868946893537, + "learning_rate": 1.160016734633086e-05, + "loss": 1.1007, + "step": 17829 + }, + { + "epoch": 1.71, + "grad_norm": 0.3192073055495015, + "learning_rate": 1.1592772997804546e-05, + "loss": 1.0511, + "step": 17830 + }, + { + "epoch": 1.71, + "grad_norm": 0.30601360873112665, + "learning_rate": 1.1585380861724615e-05, + "loss": 0.8562, + "step": 17831 + }, + { + "epoch": 1.71, + "grad_norm": 0.33686407298489274, + "learning_rate": 1.157799093827614e-05, + "loss": 1.1081, + "step": 17832 + }, + { + "epoch": 1.71, + "grad_norm": 0.333755526956051, + "learning_rate": 1.1570603227643995e-05, + "loss": 1.0376, + "step": 17833 + }, + { + "epoch": 1.71, + "grad_norm": 0.29574246477094956, + "learning_rate": 1.1563217730013065e-05, + "loss": 0.9347, + "step": 17834 + }, + { + "epoch": 1.71, + "grad_norm": 0.3743252283712165, + "learning_rate": 1.1555834445568226e-05, + "loss": 1.0924, + "step": 17835 + }, + { + "epoch": 1.71, + "grad_norm": 0.346087535292262, + "learning_rate": 1.1548453374494196e-05, + "loss": 0.9322, + "step": 17836 + }, + { + "epoch": 1.71, + "grad_norm": 0.30751461139551023, + "learning_rate": 1.1541074516975713e-05, + "loss": 0.9314, + "step": 17837 + }, + { + "epoch": 1.71, + "grad_norm": 0.29442121647126474, + "learning_rate": 1.1533697873197413e-05, + "loss": 1.0095, + "step": 17838 + }, + { + "epoch": 1.71, + "grad_norm": 0.34130124015937346, + "learning_rate": 1.1526323443343956e-05, + "loss": 1.0461, + "step": 17839 + }, + { + "epoch": 1.71, + "grad_norm": 0.2757443874558075, + "learning_rate": 1.1518951227599839e-05, + "loss": 1.0602, + "step": 17840 + }, + { + "epoch": 1.71, + "grad_norm": 0.30722011294611373, + "learning_rate": 1.1511581226149592e-05, + "loss": 0.9911, + "step": 17841 + }, + { + "epoch": 1.71, + "grad_norm": 0.3147179393864746, + "learning_rate": 1.1504213439177613e-05, + "loss": 0.9974, + "step": 17842 + }, + { + "epoch": 1.71, + "grad_norm": 0.28635104160849195, + "learning_rate": 1.1496847866868343e-05, + "loss": 0.9474, + "step": 17843 + }, + { + "epoch": 1.71, + "grad_norm": 0.29488821096404416, + "learning_rate": 1.1489484509406035e-05, + "loss": 1.0402, + "step": 17844 + }, + { + "epoch": 1.71, + "grad_norm": 0.346416333773624, + "learning_rate": 1.148212336697504e-05, + "loss": 0.9977, + "step": 17845 + }, + { + "epoch": 1.71, + "grad_norm": 0.3540677984969293, + "learning_rate": 1.1474764439759512e-05, + "loss": 1.0407, + "step": 17846 + }, + { + "epoch": 1.71, + "grad_norm": 0.31633239485149683, + "learning_rate": 1.146740772794367e-05, + "loss": 1.006, + "step": 17847 + }, + { + "epoch": 1.71, + "grad_norm": 0.3501579398685816, + "learning_rate": 1.1460053231711565e-05, + "loss": 1.0622, + "step": 17848 + }, + { + "epoch": 1.71, + "grad_norm": 0.3773868283685306, + "learning_rate": 1.1452700951247297e-05, + "loss": 0.969, + "step": 17849 + }, + { + "epoch": 1.71, + "grad_norm": 0.2769748681334888, + "learning_rate": 1.1445350886734819e-05, + "loss": 1.0719, + "step": 17850 + }, + { + "epoch": 1.71, + "grad_norm": 0.32429081333768445, + "learning_rate": 1.1438003038358091e-05, + "loss": 0.9898, + "step": 17851 + }, + { + "epoch": 1.71, + "grad_norm": 0.32291740148417675, + "learning_rate": 1.1430657406300993e-05, + "loss": 0.9947, + "step": 17852 + }, + { + "epoch": 1.71, + "grad_norm": 0.3128773353489315, + "learning_rate": 1.1423313990747376e-05, + "loss": 1.033, + "step": 17853 + }, + { + "epoch": 1.71, + "grad_norm": 0.3369459315558084, + "learning_rate": 1.1415972791880969e-05, + "loss": 1.1686, + "step": 17854 + }, + { + "epoch": 1.71, + "grad_norm": 0.30949432193306176, + "learning_rate": 1.1408633809885528e-05, + "loss": 0.941, + "step": 17855 + }, + { + "epoch": 1.71, + "grad_norm": 0.32406422793295914, + "learning_rate": 1.1401297044944714e-05, + "loss": 0.9537, + "step": 17856 + }, + { + "epoch": 1.71, + "grad_norm": 0.3070958633893159, + "learning_rate": 1.1393962497242105e-05, + "loss": 1.0496, + "step": 17857 + }, + { + "epoch": 1.71, + "grad_norm": 0.3010996080101056, + "learning_rate": 1.1386630166961276e-05, + "loss": 1.0314, + "step": 17858 + }, + { + "epoch": 1.71, + "grad_norm": 0.31519981118205725, + "learning_rate": 1.1379300054285713e-05, + "loss": 0.9579, + "step": 17859 + }, + { + "epoch": 1.71, + "grad_norm": 0.3043835048645108, + "learning_rate": 1.137197215939889e-05, + "loss": 0.9713, + "step": 17860 + }, + { + "epoch": 1.71, + "grad_norm": 0.3510320475505025, + "learning_rate": 1.1364646482484143e-05, + "loss": 0.9816, + "step": 17861 + }, + { + "epoch": 1.71, + "grad_norm": 0.27118562985766553, + "learning_rate": 1.135732302372483e-05, + "loss": 1.0233, + "step": 17862 + }, + { + "epoch": 1.71, + "grad_norm": 0.34793876608317564, + "learning_rate": 1.135000178330421e-05, + "loss": 0.9106, + "step": 17863 + }, + { + "epoch": 1.71, + "grad_norm": 0.2987692844549672, + "learning_rate": 1.1342682761405533e-05, + "loss": 1.019, + "step": 17864 + }, + { + "epoch": 1.71, + "grad_norm": 0.31995760335977974, + "learning_rate": 1.1335365958211886e-05, + "loss": 1.0227, + "step": 17865 + }, + { + "epoch": 1.71, + "grad_norm": 0.338823378788628, + "learning_rate": 1.1328051373906479e-05, + "loss": 1.0576, + "step": 17866 + }, + { + "epoch": 1.71, + "grad_norm": 0.28843377919034774, + "learning_rate": 1.13207390086723e-05, + "loss": 1.1394, + "step": 17867 + }, + { + "epoch": 1.71, + "grad_norm": 0.30448169485062, + "learning_rate": 1.1313428862692366e-05, + "loss": 0.9404, + "step": 17868 + }, + { + "epoch": 1.71, + "grad_norm": 0.26936714278265045, + "learning_rate": 1.13061209361496e-05, + "loss": 0.9095, + "step": 17869 + }, + { + "epoch": 1.71, + "grad_norm": 0.32300657872719224, + "learning_rate": 1.1298815229226923e-05, + "loss": 1.0432, + "step": 17870 + }, + { + "epoch": 1.71, + "grad_norm": 0.26492802338657284, + "learning_rate": 1.1291511742107097e-05, + "loss": 1.046, + "step": 17871 + }, + { + "epoch": 1.71, + "grad_norm": 0.3237103524907265, + "learning_rate": 1.1284210474972978e-05, + "loss": 1.1097, + "step": 17872 + }, + { + "epoch": 1.71, + "grad_norm": 0.2615821024345081, + "learning_rate": 1.1276911428007231e-05, + "loss": 1.0068, + "step": 17873 + }, + { + "epoch": 1.71, + "grad_norm": 0.347371039572508, + "learning_rate": 1.1269614601392554e-05, + "loss": 1.0213, + "step": 17874 + }, + { + "epoch": 1.71, + "grad_norm": 0.3251901464489176, + "learning_rate": 1.126231999531152e-05, + "loss": 1.092, + "step": 17875 + }, + { + "epoch": 1.71, + "grad_norm": 0.3208181522434588, + "learning_rate": 1.125502760994669e-05, + "loss": 1.0754, + "step": 17876 + }, + { + "epoch": 1.71, + "grad_norm": 0.33004872890196474, + "learning_rate": 1.1247737445480599e-05, + "loss": 1.0829, + "step": 17877 + }, + { + "epoch": 1.71, + "grad_norm": 0.3004285316063065, + "learning_rate": 1.124044950209563e-05, + "loss": 1.0531, + "step": 17878 + }, + { + "epoch": 1.71, + "grad_norm": 0.25935207784162756, + "learning_rate": 1.1233163779974199e-05, + "loss": 1.0002, + "step": 17879 + }, + { + "epoch": 1.71, + "grad_norm": 0.2895769224522656, + "learning_rate": 1.1225880279298629e-05, + "loss": 1.0102, + "step": 17880 + }, + { + "epoch": 1.71, + "grad_norm": 0.32997348267978655, + "learning_rate": 1.1218599000251218e-05, + "loss": 1.0741, + "step": 17881 + }, + { + "epoch": 1.71, + "grad_norm": 0.28135659483358283, + "learning_rate": 1.1211319943014143e-05, + "loss": 0.9685, + "step": 17882 + }, + { + "epoch": 1.71, + "grad_norm": 0.3289302126999154, + "learning_rate": 1.1204043107769612e-05, + "loss": 1.1054, + "step": 17883 + }, + { + "epoch": 1.71, + "grad_norm": 0.30144490081965436, + "learning_rate": 1.1196768494699683e-05, + "loss": 0.9979, + "step": 17884 + }, + { + "epoch": 1.71, + "grad_norm": 0.263419930608055, + "learning_rate": 1.1189496103986441e-05, + "loss": 1.0109, + "step": 17885 + }, + { + "epoch": 1.71, + "grad_norm": 0.3310040067016559, + "learning_rate": 1.1182225935811874e-05, + "loss": 0.934, + "step": 17886 + }, + { + "epoch": 1.71, + "grad_norm": 0.31471504841948966, + "learning_rate": 1.1174957990357937e-05, + "loss": 1.0187, + "step": 17887 + }, + { + "epoch": 1.71, + "grad_norm": 0.2856450211512566, + "learning_rate": 1.1167692267806484e-05, + "loss": 0.9622, + "step": 17888 + }, + { + "epoch": 1.71, + "grad_norm": 0.3053652536836929, + "learning_rate": 1.116042876833938e-05, + "loss": 1.0434, + "step": 17889 + }, + { + "epoch": 1.71, + "grad_norm": 0.2792049327645188, + "learning_rate": 1.1153167492138362e-05, + "loss": 1.1056, + "step": 17890 + }, + { + "epoch": 1.71, + "grad_norm": 0.29567506231925206, + "learning_rate": 1.1145908439385178e-05, + "loss": 0.9419, + "step": 17891 + }, + { + "epoch": 1.71, + "grad_norm": 0.30694854927402426, + "learning_rate": 1.1138651610261441e-05, + "loss": 1.0373, + "step": 17892 + }, + { + "epoch": 1.71, + "grad_norm": 0.3321095869186875, + "learning_rate": 1.113139700494884e-05, + "loss": 1.0061, + "step": 17893 + }, + { + "epoch": 1.71, + "grad_norm": 0.3211131109661633, + "learning_rate": 1.1124144623628852e-05, + "loss": 1.0579, + "step": 17894 + }, + { + "epoch": 1.71, + "grad_norm": 0.2739255453170774, + "learning_rate": 1.111689446648303e-05, + "loss": 0.9665, + "step": 17895 + }, + { + "epoch": 1.71, + "grad_norm": 0.3031945964713595, + "learning_rate": 1.1109646533692763e-05, + "loss": 1.0329, + "step": 17896 + }, + { + "epoch": 1.71, + "grad_norm": 0.37617208714118555, + "learning_rate": 1.1102400825439452e-05, + "loss": 0.9937, + "step": 17897 + }, + { + "epoch": 1.71, + "grad_norm": 0.31635155623323186, + "learning_rate": 1.109515734190446e-05, + "loss": 1.0176, + "step": 17898 + }, + { + "epoch": 1.71, + "grad_norm": 0.2812728010800794, + "learning_rate": 1.1087916083269002e-05, + "loss": 0.9928, + "step": 17899 + }, + { + "epoch": 1.71, + "grad_norm": 0.34086315164947606, + "learning_rate": 1.1080677049714328e-05, + "loss": 1.0475, + "step": 17900 + }, + { + "epoch": 1.71, + "grad_norm": 0.3065006511060092, + "learning_rate": 1.1073440241421584e-05, + "loss": 0.9802, + "step": 17901 + }, + { + "epoch": 1.71, + "grad_norm": 0.26660664254497907, + "learning_rate": 1.1066205658571916e-05, + "loss": 0.8683, + "step": 17902 + }, + { + "epoch": 1.71, + "grad_norm": 0.32099338310925163, + "learning_rate": 1.105897330134632e-05, + "loss": 1.0885, + "step": 17903 + }, + { + "epoch": 1.71, + "grad_norm": 0.29545381374536744, + "learning_rate": 1.1051743169925843e-05, + "loss": 1.0301, + "step": 17904 + }, + { + "epoch": 1.71, + "grad_norm": 0.2532673562800316, + "learning_rate": 1.104451526449135e-05, + "loss": 0.9701, + "step": 17905 + }, + { + "epoch": 1.71, + "grad_norm": 0.321864164369757, + "learning_rate": 1.1037289585223808e-05, + "loss": 1.1613, + "step": 17906 + }, + { + "epoch": 1.71, + "grad_norm": 0.31872213289250395, + "learning_rate": 1.1030066132303985e-05, + "loss": 0.9441, + "step": 17907 + }, + { + "epoch": 1.71, + "grad_norm": 0.3240805016928337, + "learning_rate": 1.1022844905912688e-05, + "loss": 0.9813, + "step": 17908 + }, + { + "epoch": 1.71, + "grad_norm": 0.2720075675161121, + "learning_rate": 1.1015625906230598e-05, + "loss": 1.0367, + "step": 17909 + }, + { + "epoch": 1.71, + "grad_norm": 0.27504727347357266, + "learning_rate": 1.1008409133438424e-05, + "loss": 0.8572, + "step": 17910 + }, + { + "epoch": 1.71, + "grad_norm": 0.3321683731128891, + "learning_rate": 1.1001194587716712e-05, + "loss": 1.0533, + "step": 17911 + }, + { + "epoch": 1.71, + "grad_norm": 0.33508161857863966, + "learning_rate": 1.0993982269246039e-05, + "loss": 1.1205, + "step": 17912 + }, + { + "epoch": 1.71, + "grad_norm": 0.28887242901530263, + "learning_rate": 1.0986772178206906e-05, + "loss": 0.8357, + "step": 17913 + }, + { + "epoch": 1.71, + "grad_norm": 0.31420376891107915, + "learning_rate": 1.0979564314779756e-05, + "loss": 1.0105, + "step": 17914 + }, + { + "epoch": 1.71, + "grad_norm": 0.32061474418336205, + "learning_rate": 1.0972358679144934e-05, + "loss": 1.0756, + "step": 17915 + }, + { + "epoch": 1.71, + "grad_norm": 0.3110475185191958, + "learning_rate": 1.0965155271482808e-05, + "loss": 1.0948, + "step": 17916 + }, + { + "epoch": 1.71, + "grad_norm": 0.30815465991243135, + "learning_rate": 1.095795409197361e-05, + "loss": 0.923, + "step": 17917 + }, + { + "epoch": 1.71, + "grad_norm": 0.2811887732869571, + "learning_rate": 1.0950755140797574e-05, + "loss": 1.1026, + "step": 17918 + }, + { + "epoch": 1.71, + "grad_norm": 0.3322712268173492, + "learning_rate": 1.0943558418134858e-05, + "loss": 1.0511, + "step": 17919 + }, + { + "epoch": 1.71, + "grad_norm": 0.352795955292615, + "learning_rate": 1.0936363924165571e-05, + "loss": 1.0424, + "step": 17920 + }, + { + "epoch": 1.71, + "grad_norm": 0.32744841151017784, + "learning_rate": 1.0929171659069737e-05, + "loss": 0.9818, + "step": 17921 + }, + { + "epoch": 1.71, + "grad_norm": 0.2553202927361924, + "learning_rate": 1.0921981623027366e-05, + "loss": 0.9689, + "step": 17922 + }, + { + "epoch": 1.71, + "grad_norm": 0.3200540933278744, + "learning_rate": 1.0914793816218405e-05, + "loss": 1.0709, + "step": 17923 + }, + { + "epoch": 1.71, + "grad_norm": 0.3151080990701299, + "learning_rate": 1.0907608238822697e-05, + "loss": 1.0726, + "step": 17924 + }, + { + "epoch": 1.71, + "grad_norm": 0.3292088336963438, + "learning_rate": 1.090042489102011e-05, + "loss": 0.9372, + "step": 17925 + }, + { + "epoch": 1.71, + "grad_norm": 0.2931636806680276, + "learning_rate": 1.0893243772990337e-05, + "loss": 1.0015, + "step": 17926 + }, + { + "epoch": 1.72, + "grad_norm": 0.30157440581503553, + "learning_rate": 1.0886064884913183e-05, + "loss": 1.0238, + "step": 17927 + }, + { + "epoch": 1.72, + "grad_norm": 0.3500089800975664, + "learning_rate": 1.0878888226968242e-05, + "loss": 1.0705, + "step": 17928 + }, + { + "epoch": 1.72, + "grad_norm": 0.288211970071399, + "learning_rate": 1.0871713799335158e-05, + "loss": 0.9983, + "step": 17929 + }, + { + "epoch": 1.72, + "grad_norm": 0.31252177245200946, + "learning_rate": 1.0864541602193435e-05, + "loss": 0.935, + "step": 17930 + }, + { + "epoch": 1.72, + "grad_norm": 0.3052833909966686, + "learning_rate": 1.085737163572259e-05, + "loss": 0.9649, + "step": 17931 + }, + { + "epoch": 1.72, + "grad_norm": 0.320879142608633, + "learning_rate": 1.0850203900101996e-05, + "loss": 0.9081, + "step": 17932 + }, + { + "epoch": 1.72, + "grad_norm": 0.30654837324632983, + "learning_rate": 1.084303839551113e-05, + "loss": 1.0582, + "step": 17933 + }, + { + "epoch": 1.72, + "grad_norm": 0.26473062010234016, + "learning_rate": 1.0835875122129236e-05, + "loss": 0.9899, + "step": 17934 + }, + { + "epoch": 1.72, + "grad_norm": 0.29946146917472266, + "learning_rate": 1.0828714080135627e-05, + "loss": 1.0583, + "step": 17935 + }, + { + "epoch": 1.72, + "grad_norm": 0.30011411263643495, + "learning_rate": 1.0821555269709472e-05, + "loss": 1.0131, + "step": 17936 + }, + { + "epoch": 1.72, + "grad_norm": 0.3469669829049744, + "learning_rate": 1.0814398691029948e-05, + "loss": 1.0667, + "step": 17937 + }, + { + "epoch": 1.72, + "grad_norm": 0.32867500545060935, + "learning_rate": 1.0807244344276158e-05, + "loss": 1.0745, + "step": 17938 + }, + { + "epoch": 1.72, + "grad_norm": 0.37597749310804285, + "learning_rate": 1.0800092229627112e-05, + "loss": 1.0088, + "step": 17939 + }, + { + "epoch": 1.72, + "grad_norm": 0.3312637395900194, + "learning_rate": 1.0792942347261826e-05, + "loss": 1.0132, + "step": 17940 + }, + { + "epoch": 1.72, + "grad_norm": 0.3141225055508908, + "learning_rate": 1.078579469735922e-05, + "loss": 1.0767, + "step": 17941 + }, + { + "epoch": 1.72, + "grad_norm": 0.3237562872332324, + "learning_rate": 1.0778649280098186e-05, + "loss": 1.1065, + "step": 17942 + }, + { + "epoch": 1.72, + "grad_norm": 0.34317342636153836, + "learning_rate": 1.0771506095657513e-05, + "loss": 0.9683, + "step": 17943 + }, + { + "epoch": 1.72, + "grad_norm": 0.3040932350250649, + "learning_rate": 1.0764365144215993e-05, + "loss": 0.9548, + "step": 17944 + }, + { + "epoch": 1.72, + "grad_norm": 0.34441402493743944, + "learning_rate": 1.0757226425952305e-05, + "loss": 1.0099, + "step": 17945 + }, + { + "epoch": 1.72, + "grad_norm": 0.2844825052596555, + "learning_rate": 1.0750089941045106e-05, + "loss": 0.9997, + "step": 17946 + }, + { + "epoch": 1.72, + "grad_norm": 0.32028579362775783, + "learning_rate": 1.0742955689672995e-05, + "loss": 1.056, + "step": 17947 + }, + { + "epoch": 1.72, + "grad_norm": 0.3536663136883441, + "learning_rate": 1.0735823672014544e-05, + "loss": 1.0769, + "step": 17948 + }, + { + "epoch": 1.72, + "grad_norm": 0.31597150009497593, + "learning_rate": 1.0728693888248177e-05, + "loss": 0.8562, + "step": 17949 + }, + { + "epoch": 1.72, + "grad_norm": 0.30440612874995576, + "learning_rate": 1.072156633855238e-05, + "loss": 0.868, + "step": 17950 + }, + { + "epoch": 1.72, + "grad_norm": 0.32345113941591497, + "learning_rate": 1.071444102310547e-05, + "loss": 1.0137, + "step": 17951 + }, + { + "epoch": 1.72, + "grad_norm": 0.30291506321515066, + "learning_rate": 1.0707317942085804e-05, + "loss": 0.8795, + "step": 17952 + }, + { + "epoch": 1.72, + "grad_norm": 0.335426672777134, + "learning_rate": 1.0700197095671582e-05, + "loss": 1.0474, + "step": 17953 + }, + { + "epoch": 1.72, + "grad_norm": 0.3523986530696545, + "learning_rate": 1.0693078484041097e-05, + "loss": 1.0132, + "step": 17954 + }, + { + "epoch": 1.72, + "grad_norm": 0.3329103346813557, + "learning_rate": 1.0685962107372427e-05, + "loss": 1.0225, + "step": 17955 + }, + { + "epoch": 1.72, + "grad_norm": 0.3413435792300596, + "learning_rate": 1.0678847965843708e-05, + "loss": 0.9842, + "step": 17956 + }, + { + "epoch": 1.72, + "grad_norm": 0.2974958148246348, + "learning_rate": 1.0671736059632942e-05, + "loss": 1.0679, + "step": 17957 + }, + { + "epoch": 1.72, + "grad_norm": 0.31178263688838465, + "learning_rate": 1.066462638891812e-05, + "loss": 1.0292, + "step": 17958 + }, + { + "epoch": 1.72, + "grad_norm": 0.31957484338300113, + "learning_rate": 1.0657518953877177e-05, + "loss": 1.078, + "step": 17959 + }, + { + "epoch": 1.72, + "grad_norm": 0.2917082082913536, + "learning_rate": 1.065041375468796e-05, + "loss": 1.0117, + "step": 17960 + }, + { + "epoch": 1.72, + "grad_norm": 0.30079752130509285, + "learning_rate": 1.0643310791528294e-05, + "loss": 1.0867, + "step": 17961 + }, + { + "epoch": 1.72, + "grad_norm": 0.3067036229584096, + "learning_rate": 1.0636210064575924e-05, + "loss": 1.0301, + "step": 17962 + }, + { + "epoch": 1.72, + "grad_norm": 0.33538823520310257, + "learning_rate": 1.0629111574008588e-05, + "loss": 1.1562, + "step": 17963 + }, + { + "epoch": 1.72, + "grad_norm": 0.28997935179985895, + "learning_rate": 1.0622015320003886e-05, + "loss": 1.0328, + "step": 17964 + }, + { + "epoch": 1.72, + "grad_norm": 0.3130303437042929, + "learning_rate": 1.0614921302739433e-05, + "loss": 1.0248, + "step": 17965 + }, + { + "epoch": 1.72, + "grad_norm": 0.31461095195809036, + "learning_rate": 1.060782952239272e-05, + "loss": 1.0755, + "step": 17966 + }, + { + "epoch": 1.72, + "grad_norm": 0.3177497981348636, + "learning_rate": 1.0600739979141295e-05, + "loss": 1.0104, + "step": 17967 + }, + { + "epoch": 1.72, + "grad_norm": 0.31487885930941484, + "learning_rate": 1.0593652673162513e-05, + "loss": 1.0221, + "step": 17968 + }, + { + "epoch": 1.72, + "grad_norm": 0.33355371006568707, + "learning_rate": 1.0586567604633779e-05, + "loss": 1.1265, + "step": 17969 + }, + { + "epoch": 1.72, + "grad_norm": 0.3167761892411543, + "learning_rate": 1.0579484773732361e-05, + "loss": 1.0263, + "step": 17970 + }, + { + "epoch": 1.72, + "grad_norm": 0.2863519860036368, + "learning_rate": 1.0572404180635565e-05, + "loss": 1.0235, + "step": 17971 + }, + { + "epoch": 1.72, + "grad_norm": 0.3115928925519741, + "learning_rate": 1.0565325825520533e-05, + "loss": 0.9034, + "step": 17972 + }, + { + "epoch": 1.72, + "grad_norm": 0.327270500103882, + "learning_rate": 1.0558249708564438e-05, + "loss": 0.9384, + "step": 17973 + }, + { + "epoch": 1.72, + "grad_norm": 0.30020115862159935, + "learning_rate": 1.0551175829944349e-05, + "loss": 1.1476, + "step": 17974 + }, + { + "epoch": 1.72, + "grad_norm": 0.3259377833425876, + "learning_rate": 1.0544104189837323e-05, + "loss": 0.9725, + "step": 17975 + }, + { + "epoch": 1.72, + "grad_norm": 0.35238317967011656, + "learning_rate": 1.0537034788420286e-05, + "loss": 1.0055, + "step": 17976 + }, + { + "epoch": 1.72, + "grad_norm": 0.351130704160892, + "learning_rate": 1.0529967625870208e-05, + "loss": 1.085, + "step": 17977 + }, + { + "epoch": 1.72, + "grad_norm": 0.3526458597465198, + "learning_rate": 1.052290270236389e-05, + "loss": 1.1321, + "step": 17978 + }, + { + "epoch": 1.72, + "grad_norm": 0.30003716439757383, + "learning_rate": 1.0515840018078171e-05, + "loss": 0.9752, + "step": 17979 + }, + { + "epoch": 1.72, + "grad_norm": 0.3117578447513087, + "learning_rate": 1.0508779573189797e-05, + "loss": 1.0505, + "step": 17980 + }, + { + "epoch": 1.72, + "grad_norm": 0.3761205101513627, + "learning_rate": 1.0501721367875473e-05, + "loss": 1.0683, + "step": 17981 + }, + { + "epoch": 1.72, + "grad_norm": 0.3279959424252034, + "learning_rate": 1.0494665402311799e-05, + "loss": 1.0013, + "step": 17982 + }, + { + "epoch": 1.72, + "grad_norm": 0.2819644169337336, + "learning_rate": 1.048761167667538e-05, + "loss": 0.9499, + "step": 17983 + }, + { + "epoch": 1.72, + "grad_norm": 0.3281818807337427, + "learning_rate": 1.0480560191142752e-05, + "loss": 0.9365, + "step": 17984 + }, + { + "epoch": 1.72, + "grad_norm": 0.3161799006555158, + "learning_rate": 1.0473510945890342e-05, + "loss": 1.0455, + "step": 17985 + }, + { + "epoch": 1.72, + "grad_norm": 0.3269674537955631, + "learning_rate": 1.0466463941094607e-05, + "loss": 0.962, + "step": 17986 + }, + { + "epoch": 1.72, + "grad_norm": 0.2832861199365769, + "learning_rate": 1.0459419176931851e-05, + "loss": 1.003, + "step": 17987 + }, + { + "epoch": 1.72, + "grad_norm": 0.31752741018430575, + "learning_rate": 1.0452376653578433e-05, + "loss": 0.9221, + "step": 17988 + }, + { + "epoch": 1.72, + "grad_norm": 0.2752892002968324, + "learning_rate": 1.0445336371210556e-05, + "loss": 0.9986, + "step": 17989 + }, + { + "epoch": 1.72, + "grad_norm": 0.2858779030504229, + "learning_rate": 1.0438298330004437e-05, + "loss": 0.9205, + "step": 17990 + }, + { + "epoch": 1.72, + "grad_norm": 0.35265520444999, + "learning_rate": 1.0431262530136165e-05, + "loss": 1.0282, + "step": 17991 + }, + { + "epoch": 1.72, + "grad_norm": 0.33468447843305604, + "learning_rate": 1.0424228971781868e-05, + "loss": 0.9913, + "step": 17992 + }, + { + "epoch": 1.72, + "grad_norm": 0.33298551364922546, + "learning_rate": 1.0417197655117483e-05, + "loss": 1.051, + "step": 17993 + }, + { + "epoch": 1.72, + "grad_norm": 0.3283755925027953, + "learning_rate": 1.0410168580319079e-05, + "loss": 1.061, + "step": 17994 + }, + { + "epoch": 1.72, + "grad_norm": 0.31101815733571325, + "learning_rate": 1.0403141747562495e-05, + "loss": 1.1112, + "step": 17995 + }, + { + "epoch": 1.72, + "grad_norm": 0.32161858727858755, + "learning_rate": 1.039611715702361e-05, + "loss": 1.0201, + "step": 17996 + }, + { + "epoch": 1.72, + "grad_norm": 0.3285422816965206, + "learning_rate": 1.0389094808878186e-05, + "loss": 1.0343, + "step": 17997 + }, + { + "epoch": 1.72, + "grad_norm": 0.3224665866068969, + "learning_rate": 1.0382074703302014e-05, + "loss": 1.0122, + "step": 17998 + }, + { + "epoch": 1.72, + "grad_norm": 0.2972737820525684, + "learning_rate": 1.037505684047072e-05, + "loss": 1.0371, + "step": 17999 + }, + { + "epoch": 1.72, + "grad_norm": 0.2735035441986728, + "learning_rate": 1.0368041220559955e-05, + "loss": 0.9794, + "step": 18000 + }, + { + "epoch": 1.72, + "grad_norm": 0.30309749578530404, + "learning_rate": 1.0361027843745286e-05, + "loss": 0.8518, + "step": 18001 + }, + { + "epoch": 1.72, + "grad_norm": 0.3413850959374048, + "learning_rate": 1.0354016710202252e-05, + "loss": 0.9767, + "step": 18002 + }, + { + "epoch": 1.72, + "grad_norm": 0.25823104069742686, + "learning_rate": 1.034700782010627e-05, + "loss": 1.0689, + "step": 18003 + }, + { + "epoch": 1.72, + "grad_norm": 0.36151749126463667, + "learning_rate": 1.0340001173632762e-05, + "loss": 0.9568, + "step": 18004 + }, + { + "epoch": 1.72, + "grad_norm": 0.3470180375690676, + "learning_rate": 1.0332996770957105e-05, + "loss": 0.9539, + "step": 18005 + }, + { + "epoch": 1.72, + "grad_norm": 0.3176851944095077, + "learning_rate": 1.032599461225452e-05, + "loss": 1.0976, + "step": 18006 + }, + { + "epoch": 1.72, + "grad_norm": 0.31632310528499097, + "learning_rate": 1.031899469770029e-05, + "loss": 1.0295, + "step": 18007 + }, + { + "epoch": 1.72, + "grad_norm": 0.31090503997357405, + "learning_rate": 1.0311997027469577e-05, + "loss": 1.05, + "step": 18008 + }, + { + "epoch": 1.72, + "grad_norm": 0.296074151078534, + "learning_rate": 1.030500160173753e-05, + "loss": 0.8644, + "step": 18009 + }, + { + "epoch": 1.72, + "grad_norm": 0.2830202109177459, + "learning_rate": 1.0298008420679161e-05, + "loss": 0.9786, + "step": 18010 + }, + { + "epoch": 1.72, + "grad_norm": 0.3034393818345933, + "learning_rate": 1.0291017484469524e-05, + "loss": 1.0709, + "step": 18011 + }, + { + "epoch": 1.72, + "grad_norm": 0.3055346976789533, + "learning_rate": 1.028402879328354e-05, + "loss": 0.9942, + "step": 18012 + }, + { + "epoch": 1.72, + "grad_norm": 0.28899512715772196, + "learning_rate": 1.027704234729614e-05, + "loss": 1.1175, + "step": 18013 + }, + { + "epoch": 1.72, + "grad_norm": 0.29992441917857304, + "learning_rate": 1.0270058146682105e-05, + "loss": 1.0024, + "step": 18014 + }, + { + "epoch": 1.72, + "grad_norm": 0.29353234796786154, + "learning_rate": 1.0263076191616294e-05, + "loss": 0.9666, + "step": 18015 + }, + { + "epoch": 1.72, + "grad_norm": 0.3243178502165033, + "learning_rate": 1.0256096482273391e-05, + "loss": 0.9681, + "step": 18016 + }, + { + "epoch": 1.72, + "grad_norm": 0.32359554357027376, + "learning_rate": 1.0249119018828079e-05, + "loss": 1.0113, + "step": 18017 + }, + { + "epoch": 1.72, + "grad_norm": 0.3034448142236216, + "learning_rate": 1.0242143801454962e-05, + "loss": 1.0417, + "step": 18018 + }, + { + "epoch": 1.72, + "grad_norm": 0.3087108906837879, + "learning_rate": 1.0235170830328622e-05, + "loss": 1.0654, + "step": 18019 + }, + { + "epoch": 1.72, + "grad_norm": 0.3345107693267886, + "learning_rate": 1.022820010562352e-05, + "loss": 0.9914, + "step": 18020 + }, + { + "epoch": 1.72, + "grad_norm": 0.33356118520112793, + "learning_rate": 1.0221231627514138e-05, + "loss": 1.0963, + "step": 18021 + }, + { + "epoch": 1.72, + "grad_norm": 0.4871797969241581, + "learning_rate": 1.021426539617486e-05, + "loss": 1.0323, + "step": 18022 + }, + { + "epoch": 1.72, + "grad_norm": 0.3118516173979496, + "learning_rate": 1.0207301411780023e-05, + "loss": 0.974, + "step": 18023 + }, + { + "epoch": 1.72, + "grad_norm": 0.31718579565792526, + "learning_rate": 1.0200339674503911e-05, + "loss": 0.9956, + "step": 18024 + }, + { + "epoch": 1.72, + "grad_norm": 0.35909732947164896, + "learning_rate": 1.0193380184520718e-05, + "loss": 0.965, + "step": 18025 + }, + { + "epoch": 1.72, + "grad_norm": 0.2935461232612471, + "learning_rate": 1.0186422942004648e-05, + "loss": 0.9452, + "step": 18026 + }, + { + "epoch": 1.72, + "grad_norm": 0.3194477236465143, + "learning_rate": 1.017946794712975e-05, + "loss": 1.0083, + "step": 18027 + }, + { + "epoch": 1.72, + "grad_norm": 0.2995186369451746, + "learning_rate": 1.0172515200070153e-05, + "loss": 1.1364, + "step": 18028 + }, + { + "epoch": 1.72, + "grad_norm": 0.3447145878827241, + "learning_rate": 1.0165564700999796e-05, + "loss": 0.9009, + "step": 18029 + }, + { + "epoch": 1.72, + "grad_norm": 0.35292266911814274, + "learning_rate": 1.0158616450092673e-05, + "loss": 1.1449, + "step": 18030 + }, + { + "epoch": 1.73, + "grad_norm": 0.3552660202063185, + "learning_rate": 1.015167044752261e-05, + "loss": 0.9544, + "step": 18031 + }, + { + "epoch": 1.73, + "grad_norm": 0.28476893513567697, + "learning_rate": 1.014472669346348e-05, + "loss": 0.9402, + "step": 18032 + }, + { + "epoch": 1.73, + "grad_norm": 0.32873064064174295, + "learning_rate": 1.0137785188089021e-05, + "loss": 1.016, + "step": 18033 + }, + { + "epoch": 1.73, + "grad_norm": 0.2868700411527962, + "learning_rate": 1.0130845931572952e-05, + "loss": 0.9706, + "step": 18034 + }, + { + "epoch": 1.73, + "grad_norm": 0.285660299218245, + "learning_rate": 1.0123908924088954e-05, + "loss": 1.0192, + "step": 18035 + }, + { + "epoch": 1.73, + "grad_norm": 0.34098367555014036, + "learning_rate": 1.0116974165810644e-05, + "loss": 1.0767, + "step": 18036 + }, + { + "epoch": 1.73, + "grad_norm": 0.2999396960740459, + "learning_rate": 1.0110041656911517e-05, + "loss": 0.9665, + "step": 18037 + }, + { + "epoch": 1.73, + "grad_norm": 0.3381429629722079, + "learning_rate": 1.0103111397565112e-05, + "loss": 1.0231, + "step": 18038 + }, + { + "epoch": 1.73, + "grad_norm": 0.31477126914898274, + "learning_rate": 1.0096183387944824e-05, + "loss": 1.0523, + "step": 18039 + }, + { + "epoch": 1.73, + "grad_norm": 0.28752518280802597, + "learning_rate": 1.0089257628224046e-05, + "loss": 1.0727, + "step": 18040 + }, + { + "epoch": 1.73, + "grad_norm": 0.33025663655030457, + "learning_rate": 1.0082334118576108e-05, + "loss": 1.0456, + "step": 18041 + }, + { + "epoch": 1.73, + "grad_norm": 0.2915258962549091, + "learning_rate": 1.0075412859174293e-05, + "loss": 1.1096, + "step": 18042 + }, + { + "epoch": 1.73, + "grad_norm": 0.35842381426545317, + "learning_rate": 1.0068493850191763e-05, + "loss": 0.9819, + "step": 18043 + }, + { + "epoch": 1.73, + "grad_norm": 0.3099169252387948, + "learning_rate": 1.0061577091801689e-05, + "loss": 0.9748, + "step": 18044 + }, + { + "epoch": 1.73, + "grad_norm": 0.3439436177771512, + "learning_rate": 1.0054662584177199e-05, + "loss": 1.0365, + "step": 18045 + }, + { + "epoch": 1.73, + "grad_norm": 0.345351010004228, + "learning_rate": 1.0047750327491278e-05, + "loss": 1.039, + "step": 18046 + }, + { + "epoch": 1.73, + "grad_norm": 0.2880985173331054, + "learning_rate": 1.0040840321916967e-05, + "loss": 1.0515, + "step": 18047 + }, + { + "epoch": 1.73, + "grad_norm": 0.3076721200252257, + "learning_rate": 1.0033932567627125e-05, + "loss": 1.0641, + "step": 18048 + }, + { + "epoch": 1.73, + "grad_norm": 0.26480622406149945, + "learning_rate": 1.0027027064794691e-05, + "loss": 0.8189, + "step": 18049 + }, + { + "epoch": 1.73, + "grad_norm": 0.2906720179584616, + "learning_rate": 1.0020123813592441e-05, + "loss": 1.0602, + "step": 18050 + }, + { + "epoch": 1.73, + "grad_norm": 0.31233537796686206, + "learning_rate": 1.0013222814193157e-05, + "loss": 1.0048, + "step": 18051 + }, + { + "epoch": 1.73, + "grad_norm": 0.29708153374136964, + "learning_rate": 1.0006324066769512e-05, + "loss": 0.991, + "step": 18052 + }, + { + "epoch": 1.73, + "grad_norm": 0.2536339138892414, + "learning_rate": 9.99942757149418e-06, + "loss": 0.9345, + "step": 18053 + }, + { + "epoch": 1.73, + "grad_norm": 0.321553848211941, + "learning_rate": 9.992533328539699e-06, + "loss": 1.0057, + "step": 18054 + }, + { + "epoch": 1.73, + "grad_norm": 0.2818679753673852, + "learning_rate": 9.985641338078678e-06, + "loss": 0.9435, + "step": 18055 + }, + { + "epoch": 1.73, + "grad_norm": 0.2970765522033754, + "learning_rate": 9.978751600283542e-06, + "loss": 1.0922, + "step": 18056 + }, + { + "epoch": 1.73, + "grad_norm": 0.3073245595143973, + "learning_rate": 9.971864115326734e-06, + "loss": 1.0501, + "step": 18057 + }, + { + "epoch": 1.73, + "grad_norm": 0.3248668755954426, + "learning_rate": 9.964978883380593e-06, + "loss": 1.0408, + "step": 18058 + }, + { + "epoch": 1.73, + "grad_norm": 0.37150823756937923, + "learning_rate": 9.958095904617459e-06, + "loss": 1.0805, + "step": 18059 + }, + { + "epoch": 1.73, + "grad_norm": 0.3187557767974174, + "learning_rate": 9.951215179209539e-06, + "loss": 0.9986, + "step": 18060 + }, + { + "epoch": 1.73, + "grad_norm": 0.36664387322113295, + "learning_rate": 9.944336707329061e-06, + "loss": 0.9861, + "step": 18061 + }, + { + "epoch": 1.73, + "grad_norm": 0.34327995174179016, + "learning_rate": 9.937460489148142e-06, + "loss": 0.9814, + "step": 18062 + }, + { + "epoch": 1.73, + "grad_norm": 0.3356054868867401, + "learning_rate": 9.930586524838903e-06, + "loss": 1.04, + "step": 18063 + }, + { + "epoch": 1.73, + "grad_norm": 0.35793671893112944, + "learning_rate": 9.923714814573326e-06, + "loss": 1.0203, + "step": 18064 + }, + { + "epoch": 1.73, + "grad_norm": 0.3207561447268859, + "learning_rate": 9.916845358523396e-06, + "loss": 1.0899, + "step": 18065 + }, + { + "epoch": 1.73, + "grad_norm": 0.33346910487979936, + "learning_rate": 9.909978156861044e-06, + "loss": 0.8673, + "step": 18066 + }, + { + "epoch": 1.73, + "grad_norm": 0.29546278148359995, + "learning_rate": 9.903113209758096e-06, + "loss": 0.9629, + "step": 18067 + }, + { + "epoch": 1.73, + "grad_norm": 0.3083197773065079, + "learning_rate": 9.896250517386352e-06, + "loss": 1.1456, + "step": 18068 + }, + { + "epoch": 1.73, + "grad_norm": 0.26482633175719095, + "learning_rate": 9.88939007991757e-06, + "loss": 0.7869, + "step": 18069 + }, + { + "epoch": 1.73, + "grad_norm": 0.3200469516349646, + "learning_rate": 9.882531897523462e-06, + "loss": 1.0904, + "step": 18070 + }, + { + "epoch": 1.73, + "grad_norm": 0.3952261111952405, + "learning_rate": 9.875675970375597e-06, + "loss": 0.984, + "step": 18071 + }, + { + "epoch": 1.73, + "grad_norm": 0.2875636980121753, + "learning_rate": 9.868822298645607e-06, + "loss": 1.022, + "step": 18072 + }, + { + "epoch": 1.73, + "grad_norm": 0.34444833375665246, + "learning_rate": 9.861970882504967e-06, + "loss": 1.0051, + "step": 18073 + }, + { + "epoch": 1.73, + "grad_norm": 0.3421215602816179, + "learning_rate": 9.855121722125172e-06, + "loss": 1.009, + "step": 18074 + }, + { + "epoch": 1.73, + "grad_norm": 0.30625501262574883, + "learning_rate": 9.848274817677571e-06, + "loss": 1.1054, + "step": 18075 + }, + { + "epoch": 1.73, + "grad_norm": 0.28881028295789335, + "learning_rate": 9.841430169333598e-06, + "loss": 1.0343, + "step": 18076 + }, + { + "epoch": 1.73, + "grad_norm": 0.352569460836126, + "learning_rate": 9.83458777726447e-06, + "loss": 1.1474, + "step": 18077 + }, + { + "epoch": 1.73, + "grad_norm": 0.29851355614303415, + "learning_rate": 9.827747641641472e-06, + "loss": 1.1584, + "step": 18078 + }, + { + "epoch": 1.73, + "grad_norm": 0.3134276021636491, + "learning_rate": 9.820909762635732e-06, + "loss": 0.938, + "step": 18079 + }, + { + "epoch": 1.73, + "grad_norm": 0.3360752398214166, + "learning_rate": 9.814074140418427e-06, + "loss": 1.0421, + "step": 18080 + }, + { + "epoch": 1.73, + "grad_norm": 0.2923479284210727, + "learning_rate": 9.807240775160587e-06, + "loss": 0.995, + "step": 18081 + }, + { + "epoch": 1.73, + "grad_norm": 0.3154007032539473, + "learning_rate": 9.800409667033216e-06, + "loss": 0.9557, + "step": 18082 + }, + { + "epoch": 1.73, + "grad_norm": 0.3366841874962632, + "learning_rate": 9.793580816207293e-06, + "loss": 1.0556, + "step": 18083 + }, + { + "epoch": 1.73, + "grad_norm": 0.3573244290837972, + "learning_rate": 9.786754222853711e-06, + "loss": 1.0323, + "step": 18084 + }, + { + "epoch": 1.73, + "grad_norm": 0.27993554101625634, + "learning_rate": 9.779929887143292e-06, + "loss": 1.0037, + "step": 18085 + }, + { + "epoch": 1.73, + "grad_norm": 0.29020889605406175, + "learning_rate": 9.773107809246818e-06, + "loss": 1.0752, + "step": 18086 + }, + { + "epoch": 1.73, + "grad_norm": 0.2815208072885254, + "learning_rate": 9.766287989335054e-06, + "loss": 1.0365, + "step": 18087 + }, + { + "epoch": 1.73, + "grad_norm": 0.31914738359919176, + "learning_rate": 9.759470427578622e-06, + "loss": 0.9808, + "step": 18088 + }, + { + "epoch": 1.73, + "grad_norm": 0.30088892669240913, + "learning_rate": 9.752655124148146e-06, + "loss": 1.0501, + "step": 18089 + }, + { + "epoch": 1.73, + "grad_norm": 0.30326126904945394, + "learning_rate": 9.745842079214207e-06, + "loss": 1.0104, + "step": 18090 + }, + { + "epoch": 1.73, + "grad_norm": 0.33647165566148246, + "learning_rate": 9.73903129294731e-06, + "loss": 0.9017, + "step": 18091 + }, + { + "epoch": 1.73, + "grad_norm": 0.3055394741919497, + "learning_rate": 9.732222765517862e-06, + "loss": 0.955, + "step": 18092 + }, + { + "epoch": 1.73, + "grad_norm": 0.3094157425537272, + "learning_rate": 9.725416497096285e-06, + "loss": 0.9734, + "step": 18093 + }, + { + "epoch": 1.73, + "grad_norm": 0.3033509951371394, + "learning_rate": 9.718612487852885e-06, + "loss": 1.1114, + "step": 18094 + }, + { + "epoch": 1.73, + "grad_norm": 0.31674471503774604, + "learning_rate": 9.711810737957938e-06, + "loss": 1.0352, + "step": 18095 + }, + { + "epoch": 1.73, + "grad_norm": 0.33426704312817945, + "learning_rate": 9.705011247581675e-06, + "loss": 0.9981, + "step": 18096 + }, + { + "epoch": 1.73, + "grad_norm": 0.2708946202146054, + "learning_rate": 9.69821401689427e-06, + "loss": 1.0172, + "step": 18097 + }, + { + "epoch": 1.73, + "grad_norm": 0.302652916685366, + "learning_rate": 9.691419046065797e-06, + "loss": 0.9672, + "step": 18098 + }, + { + "epoch": 1.73, + "grad_norm": 0.28081762672278526, + "learning_rate": 9.684626335266345e-06, + "loss": 0.9502, + "step": 18099 + }, + { + "epoch": 1.73, + "grad_norm": 0.3252241166476098, + "learning_rate": 9.677835884665842e-06, + "loss": 1.0591, + "step": 18100 + }, + { + "epoch": 1.73, + "grad_norm": 0.3429357946740841, + "learning_rate": 9.671047694434298e-06, + "loss": 1.0976, + "step": 18101 + }, + { + "epoch": 1.73, + "grad_norm": 0.2803507921463548, + "learning_rate": 9.664261764741511e-06, + "loss": 0.9866, + "step": 18102 + }, + { + "epoch": 1.73, + "grad_norm": 0.3364474924244306, + "learning_rate": 9.657478095757388e-06, + "loss": 1.0033, + "step": 18103 + }, + { + "epoch": 1.73, + "grad_norm": 0.35840710877940807, + "learning_rate": 9.650696687651628e-06, + "loss": 1.0463, + "step": 18104 + }, + { + "epoch": 1.73, + "grad_norm": 0.29140946233944426, + "learning_rate": 9.643917540593983e-06, + "loss": 0.9395, + "step": 18105 + }, + { + "epoch": 1.73, + "grad_norm": 0.291558550032567, + "learning_rate": 9.637140654754084e-06, + "loss": 0.9763, + "step": 18106 + }, + { + "epoch": 1.73, + "grad_norm": 0.31027264420226824, + "learning_rate": 9.630366030301518e-06, + "loss": 1.0774, + "step": 18107 + }, + { + "epoch": 1.73, + "grad_norm": 0.318041930719192, + "learning_rate": 9.623593667405861e-06, + "loss": 1.0751, + "step": 18108 + }, + { + "epoch": 1.73, + "grad_norm": 0.3453345690603281, + "learning_rate": 9.616823566236555e-06, + "loss": 0.9942, + "step": 18109 + }, + { + "epoch": 1.73, + "grad_norm": 0.2965070838612417, + "learning_rate": 9.61005572696303e-06, + "loss": 0.9748, + "step": 18110 + }, + { + "epoch": 1.73, + "grad_norm": 0.298262397508659, + "learning_rate": 9.603290149754684e-06, + "loss": 1.0605, + "step": 18111 + }, + { + "epoch": 1.73, + "grad_norm": 0.2814991657811611, + "learning_rate": 9.596526834780827e-06, + "loss": 1.0442, + "step": 18112 + }, + { + "epoch": 1.73, + "grad_norm": 0.3575544183686376, + "learning_rate": 9.58976578221068e-06, + "loss": 1.0436, + "step": 18113 + }, + { + "epoch": 1.73, + "grad_norm": 0.3351873115910221, + "learning_rate": 9.583006992213494e-06, + "loss": 1.0605, + "step": 18114 + }, + { + "epoch": 1.73, + "grad_norm": 0.3169114667413366, + "learning_rate": 9.576250464958336e-06, + "loss": 0.9945, + "step": 18115 + }, + { + "epoch": 1.73, + "grad_norm": 0.3355619310184321, + "learning_rate": 9.569496200614381e-06, + "loss": 0.9694, + "step": 18116 + }, + { + "epoch": 1.73, + "grad_norm": 0.29433584128136675, + "learning_rate": 9.562744199350593e-06, + "loss": 0.9894, + "step": 18117 + }, + { + "epoch": 1.73, + "grad_norm": 0.3067401058871665, + "learning_rate": 9.555994461335993e-06, + "loss": 0.9846, + "step": 18118 + }, + { + "epoch": 1.73, + "grad_norm": 0.3307989639691603, + "learning_rate": 9.549246986739457e-06, + "loss": 1.001, + "step": 18119 + }, + { + "epoch": 1.73, + "grad_norm": 0.3219025182222384, + "learning_rate": 9.542501775729873e-06, + "loss": 0.9662, + "step": 18120 + }, + { + "epoch": 1.73, + "grad_norm": 0.33309890104237666, + "learning_rate": 9.535758828476015e-06, + "loss": 1.028, + "step": 18121 + }, + { + "epoch": 1.73, + "grad_norm": 0.3453925915634871, + "learning_rate": 9.529018145146652e-06, + "loss": 1.0532, + "step": 18122 + }, + { + "epoch": 1.73, + "grad_norm": 0.33241760263826975, + "learning_rate": 9.522279725910466e-06, + "loss": 0.9847, + "step": 18123 + }, + { + "epoch": 1.73, + "grad_norm": 0.3046377173479671, + "learning_rate": 9.515543570936114e-06, + "loss": 0.9351, + "step": 18124 + }, + { + "epoch": 1.73, + "grad_norm": 0.28096459605150126, + "learning_rate": 9.508809680392128e-06, + "loss": 0.9694, + "step": 18125 + }, + { + "epoch": 1.73, + "grad_norm": 0.3262091955180324, + "learning_rate": 9.502078054447061e-06, + "loss": 0.9604, + "step": 18126 + }, + { + "epoch": 1.73, + "grad_norm": 0.32338932670928694, + "learning_rate": 9.49534869326938e-06, + "loss": 0.9242, + "step": 18127 + }, + { + "epoch": 1.73, + "grad_norm": 0.33007026829943753, + "learning_rate": 9.488621597027458e-06, + "loss": 0.985, + "step": 18128 + }, + { + "epoch": 1.73, + "grad_norm": 0.28815457061231603, + "learning_rate": 9.481896765889665e-06, + "loss": 1.1948, + "step": 18129 + }, + { + "epoch": 1.73, + "grad_norm": 0.3012702183729818, + "learning_rate": 9.475174200024284e-06, + "loss": 1.0349, + "step": 18130 + }, + { + "epoch": 1.73, + "grad_norm": 0.3122146844044841, + "learning_rate": 9.468453899599594e-06, + "loss": 1.0655, + "step": 18131 + }, + { + "epoch": 1.73, + "grad_norm": 0.30828847253421227, + "learning_rate": 9.461735864783716e-06, + "loss": 0.9734, + "step": 18132 + }, + { + "epoch": 1.73, + "grad_norm": 0.32110392245721375, + "learning_rate": 9.455020095744826e-06, + "loss": 1.0182, + "step": 18133 + }, + { + "epoch": 1.73, + "grad_norm": 0.314257395218865, + "learning_rate": 9.448306592650936e-06, + "loss": 1.0319, + "step": 18134 + }, + { + "epoch": 1.73, + "grad_norm": 0.2959673090566554, + "learning_rate": 9.441595355670108e-06, + "loss": 1.0, + "step": 18135 + }, + { + "epoch": 1.74, + "grad_norm": 0.2677136096175064, + "learning_rate": 9.434886384970232e-06, + "loss": 0.9271, + "step": 18136 + }, + { + "epoch": 1.74, + "grad_norm": 0.3152552786949547, + "learning_rate": 9.428179680719273e-06, + "loss": 1.0677, + "step": 18137 + }, + { + "epoch": 1.74, + "grad_norm": 0.3049045299432868, + "learning_rate": 9.421475243085032e-06, + "loss": 1.0103, + "step": 18138 + }, + { + "epoch": 1.74, + "grad_norm": 0.3445245187181725, + "learning_rate": 9.414773072235306e-06, + "loss": 0.9337, + "step": 18139 + }, + { + "epoch": 1.74, + "grad_norm": 0.32465686106182795, + "learning_rate": 9.408073168337805e-06, + "loss": 0.9439, + "step": 18140 + }, + { + "epoch": 1.74, + "grad_norm": 0.34497856852837516, + "learning_rate": 9.40137553156022e-06, + "loss": 1.0439, + "step": 18141 + }, + { + "epoch": 1.74, + "grad_norm": 0.3117849120558212, + "learning_rate": 9.394680162070136e-06, + "loss": 0.9901, + "step": 18142 + }, + { + "epoch": 1.74, + "grad_norm": 0.2962548380882118, + "learning_rate": 9.38798706003513e-06, + "loss": 0.988, + "step": 18143 + }, + { + "epoch": 1.74, + "grad_norm": 0.32706042184995904, + "learning_rate": 9.381296225622682e-06, + "loss": 1.0235, + "step": 18144 + }, + { + "epoch": 1.74, + "grad_norm": 0.3452061731669315, + "learning_rate": 9.374607659000279e-06, + "loss": 1.1061, + "step": 18145 + }, + { + "epoch": 1.74, + "grad_norm": 0.3261576820605798, + "learning_rate": 9.367921360335252e-06, + "loss": 0.8716, + "step": 18146 + }, + { + "epoch": 1.74, + "grad_norm": 0.315057522566934, + "learning_rate": 9.361237329794947e-06, + "loss": 0.9933, + "step": 18147 + }, + { + "epoch": 1.74, + "grad_norm": 0.305535383330379, + "learning_rate": 9.354555567546675e-06, + "loss": 1.0278, + "step": 18148 + }, + { + "epoch": 1.74, + "grad_norm": 0.3099769518424617, + "learning_rate": 9.34787607375759e-06, + "loss": 1.1005, + "step": 18149 + }, + { + "epoch": 1.74, + "grad_norm": 0.2960263596048134, + "learning_rate": 9.341198848594879e-06, + "loss": 1.134, + "step": 18150 + }, + { + "epoch": 1.74, + "grad_norm": 0.35381394422996976, + "learning_rate": 9.334523892225645e-06, + "loss": 1.1158, + "step": 18151 + }, + { + "epoch": 1.74, + "grad_norm": 0.34157860798655854, + "learning_rate": 9.327851204816951e-06, + "loss": 0.986, + "step": 18152 + }, + { + "epoch": 1.74, + "grad_norm": 0.35369838964744404, + "learning_rate": 9.321180786535743e-06, + "loss": 0.9742, + "step": 18153 + }, + { + "epoch": 1.74, + "grad_norm": 0.2645366221450154, + "learning_rate": 9.314512637549e-06, + "loss": 1.0602, + "step": 18154 + }, + { + "epoch": 1.74, + "grad_norm": 0.3632326297765401, + "learning_rate": 9.307846758023552e-06, + "loss": 1.0411, + "step": 18155 + }, + { + "epoch": 1.74, + "grad_norm": 0.3148166999326681, + "learning_rate": 9.301183148126235e-06, + "loss": 0.8678, + "step": 18156 + }, + { + "epoch": 1.74, + "grad_norm": 0.3246201227336874, + "learning_rate": 9.294521808023815e-06, + "loss": 1.064, + "step": 18157 + }, + { + "epoch": 1.74, + "grad_norm": 0.29055903704153574, + "learning_rate": 9.287862737883002e-06, + "loss": 0.9027, + "step": 18158 + }, + { + "epoch": 1.74, + "grad_norm": 0.3061773270738819, + "learning_rate": 9.28120593787043e-06, + "loss": 0.9583, + "step": 18159 + }, + { + "epoch": 1.74, + "grad_norm": 0.31026646316320866, + "learning_rate": 9.274551408152709e-06, + "loss": 1.1007, + "step": 18160 + }, + { + "epoch": 1.74, + "grad_norm": 0.3131370664608547, + "learning_rate": 9.26789914889633e-06, + "loss": 0.8149, + "step": 18161 + }, + { + "epoch": 1.74, + "grad_norm": 0.322674608182508, + "learning_rate": 9.261249160267827e-06, + "loss": 1.1251, + "step": 18162 + }, + { + "epoch": 1.74, + "grad_norm": 0.3040045201227944, + "learning_rate": 9.254601442433542e-06, + "loss": 1.1021, + "step": 18163 + }, + { + "epoch": 1.74, + "grad_norm": 0.3253612831867738, + "learning_rate": 9.247955995559931e-06, + "loss": 0.8693, + "step": 18164 + }, + { + "epoch": 1.74, + "grad_norm": 0.3043819838896509, + "learning_rate": 9.241312819813242e-06, + "loss": 0.9602, + "step": 18165 + }, + { + "epoch": 1.74, + "grad_norm": 0.2649095752536167, + "learning_rate": 9.23467191535976e-06, + "loss": 1.005, + "step": 18166 + }, + { + "epoch": 1.74, + "grad_norm": 0.31230304097892436, + "learning_rate": 9.228033282365633e-06, + "loss": 1.1585, + "step": 18167 + }, + { + "epoch": 1.74, + "grad_norm": 0.3254025882095238, + "learning_rate": 9.221396920997016e-06, + "loss": 0.909, + "step": 18168 + }, + { + "epoch": 1.74, + "grad_norm": 0.31122897277369466, + "learning_rate": 9.21476283142002e-06, + "loss": 1.0888, + "step": 18169 + }, + { + "epoch": 1.74, + "grad_norm": 0.27928736728137576, + "learning_rate": 9.208131013800614e-06, + "loss": 0.878, + "step": 18170 + }, + { + "epoch": 1.74, + "grad_norm": 0.3125299446452924, + "learning_rate": 9.201501468304797e-06, + "loss": 1.1575, + "step": 18171 + }, + { + "epoch": 1.74, + "grad_norm": 0.3441022079260827, + "learning_rate": 9.194874195098469e-06, + "loss": 0.9535, + "step": 18172 + }, + { + "epoch": 1.74, + "grad_norm": 0.3174065245060874, + "learning_rate": 9.188249194347499e-06, + "loss": 0.9888, + "step": 18173 + }, + { + "epoch": 1.74, + "grad_norm": 0.28017699152619185, + "learning_rate": 9.181626466217652e-06, + "loss": 1.0376, + "step": 18174 + }, + { + "epoch": 1.74, + "grad_norm": 0.302967516028628, + "learning_rate": 9.175006010874698e-06, + "loss": 1.0568, + "step": 18175 + }, + { + "epoch": 1.74, + "grad_norm": 0.32724323345037093, + "learning_rate": 9.16838782848426e-06, + "loss": 1.004, + "step": 18176 + }, + { + "epoch": 1.74, + "grad_norm": 0.33846927313249636, + "learning_rate": 9.161771919212036e-06, + "loss": 0.9559, + "step": 18177 + }, + { + "epoch": 1.74, + "grad_norm": 0.2827942140978951, + "learning_rate": 9.155158283223552e-06, + "loss": 0.9638, + "step": 18178 + }, + { + "epoch": 1.74, + "grad_norm": 0.33333904138847487, + "learning_rate": 9.14854692068433e-06, + "loss": 1.0724, + "step": 18179 + }, + { + "epoch": 1.74, + "grad_norm": 0.30793718776474194, + "learning_rate": 9.141937831759818e-06, + "loss": 1.118, + "step": 18180 + }, + { + "epoch": 1.74, + "grad_norm": 0.32756205146052075, + "learning_rate": 9.135331016615422e-06, + "loss": 1.0585, + "step": 18181 + }, + { + "epoch": 1.74, + "grad_norm": 0.32478866090648467, + "learning_rate": 9.12872647541645e-06, + "loss": 1.0855, + "step": 18182 + }, + { + "epoch": 1.74, + "grad_norm": 0.3126934248510032, + "learning_rate": 9.12212420832822e-06, + "loss": 1.041, + "step": 18183 + }, + { + "epoch": 1.74, + "grad_norm": 0.38703300056610707, + "learning_rate": 9.115524215515936e-06, + "loss": 1.075, + "step": 18184 + }, + { + "epoch": 1.74, + "grad_norm": 0.33502737365419516, + "learning_rate": 9.108926497144799e-06, + "loss": 1.0295, + "step": 18185 + }, + { + "epoch": 1.74, + "grad_norm": 0.3119527192315406, + "learning_rate": 9.102331053379887e-06, + "loss": 1.1074, + "step": 18186 + }, + { + "epoch": 1.74, + "grad_norm": 0.27617489446989013, + "learning_rate": 9.09573788438628e-06, + "loss": 0.9952, + "step": 18187 + }, + { + "epoch": 1.74, + "grad_norm": 0.28402352025815597, + "learning_rate": 9.089146990328945e-06, + "loss": 0.9661, + "step": 18188 + }, + { + "epoch": 1.74, + "grad_norm": 0.31774965855861176, + "learning_rate": 9.082558371372841e-06, + "loss": 1.017, + "step": 18189 + }, + { + "epoch": 1.74, + "grad_norm": 0.3134022261672248, + "learning_rate": 9.075972027682855e-06, + "loss": 1.0018, + "step": 18190 + }, + { + "epoch": 1.74, + "grad_norm": 0.31801850963330697, + "learning_rate": 9.069387959423836e-06, + "loss": 1.0974, + "step": 18191 + }, + { + "epoch": 1.74, + "grad_norm": 0.3593266338966357, + "learning_rate": 9.062806166760506e-06, + "loss": 1.1046, + "step": 18192 + }, + { + "epoch": 1.74, + "grad_norm": 0.2900026465464531, + "learning_rate": 9.056226649857602e-06, + "loss": 1.0366, + "step": 18193 + }, + { + "epoch": 1.74, + "grad_norm": 0.32811716652640965, + "learning_rate": 9.049649408879801e-06, + "loss": 1.0063, + "step": 18194 + }, + { + "epoch": 1.74, + "grad_norm": 0.29273403542592175, + "learning_rate": 9.043074443991673e-06, + "loss": 0.9981, + "step": 18195 + }, + { + "epoch": 1.74, + "grad_norm": 0.274149125279032, + "learning_rate": 9.036501755357785e-06, + "loss": 1.0094, + "step": 18196 + }, + { + "epoch": 1.74, + "grad_norm": 0.36093925779180636, + "learning_rate": 9.029931343142572e-06, + "loss": 1.0543, + "step": 18197 + }, + { + "epoch": 1.74, + "grad_norm": 0.2820548148100993, + "learning_rate": 9.023363207510537e-06, + "loss": 0.9839, + "step": 18198 + }, + { + "epoch": 1.74, + "grad_norm": 0.28848596237807594, + "learning_rate": 9.016797348625995e-06, + "loss": 0.9178, + "step": 18199 + }, + { + "epoch": 1.74, + "grad_norm": 0.2826861503578286, + "learning_rate": 9.010233766653297e-06, + "loss": 0.9597, + "step": 18200 + }, + { + "epoch": 1.74, + "grad_norm": 0.36952403760501007, + "learning_rate": 9.003672461756674e-06, + "loss": 1.0244, + "step": 18201 + }, + { + "epoch": 1.74, + "grad_norm": 0.36865436750586256, + "learning_rate": 8.997113434100346e-06, + "loss": 0.9888, + "step": 18202 + }, + { + "epoch": 1.74, + "grad_norm": 0.3012421511650934, + "learning_rate": 8.990556683848428e-06, + "loss": 1.0591, + "step": 18203 + }, + { + "epoch": 1.74, + "grad_norm": 0.2973628644686906, + "learning_rate": 8.984002211165032e-06, + "loss": 1.0037, + "step": 18204 + }, + { + "epoch": 1.74, + "grad_norm": 0.27578277792076605, + "learning_rate": 8.977450016214184e-06, + "loss": 0.9758, + "step": 18205 + }, + { + "epoch": 1.74, + "grad_norm": 0.30089741295611744, + "learning_rate": 8.970900099159863e-06, + "loss": 0.9459, + "step": 18206 + }, + { + "epoch": 1.74, + "grad_norm": 0.35113401776799447, + "learning_rate": 8.96435246016597e-06, + "loss": 0.9836, + "step": 18207 + }, + { + "epoch": 1.74, + "grad_norm": 0.3326264209220799, + "learning_rate": 8.957807099396387e-06, + "loss": 1.0388, + "step": 18208 + }, + { + "epoch": 1.74, + "grad_norm": 0.30477119843352163, + "learning_rate": 8.951264017014871e-06, + "loss": 1.0167, + "step": 18209 + }, + { + "epoch": 1.74, + "grad_norm": 0.34329131918597444, + "learning_rate": 8.944723213185202e-06, + "loss": 0.8976, + "step": 18210 + }, + { + "epoch": 1.74, + "grad_norm": 0.3027989317741552, + "learning_rate": 8.93818468807106e-06, + "loss": 1.0165, + "step": 18211 + }, + { + "epoch": 1.74, + "grad_norm": 0.29630559224197633, + "learning_rate": 8.931648441836083e-06, + "loss": 1.0859, + "step": 18212 + }, + { + "epoch": 1.74, + "grad_norm": 0.33469225558114735, + "learning_rate": 8.925114474643847e-06, + "loss": 0.9936, + "step": 18213 + }, + { + "epoch": 1.74, + "grad_norm": 0.3050011335561082, + "learning_rate": 8.918582786657847e-06, + "loss": 1.0762, + "step": 18214 + }, + { + "epoch": 1.74, + "grad_norm": 0.3417813941194547, + "learning_rate": 8.912053378041562e-06, + "loss": 1.0503, + "step": 18215 + }, + { + "epoch": 1.74, + "grad_norm": 0.31561106207893486, + "learning_rate": 8.905526248958385e-06, + "loss": 1.0553, + "step": 18216 + }, + { + "epoch": 1.74, + "grad_norm": 0.36890586437704487, + "learning_rate": 8.89900139957166e-06, + "loss": 1.0107, + "step": 18217 + }, + { + "epoch": 1.74, + "grad_norm": 0.3185956947490087, + "learning_rate": 8.89247883004467e-06, + "loss": 0.9964, + "step": 18218 + }, + { + "epoch": 1.74, + "grad_norm": 0.30607076330757776, + "learning_rate": 8.885958540540685e-06, + "loss": 0.9163, + "step": 18219 + }, + { + "epoch": 1.74, + "grad_norm": 0.27100178219951304, + "learning_rate": 8.879440531222826e-06, + "loss": 1.1056, + "step": 18220 + }, + { + "epoch": 1.74, + "grad_norm": 0.2691596963981213, + "learning_rate": 8.872924802254256e-06, + "loss": 1.0696, + "step": 18221 + }, + { + "epoch": 1.74, + "grad_norm": 0.3416465503600971, + "learning_rate": 8.866411353797999e-06, + "loss": 0.984, + "step": 18222 + }, + { + "epoch": 1.74, + "grad_norm": 0.333239639471976, + "learning_rate": 8.85990018601709e-06, + "loss": 1.0216, + "step": 18223 + }, + { + "epoch": 1.74, + "grad_norm": 0.27753657986206864, + "learning_rate": 8.853391299074421e-06, + "loss": 1.0455, + "step": 18224 + }, + { + "epoch": 1.74, + "grad_norm": 0.30266938097770324, + "learning_rate": 8.84688469313295e-06, + "loss": 1.0301, + "step": 18225 + }, + { + "epoch": 1.74, + "grad_norm": 0.3454497009980911, + "learning_rate": 8.840380368355472e-06, + "loss": 0.9733, + "step": 18226 + }, + { + "epoch": 1.74, + "grad_norm": 0.40052184675399666, + "learning_rate": 8.833878324904776e-06, + "loss": 0.9424, + "step": 18227 + }, + { + "epoch": 1.74, + "grad_norm": 0.35565598558601436, + "learning_rate": 8.827378562943556e-06, + "loss": 1.0303, + "step": 18228 + }, + { + "epoch": 1.74, + "grad_norm": 0.3463834405704416, + "learning_rate": 8.82088108263448e-06, + "loss": 1.0193, + "step": 18229 + }, + { + "epoch": 1.74, + "grad_norm": 0.2811519807701716, + "learning_rate": 8.814385884140175e-06, + "loss": 0.9151, + "step": 18230 + }, + { + "epoch": 1.74, + "grad_norm": 0.33503921824561195, + "learning_rate": 8.807892967623165e-06, + "loss": 1.1084, + "step": 18231 + }, + { + "epoch": 1.74, + "grad_norm": 0.31132380675291005, + "learning_rate": 8.801402333245933e-06, + "loss": 1.0332, + "step": 18232 + }, + { + "epoch": 1.74, + "grad_norm": 0.3239736740594233, + "learning_rate": 8.794913981170938e-06, + "loss": 1.0642, + "step": 18233 + }, + { + "epoch": 1.74, + "grad_norm": 0.6131215341305762, + "learning_rate": 8.788427911560548e-06, + "loss": 1.0153, + "step": 18234 + }, + { + "epoch": 1.74, + "grad_norm": 0.31910146166196757, + "learning_rate": 8.781944124577057e-06, + "loss": 1.1442, + "step": 18235 + }, + { + "epoch": 1.74, + "grad_norm": 0.38448015606883174, + "learning_rate": 8.775462620382769e-06, + "loss": 0.9042, + "step": 18236 + }, + { + "epoch": 1.74, + "grad_norm": 0.34044842864686325, + "learning_rate": 8.768983399139818e-06, + "loss": 0.9379, + "step": 18237 + }, + { + "epoch": 1.74, + "grad_norm": 0.3101632430549503, + "learning_rate": 8.762506461010434e-06, + "loss": 1.0059, + "step": 18238 + }, + { + "epoch": 1.74, + "grad_norm": 0.3181757951773556, + "learning_rate": 8.756031806156638e-06, + "loss": 0.9725, + "step": 18239 + }, + { + "epoch": 1.75, + "grad_norm": 0.30034897971698765, + "learning_rate": 8.749559434740517e-06, + "loss": 1.0956, + "step": 18240 + }, + { + "epoch": 1.75, + "grad_norm": 0.28568249885217656, + "learning_rate": 8.743089346924005e-06, + "loss": 0.988, + "step": 18241 + }, + { + "epoch": 1.75, + "grad_norm": 0.34662675262157316, + "learning_rate": 8.73662154286905e-06, + "loss": 1.0325, + "step": 18242 + }, + { + "epoch": 1.75, + "grad_norm": 0.35197866209386786, + "learning_rate": 8.73015602273748e-06, + "loss": 0.9975, + "step": 18243 + }, + { + "epoch": 1.75, + "grad_norm": 0.31714566793330334, + "learning_rate": 8.723692786691117e-06, + "loss": 1.0196, + "step": 18244 + }, + { + "epoch": 1.75, + "grad_norm": 0.31914964727253153, + "learning_rate": 8.717231834891704e-06, + "loss": 0.9846, + "step": 18245 + }, + { + "epoch": 1.75, + "grad_norm": 0.29667552661756924, + "learning_rate": 8.710773167500953e-06, + "loss": 1.0016, + "step": 18246 + }, + { + "epoch": 1.75, + "grad_norm": 0.31389816289673755, + "learning_rate": 8.704316784680456e-06, + "loss": 1.0457, + "step": 18247 + }, + { + "epoch": 1.75, + "grad_norm": 0.28988159049011714, + "learning_rate": 8.697862686591828e-06, + "loss": 0.9561, + "step": 18248 + }, + { + "epoch": 1.75, + "grad_norm": 0.3257894190243748, + "learning_rate": 8.691410873396555e-06, + "loss": 1.0644, + "step": 18249 + }, + { + "epoch": 1.75, + "grad_norm": 0.35795442218843615, + "learning_rate": 8.684961345256104e-06, + "loss": 0.926, + "step": 18250 + }, + { + "epoch": 1.75, + "grad_norm": 0.3394093074628556, + "learning_rate": 8.678514102331892e-06, + "loss": 1.0695, + "step": 18251 + }, + { + "epoch": 1.75, + "grad_norm": 0.3164255066699684, + "learning_rate": 8.672069144785266e-06, + "loss": 1.1117, + "step": 18252 + }, + { + "epoch": 1.75, + "grad_norm": 0.314881658874949, + "learning_rate": 8.665626472777499e-06, + "loss": 1.0765, + "step": 18253 + }, + { + "epoch": 1.75, + "grad_norm": 0.33516567169089945, + "learning_rate": 8.659186086469828e-06, + "loss": 0.9703, + "step": 18254 + }, + { + "epoch": 1.75, + "grad_norm": 0.3045845427478362, + "learning_rate": 8.652747986023445e-06, + "loss": 1.0512, + "step": 18255 + }, + { + "epoch": 1.75, + "grad_norm": 0.3161893190538045, + "learning_rate": 8.646312171599436e-06, + "loss": 1.0339, + "step": 18256 + }, + { + "epoch": 1.75, + "grad_norm": 0.30836369501960925, + "learning_rate": 8.639878643358901e-06, + "loss": 1.0747, + "step": 18257 + }, + { + "epoch": 1.75, + "grad_norm": 0.31870114896879886, + "learning_rate": 8.63344740146278e-06, + "loss": 1.0432, + "step": 18258 + }, + { + "epoch": 1.75, + "grad_norm": 0.2994591324904543, + "learning_rate": 8.627018446072088e-06, + "loss": 1.053, + "step": 18259 + }, + { + "epoch": 1.75, + "grad_norm": 0.29021425667471396, + "learning_rate": 8.620591777347676e-06, + "loss": 1.0291, + "step": 18260 + }, + { + "epoch": 1.75, + "grad_norm": 0.26295098170353226, + "learning_rate": 8.614167395450412e-06, + "loss": 0.948, + "step": 18261 + }, + { + "epoch": 1.75, + "grad_norm": 0.3494602148830545, + "learning_rate": 8.607745300541015e-06, + "loss": 0.9987, + "step": 18262 + }, + { + "epoch": 1.75, + "grad_norm": 0.3516384470685503, + "learning_rate": 8.601325492780243e-06, + "loss": 0.9757, + "step": 18263 + }, + { + "epoch": 1.75, + "grad_norm": 0.29287600536023695, + "learning_rate": 8.594907972328724e-06, + "loss": 0.9623, + "step": 18264 + }, + { + "epoch": 1.75, + "grad_norm": 0.29562950780635366, + "learning_rate": 8.588492739347088e-06, + "loss": 0.9829, + "step": 18265 + }, + { + "epoch": 1.75, + "grad_norm": 0.3405449597297574, + "learning_rate": 8.582079793995867e-06, + "loss": 0.9241, + "step": 18266 + }, + { + "epoch": 1.75, + "grad_norm": 0.3298919714367362, + "learning_rate": 8.575669136435571e-06, + "loss": 0.9381, + "step": 18267 + }, + { + "epoch": 1.75, + "grad_norm": 0.33619260696439246, + "learning_rate": 8.569260766826603e-06, + "loss": 1.079, + "step": 18268 + }, + { + "epoch": 1.75, + "grad_norm": 0.28918146728995836, + "learning_rate": 8.562854685329358e-06, + "loss": 1.0854, + "step": 18269 + }, + { + "epoch": 1.75, + "grad_norm": 0.33117431899784067, + "learning_rate": 8.556450892104118e-06, + "loss": 1.1023, + "step": 18270 + }, + { + "epoch": 1.75, + "grad_norm": 0.30602940832521763, + "learning_rate": 8.550049387311165e-06, + "loss": 0.9259, + "step": 18271 + }, + { + "epoch": 1.75, + "grad_norm": 0.3551060056474373, + "learning_rate": 8.543650171110707e-06, + "loss": 1.0312, + "step": 18272 + }, + { + "epoch": 1.75, + "grad_norm": 0.32509397236312865, + "learning_rate": 8.53725324366289e-06, + "loss": 1.0912, + "step": 18273 + }, + { + "epoch": 1.75, + "grad_norm": 0.3120522480784208, + "learning_rate": 8.53085860512779e-06, + "loss": 1.0993, + "step": 18274 + }, + { + "epoch": 1.75, + "grad_norm": 0.2930348287542431, + "learning_rate": 8.52446625566542e-06, + "loss": 1.0542, + "step": 18275 + }, + { + "epoch": 1.75, + "grad_norm": 0.3012545002169073, + "learning_rate": 8.518076195435809e-06, + "loss": 1.0258, + "step": 18276 + }, + { + "epoch": 1.75, + "grad_norm": 0.31254330307704864, + "learning_rate": 8.511688424598808e-06, + "loss": 1.0088, + "step": 18277 + }, + { + "epoch": 1.75, + "grad_norm": 0.3085840562648264, + "learning_rate": 8.505302943314297e-06, + "loss": 0.9631, + "step": 18278 + }, + { + "epoch": 1.75, + "grad_norm": 0.3107330155458428, + "learning_rate": 8.498919751742096e-06, + "loss": 1.0607, + "step": 18279 + }, + { + "epoch": 1.75, + "grad_norm": 0.31991746732508397, + "learning_rate": 8.492538850041943e-06, + "loss": 0.9096, + "step": 18280 + }, + { + "epoch": 1.75, + "grad_norm": 0.33730874595749794, + "learning_rate": 8.486160238373497e-06, + "loss": 0.9734, + "step": 18281 + }, + { + "epoch": 1.75, + "grad_norm": 0.3248107983396405, + "learning_rate": 8.479783916896423e-06, + "loss": 1.029, + "step": 18282 + }, + { + "epoch": 1.75, + "grad_norm": 0.3321321440711838, + "learning_rate": 8.473409885770267e-06, + "loss": 1.1526, + "step": 18283 + }, + { + "epoch": 1.75, + "grad_norm": 0.30883507911897845, + "learning_rate": 8.46703814515456e-06, + "loss": 1.0455, + "step": 18284 + }, + { + "epoch": 1.75, + "grad_norm": 0.342577988519466, + "learning_rate": 8.460668695208718e-06, + "loss": 1.0514, + "step": 18285 + }, + { + "epoch": 1.75, + "grad_norm": 0.28942189274160685, + "learning_rate": 8.454301536092201e-06, + "loss": 1.0434, + "step": 18286 + }, + { + "epoch": 1.75, + "grad_norm": 0.30017036254763035, + "learning_rate": 8.447936667964307e-06, + "loss": 1.008, + "step": 18287 + }, + { + "epoch": 1.75, + "grad_norm": 0.32488647970028756, + "learning_rate": 8.44157409098436e-06, + "loss": 1.1344, + "step": 18288 + }, + { + "epoch": 1.75, + "grad_norm": 0.3523626883217153, + "learning_rate": 8.435213805311537e-06, + "loss": 0.9556, + "step": 18289 + }, + { + "epoch": 1.75, + "grad_norm": 0.31889361871304506, + "learning_rate": 8.428855811105062e-06, + "loss": 1.1024, + "step": 18290 + }, + { + "epoch": 1.75, + "grad_norm": 0.31383602108749203, + "learning_rate": 8.422500108524e-06, + "loss": 1.01, + "step": 18291 + }, + { + "epoch": 1.75, + "eval_loss": 1.1242320537567139, + "eval_runtime": 4225.6098, + "eval_samples_per_second": 19.789, + "eval_steps_per_second": 2.474, + "step": 18291 + }, + { + "epoch": 1.75, + "grad_norm": 0.3335209231761056, + "learning_rate": 8.416146697727423e-06, + "loss": 1.0494, + "step": 18292 + }, + { + "epoch": 1.75, + "grad_norm": 0.34153618621079757, + "learning_rate": 8.409795578874336e-06, + "loss": 1.0039, + "step": 18293 + }, + { + "epoch": 1.75, + "grad_norm": 0.3239198840912569, + "learning_rate": 8.403446752123689e-06, + "loss": 1.072, + "step": 18294 + }, + { + "epoch": 1.75, + "grad_norm": 0.30106573808334963, + "learning_rate": 8.397100217634346e-06, + "loss": 0.9054, + "step": 18295 + }, + { + "epoch": 1.75, + "grad_norm": 0.2863865861753647, + "learning_rate": 8.390755975565135e-06, + "loss": 1.0414, + "step": 18296 + }, + { + "epoch": 1.75, + "grad_norm": 0.3206044155302858, + "learning_rate": 8.38441402607485e-06, + "loss": 1.0466, + "step": 18297 + }, + { + "epoch": 1.75, + "grad_norm": 0.2981027887607858, + "learning_rate": 8.378074369322153e-06, + "loss": 1.007, + "step": 18298 + }, + { + "epoch": 1.75, + "grad_norm": 0.28236980042786636, + "learning_rate": 8.37173700546574e-06, + "loss": 0.9037, + "step": 18299 + }, + { + "epoch": 1.75, + "grad_norm": 0.2558070399515313, + "learning_rate": 8.365401934664197e-06, + "loss": 1.0422, + "step": 18300 + }, + { + "epoch": 1.75, + "grad_norm": 0.31584822176259403, + "learning_rate": 8.359069157076071e-06, + "loss": 1.0723, + "step": 18301 + }, + { + "epoch": 1.75, + "grad_norm": 0.327337334223491, + "learning_rate": 8.352738672859817e-06, + "loss": 1.0363, + "step": 18302 + }, + { + "epoch": 1.75, + "grad_norm": 0.3231558067902488, + "learning_rate": 8.346410482173894e-06, + "loss": 1.0044, + "step": 18303 + }, + { + "epoch": 1.75, + "grad_norm": 0.3218111408605086, + "learning_rate": 8.340084585176633e-06, + "loss": 1.057, + "step": 18304 + }, + { + "epoch": 1.75, + "grad_norm": 0.3240440196814289, + "learning_rate": 8.333760982026362e-06, + "loss": 0.9467, + "step": 18305 + }, + { + "epoch": 1.75, + "grad_norm": 0.312735939977943, + "learning_rate": 8.327439672881333e-06, + "loss": 0.9674, + "step": 18306 + }, + { + "epoch": 1.75, + "grad_norm": 0.327912386823362, + "learning_rate": 8.32112065789975e-06, + "loss": 0.9726, + "step": 18307 + }, + { + "epoch": 1.75, + "grad_norm": 0.295137767899863, + "learning_rate": 8.314803937239734e-06, + "loss": 0.9871, + "step": 18308 + }, + { + "epoch": 1.75, + "grad_norm": 0.28526052502036037, + "learning_rate": 8.30848951105938e-06, + "loss": 0.8958, + "step": 18309 + }, + { + "epoch": 1.75, + "grad_norm": 0.3035276870416069, + "learning_rate": 8.302177379516695e-06, + "loss": 0.9658, + "step": 18310 + }, + { + "epoch": 1.75, + "grad_norm": 0.31559530630169924, + "learning_rate": 8.295867542769642e-06, + "loss": 1.0748, + "step": 18311 + }, + { + "epoch": 1.75, + "grad_norm": 0.2921326176168538, + "learning_rate": 8.289560000976138e-06, + "loss": 0.8845, + "step": 18312 + }, + { + "epoch": 1.75, + "grad_norm": 0.3183800790668581, + "learning_rate": 8.283254754294056e-06, + "loss": 1.0493, + "step": 18313 + }, + { + "epoch": 1.75, + "grad_norm": 0.2943540935345056, + "learning_rate": 8.27695180288115e-06, + "loss": 1.0158, + "step": 18314 + }, + { + "epoch": 1.75, + "grad_norm": 0.32955832113648903, + "learning_rate": 8.270651146895159e-06, + "loss": 1.0903, + "step": 18315 + }, + { + "epoch": 1.75, + "grad_norm": 0.30720546854673336, + "learning_rate": 8.2643527864938e-06, + "loss": 1.0361, + "step": 18316 + }, + { + "epoch": 1.75, + "grad_norm": 0.31918412486065045, + "learning_rate": 8.25805672183465e-06, + "loss": 1.0534, + "step": 18317 + }, + { + "epoch": 1.75, + "grad_norm": 0.288015582300729, + "learning_rate": 8.251762953075303e-06, + "loss": 0.941, + "step": 18318 + }, + { + "epoch": 1.75, + "grad_norm": 0.31543834952421823, + "learning_rate": 8.245471480373213e-06, + "loss": 1.0438, + "step": 18319 + }, + { + "epoch": 1.75, + "grad_norm": 0.3338043082502734, + "learning_rate": 8.239182303885895e-06, + "loss": 1.0543, + "step": 18320 + }, + { + "epoch": 1.75, + "grad_norm": 0.2913907674659578, + "learning_rate": 8.232895423770704e-06, + "loss": 1.0707, + "step": 18321 + }, + { + "epoch": 1.75, + "grad_norm": 0.34606942392482926, + "learning_rate": 8.22661084018499e-06, + "loss": 0.9509, + "step": 18322 + }, + { + "epoch": 1.75, + "grad_norm": 0.34643386456191544, + "learning_rate": 8.220328553285993e-06, + "loss": 1.0228, + "step": 18323 + }, + { + "epoch": 1.75, + "grad_norm": 0.32934991601667224, + "learning_rate": 8.21404856323098e-06, + "loss": 1.1139, + "step": 18324 + }, + { + "epoch": 1.75, + "grad_norm": 0.2693013855109291, + "learning_rate": 8.207770870177068e-06, + "loss": 1.0461, + "step": 18325 + }, + { + "epoch": 1.75, + "grad_norm": 0.33149850174169365, + "learning_rate": 8.201495474281385e-06, + "loss": 1.0528, + "step": 18326 + }, + { + "epoch": 1.75, + "grad_norm": 0.33314321255320045, + "learning_rate": 8.195222375700962e-06, + "loss": 1.0006, + "step": 18327 + }, + { + "epoch": 1.75, + "grad_norm": 0.2786991817744734, + "learning_rate": 8.188951574592818e-06, + "loss": 1.0573, + "step": 18328 + }, + { + "epoch": 1.75, + "grad_norm": 0.3189012267142431, + "learning_rate": 8.18268307111384e-06, + "loss": 1.0071, + "step": 18329 + }, + { + "epoch": 1.75, + "grad_norm": 0.3257391789470805, + "learning_rate": 8.176416865420944e-06, + "loss": 0.8708, + "step": 18330 + }, + { + "epoch": 1.75, + "grad_norm": 0.30221085272057135, + "learning_rate": 8.170152957670907e-06, + "loss": 1.0254, + "step": 18331 + }, + { + "epoch": 1.75, + "grad_norm": 0.3005946268870369, + "learning_rate": 8.163891348020502e-06, + "loss": 1.0676, + "step": 18332 + }, + { + "epoch": 1.75, + "grad_norm": 0.35302483510270793, + "learning_rate": 8.157632036626439e-06, + "loss": 1.1175, + "step": 18333 + }, + { + "epoch": 1.75, + "grad_norm": 0.3242807865504245, + "learning_rate": 8.15137502364537e-06, + "loss": 1.0244, + "step": 18334 + }, + { + "epoch": 1.75, + "grad_norm": 0.36265100933065025, + "learning_rate": 8.145120309233856e-06, + "loss": 1.0904, + "step": 18335 + }, + { + "epoch": 1.75, + "grad_norm": 0.37157209222929954, + "learning_rate": 8.138867893548418e-06, + "loss": 1.0304, + "step": 18336 + }, + { + "epoch": 1.75, + "grad_norm": 0.3609085791637654, + "learning_rate": 8.132617776745576e-06, + "loss": 1.1059, + "step": 18337 + }, + { + "epoch": 1.75, + "grad_norm": 0.2853303716327857, + "learning_rate": 8.126369958981683e-06, + "loss": 1.019, + "step": 18338 + }, + { + "epoch": 1.75, + "grad_norm": 0.2794969175660827, + "learning_rate": 8.120124440413135e-06, + "loss": 1.0311, + "step": 18339 + }, + { + "epoch": 1.75, + "grad_norm": 0.30552870874539856, + "learning_rate": 8.113881221196207e-06, + "loss": 0.8498, + "step": 18340 + }, + { + "epoch": 1.75, + "grad_norm": 0.3167525514450764, + "learning_rate": 8.107640301487162e-06, + "loss": 0.9592, + "step": 18341 + }, + { + "epoch": 1.75, + "grad_norm": 0.3367329626601782, + "learning_rate": 8.101401681442166e-06, + "loss": 1.0442, + "step": 18342 + }, + { + "epoch": 1.75, + "grad_norm": 0.29503559542414626, + "learning_rate": 8.09516536121736e-06, + "loss": 0.8413, + "step": 18343 + }, + { + "epoch": 1.75, + "grad_norm": 0.32287658382721657, + "learning_rate": 8.088931340968786e-06, + "loss": 1.0028, + "step": 18344 + }, + { + "epoch": 1.76, + "grad_norm": 0.2823816514373493, + "learning_rate": 8.082699620852486e-06, + "loss": 0.9933, + "step": 18345 + }, + { + "epoch": 1.76, + "grad_norm": 0.30009018052137776, + "learning_rate": 8.076470201024355e-06, + "loss": 0.904, + "step": 18346 + }, + { + "epoch": 1.76, + "grad_norm": 0.37214696784008533, + "learning_rate": 8.070243081640371e-06, + "loss": 1.0774, + "step": 18347 + }, + { + "epoch": 1.76, + "grad_norm": 0.3096588991961864, + "learning_rate": 8.06401826285631e-06, + "loss": 1.0702, + "step": 18348 + }, + { + "epoch": 1.76, + "grad_norm": 0.2808521046474838, + "learning_rate": 8.057795744827989e-06, + "loss": 1.1005, + "step": 18349 + }, + { + "epoch": 1.76, + "grad_norm": 0.2888241051172836, + "learning_rate": 8.051575527711097e-06, + "loss": 0.8637, + "step": 18350 + }, + { + "epoch": 1.76, + "grad_norm": 0.3064069580749499, + "learning_rate": 8.045357611661331e-06, + "loss": 0.9696, + "step": 18351 + }, + { + "epoch": 1.76, + "grad_norm": 0.3424504002434234, + "learning_rate": 8.039141996834254e-06, + "loss": 0.974, + "step": 18352 + }, + { + "epoch": 1.76, + "grad_norm": 0.30290612714701415, + "learning_rate": 8.032928683385454e-06, + "loss": 1.0631, + "step": 18353 + }, + { + "epoch": 1.76, + "grad_norm": 0.39994822379423, + "learning_rate": 8.02671767147042e-06, + "loss": 1.068, + "step": 18354 + }, + { + "epoch": 1.76, + "grad_norm": 0.3422171280321558, + "learning_rate": 8.020508961244589e-06, + "loss": 0.8501, + "step": 18355 + }, + { + "epoch": 1.76, + "grad_norm": 0.28479706636723806, + "learning_rate": 8.014302552863307e-06, + "loss": 1.03, + "step": 18356 + }, + { + "epoch": 1.76, + "grad_norm": 0.31566244828579404, + "learning_rate": 8.008098446481927e-06, + "loss": 1.0119, + "step": 18357 + }, + { + "epoch": 1.76, + "grad_norm": 0.3735451031035795, + "learning_rate": 8.001896642255701e-06, + "loss": 0.9911, + "step": 18358 + }, + { + "epoch": 1.76, + "grad_norm": 0.33806223808797353, + "learning_rate": 7.99569714033982e-06, + "loss": 0.9355, + "step": 18359 + }, + { + "epoch": 1.76, + "grad_norm": 0.32222370557236146, + "learning_rate": 7.989499940889444e-06, + "loss": 0.9816, + "step": 18360 + }, + { + "epoch": 1.76, + "grad_norm": 0.39277607393379665, + "learning_rate": 7.983305044059653e-06, + "loss": 0.9848, + "step": 18361 + }, + { + "epoch": 1.76, + "grad_norm": 0.2987662331134108, + "learning_rate": 7.977112450005498e-06, + "loss": 0.9005, + "step": 18362 + }, + { + "epoch": 1.76, + "grad_norm": 0.30444274557255757, + "learning_rate": 7.970922158881921e-06, + "loss": 1.1375, + "step": 18363 + }, + { + "epoch": 1.76, + "grad_norm": 0.29262710934850195, + "learning_rate": 7.964734170843879e-06, + "loss": 1.031, + "step": 18364 + }, + { + "epoch": 1.76, + "grad_norm": 0.3366041142216789, + "learning_rate": 7.95854848604619e-06, + "loss": 1.0489, + "step": 18365 + }, + { + "epoch": 1.76, + "grad_norm": 0.3152169048129086, + "learning_rate": 7.952365104643666e-06, + "loss": 1.0138, + "step": 18366 + }, + { + "epoch": 1.76, + "grad_norm": 0.32610867745798033, + "learning_rate": 7.946184026791059e-06, + "loss": 1.1462, + "step": 18367 + }, + { + "epoch": 1.76, + "grad_norm": 0.3494862232036809, + "learning_rate": 7.940005252643057e-06, + "loss": 0.9286, + "step": 18368 + }, + { + "epoch": 1.76, + "grad_norm": 0.32236331677321145, + "learning_rate": 7.93382878235428e-06, + "loss": 1.0182, + "step": 18369 + }, + { + "epoch": 1.76, + "grad_norm": 0.3273043972110613, + "learning_rate": 7.927654616079305e-06, + "loss": 1.0448, + "step": 18370 + }, + { + "epoch": 1.76, + "grad_norm": 0.333033920576632, + "learning_rate": 7.92148275397262e-06, + "loss": 1.019, + "step": 18371 + }, + { + "epoch": 1.76, + "grad_norm": 0.2996208260547191, + "learning_rate": 7.915313196188723e-06, + "loss": 1.0407, + "step": 18372 + }, + { + "epoch": 1.76, + "grad_norm": 0.33347722735179347, + "learning_rate": 7.909145942881945e-06, + "loss": 1.0383, + "step": 18373 + }, + { + "epoch": 1.76, + "grad_norm": 0.31666238406988645, + "learning_rate": 7.90298099420671e-06, + "loss": 1.0628, + "step": 18374 + }, + { + "epoch": 1.76, + "grad_norm": 0.3183464578751749, + "learning_rate": 7.896818350317236e-06, + "loss": 1.0169, + "step": 18375 + }, + { + "epoch": 1.76, + "grad_norm": 0.29777903290687097, + "learning_rate": 7.890658011367779e-06, + "loss": 0.9189, + "step": 18376 + }, + { + "epoch": 1.76, + "grad_norm": 0.344300055976854, + "learning_rate": 7.884499977512484e-06, + "loss": 1.0766, + "step": 18377 + }, + { + "epoch": 1.76, + "grad_norm": 0.25989794638324376, + "learning_rate": 7.878344248905467e-06, + "loss": 0.9457, + "step": 18378 + }, + { + "epoch": 1.76, + "grad_norm": 0.2918906256971073, + "learning_rate": 7.872190825700798e-06, + "loss": 0.962, + "step": 18379 + }, + { + "epoch": 1.76, + "grad_norm": 0.3351116520219412, + "learning_rate": 7.86603970805243e-06, + "loss": 1.0158, + "step": 18380 + }, + { + "epoch": 1.76, + "grad_norm": 0.3054628472927794, + "learning_rate": 7.85989089611433e-06, + "loss": 1.0782, + "step": 18381 + }, + { + "epoch": 1.76, + "grad_norm": 0.30986771292049575, + "learning_rate": 7.853744390040374e-06, + "loss": 0.8809, + "step": 18382 + }, + { + "epoch": 1.76, + "grad_norm": 0.38406790856516615, + "learning_rate": 7.847600189984383e-06, + "loss": 1.0638, + "step": 18383 + }, + { + "epoch": 1.76, + "grad_norm": 0.2784791209528582, + "learning_rate": 7.841458296100102e-06, + "loss": 1.1157, + "step": 18384 + }, + { + "epoch": 1.76, + "grad_norm": 0.3338717293709326, + "learning_rate": 7.835318708541262e-06, + "loss": 1.0654, + "step": 18385 + }, + { + "epoch": 1.76, + "grad_norm": 0.29778434012983634, + "learning_rate": 7.829181427461475e-06, + "loss": 1.0121, + "step": 18386 + }, + { + "epoch": 1.76, + "grad_norm": 0.3397355042226414, + "learning_rate": 7.823046453014349e-06, + "loss": 1.0927, + "step": 18387 + }, + { + "epoch": 1.76, + "grad_norm": 0.3266313568429112, + "learning_rate": 7.81691378535342e-06, + "loss": 0.961, + "step": 18388 + }, + { + "epoch": 1.76, + "grad_norm": 0.2728362766493653, + "learning_rate": 7.810783424632174e-06, + "loss": 1.0964, + "step": 18389 + }, + { + "epoch": 1.76, + "grad_norm": 0.3069854645227864, + "learning_rate": 7.804655371003989e-06, + "loss": 0.9414, + "step": 18390 + }, + { + "epoch": 1.76, + "grad_norm": 0.272697192251724, + "learning_rate": 7.798529624622274e-06, + "loss": 0.9188, + "step": 18391 + }, + { + "epoch": 1.76, + "grad_norm": 0.3094120396419827, + "learning_rate": 7.792406185640277e-06, + "loss": 1.0022, + "step": 18392 + }, + { + "epoch": 1.76, + "grad_norm": 0.2711549385620591, + "learning_rate": 7.786285054211273e-06, + "loss": 0.9531, + "step": 18393 + }, + { + "epoch": 1.76, + "grad_norm": 0.34240482789479204, + "learning_rate": 7.780166230488428e-06, + "loss": 0.9044, + "step": 18394 + }, + { + "epoch": 1.76, + "grad_norm": 0.2718425285377865, + "learning_rate": 7.774049714624909e-06, + "loss": 0.9923, + "step": 18395 + }, + { + "epoch": 1.76, + "grad_norm": 0.32292345943887, + "learning_rate": 7.767935506773739e-06, + "loss": 1.0382, + "step": 18396 + }, + { + "epoch": 1.76, + "grad_norm": 0.3109491395843045, + "learning_rate": 7.76182360708796e-06, + "loss": 0.9846, + "step": 18397 + }, + { + "epoch": 1.76, + "grad_norm": 0.3299427715652202, + "learning_rate": 7.755714015720506e-06, + "loss": 1.0702, + "step": 18398 + }, + { + "epoch": 1.76, + "grad_norm": 0.29550509017997173, + "learning_rate": 7.749606732824277e-06, + "loss": 1.0178, + "step": 18399 + }, + { + "epoch": 1.76, + "grad_norm": 0.26192048985381006, + "learning_rate": 7.743501758552108e-06, + "loss": 1.0608, + "step": 18400 + }, + { + "epoch": 1.76, + "grad_norm": 0.32709779687086626, + "learning_rate": 7.737399093056797e-06, + "loss": 1.0129, + "step": 18401 + }, + { + "epoch": 1.76, + "grad_norm": 0.283693171170967, + "learning_rate": 7.731298736491078e-06, + "loss": 1.0348, + "step": 18402 + }, + { + "epoch": 1.76, + "grad_norm": 0.3114527031472833, + "learning_rate": 7.725200689007584e-06, + "loss": 1.0086, + "step": 18403 + }, + { + "epoch": 1.76, + "grad_norm": 0.29030961082433365, + "learning_rate": 7.719104950758938e-06, + "loss": 1.0468, + "step": 18404 + }, + { + "epoch": 1.76, + "grad_norm": 0.37468377052021073, + "learning_rate": 7.713011521897684e-06, + "loss": 1.0202, + "step": 18405 + }, + { + "epoch": 1.76, + "grad_norm": 0.35376082772005507, + "learning_rate": 7.706920402576324e-06, + "loss": 0.9359, + "step": 18406 + }, + { + "epoch": 1.76, + "grad_norm": 0.33946188252683096, + "learning_rate": 7.700831592947255e-06, + "loss": 1.1324, + "step": 18407 + }, + { + "epoch": 1.76, + "grad_norm": 0.3148796252628572, + "learning_rate": 7.694745093162913e-06, + "loss": 1.051, + "step": 18408 + }, + { + "epoch": 1.76, + "grad_norm": 0.2774605421170154, + "learning_rate": 7.688660903375577e-06, + "loss": 1.0342, + "step": 18409 + }, + { + "epoch": 1.76, + "grad_norm": 0.30229330714472796, + "learning_rate": 7.682579023737535e-06, + "loss": 1.0211, + "step": 18410 + }, + { + "epoch": 1.76, + "grad_norm": 0.3210915984026661, + "learning_rate": 7.676499454400954e-06, + "loss": 0.9855, + "step": 18411 + }, + { + "epoch": 1.76, + "grad_norm": 0.33287687854029685, + "learning_rate": 7.670422195518012e-06, + "loss": 1.1078, + "step": 18412 + }, + { + "epoch": 1.76, + "grad_norm": 0.34747942404547705, + "learning_rate": 7.664347247240754e-06, + "loss": 1.1079, + "step": 18413 + }, + { + "epoch": 1.76, + "grad_norm": 0.29471930968482846, + "learning_rate": 7.658274609721249e-06, + "loss": 0.8613, + "step": 18414 + }, + { + "epoch": 1.76, + "grad_norm": 0.31364392115354583, + "learning_rate": 7.652204283111452e-06, + "loss": 1.1112, + "step": 18415 + }, + { + "epoch": 1.76, + "grad_norm": 0.3231681763470353, + "learning_rate": 7.646136267563308e-06, + "loss": 1.0092, + "step": 18416 + }, + { + "epoch": 1.76, + "grad_norm": 0.3135944842745121, + "learning_rate": 7.640070563228618e-06, + "loss": 0.9291, + "step": 18417 + }, + { + "epoch": 1.76, + "grad_norm": 0.3286307280055697, + "learning_rate": 7.634007170259217e-06, + "loss": 1.147, + "step": 18418 + }, + { + "epoch": 1.76, + "grad_norm": 0.2869143350882997, + "learning_rate": 7.627946088806848e-06, + "loss": 0.9693, + "step": 18419 + }, + { + "epoch": 1.76, + "grad_norm": 0.3223483502934543, + "learning_rate": 7.621887319023169e-06, + "loss": 1.1475, + "step": 18420 + }, + { + "epoch": 1.76, + "grad_norm": 0.31239850246726153, + "learning_rate": 7.615830861059814e-06, + "loss": 1.0592, + "step": 18421 + }, + { + "epoch": 1.76, + "grad_norm": 0.3282237179904265, + "learning_rate": 7.60977671506835e-06, + "loss": 1.0402, + "step": 18422 + }, + { + "epoch": 1.76, + "grad_norm": 0.2813893360061304, + "learning_rate": 7.603724881200303e-06, + "loss": 0.8837, + "step": 18423 + }, + { + "epoch": 1.76, + "grad_norm": 0.29057923972184757, + "learning_rate": 7.5976753596071034e-06, + "loss": 1.0601, + "step": 18424 + }, + { + "epoch": 1.76, + "grad_norm": 0.2869892358687874, + "learning_rate": 7.591628150440156e-06, + "loss": 1.0215, + "step": 18425 + }, + { + "epoch": 1.76, + "grad_norm": 0.3381925605281966, + "learning_rate": 7.585583253850781e-06, + "loss": 0.9878, + "step": 18426 + }, + { + "epoch": 1.76, + "grad_norm": 0.31882694952594254, + "learning_rate": 7.5795406699902705e-06, + "loss": 1.0694, + "step": 18427 + }, + { + "epoch": 1.76, + "grad_norm": 0.294404309741497, + "learning_rate": 7.5735003990098254e-06, + "loss": 0.9639, + "step": 18428 + }, + { + "epoch": 1.76, + "grad_norm": 0.3250694459291814, + "learning_rate": 7.567462441060646e-06, + "loss": 0.9124, + "step": 18429 + }, + { + "epoch": 1.76, + "grad_norm": 0.2980181099157457, + "learning_rate": 7.561426796293791e-06, + "loss": 1.073, + "step": 18430 + }, + { + "epoch": 1.76, + "grad_norm": 0.3486247139257687, + "learning_rate": 7.55539346486035e-06, + "loss": 0.9734, + "step": 18431 + }, + { + "epoch": 1.76, + "grad_norm": 0.32367744674229837, + "learning_rate": 7.549362446911268e-06, + "loss": 0.9952, + "step": 18432 + }, + { + "epoch": 1.76, + "grad_norm": 0.311545701358848, + "learning_rate": 7.543333742597502e-06, + "loss": 0.9586, + "step": 18433 + }, + { + "epoch": 1.76, + "grad_norm": 0.31003120393810696, + "learning_rate": 7.537307352069889e-06, + "loss": 1.0112, + "step": 18434 + }, + { + "epoch": 1.76, + "grad_norm": 0.3229581563459572, + "learning_rate": 7.531283275479306e-06, + "loss": 1.0406, + "step": 18435 + }, + { + "epoch": 1.76, + "grad_norm": 0.32034783644814896, + "learning_rate": 7.525261512976445e-06, + "loss": 1.0351, + "step": 18436 + }, + { + "epoch": 1.76, + "grad_norm": 0.35646136863830796, + "learning_rate": 7.519242064712062e-06, + "loss": 1.0881, + "step": 18437 + }, + { + "epoch": 1.76, + "grad_norm": 0.308437876147295, + "learning_rate": 7.513224930836748e-06, + "loss": 1.0369, + "step": 18438 + }, + { + "epoch": 1.76, + "grad_norm": 0.3025805624177084, + "learning_rate": 7.507210111501106e-06, + "loss": 0.99, + "step": 18439 + }, + { + "epoch": 1.76, + "grad_norm": 0.30694973577766776, + "learning_rate": 7.5011976068556696e-06, + "loss": 0.9141, + "step": 18440 + }, + { + "epoch": 1.76, + "grad_norm": 0.3374164206908718, + "learning_rate": 7.495187417050887e-06, + "loss": 1.0732, + "step": 18441 + }, + { + "epoch": 1.76, + "grad_norm": 0.33722914231548323, + "learning_rate": 7.489179542237179e-06, + "loss": 0.9788, + "step": 18442 + }, + { + "epoch": 1.76, + "grad_norm": 0.2981982204344903, + "learning_rate": 7.483173982564884e-06, + "loss": 0.9278, + "step": 18443 + }, + { + "epoch": 1.76, + "grad_norm": 0.29130166675828023, + "learning_rate": 7.477170738184336e-06, + "loss": 0.9519, + "step": 18444 + }, + { + "epoch": 1.76, + "grad_norm": 0.3382185623256633, + "learning_rate": 7.471169809245715e-06, + "loss": 1.075, + "step": 18445 + }, + { + "epoch": 1.76, + "grad_norm": 0.32158093699635926, + "learning_rate": 7.465171195899235e-06, + "loss": 0.9646, + "step": 18446 + }, + { + "epoch": 1.76, + "grad_norm": 0.30305392559800404, + "learning_rate": 7.459174898294985e-06, + "loss": 1.0622, + "step": 18447 + }, + { + "epoch": 1.76, + "grad_norm": 0.30764864436425626, + "learning_rate": 7.4531809165830465e-06, + "loss": 0.9802, + "step": 18448 + }, + { + "epoch": 1.77, + "grad_norm": 0.29125928399811046, + "learning_rate": 7.447189250913422e-06, + "loss": 1.0258, + "step": 18449 + }, + { + "epoch": 1.77, + "grad_norm": 0.2950502079101755, + "learning_rate": 7.441199901436058e-06, + "loss": 1.0313, + "step": 18450 + }, + { + "epoch": 1.77, + "grad_norm": 0.27775383485918315, + "learning_rate": 7.435212868300834e-06, + "loss": 0.9727, + "step": 18451 + }, + { + "epoch": 1.77, + "grad_norm": 0.30781804166942867, + "learning_rate": 7.429228151657597e-06, + "loss": 1.0778, + "step": 18452 + }, + { + "epoch": 1.77, + "grad_norm": 0.34778353358692543, + "learning_rate": 7.423245751656083e-06, + "loss": 0.9936, + "step": 18453 + }, + { + "epoch": 1.77, + "grad_norm": 0.3363419774766726, + "learning_rate": 7.417265668446028e-06, + "loss": 1.0729, + "step": 18454 + }, + { + "epoch": 1.77, + "grad_norm": 0.29477559918965657, + "learning_rate": 7.411287902177088e-06, + "loss": 1.0615, + "step": 18455 + }, + { + "epoch": 1.77, + "grad_norm": 0.29471470296277397, + "learning_rate": 7.4053124529988694e-06, + "loss": 0.9608, + "step": 18456 + }, + { + "epoch": 1.77, + "grad_norm": 0.28218767359656083, + "learning_rate": 7.399339321060883e-06, + "loss": 1.0449, + "step": 18457 + }, + { + "epoch": 1.77, + "grad_norm": 0.31356129504597846, + "learning_rate": 7.393368506512655e-06, + "loss": 1.1775, + "step": 18458 + }, + { + "epoch": 1.77, + "grad_norm": 0.31898632501328283, + "learning_rate": 7.387400009503554e-06, + "loss": 1.0369, + "step": 18459 + }, + { + "epoch": 1.77, + "grad_norm": 0.3293644488988385, + "learning_rate": 7.381433830182971e-06, + "loss": 0.8985, + "step": 18460 + }, + { + "epoch": 1.77, + "grad_norm": 0.2659910337617717, + "learning_rate": 7.375469968700221e-06, + "loss": 0.9435, + "step": 18461 + }, + { + "epoch": 1.77, + "grad_norm": 0.32547932487950443, + "learning_rate": 7.369508425204563e-06, + "loss": 1.0877, + "step": 18462 + }, + { + "epoch": 1.77, + "grad_norm": 0.3351976728694077, + "learning_rate": 7.363549199845165e-06, + "loss": 0.9285, + "step": 18463 + }, + { + "epoch": 1.77, + "grad_norm": 0.27274566197635347, + "learning_rate": 7.357592292771154e-06, + "loss": 1.0345, + "step": 18464 + }, + { + "epoch": 1.77, + "grad_norm": 0.3034071371391061, + "learning_rate": 7.351637704131642e-06, + "loss": 1.0598, + "step": 18465 + }, + { + "epoch": 1.77, + "grad_norm": 0.3271740873237195, + "learning_rate": 7.3456854340756105e-06, + "loss": 1.0734, + "step": 18466 + }, + { + "epoch": 1.77, + "grad_norm": 0.2867166900199094, + "learning_rate": 7.3397354827520416e-06, + "loss": 1.0093, + "step": 18467 + }, + { + "epoch": 1.77, + "grad_norm": 0.29073668710176426, + "learning_rate": 7.333787850309793e-06, + "loss": 0.9193, + "step": 18468 + }, + { + "epoch": 1.77, + "grad_norm": 0.28966549666906505, + "learning_rate": 7.3278425368977885e-06, + "loss": 1.0223, + "step": 18469 + }, + { + "epoch": 1.77, + "grad_norm": 0.32943824401427374, + "learning_rate": 7.321899542664734e-06, + "loss": 1.0446, + "step": 18470 + }, + { + "epoch": 1.77, + "grad_norm": 0.31207042671727836, + "learning_rate": 7.315958867759409e-06, + "loss": 1.0401, + "step": 18471 + }, + { + "epoch": 1.77, + "grad_norm": 0.3146955973665837, + "learning_rate": 7.3100205123304515e-06, + "loss": 1.0583, + "step": 18472 + }, + { + "epoch": 1.77, + "grad_norm": 0.32420919868654235, + "learning_rate": 7.304084476526507e-06, + "loss": 1.0271, + "step": 18473 + }, + { + "epoch": 1.77, + "grad_norm": 0.3298720899465551, + "learning_rate": 7.298150760496081e-06, + "loss": 1.0075, + "step": 18474 + }, + { + "epoch": 1.77, + "grad_norm": 0.300081217725146, + "learning_rate": 7.292219364387687e-06, + "loss": 1.067, + "step": 18475 + }, + { + "epoch": 1.77, + "grad_norm": 0.3020539138560655, + "learning_rate": 7.286290288349784e-06, + "loss": 0.9996, + "step": 18476 + }, + { + "epoch": 1.77, + "grad_norm": 0.2743671039820983, + "learning_rate": 7.280363532530743e-06, + "loss": 0.878, + "step": 18477 + }, + { + "epoch": 1.77, + "grad_norm": 0.3240673961148568, + "learning_rate": 7.274439097078855e-06, + "loss": 0.9813, + "step": 18478 + }, + { + "epoch": 1.77, + "grad_norm": 0.304754163079096, + "learning_rate": 7.268516982142426e-06, + "loss": 0.8841, + "step": 18479 + }, + { + "epoch": 1.77, + "grad_norm": 0.30239773926329677, + "learning_rate": 7.262597187869624e-06, + "loss": 1.0386, + "step": 18480 + }, + { + "epoch": 1.77, + "grad_norm": 0.3631311990239393, + "learning_rate": 7.25667971440861e-06, + "loss": 0.9956, + "step": 18481 + }, + { + "epoch": 1.77, + "grad_norm": 0.2841530824243912, + "learning_rate": 7.2507645619074634e-06, + "loss": 0.9377, + "step": 18482 + }, + { + "epoch": 1.77, + "grad_norm": 0.2691775659829598, + "learning_rate": 7.244851730514257e-06, + "loss": 1.0947, + "step": 18483 + }, + { + "epoch": 1.77, + "grad_norm": 0.36114047300385826, + "learning_rate": 7.2389412203769045e-06, + "loss": 1.0367, + "step": 18484 + }, + { + "epoch": 1.77, + "grad_norm": 0.3190296561956919, + "learning_rate": 7.233033031643344e-06, + "loss": 1.0933, + "step": 18485 + }, + { + "epoch": 1.77, + "grad_norm": 0.3003147745694108, + "learning_rate": 7.227127164461456e-06, + "loss": 1.0093, + "step": 18486 + }, + { + "epoch": 1.77, + "grad_norm": 0.3070265695684648, + "learning_rate": 7.221223618979e-06, + "loss": 0.9078, + "step": 18487 + }, + { + "epoch": 1.77, + "grad_norm": 0.316005107104906, + "learning_rate": 7.215322395343726e-06, + "loss": 1.0289, + "step": 18488 + }, + { + "epoch": 1.77, + "grad_norm": 0.33757830358872665, + "learning_rate": 7.2094234937033266e-06, + "loss": 0.9946, + "step": 18489 + }, + { + "epoch": 1.77, + "grad_norm": 0.30231367250109414, + "learning_rate": 7.203526914205438e-06, + "loss": 0.9651, + "step": 18490 + }, + { + "epoch": 1.77, + "grad_norm": 0.32607123738143895, + "learning_rate": 7.197632656997588e-06, + "loss": 1.0531, + "step": 18491 + }, + { + "epoch": 1.77, + "grad_norm": 0.3051839722335436, + "learning_rate": 7.191740722227336e-06, + "loss": 0.8865, + "step": 18492 + }, + { + "epoch": 1.77, + "grad_norm": 0.3013109472297498, + "learning_rate": 7.185851110042064e-06, + "loss": 0.9704, + "step": 18493 + }, + { + "epoch": 1.77, + "grad_norm": 0.30722202992159386, + "learning_rate": 7.17996382058923e-06, + "loss": 1.0063, + "step": 18494 + }, + { + "epoch": 1.77, + "grad_norm": 0.26816900424158363, + "learning_rate": 7.174078854016097e-06, + "loss": 0.8169, + "step": 18495 + }, + { + "epoch": 1.77, + "grad_norm": 0.25850139511165965, + "learning_rate": 7.1681962104700125e-06, + "loss": 1.0011, + "step": 18496 + }, + { + "epoch": 1.77, + "grad_norm": 0.33922619376438795, + "learning_rate": 7.162315890098148e-06, + "loss": 1.0531, + "step": 18497 + }, + { + "epoch": 1.77, + "grad_norm": 0.32023719359492314, + "learning_rate": 7.156437893047685e-06, + "loss": 1.1402, + "step": 18498 + }, + { + "epoch": 1.77, + "grad_norm": 0.30354842377467894, + "learning_rate": 7.150562219465695e-06, + "loss": 1.0076, + "step": 18499 + }, + { + "epoch": 1.77, + "grad_norm": 0.29097205420924316, + "learning_rate": 7.144688869499249e-06, + "loss": 1.0552, + "step": 18500 + }, + { + "epoch": 1.77, + "grad_norm": 0.3001165446678803, + "learning_rate": 7.138817843295331e-06, + "loss": 0.926, + "step": 18501 + }, + { + "epoch": 1.77, + "grad_norm": 0.2826607896088951, + "learning_rate": 7.132949141000844e-06, + "loss": 1.0006, + "step": 18502 + }, + { + "epoch": 1.77, + "grad_norm": 0.31515207800349554, + "learning_rate": 7.127082762762672e-06, + "loss": 0.9539, + "step": 18503 + }, + { + "epoch": 1.77, + "grad_norm": 0.3633205143230308, + "learning_rate": 7.121218708727618e-06, + "loss": 1.0383, + "step": 18504 + }, + { + "epoch": 1.77, + "grad_norm": 0.32910593904867735, + "learning_rate": 7.115356979042454e-06, + "loss": 1.0617, + "step": 18505 + }, + { + "epoch": 1.77, + "grad_norm": 0.3004178906898131, + "learning_rate": 7.109497573853851e-06, + "loss": 1.0037, + "step": 18506 + }, + { + "epoch": 1.77, + "grad_norm": 0.3561350874382039, + "learning_rate": 7.10364049330845e-06, + "loss": 1.0653, + "step": 18507 + }, + { + "epoch": 1.77, + "grad_norm": 0.2961986865553128, + "learning_rate": 7.09778573755282e-06, + "loss": 0.9267, + "step": 18508 + }, + { + "epoch": 1.77, + "grad_norm": 0.32815573098636275, + "learning_rate": 7.0919333067335e-06, + "loss": 1.0766, + "step": 18509 + }, + { + "epoch": 1.77, + "grad_norm": 0.3133074966640458, + "learning_rate": 7.086083200996929e-06, + "loss": 0.9815, + "step": 18510 + }, + { + "epoch": 1.77, + "grad_norm": 0.3350549514950147, + "learning_rate": 7.080235420489534e-06, + "loss": 0.9912, + "step": 18511 + }, + { + "epoch": 1.77, + "grad_norm": 0.3417160980793271, + "learning_rate": 7.07438996535763e-06, + "loss": 1.0892, + "step": 18512 + }, + { + "epoch": 1.77, + "grad_norm": 0.39345352509821, + "learning_rate": 7.068546835747536e-06, + "loss": 1.1656, + "step": 18513 + }, + { + "epoch": 1.77, + "grad_norm": 0.3531184457166658, + "learning_rate": 7.062706031805455e-06, + "loss": 1.0687, + "step": 18514 + }, + { + "epoch": 1.77, + "grad_norm": 0.29510492719565706, + "learning_rate": 7.056867553677549e-06, + "loss": 1.0164, + "step": 18515 + }, + { + "epoch": 1.77, + "grad_norm": 0.3129519355802406, + "learning_rate": 7.051031401509955e-06, + "loss": 1.0512, + "step": 18516 + }, + { + "epoch": 1.77, + "grad_norm": 0.2932721256553452, + "learning_rate": 7.045197575448736e-06, + "loss": 1.0003, + "step": 18517 + }, + { + "epoch": 1.77, + "grad_norm": 0.29505984129831436, + "learning_rate": 7.039366075639842e-06, + "loss": 1.063, + "step": 18518 + }, + { + "epoch": 1.77, + "grad_norm": 0.2843686170583582, + "learning_rate": 7.0335369022292545e-06, + "loss": 1.1098, + "step": 18519 + }, + { + "epoch": 1.77, + "grad_norm": 0.3359198781980621, + "learning_rate": 7.027710055362824e-06, + "loss": 0.9048, + "step": 18520 + }, + { + "epoch": 1.77, + "grad_norm": 0.3304117220336929, + "learning_rate": 7.021885535186368e-06, + "loss": 1.1348, + "step": 18521 + }, + { + "epoch": 1.77, + "grad_norm": 0.29571214582201, + "learning_rate": 7.016063341845669e-06, + "loss": 1.1504, + "step": 18522 + }, + { + "epoch": 1.77, + "grad_norm": 0.2650264508877992, + "learning_rate": 7.010243475486445e-06, + "loss": 0.9409, + "step": 18523 + }, + { + "epoch": 1.77, + "grad_norm": 0.30672918784901565, + "learning_rate": 7.004425936254289e-06, + "loss": 1.0019, + "step": 18524 + }, + { + "epoch": 1.77, + "grad_norm": 0.30511772375981316, + "learning_rate": 6.99861072429483e-06, + "loss": 1.06, + "step": 18525 + }, + { + "epoch": 1.77, + "grad_norm": 0.3179990684858709, + "learning_rate": 6.9927978397535956e-06, + "loss": 1.1416, + "step": 18526 + }, + { + "epoch": 1.77, + "grad_norm": 0.2758184533674834, + "learning_rate": 6.986987282776036e-06, + "loss": 1.1055, + "step": 18527 + }, + { + "epoch": 1.77, + "grad_norm": 0.34499990691580046, + "learning_rate": 6.981179053507603e-06, + "loss": 0.9916, + "step": 18528 + }, + { + "epoch": 1.77, + "grad_norm": 0.3186325394355233, + "learning_rate": 6.975373152093579e-06, + "loss": 1.0059, + "step": 18529 + }, + { + "epoch": 1.77, + "grad_norm": 0.3082581462253261, + "learning_rate": 6.969569578679336e-06, + "loss": 1.0175, + "step": 18530 + }, + { + "epoch": 1.77, + "grad_norm": 0.3360086327217977, + "learning_rate": 6.96376833341007e-06, + "loss": 1.1078, + "step": 18531 + }, + { + "epoch": 1.77, + "grad_norm": 0.2798068045883675, + "learning_rate": 6.9579694164309874e-06, + "loss": 0.9646, + "step": 18532 + }, + { + "epoch": 1.77, + "grad_norm": 0.35356003311699635, + "learning_rate": 6.952172827887182e-06, + "loss": 0.9332, + "step": 18533 + }, + { + "epoch": 1.77, + "grad_norm": 0.3203993286020271, + "learning_rate": 6.94637856792375e-06, + "loss": 1.0492, + "step": 18534 + }, + { + "epoch": 1.77, + "grad_norm": 0.2996937679471188, + "learning_rate": 6.940586636685653e-06, + "loss": 1.0023, + "step": 18535 + }, + { + "epoch": 1.77, + "grad_norm": 0.2843212552447446, + "learning_rate": 6.934797034317864e-06, + "loss": 0.9691, + "step": 18536 + }, + { + "epoch": 1.77, + "grad_norm": 0.3275588346187922, + "learning_rate": 6.929009760965266e-06, + "loss": 1.1122, + "step": 18537 + }, + { + "epoch": 1.77, + "grad_norm": 0.30703741208965885, + "learning_rate": 6.923224816772711e-06, + "loss": 0.9413, + "step": 18538 + }, + { + "epoch": 1.77, + "grad_norm": 0.29120293268289665, + "learning_rate": 6.917442201884927e-06, + "loss": 0.9735, + "step": 18539 + }, + { + "epoch": 1.77, + "grad_norm": 0.3147194289668976, + "learning_rate": 6.911661916446677e-06, + "loss": 0.964, + "step": 18540 + }, + { + "epoch": 1.77, + "grad_norm": 0.33079389299737094, + "learning_rate": 6.905883960602577e-06, + "loss": 0.9555, + "step": 18541 + }, + { + "epoch": 1.77, + "grad_norm": 0.3128986820121427, + "learning_rate": 6.900108334497235e-06, + "loss": 1.0634, + "step": 18542 + }, + { + "epoch": 1.77, + "grad_norm": 0.3392687462719816, + "learning_rate": 6.8943350382751905e-06, + "loss": 0.99, + "step": 18543 + }, + { + "epoch": 1.77, + "grad_norm": 0.35544604649553346, + "learning_rate": 6.888564072080939e-06, + "loss": 1.0048, + "step": 18544 + }, + { + "epoch": 1.77, + "grad_norm": 0.31219418833478774, + "learning_rate": 6.882795436058886e-06, + "loss": 1.0305, + "step": 18545 + }, + { + "epoch": 1.77, + "grad_norm": 0.323980762013247, + "learning_rate": 6.8770291303533855e-06, + "loss": 1.0247, + "step": 18546 + }, + { + "epoch": 1.77, + "grad_norm": 0.29081513560220207, + "learning_rate": 6.871265155108786e-06, + "loss": 0.9059, + "step": 18547 + }, + { + "epoch": 1.77, + "grad_norm": 0.32223425009442563, + "learning_rate": 6.865503510469284e-06, + "loss": 1.0351, + "step": 18548 + }, + { + "epoch": 1.77, + "grad_norm": 0.3251298454020793, + "learning_rate": 6.859744196579088e-06, + "loss": 1.0673, + "step": 18549 + }, + { + "epoch": 1.77, + "grad_norm": 0.3309872732532633, + "learning_rate": 6.853987213582325e-06, + "loss": 1.0931, + "step": 18550 + }, + { + "epoch": 1.77, + "grad_norm": 0.30494892853572586, + "learning_rate": 6.848232561623092e-06, + "loss": 1.0417, + "step": 18551 + }, + { + "epoch": 1.77, + "grad_norm": 0.34135337875695815, + "learning_rate": 6.842480240845372e-06, + "loss": 1.0807, + "step": 18552 + }, + { + "epoch": 1.77, + "grad_norm": 0.3450855647481958, + "learning_rate": 6.836730251393142e-06, + "loss": 1.0803, + "step": 18553 + }, + { + "epoch": 1.78, + "grad_norm": 0.30374770439721305, + "learning_rate": 6.830982593410273e-06, + "loss": 1.0349, + "step": 18554 + }, + { + "epoch": 1.78, + "grad_norm": 0.3709619386305951, + "learning_rate": 6.825237267040641e-06, + "loss": 0.9666, + "step": 18555 + }, + { + "epoch": 1.78, + "grad_norm": 0.3038717806461077, + "learning_rate": 6.819494272427962e-06, + "loss": 0.9461, + "step": 18556 + }, + { + "epoch": 1.78, + "grad_norm": 0.3315811093836826, + "learning_rate": 6.813753609716034e-06, + "loss": 1.0283, + "step": 18557 + }, + { + "epoch": 1.78, + "grad_norm": 0.3452301768730758, + "learning_rate": 6.808015279048474e-06, + "loss": 0.9869, + "step": 18558 + }, + { + "epoch": 1.78, + "grad_norm": 0.2851316429143863, + "learning_rate": 6.8022792805689235e-06, + "loss": 0.9865, + "step": 18559 + }, + { + "epoch": 1.78, + "grad_norm": 0.31285031824258447, + "learning_rate": 6.796545614420879e-06, + "loss": 1.0633, + "step": 18560 + }, + { + "epoch": 1.78, + "grad_norm": 0.30750852608713614, + "learning_rate": 6.790814280747881e-06, + "loss": 1.1236, + "step": 18561 + }, + { + "epoch": 1.78, + "grad_norm": 0.2941753091886968, + "learning_rate": 6.785085279693315e-06, + "loss": 0.9159, + "step": 18562 + }, + { + "epoch": 1.78, + "grad_norm": 0.2519384450726812, + "learning_rate": 6.779358611400577e-06, + "loss": 0.9036, + "step": 18563 + }, + { + "epoch": 1.78, + "grad_norm": 0.3128903570610397, + "learning_rate": 6.773634276012975e-06, + "loss": 0.9285, + "step": 18564 + }, + { + "epoch": 1.78, + "grad_norm": 0.35625612568400905, + "learning_rate": 6.7679122736737825e-06, + "loss": 1.0633, + "step": 18565 + }, + { + "epoch": 1.78, + "grad_norm": 0.3266975903917612, + "learning_rate": 6.762192604526163e-06, + "loss": 0.9649, + "step": 18566 + }, + { + "epoch": 1.78, + "grad_norm": 0.35762931590506725, + "learning_rate": 6.75647526871327e-06, + "loss": 0.9362, + "step": 18567 + }, + { + "epoch": 1.78, + "grad_norm": 0.29365829234863894, + "learning_rate": 6.750760266378209e-06, + "loss": 1.0528, + "step": 18568 + }, + { + "epoch": 1.78, + "grad_norm": 0.321084812449342, + "learning_rate": 6.745047597663956e-06, + "loss": 0.996, + "step": 18569 + }, + { + "epoch": 1.78, + "grad_norm": 0.3403861866224486, + "learning_rate": 6.739337262713485e-06, + "loss": 1.0567, + "step": 18570 + }, + { + "epoch": 1.78, + "grad_norm": 0.3082914086933878, + "learning_rate": 6.733629261669727e-06, + "loss": 0.9587, + "step": 18571 + }, + { + "epoch": 1.78, + "grad_norm": 0.2807230165448705, + "learning_rate": 6.7279235946755226e-06, + "loss": 1.0695, + "step": 18572 + }, + { + "epoch": 1.78, + "grad_norm": 0.3281763059322337, + "learning_rate": 6.722220261873624e-06, + "loss": 0.9875, + "step": 18573 + }, + { + "epoch": 1.78, + "grad_norm": 0.2787881137370532, + "learning_rate": 6.716519263406817e-06, + "loss": 0.8845, + "step": 18574 + }, + { + "epoch": 1.78, + "grad_norm": 0.29307988640461735, + "learning_rate": 6.710820599417711e-06, + "loss": 0.9637, + "step": 18575 + }, + { + "epoch": 1.78, + "grad_norm": 0.2854129168067405, + "learning_rate": 6.705124270048968e-06, + "loss": 0.9707, + "step": 18576 + }, + { + "epoch": 1.78, + "grad_norm": 0.36643268772662035, + "learning_rate": 6.699430275443108e-06, + "loss": 1.0519, + "step": 18577 + }, + { + "epoch": 1.78, + "grad_norm": 0.36664544618189654, + "learning_rate": 6.6937386157426615e-06, + "loss": 1.078, + "step": 18578 + }, + { + "epoch": 1.78, + "grad_norm": 0.2968566617194101, + "learning_rate": 6.688049291090037e-06, + "loss": 0.9313, + "step": 18579 + }, + { + "epoch": 1.78, + "grad_norm": 0.2975630167636191, + "learning_rate": 6.682362301627631e-06, + "loss": 1.0436, + "step": 18580 + }, + { + "epoch": 1.78, + "grad_norm": 0.33891704395435057, + "learning_rate": 6.676677647497753e-06, + "loss": 1.0005, + "step": 18581 + }, + { + "epoch": 1.78, + "grad_norm": 0.31033198598231776, + "learning_rate": 6.670995328842678e-06, + "loss": 0.9257, + "step": 18582 + }, + { + "epoch": 1.78, + "grad_norm": 0.3408949503852422, + "learning_rate": 6.665315345804568e-06, + "loss": 1.0732, + "step": 18583 + }, + { + "epoch": 1.78, + "grad_norm": 0.31588039429360687, + "learning_rate": 6.659637698525623e-06, + "loss": 1.0767, + "step": 18584 + }, + { + "epoch": 1.78, + "grad_norm": 0.3025871973567283, + "learning_rate": 6.653962387147905e-06, + "loss": 1.0462, + "step": 18585 + }, + { + "epoch": 1.78, + "grad_norm": 0.3236304640412586, + "learning_rate": 6.6482894118134466e-06, + "loss": 1.0509, + "step": 18586 + }, + { + "epoch": 1.78, + "grad_norm": 0.28663748237644254, + "learning_rate": 6.642618772664211e-06, + "loss": 0.9702, + "step": 18587 + }, + { + "epoch": 1.78, + "grad_norm": 0.3270715610921301, + "learning_rate": 6.636950469842107e-06, + "loss": 1.0113, + "step": 18588 + }, + { + "epoch": 1.78, + "grad_norm": 0.27739788393616077, + "learning_rate": 6.6312845034890105e-06, + "loss": 1.2025, + "step": 18589 + }, + { + "epoch": 1.78, + "grad_norm": 0.2879697946270983, + "learning_rate": 6.625620873746674e-06, + "loss": 1.0264, + "step": 18590 + }, + { + "epoch": 1.78, + "grad_norm": 0.27358861767465353, + "learning_rate": 6.6199595807568846e-06, + "loss": 1.0242, + "step": 18591 + }, + { + "epoch": 1.78, + "grad_norm": 0.3074903061361043, + "learning_rate": 6.614300624661274e-06, + "loss": 0.9568, + "step": 18592 + }, + { + "epoch": 1.78, + "grad_norm": 0.26712238081218903, + "learning_rate": 6.608644005601505e-06, + "loss": 1.0436, + "step": 18593 + }, + { + "epoch": 1.78, + "grad_norm": 0.3018560986971663, + "learning_rate": 6.602989723719089e-06, + "loss": 1.0527, + "step": 18594 + }, + { + "epoch": 1.78, + "grad_norm": 0.35677712204053524, + "learning_rate": 6.5973377791555765e-06, + "loss": 0.9334, + "step": 18595 + }, + { + "epoch": 1.78, + "grad_norm": 0.31242335496024837, + "learning_rate": 6.5916881720523685e-06, + "loss": 1.044, + "step": 18596 + }, + { + "epoch": 1.78, + "grad_norm": 0.2939441866734272, + "learning_rate": 6.586040902550872e-06, + "loss": 1.0106, + "step": 18597 + }, + { + "epoch": 1.78, + "grad_norm": 0.29616873588337744, + "learning_rate": 6.580395970792408e-06, + "loss": 0.9224, + "step": 18598 + }, + { + "epoch": 1.78, + "grad_norm": 0.2870606070980242, + "learning_rate": 6.574753376918264e-06, + "loss": 1.1491, + "step": 18599 + }, + { + "epoch": 1.78, + "grad_norm": 0.3324625669827519, + "learning_rate": 6.569113121069614e-06, + "loss": 0.9781, + "step": 18600 + }, + { + "epoch": 1.78, + "grad_norm": 0.315070663295295, + "learning_rate": 6.563475203387648e-06, + "loss": 1.0334, + "step": 18601 + }, + { + "epoch": 1.78, + "grad_norm": 0.2850610274242632, + "learning_rate": 6.557839624013418e-06, + "loss": 0.978, + "step": 18602 + }, + { + "epoch": 1.78, + "grad_norm": 0.2675514483295839, + "learning_rate": 6.552206383087978e-06, + "loss": 1.004, + "step": 18603 + }, + { + "epoch": 1.78, + "grad_norm": 0.3523355577334535, + "learning_rate": 6.5465754807523035e-06, + "loss": 1.0274, + "step": 18604 + }, + { + "epoch": 1.78, + "grad_norm": 0.29920448083117324, + "learning_rate": 6.540946917147328e-06, + "loss": 1.0533, + "step": 18605 + }, + { + "epoch": 1.78, + "grad_norm": 0.3118887827802827, + "learning_rate": 6.535320692413871e-06, + "loss": 0.9505, + "step": 18606 + }, + { + "epoch": 1.78, + "grad_norm": 0.33783620761796396, + "learning_rate": 6.5296968066927535e-06, + "loss": 1.0175, + "step": 18607 + }, + { + "epoch": 1.78, + "grad_norm": 0.33921772045053716, + "learning_rate": 6.52407526012474e-06, + "loss": 0.9989, + "step": 18608 + }, + { + "epoch": 1.78, + "grad_norm": 0.3134743174090919, + "learning_rate": 6.518456052850474e-06, + "loss": 0.9443, + "step": 18609 + }, + { + "epoch": 1.78, + "grad_norm": 0.2979114050482758, + "learning_rate": 6.5128391850106e-06, + "loss": 1.0005, + "step": 18610 + }, + { + "epoch": 1.78, + "grad_norm": 0.2996557982757305, + "learning_rate": 6.50722465674567e-06, + "loss": 1.0628, + "step": 18611 + }, + { + "epoch": 1.78, + "grad_norm": 0.2812467382115383, + "learning_rate": 6.5016124681962275e-06, + "loss": 0.9983, + "step": 18612 + }, + { + "epoch": 1.78, + "grad_norm": 0.316151040836614, + "learning_rate": 6.496002619502672e-06, + "loss": 1.1176, + "step": 18613 + }, + { + "epoch": 1.78, + "grad_norm": 0.2713656682404429, + "learning_rate": 6.490395110805425e-06, + "loss": 0.9191, + "step": 18614 + }, + { + "epoch": 1.78, + "grad_norm": 0.3528845304612627, + "learning_rate": 6.484789942244806e-06, + "loss": 0.9351, + "step": 18615 + }, + { + "epoch": 1.78, + "grad_norm": 0.28114051236905707, + "learning_rate": 6.4791871139610934e-06, + "loss": 0.8705, + "step": 18616 + }, + { + "epoch": 1.78, + "grad_norm": 0.2740938503155917, + "learning_rate": 6.473586626094485e-06, + "loss": 1.0185, + "step": 18617 + }, + { + "epoch": 1.78, + "grad_norm": 0.3174149035379467, + "learning_rate": 6.4679884787851696e-06, + "loss": 1.0826, + "step": 18618 + }, + { + "epoch": 1.78, + "grad_norm": 0.2847733288835967, + "learning_rate": 6.462392672173212e-06, + "loss": 0.9602, + "step": 18619 + }, + { + "epoch": 1.78, + "grad_norm": 0.3417282586151304, + "learning_rate": 6.4567992063986785e-06, + "loss": 0.9846, + "step": 18620 + }, + { + "epoch": 1.78, + "grad_norm": 0.29970411473670056, + "learning_rate": 6.451208081601523e-06, + "loss": 1.012, + "step": 18621 + }, + { + "epoch": 1.78, + "grad_norm": 0.351191744239036, + "learning_rate": 6.44561929792169e-06, + "loss": 0.9629, + "step": 18622 + }, + { + "epoch": 1.78, + "grad_norm": 0.29786564563560464, + "learning_rate": 6.440032855499012e-06, + "loss": 1.0946, + "step": 18623 + }, + { + "epoch": 1.78, + "grad_norm": 0.31618299903975583, + "learning_rate": 6.434448754473321e-06, + "loss": 1.1006, + "step": 18624 + }, + { + "epoch": 1.78, + "grad_norm": 0.3148893064667007, + "learning_rate": 6.4288669949843504e-06, + "loss": 0.9948, + "step": 18625 + }, + { + "epoch": 1.78, + "grad_norm": 0.30093367329926185, + "learning_rate": 6.4232875771718e-06, + "loss": 1.011, + "step": 18626 + }, + { + "epoch": 1.78, + "grad_norm": 0.29059385679572747, + "learning_rate": 6.417710501175278e-06, + "loss": 1.0249, + "step": 18627 + }, + { + "epoch": 1.78, + "grad_norm": 0.3297677862176844, + "learning_rate": 6.412135767134364e-06, + "loss": 1.0469, + "step": 18628 + }, + { + "epoch": 1.78, + "grad_norm": 0.3470378305463926, + "learning_rate": 6.40656337518859e-06, + "loss": 1.0727, + "step": 18629 + }, + { + "epoch": 1.78, + "grad_norm": 0.34041355648863286, + "learning_rate": 6.400993325477367e-06, + "loss": 1.0598, + "step": 18630 + }, + { + "epoch": 1.78, + "grad_norm": 0.3204338775770835, + "learning_rate": 6.395425618140116e-06, + "loss": 0.976, + "step": 18631 + }, + { + "epoch": 1.78, + "grad_norm": 0.32124047849093573, + "learning_rate": 6.38986025331616e-06, + "loss": 1.0511, + "step": 18632 + }, + { + "epoch": 1.78, + "grad_norm": 0.32309953072051717, + "learning_rate": 6.384297231144809e-06, + "loss": 1.0085, + "step": 18633 + }, + { + "epoch": 1.78, + "grad_norm": 0.342531526867171, + "learning_rate": 6.37873655176523e-06, + "loss": 0.9595, + "step": 18634 + }, + { + "epoch": 1.78, + "grad_norm": 0.354423755618818, + "learning_rate": 6.373178215316622e-06, + "loss": 1.0435, + "step": 18635 + }, + { + "epoch": 1.78, + "grad_norm": 0.29276245129346873, + "learning_rate": 6.3676222219380634e-06, + "loss": 1.0848, + "step": 18636 + }, + { + "epoch": 1.78, + "grad_norm": 0.3383462783399589, + "learning_rate": 6.362068571768609e-06, + "loss": 0.9905, + "step": 18637 + }, + { + "epoch": 1.78, + "grad_norm": 0.29639734276675156, + "learning_rate": 6.356517264947226e-06, + "loss": 1.0038, + "step": 18638 + }, + { + "epoch": 1.78, + "grad_norm": 0.294676134226555, + "learning_rate": 6.35096830161287e-06, + "loss": 0.96, + "step": 18639 + }, + { + "epoch": 1.78, + "grad_norm": 0.3156569551116364, + "learning_rate": 6.345421681904373e-06, + "loss": 1.0033, + "step": 18640 + }, + { + "epoch": 1.78, + "grad_norm": 0.30695437992227187, + "learning_rate": 6.339877405960581e-06, + "loss": 1.0426, + "step": 18641 + }, + { + "epoch": 1.78, + "grad_norm": 0.3588638966123018, + "learning_rate": 6.3343354739201945e-06, + "loss": 1.0376, + "step": 18642 + }, + { + "epoch": 1.78, + "grad_norm": 0.3076995504540817, + "learning_rate": 6.328795885921945e-06, + "loss": 1.0418, + "step": 18643 + }, + { + "epoch": 1.78, + "grad_norm": 0.3162545456501499, + "learning_rate": 6.323258642104424e-06, + "loss": 1.0472, + "step": 18644 + }, + { + "epoch": 1.78, + "grad_norm": 0.31718493565948735, + "learning_rate": 6.317723742606263e-06, + "loss": 1.0665, + "step": 18645 + }, + { + "epoch": 1.78, + "grad_norm": 0.312177580765309, + "learning_rate": 6.312191187565919e-06, + "loss": 1.0084, + "step": 18646 + }, + { + "epoch": 1.78, + "grad_norm": 0.34318883316182164, + "learning_rate": 6.306660977121892e-06, + "loss": 1.061, + "step": 18647 + }, + { + "epoch": 1.78, + "grad_norm": 0.31722878571440827, + "learning_rate": 6.301133111412538e-06, + "loss": 1.0067, + "step": 18648 + }, + { + "epoch": 1.78, + "grad_norm": 0.3085514669496179, + "learning_rate": 6.295607590576214e-06, + "loss": 1.0234, + "step": 18649 + }, + { + "epoch": 1.78, + "grad_norm": 0.29767310777692735, + "learning_rate": 6.290084414751218e-06, + "loss": 1.0652, + "step": 18650 + }, + { + "epoch": 1.78, + "grad_norm": 0.3124098933699307, + "learning_rate": 6.2845635840757425e-06, + "loss": 1.0363, + "step": 18651 + }, + { + "epoch": 1.78, + "grad_norm": 0.2906948120438801, + "learning_rate": 6.279045098687964e-06, + "loss": 1.0192, + "step": 18652 + }, + { + "epoch": 1.78, + "grad_norm": 0.28866104625673544, + "learning_rate": 6.273528958725972e-06, + "loss": 1.001, + "step": 18653 + }, + { + "epoch": 1.78, + "grad_norm": 0.3229715327342054, + "learning_rate": 6.268015164327845e-06, + "loss": 1.0516, + "step": 18654 + }, + { + "epoch": 1.78, + "grad_norm": 0.3247854867464979, + "learning_rate": 6.262503715631529e-06, + "loss": 0.9696, + "step": 18655 + }, + { + "epoch": 1.78, + "grad_norm": 0.2612712148830415, + "learning_rate": 6.256994612774991e-06, + "loss": 0.9555, + "step": 18656 + }, + { + "epoch": 1.78, + "grad_norm": 0.3161503619644513, + "learning_rate": 6.251487855896054e-06, + "loss": 1.0216, + "step": 18657 + }, + { + "epoch": 1.79, + "grad_norm": 0.3213177088008538, + "learning_rate": 6.245983445132564e-06, + "loss": 1.0337, + "step": 18658 + }, + { + "epoch": 1.79, + "grad_norm": 0.2865923438635052, + "learning_rate": 6.240481380622254e-06, + "loss": 0.9837, + "step": 18659 + }, + { + "epoch": 1.79, + "grad_norm": 0.3304541480074128, + "learning_rate": 6.234981662502837e-06, + "loss": 1.0098, + "step": 18660 + }, + { + "epoch": 1.79, + "grad_norm": 0.3328517516443761, + "learning_rate": 6.229484290911913e-06, + "loss": 1.1248, + "step": 18661 + }, + { + "epoch": 1.79, + "grad_norm": 0.3202254864663805, + "learning_rate": 6.223989265987096e-06, + "loss": 0.991, + "step": 18662 + }, + { + "epoch": 1.79, + "grad_norm": 0.31137422085668265, + "learning_rate": 6.218496587865874e-06, + "loss": 1.0006, + "step": 18663 + }, + { + "epoch": 1.79, + "grad_norm": 0.3571836451835221, + "learning_rate": 6.213006256685705e-06, + "loss": 1.074, + "step": 18664 + }, + { + "epoch": 1.79, + "grad_norm": 0.32520507378503816, + "learning_rate": 6.207518272584012e-06, + "loss": 1.012, + "step": 18665 + }, + { + "epoch": 1.79, + "grad_norm": 0.3015745631669657, + "learning_rate": 6.202032635698119e-06, + "loss": 1.0462, + "step": 18666 + }, + { + "epoch": 1.79, + "grad_norm": 0.30866946780135013, + "learning_rate": 6.1965493461653035e-06, + "loss": 0.9834, + "step": 18667 + }, + { + "epoch": 1.79, + "grad_norm": 0.2948991077189447, + "learning_rate": 6.191068404122813e-06, + "loss": 1.0032, + "step": 18668 + }, + { + "epoch": 1.79, + "grad_norm": 0.3162478758853431, + "learning_rate": 6.18558980970777e-06, + "loss": 1.0176, + "step": 18669 + }, + { + "epoch": 1.79, + "grad_norm": 0.32164948273008526, + "learning_rate": 6.180113563057299e-06, + "loss": 1.052, + "step": 18670 + }, + { + "epoch": 1.79, + "grad_norm": 0.31489582089197954, + "learning_rate": 6.174639664308457e-06, + "loss": 0.874, + "step": 18671 + }, + { + "epoch": 1.79, + "grad_norm": 0.2892556730135506, + "learning_rate": 6.169168113598234e-06, + "loss": 0.975, + "step": 18672 + }, + { + "epoch": 1.79, + "grad_norm": 0.31860780309969333, + "learning_rate": 6.163698911063532e-06, + "loss": 0.9685, + "step": 18673 + }, + { + "epoch": 1.79, + "grad_norm": 0.30822738642746955, + "learning_rate": 6.158232056841251e-06, + "loss": 1.0418, + "step": 18674 + }, + { + "epoch": 1.79, + "grad_norm": 0.3241897317844401, + "learning_rate": 6.152767551068195e-06, + "loss": 1.0695, + "step": 18675 + }, + { + "epoch": 1.79, + "grad_norm": 0.30518396107114104, + "learning_rate": 6.147305393881097e-06, + "loss": 0.993, + "step": 18676 + }, + { + "epoch": 1.79, + "grad_norm": 0.3106786855071411, + "learning_rate": 6.141845585416683e-06, + "loss": 0.9631, + "step": 18677 + }, + { + "epoch": 1.79, + "grad_norm": 0.3125930942533485, + "learning_rate": 6.136388125811543e-06, + "loss": 1.117, + "step": 18678 + }, + { + "epoch": 1.79, + "grad_norm": 0.3649791013959072, + "learning_rate": 6.130933015202311e-06, + "loss": 1.0794, + "step": 18679 + }, + { + "epoch": 1.79, + "grad_norm": 0.35201115183280907, + "learning_rate": 6.125480253725458e-06, + "loss": 1.0995, + "step": 18680 + }, + { + "epoch": 1.79, + "grad_norm": 0.3245748320159616, + "learning_rate": 6.120029841517472e-06, + "loss": 0.9315, + "step": 18681 + }, + { + "epoch": 1.79, + "grad_norm": 0.31876859943646263, + "learning_rate": 6.1145817787147345e-06, + "loss": 0.9915, + "step": 18682 + }, + { + "epoch": 1.79, + "grad_norm": 0.27098125981240556, + "learning_rate": 6.109136065453603e-06, + "loss": 0.9134, + "step": 18683 + }, + { + "epoch": 1.79, + "grad_norm": 0.28993077732292244, + "learning_rate": 6.103692701870322e-06, + "loss": 1.1974, + "step": 18684 + }, + { + "epoch": 1.79, + "grad_norm": 0.3770611821226213, + "learning_rate": 6.098251688101164e-06, + "loss": 0.9966, + "step": 18685 + }, + { + "epoch": 1.79, + "grad_norm": 0.30737550141683656, + "learning_rate": 6.092813024282262e-06, + "loss": 1.0196, + "step": 18686 + }, + { + "epoch": 1.79, + "grad_norm": 0.34784094951483413, + "learning_rate": 6.087376710549741e-06, + "loss": 1.0423, + "step": 18687 + }, + { + "epoch": 1.79, + "grad_norm": 0.3199011598241475, + "learning_rate": 6.081942747039637e-06, + "loss": 1.0418, + "step": 18688 + }, + { + "epoch": 1.79, + "grad_norm": 0.2804258407265739, + "learning_rate": 6.07651113388793e-06, + "loss": 1.0165, + "step": 18689 + }, + { + "epoch": 1.79, + "grad_norm": 0.3008353359813493, + "learning_rate": 6.071081871230588e-06, + "loss": 1.1509, + "step": 18690 + }, + { + "epoch": 1.79, + "grad_norm": 0.3698229046122635, + "learning_rate": 6.065654959203426e-06, + "loss": 1.0942, + "step": 18691 + }, + { + "epoch": 1.79, + "grad_norm": 0.30811707384968506, + "learning_rate": 6.06023039794229e-06, + "loss": 1.0641, + "step": 18692 + }, + { + "epoch": 1.79, + "grad_norm": 0.31346444210324564, + "learning_rate": 6.054808187582927e-06, + "loss": 0.9319, + "step": 18693 + }, + { + "epoch": 1.79, + "grad_norm": 0.31578696168290005, + "learning_rate": 6.049388328261052e-06, + "loss": 1.0367, + "step": 18694 + }, + { + "epoch": 1.79, + "grad_norm": 0.32045902670174226, + "learning_rate": 6.043970820112265e-06, + "loss": 1.108, + "step": 18695 + }, + { + "epoch": 1.79, + "grad_norm": 0.29893742488446623, + "learning_rate": 6.038555663272161e-06, + "loss": 1.1836, + "step": 18696 + }, + { + "epoch": 1.79, + "grad_norm": 0.3430135897056135, + "learning_rate": 6.03314285787625e-06, + "loss": 1.0502, + "step": 18697 + }, + { + "epoch": 1.79, + "grad_norm": 0.3501470231424017, + "learning_rate": 6.027732404059994e-06, + "loss": 1.0237, + "step": 18698 + }, + { + "epoch": 1.79, + "grad_norm": 0.31535574368716823, + "learning_rate": 6.022324301958782e-06, + "loss": 0.9778, + "step": 18699 + }, + { + "epoch": 1.79, + "grad_norm": 0.3131874490643463, + "learning_rate": 6.016918551707995e-06, + "loss": 1.0488, + "step": 18700 + }, + { + "epoch": 1.79, + "grad_norm": 0.3039797695332379, + "learning_rate": 6.0115151534428705e-06, + "loss": 1.092, + "step": 18701 + }, + { + "epoch": 1.79, + "grad_norm": 0.3074056888759944, + "learning_rate": 6.006114107298655e-06, + "loss": 0.9868, + "step": 18702 + }, + { + "epoch": 1.79, + "grad_norm": 0.3283529379642502, + "learning_rate": 6.000715413410507e-06, + "loss": 1.0833, + "step": 18703 + }, + { + "epoch": 1.79, + "grad_norm": 0.28279506941445653, + "learning_rate": 5.9953190719135295e-06, + "loss": 1.0442, + "step": 18704 + }, + { + "epoch": 1.79, + "grad_norm": 0.25005931617306654, + "learning_rate": 5.98992508294276e-06, + "loss": 0.9821, + "step": 18705 + }, + { + "epoch": 1.79, + "grad_norm": 0.25525587590676235, + "learning_rate": 5.984533446633212e-06, + "loss": 1.0428, + "step": 18706 + }, + { + "epoch": 1.79, + "grad_norm": 0.30009228472183086, + "learning_rate": 5.979144163119799e-06, + "loss": 1.083, + "step": 18707 + }, + { + "epoch": 1.79, + "grad_norm": 0.2930478507065316, + "learning_rate": 5.973757232537403e-06, + "loss": 0.9533, + "step": 18708 + }, + { + "epoch": 1.79, + "grad_norm": 0.338020596529762, + "learning_rate": 5.968372655020804e-06, + "loss": 1.0902, + "step": 18709 + }, + { + "epoch": 1.79, + "grad_norm": 0.3078125890020947, + "learning_rate": 5.9629904307047845e-06, + "loss": 1.1551, + "step": 18710 + }, + { + "epoch": 1.79, + "grad_norm": 0.3198543861248538, + "learning_rate": 5.957610559724047e-06, + "loss": 1.0092, + "step": 18711 + }, + { + "epoch": 1.79, + "grad_norm": 0.30183652583458437, + "learning_rate": 5.952233042213184e-06, + "loss": 0.9757, + "step": 18712 + }, + { + "epoch": 1.79, + "grad_norm": 0.31127533157202547, + "learning_rate": 5.946857878306799e-06, + "loss": 1.0563, + "step": 18713 + }, + { + "epoch": 1.79, + "grad_norm": 0.32534329881973734, + "learning_rate": 5.9414850681394075e-06, + "loss": 1.0602, + "step": 18714 + }, + { + "epoch": 1.79, + "grad_norm": 0.2954113999459429, + "learning_rate": 5.936114611845489e-06, + "loss": 0.9622, + "step": 18715 + }, + { + "epoch": 1.79, + "grad_norm": 0.2856364621649031, + "learning_rate": 5.930746509559393e-06, + "loss": 0.9765, + "step": 18716 + }, + { + "epoch": 1.79, + "grad_norm": 0.36235685087781483, + "learning_rate": 5.9253807614155e-06, + "loss": 0.9473, + "step": 18717 + }, + { + "epoch": 1.79, + "grad_norm": 0.2753237953155129, + "learning_rate": 5.920017367548069e-06, + "loss": 1.1067, + "step": 18718 + }, + { + "epoch": 1.79, + "grad_norm": 0.3129030637794475, + "learning_rate": 5.914656328091328e-06, + "loss": 1.0686, + "step": 18719 + }, + { + "epoch": 1.79, + "grad_norm": 0.29336687014870444, + "learning_rate": 5.909297643179446e-06, + "loss": 1.0451, + "step": 18720 + }, + { + "epoch": 1.79, + "grad_norm": 0.2940904758257372, + "learning_rate": 5.903941312946537e-06, + "loss": 0.9653, + "step": 18721 + }, + { + "epoch": 1.79, + "grad_norm": 0.30365638330401556, + "learning_rate": 5.898587337526618e-06, + "loss": 0.9851, + "step": 18722 + }, + { + "epoch": 1.79, + "grad_norm": 0.3014583770306041, + "learning_rate": 5.893235717053702e-06, + "loss": 1.0781, + "step": 18723 + }, + { + "epoch": 1.79, + "grad_norm": 0.30330581338349105, + "learning_rate": 5.887886451661695e-06, + "loss": 1.0214, + "step": 18724 + }, + { + "epoch": 1.79, + "grad_norm": 0.36076258993972876, + "learning_rate": 5.882539541484478e-06, + "loss": 1.0129, + "step": 18725 + }, + { + "epoch": 1.79, + "grad_norm": 0.33928225439302695, + "learning_rate": 5.877194986655854e-06, + "loss": 0.9332, + "step": 18726 + }, + { + "epoch": 1.79, + "grad_norm": 0.3349947454843725, + "learning_rate": 5.871852787309595e-06, + "loss": 0.9765, + "step": 18727 + }, + { + "epoch": 1.79, + "grad_norm": 0.3367966817047234, + "learning_rate": 5.866512943579372e-06, + "loss": 1.1322, + "step": 18728 + }, + { + "epoch": 1.79, + "grad_norm": 0.27690250300950575, + "learning_rate": 5.861175455598833e-06, + "loss": 1.0079, + "step": 18729 + }, + { + "epoch": 1.79, + "grad_norm": 0.3416887148181893, + "learning_rate": 5.855840323501527e-06, + "loss": 0.8552, + "step": 18730 + }, + { + "epoch": 1.79, + "grad_norm": 0.3311236726290958, + "learning_rate": 5.85050754742098e-06, + "loss": 0.9745, + "step": 18731 + }, + { + "epoch": 1.79, + "grad_norm": 0.3482632185898418, + "learning_rate": 5.845177127490664e-06, + "loss": 1.021, + "step": 18732 + }, + { + "epoch": 1.79, + "grad_norm": 0.24661339111470318, + "learning_rate": 5.839849063843972e-06, + "loss": 1.1197, + "step": 18733 + }, + { + "epoch": 1.79, + "grad_norm": 0.28625225380558045, + "learning_rate": 5.8345233566142186e-06, + "loss": 1.0135, + "step": 18734 + }, + { + "epoch": 1.79, + "grad_norm": 0.29889797526899436, + "learning_rate": 5.829200005934698e-06, + "loss": 1.0316, + "step": 18735 + }, + { + "epoch": 1.79, + "grad_norm": 0.30726237642781845, + "learning_rate": 5.8238790119386485e-06, + "loss": 1.111, + "step": 18736 + }, + { + "epoch": 1.79, + "grad_norm": 0.3128800853757584, + "learning_rate": 5.818560374759197e-06, + "loss": 1.0625, + "step": 18737 + }, + { + "epoch": 1.79, + "grad_norm": 0.27671273546233344, + "learning_rate": 5.81324409452948e-06, + "loss": 0.9721, + "step": 18738 + }, + { + "epoch": 1.79, + "grad_norm": 0.2985791968692481, + "learning_rate": 5.8079301713824915e-06, + "loss": 1.0512, + "step": 18739 + }, + { + "epoch": 1.79, + "grad_norm": 0.2841797086747127, + "learning_rate": 5.802618605451282e-06, + "loss": 0.7847, + "step": 18740 + }, + { + "epoch": 1.79, + "grad_norm": 0.27815025507433094, + "learning_rate": 5.797309396868722e-06, + "loss": 0.9607, + "step": 18741 + }, + { + "epoch": 1.79, + "grad_norm": 0.304958518501576, + "learning_rate": 5.792002545767716e-06, + "loss": 0.988, + "step": 18742 + }, + { + "epoch": 1.79, + "grad_norm": 0.2854132247701008, + "learning_rate": 5.786698052281047e-06, + "loss": 1.0537, + "step": 18743 + }, + { + "epoch": 1.79, + "grad_norm": 0.28537814595429567, + "learning_rate": 5.781395916541476e-06, + "loss": 1.0839, + "step": 18744 + }, + { + "epoch": 1.79, + "grad_norm": 0.29592415863820437, + "learning_rate": 5.776096138681675e-06, + "loss": 0.9589, + "step": 18745 + }, + { + "epoch": 1.79, + "grad_norm": 0.27295010769898004, + "learning_rate": 5.7707987188342805e-06, + "loss": 0.9433, + "step": 18746 + }, + { + "epoch": 1.79, + "grad_norm": 0.3269982739531233, + "learning_rate": 5.765503657131876e-06, + "loss": 0.928, + "step": 18747 + }, + { + "epoch": 1.79, + "grad_norm": 0.4327922788263341, + "learning_rate": 5.760210953706979e-06, + "loss": 0.9976, + "step": 18748 + }, + { + "epoch": 1.79, + "grad_norm": 0.2578539022516852, + "learning_rate": 5.754920608692016e-06, + "loss": 1.0742, + "step": 18749 + }, + { + "epoch": 1.79, + "grad_norm": 0.3371919075551128, + "learning_rate": 5.749632622219403e-06, + "loss": 1.0556, + "step": 18750 + }, + { + "epoch": 1.79, + "grad_norm": 0.29804387696430423, + "learning_rate": 5.744346994421457e-06, + "loss": 1.0214, + "step": 18751 + }, + { + "epoch": 1.79, + "grad_norm": 0.3077932171863703, + "learning_rate": 5.739063725430472e-06, + "loss": 1.0447, + "step": 18752 + }, + { + "epoch": 1.79, + "grad_norm": 0.33295992519216105, + "learning_rate": 5.733782815378652e-06, + "loss": 1.0801, + "step": 18753 + }, + { + "epoch": 1.79, + "grad_norm": 0.3058124566948733, + "learning_rate": 5.728504264398171e-06, + "loss": 1.0189, + "step": 18754 + }, + { + "epoch": 1.79, + "grad_norm": 0.3557312496295928, + "learning_rate": 5.723228072621112e-06, + "loss": 0.9036, + "step": 18755 + }, + { + "epoch": 1.79, + "grad_norm": 0.31193150344264636, + "learning_rate": 5.717954240179524e-06, + "loss": 0.9208, + "step": 18756 + }, + { + "epoch": 1.79, + "grad_norm": 0.3048823739173144, + "learning_rate": 5.7126827672053905e-06, + "loss": 0.9798, + "step": 18757 + }, + { + "epoch": 1.79, + "grad_norm": 0.295477335764936, + "learning_rate": 5.7074136538306175e-06, + "loss": 0.9915, + "step": 18758 + }, + { + "epoch": 1.79, + "grad_norm": 0.2812332187965299, + "learning_rate": 5.702146900187078e-06, + "loss": 0.9523, + "step": 18759 + }, + { + "epoch": 1.79, + "grad_norm": 0.307888436108888, + "learning_rate": 5.696882506406575e-06, + "loss": 0.9621, + "step": 18760 + }, + { + "epoch": 1.79, + "grad_norm": 0.30703259351506956, + "learning_rate": 5.691620472620873e-06, + "loss": 1.0278, + "step": 18761 + }, + { + "epoch": 1.79, + "grad_norm": 0.298988316221636, + "learning_rate": 5.686360798961621e-06, + "loss": 1.1275, + "step": 18762 + }, + { + "epoch": 1.8, + "grad_norm": 0.3197883379355681, + "learning_rate": 5.6811034855604795e-06, + "loss": 1.06, + "step": 18763 + }, + { + "epoch": 1.8, + "grad_norm": 0.3382590925865129, + "learning_rate": 5.6758485325489885e-06, + "loss": 0.9889, + "step": 18764 + }, + { + "epoch": 1.8, + "grad_norm": 0.29871958433371953, + "learning_rate": 5.670595940058676e-06, + "loss": 1.0398, + "step": 18765 + }, + { + "epoch": 1.8, + "grad_norm": 0.3137761083163329, + "learning_rate": 5.6653457082209595e-06, + "loss": 0.9252, + "step": 18766 + }, + { + "epoch": 1.8, + "grad_norm": 0.3338136536134675, + "learning_rate": 5.660097837167289e-06, + "loss": 1.0235, + "step": 18767 + }, + { + "epoch": 1.8, + "grad_norm": 0.309831911689051, + "learning_rate": 5.654852327028937e-06, + "loss": 0.9664, + "step": 18768 + }, + { + "epoch": 1.8, + "grad_norm": 0.3353103607474175, + "learning_rate": 5.649609177937221e-06, + "loss": 1.0271, + "step": 18769 + }, + { + "epoch": 1.8, + "grad_norm": 0.34739471873209127, + "learning_rate": 5.644368390023313e-06, + "loss": 1.1638, + "step": 18770 + }, + { + "epoch": 1.8, + "grad_norm": 0.28635362236676215, + "learning_rate": 5.639129963418399e-06, + "loss": 0.9485, + "step": 18771 + }, + { + "epoch": 1.8, + "grad_norm": 0.30314974326335364, + "learning_rate": 5.633893898253551e-06, + "loss": 1.0062, + "step": 18772 + }, + { + "epoch": 1.8, + "grad_norm": 0.3284379135168331, + "learning_rate": 5.6286601946598186e-06, + "loss": 1.0725, + "step": 18773 + }, + { + "epoch": 1.8, + "grad_norm": 0.3083901385733184, + "learning_rate": 5.623428852768164e-06, + "loss": 1.0834, + "step": 18774 + }, + { + "epoch": 1.8, + "grad_norm": 0.3367052886091145, + "learning_rate": 5.618199872709539e-06, + "loss": 1.093, + "step": 18775 + }, + { + "epoch": 1.8, + "grad_norm": 0.3569981793502439, + "learning_rate": 5.612973254614762e-06, + "loss": 0.9352, + "step": 18776 + }, + { + "epoch": 1.8, + "grad_norm": 0.28382054006217794, + "learning_rate": 5.607748998614648e-06, + "loss": 0.9528, + "step": 18777 + }, + { + "epoch": 1.8, + "grad_norm": 0.2949219081892133, + "learning_rate": 5.602527104839949e-06, + "loss": 0.966, + "step": 18778 + }, + { + "epoch": 1.8, + "grad_norm": 0.29325769648452465, + "learning_rate": 5.597307573421329e-06, + "loss": 1.0454, + "step": 18779 + }, + { + "epoch": 1.8, + "grad_norm": 0.30574200254914047, + "learning_rate": 5.592090404489414e-06, + "loss": 1.055, + "step": 18780 + }, + { + "epoch": 1.8, + "grad_norm": 0.31926771547558597, + "learning_rate": 5.586875598174768e-06, + "loss": 1.0465, + "step": 18781 + }, + { + "epoch": 1.8, + "grad_norm": 0.2957269092275416, + "learning_rate": 5.581663154607919e-06, + "loss": 0.9814, + "step": 18782 + }, + { + "epoch": 1.8, + "grad_norm": 0.32590077928798283, + "learning_rate": 5.576453073919263e-06, + "loss": 1.0083, + "step": 18783 + }, + { + "epoch": 1.8, + "grad_norm": 0.3297222573344637, + "learning_rate": 5.57124535623923e-06, + "loss": 1.0273, + "step": 18784 + }, + { + "epoch": 1.8, + "grad_norm": 0.31438495097408703, + "learning_rate": 5.566040001698114e-06, + "loss": 0.96, + "step": 18785 + }, + { + "epoch": 1.8, + "grad_norm": 0.2958450010667885, + "learning_rate": 5.560837010426201e-06, + "loss": 1.0413, + "step": 18786 + }, + { + "epoch": 1.8, + "grad_norm": 0.2932741758471821, + "learning_rate": 5.555636382553697e-06, + "loss": 1.0071, + "step": 18787 + }, + { + "epoch": 1.8, + "grad_norm": 0.32191696811279463, + "learning_rate": 5.550438118210754e-06, + "loss": 1.1429, + "step": 18788 + }, + { + "epoch": 1.8, + "grad_norm": 0.28078057107912446, + "learning_rate": 5.545242217527447e-06, + "loss": 0.9356, + "step": 18789 + }, + { + "epoch": 1.8, + "grad_norm": 0.29291183693715833, + "learning_rate": 5.5400486806338245e-06, + "loss": 1.0396, + "step": 18790 + }, + { + "epoch": 1.8, + "grad_norm": 0.3091267960811802, + "learning_rate": 5.534857507659841e-06, + "loss": 1.0217, + "step": 18791 + }, + { + "epoch": 1.8, + "grad_norm": 0.35242301259942826, + "learning_rate": 5.529668698735424e-06, + "loss": 0.9884, + "step": 18792 + }, + { + "epoch": 1.8, + "grad_norm": 0.36458577637023626, + "learning_rate": 5.524482253990404e-06, + "loss": 0.9803, + "step": 18793 + }, + { + "epoch": 1.8, + "grad_norm": 0.31224874705970423, + "learning_rate": 5.519298173554621e-06, + "loss": 0.9652, + "step": 18794 + }, + { + "epoch": 1.8, + "grad_norm": 0.2866033404605923, + "learning_rate": 5.514116457557761e-06, + "loss": 1.1176, + "step": 18795 + }, + { + "epoch": 1.8, + "grad_norm": 0.3542711832259779, + "learning_rate": 5.508937106129508e-06, + "loss": 1.1274, + "step": 18796 + }, + { + "epoch": 1.8, + "grad_norm": 0.3624206329461225, + "learning_rate": 5.5037601193995145e-06, + "loss": 1.0465, + "step": 18797 + }, + { + "epoch": 1.8, + "grad_norm": 0.3266795193232925, + "learning_rate": 5.498585497497299e-06, + "loss": 1.0124, + "step": 18798 + }, + { + "epoch": 1.8, + "grad_norm": 0.30240085021459984, + "learning_rate": 5.493413240552381e-06, + "loss": 1.0716, + "step": 18799 + }, + { + "epoch": 1.8, + "grad_norm": 0.2903821566128506, + "learning_rate": 5.488243348694167e-06, + "loss": 0.9856, + "step": 18800 + }, + { + "epoch": 1.8, + "grad_norm": 0.2668355823119947, + "learning_rate": 5.4830758220520974e-06, + "loss": 1.1157, + "step": 18801 + }, + { + "epoch": 1.8, + "grad_norm": 0.3428996114339763, + "learning_rate": 5.477910660755436e-06, + "loss": 1.0164, + "step": 18802 + }, + { + "epoch": 1.8, + "grad_norm": 0.3542102057326383, + "learning_rate": 5.47274786493348e-06, + "loss": 0.9996, + "step": 18803 + }, + { + "epoch": 1.8, + "grad_norm": 0.3519518285030121, + "learning_rate": 5.467587434715404e-06, + "loss": 1.0337, + "step": 18804 + }, + { + "epoch": 1.8, + "grad_norm": 0.2983879237185156, + "learning_rate": 5.462429370230371e-06, + "loss": 1.0443, + "step": 18805 + }, + { + "epoch": 1.8, + "grad_norm": 0.32118193083031316, + "learning_rate": 5.457273671607444e-06, + "loss": 1.0949, + "step": 18806 + }, + { + "epoch": 1.8, + "grad_norm": 0.31911993105342407, + "learning_rate": 5.452120338975675e-06, + "loss": 0.9846, + "step": 18807 + }, + { + "epoch": 1.8, + "grad_norm": 0.3394144285737662, + "learning_rate": 5.4469693724640055e-06, + "loss": 1.0015, + "step": 18808 + }, + { + "epoch": 1.8, + "grad_norm": 0.3209431343685656, + "learning_rate": 5.441820772201367e-06, + "loss": 1.0495, + "step": 18809 + }, + { + "epoch": 1.8, + "grad_norm": 0.31258440191340936, + "learning_rate": 5.4366745383165774e-06, + "loss": 0.9031, + "step": 18810 + }, + { + "epoch": 1.8, + "grad_norm": 0.3130110569249833, + "learning_rate": 5.431530670938445e-06, + "loss": 0.8932, + "step": 18811 + }, + { + "epoch": 1.8, + "grad_norm": 0.3242523168012393, + "learning_rate": 5.426389170195689e-06, + "loss": 1.0581, + "step": 18812 + }, + { + "epoch": 1.8, + "grad_norm": 0.3153857906428153, + "learning_rate": 5.421250036216974e-06, + "loss": 0.9778, + "step": 18813 + }, + { + "epoch": 1.8, + "grad_norm": 0.3331009618607537, + "learning_rate": 5.416113269130918e-06, + "loss": 1.0672, + "step": 18814 + }, + { + "epoch": 1.8, + "grad_norm": 0.3308352033219071, + "learning_rate": 5.410978869066085e-06, + "loss": 1.0302, + "step": 18815 + }, + { + "epoch": 1.8, + "grad_norm": 0.3236073300769159, + "learning_rate": 5.4058468361509405e-06, + "loss": 1.0407, + "step": 18816 + }, + { + "epoch": 1.8, + "grad_norm": 0.3066050877351283, + "learning_rate": 5.400717170513936e-06, + "loss": 1.0535, + "step": 18817 + }, + { + "epoch": 1.8, + "grad_norm": 0.26584956187471637, + "learning_rate": 5.395589872283446e-06, + "loss": 0.8825, + "step": 18818 + }, + { + "epoch": 1.8, + "grad_norm": 0.29460979040697727, + "learning_rate": 5.390464941587759e-06, + "loss": 0.9421, + "step": 18819 + }, + { + "epoch": 1.8, + "grad_norm": 0.3064863239816208, + "learning_rate": 5.385342378555147e-06, + "loss": 0.9645, + "step": 18820 + }, + { + "epoch": 1.8, + "grad_norm": 0.27042623498744595, + "learning_rate": 5.38022218331381e-06, + "loss": 0.9508, + "step": 18821 + }, + { + "epoch": 1.8, + "grad_norm": 0.33561618035144136, + "learning_rate": 5.375104355991889e-06, + "loss": 1.0195, + "step": 18822 + }, + { + "epoch": 1.8, + "grad_norm": 0.3296470065556117, + "learning_rate": 5.3699888967174375e-06, + "loss": 1.0278, + "step": 18823 + }, + { + "epoch": 1.8, + "grad_norm": 0.3126970449991614, + "learning_rate": 5.364875805618508e-06, + "loss": 1.0603, + "step": 18824 + }, + { + "epoch": 1.8, + "grad_norm": 0.32507139353966547, + "learning_rate": 5.3597650828230095e-06, + "loss": 0.8517, + "step": 18825 + }, + { + "epoch": 1.8, + "grad_norm": 0.26447884213003253, + "learning_rate": 5.354656728458896e-06, + "loss": 1.0191, + "step": 18826 + }, + { + "epoch": 1.8, + "grad_norm": 0.34666537861451324, + "learning_rate": 5.3495507426539414e-06, + "loss": 0.9798, + "step": 18827 + }, + { + "epoch": 1.8, + "grad_norm": 0.30984449152983096, + "learning_rate": 5.3444471255360005e-06, + "loss": 0.9396, + "step": 18828 + }, + { + "epoch": 1.8, + "grad_norm": 0.3021495999330885, + "learning_rate": 5.339345877232749e-06, + "loss": 1.0068, + "step": 18829 + }, + { + "epoch": 1.8, + "grad_norm": 0.31829028898075906, + "learning_rate": 5.334246997871861e-06, + "loss": 0.8566, + "step": 18830 + }, + { + "epoch": 1.8, + "grad_norm": 0.330429285705311, + "learning_rate": 5.329150487580936e-06, + "loss": 1.0742, + "step": 18831 + }, + { + "epoch": 1.8, + "grad_norm": 0.3037409821913984, + "learning_rate": 5.324056346487516e-06, + "loss": 1.0557, + "step": 18832 + }, + { + "epoch": 1.8, + "grad_norm": 0.30180546850617446, + "learning_rate": 5.318964574719088e-06, + "loss": 0.9068, + "step": 18833 + }, + { + "epoch": 1.8, + "grad_norm": 0.31748430003366845, + "learning_rate": 5.313875172403071e-06, + "loss": 0.9162, + "step": 18834 + }, + { + "epoch": 1.8, + "grad_norm": 0.3162060567042437, + "learning_rate": 5.308788139666832e-06, + "loss": 1.1232, + "step": 18835 + }, + { + "epoch": 1.8, + "grad_norm": 0.2981612389119333, + "learning_rate": 5.303703476637701e-06, + "loss": 0.9417, + "step": 18836 + }, + { + "epoch": 1.8, + "grad_norm": 0.3441532739006076, + "learning_rate": 5.298621183442887e-06, + "loss": 1.0166, + "step": 18837 + }, + { + "epoch": 1.8, + "grad_norm": 0.32988689331653903, + "learning_rate": 5.2935412602096e-06, + "loss": 1.0535, + "step": 18838 + }, + { + "epoch": 1.8, + "grad_norm": 0.29345799077351026, + "learning_rate": 5.2884637070649725e-06, + "loss": 1.0988, + "step": 18839 + }, + { + "epoch": 1.8, + "grad_norm": 0.2726179006867351, + "learning_rate": 5.283388524136057e-06, + "loss": 0.9735, + "step": 18840 + }, + { + "epoch": 1.8, + "grad_norm": 0.3336478411974013, + "learning_rate": 5.278315711549864e-06, + "loss": 0.9362, + "step": 18841 + }, + { + "epoch": 1.8, + "grad_norm": 0.32031464899181755, + "learning_rate": 5.273245269433358e-06, + "loss": 1.0498, + "step": 18842 + }, + { + "epoch": 1.8, + "grad_norm": 0.33925873161666653, + "learning_rate": 5.268177197913438e-06, + "loss": 1.0158, + "step": 18843 + }, + { + "epoch": 1.8, + "grad_norm": 0.32329551898543096, + "learning_rate": 5.263111497116891e-06, + "loss": 0.9941, + "step": 18844 + }, + { + "epoch": 1.8, + "grad_norm": 0.33847488275381676, + "learning_rate": 5.258048167170549e-06, + "loss": 0.9256, + "step": 18845 + }, + { + "epoch": 1.8, + "grad_norm": 0.3033513043784794, + "learning_rate": 5.252987208201077e-06, + "loss": 1.0648, + "step": 18846 + }, + { + "epoch": 1.8, + "grad_norm": 0.3083601365638897, + "learning_rate": 5.247928620335141e-06, + "loss": 1.0841, + "step": 18847 + }, + { + "epoch": 1.8, + "grad_norm": 0.33549882033746137, + "learning_rate": 5.242872403699339e-06, + "loss": 0.9605, + "step": 18848 + }, + { + "epoch": 1.8, + "grad_norm": 0.3358792783793603, + "learning_rate": 5.2378185584202264e-06, + "loss": 1.0277, + "step": 18849 + }, + { + "epoch": 1.8, + "grad_norm": 0.30590492887837195, + "learning_rate": 5.232767084624246e-06, + "loss": 0.9423, + "step": 18850 + }, + { + "epoch": 1.8, + "grad_norm": 0.31822165084917736, + "learning_rate": 5.227717982437841e-06, + "loss": 0.97, + "step": 18851 + }, + { + "epoch": 1.8, + "grad_norm": 0.2951605324052581, + "learning_rate": 5.222671251987343e-06, + "loss": 0.9435, + "step": 18852 + }, + { + "epoch": 1.8, + "grad_norm": 0.3400599383327807, + "learning_rate": 5.217626893399074e-06, + "loss": 1.1557, + "step": 18853 + }, + { + "epoch": 1.8, + "grad_norm": 0.29809703259754883, + "learning_rate": 5.212584906799234e-06, + "loss": 0.9902, + "step": 18854 + }, + { + "epoch": 1.8, + "grad_norm": 0.3251564064984153, + "learning_rate": 5.207545292314065e-06, + "loss": 1.0418, + "step": 18855 + }, + { + "epoch": 1.8, + "grad_norm": 0.33924146250171516, + "learning_rate": 5.202508050069621e-06, + "loss": 1.0999, + "step": 18856 + }, + { + "epoch": 1.8, + "grad_norm": 0.3280699054067235, + "learning_rate": 5.197473180192014e-06, + "loss": 1.1287, + "step": 18857 + }, + { + "epoch": 1.8, + "grad_norm": 0.30738472295084096, + "learning_rate": 5.192440682807209e-06, + "loss": 1.1754, + "step": 18858 + }, + { + "epoch": 1.8, + "grad_norm": 0.3472586261825483, + "learning_rate": 5.187410558041151e-06, + "loss": 1.0565, + "step": 18859 + }, + { + "epoch": 1.8, + "grad_norm": 0.3974851914931688, + "learning_rate": 5.182382806019759e-06, + "loss": 0.976, + "step": 18860 + }, + { + "epoch": 1.8, + "grad_norm": 0.3396224243616631, + "learning_rate": 5.177357426868801e-06, + "loss": 1.0103, + "step": 18861 + }, + { + "epoch": 1.8, + "grad_norm": 0.3080977474664744, + "learning_rate": 5.172334420714075e-06, + "loss": 1.1036, + "step": 18862 + }, + { + "epoch": 1.8, + "grad_norm": 0.3112771489017127, + "learning_rate": 5.167313787681272e-06, + "loss": 1.0862, + "step": 18863 + }, + { + "epoch": 1.8, + "grad_norm": 0.27163827154990416, + "learning_rate": 5.162295527896055e-06, + "loss": 1.067, + "step": 18864 + }, + { + "epoch": 1.8, + "grad_norm": 0.382278794684592, + "learning_rate": 5.157279641483992e-06, + "loss": 1.0635, + "step": 18865 + }, + { + "epoch": 1.8, + "grad_norm": 0.31169871174419733, + "learning_rate": 5.152266128570615e-06, + "loss": 0.9908, + "step": 18866 + }, + { + "epoch": 1.81, + "grad_norm": 0.3477344311616689, + "learning_rate": 5.14725498928138e-06, + "loss": 0.9916, + "step": 18867 + }, + { + "epoch": 1.81, + "grad_norm": 0.3289975395077162, + "learning_rate": 5.142246223741698e-06, + "loss": 0.9962, + "step": 18868 + }, + { + "epoch": 1.81, + "grad_norm": 0.30966504652295757, + "learning_rate": 5.1372398320769235e-06, + "loss": 0.958, + "step": 18869 + }, + { + "epoch": 1.81, + "grad_norm": 0.32605283154499015, + "learning_rate": 5.132235814412356e-06, + "loss": 0.9955, + "step": 18870 + }, + { + "epoch": 1.81, + "grad_norm": 0.2931021445492193, + "learning_rate": 5.127234170873185e-06, + "loss": 0.9844, + "step": 18871 + }, + { + "epoch": 1.81, + "grad_norm": 0.3166514657134127, + "learning_rate": 5.1222349015846324e-06, + "loss": 0.8628, + "step": 18872 + }, + { + "epoch": 1.81, + "grad_norm": 0.3088012232931153, + "learning_rate": 5.117238006671754e-06, + "loss": 1.0461, + "step": 18873 + }, + { + "epoch": 1.81, + "grad_norm": 0.31489081136545394, + "learning_rate": 5.112243486259649e-06, + "loss": 0.9855, + "step": 18874 + }, + { + "epoch": 1.81, + "grad_norm": 0.3079351545071471, + "learning_rate": 5.107251340473251e-06, + "loss": 0.8728, + "step": 18875 + }, + { + "epoch": 1.81, + "grad_norm": 0.27130477713292334, + "learning_rate": 5.102261569437561e-06, + "loss": 1.0021, + "step": 18876 + }, + { + "epoch": 1.81, + "grad_norm": 0.3103757588755947, + "learning_rate": 5.0972741732774e-06, + "loss": 0.9026, + "step": 18877 + }, + { + "epoch": 1.81, + "grad_norm": 0.3452837609147426, + "learning_rate": 5.092289152117591e-06, + "loss": 0.9832, + "step": 18878 + }, + { + "epoch": 1.81, + "grad_norm": 0.3089170941124422, + "learning_rate": 5.087306506082912e-06, + "loss": 1.0342, + "step": 18879 + }, + { + "epoch": 1.81, + "grad_norm": 0.32278132431981127, + "learning_rate": 5.082326235298018e-06, + "loss": 1.0199, + "step": 18880 + }, + { + "epoch": 1.81, + "grad_norm": 0.2888405746103104, + "learning_rate": 5.077348339887578e-06, + "loss": 1.0797, + "step": 18881 + }, + { + "epoch": 1.81, + "grad_norm": 0.357637432923308, + "learning_rate": 5.072372819976135e-06, + "loss": 0.9368, + "step": 18882 + }, + { + "epoch": 1.81, + "grad_norm": 0.3132737112336327, + "learning_rate": 5.0673996756882445e-06, + "loss": 0.9245, + "step": 18883 + }, + { + "epoch": 1.81, + "grad_norm": 0.33237649189204505, + "learning_rate": 5.062428907148331e-06, + "loss": 0.9541, + "step": 18884 + }, + { + "epoch": 1.81, + "grad_norm": 0.3178774359122405, + "learning_rate": 5.0574605144808055e-06, + "loss": 1.0022, + "step": 18885 + }, + { + "epoch": 1.81, + "grad_norm": 0.34607820081877905, + "learning_rate": 5.05249449780999e-06, + "loss": 1.0619, + "step": 18886 + }, + { + "epoch": 1.81, + "grad_norm": 0.3207770905361018, + "learning_rate": 5.047530857260186e-06, + "loss": 0.9135, + "step": 18887 + }, + { + "epoch": 1.81, + "grad_norm": 0.3220298488127968, + "learning_rate": 5.0425695929555705e-06, + "loss": 1.0485, + "step": 18888 + }, + { + "epoch": 1.81, + "grad_norm": 0.30931148447307005, + "learning_rate": 5.037610705020357e-06, + "loss": 0.957, + "step": 18889 + }, + { + "epoch": 1.81, + "grad_norm": 0.3749721276665234, + "learning_rate": 5.032654193578601e-06, + "loss": 1.1119, + "step": 18890 + }, + { + "epoch": 1.81, + "grad_norm": 0.30795416334225195, + "learning_rate": 5.02770005875437e-06, + "loss": 1.0328, + "step": 18891 + }, + { + "epoch": 1.81, + "grad_norm": 0.3428617728889385, + "learning_rate": 5.0227483006716206e-06, + "loss": 0.9742, + "step": 18892 + }, + { + "epoch": 1.81, + "grad_norm": 0.3137278878047628, + "learning_rate": 5.017798919454308e-06, + "loss": 0.9995, + "step": 18893 + }, + { + "epoch": 1.81, + "grad_norm": 0.3332743230492452, + "learning_rate": 5.012851915226247e-06, + "loss": 0.9654, + "step": 18894 + }, + { + "epoch": 1.81, + "grad_norm": 0.29846477550873274, + "learning_rate": 5.007907288111269e-06, + "loss": 1.0563, + "step": 18895 + }, + { + "epoch": 1.81, + "grad_norm": 0.3204324813390751, + "learning_rate": 5.002965038233109e-06, + "loss": 1.0002, + "step": 18896 + }, + { + "epoch": 1.81, + "grad_norm": 0.3590115930623466, + "learning_rate": 4.99802516571547e-06, + "loss": 0.9791, + "step": 18897 + }, + { + "epoch": 1.81, + "grad_norm": 0.33836852029469955, + "learning_rate": 4.9930876706819395e-06, + "loss": 1.0744, + "step": 18898 + }, + { + "epoch": 1.81, + "grad_norm": 0.2825770201933621, + "learning_rate": 4.988152553256098e-06, + "loss": 1.0379, + "step": 18899 + }, + { + "epoch": 1.81, + "grad_norm": 0.29036863020987136, + "learning_rate": 4.983219813561479e-06, + "loss": 0.9714, + "step": 18900 + }, + { + "epoch": 1.81, + "grad_norm": 0.326525426905924, + "learning_rate": 4.978289451721474e-06, + "loss": 1.1101, + "step": 18901 + }, + { + "epoch": 1.81, + "grad_norm": 0.32404452608539874, + "learning_rate": 4.973361467859494e-06, + "loss": 1.067, + "step": 18902 + }, + { + "epoch": 1.81, + "grad_norm": 0.3113325023377256, + "learning_rate": 4.968435862098875e-06, + "loss": 1.0632, + "step": 18903 + }, + { + "epoch": 1.81, + "grad_norm": 0.36239926103047637, + "learning_rate": 4.963512634562873e-06, + "loss": 0.9855, + "step": 18904 + }, + { + "epoch": 1.81, + "grad_norm": 0.330082396036211, + "learning_rate": 4.958591785374689e-06, + "loss": 0.9562, + "step": 18905 + }, + { + "epoch": 1.81, + "grad_norm": 0.34081530315890274, + "learning_rate": 4.9536733146574925e-06, + "loss": 0.9899, + "step": 18906 + }, + { + "epoch": 1.81, + "grad_norm": 0.35421391557671783, + "learning_rate": 4.948757222534339e-06, + "loss": 1.063, + "step": 18907 + }, + { + "epoch": 1.81, + "grad_norm": 0.3203278716883194, + "learning_rate": 4.9438435091282765e-06, + "loss": 1.0816, + "step": 18908 + }, + { + "epoch": 1.81, + "grad_norm": 0.35180409217570946, + "learning_rate": 4.938932174562272e-06, + "loss": 1.1236, + "step": 18909 + }, + { + "epoch": 1.81, + "grad_norm": 0.3365482171690354, + "learning_rate": 4.934023218959238e-06, + "loss": 0.9835, + "step": 18910 + }, + { + "epoch": 1.81, + "grad_norm": 0.34629071462623523, + "learning_rate": 4.929116642442011e-06, + "loss": 0.9541, + "step": 18911 + }, + { + "epoch": 1.81, + "grad_norm": 0.340002818107411, + "learning_rate": 4.924212445133414e-06, + "loss": 1.0058, + "step": 18912 + }, + { + "epoch": 1.81, + "grad_norm": 0.38124355033145824, + "learning_rate": 4.919310627156126e-06, + "loss": 0.9871, + "step": 18913 + }, + { + "epoch": 1.81, + "grad_norm": 0.281922686816116, + "learning_rate": 4.914411188632862e-06, + "loss": 0.9921, + "step": 18914 + }, + { + "epoch": 1.81, + "grad_norm": 0.334054890374091, + "learning_rate": 4.909514129686199e-06, + "loss": 1.0362, + "step": 18915 + }, + { + "epoch": 1.81, + "grad_norm": 0.3310138809611744, + "learning_rate": 4.904619450438719e-06, + "loss": 1.081, + "step": 18916 + }, + { + "epoch": 1.81, + "grad_norm": 0.3049726261492093, + "learning_rate": 4.899727151012901e-06, + "loss": 1.0431, + "step": 18917 + }, + { + "epoch": 1.81, + "grad_norm": 0.3064222176482394, + "learning_rate": 4.894837231531191e-06, + "loss": 1.0213, + "step": 18918 + }, + { + "epoch": 1.81, + "grad_norm": 0.3306777459860109, + "learning_rate": 4.889949692115936e-06, + "loss": 0.8898, + "step": 18919 + }, + { + "epoch": 1.81, + "grad_norm": 0.319024157524171, + "learning_rate": 4.8850645328894605e-06, + "loss": 0.9809, + "step": 18920 + }, + { + "epoch": 1.81, + "grad_norm": 0.29806484243889264, + "learning_rate": 4.880181753974033e-06, + "loss": 1.0215, + "step": 18921 + }, + { + "epoch": 1.81, + "grad_norm": 0.27444414280203483, + "learning_rate": 4.8753013554918344e-06, + "loss": 0.9591, + "step": 18922 + }, + { + "epoch": 1.81, + "grad_norm": 0.34529958416941486, + "learning_rate": 4.870423337565e-06, + "loss": 0.9817, + "step": 18923 + }, + { + "epoch": 1.81, + "grad_norm": 0.30679043438268805, + "learning_rate": 4.865547700315609e-06, + "loss": 1.0584, + "step": 18924 + }, + { + "epoch": 1.81, + "grad_norm": 0.31525882232170427, + "learning_rate": 4.860674443865687e-06, + "loss": 1.0449, + "step": 18925 + }, + { + "epoch": 1.81, + "grad_norm": 0.29922187177874193, + "learning_rate": 4.8558035683371585e-06, + "loss": 1.0168, + "step": 18926 + }, + { + "epoch": 1.81, + "grad_norm": 0.29130695668957707, + "learning_rate": 4.8509350738519695e-06, + "loss": 0.995, + "step": 18927 + }, + { + "epoch": 1.81, + "grad_norm": 0.3061066083193226, + "learning_rate": 4.846068960531913e-06, + "loss": 1.0335, + "step": 18928 + }, + { + "epoch": 1.81, + "grad_norm": 0.3251083637275549, + "learning_rate": 4.84120522849878e-06, + "loss": 1.0236, + "step": 18929 + }, + { + "epoch": 1.81, + "grad_norm": 0.285840775960315, + "learning_rate": 4.836343877874283e-06, + "loss": 1.1081, + "step": 18930 + }, + { + "epoch": 1.81, + "grad_norm": 0.32952569490867606, + "learning_rate": 4.831484908780115e-06, + "loss": 1.0523, + "step": 18931 + }, + { + "epoch": 1.81, + "grad_norm": 0.3310171261825978, + "learning_rate": 4.826628321337834e-06, + "loss": 0.9381, + "step": 18932 + }, + { + "epoch": 1.81, + "grad_norm": 0.3223998404705937, + "learning_rate": 4.821774115668998e-06, + "loss": 1.0031, + "step": 18933 + }, + { + "epoch": 1.81, + "grad_norm": 0.28091900220348087, + "learning_rate": 4.816922291895076e-06, + "loss": 0.9807, + "step": 18934 + }, + { + "epoch": 1.81, + "grad_norm": 0.2642709381183348, + "learning_rate": 4.812072850137505e-06, + "loss": 0.9955, + "step": 18935 + }, + { + "epoch": 1.81, + "grad_norm": 0.3173284703111218, + "learning_rate": 4.8072257905176e-06, + "loss": 0.9927, + "step": 18936 + }, + { + "epoch": 1.81, + "grad_norm": 0.3378606589769754, + "learning_rate": 4.802381113156729e-06, + "loss": 0.9846, + "step": 18937 + }, + { + "epoch": 1.81, + "grad_norm": 0.30663785540025773, + "learning_rate": 4.797538818176095e-06, + "loss": 1.0176, + "step": 18938 + }, + { + "epoch": 1.81, + "grad_norm": 0.37100595841113476, + "learning_rate": 4.792698905696891e-06, + "loss": 1.025, + "step": 18939 + }, + { + "epoch": 1.81, + "grad_norm": 0.28933666491042503, + "learning_rate": 4.787861375840219e-06, + "loss": 0.9497, + "step": 18940 + }, + { + "epoch": 1.81, + "grad_norm": 0.30228709132048504, + "learning_rate": 4.78302622872715e-06, + "loss": 1.0768, + "step": 18941 + }, + { + "epoch": 1.81, + "grad_norm": 0.3476607769764273, + "learning_rate": 4.7781934644786976e-06, + "loss": 0.999, + "step": 18942 + }, + { + "epoch": 1.81, + "grad_norm": 0.31421479855574, + "learning_rate": 4.7733630832158095e-06, + "loss": 0.8894, + "step": 18943 + }, + { + "epoch": 1.81, + "grad_norm": 0.2800651096538969, + "learning_rate": 4.768535085059344e-06, + "loss": 1.03, + "step": 18944 + }, + { + "epoch": 1.81, + "grad_norm": 0.30479125923544187, + "learning_rate": 4.76370947013014e-06, + "loss": 1.0441, + "step": 18945 + }, + { + "epoch": 1.81, + "grad_norm": 0.30547417577907743, + "learning_rate": 4.758886238548976e-06, + "loss": 1.0525, + "step": 18946 + }, + { + "epoch": 1.81, + "grad_norm": 0.3230283022471722, + "learning_rate": 4.754065390436524e-06, + "loss": 0.989, + "step": 18947 + }, + { + "epoch": 1.81, + "grad_norm": 0.32394938449056443, + "learning_rate": 4.749246925913464e-06, + "loss": 1.0835, + "step": 18948 + }, + { + "epoch": 1.81, + "grad_norm": 0.301752506000106, + "learning_rate": 4.744430845100322e-06, + "loss": 0.8985, + "step": 18949 + }, + { + "epoch": 1.81, + "grad_norm": 0.29131858418653256, + "learning_rate": 4.739617148117703e-06, + "loss": 0.934, + "step": 18950 + }, + { + "epoch": 1.81, + "grad_norm": 0.31378881008566617, + "learning_rate": 4.7348058350860205e-06, + "loss": 0.9647, + "step": 18951 + }, + { + "epoch": 1.81, + "grad_norm": 0.3300477780808296, + "learning_rate": 4.7299969061257e-06, + "loss": 1.0728, + "step": 18952 + }, + { + "epoch": 1.81, + "grad_norm": 0.3267979492559538, + "learning_rate": 4.725190361357079e-06, + "loss": 1.0328, + "step": 18953 + }, + { + "epoch": 1.81, + "grad_norm": 0.30733462703587283, + "learning_rate": 4.720386200900451e-06, + "loss": 1.007, + "step": 18954 + }, + { + "epoch": 1.81, + "grad_norm": 0.31490929835423326, + "learning_rate": 4.71558442487603e-06, + "loss": 1.0249, + "step": 18955 + }, + { + "epoch": 1.81, + "grad_norm": 0.28756915401810706, + "learning_rate": 4.710785033403986e-06, + "loss": 1.0652, + "step": 18956 + }, + { + "epoch": 1.81, + "grad_norm": 0.31104753344269115, + "learning_rate": 4.705988026604447e-06, + "loss": 0.9908, + "step": 18957 + }, + { + "epoch": 1.81, + "grad_norm": 0.31885304524106556, + "learning_rate": 4.701193404597448e-06, + "loss": 1.085, + "step": 18958 + }, + { + "epoch": 1.81, + "grad_norm": 0.3166354426313907, + "learning_rate": 4.6964011675029725e-06, + "loss": 1.0276, + "step": 18959 + }, + { + "epoch": 1.81, + "grad_norm": 0.27829922291342096, + "learning_rate": 4.691611315440969e-06, + "loss": 0.9391, + "step": 18960 + }, + { + "epoch": 1.81, + "grad_norm": 0.3205771306217669, + "learning_rate": 4.686823848531274e-06, + "loss": 0.9242, + "step": 18961 + }, + { + "epoch": 1.81, + "grad_norm": 0.34880160647222797, + "learning_rate": 4.682038766893704e-06, + "loss": 0.9441, + "step": 18962 + }, + { + "epoch": 1.81, + "grad_norm": 0.29613742245394387, + "learning_rate": 4.677256070648029e-06, + "loss": 0.9111, + "step": 18963 + }, + { + "epoch": 1.81, + "grad_norm": 0.33810005019533174, + "learning_rate": 4.67247575991393e-06, + "loss": 1.1082, + "step": 18964 + }, + { + "epoch": 1.81, + "grad_norm": 0.28756446337985414, + "learning_rate": 4.6676978348110465e-06, + "loss": 0.9995, + "step": 18965 + }, + { + "epoch": 1.81, + "grad_norm": 0.34245317182150503, + "learning_rate": 4.662922295458927e-06, + "loss": 0.8952, + "step": 18966 + }, + { + "epoch": 1.81, + "grad_norm": 0.314801511496653, + "learning_rate": 4.658149141977108e-06, + "loss": 0.9101, + "step": 18967 + }, + { + "epoch": 1.81, + "grad_norm": 0.2947938276025124, + "learning_rate": 4.653378374485018e-06, + "loss": 1.0865, + "step": 18968 + }, + { + "epoch": 1.81, + "grad_norm": 0.28297207907039196, + "learning_rate": 4.648609993102049e-06, + "loss": 0.8804, + "step": 18969 + }, + { + "epoch": 1.81, + "grad_norm": 0.294803807963093, + "learning_rate": 4.64384399794755e-06, + "loss": 1.085, + "step": 18970 + }, + { + "epoch": 1.81, + "grad_norm": 0.2919710466742953, + "learning_rate": 4.639080389140782e-06, + "loss": 1.0106, + "step": 18971 + }, + { + "epoch": 1.82, + "grad_norm": 0.27989731343216645, + "learning_rate": 4.63431916680096e-06, + "loss": 0.9296, + "step": 18972 + }, + { + "epoch": 1.82, + "grad_norm": 0.30104931789260125, + "learning_rate": 4.629560331047245e-06, + "loss": 1.0567, + "step": 18973 + }, + { + "epoch": 1.82, + "grad_norm": 0.33534095510495227, + "learning_rate": 4.624803881998707e-06, + "loss": 1.0484, + "step": 18974 + }, + { + "epoch": 1.82, + "grad_norm": 0.36030178028885174, + "learning_rate": 4.620049819774408e-06, + "loss": 1.1495, + "step": 18975 + }, + { + "epoch": 1.82, + "grad_norm": 0.34255696194443785, + "learning_rate": 4.6152981444932744e-06, + "loss": 0.9321, + "step": 18976 + }, + { + "epoch": 1.82, + "grad_norm": 0.32067318806006045, + "learning_rate": 4.610548856274288e-06, + "loss": 0.9573, + "step": 18977 + }, + { + "epoch": 1.82, + "grad_norm": 0.362226624599472, + "learning_rate": 4.605801955236244e-06, + "loss": 1.0602, + "step": 18978 + }, + { + "epoch": 1.82, + "grad_norm": 0.2715861110516333, + "learning_rate": 4.60105744149798e-06, + "loss": 0.9765, + "step": 18979 + }, + { + "epoch": 1.82, + "grad_norm": 0.3282836234716903, + "learning_rate": 4.5963153151781904e-06, + "loss": 1.0193, + "step": 18980 + }, + { + "epoch": 1.82, + "grad_norm": 0.3182926247449987, + "learning_rate": 4.591575576395568e-06, + "loss": 1.0674, + "step": 18981 + }, + { + "epoch": 1.82, + "grad_norm": 0.2968671857771453, + "learning_rate": 4.5868382252687415e-06, + "loss": 0.9896, + "step": 18982 + }, + { + "epoch": 1.82, + "grad_norm": 0.3248809373889038, + "learning_rate": 4.582103261916249e-06, + "loss": 1.0572, + "step": 18983 + }, + { + "epoch": 1.82, + "grad_norm": 0.31531526997331405, + "learning_rate": 4.577370686456583e-06, + "loss": 0.9246, + "step": 18984 + }, + { + "epoch": 1.82, + "grad_norm": 0.3060265191927756, + "learning_rate": 4.572640499008185e-06, + "loss": 1.0421, + "step": 18985 + }, + { + "epoch": 1.82, + "grad_norm": 0.27577272265666797, + "learning_rate": 4.567912699689447e-06, + "loss": 0.9934, + "step": 18986 + }, + { + "epoch": 1.82, + "grad_norm": 0.2946116131940658, + "learning_rate": 4.563187288618665e-06, + "loss": 1.0548, + "step": 18987 + }, + { + "epoch": 1.82, + "grad_norm": 0.3480991172215515, + "learning_rate": 4.5584642659141086e-06, + "loss": 1.0662, + "step": 18988 + }, + { + "epoch": 1.82, + "grad_norm": 0.36086233624917186, + "learning_rate": 4.553743631693952e-06, + "loss": 0.9936, + "step": 18989 + }, + { + "epoch": 1.82, + "grad_norm": 0.2954709528189461, + "learning_rate": 4.549025386076344e-06, + "loss": 0.9915, + "step": 18990 + }, + { + "epoch": 1.82, + "grad_norm": 0.3619182383971262, + "learning_rate": 4.544309529179369e-06, + "loss": 0.9768, + "step": 18991 + }, + { + "epoch": 1.82, + "grad_norm": 0.3241482267254355, + "learning_rate": 4.539596061121054e-06, + "loss": 1.0304, + "step": 18992 + }, + { + "epoch": 1.82, + "grad_norm": 0.3282827286061813, + "learning_rate": 4.534884982019327e-06, + "loss": 0.9479, + "step": 18993 + }, + { + "epoch": 1.82, + "grad_norm": 0.27087154815768116, + "learning_rate": 4.530176291992116e-06, + "loss": 1.0319, + "step": 18994 + }, + { + "epoch": 1.82, + "grad_norm": 0.2996989061528595, + "learning_rate": 4.525469991157227e-06, + "loss": 1.0424, + "step": 18995 + }, + { + "epoch": 1.82, + "grad_norm": 0.2714837013688411, + "learning_rate": 4.520766079632465e-06, + "loss": 1.0063, + "step": 18996 + }, + { + "epoch": 1.82, + "grad_norm": 0.3149554923254972, + "learning_rate": 4.516064557535515e-06, + "loss": 0.8773, + "step": 18997 + }, + { + "epoch": 1.82, + "grad_norm": 0.329370736540731, + "learning_rate": 4.511365424984093e-06, + "loss": 1.1271, + "step": 18998 + }, + { + "epoch": 1.82, + "grad_norm": 0.35224409681523977, + "learning_rate": 4.5066686820957495e-06, + "loss": 1.1796, + "step": 18999 + }, + { + "epoch": 1.82, + "grad_norm": 0.3119418485199795, + "learning_rate": 4.501974328988046e-06, + "loss": 0.9796, + "step": 19000 + }, + { + "epoch": 1.82, + "grad_norm": 0.2665891767871346, + "learning_rate": 4.497282365778432e-06, + "loss": 1.0219, + "step": 19001 + }, + { + "epoch": 1.82, + "grad_norm": 0.34071623366519993, + "learning_rate": 4.4925927925843605e-06, + "loss": 1.1132, + "step": 19002 + }, + { + "epoch": 1.82, + "grad_norm": 0.30972444514624786, + "learning_rate": 4.487905609523168e-06, + "loss": 1.0314, + "step": 19003 + }, + { + "epoch": 1.82, + "grad_norm": 0.3050798577665559, + "learning_rate": 4.483220816712186e-06, + "loss": 1.0399, + "step": 19004 + }, + { + "epoch": 1.82, + "grad_norm": 0.3089464652985311, + "learning_rate": 4.478538414268618e-06, + "loss": 1.0209, + "step": 19005 + }, + { + "epoch": 1.82, + "grad_norm": 0.31745359475524954, + "learning_rate": 4.473858402309661e-06, + "loss": 1.0546, + "step": 19006 + }, + { + "epoch": 1.82, + "grad_norm": 0.32674199489957767, + "learning_rate": 4.4691807809524425e-06, + "loss": 1.0059, + "step": 19007 + }, + { + "epoch": 1.82, + "grad_norm": 0.302243853106551, + "learning_rate": 4.464505550314002e-06, + "loss": 1.0438, + "step": 19008 + }, + { + "epoch": 1.82, + "grad_norm": 0.2842345087244765, + "learning_rate": 4.459832710511369e-06, + "loss": 1.0671, + "step": 19009 + }, + { + "epoch": 1.82, + "grad_norm": 0.31285022949636543, + "learning_rate": 4.455162261661438e-06, + "loss": 0.9636, + "step": 19010 + }, + { + "epoch": 1.82, + "grad_norm": 0.34224471539554224, + "learning_rate": 4.450494203881139e-06, + "loss": 0.9811, + "step": 19011 + }, + { + "epoch": 1.82, + "grad_norm": 0.303483155607465, + "learning_rate": 4.4458285372872665e-06, + "loss": 1.1079, + "step": 19012 + }, + { + "epoch": 1.82, + "grad_norm": 0.30076874249285646, + "learning_rate": 4.441165261996605e-06, + "loss": 1.0163, + "step": 19013 + }, + { + "epoch": 1.82, + "grad_norm": 0.315101001708977, + "learning_rate": 4.436504378125817e-06, + "loss": 1.0888, + "step": 19014 + }, + { + "epoch": 1.82, + "grad_norm": 0.3090039220041389, + "learning_rate": 4.431845885791585e-06, + "loss": 0.9331, + "step": 19015 + }, + { + "epoch": 1.82, + "grad_norm": 0.36082349293887944, + "learning_rate": 4.4271897851104635e-06, + "loss": 1.0235, + "step": 19016 + }, + { + "epoch": 1.82, + "grad_norm": 0.2723515631097077, + "learning_rate": 4.4225360761989685e-06, + "loss": 0.9939, + "step": 19017 + }, + { + "epoch": 1.82, + "grad_norm": 0.2931549832168943, + "learning_rate": 4.417884759173596e-06, + "loss": 1.0365, + "step": 19018 + }, + { + "epoch": 1.82, + "grad_norm": 0.29430248308075363, + "learning_rate": 4.413235834150731e-06, + "loss": 0.929, + "step": 19019 + }, + { + "epoch": 1.82, + "grad_norm": 0.3178375725817254, + "learning_rate": 4.40858930124669e-06, + "loss": 1.0649, + "step": 19020 + }, + { + "epoch": 1.82, + "grad_norm": 0.27923783130491203, + "learning_rate": 4.403945160577805e-06, + "loss": 1.0775, + "step": 19021 + }, + { + "epoch": 1.82, + "grad_norm": 0.33480590445815656, + "learning_rate": 4.399303412260258e-06, + "loss": 1.0306, + "step": 19022 + }, + { + "epoch": 1.82, + "grad_norm": 0.2999682224655151, + "learning_rate": 4.3946640564102245e-06, + "loss": 1.0279, + "step": 19023 + }, + { + "epoch": 1.82, + "grad_norm": 0.29064576589989205, + "learning_rate": 4.390027093143811e-06, + "loss": 0.9921, + "step": 19024 + }, + { + "epoch": 1.82, + "grad_norm": 0.3167811741851801, + "learning_rate": 4.38539252257707e-06, + "loss": 1.0825, + "step": 19025 + }, + { + "epoch": 1.82, + "grad_norm": 0.3241207250862052, + "learning_rate": 4.380760344825952e-06, + "loss": 0.9983, + "step": 19026 + }, + { + "epoch": 1.82, + "grad_norm": 0.3054005182720288, + "learning_rate": 4.376130560006408e-06, + "loss": 0.9573, + "step": 19027 + }, + { + "epoch": 1.82, + "grad_norm": 0.28409768217664894, + "learning_rate": 4.371503168234303e-06, + "loss": 1.0192, + "step": 19028 + }, + { + "epoch": 1.82, + "grad_norm": 0.3240510179381279, + "learning_rate": 4.366878169625433e-06, + "loss": 1.0415, + "step": 19029 + }, + { + "epoch": 1.82, + "grad_norm": 0.2971204904433024, + "learning_rate": 4.3622555642955254e-06, + "loss": 1.0661, + "step": 19030 + }, + { + "epoch": 1.82, + "grad_norm": 0.34005674631821975, + "learning_rate": 4.357635352360279e-06, + "loss": 0.947, + "step": 19031 + }, + { + "epoch": 1.82, + "grad_norm": 0.35510153389281846, + "learning_rate": 4.353017533935333e-06, + "loss": 0.9966, + "step": 19032 + }, + { + "epoch": 1.82, + "grad_norm": 0.33342624835847307, + "learning_rate": 4.348402109136218e-06, + "loss": 1.0321, + "step": 19033 + }, + { + "epoch": 1.82, + "grad_norm": 0.32002164333247357, + "learning_rate": 4.343789078078475e-06, + "loss": 1.1118, + "step": 19034 + }, + { + "epoch": 1.82, + "grad_norm": 0.28209796192722847, + "learning_rate": 4.339178440877511e-06, + "loss": 1.0225, + "step": 19035 + }, + { + "epoch": 1.82, + "grad_norm": 0.2889832695717426, + "learning_rate": 4.334570197648746e-06, + "loss": 1.0612, + "step": 19036 + }, + { + "epoch": 1.82, + "grad_norm": 0.37170781974171163, + "learning_rate": 4.329964348507454e-06, + "loss": 0.9762, + "step": 19037 + }, + { + "epoch": 1.82, + "grad_norm": 0.32302685156256306, + "learning_rate": 4.3253608935689525e-06, + "loss": 0.9926, + "step": 19038 + }, + { + "epoch": 1.82, + "grad_norm": 0.31465526635293756, + "learning_rate": 4.320759832948418e-06, + "loss": 0.9931, + "step": 19039 + }, + { + "epoch": 1.82, + "grad_norm": 0.32502783947473146, + "learning_rate": 4.316161166761002e-06, + "loss": 0.8896, + "step": 19040 + }, + { + "epoch": 1.82, + "grad_norm": 0.3099888285347532, + "learning_rate": 4.311564895121789e-06, + "loss": 1.0153, + "step": 19041 + }, + { + "epoch": 1.82, + "grad_norm": 0.3126303897594447, + "learning_rate": 4.306971018145811e-06, + "loss": 1.0698, + "step": 19042 + }, + { + "epoch": 1.82, + "grad_norm": 0.37022196345908803, + "learning_rate": 4.302379535948009e-06, + "loss": 1.0694, + "step": 19043 + }, + { + "epoch": 1.82, + "grad_norm": 0.2924414954621829, + "learning_rate": 4.297790448643302e-06, + "loss": 1.075, + "step": 19044 + }, + { + "epoch": 1.82, + "grad_norm": 0.2658383861598201, + "learning_rate": 4.293203756346542e-06, + "loss": 0.9609, + "step": 19045 + }, + { + "epoch": 1.82, + "grad_norm": 0.30622610347320117, + "learning_rate": 4.288619459172516e-06, + "loss": 1.0378, + "step": 19046 + }, + { + "epoch": 1.82, + "grad_norm": 0.3502899394440976, + "learning_rate": 4.2840375572359204e-06, + "loss": 1.0477, + "step": 19047 + }, + { + "epoch": 1.82, + "grad_norm": 0.29943002101530364, + "learning_rate": 4.2794580506514526e-06, + "loss": 1.1465, + "step": 19048 + }, + { + "epoch": 1.82, + "grad_norm": 0.33510732154016715, + "learning_rate": 4.2748809395337095e-06, + "loss": 0.9655, + "step": 19049 + }, + { + "epoch": 1.82, + "grad_norm": 0.26871405577257645, + "learning_rate": 4.270306223997223e-06, + "loss": 1.0226, + "step": 19050 + }, + { + "epoch": 1.82, + "grad_norm": 0.31449829715846717, + "learning_rate": 4.265733904156488e-06, + "loss": 1.0232, + "step": 19051 + }, + { + "epoch": 1.82, + "grad_norm": 0.3294359846177066, + "learning_rate": 4.261163980125926e-06, + "loss": 0.9878, + "step": 19052 + }, + { + "epoch": 1.82, + "grad_norm": 0.2872497986124848, + "learning_rate": 4.256596452019923e-06, + "loss": 0.9973, + "step": 19053 + }, + { + "epoch": 1.82, + "grad_norm": 0.33304046083622557, + "learning_rate": 4.2520313199527426e-06, + "loss": 1.0057, + "step": 19054 + }, + { + "epoch": 1.82, + "grad_norm": 0.4185397099079884, + "learning_rate": 4.247468584038672e-06, + "loss": 0.9759, + "step": 19055 + }, + { + "epoch": 1.82, + "grad_norm": 0.35816706702556367, + "learning_rate": 4.242908244391874e-06, + "loss": 0.9699, + "step": 19056 + }, + { + "epoch": 1.82, + "grad_norm": 0.33409436147024013, + "learning_rate": 4.2383503011264815e-06, + "loss": 1.0248, + "step": 19057 + }, + { + "epoch": 1.82, + "grad_norm": 0.29776261286910194, + "learning_rate": 4.233794754356535e-06, + "loss": 0.9393, + "step": 19058 + }, + { + "epoch": 1.82, + "grad_norm": 0.30622429926444916, + "learning_rate": 4.229241604196088e-06, + "loss": 0.9875, + "step": 19059 + }, + { + "epoch": 1.82, + "grad_norm": 0.33495981238388, + "learning_rate": 4.224690850759039e-06, + "loss": 0.8922, + "step": 19060 + }, + { + "epoch": 1.82, + "grad_norm": 0.35032206518007186, + "learning_rate": 4.220142494159307e-06, + "loss": 0.9059, + "step": 19061 + }, + { + "epoch": 1.82, + "grad_norm": 0.3413376238733185, + "learning_rate": 4.215596534510702e-06, + "loss": 0.9287, + "step": 19062 + }, + { + "epoch": 1.82, + "grad_norm": 0.32917054044745964, + "learning_rate": 4.211052971926988e-06, + "loss": 1.0204, + "step": 19063 + }, + { + "epoch": 1.82, + "grad_norm": 0.30017706937632715, + "learning_rate": 4.206511806521873e-06, + "loss": 0.9762, + "step": 19064 + }, + { + "epoch": 1.82, + "grad_norm": 0.3122118395953923, + "learning_rate": 4.201973038409013e-06, + "loss": 0.9915, + "step": 19065 + }, + { + "epoch": 1.82, + "grad_norm": 0.32529452996030384, + "learning_rate": 4.197436667701982e-06, + "loss": 1.0763, + "step": 19066 + }, + { + "epoch": 1.82, + "grad_norm": 0.40060585861992604, + "learning_rate": 4.1929026945143006e-06, + "loss": 1.0433, + "step": 19067 + }, + { + "epoch": 1.82, + "grad_norm": 0.32653677459032254, + "learning_rate": 4.1883711189594554e-06, + "loss": 1.0035, + "step": 19068 + }, + { + "epoch": 1.82, + "grad_norm": 0.3352422466948183, + "learning_rate": 4.183841941150824e-06, + "loss": 1.0952, + "step": 19069 + }, + { + "epoch": 1.82, + "grad_norm": 0.305979938419924, + "learning_rate": 4.17931516120178e-06, + "loss": 1.0191, + "step": 19070 + }, + { + "epoch": 1.82, + "grad_norm": 0.36744610896417407, + "learning_rate": 4.174790779225568e-06, + "loss": 0.9459, + "step": 19071 + }, + { + "epoch": 1.82, + "grad_norm": 0.34464904787440165, + "learning_rate": 4.170268795335474e-06, + "loss": 0.9437, + "step": 19072 + }, + { + "epoch": 1.82, + "grad_norm": 0.3273459726388392, + "learning_rate": 4.16574920964461e-06, + "loss": 1.0621, + "step": 19073 + }, + { + "epoch": 1.82, + "grad_norm": 0.2742525835290965, + "learning_rate": 4.161232022266115e-06, + "loss": 0.8634, + "step": 19074 + }, + { + "epoch": 1.82, + "grad_norm": 0.30663612204339546, + "learning_rate": 4.156717233313012e-06, + "loss": 1.1247, + "step": 19075 + }, + { + "epoch": 1.83, + "grad_norm": 0.27618031248141806, + "learning_rate": 4.152204842898311e-06, + "loss": 0.9, + "step": 19076 + }, + { + "epoch": 1.83, + "grad_norm": 0.34520211177879306, + "learning_rate": 4.147694851134898e-06, + "loss": 1.0196, + "step": 19077 + }, + { + "epoch": 1.83, + "grad_norm": 0.3440988568125437, + "learning_rate": 4.143187258135672e-06, + "loss": 0.9989, + "step": 19078 + }, + { + "epoch": 1.83, + "grad_norm": 0.3001733174280439, + "learning_rate": 4.138682064013421e-06, + "loss": 1.0186, + "step": 19079 + }, + { + "epoch": 1.83, + "grad_norm": 0.2958103938660743, + "learning_rate": 4.134179268880911e-06, + "loss": 0.9523, + "step": 19080 + }, + { + "epoch": 1.83, + "grad_norm": 0.32934323150061046, + "learning_rate": 4.1296788728508065e-06, + "loss": 1.0433, + "step": 19081 + }, + { + "epoch": 1.83, + "grad_norm": 0.30831104433015377, + "learning_rate": 4.12518087603575e-06, + "loss": 0.9969, + "step": 19082 + }, + { + "epoch": 1.83, + "grad_norm": 0.32301911174699194, + "learning_rate": 4.1206852785482865e-06, + "loss": 0.9592, + "step": 19083 + }, + { + "epoch": 1.83, + "grad_norm": 0.32145605023942775, + "learning_rate": 4.116192080500936e-06, + "loss": 1.1476, + "step": 19084 + }, + { + "epoch": 1.83, + "grad_norm": 0.3011714945075884, + "learning_rate": 4.111701282006131e-06, + "loss": 0.899, + "step": 19085 + }, + { + "epoch": 1.83, + "grad_norm": 0.29331439621934835, + "learning_rate": 4.107212883176282e-06, + "loss": 1.1596, + "step": 19086 + }, + { + "epoch": 1.83, + "grad_norm": 0.3452004929106225, + "learning_rate": 4.1027268841236865e-06, + "loss": 1.0107, + "step": 19087 + }, + { + "epoch": 1.83, + "grad_norm": 0.31436569475743026, + "learning_rate": 4.098243284960623e-06, + "loss": 1.1666, + "step": 19088 + }, + { + "epoch": 1.83, + "grad_norm": 0.2842884857374239, + "learning_rate": 4.093762085799291e-06, + "loss": 1.0863, + "step": 19089 + }, + { + "epoch": 1.83, + "grad_norm": 0.3103378924334046, + "learning_rate": 4.089283286751844e-06, + "loss": 1.1104, + "step": 19090 + }, + { + "epoch": 1.83, + "grad_norm": 0.3104256690176975, + "learning_rate": 4.084806887930348e-06, + "loss": 0.9863, + "step": 19091 + }, + { + "epoch": 1.83, + "grad_norm": 0.345371521234481, + "learning_rate": 4.080332889446836e-06, + "loss": 1.0492, + "step": 19092 + }, + { + "epoch": 1.83, + "grad_norm": 0.3578048197551978, + "learning_rate": 4.075861291413297e-06, + "loss": 1.0235, + "step": 19093 + }, + { + "epoch": 1.83, + "grad_norm": 0.32308272776764513, + "learning_rate": 4.071392093941606e-06, + "loss": 1.0525, + "step": 19094 + }, + { + "epoch": 1.83, + "grad_norm": 0.2871006490016058, + "learning_rate": 4.06692529714362e-06, + "loss": 0.9306, + "step": 19095 + }, + { + "epoch": 1.83, + "grad_norm": 0.3444541198690883, + "learning_rate": 4.062460901131116e-06, + "loss": 0.976, + "step": 19096 + }, + { + "epoch": 1.83, + "grad_norm": 0.28215637052874826, + "learning_rate": 4.057998906015836e-06, + "loss": 0.9786, + "step": 19097 + }, + { + "epoch": 1.83, + "grad_norm": 0.2945918499205393, + "learning_rate": 4.0535393119093934e-06, + "loss": 1.0334, + "step": 19098 + }, + { + "epoch": 1.83, + "grad_norm": 0.2839857003945392, + "learning_rate": 4.049082118923464e-06, + "loss": 1.0458, + "step": 19099 + }, + { + "epoch": 1.83, + "grad_norm": 0.32581924629932374, + "learning_rate": 4.044627327169548e-06, + "loss": 1.0103, + "step": 19100 + }, + { + "epoch": 1.83, + "grad_norm": 0.32085009323159164, + "learning_rate": 4.040174936759144e-06, + "loss": 1.094, + "step": 19101 + }, + { + "epoch": 1.83, + "grad_norm": 0.2904097246402389, + "learning_rate": 4.035724947803654e-06, + "loss": 1.0174, + "step": 19102 + }, + { + "epoch": 1.83, + "grad_norm": 0.293195212346906, + "learning_rate": 4.031277360414487e-06, + "loss": 0.8855, + "step": 19103 + }, + { + "epoch": 1.83, + "grad_norm": 0.28703905622413584, + "learning_rate": 4.0268321747028884e-06, + "loss": 0.992, + "step": 19104 + }, + { + "epoch": 1.83, + "grad_norm": 0.30526143268621275, + "learning_rate": 4.022389390780146e-06, + "loss": 1.0526, + "step": 19105 + }, + { + "epoch": 1.83, + "grad_norm": 0.31953207859917776, + "learning_rate": 4.017949008757416e-06, + "loss": 1.1227, + "step": 19106 + }, + { + "epoch": 1.83, + "grad_norm": 0.3594632770154032, + "learning_rate": 4.0135110287458536e-06, + "loss": 0.9565, + "step": 19107 + }, + { + "epoch": 1.83, + "grad_norm": 0.31464463497970546, + "learning_rate": 4.009075450856492e-06, + "loss": 1.0147, + "step": 19108 + }, + { + "epoch": 1.83, + "grad_norm": 0.315106447489395, + "learning_rate": 4.0046422752003435e-06, + "loss": 0.9592, + "step": 19109 + }, + { + "epoch": 1.83, + "grad_norm": 0.31999017046521766, + "learning_rate": 4.000211501888363e-06, + "loss": 1.0033, + "step": 19110 + }, + { + "epoch": 1.83, + "grad_norm": 0.3373443706519512, + "learning_rate": 3.995783131031416e-06, + "loss": 1.0407, + "step": 19111 + }, + { + "epoch": 1.83, + "grad_norm": 0.2907847807609406, + "learning_rate": 3.991357162740328e-06, + "loss": 1.0726, + "step": 19112 + }, + { + "epoch": 1.83, + "grad_norm": 0.3577601295668185, + "learning_rate": 3.986933597125875e-06, + "loss": 1.063, + "step": 19113 + }, + { + "epoch": 1.83, + "grad_norm": 0.3404949901312744, + "learning_rate": 3.982512434298746e-06, + "loss": 1.0449, + "step": 19114 + }, + { + "epoch": 1.83, + "grad_norm": 0.2553027852998145, + "learning_rate": 3.978093674369587e-06, + "loss": 0.8829, + "step": 19115 + }, + { + "epoch": 1.83, + "grad_norm": 0.33101193979226495, + "learning_rate": 3.973677317448998e-06, + "loss": 1.089, + "step": 19116 + }, + { + "epoch": 1.83, + "grad_norm": 0.3274368743439695, + "learning_rate": 3.969263363647468e-06, + "loss": 0.8929, + "step": 19117 + }, + { + "epoch": 1.83, + "grad_norm": 0.32337920044679574, + "learning_rate": 3.964851813075487e-06, + "loss": 1.0063, + "step": 19118 + }, + { + "epoch": 1.83, + "grad_norm": 0.3039610490405899, + "learning_rate": 3.960442665843411e-06, + "loss": 1.106, + "step": 19119 + }, + { + "epoch": 1.83, + "grad_norm": 0.2970985293650194, + "learning_rate": 3.956035922061652e-06, + "loss": 0.8142, + "step": 19120 + }, + { + "epoch": 1.83, + "grad_norm": 0.29467509304253275, + "learning_rate": 3.951631581840443e-06, + "loss": 1.1155, + "step": 19121 + }, + { + "epoch": 1.83, + "grad_norm": 0.32360235730254094, + "learning_rate": 3.947229645290029e-06, + "loss": 0.9643, + "step": 19122 + }, + { + "epoch": 1.83, + "grad_norm": 0.2875457345155455, + "learning_rate": 3.9428301125205455e-06, + "loss": 1.011, + "step": 19123 + }, + { + "epoch": 1.83, + "grad_norm": 0.30155068596825346, + "learning_rate": 3.9384329836421155e-06, + "loss": 1.0164, + "step": 19124 + }, + { + "epoch": 1.83, + "grad_norm": 0.29794611099488666, + "learning_rate": 3.93403825876476e-06, + "loss": 1.056, + "step": 19125 + }, + { + "epoch": 1.83, + "grad_norm": 0.3373023623177827, + "learning_rate": 3.929645937998483e-06, + "loss": 1.0121, + "step": 19126 + }, + { + "epoch": 1.83, + "grad_norm": 0.32404230865676914, + "learning_rate": 3.925256021453194e-06, + "loss": 1.0786, + "step": 19127 + }, + { + "epoch": 1.83, + "grad_norm": 0.3260839368285672, + "learning_rate": 3.920868509238762e-06, + "loss": 0.9705, + "step": 19128 + }, + { + "epoch": 1.83, + "grad_norm": 0.30308324390786495, + "learning_rate": 3.916483401464965e-06, + "loss": 1.0927, + "step": 19129 + }, + { + "epoch": 1.83, + "grad_norm": 0.3499121243325125, + "learning_rate": 3.912100698241561e-06, + "loss": 0.9216, + "step": 19130 + }, + { + "epoch": 1.83, + "grad_norm": 0.2969182091049244, + "learning_rate": 3.907720399678228e-06, + "loss": 1.0771, + "step": 19131 + }, + { + "epoch": 1.83, + "grad_norm": 0.3221999332384458, + "learning_rate": 3.903342505884589e-06, + "loss": 1.0223, + "step": 19132 + }, + { + "epoch": 1.83, + "grad_norm": 0.27637598317406326, + "learning_rate": 3.898967016970179e-06, + "loss": 0.873, + "step": 19133 + }, + { + "epoch": 1.83, + "grad_norm": 0.35579591008316946, + "learning_rate": 3.894593933044533e-06, + "loss": 0.9826, + "step": 19134 + }, + { + "epoch": 1.83, + "grad_norm": 0.32125832761471046, + "learning_rate": 3.890223254217074e-06, + "loss": 1.1169, + "step": 19135 + }, + { + "epoch": 1.83, + "grad_norm": 0.30860369791963693, + "learning_rate": 3.885854980597181e-06, + "loss": 1.0191, + "step": 19136 + }, + { + "epoch": 1.83, + "grad_norm": 0.2996565983475842, + "learning_rate": 3.881489112294167e-06, + "loss": 1.0127, + "step": 19137 + }, + { + "epoch": 1.83, + "grad_norm": 0.2830272472345875, + "learning_rate": 3.877125649417302e-06, + "loss": 0.8669, + "step": 19138 + }, + { + "epoch": 1.83, + "grad_norm": 0.3193965177202238, + "learning_rate": 3.872764592075773e-06, + "loss": 1.0999, + "step": 19139 + }, + { + "epoch": 1.83, + "grad_norm": 0.29032055381208965, + "learning_rate": 3.868405940378716e-06, + "loss": 1.0547, + "step": 19140 + }, + { + "epoch": 1.83, + "grad_norm": 0.37970683234652397, + "learning_rate": 3.864049694435234e-06, + "loss": 1.0171, + "step": 19141 + }, + { + "epoch": 1.83, + "grad_norm": 0.297090499134574, + "learning_rate": 3.859695854354317e-06, + "loss": 1.0034, + "step": 19142 + }, + { + "epoch": 1.83, + "grad_norm": 0.3024236271978167, + "learning_rate": 3.855344420244944e-06, + "loss": 0.894, + "step": 19143 + }, + { + "epoch": 1.83, + "grad_norm": 0.3249660640324817, + "learning_rate": 3.8509953922159945e-06, + "loss": 1.0247, + "step": 19144 + }, + { + "epoch": 1.83, + "grad_norm": 0.3065630430328336, + "learning_rate": 3.846648770376316e-06, + "loss": 1.0621, + "step": 19145 + }, + { + "epoch": 1.83, + "grad_norm": 0.3107832193325116, + "learning_rate": 3.8423045548346636e-06, + "loss": 1.0655, + "step": 19146 + }, + { + "epoch": 1.83, + "grad_norm": 0.3137337530421254, + "learning_rate": 3.837962745699797e-06, + "loss": 1.0515, + "step": 19147 + }, + { + "epoch": 1.83, + "grad_norm": 0.34113570983892544, + "learning_rate": 3.833623343080328e-06, + "loss": 1.0991, + "step": 19148 + }, + { + "epoch": 1.83, + "grad_norm": 0.3091985142531074, + "learning_rate": 3.829286347084893e-06, + "loss": 1.094, + "step": 19149 + }, + { + "epoch": 1.83, + "grad_norm": 0.3414375913499566, + "learning_rate": 3.824951757821993e-06, + "loss": 1.1933, + "step": 19150 + }, + { + "epoch": 1.83, + "grad_norm": 0.28568028681893076, + "learning_rate": 3.820619575400131e-06, + "loss": 0.9819, + "step": 19151 + }, + { + "epoch": 1.83, + "grad_norm": 0.35919245946790007, + "learning_rate": 3.816289799927697e-06, + "loss": 1.0016, + "step": 19152 + }, + { + "epoch": 1.83, + "grad_norm": 0.2992944315616545, + "learning_rate": 3.811962431513061e-06, + "loss": 0.9404, + "step": 19153 + }, + { + "epoch": 1.83, + "grad_norm": 0.3324381439336268, + "learning_rate": 3.8076374702645355e-06, + "loss": 1.0253, + "step": 19154 + }, + { + "epoch": 1.83, + "grad_norm": 0.3082377024923424, + "learning_rate": 3.8033149162903125e-06, + "loss": 1.03, + "step": 19155 + }, + { + "epoch": 1.83, + "grad_norm": 0.27466963030341274, + "learning_rate": 3.798994769698616e-06, + "loss": 0.8906, + "step": 19156 + }, + { + "epoch": 1.83, + "grad_norm": 0.33271086776047143, + "learning_rate": 3.7946770305975154e-06, + "loss": 1.1101, + "step": 19157 + }, + { + "epoch": 1.83, + "grad_norm": 0.26763360130354347, + "learning_rate": 3.7903616990951017e-06, + "loss": 1.1034, + "step": 19158 + }, + { + "epoch": 1.83, + "grad_norm": 0.31500159008936807, + "learning_rate": 3.7860487752993332e-06, + "loss": 0.9349, + "step": 19159 + }, + { + "epoch": 1.83, + "grad_norm": 0.33874125296281976, + "learning_rate": 3.7817382593181796e-06, + "loss": 1.0988, + "step": 19160 + }, + { + "epoch": 1.83, + "grad_norm": 0.30039444659485903, + "learning_rate": 3.7774301512594757e-06, + "loss": 1.0811, + "step": 19161 + }, + { + "epoch": 1.83, + "grad_norm": 0.28773113925660143, + "learning_rate": 3.7731244512310805e-06, + "loss": 0.8937, + "step": 19162 + }, + { + "epoch": 1.83, + "grad_norm": 0.30792417912071246, + "learning_rate": 3.7688211593407076e-06, + "loss": 1.0294, + "step": 19163 + }, + { + "epoch": 1.83, + "grad_norm": 0.35867117450991154, + "learning_rate": 3.7645202756960816e-06, + "loss": 1.1023, + "step": 19164 + }, + { + "epoch": 1.83, + "grad_norm": 0.3606878902463763, + "learning_rate": 3.7602218004047952e-06, + "loss": 1.0192, + "step": 19165 + }, + { + "epoch": 1.83, + "grad_norm": 0.32412192695286635, + "learning_rate": 3.75592573357445e-06, + "loss": 1.0751, + "step": 19166 + }, + { + "epoch": 1.83, + "grad_norm": 0.2926705443759884, + "learning_rate": 3.75163207531255e-06, + "loss": 1.0473, + "step": 19167 + }, + { + "epoch": 1.83, + "grad_norm": 0.3293849439536471, + "learning_rate": 3.747340825726553e-06, + "loss": 0.9364, + "step": 19168 + }, + { + "epoch": 1.83, + "grad_norm": 0.3293907754757217, + "learning_rate": 3.743051984923829e-06, + "loss": 1.054, + "step": 19169 + }, + { + "epoch": 1.83, + "grad_norm": 0.34467361681849035, + "learning_rate": 3.738765553011736e-06, + "loss": 1.045, + "step": 19170 + }, + { + "epoch": 1.83, + "grad_norm": 0.27982699296870917, + "learning_rate": 3.7344815300975444e-06, + "loss": 0.9284, + "step": 19171 + }, + { + "epoch": 1.83, + "grad_norm": 0.34727842393089814, + "learning_rate": 3.730199916288435e-06, + "loss": 1.0352, + "step": 19172 + }, + { + "epoch": 1.83, + "grad_norm": 0.3457527301481188, + "learning_rate": 3.7259207116915775e-06, + "loss": 0.9662, + "step": 19173 + }, + { + "epoch": 1.83, + "grad_norm": 0.3465273473910789, + "learning_rate": 3.7216439164140637e-06, + "loss": 1.0784, + "step": 19174 + }, + { + "epoch": 1.83, + "grad_norm": 0.3394776929312667, + "learning_rate": 3.71736953056292e-06, + "loss": 0.9072, + "step": 19175 + }, + { + "epoch": 1.83, + "grad_norm": 0.2597025404457504, + "learning_rate": 3.713097554245115e-06, + "loss": 1.0132, + "step": 19176 + }, + { + "epoch": 1.83, + "grad_norm": 0.33002378802144466, + "learning_rate": 3.7088279875675645e-06, + "loss": 1.0794, + "step": 19177 + }, + { + "epoch": 1.83, + "grad_norm": 0.3099961830717049, + "learning_rate": 3.704560830637105e-06, + "loss": 1.0154, + "step": 19178 + }, + { + "epoch": 1.83, + "grad_norm": 0.30438293369324276, + "learning_rate": 3.7002960835605396e-06, + "loss": 1.0974, + "step": 19179 + }, + { + "epoch": 1.83, + "grad_norm": 0.26857588431996543, + "learning_rate": 3.6960337464445607e-06, + "loss": 0.9577, + "step": 19180 + }, + { + "epoch": 1.84, + "grad_norm": 0.32497815023830523, + "learning_rate": 3.691773819395883e-06, + "loss": 0.995, + "step": 19181 + }, + { + "epoch": 1.84, + "grad_norm": 0.2938954774966076, + "learning_rate": 3.687516302521088e-06, + "loss": 1.0211, + "step": 19182 + }, + { + "epoch": 1.84, + "grad_norm": 0.34294261471805626, + "learning_rate": 3.683261195926735e-06, + "loss": 1.0013, + "step": 19183 + }, + { + "epoch": 1.84, + "grad_norm": 0.33907652298133756, + "learning_rate": 3.679008499719283e-06, + "loss": 1.0785, + "step": 19184 + }, + { + "epoch": 1.84, + "grad_norm": 0.33093089211904947, + "learning_rate": 3.6747582140052027e-06, + "loss": 1.1194, + "step": 19185 + }, + { + "epoch": 1.84, + "grad_norm": 0.33727080245249214, + "learning_rate": 3.670510338890809e-06, + "loss": 1.0513, + "step": 19186 + }, + { + "epoch": 1.84, + "grad_norm": 0.29424687907630337, + "learning_rate": 3.6662648744824613e-06, + "loss": 1.0551, + "step": 19187 + }, + { + "epoch": 1.84, + "grad_norm": 0.2928900181232962, + "learning_rate": 3.6620218208863634e-06, + "loss": 1.073, + "step": 19188 + }, + { + "epoch": 1.84, + "grad_norm": 0.26400361939747075, + "learning_rate": 3.65778117820873e-06, + "loss": 1.0023, + "step": 19189 + }, + { + "epoch": 1.84, + "grad_norm": 0.30763584603982896, + "learning_rate": 3.653542946555655e-06, + "loss": 0.9441, + "step": 19190 + }, + { + "epoch": 1.84, + "grad_norm": 0.31529360526263855, + "learning_rate": 3.64930712603323e-06, + "loss": 1.0181, + "step": 19191 + }, + { + "epoch": 1.84, + "grad_norm": 0.3634410517776593, + "learning_rate": 3.645073716747449e-06, + "loss": 0.9786, + "step": 19192 + }, + { + "epoch": 1.84, + "grad_norm": 0.34610695207131487, + "learning_rate": 3.6408427188042494e-06, + "loss": 1.0445, + "step": 19193 + }, + { + "epoch": 1.84, + "grad_norm": 0.29692380420985914, + "learning_rate": 3.6366141323095127e-06, + "loss": 1.0182, + "step": 19194 + }, + { + "epoch": 1.84, + "grad_norm": 0.36759640133973226, + "learning_rate": 3.632387957369077e-06, + "loss": 1.0021, + "step": 19195 + }, + { + "epoch": 1.84, + "grad_norm": 0.2789556184472715, + "learning_rate": 3.628164194088701e-06, + "loss": 0.9958, + "step": 19196 + }, + { + "epoch": 1.84, + "grad_norm": 0.3125110333096962, + "learning_rate": 3.6239428425740795e-06, + "loss": 1.0677, + "step": 19197 + }, + { + "epoch": 1.84, + "grad_norm": 0.30510248003234974, + "learning_rate": 3.6197239029308718e-06, + "loss": 0.9315, + "step": 19198 + }, + { + "epoch": 1.84, + "grad_norm": 0.2611505725750414, + "learning_rate": 3.6155073752646263e-06, + "loss": 1.0376, + "step": 19199 + }, + { + "epoch": 1.84, + "grad_norm": 0.3250715278134715, + "learning_rate": 3.6112932596808924e-06, + "loss": 1.0265, + "step": 19200 + }, + { + "epoch": 1.84, + "grad_norm": 0.31139417231405153, + "learning_rate": 3.607081556285119e-06, + "loss": 1.0757, + "step": 19201 + }, + { + "epoch": 1.84, + "grad_norm": 0.33680933097012505, + "learning_rate": 3.602872265182722e-06, + "loss": 1.0904, + "step": 19202 + }, + { + "epoch": 1.84, + "grad_norm": 0.3140906527815178, + "learning_rate": 3.5986653864790166e-06, + "loss": 0.9281, + "step": 19203 + }, + { + "epoch": 1.84, + "grad_norm": 0.32599932701797946, + "learning_rate": 3.594460920279308e-06, + "loss": 1.0702, + "step": 19204 + }, + { + "epoch": 1.84, + "grad_norm": 0.34827601250013834, + "learning_rate": 3.5902588666887892e-06, + "loss": 0.9808, + "step": 19205 + }, + { + "epoch": 1.84, + "grad_norm": 0.3255678432048602, + "learning_rate": 3.5860592258126545e-06, + "loss": 1.0135, + "step": 19206 + }, + { + "epoch": 1.84, + "grad_norm": 0.32618875075164777, + "learning_rate": 3.5818619977559533e-06, + "loss": 0.9779, + "step": 19207 + }, + { + "epoch": 1.84, + "grad_norm": 0.33086794497789285, + "learning_rate": 3.5776671826237786e-06, + "loss": 0.9718, + "step": 19208 + }, + { + "epoch": 1.84, + "grad_norm": 0.30689381741200705, + "learning_rate": 3.5734747805210576e-06, + "loss": 0.929, + "step": 19209 + }, + { + "epoch": 1.84, + "grad_norm": 0.3114034798456375, + "learning_rate": 3.5692847915527515e-06, + "loss": 0.8745, + "step": 19210 + }, + { + "epoch": 1.84, + "grad_norm": 0.30894844640306074, + "learning_rate": 3.565097215823687e-06, + "loss": 0.9539, + "step": 19211 + }, + { + "epoch": 1.84, + "grad_norm": 0.3388541885534938, + "learning_rate": 3.560912053438681e-06, + "loss": 0.9417, + "step": 19212 + }, + { + "epoch": 1.84, + "grad_norm": 0.2965427242866805, + "learning_rate": 3.5567293045024487e-06, + "loss": 1.0518, + "step": 19213 + }, + { + "epoch": 1.84, + "grad_norm": 0.3072649879261051, + "learning_rate": 3.5525489691196844e-06, + "loss": 0.9332, + "step": 19214 + }, + { + "epoch": 1.84, + "grad_norm": 0.28144273862455577, + "learning_rate": 3.548371047394994e-06, + "loss": 0.9843, + "step": 19215 + }, + { + "epoch": 1.84, + "grad_norm": 0.3177952072309086, + "learning_rate": 3.5441955394329375e-06, + "loss": 1.0399, + "step": 19216 + }, + { + "epoch": 1.84, + "grad_norm": 0.2914728559881997, + "learning_rate": 3.5400224453380203e-06, + "loss": 1.0562, + "step": 19217 + }, + { + "epoch": 1.84, + "grad_norm": 0.34289077903399845, + "learning_rate": 3.5358517652146485e-06, + "loss": 1.0841, + "step": 19218 + }, + { + "epoch": 1.84, + "grad_norm": 0.3133675322491553, + "learning_rate": 3.5316834991672267e-06, + "loss": 0.9794, + "step": 19219 + }, + { + "epoch": 1.84, + "grad_norm": 0.3066041843681677, + "learning_rate": 3.527517647300027e-06, + "loss": 1.0322, + "step": 19220 + }, + { + "epoch": 1.84, + "grad_norm": 0.32070499517112505, + "learning_rate": 3.5233542097173554e-06, + "loss": 1.035, + "step": 19221 + }, + { + "epoch": 1.84, + "grad_norm": 0.2946643521595732, + "learning_rate": 3.519193186523362e-06, + "loss": 0.9634, + "step": 19222 + }, + { + "epoch": 1.84, + "grad_norm": 0.3446066890601608, + "learning_rate": 3.515034577822207e-06, + "loss": 1.0433, + "step": 19223 + }, + { + "epoch": 1.84, + "grad_norm": 0.3624833636970044, + "learning_rate": 3.5108783837179415e-06, + "loss": 1.0713, + "step": 19224 + }, + { + "epoch": 1.84, + "grad_norm": 0.2722530251936715, + "learning_rate": 3.506724604314593e-06, + "loss": 0.9743, + "step": 19225 + }, + { + "epoch": 1.84, + "grad_norm": 0.31219512461417726, + "learning_rate": 3.5025732397161004e-06, + "loss": 0.9437, + "step": 19226 + }, + { + "epoch": 1.84, + "grad_norm": 0.3227196794669807, + "learning_rate": 3.4984242900263474e-06, + "loss": 1.0726, + "step": 19227 + }, + { + "epoch": 1.84, + "grad_norm": 0.3092879265877368, + "learning_rate": 3.4942777553491847e-06, + "loss": 1.0384, + "step": 19228 + }, + { + "epoch": 1.84, + "grad_norm": 0.29095951811038345, + "learning_rate": 3.490133635788384e-06, + "loss": 1.0868, + "step": 19229 + }, + { + "epoch": 1.84, + "grad_norm": 0.27993478456721166, + "learning_rate": 3.4859919314476296e-06, + "loss": 0.9963, + "step": 19230 + }, + { + "epoch": 1.84, + "grad_norm": 0.32502309583249767, + "learning_rate": 3.4818526424305943e-06, + "loss": 1.0169, + "step": 19231 + }, + { + "epoch": 1.84, + "grad_norm": 0.31526115194001764, + "learning_rate": 3.4777157688408502e-06, + "loss": 1.0788, + "step": 19232 + }, + { + "epoch": 1.84, + "grad_norm": 0.36503994826903724, + "learning_rate": 3.473581310781926e-06, + "loss": 0.9202, + "step": 19233 + }, + { + "epoch": 1.84, + "grad_norm": 0.30336734032765905, + "learning_rate": 3.4694492683572944e-06, + "loss": 0.8607, + "step": 19234 + }, + { + "epoch": 1.84, + "grad_norm": 0.31978953756290224, + "learning_rate": 3.4653196416703725e-06, + "loss": 1.1167, + "step": 19235 + }, + { + "epoch": 1.84, + "grad_norm": 0.33605167687510884, + "learning_rate": 3.461192430824489e-06, + "loss": 0.9464, + "step": 19236 + }, + { + "epoch": 1.84, + "grad_norm": 0.34447976112147366, + "learning_rate": 3.4570676359229283e-06, + "loss": 0.9683, + "step": 19237 + }, + { + "epoch": 1.84, + "grad_norm": 0.27657005160396486, + "learning_rate": 3.45294525706894e-06, + "loss": 0.9751, + "step": 19238 + }, + { + "epoch": 1.84, + "grad_norm": 0.2850309111489699, + "learning_rate": 3.448825294365665e-06, + "loss": 1.0824, + "step": 19239 + }, + { + "epoch": 1.84, + "grad_norm": 0.31794286232880653, + "learning_rate": 3.4447077479162314e-06, + "loss": 1.0228, + "step": 19240 + }, + { + "epoch": 1.84, + "grad_norm": 0.3248385343842268, + "learning_rate": 3.440592617823646e-06, + "loss": 0.9747, + "step": 19241 + }, + { + "epoch": 1.84, + "grad_norm": 0.3479543898846845, + "learning_rate": 3.436479904190937e-06, + "loss": 0.8719, + "step": 19242 + }, + { + "epoch": 1.84, + "grad_norm": 0.26887097109792946, + "learning_rate": 3.432369607120989e-06, + "loss": 0.8725, + "step": 19243 + }, + { + "epoch": 1.84, + "grad_norm": 0.31747525475872007, + "learning_rate": 3.428261726716708e-06, + "loss": 0.9648, + "step": 19244 + }, + { + "epoch": 1.84, + "grad_norm": 0.3118173119146154, + "learning_rate": 3.4241562630808468e-06, + "loss": 0.9223, + "step": 19245 + }, + { + "epoch": 1.84, + "grad_norm": 0.2831736934161838, + "learning_rate": 3.420053216316188e-06, + "loss": 1.0366, + "step": 19246 + }, + { + "epoch": 1.84, + "grad_norm": 0.32837102932532664, + "learning_rate": 3.4159525865253618e-06, + "loss": 1.0397, + "step": 19247 + }, + { + "epoch": 1.84, + "grad_norm": 0.3038090234785231, + "learning_rate": 3.411854373811063e-06, + "loss": 1.0059, + "step": 19248 + }, + { + "epoch": 1.84, + "grad_norm": 0.28906001174000956, + "learning_rate": 3.407758578275788e-06, + "loss": 1.011, + "step": 19249 + }, + { + "epoch": 1.84, + "grad_norm": 0.32834255951842845, + "learning_rate": 3.403665200022077e-06, + "loss": 1.0289, + "step": 19250 + }, + { + "epoch": 1.84, + "grad_norm": 0.3422485889288684, + "learning_rate": 3.3995742391523366e-06, + "loss": 1.0352, + "step": 19251 + }, + { + "epoch": 1.84, + "grad_norm": 0.3086832652367416, + "learning_rate": 3.395485695768963e-06, + "loss": 0.9143, + "step": 19252 + }, + { + "epoch": 1.84, + "grad_norm": 0.3215792440589205, + "learning_rate": 3.391399569974285e-06, + "loss": 1.0505, + "step": 19253 + }, + { + "epoch": 1.84, + "grad_norm": 0.31451947871477065, + "learning_rate": 3.3873158618705436e-06, + "loss": 1.087, + "step": 19254 + }, + { + "epoch": 1.84, + "grad_norm": 0.34134439999456107, + "learning_rate": 3.3832345715599343e-06, + "loss": 0.928, + "step": 19255 + }, + { + "epoch": 1.84, + "grad_norm": 0.2888957852468854, + "learning_rate": 3.3791556991446094e-06, + "loss": 1.1023, + "step": 19256 + }, + { + "epoch": 1.84, + "grad_norm": 0.321948505954246, + "learning_rate": 3.3750792447266422e-06, + "loss": 0.9885, + "step": 19257 + }, + { + "epoch": 1.84, + "grad_norm": 0.29057622261392746, + "learning_rate": 3.371005208408029e-06, + "loss": 1.0313, + "step": 19258 + }, + { + "epoch": 1.84, + "grad_norm": 0.3286367703792019, + "learning_rate": 3.366933590290755e-06, + "loss": 1.08, + "step": 19259 + }, + { + "epoch": 1.84, + "grad_norm": 0.3103141621146589, + "learning_rate": 3.3628643904766943e-06, + "loss": 0.9164, + "step": 19260 + }, + { + "epoch": 1.84, + "grad_norm": 0.3260389222429364, + "learning_rate": 3.3587976090676763e-06, + "loss": 0.9318, + "step": 19261 + }, + { + "epoch": 1.84, + "grad_norm": 0.2826489165110318, + "learning_rate": 3.354733246165498e-06, + "loss": 0.958, + "step": 19262 + }, + { + "epoch": 1.84, + "grad_norm": 0.30525295993732554, + "learning_rate": 3.3506713018718548e-06, + "loss": 0.9637, + "step": 19263 + }, + { + "epoch": 1.84, + "grad_norm": 0.3330419534698357, + "learning_rate": 3.3466117762883997e-06, + "loss": 0.9909, + "step": 19264 + }, + { + "epoch": 1.84, + "grad_norm": 0.3326608708159997, + "learning_rate": 3.34255466951674e-06, + "loss": 0.9724, + "step": 19265 + }, + { + "epoch": 1.84, + "grad_norm": 0.3160511488138079, + "learning_rate": 3.3384999816583938e-06, + "loss": 1.0307, + "step": 19266 + }, + { + "epoch": 1.84, + "grad_norm": 0.2926777176980831, + "learning_rate": 3.3344477128148364e-06, + "loss": 1.0647, + "step": 19267 + }, + { + "epoch": 1.84, + "grad_norm": 0.3131883490912467, + "learning_rate": 3.330397863087453e-06, + "loss": 1.0427, + "step": 19268 + }, + { + "epoch": 1.84, + "grad_norm": 0.3241367285488988, + "learning_rate": 3.3263504325776407e-06, + "loss": 0.9705, + "step": 19269 + }, + { + "epoch": 1.84, + "grad_norm": 0.3336868214384319, + "learning_rate": 3.322305421386662e-06, + "loss": 0.9404, + "step": 19270 + }, + { + "epoch": 1.84, + "grad_norm": 0.34370503534020647, + "learning_rate": 3.3182628296157593e-06, + "loss": 0.9846, + "step": 19271 + }, + { + "epoch": 1.84, + "grad_norm": 0.3099607377734546, + "learning_rate": 3.3142226573660727e-06, + "loss": 0.9409, + "step": 19272 + }, + { + "epoch": 1.84, + "grad_norm": 0.31454373173865835, + "learning_rate": 3.3101849047387334e-06, + "loss": 0.9571, + "step": 19273 + }, + { + "epoch": 1.84, + "grad_norm": 0.3581737064509703, + "learning_rate": 3.3061495718347937e-06, + "loss": 1.1246, + "step": 19274 + }, + { + "epoch": 1.84, + "grad_norm": 0.3150773944912616, + "learning_rate": 3.302116658755228e-06, + "loss": 1.0169, + "step": 19275 + }, + { + "epoch": 1.84, + "grad_norm": 0.3412271084300605, + "learning_rate": 3.2980861656009554e-06, + "loss": 1.0327, + "step": 19276 + }, + { + "epoch": 1.84, + "grad_norm": 0.32823639464504867, + "learning_rate": 3.2940580924728625e-06, + "loss": 0.9482, + "step": 19277 + }, + { + "epoch": 1.84, + "grad_norm": 0.29453162453034637, + "learning_rate": 3.2900324394717464e-06, + "loss": 0.9559, + "step": 19278 + }, + { + "epoch": 1.84, + "grad_norm": 0.3136593131108365, + "learning_rate": 3.2860092066983373e-06, + "loss": 1.1284, + "step": 19279 + }, + { + "epoch": 1.84, + "grad_norm": 0.3406162831270255, + "learning_rate": 3.2819883942533434e-06, + "loss": 0.9578, + "step": 19280 + }, + { + "epoch": 1.84, + "grad_norm": 0.29233350785960616, + "learning_rate": 3.2779700022373518e-06, + "loss": 1.1367, + "step": 19281 + }, + { + "epoch": 1.84, + "grad_norm": 0.3253754418280763, + "learning_rate": 3.27395403075097e-06, + "loss": 1.0672, + "step": 19282 + }, + { + "epoch": 1.84, + "grad_norm": 0.3001478012883046, + "learning_rate": 3.269940479894662e-06, + "loss": 1.0112, + "step": 19283 + }, + { + "epoch": 1.84, + "grad_norm": 0.3076012548284295, + "learning_rate": 3.2659293497689035e-06, + "loss": 1.0942, + "step": 19284 + }, + { + "epoch": 1.85, + "grad_norm": 0.3438420044045165, + "learning_rate": 3.2619206404740475e-06, + "loss": 1.1015, + "step": 19285 + }, + { + "epoch": 1.85, + "grad_norm": 0.3244055688934239, + "learning_rate": 3.257914352110436e-06, + "loss": 1.0163, + "step": 19286 + }, + { + "epoch": 1.85, + "grad_norm": 0.2955255341685788, + "learning_rate": 3.2539104847783e-06, + "loss": 0.9709, + "step": 19287 + }, + { + "epoch": 1.85, + "grad_norm": 0.29276378407300296, + "learning_rate": 3.2499090385778696e-06, + "loss": 1.0786, + "step": 19288 + }, + { + "epoch": 1.85, + "grad_norm": 0.32725220151853085, + "learning_rate": 3.2459100136092656e-06, + "loss": 1.0646, + "step": 19289 + }, + { + "epoch": 1.85, + "grad_norm": 0.33111661532604564, + "learning_rate": 3.2419134099725743e-06, + "loss": 0.9976, + "step": 19290 + }, + { + "epoch": 1.85, + "grad_norm": 0.32729521281559654, + "learning_rate": 3.237919227767805e-06, + "loss": 0.96, + "step": 19291 + }, + { + "epoch": 1.85, + "grad_norm": 0.33981142962947947, + "learning_rate": 3.2339274670949328e-06, + "loss": 1.083, + "step": 19292 + }, + { + "epoch": 1.85, + "grad_norm": 0.3214815863510598, + "learning_rate": 3.229938128053822e-06, + "loss": 0.9095, + "step": 19293 + }, + { + "epoch": 1.85, + "grad_norm": 0.3220105083057529, + "learning_rate": 3.2259512107443378e-06, + "loss": 1.065, + "step": 19294 + }, + { + "epoch": 1.85, + "grad_norm": 0.3320708974988166, + "learning_rate": 3.2219667152662448e-06, + "loss": 1.0225, + "step": 19295 + }, + { + "epoch": 1.85, + "grad_norm": 0.3361252474853972, + "learning_rate": 3.2179846417192627e-06, + "loss": 0.9873, + "step": 19296 + }, + { + "epoch": 1.85, + "grad_norm": 0.3172038029580588, + "learning_rate": 3.2140049902030234e-06, + "loss": 1.0007, + "step": 19297 + }, + { + "epoch": 1.85, + "grad_norm": 0.3162793727181826, + "learning_rate": 3.2100277608171467e-06, + "loss": 1.0517, + "step": 19298 + }, + { + "epoch": 1.85, + "grad_norm": 0.3670262441797768, + "learning_rate": 3.206052953661165e-06, + "loss": 1.0667, + "step": 19299 + }, + { + "epoch": 1.85, + "grad_norm": 0.3618456916956864, + "learning_rate": 3.202080568834531e-06, + "loss": 1.0454, + "step": 19300 + }, + { + "epoch": 1.85, + "grad_norm": 0.32921428706844685, + "learning_rate": 3.1981106064366773e-06, + "loss": 0.9481, + "step": 19301 + }, + { + "epoch": 1.85, + "grad_norm": 0.33173104217487215, + "learning_rate": 3.1941430665669126e-06, + "loss": 0.9707, + "step": 19302 + }, + { + "epoch": 1.85, + "grad_norm": 0.3231902718595912, + "learning_rate": 3.190177949324591e-06, + "loss": 1.0369, + "step": 19303 + }, + { + "epoch": 1.85, + "grad_norm": 0.33294292086029365, + "learning_rate": 3.1862152548088887e-06, + "loss": 1.0225, + "step": 19304 + }, + { + "epoch": 1.85, + "grad_norm": 0.32182517073736067, + "learning_rate": 3.182254983119004e-06, + "loss": 0.9243, + "step": 19305 + }, + { + "epoch": 1.85, + "grad_norm": 0.31835306612822084, + "learning_rate": 3.1782971343540357e-06, + "loss": 1.0784, + "step": 19306 + }, + { + "epoch": 1.85, + "grad_norm": 0.3098363510236963, + "learning_rate": 3.174341708613038e-06, + "loss": 1.1045, + "step": 19307 + }, + { + "epoch": 1.85, + "grad_norm": 0.3273991146504305, + "learning_rate": 3.1703887059949755e-06, + "loss": 0.9996, + "step": 19308 + }, + { + "epoch": 1.85, + "grad_norm": 0.31204358757342854, + "learning_rate": 3.166438126598803e-06, + "loss": 1.0454, + "step": 19309 + }, + { + "epoch": 1.85, + "grad_norm": 0.23514569890641362, + "learning_rate": 3.1624899705233635e-06, + "loss": 0.9741, + "step": 19310 + }, + { + "epoch": 1.85, + "grad_norm": 0.3010907203476646, + "learning_rate": 3.158544237867489e-06, + "loss": 1.0104, + "step": 19311 + }, + { + "epoch": 1.85, + "grad_norm": 0.34812052586978554, + "learning_rate": 3.1546009287298894e-06, + "loss": 1.0273, + "step": 19312 + }, + { + "epoch": 1.85, + "grad_norm": 0.290303972652755, + "learning_rate": 3.150660043209286e-06, + "loss": 1.0275, + "step": 19313 + }, + { + "epoch": 1.85, + "grad_norm": 0.2730319727189548, + "learning_rate": 3.146721581404266e-06, + "loss": 0.9798, + "step": 19314 + }, + { + "epoch": 1.85, + "grad_norm": 0.2989048981858209, + "learning_rate": 3.142785543413418e-06, + "loss": 1.0718, + "step": 19315 + }, + { + "epoch": 1.85, + "grad_norm": 0.30795168276164353, + "learning_rate": 3.138851929335229e-06, + "loss": 1.0216, + "step": 19316 + }, + { + "epoch": 1.85, + "grad_norm": 0.29463381867222477, + "learning_rate": 3.1349207392681547e-06, + "loss": 0.9779, + "step": 19317 + }, + { + "epoch": 1.85, + "grad_norm": 0.2581965738850359, + "learning_rate": 3.1309919733105487e-06, + "loss": 0.9885, + "step": 19318 + }, + { + "epoch": 1.85, + "grad_norm": 0.32834534185235287, + "learning_rate": 3.127065631560755e-06, + "loss": 1.0557, + "step": 19319 + }, + { + "epoch": 1.85, + "grad_norm": 0.3495685544464802, + "learning_rate": 3.1231417141170393e-06, + "loss": 1.0476, + "step": 19320 + }, + { + "epoch": 1.85, + "grad_norm": 0.2813935862410458, + "learning_rate": 3.119220221077568e-06, + "loss": 1.03, + "step": 19321 + }, + { + "epoch": 1.85, + "grad_norm": 0.3212611948712865, + "learning_rate": 3.115301152540506e-06, + "loss": 1.0135, + "step": 19322 + }, + { + "epoch": 1.85, + "grad_norm": 0.3145167801159503, + "learning_rate": 3.1113845086039206e-06, + "loss": 0.9556, + "step": 19323 + }, + { + "epoch": 1.85, + "grad_norm": 0.3022821171429723, + "learning_rate": 3.1074702893658437e-06, + "loss": 0.9746, + "step": 19324 + }, + { + "epoch": 1.85, + "grad_norm": 0.2806690539430515, + "learning_rate": 3.103558494924197e-06, + "loss": 1.0008, + "step": 19325 + }, + { + "epoch": 1.85, + "grad_norm": 0.30023939098900126, + "learning_rate": 3.0996491253769133e-06, + "loss": 1.173, + "step": 19326 + }, + { + "epoch": 1.85, + "grad_norm": 0.34709687488805074, + "learning_rate": 3.0957421808218036e-06, + "loss": 1.0042, + "step": 19327 + }, + { + "epoch": 1.85, + "grad_norm": 0.2850068364191968, + "learning_rate": 3.091837661356656e-06, + "loss": 0.9849, + "step": 19328 + }, + { + "epoch": 1.85, + "grad_norm": 0.34302509272052, + "learning_rate": 3.0879355670791477e-06, + "loss": 1.0956, + "step": 19329 + }, + { + "epoch": 1.85, + "grad_norm": 0.30976553316415445, + "learning_rate": 3.08403589808699e-06, + "loss": 1.0261, + "step": 19330 + }, + { + "epoch": 1.85, + "grad_norm": 0.2899984516554628, + "learning_rate": 3.0801386544777268e-06, + "loss": 1.0432, + "step": 19331 + }, + { + "epoch": 1.85, + "grad_norm": 0.32114009952452177, + "learning_rate": 3.0762438363489133e-06, + "loss": 1.1802, + "step": 19332 + }, + { + "epoch": 1.85, + "grad_norm": 0.3374756687519646, + "learning_rate": 3.072351443798005e-06, + "loss": 1.043, + "step": 19333 + }, + { + "epoch": 1.85, + "grad_norm": 0.2913376559313997, + "learning_rate": 3.0684614769224353e-06, + "loss": 1.0387, + "step": 19334 + }, + { + "epoch": 1.85, + "grad_norm": 0.3110168015896429, + "learning_rate": 3.0645739358195034e-06, + "loss": 1.0507, + "step": 19335 + }, + { + "epoch": 1.85, + "grad_norm": 0.26773399926814356, + "learning_rate": 3.0606888205865545e-06, + "loss": 0.9775, + "step": 19336 + }, + { + "epoch": 1.85, + "grad_norm": 0.32886985967281646, + "learning_rate": 3.0568061313207883e-06, + "loss": 1.0238, + "step": 19337 + }, + { + "epoch": 1.85, + "grad_norm": 0.3603526072478496, + "learning_rate": 3.0529258681193828e-06, + "loss": 1.1972, + "step": 19338 + }, + { + "epoch": 1.85, + "grad_norm": 0.31219935336407095, + "learning_rate": 3.049048031079427e-06, + "loss": 1.0201, + "step": 19339 + }, + { + "epoch": 1.85, + "grad_norm": 0.36874603176818593, + "learning_rate": 3.045172620297965e-06, + "loss": 1.097, + "step": 19340 + }, + { + "epoch": 1.85, + "grad_norm": 0.3216064207242713, + "learning_rate": 3.041299635872019e-06, + "loss": 1.0258, + "step": 19341 + }, + { + "epoch": 1.85, + "grad_norm": 0.2842374468960603, + "learning_rate": 3.037429077898457e-06, + "loss": 0.9181, + "step": 19342 + }, + { + "epoch": 1.85, + "grad_norm": 0.3401192545346754, + "learning_rate": 3.03356094647419e-06, + "loss": 0.978, + "step": 19343 + }, + { + "epoch": 1.85, + "grad_norm": 0.34694735314354114, + "learning_rate": 3.0296952416959845e-06, + "loss": 0.9606, + "step": 19344 + }, + { + "epoch": 1.85, + "grad_norm": 0.32427881235310824, + "learning_rate": 3.025831963660619e-06, + "loss": 1.0782, + "step": 19345 + }, + { + "epoch": 1.85, + "grad_norm": 0.3246021334296683, + "learning_rate": 3.02197111246475e-06, + "loss": 1.1181, + "step": 19346 + }, + { + "epoch": 1.85, + "grad_norm": 0.29672803337241976, + "learning_rate": 3.018112688204999e-06, + "loss": 1.0681, + "step": 19347 + }, + { + "epoch": 1.85, + "grad_norm": 0.2981046918204405, + "learning_rate": 3.0142566909779236e-06, + "loss": 0.9104, + "step": 19348 + }, + { + "epoch": 1.85, + "grad_norm": 0.35995576803066104, + "learning_rate": 3.010403120880034e-06, + "loss": 0.9451, + "step": 19349 + }, + { + "epoch": 1.85, + "grad_norm": 0.28106618834362634, + "learning_rate": 3.0065519780077544e-06, + "loss": 0.9697, + "step": 19350 + }, + { + "epoch": 1.85, + "grad_norm": 0.2982939603890545, + "learning_rate": 3.002703262457485e-06, + "loss": 0.9566, + "step": 19351 + }, + { + "epoch": 1.85, + "grad_norm": 0.3015440624711365, + "learning_rate": 2.9988569743255145e-06, + "loss": 0.9774, + "step": 19352 + }, + { + "epoch": 1.85, + "grad_norm": 0.3070071530417846, + "learning_rate": 2.9950131137081227e-06, + "loss": 1.0031, + "step": 19353 + }, + { + "epoch": 1.85, + "grad_norm": 0.3036430264138528, + "learning_rate": 2.9911716807014767e-06, + "loss": 0.9918, + "step": 19354 + }, + { + "epoch": 1.85, + "grad_norm": 0.33485401816986354, + "learning_rate": 2.9873326754017327e-06, + "loss": 1.0821, + "step": 19355 + }, + { + "epoch": 1.85, + "grad_norm": 0.34508935271370783, + "learning_rate": 2.9834960979049586e-06, + "loss": 1.0241, + "step": 19356 + }, + { + "epoch": 1.85, + "grad_norm": 0.33474945014895413, + "learning_rate": 2.9796619483071774e-06, + "loss": 1.0163, + "step": 19357 + }, + { + "epoch": 1.85, + "grad_norm": 0.3024758514287145, + "learning_rate": 2.9758302267043237e-06, + "loss": 0.8906, + "step": 19358 + }, + { + "epoch": 1.85, + "grad_norm": 0.3648710937266806, + "learning_rate": 2.972000933192287e-06, + "loss": 0.9425, + "step": 19359 + }, + { + "epoch": 1.85, + "grad_norm": 0.27837770233213427, + "learning_rate": 2.9681740678669245e-06, + "loss": 1.0559, + "step": 19360 + }, + { + "epoch": 1.85, + "grad_norm": 0.3148608090051896, + "learning_rate": 2.9643496308239703e-06, + "loss": 0.9032, + "step": 19361 + }, + { + "epoch": 1.85, + "grad_norm": 0.3059120290595125, + "learning_rate": 2.9605276221591705e-06, + "loss": 1.0528, + "step": 19362 + }, + { + "epoch": 1.85, + "grad_norm": 0.3176514368903455, + "learning_rate": 2.956708041968126e-06, + "loss": 1.0948, + "step": 19363 + }, + { + "epoch": 1.85, + "grad_norm": 0.32542647808716, + "learning_rate": 2.9528908903464826e-06, + "loss": 0.9933, + "step": 19364 + }, + { + "epoch": 1.85, + "grad_norm": 0.3521099880692741, + "learning_rate": 2.9490761673897193e-06, + "loss": 0.8795, + "step": 19365 + }, + { + "epoch": 1.85, + "grad_norm": 0.3061353281854188, + "learning_rate": 2.945263873193327e-06, + "loss": 0.9996, + "step": 19366 + }, + { + "epoch": 1.85, + "grad_norm": 0.3130181426484674, + "learning_rate": 2.941454007852695e-06, + "loss": 1.0185, + "step": 19367 + }, + { + "epoch": 1.85, + "grad_norm": 0.27557011315663693, + "learning_rate": 2.937646571463193e-06, + "loss": 1.0412, + "step": 19368 + }, + { + "epoch": 1.85, + "grad_norm": 0.2731230430420238, + "learning_rate": 2.9338415641200544e-06, + "loss": 1.033, + "step": 19369 + }, + { + "epoch": 1.85, + "grad_norm": 0.30858561077119684, + "learning_rate": 2.9300389859185596e-06, + "loss": 1.0655, + "step": 19370 + }, + { + "epoch": 1.85, + "grad_norm": 0.282303719203961, + "learning_rate": 2.926238836953843e-06, + "loss": 1.0341, + "step": 19371 + }, + { + "epoch": 1.85, + "grad_norm": 0.32087676283924527, + "learning_rate": 2.9224411173210066e-06, + "loss": 1.0065, + "step": 19372 + }, + { + "epoch": 1.85, + "grad_norm": 0.31726888804578224, + "learning_rate": 2.9186458271150853e-06, + "loss": 0.9551, + "step": 19373 + }, + { + "epoch": 1.85, + "grad_norm": 0.3389793950877645, + "learning_rate": 2.914852966431081e-06, + "loss": 1.0183, + "step": 19374 + }, + { + "epoch": 1.85, + "grad_norm": 0.3080983945068258, + "learning_rate": 2.911062535363884e-06, + "loss": 0.983, + "step": 19375 + }, + { + "epoch": 1.85, + "grad_norm": 0.3553774377668871, + "learning_rate": 2.907274534008364e-06, + "loss": 0.9511, + "step": 19376 + }, + { + "epoch": 1.85, + "grad_norm": 0.29982866224775356, + "learning_rate": 2.903488962459322e-06, + "loss": 0.9331, + "step": 19377 + }, + { + "epoch": 1.85, + "grad_norm": 0.3061669070569564, + "learning_rate": 2.8997058208114935e-06, + "loss": 0.9391, + "step": 19378 + }, + { + "epoch": 1.85, + "grad_norm": 0.35554637754779406, + "learning_rate": 2.8959251091595476e-06, + "loss": 1.0426, + "step": 19379 + }, + { + "epoch": 1.85, + "grad_norm": 0.30858179399281216, + "learning_rate": 2.8921468275981077e-06, + "loss": 0.9917, + "step": 19380 + }, + { + "epoch": 1.85, + "grad_norm": 0.3278568511904419, + "learning_rate": 2.8883709762217214e-06, + "loss": 1.044, + "step": 19381 + }, + { + "epoch": 1.85, + "grad_norm": 0.36147989310022766, + "learning_rate": 2.8845975551248904e-06, + "loss": 0.9796, + "step": 19382 + }, + { + "epoch": 1.85, + "grad_norm": 0.2974561133209812, + "learning_rate": 2.8808265644020283e-06, + "loss": 1.0478, + "step": 19383 + }, + { + "epoch": 1.85, + "grad_norm": 0.3407121112214055, + "learning_rate": 2.8770580041475147e-06, + "loss": 0.9037, + "step": 19384 + }, + { + "epoch": 1.85, + "grad_norm": 0.3095697785562637, + "learning_rate": 2.873291874455686e-06, + "loss": 1.0009, + "step": 19385 + }, + { + "epoch": 1.85, + "grad_norm": 0.3338170808159102, + "learning_rate": 2.8695281754207548e-06, + "loss": 1.0441, + "step": 19386 + }, + { + "epoch": 1.85, + "grad_norm": 0.3194749984028576, + "learning_rate": 2.8657669071369352e-06, + "loss": 0.9397, + "step": 19387 + }, + { + "epoch": 1.85, + "grad_norm": 0.30626758490730877, + "learning_rate": 2.8620080696983408e-06, + "loss": 1.023, + "step": 19388 + }, + { + "epoch": 1.85, + "grad_norm": 0.3028594173608918, + "learning_rate": 2.858251663199041e-06, + "loss": 1.061, + "step": 19389 + }, + { + "epoch": 1.86, + "grad_norm": 0.33038739086373947, + "learning_rate": 2.8544976877330377e-06, + "loss": 0.8166, + "step": 19390 + }, + { + "epoch": 1.86, + "grad_norm": 0.3068760597781776, + "learning_rate": 2.8507461433943005e-06, + "loss": 0.9536, + "step": 19391 + }, + { + "epoch": 1.86, + "grad_norm": 0.2993565565011934, + "learning_rate": 2.846997030276677e-06, + "loss": 1.0059, + "step": 19392 + }, + { + "epoch": 1.86, + "grad_norm": 0.2867175341393999, + "learning_rate": 2.843250348474036e-06, + "loss": 0.9098, + "step": 19393 + }, + { + "epoch": 1.86, + "grad_norm": 0.33120453942412165, + "learning_rate": 2.839506098080091e-06, + "loss": 1.0055, + "step": 19394 + }, + { + "epoch": 1.86, + "grad_norm": 0.2964630708447631, + "learning_rate": 2.8357642791885908e-06, + "loss": 1.0199, + "step": 19395 + }, + { + "epoch": 1.86, + "grad_norm": 0.33102146757114104, + "learning_rate": 2.832024891893126e-06, + "loss": 1.0605, + "step": 19396 + }, + { + "epoch": 1.86, + "grad_norm": 0.34647115625688385, + "learning_rate": 2.8282879362873326e-06, + "loss": 1.0225, + "step": 19397 + }, + { + "epoch": 1.86, + "grad_norm": 0.3472286285262597, + "learning_rate": 2.824553412464692e-06, + "loss": 0.9575, + "step": 19398 + }, + { + "epoch": 1.86, + "grad_norm": 0.3359158440466367, + "learning_rate": 2.8208213205186853e-06, + "loss": 1.0933, + "step": 19399 + }, + { + "epoch": 1.86, + "grad_norm": 0.35141195318404467, + "learning_rate": 2.8170916605426922e-06, + "loss": 1.1356, + "step": 19400 + }, + { + "epoch": 1.86, + "grad_norm": 0.2969513857612717, + "learning_rate": 2.81336443263005e-06, + "loss": 1.0011, + "step": 19401 + }, + { + "epoch": 1.86, + "grad_norm": 0.31343442217458767, + "learning_rate": 2.8096396368740507e-06, + "loss": 1.0167, + "step": 19402 + }, + { + "epoch": 1.86, + "grad_norm": 0.3299823075244418, + "learning_rate": 2.8059172733678973e-06, + "loss": 1.0649, + "step": 19403 + }, + { + "epoch": 1.86, + "grad_norm": 0.35767599816714507, + "learning_rate": 2.8021973422047485e-06, + "loss": 1.0278, + "step": 19404 + }, + { + "epoch": 1.86, + "grad_norm": 0.30346874329881335, + "learning_rate": 2.798479843477697e-06, + "loss": 1.0417, + "step": 19405 + }, + { + "epoch": 1.86, + "grad_norm": 0.36105703616291895, + "learning_rate": 2.79476477727979e-06, + "loss": 1.0099, + "step": 19406 + }, + { + "epoch": 1.86, + "grad_norm": 0.266979306062002, + "learning_rate": 2.791052143703976e-06, + "loss": 1.0218, + "step": 19407 + }, + { + "epoch": 1.86, + "grad_norm": 0.3048078982448064, + "learning_rate": 2.78734194284318e-06, + "loss": 1.0737, + "step": 19408 + }, + { + "epoch": 1.86, + "grad_norm": 0.31784774853963493, + "learning_rate": 2.7836341747902394e-06, + "loss": 0.9938, + "step": 19409 + }, + { + "epoch": 1.86, + "grad_norm": 0.32105932213018645, + "learning_rate": 2.7799288396379576e-06, + "loss": 1.0166, + "step": 19410 + }, + { + "epoch": 1.86, + "grad_norm": 0.2993689932323707, + "learning_rate": 2.776225937479049e-06, + "loss": 0.9839, + "step": 19411 + }, + { + "epoch": 1.86, + "grad_norm": 0.3228167024482697, + "learning_rate": 2.7725254684061953e-06, + "loss": 0.9903, + "step": 19412 + }, + { + "epoch": 1.86, + "grad_norm": 0.35739979536147926, + "learning_rate": 2.768827432512e-06, + "loss": 0.9609, + "step": 19413 + }, + { + "epoch": 1.86, + "grad_norm": 0.3167443101066417, + "learning_rate": 2.7651318298890116e-06, + "loss": 0.986, + "step": 19414 + }, + { + "epoch": 1.86, + "grad_norm": 0.34728622111886315, + "learning_rate": 2.7614386606296894e-06, + "loss": 1.0337, + "step": 19415 + }, + { + "epoch": 1.86, + "grad_norm": 0.2973320090305171, + "learning_rate": 2.7577479248264925e-06, + "loss": 1.1366, + "step": 19416 + }, + { + "epoch": 1.86, + "grad_norm": 0.3326955863728945, + "learning_rate": 2.7540596225717586e-06, + "loss": 1.0221, + "step": 19417 + }, + { + "epoch": 1.86, + "grad_norm": 0.34829335827876257, + "learning_rate": 2.7503737539578023e-06, + "loss": 1.0244, + "step": 19418 + }, + { + "epoch": 1.86, + "grad_norm": 0.2784418062196841, + "learning_rate": 2.746690319076861e-06, + "loss": 1.0458, + "step": 19419 + }, + { + "epoch": 1.86, + "grad_norm": 0.3193394460779293, + "learning_rate": 2.743009318021128e-06, + "loss": 0.9327, + "step": 19420 + }, + { + "epoch": 1.86, + "grad_norm": 0.3430140611643731, + "learning_rate": 2.739330750882696e-06, + "loss": 1.0755, + "step": 19421 + }, + { + "epoch": 1.86, + "grad_norm": 0.3135910247334878, + "learning_rate": 2.7356546177536466e-06, + "loss": 0.941, + "step": 19422 + }, + { + "epoch": 1.86, + "grad_norm": 0.3052494235994109, + "learning_rate": 2.7319809187259626e-06, + "loss": 1.0234, + "step": 19423 + }, + { + "epoch": 1.86, + "grad_norm": 0.3078453779631307, + "learning_rate": 2.728309653891592e-06, + "loss": 1.0066, + "step": 19424 + }, + { + "epoch": 1.86, + "grad_norm": 0.3011563771969503, + "learning_rate": 2.724640823342395e-06, + "loss": 1.0714, + "step": 19425 + }, + { + "epoch": 1.86, + "grad_norm": 0.29021036571300035, + "learning_rate": 2.7209744271702087e-06, + "loss": 1.0585, + "step": 19426 + }, + { + "epoch": 1.86, + "grad_norm": 0.3588586108815132, + "learning_rate": 2.7173104654667714e-06, + "loss": 1.0429, + "step": 19427 + }, + { + "epoch": 1.86, + "grad_norm": 0.3738473115621421, + "learning_rate": 2.7136489383237762e-06, + "loss": 0.9332, + "step": 19428 + }, + { + "epoch": 1.86, + "grad_norm": 0.3314624625875106, + "learning_rate": 2.709989845832872e-06, + "loss": 1.0915, + "step": 19429 + }, + { + "epoch": 1.86, + "grad_norm": 0.37019998682937244, + "learning_rate": 2.706333188085586e-06, + "loss": 1.0548, + "step": 19430 + }, + { + "epoch": 1.86, + "grad_norm": 0.27370928459721383, + "learning_rate": 2.7026789651734887e-06, + "loss": 0.9482, + "step": 19431 + }, + { + "epoch": 1.86, + "grad_norm": 0.3507789342242378, + "learning_rate": 2.699027177187985e-06, + "loss": 0.9862, + "step": 19432 + }, + { + "epoch": 1.86, + "grad_norm": 0.31592793846527, + "learning_rate": 2.695377824220491e-06, + "loss": 1.057, + "step": 19433 + }, + { + "epoch": 1.86, + "grad_norm": 0.33939417628523105, + "learning_rate": 2.6917309063623e-06, + "loss": 1.1224, + "step": 19434 + }, + { + "epoch": 1.86, + "grad_norm": 0.2870282443134659, + "learning_rate": 2.6880864237047167e-06, + "loss": 0.9078, + "step": 19435 + }, + { + "epoch": 1.86, + "grad_norm": 0.2911164041326248, + "learning_rate": 2.684444376338924e-06, + "loss": 1.0344, + "step": 19436 + }, + { + "epoch": 1.86, + "grad_norm": 0.34464916928369554, + "learning_rate": 2.6808047643560598e-06, + "loss": 0.9716, + "step": 19437 + }, + { + "epoch": 1.86, + "grad_norm": 0.288700980445573, + "learning_rate": 2.6771675878472293e-06, + "loss": 1.0061, + "step": 19438 + }, + { + "epoch": 1.86, + "grad_norm": 0.3510486786167023, + "learning_rate": 2.673532846903448e-06, + "loss": 1.0982, + "step": 19439 + }, + { + "epoch": 1.86, + "grad_norm": 0.33984510458394507, + "learning_rate": 2.6699005416156665e-06, + "loss": 1.0376, + "step": 19440 + }, + { + "epoch": 1.86, + "grad_norm": 0.3117940928618853, + "learning_rate": 2.6662706720748e-06, + "loss": 1.0294, + "step": 19441 + }, + { + "epoch": 1.86, + "grad_norm": 0.306882788707057, + "learning_rate": 2.662643238371687e-06, + "loss": 1.041, + "step": 19442 + }, + { + "epoch": 1.86, + "grad_norm": 0.3747334898121617, + "learning_rate": 2.6590182405970887e-06, + "loss": 0.9986, + "step": 19443 + }, + { + "epoch": 1.86, + "grad_norm": 0.33102149724490054, + "learning_rate": 2.6553956788417435e-06, + "loss": 1.1001, + "step": 19444 + }, + { + "epoch": 1.86, + "grad_norm": 0.30896415743016287, + "learning_rate": 2.6517755531962896e-06, + "loss": 1.0325, + "step": 19445 + }, + { + "epoch": 1.86, + "grad_norm": 0.26218378787565727, + "learning_rate": 2.6481578637513547e-06, + "loss": 0.996, + "step": 19446 + }, + { + "epoch": 1.86, + "grad_norm": 0.2894575192314524, + "learning_rate": 2.6445426105974335e-06, + "loss": 0.9505, + "step": 19447 + }, + { + "epoch": 1.86, + "grad_norm": 0.32895922107914793, + "learning_rate": 2.6409297938250312e-06, + "loss": 1.0192, + "step": 19448 + }, + { + "epoch": 1.86, + "grad_norm": 0.3224490370808495, + "learning_rate": 2.6373194135245414e-06, + "loss": 1.1345, + "step": 19449 + }, + { + "epoch": 1.86, + "grad_norm": 0.30146411854011046, + "learning_rate": 2.6337114697863264e-06, + "loss": 0.8798, + "step": 19450 + }, + { + "epoch": 1.86, + "grad_norm": 0.3535085366196113, + "learning_rate": 2.6301059627006574e-06, + "loss": 1.131, + "step": 19451 + }, + { + "epoch": 1.86, + "grad_norm": 0.309640720944379, + "learning_rate": 2.626502892357807e-06, + "loss": 0.9705, + "step": 19452 + }, + { + "epoch": 1.86, + "grad_norm": 0.3160665908411708, + "learning_rate": 2.6229022588479035e-06, + "loss": 1.0353, + "step": 19453 + }, + { + "epoch": 1.86, + "grad_norm": 0.2852283653046714, + "learning_rate": 2.6193040622610854e-06, + "loss": 1.0499, + "step": 19454 + }, + { + "epoch": 1.86, + "grad_norm": 0.3074067747443144, + "learning_rate": 2.61570830268737e-06, + "loss": 0.9399, + "step": 19455 + }, + { + "epoch": 1.86, + "grad_norm": 0.27520588727726686, + "learning_rate": 2.612114980216773e-06, + "loss": 1.0465, + "step": 19456 + }, + { + "epoch": 1.86, + "grad_norm": 0.30205581547031146, + "learning_rate": 2.60852409493918e-06, + "loss": 1.1, + "step": 19457 + }, + { + "epoch": 1.86, + "grad_norm": 0.3333202470180738, + "learning_rate": 2.6049356469444953e-06, + "loss": 0.9491, + "step": 19458 + }, + { + "epoch": 1.86, + "grad_norm": 0.3420477326268531, + "learning_rate": 2.601349636322503e-06, + "loss": 1.0813, + "step": 19459 + }, + { + "epoch": 1.86, + "grad_norm": 0.31685664135627983, + "learning_rate": 2.597766063162954e-06, + "loss": 1.0257, + "step": 19460 + }, + { + "epoch": 1.86, + "grad_norm": 0.3315192753798576, + "learning_rate": 2.5941849275555208e-06, + "loss": 0.9328, + "step": 19461 + }, + { + "epoch": 1.86, + "grad_norm": 0.28543956737353166, + "learning_rate": 2.5906062295898202e-06, + "loss": 0.9654, + "step": 19462 + }, + { + "epoch": 1.86, + "grad_norm": 0.35021106961958803, + "learning_rate": 2.5870299693554368e-06, + "loss": 0.9814, + "step": 19463 + }, + { + "epoch": 1.86, + "grad_norm": 0.31892345804192684, + "learning_rate": 2.583456146941832e-06, + "loss": 0.9804, + "step": 19464 + }, + { + "epoch": 1.86, + "grad_norm": 0.30906220648916044, + "learning_rate": 2.579884762438467e-06, + "loss": 0.9955, + "step": 19465 + }, + { + "epoch": 1.86, + "grad_norm": 0.3310448434119203, + "learning_rate": 2.5763158159347158e-06, + "loss": 0.9567, + "step": 19466 + }, + { + "epoch": 1.86, + "grad_norm": 0.30164087987014304, + "learning_rate": 2.572749307519895e-06, + "loss": 0.9971, + "step": 19467 + }, + { + "epoch": 1.86, + "grad_norm": 0.296171656318707, + "learning_rate": 2.569185237283234e-06, + "loss": 1.0479, + "step": 19468 + }, + { + "epoch": 1.86, + "grad_norm": 0.33256921819124297, + "learning_rate": 2.5656236053139714e-06, + "loss": 1.0658, + "step": 19469 + }, + { + "epoch": 1.86, + "grad_norm": 0.302649816437568, + "learning_rate": 2.5620644117011927e-06, + "loss": 1.0835, + "step": 19470 + }, + { + "epoch": 1.86, + "grad_norm": 0.3075463719707451, + "learning_rate": 2.558507656533993e-06, + "loss": 1.1066, + "step": 19471 + }, + { + "epoch": 1.86, + "grad_norm": 0.29695505663552985, + "learning_rate": 2.554953339901378e-06, + "loss": 0.9585, + "step": 19472 + }, + { + "epoch": 1.86, + "grad_norm": 0.32704111352259674, + "learning_rate": 2.5514014618923e-06, + "loss": 0.9443, + "step": 19473 + }, + { + "epoch": 1.86, + "grad_norm": 0.34481331881650235, + "learning_rate": 2.5478520225956425e-06, + "loss": 1.1454, + "step": 19474 + }, + { + "epoch": 1.86, + "grad_norm": 0.33761031403413827, + "learning_rate": 2.544305022100246e-06, + "loss": 0.9667, + "step": 19475 + }, + { + "epoch": 1.86, + "grad_norm": 0.304564898846172, + "learning_rate": 2.540760460494862e-06, + "loss": 1.03, + "step": 19476 + }, + { + "epoch": 1.86, + "grad_norm": 0.2803425398101807, + "learning_rate": 2.537218337868197e-06, + "loss": 1.021, + "step": 19477 + }, + { + "epoch": 1.86, + "grad_norm": 0.32604416752316734, + "learning_rate": 2.5336786543088797e-06, + "loss": 0.9787, + "step": 19478 + }, + { + "epoch": 1.86, + "grad_norm": 0.29159309554753843, + "learning_rate": 2.530141409905529e-06, + "loss": 1.0856, + "step": 19479 + }, + { + "epoch": 1.86, + "grad_norm": 0.31437625949185566, + "learning_rate": 2.5266066047466285e-06, + "loss": 1.0497, + "step": 19480 + }, + { + "epoch": 1.86, + "grad_norm": 0.3363623919129498, + "learning_rate": 2.5230742389206752e-06, + "loss": 1.0862, + "step": 19481 + }, + { + "epoch": 1.86, + "grad_norm": 0.342902061558253, + "learning_rate": 2.519544312516042e-06, + "loss": 1.0497, + "step": 19482 + }, + { + "epoch": 1.86, + "grad_norm": 0.3405025887205538, + "learning_rate": 2.5160168256210815e-06, + "loss": 0.9646, + "step": 19483 + }, + { + "epoch": 1.86, + "grad_norm": 0.30212839078534753, + "learning_rate": 2.5124917783240665e-06, + "loss": 1.0258, + "step": 19484 + }, + { + "epoch": 1.86, + "grad_norm": 0.3267755850425377, + "learning_rate": 2.5089691707132045e-06, + "loss": 1.0112, + "step": 19485 + }, + { + "epoch": 1.86, + "grad_norm": 0.2892509017036252, + "learning_rate": 2.5054490028766696e-06, + "loss": 1.0959, + "step": 19486 + }, + { + "epoch": 1.86, + "grad_norm": 0.2959065850308231, + "learning_rate": 2.501931274902536e-06, + "loss": 0.9928, + "step": 19487 + }, + { + "epoch": 1.86, + "grad_norm": 0.3234854026554827, + "learning_rate": 2.498415986878866e-06, + "loss": 1.0714, + "step": 19488 + }, + { + "epoch": 1.86, + "grad_norm": 0.293109095796281, + "learning_rate": 2.4949031388936005e-06, + "loss": 0.9656, + "step": 19489 + }, + { + "epoch": 1.86, + "grad_norm": 0.3276427226798143, + "learning_rate": 2.4913927310346695e-06, + "loss": 1.0024, + "step": 19490 + }, + { + "epoch": 1.86, + "grad_norm": 0.32678976320213343, + "learning_rate": 2.4878847633899026e-06, + "loss": 1.0616, + "step": 19491 + }, + { + "epoch": 1.86, + "grad_norm": 0.32057003109695315, + "learning_rate": 2.4843792360471187e-06, + "loss": 0.9557, + "step": 19492 + }, + { + "epoch": 1.86, + "grad_norm": 0.31816517842838815, + "learning_rate": 2.4808761490940135e-06, + "loss": 1.0053, + "step": 19493 + }, + { + "epoch": 1.87, + "grad_norm": 0.3141195202214637, + "learning_rate": 2.4773755026182956e-06, + "loss": 1.0623, + "step": 19494 + }, + { + "epoch": 1.87, + "grad_norm": 0.3145641537086415, + "learning_rate": 2.4738772967075274e-06, + "loss": 1.0363, + "step": 19495 + }, + { + "epoch": 1.87, + "grad_norm": 0.2804678524555286, + "learning_rate": 2.4703815314492838e-06, + "loss": 1.1116, + "step": 19496 + }, + { + "epoch": 1.87, + "grad_norm": 0.3144102998738556, + "learning_rate": 2.4668882069310394e-06, + "loss": 1.0408, + "step": 19497 + }, + { + "epoch": 1.87, + "grad_norm": 0.3351868400211666, + "learning_rate": 2.4633973232402017e-06, + "loss": 1.0492, + "step": 19498 + }, + { + "epoch": 1.87, + "grad_norm": 0.3224057294362687, + "learning_rate": 2.4599088804641456e-06, + "loss": 1.0149, + "step": 19499 + }, + { + "epoch": 1.87, + "grad_norm": 0.3227730801494362, + "learning_rate": 2.45642287869019e-06, + "loss": 1.064, + "step": 19500 + }, + { + "epoch": 1.87, + "grad_norm": 0.3609953009763253, + "learning_rate": 2.4529393180055425e-06, + "loss": 1.0538, + "step": 19501 + }, + { + "epoch": 1.87, + "grad_norm": 0.25777504696717024, + "learning_rate": 2.449458198497401e-06, + "loss": 0.8141, + "step": 19502 + }, + { + "epoch": 1.87, + "grad_norm": 0.30800533271193364, + "learning_rate": 2.445979520252861e-06, + "loss": 1.01, + "step": 19503 + }, + { + "epoch": 1.87, + "grad_norm": 0.33621861547161785, + "learning_rate": 2.44250328335901e-06, + "loss": 1.1084, + "step": 19504 + }, + { + "epoch": 1.87, + "grad_norm": 0.31891158938881264, + "learning_rate": 2.4390294879028107e-06, + "loss": 1.1149, + "step": 19505 + }, + { + "epoch": 1.87, + "grad_norm": 0.34846826600779773, + "learning_rate": 2.4355581339712386e-06, + "loss": 1.032, + "step": 19506 + }, + { + "epoch": 1.87, + "grad_norm": 0.3048822376402507, + "learning_rate": 2.4320892216511127e-06, + "loss": 0.9269, + "step": 19507 + }, + { + "epoch": 1.87, + "grad_norm": 0.37374911199310284, + "learning_rate": 2.4286227510292857e-06, + "loss": 0.9926, + "step": 19508 + }, + { + "epoch": 1.87, + "grad_norm": 0.3173773357413452, + "learning_rate": 2.4251587221925108e-06, + "loss": 1.1156, + "step": 19509 + }, + { + "epoch": 1.87, + "grad_norm": 0.29974535616835063, + "learning_rate": 2.421697135227441e-06, + "loss": 1.0446, + "step": 19510 + }, + { + "epoch": 1.87, + "grad_norm": 0.3517321956800445, + "learning_rate": 2.4182379902207397e-06, + "loss": 0.9048, + "step": 19511 + }, + { + "epoch": 1.87, + "grad_norm": 0.30004819755870443, + "learning_rate": 2.414781287258938e-06, + "loss": 1.0667, + "step": 19512 + }, + { + "epoch": 1.87, + "grad_norm": 0.3663130461111478, + "learning_rate": 2.411327026428589e-06, + "loss": 1.0032, + "step": 19513 + }, + { + "epoch": 1.87, + "grad_norm": 0.3391513053442959, + "learning_rate": 2.4078752078161016e-06, + "loss": 0.8484, + "step": 19514 + }, + { + "epoch": 1.87, + "grad_norm": 0.3088593374677847, + "learning_rate": 2.404425831507884e-06, + "loss": 0.9299, + "step": 19515 + }, + { + "epoch": 1.87, + "grad_norm": 0.3665675794354239, + "learning_rate": 2.4009788975902337e-06, + "loss": 1.0884, + "step": 19516 + }, + { + "epoch": 1.87, + "grad_norm": 0.30833964967259525, + "learning_rate": 2.3975344061494378e-06, + "loss": 0.954, + "step": 19517 + }, + { + "epoch": 1.87, + "grad_norm": 0.3051108889008445, + "learning_rate": 2.39409235727166e-06, + "loss": 0.9212, + "step": 19518 + }, + { + "epoch": 1.87, + "grad_norm": 0.311110844119058, + "learning_rate": 2.3906527510430878e-06, + "loss": 1.0319, + "step": 19519 + }, + { + "epoch": 1.87, + "grad_norm": 0.2928718324682559, + "learning_rate": 2.3872155875497626e-06, + "loss": 1.0145, + "step": 19520 + }, + { + "epoch": 1.87, + "grad_norm": 0.3506563463753516, + "learning_rate": 2.383780866877727e-06, + "loss": 1.1035, + "step": 19521 + }, + { + "epoch": 1.87, + "grad_norm": 0.3730599872109506, + "learning_rate": 2.3803485891129127e-06, + "loss": 1.1252, + "step": 19522 + }, + { + "epoch": 1.87, + "grad_norm": 0.30143104926800157, + "learning_rate": 2.3769187543412396e-06, + "loss": 0.9147, + "step": 19523 + }, + { + "epoch": 1.87, + "grad_norm": 0.30341252363658305, + "learning_rate": 2.373491362648517e-06, + "loss": 0.9646, + "step": 19524 + }, + { + "epoch": 1.87, + "grad_norm": 0.32666940975479325, + "learning_rate": 2.37006641412052e-06, + "loss": 0.9565, + "step": 19525 + }, + { + "epoch": 1.87, + "grad_norm": 0.3396342305324626, + "learning_rate": 2.3666439088429803e-06, + "loss": 1.1003, + "step": 19526 + }, + { + "epoch": 1.87, + "grad_norm": 0.2560960797656069, + "learning_rate": 2.363223846901541e-06, + "loss": 1.0004, + "step": 19527 + }, + { + "epoch": 1.87, + "grad_norm": 0.2721592146231591, + "learning_rate": 2.359806228381789e-06, + "loss": 0.9925, + "step": 19528 + }, + { + "epoch": 1.87, + "grad_norm": 0.32226118633845, + "learning_rate": 2.3563910533692446e-06, + "loss": 0.8332, + "step": 19529 + }, + { + "epoch": 1.87, + "grad_norm": 0.31067317707510955, + "learning_rate": 2.352978321949384e-06, + "loss": 0.9298, + "step": 19530 + }, + { + "epoch": 1.87, + "grad_norm": 0.31356620067567464, + "learning_rate": 2.3495680342076054e-06, + "loss": 0.9996, + "step": 19531 + }, + { + "epoch": 1.87, + "grad_norm": 0.30903911250829075, + "learning_rate": 2.346160190229252e-06, + "loss": 0.9592, + "step": 19532 + }, + { + "epoch": 1.87, + "grad_norm": 0.3347076067924358, + "learning_rate": 2.3427547900996216e-06, + "loss": 0.9888, + "step": 19533 + }, + { + "epoch": 1.87, + "grad_norm": 0.3020990199256019, + "learning_rate": 2.3393518339039354e-06, + "loss": 1.0407, + "step": 19534 + }, + { + "epoch": 1.87, + "grad_norm": 0.27534808169095093, + "learning_rate": 2.3359513217273367e-06, + "loss": 0.9216, + "step": 19535 + }, + { + "epoch": 1.87, + "grad_norm": 0.32632340875340543, + "learning_rate": 2.3325532536549454e-06, + "loss": 0.8831, + "step": 19536 + }, + { + "epoch": 1.87, + "grad_norm": 0.3020329191165336, + "learning_rate": 2.329157629771772e-06, + "loss": 1.0227, + "step": 19537 + }, + { + "epoch": 1.87, + "grad_norm": 0.35212186263116585, + "learning_rate": 2.3257644501628373e-06, + "loss": 1.0597, + "step": 19538 + }, + { + "epoch": 1.87, + "grad_norm": 0.2773457203537905, + "learning_rate": 2.3223737149130064e-06, + "loss": 0.9018, + "step": 19539 + }, + { + "epoch": 1.87, + "grad_norm": 0.34345291362108865, + "learning_rate": 2.3189854241071783e-06, + "loss": 0.8861, + "step": 19540 + }, + { + "epoch": 1.87, + "grad_norm": 0.28740571041956026, + "learning_rate": 2.315599577830119e-06, + "loss": 1.0549, + "step": 19541 + }, + { + "epoch": 1.87, + "grad_norm": 0.31843725108847015, + "learning_rate": 2.3122161761665926e-06, + "loss": 1.0472, + "step": 19542 + }, + { + "epoch": 1.87, + "grad_norm": 0.3167585135664699, + "learning_rate": 2.308835219201233e-06, + "loss": 1.0507, + "step": 19543 + }, + { + "epoch": 1.87, + "grad_norm": 0.3046040659399777, + "learning_rate": 2.3054567070186715e-06, + "loss": 1.0606, + "step": 19544 + }, + { + "epoch": 1.87, + "grad_norm": 0.3412623321429467, + "learning_rate": 2.3020806397034635e-06, + "loss": 0.9757, + "step": 19545 + }, + { + "epoch": 1.87, + "grad_norm": 0.3411443385261567, + "learning_rate": 2.298707017340085e-06, + "loss": 1.0589, + "step": 19546 + }, + { + "epoch": 1.87, + "grad_norm": 0.2802444279652665, + "learning_rate": 2.2953358400129578e-06, + "loss": 1.0502, + "step": 19547 + }, + { + "epoch": 1.87, + "grad_norm": 0.3653172570501627, + "learning_rate": 2.2919671078064584e-06, + "loss": 1.0694, + "step": 19548 + }, + { + "epoch": 1.87, + "grad_norm": 0.33075200451647147, + "learning_rate": 2.288600820804898e-06, + "loss": 0.8616, + "step": 19549 + }, + { + "epoch": 1.87, + "grad_norm": 0.3122596301668421, + "learning_rate": 2.285236979092498e-06, + "loss": 0.913, + "step": 19550 + }, + { + "epoch": 1.87, + "grad_norm": 0.29716115722812014, + "learning_rate": 2.281875582753468e-06, + "loss": 0.9813, + "step": 19551 + }, + { + "epoch": 1.87, + "grad_norm": 0.32560269515512763, + "learning_rate": 2.2785166318718965e-06, + "loss": 1.0958, + "step": 19552 + }, + { + "epoch": 1.87, + "grad_norm": 0.33716278389474785, + "learning_rate": 2.2751601265318836e-06, + "loss": 0.9966, + "step": 19553 + }, + { + "epoch": 1.87, + "grad_norm": 0.29771315189800196, + "learning_rate": 2.2718060668173946e-06, + "loss": 1.0401, + "step": 19554 + }, + { + "epoch": 1.87, + "grad_norm": 0.3247362463092197, + "learning_rate": 2.2684544528123854e-06, + "loss": 1.0421, + "step": 19555 + }, + { + "epoch": 1.87, + "grad_norm": 0.29845982879666894, + "learning_rate": 2.2651052846007215e-06, + "loss": 0.9012, + "step": 19556 + }, + { + "epoch": 1.87, + "grad_norm": 0.3346484054008134, + "learning_rate": 2.2617585622662253e-06, + "loss": 1.0249, + "step": 19557 + }, + { + "epoch": 1.87, + "grad_norm": 0.30821302544857454, + "learning_rate": 2.2584142858926405e-06, + "loss": 1.0068, + "step": 19558 + }, + { + "epoch": 1.87, + "grad_norm": 0.287408440001934, + "learning_rate": 2.255072455563667e-06, + "loss": 0.9683, + "step": 19559 + }, + { + "epoch": 1.87, + "grad_norm": 0.2944103890219427, + "learning_rate": 2.251733071362938e-06, + "loss": 1.047, + "step": 19560 + }, + { + "epoch": 1.87, + "grad_norm": 0.3120026631557725, + "learning_rate": 2.248396133374031e-06, + "loss": 1.0181, + "step": 19561 + }, + { + "epoch": 1.87, + "grad_norm": 0.3051150121973897, + "learning_rate": 2.2450616416804347e-06, + "loss": 0.9924, + "step": 19562 + }, + { + "epoch": 1.87, + "grad_norm": 0.3442210737221149, + "learning_rate": 2.241729596365616e-06, + "loss": 0.8436, + "step": 19563 + }, + { + "epoch": 1.87, + "grad_norm": 0.32209520691079296, + "learning_rate": 2.238399997512941e-06, + "loss": 0.9377, + "step": 19564 + }, + { + "epoch": 1.87, + "grad_norm": 0.33341904761466906, + "learning_rate": 2.2350728452057545e-06, + "loss": 1.0056, + "step": 19565 + }, + { + "epoch": 1.87, + "grad_norm": 0.31863608902220475, + "learning_rate": 2.231748139527301e-06, + "loss": 1.0822, + "step": 19566 + }, + { + "epoch": 1.87, + "grad_norm": 0.31684023938393024, + "learning_rate": 2.2284258805608137e-06, + "loss": 1.0965, + "step": 19567 + }, + { + "epoch": 1.87, + "grad_norm": 0.2975651710729903, + "learning_rate": 2.2251060683894044e-06, + "loss": 1.0392, + "step": 19568 + }, + { + "epoch": 1.87, + "grad_norm": 0.3477673918758594, + "learning_rate": 2.2217887030961614e-06, + "loss": 1.0637, + "step": 19569 + }, + { + "epoch": 1.87, + "grad_norm": 0.3049810761272185, + "learning_rate": 2.2184737847641192e-06, + "loss": 1.0151, + "step": 19570 + }, + { + "epoch": 1.87, + "grad_norm": 0.32982612345888884, + "learning_rate": 2.2151613134762216e-06, + "loss": 0.9833, + "step": 19571 + }, + { + "epoch": 1.87, + "grad_norm": 0.33426371946182437, + "learning_rate": 2.2118512893153696e-06, + "loss": 1.03, + "step": 19572 + }, + { + "epoch": 1.87, + "grad_norm": 0.3120971544655318, + "learning_rate": 2.208543712364375e-06, + "loss": 0.943, + "step": 19573 + }, + { + "epoch": 1.87, + "grad_norm": 0.343798072929666, + "learning_rate": 2.2052385827060596e-06, + "loss": 1.0835, + "step": 19574 + }, + { + "epoch": 1.87, + "grad_norm": 0.2924518658690372, + "learning_rate": 2.201935900423091e-06, + "loss": 0.899, + "step": 19575 + }, + { + "epoch": 1.87, + "grad_norm": 0.3494584133476198, + "learning_rate": 2.1986356655981587e-06, + "loss": 0.9492, + "step": 19576 + }, + { + "epoch": 1.87, + "grad_norm": 0.29197768592559553, + "learning_rate": 2.19533787831383e-06, + "loss": 1.0737, + "step": 19577 + }, + { + "epoch": 1.87, + "grad_norm": 0.3465176844364263, + "learning_rate": 2.1920425386526388e-06, + "loss": 1.025, + "step": 19578 + }, + { + "epoch": 1.87, + "grad_norm": 0.3095791705276766, + "learning_rate": 2.1887496466970304e-06, + "loss": 1.036, + "step": 19579 + }, + { + "epoch": 1.87, + "grad_norm": 0.3218301081937091, + "learning_rate": 2.1854592025294605e-06, + "loss": 0.967, + "step": 19580 + }, + { + "epoch": 1.87, + "grad_norm": 0.35017885753015826, + "learning_rate": 2.182171206232242e-06, + "loss": 0.8943, + "step": 19581 + }, + { + "epoch": 1.87, + "grad_norm": 0.29513390129921774, + "learning_rate": 2.178885657887664e-06, + "loss": 0.9244, + "step": 19582 + }, + { + "epoch": 1.87, + "grad_norm": 0.3311480171869876, + "learning_rate": 2.1756025575779495e-06, + "loss": 1.0358, + "step": 19583 + }, + { + "epoch": 1.87, + "grad_norm": 0.26837397452615946, + "learning_rate": 2.1723219053852663e-06, + "loss": 1.01, + "step": 19584 + }, + { + "epoch": 1.87, + "grad_norm": 0.28931269545770916, + "learning_rate": 2.1690437013917044e-06, + "loss": 1.0388, + "step": 19585 + }, + { + "epoch": 1.87, + "grad_norm": 0.36001592077483563, + "learning_rate": 2.165767945679309e-06, + "loss": 1.0205, + "step": 19586 + }, + { + "epoch": 1.87, + "grad_norm": 0.31545054690914914, + "learning_rate": 2.1624946383300594e-06, + "loss": 0.9914, + "step": 19587 + }, + { + "epoch": 1.87, + "grad_norm": 0.2746082905204076, + "learning_rate": 2.1592237794258674e-06, + "loss": 0.9763, + "step": 19588 + }, + { + "epoch": 1.87, + "grad_norm": 0.2947733329437678, + "learning_rate": 2.15595536904859e-06, + "loss": 0.8994, + "step": 19589 + }, + { + "epoch": 1.87, + "grad_norm": 0.35219671241224826, + "learning_rate": 2.1526894072800286e-06, + "loss": 0.9978, + "step": 19590 + }, + { + "epoch": 1.87, + "grad_norm": 0.3086210553494151, + "learning_rate": 2.1494258942019062e-06, + "loss": 1.0103, + "step": 19591 + }, + { + "epoch": 1.87, + "grad_norm": 0.3216821816242064, + "learning_rate": 2.1461648298958916e-06, + "loss": 0.9599, + "step": 19592 + }, + { + "epoch": 1.87, + "grad_norm": 0.299484927365542, + "learning_rate": 2.1429062144436074e-06, + "loss": 0.9915, + "step": 19593 + }, + { + "epoch": 1.87, + "grad_norm": 0.26788434902358504, + "learning_rate": 2.139650047926589e-06, + "loss": 0.9667, + "step": 19594 + }, + { + "epoch": 1.87, + "grad_norm": 0.3570360984296879, + "learning_rate": 2.136396330426349e-06, + "loss": 1.0214, + "step": 19595 + }, + { + "epoch": 1.87, + "grad_norm": 0.30151322005241354, + "learning_rate": 2.133145062024278e-06, + "loss": 0.957, + "step": 19596 + }, + { + "epoch": 1.87, + "grad_norm": 0.30175171945668855, + "learning_rate": 2.1298962428017767e-06, + "loss": 0.9879, + "step": 19597 + }, + { + "epoch": 1.87, + "grad_norm": 0.3190672341437996, + "learning_rate": 2.126649872840114e-06, + "loss": 1.089, + "step": 19598 + }, + { + "epoch": 1.88, + "grad_norm": 0.31926825047572666, + "learning_rate": 2.1234059522205583e-06, + "loss": 1.1298, + "step": 19599 + }, + { + "epoch": 1.88, + "grad_norm": 0.3297305869381328, + "learning_rate": 2.120164481024267e-06, + "loss": 1.0247, + "step": 19600 + }, + { + "epoch": 1.88, + "grad_norm": 0.37516438038660643, + "learning_rate": 2.116925459332386e-06, + "loss": 1.0716, + "step": 19601 + }, + { + "epoch": 1.88, + "grad_norm": 0.3475838437332945, + "learning_rate": 2.113688887225962e-06, + "loss": 1.009, + "step": 19602 + }, + { + "epoch": 1.88, + "grad_norm": 0.35591200890021696, + "learning_rate": 2.1104547647859964e-06, + "loss": 1.1895, + "step": 19603 + }, + { + "epoch": 1.88, + "grad_norm": 0.30373122493766597, + "learning_rate": 2.107223092093413e-06, + "loss": 1.1037, + "step": 19604 + }, + { + "epoch": 1.88, + "grad_norm": 0.3590195637188887, + "learning_rate": 2.103993869229115e-06, + "loss": 1.1019, + "step": 19605 + }, + { + "epoch": 1.88, + "grad_norm": 0.3600139551836037, + "learning_rate": 2.1007670962738702e-06, + "loss": 1.0289, + "step": 19606 + }, + { + "epoch": 1.88, + "grad_norm": 0.3326317789622163, + "learning_rate": 2.0975427733084698e-06, + "loss": 1.0867, + "step": 19607 + }, + { + "epoch": 1.88, + "grad_norm": 0.262498608437869, + "learning_rate": 2.094320900413582e-06, + "loss": 1.0623, + "step": 19608 + }, + { + "epoch": 1.88, + "grad_norm": 0.3049379934334317, + "learning_rate": 2.0911014776698543e-06, + "loss": 1.1131, + "step": 19609 + }, + { + "epoch": 1.88, + "grad_norm": 0.2838737813779652, + "learning_rate": 2.0878845051578446e-06, + "loss": 1.0357, + "step": 19610 + }, + { + "epoch": 1.88, + "grad_norm": 0.2925220918421898, + "learning_rate": 2.084669982958054e-06, + "loss": 1.044, + "step": 19611 + }, + { + "epoch": 1.88, + "grad_norm": 0.29522068118542166, + "learning_rate": 2.0814579111509415e-06, + "loss": 0.9994, + "step": 19612 + }, + { + "epoch": 1.88, + "grad_norm": 0.33576694047983235, + "learning_rate": 2.0782482898168866e-06, + "loss": 1.0431, + "step": 19613 + }, + { + "epoch": 1.88, + "grad_norm": 0.30778188218794333, + "learning_rate": 2.075041119036192e-06, + "loss": 1.0515, + "step": 19614 + }, + { + "epoch": 1.88, + "grad_norm": 0.33171168102841325, + "learning_rate": 2.0718363988891486e-06, + "loss": 0.999, + "step": 19615 + }, + { + "epoch": 1.88, + "grad_norm": 0.30668492048396384, + "learning_rate": 2.0686341294559595e-06, + "loss": 0.9949, + "step": 19616 + }, + { + "epoch": 1.88, + "grad_norm": 0.32119356685753664, + "learning_rate": 2.0654343108167273e-06, + "loss": 1.1409, + "step": 19617 + }, + { + "epoch": 1.88, + "grad_norm": 0.31931072627457363, + "learning_rate": 2.0622369430515655e-06, + "loss": 0.9092, + "step": 19618 + }, + { + "epoch": 1.88, + "grad_norm": 0.25709875263268417, + "learning_rate": 2.0590420262404653e-06, + "loss": 0.9573, + "step": 19619 + }, + { + "epoch": 1.88, + "grad_norm": 0.31274246542978573, + "learning_rate": 2.0558495604633966e-06, + "loss": 0.9648, + "step": 19620 + }, + { + "epoch": 1.88, + "grad_norm": 0.321771233685784, + "learning_rate": 2.0526595458002394e-06, + "loss": 1.0595, + "step": 19621 + }, + { + "epoch": 1.88, + "grad_norm": 0.306607124554197, + "learning_rate": 2.049471982330853e-06, + "loss": 1.0393, + "step": 19622 + }, + { + "epoch": 1.88, + "grad_norm": 0.32651846672524826, + "learning_rate": 2.0462868701349724e-06, + "loss": 0.9903, + "step": 19623 + }, + { + "epoch": 1.88, + "grad_norm": 0.31065258746073837, + "learning_rate": 2.0431042092923457e-06, + "loss": 1.0922, + "step": 19624 + }, + { + "epoch": 1.88, + "grad_norm": 0.3056313254641626, + "learning_rate": 2.039923999882576e-06, + "loss": 1.108, + "step": 19625 + }, + { + "epoch": 1.88, + "grad_norm": 0.27578146128481196, + "learning_rate": 2.036746241985288e-06, + "loss": 0.9658, + "step": 19626 + }, + { + "epoch": 1.88, + "grad_norm": 0.31558763880248997, + "learning_rate": 2.033570935679985e-06, + "loss": 0.9334, + "step": 19627 + }, + { + "epoch": 1.88, + "grad_norm": 0.3211141629141735, + "learning_rate": 2.0303980810461474e-06, + "loss": 1.0772, + "step": 19628 + }, + { + "epoch": 1.88, + "grad_norm": 0.3391125766860313, + "learning_rate": 2.0272276781631573e-06, + "loss": 1.0934, + "step": 19629 + }, + { + "epoch": 1.88, + "grad_norm": 0.31917973523562104, + "learning_rate": 2.024059727110372e-06, + "loss": 1.0762, + "step": 19630 + }, + { + "epoch": 1.88, + "grad_norm": 0.2984353886949142, + "learning_rate": 2.020894227967085e-06, + "loss": 1.0871, + "step": 19631 + }, + { + "epoch": 1.88, + "grad_norm": 0.32616110358322853, + "learning_rate": 2.0177311808124768e-06, + "loss": 1.0693, + "step": 19632 + }, + { + "epoch": 1.88, + "grad_norm": 0.32046662122226227, + "learning_rate": 2.014570585725739e-06, + "loss": 1.0, + "step": 19633 + }, + { + "epoch": 1.88, + "grad_norm": 0.3234333833393245, + "learning_rate": 2.0114124427859317e-06, + "loss": 1.0885, + "step": 19634 + }, + { + "epoch": 1.88, + "grad_norm": 0.2654403339811078, + "learning_rate": 2.008256752072135e-06, + "loss": 1.042, + "step": 19635 + }, + { + "epoch": 1.88, + "grad_norm": 0.32249664167958253, + "learning_rate": 2.005103513663287e-06, + "loss": 1.0408, + "step": 19636 + }, + { + "epoch": 1.88, + "grad_norm": 0.29070872369281436, + "learning_rate": 2.0019527276383234e-06, + "loss": 1.0127, + "step": 19637 + }, + { + "epoch": 1.88, + "grad_norm": 0.3181085188830054, + "learning_rate": 1.99880439407607e-06, + "loss": 1.0196, + "step": 19638 + }, + { + "epoch": 1.88, + "grad_norm": 0.3582036523550525, + "learning_rate": 1.995658513055332e-06, + "loss": 1.1028, + "step": 19639 + }, + { + "epoch": 1.88, + "grad_norm": 0.32460301649448964, + "learning_rate": 1.9925150846548226e-06, + "loss": 0.9958, + "step": 19640 + }, + { + "epoch": 1.88, + "grad_norm": 0.308870759810925, + "learning_rate": 1.989374108953235e-06, + "loss": 1.1049, + "step": 19641 + }, + { + "epoch": 1.88, + "grad_norm": 0.3590694634666669, + "learning_rate": 1.9862355860291505e-06, + "loss": 0.981, + "step": 19642 + }, + { + "epoch": 1.88, + "grad_norm": 0.3362388777073273, + "learning_rate": 1.983099515961129e-06, + "loss": 1.04, + "step": 19643 + }, + { + "epoch": 1.88, + "grad_norm": 0.3720658110818117, + "learning_rate": 1.9799658988276294e-06, + "loss": 0.993, + "step": 19644 + }, + { + "epoch": 1.88, + "grad_norm": 0.3601777611228878, + "learning_rate": 1.9768347347071004e-06, + "loss": 1.1178, + "step": 19645 + }, + { + "epoch": 1.88, + "grad_norm": 0.31679338342176966, + "learning_rate": 1.973706023677868e-06, + "loss": 0.8869, + "step": 19646 + }, + { + "epoch": 1.88, + "grad_norm": 0.3399670605141725, + "learning_rate": 1.970579765818259e-06, + "loss": 1.0074, + "step": 19647 + }, + { + "epoch": 1.88, + "grad_norm": 0.29282550526329887, + "learning_rate": 1.967455961206499e-06, + "loss": 1.0601, + "step": 19648 + }, + { + "epoch": 1.88, + "grad_norm": 0.32895776224896955, + "learning_rate": 1.9643346099207816e-06, + "loss": 1.0425, + "step": 19649 + }, + { + "epoch": 1.88, + "grad_norm": 0.29077387846222286, + "learning_rate": 1.961215712039177e-06, + "loss": 1.0492, + "step": 19650 + }, + { + "epoch": 1.88, + "grad_norm": 0.30287049484267786, + "learning_rate": 1.958099267639779e-06, + "loss": 1.0851, + "step": 19651 + }, + { + "epoch": 1.88, + "grad_norm": 0.36280043990939964, + "learning_rate": 1.9549852768005692e-06, + "loss": 1.0366, + "step": 19652 + }, + { + "epoch": 1.88, + "grad_norm": 0.33539276174636196, + "learning_rate": 1.9518737395994635e-06, + "loss": 1.005, + "step": 19653 + }, + { + "epoch": 1.88, + "grad_norm": 0.24579218438035402, + "learning_rate": 1.9487646561143325e-06, + "loss": 0.9323, + "step": 19654 + }, + { + "epoch": 1.88, + "grad_norm": 0.3474604869345327, + "learning_rate": 1.9456580264229916e-06, + "loss": 0.8681, + "step": 19655 + }, + { + "epoch": 1.88, + "grad_norm": 0.31272101459088975, + "learning_rate": 1.9425538506032013e-06, + "loss": 1.0379, + "step": 19656 + }, + { + "epoch": 1.88, + "grad_norm": 0.3020828652577108, + "learning_rate": 1.9394521287326105e-06, + "loss": 0.8589, + "step": 19657 + }, + { + "epoch": 1.88, + "grad_norm": 0.308475641186327, + "learning_rate": 1.9363528608888682e-06, + "loss": 0.955, + "step": 19658 + }, + { + "epoch": 1.88, + "grad_norm": 0.31428800357703035, + "learning_rate": 1.933256047149512e-06, + "loss": 1.0704, + "step": 19659 + }, + { + "epoch": 1.88, + "grad_norm": 0.31411641507788607, + "learning_rate": 1.930161687592069e-06, + "loss": 0.9641, + "step": 19660 + }, + { + "epoch": 1.88, + "grad_norm": 0.30355327388715214, + "learning_rate": 1.9270697822939556e-06, + "loss": 0.9185, + "step": 19661 + }, + { + "epoch": 1.88, + "grad_norm": 0.32961890561935697, + "learning_rate": 1.9239803313325642e-06, + "loss": 1.084, + "step": 19662 + }, + { + "epoch": 1.88, + "grad_norm": 0.3106499046860497, + "learning_rate": 1.920893334785201e-06, + "loss": 1.1493, + "step": 19663 + }, + { + "epoch": 1.88, + "grad_norm": 0.31708862305328334, + "learning_rate": 1.9178087927291253e-06, + "loss": 1.0795, + "step": 19664 + }, + { + "epoch": 1.88, + "grad_norm": 0.3068506429979387, + "learning_rate": 1.9147267052415207e-06, + "loss": 0.9757, + "step": 19665 + }, + { + "epoch": 1.88, + "grad_norm": 0.3051626108697318, + "learning_rate": 1.9116470723995362e-06, + "loss": 1.0126, + "step": 19666 + }, + { + "epoch": 1.88, + "grad_norm": 0.299766142225492, + "learning_rate": 1.908569894280221e-06, + "loss": 0.9775, + "step": 19667 + }, + { + "epoch": 1.88, + "grad_norm": 0.26437691015352943, + "learning_rate": 1.9054951709605918e-06, + "loss": 1.1297, + "step": 19668 + }, + { + "epoch": 1.88, + "grad_norm": 0.3405760236666578, + "learning_rate": 1.9024229025175865e-06, + "loss": 1.0503, + "step": 19669 + }, + { + "epoch": 1.88, + "grad_norm": 0.29905195363655634, + "learning_rate": 1.8993530890281219e-06, + "loss": 1.007, + "step": 19670 + }, + { + "epoch": 1.88, + "grad_norm": 0.3337370217121972, + "learning_rate": 1.8962857305689807e-06, + "loss": 1.0127, + "step": 19671 + }, + { + "epoch": 1.88, + "grad_norm": 0.294469142084537, + "learning_rate": 1.893220827216957e-06, + "loss": 1.0484, + "step": 19672 + }, + { + "epoch": 1.88, + "grad_norm": 0.2916213686281202, + "learning_rate": 1.8901583790487453e-06, + "loss": 0.9994, + "step": 19673 + }, + { + "epoch": 1.88, + "grad_norm": 0.3199485205761152, + "learning_rate": 1.887098386140962e-06, + "loss": 0.9845, + "step": 19674 + }, + { + "epoch": 1.88, + "grad_norm": 0.2932047700828409, + "learning_rate": 1.884040848570212e-06, + "loss": 0.9778, + "step": 19675 + }, + { + "epoch": 1.88, + "grad_norm": 0.37493392636135864, + "learning_rate": 1.8809857664130015e-06, + "loss": 0.9597, + "step": 19676 + }, + { + "epoch": 1.88, + "grad_norm": 0.3034704207371983, + "learning_rate": 1.877933139745791e-06, + "loss": 0.9432, + "step": 19677 + }, + { + "epoch": 1.88, + "grad_norm": 0.25457804697237235, + "learning_rate": 1.8748829686449754e-06, + "loss": 1.0595, + "step": 19678 + }, + { + "epoch": 1.88, + "grad_norm": 0.26483626122793724, + "learning_rate": 1.871835253186882e-06, + "loss": 0.9656, + "step": 19679 + }, + { + "epoch": 1.88, + "grad_norm": 0.29749171450999645, + "learning_rate": 1.8687899934477726e-06, + "loss": 0.9077, + "step": 19680 + }, + { + "epoch": 1.88, + "grad_norm": 0.3181044643416451, + "learning_rate": 1.8657471895038636e-06, + "loss": 1.0128, + "step": 19681 + }, + { + "epoch": 1.88, + "grad_norm": 0.26718768827400047, + "learning_rate": 1.8627068414313165e-06, + "loss": 1.0941, + "step": 19682 + }, + { + "epoch": 1.88, + "grad_norm": 0.30869670990902165, + "learning_rate": 1.8596689493062148e-06, + "loss": 0.9995, + "step": 19683 + }, + { + "epoch": 1.88, + "grad_norm": 0.3167287801854689, + "learning_rate": 1.8566335132045642e-06, + "loss": 1.0497, + "step": 19684 + }, + { + "epoch": 1.88, + "grad_norm": 0.2947383331720644, + "learning_rate": 1.8536005332023488e-06, + "loss": 1.065, + "step": 19685 + }, + { + "epoch": 1.88, + "grad_norm": 0.29100264241279444, + "learning_rate": 1.8505700093754519e-06, + "loss": 0.9938, + "step": 19686 + }, + { + "epoch": 1.88, + "grad_norm": 0.3134698541804527, + "learning_rate": 1.8475419417997354e-06, + "loss": 1.0116, + "step": 19687 + }, + { + "epoch": 1.88, + "grad_norm": 0.2730160620446061, + "learning_rate": 1.8445163305509604e-06, + "loss": 1.0201, + "step": 19688 + }, + { + "epoch": 1.88, + "grad_norm": 0.34977754388611737, + "learning_rate": 1.841493175704856e-06, + "loss": 0.945, + "step": 19689 + }, + { + "epoch": 1.88, + "grad_norm": 0.35336319324537635, + "learning_rate": 1.8384724773370722e-06, + "loss": 0.9946, + "step": 19690 + }, + { + "epoch": 1.88, + "grad_norm": 0.32698691957913967, + "learning_rate": 1.8354542355232152e-06, + "loss": 0.9856, + "step": 19691 + }, + { + "epoch": 1.88, + "grad_norm": 0.28939397415537876, + "learning_rate": 1.8324384503388025e-06, + "loss": 0.8726, + "step": 19692 + }, + { + "epoch": 1.88, + "grad_norm": 0.3106044226400935, + "learning_rate": 1.8294251218593183e-06, + "loss": 1.0788, + "step": 19693 + }, + { + "epoch": 1.88, + "grad_norm": 0.3459802629691805, + "learning_rate": 1.8264142501601689e-06, + "loss": 0.9343, + "step": 19694 + }, + { + "epoch": 1.88, + "grad_norm": 0.3119075306578258, + "learning_rate": 1.8234058353167049e-06, + "loss": 1.0215, + "step": 19695 + }, + { + "epoch": 1.88, + "grad_norm": 0.30601749646487114, + "learning_rate": 1.8203998774041997e-06, + "loss": 1.0994, + "step": 19696 + }, + { + "epoch": 1.88, + "grad_norm": 0.2789902901389461, + "learning_rate": 1.8173963764978929e-06, + "loss": 0.9934, + "step": 19697 + }, + { + "epoch": 1.88, + "grad_norm": 0.355339184005911, + "learning_rate": 1.8143953326729579e-06, + "loss": 1.0059, + "step": 19698 + }, + { + "epoch": 1.88, + "grad_norm": 0.2787532423786138, + "learning_rate": 1.8113967460044789e-06, + "loss": 1.076, + "step": 19699 + }, + { + "epoch": 1.88, + "grad_norm": 0.34994253559916877, + "learning_rate": 1.8084006165675182e-06, + "loss": 1.0048, + "step": 19700 + }, + { + "epoch": 1.88, + "grad_norm": 0.30331340440296467, + "learning_rate": 1.805406944437016e-06, + "loss": 1.0317, + "step": 19701 + }, + { + "epoch": 1.88, + "grad_norm": 0.3239378586732622, + "learning_rate": 1.8024157296879452e-06, + "loss": 0.9627, + "step": 19702 + }, + { + "epoch": 1.89, + "grad_norm": 0.3168559483116884, + "learning_rate": 1.799426972395124e-06, + "loss": 0.9932, + "step": 19703 + }, + { + "epoch": 1.89, + "grad_norm": 0.3096361734486429, + "learning_rate": 1.7964406726333704e-06, + "loss": 1.0139, + "step": 19704 + }, + { + "epoch": 1.89, + "grad_norm": 0.3132436339395305, + "learning_rate": 1.7934568304773913e-06, + "loss": 1.0236, + "step": 19705 + }, + { + "epoch": 1.89, + "grad_norm": 0.3286568409638329, + "learning_rate": 1.7904754460018935e-06, + "loss": 1.0618, + "step": 19706 + }, + { + "epoch": 1.89, + "grad_norm": 0.30130580969377696, + "learning_rate": 1.7874965192814508e-06, + "loss": 1.0057, + "step": 19707 + }, + { + "epoch": 1.89, + "grad_norm": 0.29041695012772956, + "learning_rate": 1.7845200503906367e-06, + "loss": 1.0093, + "step": 19708 + }, + { + "epoch": 1.89, + "grad_norm": 0.356792985371076, + "learning_rate": 1.7815460394039363e-06, + "loss": 1.0095, + "step": 19709 + }, + { + "epoch": 1.89, + "grad_norm": 0.3226565866572813, + "learning_rate": 1.77857448639579e-06, + "loss": 0.9386, + "step": 19710 + }, + { + "epoch": 1.89, + "grad_norm": 0.3771037185013905, + "learning_rate": 1.7756053914405268e-06, + "loss": 1.1159, + "step": 19711 + }, + { + "epoch": 1.89, + "grad_norm": 0.31489803437059444, + "learning_rate": 1.7726387546124768e-06, + "loss": 0.9318, + "step": 19712 + }, + { + "epoch": 1.89, + "grad_norm": 0.3025715471040609, + "learning_rate": 1.7696745759858802e-06, + "loss": 1.0335, + "step": 19713 + }, + { + "epoch": 1.89, + "grad_norm": 0.3755428052773073, + "learning_rate": 1.7667128556349e-06, + "loss": 0.9652, + "step": 19714 + }, + { + "epoch": 1.89, + "grad_norm": 0.33726372585066594, + "learning_rate": 1.7637535936336768e-06, + "loss": 0.8958, + "step": 19715 + }, + { + "epoch": 1.89, + "grad_norm": 0.31315798875183304, + "learning_rate": 1.7607967900562628e-06, + "loss": 0.9627, + "step": 19716 + }, + { + "epoch": 1.89, + "grad_norm": 0.33740420377603153, + "learning_rate": 1.757842444976643e-06, + "loss": 1.0199, + "step": 19717 + }, + { + "epoch": 1.89, + "grad_norm": 0.26173746593267716, + "learning_rate": 1.754890558468758e-06, + "loss": 1.0204, + "step": 19718 + }, + { + "epoch": 1.89, + "grad_norm": 0.32192401274203364, + "learning_rate": 1.7519411306064936e-06, + "loss": 1.0289, + "step": 19719 + }, + { + "epoch": 1.89, + "grad_norm": 0.335529767882214, + "learning_rate": 1.7489941614636351e-06, + "loss": 0.9832, + "step": 19720 + }, + { + "epoch": 1.89, + "grad_norm": 0.3719562245339558, + "learning_rate": 1.7460496511139567e-06, + "loss": 1.1166, + "step": 19721 + }, + { + "epoch": 1.89, + "grad_norm": 0.2947885715088386, + "learning_rate": 1.743107599631122e-06, + "loss": 1.0548, + "step": 19722 + }, + { + "epoch": 1.89, + "grad_norm": 0.33673010322620855, + "learning_rate": 1.7401680070887828e-06, + "loss": 1.1311, + "step": 19723 + }, + { + "epoch": 1.89, + "grad_norm": 0.2896661407396121, + "learning_rate": 1.7372308735604914e-06, + "loss": 1.0532, + "step": 19724 + }, + { + "epoch": 1.89, + "grad_norm": 0.2998460457886428, + "learning_rate": 1.734296199119756e-06, + "loss": 0.9936, + "step": 19725 + }, + { + "epoch": 1.89, + "grad_norm": 0.29035674298187175, + "learning_rate": 1.7313639838400176e-06, + "loss": 1.014, + "step": 19726 + }, + { + "epoch": 1.89, + "grad_norm": 0.2987153751013568, + "learning_rate": 1.7284342277946507e-06, + "loss": 0.9539, + "step": 19727 + }, + { + "epoch": 1.89, + "grad_norm": 0.30610903231631215, + "learning_rate": 1.725506931056975e-06, + "loss": 0.9466, + "step": 19728 + }, + { + "epoch": 1.89, + "grad_norm": 0.33586776668525137, + "learning_rate": 1.7225820937002536e-06, + "loss": 1.073, + "step": 19729 + }, + { + "epoch": 1.89, + "grad_norm": 0.3137006555072208, + "learning_rate": 1.7196597157976834e-06, + "loss": 0.9869, + "step": 19730 + }, + { + "epoch": 1.89, + "grad_norm": 0.35666963361267695, + "learning_rate": 1.7167397974223953e-06, + "loss": 1.0263, + "step": 19731 + }, + { + "epoch": 1.89, + "grad_norm": 0.33688207401768905, + "learning_rate": 1.7138223386474529e-06, + "loss": 1.0772, + "step": 19732 + }, + { + "epoch": 1.89, + "grad_norm": 0.37338443056004733, + "learning_rate": 1.7109073395458863e-06, + "loss": 0.8934, + "step": 19733 + }, + { + "epoch": 1.89, + "grad_norm": 0.3024419360180122, + "learning_rate": 1.7079948001906377e-06, + "loss": 0.9695, + "step": 19734 + }, + { + "epoch": 1.89, + "grad_norm": 0.32102770158133653, + "learning_rate": 1.7050847206545928e-06, + "loss": 1.0639, + "step": 19735 + }, + { + "epoch": 1.89, + "grad_norm": 0.36066879561157744, + "learning_rate": 1.7021771010105714e-06, + "loss": 1.0419, + "step": 19736 + }, + { + "epoch": 1.89, + "grad_norm": 0.3415900252851269, + "learning_rate": 1.6992719413313484e-06, + "loss": 1.1147, + "step": 19737 + }, + { + "epoch": 1.89, + "grad_norm": 0.29407688263524245, + "learning_rate": 1.6963692416896327e-06, + "loss": 1.0688, + "step": 19738 + }, + { + "epoch": 1.89, + "grad_norm": 0.32965233814952793, + "learning_rate": 1.6934690021580546e-06, + "loss": 1.0945, + "step": 19739 + }, + { + "epoch": 1.89, + "grad_norm": 0.3168416532206911, + "learning_rate": 1.6905712228092008e-06, + "loss": 1.0082, + "step": 19740 + }, + { + "epoch": 1.89, + "grad_norm": 0.3273336350804907, + "learning_rate": 1.6876759037155799e-06, + "loss": 0.85, + "step": 19741 + }, + { + "epoch": 1.89, + "grad_norm": 0.3582067691503246, + "learning_rate": 1.684783044949656e-06, + "loss": 1.061, + "step": 19742 + }, + { + "epoch": 1.89, + "grad_norm": 0.3138894461061521, + "learning_rate": 1.681892646583827e-06, + "loss": 0.9119, + "step": 19743 + }, + { + "epoch": 1.89, + "grad_norm": 0.31554061248603466, + "learning_rate": 1.6790047086904348e-06, + "loss": 1.0692, + "step": 19744 + }, + { + "epoch": 1.89, + "grad_norm": 0.3043950655873465, + "learning_rate": 1.6761192313417328e-06, + "loss": 0.9877, + "step": 19745 + }, + { + "epoch": 1.89, + "grad_norm": 0.28101243506496404, + "learning_rate": 1.6732362146099412e-06, + "loss": 0.8671, + "step": 19746 + }, + { + "epoch": 1.89, + "grad_norm": 0.36154608973671054, + "learning_rate": 1.670355658567213e-06, + "loss": 1.005, + "step": 19747 + }, + { + "epoch": 1.89, + "grad_norm": 0.2986116874606921, + "learning_rate": 1.6674775632856353e-06, + "loss": 0.9031, + "step": 19748 + }, + { + "epoch": 1.89, + "grad_norm": 0.3540181585760886, + "learning_rate": 1.6646019288372172e-06, + "loss": 1.0858, + "step": 19749 + }, + { + "epoch": 1.89, + "grad_norm": 0.26909736506423687, + "learning_rate": 1.6617287552939453e-06, + "loss": 0.9413, + "step": 19750 + }, + { + "epoch": 1.89, + "grad_norm": 0.3550706599446534, + "learning_rate": 1.6588580427277177e-06, + "loss": 0.9489, + "step": 19751 + }, + { + "epoch": 1.89, + "grad_norm": 0.32853129028637607, + "learning_rate": 1.6559897912103772e-06, + "loss": 1.0481, + "step": 19752 + }, + { + "epoch": 1.89, + "grad_norm": 0.3176002809225913, + "learning_rate": 1.6531240008136884e-06, + "loss": 0.9524, + "step": 19753 + }, + { + "epoch": 1.89, + "grad_norm": 0.33547509339132375, + "learning_rate": 1.6502606716093827e-06, + "loss": 0.9594, + "step": 19754 + }, + { + "epoch": 1.89, + "grad_norm": 0.28387427136030535, + "learning_rate": 1.647399803669114e-06, + "loss": 1.0041, + "step": 19755 + }, + { + "epoch": 1.89, + "grad_norm": 0.30044703530836375, + "learning_rate": 1.6445413970644808e-06, + "loss": 1.0214, + "step": 19756 + }, + { + "epoch": 1.89, + "grad_norm": 0.3203854127443591, + "learning_rate": 1.6416854518670144e-06, + "loss": 0.9806, + "step": 19757 + }, + { + "epoch": 1.89, + "grad_norm": 0.34379460546698826, + "learning_rate": 1.6388319681481802e-06, + "loss": 0.9641, + "step": 19758 + }, + { + "epoch": 1.89, + "grad_norm": 0.29311188534399923, + "learning_rate": 1.6359809459793985e-06, + "loss": 1.1712, + "step": 19759 + }, + { + "epoch": 1.89, + "grad_norm": 0.304754514356152, + "learning_rate": 1.6331323854320013e-06, + "loss": 1.0851, + "step": 19760 + }, + { + "epoch": 1.89, + "grad_norm": 0.30608027122670595, + "learning_rate": 1.6302862865773094e-06, + "loss": 0.8981, + "step": 19761 + }, + { + "epoch": 1.89, + "grad_norm": 0.32530946754632506, + "learning_rate": 1.6274426494864992e-06, + "loss": 1.0937, + "step": 19762 + }, + { + "epoch": 1.89, + "grad_norm": 0.3071926770214568, + "learning_rate": 1.6246014742307802e-06, + "loss": 0.9864, + "step": 19763 + }, + { + "epoch": 1.89, + "grad_norm": 0.3084297540337558, + "learning_rate": 1.621762760881229e-06, + "loss": 1.108, + "step": 19764 + }, + { + "epoch": 1.89, + "grad_norm": 0.30954006432376746, + "learning_rate": 1.6189265095089001e-06, + "loss": 0.9566, + "step": 19765 + }, + { + "epoch": 1.89, + "grad_norm": 0.3531300728939078, + "learning_rate": 1.6160927201847586e-06, + "loss": 0.9933, + "step": 19766 + }, + { + "epoch": 1.89, + "grad_norm": 0.3457046638225827, + "learning_rate": 1.613261392979737e-06, + "loss": 0.9349, + "step": 19767 + }, + { + "epoch": 1.89, + "grad_norm": 0.2934690846271425, + "learning_rate": 1.6104325279646782e-06, + "loss": 1.0074, + "step": 19768 + }, + { + "epoch": 1.89, + "grad_norm": 0.33225859522590984, + "learning_rate": 1.6076061252103813e-06, + "loss": 1.018, + "step": 19769 + }, + { + "epoch": 1.89, + "grad_norm": 0.3518673343637606, + "learning_rate": 1.6047821847875677e-06, + "loss": 0.9718, + "step": 19770 + }, + { + "epoch": 1.89, + "grad_norm": 0.28629410054467125, + "learning_rate": 1.6019607067669363e-06, + "loss": 0.9659, + "step": 19771 + }, + { + "epoch": 1.89, + "grad_norm": 0.29982623256428415, + "learning_rate": 1.599141691219075e-06, + "loss": 1.005, + "step": 19772 + }, + { + "epoch": 1.89, + "grad_norm": 0.3505731227557048, + "learning_rate": 1.5963251382145272e-06, + "loss": 1.045, + "step": 19773 + }, + { + "epoch": 1.89, + "grad_norm": 0.2797487142790766, + "learning_rate": 1.5935110478237925e-06, + "loss": 1.0074, + "step": 19774 + }, + { + "epoch": 1.89, + "grad_norm": 0.35305810167243007, + "learning_rate": 1.590699420117292e-06, + "loss": 0.9626, + "step": 19775 + }, + { + "epoch": 1.89, + "grad_norm": 0.3232007600854386, + "learning_rate": 1.5878902551653806e-06, + "loss": 0.9122, + "step": 19776 + }, + { + "epoch": 1.89, + "grad_norm": 0.2813847971191708, + "learning_rate": 1.585083553038369e-06, + "loss": 1.0888, + "step": 19777 + }, + { + "epoch": 1.89, + "grad_norm": 0.33458072267399697, + "learning_rate": 1.5822793138064896e-06, + "loss": 1.0772, + "step": 19778 + }, + { + "epoch": 1.89, + "grad_norm": 0.31717864716955246, + "learning_rate": 1.5794775375399196e-06, + "loss": 1.0512, + "step": 19779 + }, + { + "epoch": 1.89, + "grad_norm": 0.3293006200185384, + "learning_rate": 1.5766782243087919e-06, + "loss": 1.0391, + "step": 19780 + }, + { + "epoch": 1.89, + "grad_norm": 0.3430700181565964, + "learning_rate": 1.5738813741831394e-06, + "loss": 1.1134, + "step": 19781 + }, + { + "epoch": 1.89, + "grad_norm": 0.29016668502855814, + "learning_rate": 1.5710869872329726e-06, + "loss": 0.9554, + "step": 19782 + }, + { + "epoch": 1.89, + "grad_norm": 0.3155826093525495, + "learning_rate": 1.5682950635282024e-06, + "loss": 1.0781, + "step": 19783 + }, + { + "epoch": 1.89, + "grad_norm": 0.3418786975166509, + "learning_rate": 1.5655056031387173e-06, + "loss": 1.0117, + "step": 19784 + }, + { + "epoch": 1.89, + "grad_norm": 0.3299018853471425, + "learning_rate": 1.5627186061343168e-06, + "loss": 1.0253, + "step": 19785 + }, + { + "epoch": 1.89, + "grad_norm": 0.3254901295055549, + "learning_rate": 1.5599340725847566e-06, + "loss": 1.1105, + "step": 19786 + }, + { + "epoch": 1.89, + "grad_norm": 0.2795643157489255, + "learning_rate": 1.5571520025597031e-06, + "loss": 0.9342, + "step": 19787 + }, + { + "epoch": 1.89, + "grad_norm": 0.2844968571361711, + "learning_rate": 1.5543723961288004e-06, + "loss": 1.0272, + "step": 19788 + }, + { + "epoch": 1.89, + "grad_norm": 0.2897673932918432, + "learning_rate": 1.551595253361593e-06, + "loss": 1.0075, + "step": 19789 + }, + { + "epoch": 1.89, + "grad_norm": 0.31069590381072537, + "learning_rate": 1.5488205743275807e-06, + "loss": 0.9409, + "step": 19790 + }, + { + "epoch": 1.89, + "grad_norm": 0.3017014518874295, + "learning_rate": 1.5460483590962082e-06, + "loss": 0.9114, + "step": 19791 + }, + { + "epoch": 1.89, + "grad_norm": 0.31662691066639337, + "learning_rate": 1.5432786077368644e-06, + "loss": 0.945, + "step": 19792 + }, + { + "epoch": 1.89, + "grad_norm": 0.3047878921485926, + "learning_rate": 1.5405113203188381e-06, + "loss": 1.0664, + "step": 19793 + }, + { + "epoch": 1.89, + "grad_norm": 0.30314320799375655, + "learning_rate": 1.5377464969114075e-06, + "loss": 1.1004, + "step": 19794 + }, + { + "epoch": 1.89, + "grad_norm": 0.31969214921077627, + "learning_rate": 1.5349841375837392e-06, + "loss": 0.9937, + "step": 19795 + }, + { + "epoch": 1.89, + "grad_norm": 0.29384172759827476, + "learning_rate": 1.5322242424049783e-06, + "loss": 1.0009, + "step": 19796 + }, + { + "epoch": 1.89, + "grad_norm": 0.300356862633028, + "learning_rate": 1.5294668114441912e-06, + "loss": 1.0001, + "step": 19797 + }, + { + "epoch": 1.89, + "grad_norm": 0.31248657118242407, + "learning_rate": 1.52671184477039e-06, + "loss": 0.9995, + "step": 19798 + }, + { + "epoch": 1.89, + "grad_norm": 0.30305709947844645, + "learning_rate": 1.523959342452508e-06, + "loss": 1.1077, + "step": 19799 + }, + { + "epoch": 1.89, + "grad_norm": 0.3238188284337255, + "learning_rate": 1.5212093045594344e-06, + "loss": 1.0085, + "step": 19800 + }, + { + "epoch": 1.89, + "grad_norm": 0.32623209899621874, + "learning_rate": 1.5184617311599924e-06, + "loss": 1.0464, + "step": 19801 + }, + { + "epoch": 1.89, + "grad_norm": 0.33545962165432347, + "learning_rate": 1.515716622322927e-06, + "loss": 1.007, + "step": 19802 + }, + { + "epoch": 1.89, + "grad_norm": 0.33573442329404796, + "learning_rate": 1.5129739781169604e-06, + "loss": 0.998, + "step": 19803 + }, + { + "epoch": 1.89, + "grad_norm": 0.3131585090210633, + "learning_rate": 1.5102337986107052e-06, + "loss": 1.022, + "step": 19804 + }, + { + "epoch": 1.89, + "grad_norm": 0.27445282263163184, + "learning_rate": 1.5074960838727614e-06, + "loss": 1.0163, + "step": 19805 + }, + { + "epoch": 1.89, + "grad_norm": 0.30901675143704727, + "learning_rate": 1.504760833971619e-06, + "loss": 0.9701, + "step": 19806 + }, + { + "epoch": 1.89, + "grad_norm": 0.3199891924223123, + "learning_rate": 1.5020280489757566e-06, + "loss": 0.9461, + "step": 19807 + }, + { + "epoch": 1.9, + "grad_norm": 0.3255333366601719, + "learning_rate": 1.4992977289535192e-06, + "loss": 1.0887, + "step": 19808 + }, + { + "epoch": 1.9, + "grad_norm": 0.33418117650323426, + "learning_rate": 1.4965698739732858e-06, + "loss": 1.0758, + "step": 19809 + }, + { + "epoch": 1.9, + "grad_norm": 0.36341963752872886, + "learning_rate": 1.4938444841032796e-06, + "loss": 0.8613, + "step": 19810 + }, + { + "epoch": 1.9, + "grad_norm": 0.3203901581507369, + "learning_rate": 1.4911215594117344e-06, + "loss": 0.9484, + "step": 19811 + }, + { + "epoch": 1.9, + "grad_norm": 0.33195189978672834, + "learning_rate": 1.488401099966774e-06, + "loss": 1.0411, + "step": 19812 + }, + { + "epoch": 1.9, + "grad_norm": 0.2840307117091524, + "learning_rate": 1.4856831058364995e-06, + "loss": 0.9986, + "step": 19813 + }, + { + "epoch": 1.9, + "grad_norm": 0.28400788188472714, + "learning_rate": 1.4829675770889118e-06, + "loss": 0.9118, + "step": 19814 + }, + { + "epoch": 1.9, + "grad_norm": 0.3203184088936111, + "learning_rate": 1.4802545137919789e-06, + "loss": 1.0414, + "step": 19815 + }, + { + "epoch": 1.9, + "grad_norm": 0.34653774526234715, + "learning_rate": 1.4775439160135907e-06, + "loss": 1.0728, + "step": 19816 + }, + { + "epoch": 1.9, + "grad_norm": 0.3324766260948004, + "learning_rate": 1.474835783821582e-06, + "loss": 0.9718, + "step": 19817 + }, + { + "epoch": 1.9, + "grad_norm": 0.31788957554012864, + "learning_rate": 1.472130117283732e-06, + "loss": 1.0264, + "step": 19818 + }, + { + "epoch": 1.9, + "grad_norm": 0.3404375743523914, + "learning_rate": 1.4694269164677533e-06, + "loss": 0.8786, + "step": 19819 + }, + { + "epoch": 1.9, + "grad_norm": 0.3550283409427779, + "learning_rate": 1.4667261814412913e-06, + "loss": 1.0239, + "step": 19820 + }, + { + "epoch": 1.9, + "grad_norm": 0.3035113764991036, + "learning_rate": 1.4640279122719259e-06, + "loss": 0.9559, + "step": 19821 + }, + { + "epoch": 1.9, + "grad_norm": 0.2999427934356882, + "learning_rate": 1.4613321090271914e-06, + "loss": 1.1333, + "step": 19822 + }, + { + "epoch": 1.9, + "grad_norm": 0.33110020701508386, + "learning_rate": 1.4586387717745454e-06, + "loss": 1.0723, + "step": 19823 + }, + { + "epoch": 1.9, + "grad_norm": 0.312048285571724, + "learning_rate": 1.4559479005814003e-06, + "loss": 1.0073, + "step": 19824 + }, + { + "epoch": 1.9, + "grad_norm": 0.31733274239867776, + "learning_rate": 1.4532594955150914e-06, + "loss": 0.9898, + "step": 19825 + }, + { + "epoch": 1.9, + "grad_norm": 0.3178347492417246, + "learning_rate": 1.450573556642898e-06, + "loss": 1.1034, + "step": 19826 + }, + { + "epoch": 1.9, + "grad_norm": 0.28375576185439705, + "learning_rate": 1.4478900840320443e-06, + "loss": 0.9729, + "step": 19827 + }, + { + "epoch": 1.9, + "grad_norm": 0.2811522223114552, + "learning_rate": 1.4452090777496762e-06, + "loss": 1.0012, + "step": 19828 + }, + { + "epoch": 1.9, + "grad_norm": 0.3225750522735149, + "learning_rate": 1.442530537862885e-06, + "loss": 0.9382, + "step": 19829 + }, + { + "epoch": 1.9, + "grad_norm": 0.34221286412083435, + "learning_rate": 1.4398544644387059e-06, + "loss": 1.0132, + "step": 19830 + }, + { + "epoch": 1.9, + "grad_norm": 0.3370189376965778, + "learning_rate": 1.437180857544107e-06, + "loss": 1.1729, + "step": 19831 + }, + { + "epoch": 1.9, + "grad_norm": 0.30143880242119203, + "learning_rate": 1.434509717246013e-06, + "loss": 1.1002, + "step": 19832 + }, + { + "epoch": 1.9, + "grad_norm": 0.33527025148581413, + "learning_rate": 1.4318410436112595e-06, + "loss": 1.0816, + "step": 19833 + }, + { + "epoch": 1.9, + "grad_norm": 0.32505370110069337, + "learning_rate": 1.429174836706626e-06, + "loss": 0.9893, + "step": 19834 + }, + { + "epoch": 1.9, + "grad_norm": 0.3664137887860694, + "learning_rate": 1.426511096598848e-06, + "loss": 1.0068, + "step": 19835 + }, + { + "epoch": 1.9, + "grad_norm": 0.31115121411846786, + "learning_rate": 1.4238498233545727e-06, + "loss": 0.9728, + "step": 19836 + }, + { + "epoch": 1.9, + "grad_norm": 0.2750989699084077, + "learning_rate": 1.421191017040402e-06, + "loss": 0.9958, + "step": 19837 + }, + { + "epoch": 1.9, + "grad_norm": 0.3181136838706145, + "learning_rate": 1.4185346777229047e-06, + "loss": 1.0138, + "step": 19838 + }, + { + "epoch": 1.9, + "grad_norm": 0.3564168446516408, + "learning_rate": 1.4158808054685168e-06, + "loss": 1.0099, + "step": 19839 + }, + { + "epoch": 1.9, + "grad_norm": 0.30891182110494714, + "learning_rate": 1.413229400343663e-06, + "loss": 1.0459, + "step": 19840 + }, + { + "epoch": 1.9, + "grad_norm": 0.34876497553597324, + "learning_rate": 1.4105804624147234e-06, + "loss": 0.999, + "step": 19841 + }, + { + "epoch": 1.9, + "grad_norm": 0.30801980025511716, + "learning_rate": 1.4079339917479562e-06, + "loss": 0.9841, + "step": 19842 + }, + { + "epoch": 1.9, + "grad_norm": 0.312815357439583, + "learning_rate": 1.4052899884096082e-06, + "loss": 1.105, + "step": 19843 + }, + { + "epoch": 1.9, + "grad_norm": 0.34579609999762123, + "learning_rate": 1.4026484524658267e-06, + "loss": 1.1394, + "step": 19844 + }, + { + "epoch": 1.9, + "grad_norm": 0.31024856518283606, + "learning_rate": 1.4000093839827477e-06, + "loss": 0.9571, + "step": 19845 + }, + { + "epoch": 1.9, + "grad_norm": 0.29200940658026897, + "learning_rate": 1.397372783026396e-06, + "loss": 1.0382, + "step": 19846 + }, + { + "epoch": 1.9, + "grad_norm": 0.2791135446190551, + "learning_rate": 1.3947386496627746e-06, + "loss": 0.9818, + "step": 19847 + }, + { + "epoch": 1.9, + "grad_norm": 0.2910297681996032, + "learning_rate": 1.3921069839577749e-06, + "loss": 0.8917, + "step": 19848 + }, + { + "epoch": 1.9, + "grad_norm": 0.3039542749284502, + "learning_rate": 1.3894777859772778e-06, + "loss": 0.9805, + "step": 19849 + }, + { + "epoch": 1.9, + "grad_norm": 0.37739262209656776, + "learning_rate": 1.3868510557870751e-06, + "loss": 1.0728, + "step": 19850 + }, + { + "epoch": 1.9, + "grad_norm": 0.3465400302970859, + "learning_rate": 1.384226793452892e-06, + "loss": 1.0093, + "step": 19851 + }, + { + "epoch": 1.9, + "grad_norm": 0.28377063430474125, + "learning_rate": 1.381604999040409e-06, + "loss": 0.9915, + "step": 19852 + }, + { + "epoch": 1.9, + "grad_norm": 0.32154551530347425, + "learning_rate": 1.3789856726152517e-06, + "loss": 0.985, + "step": 19853 + }, + { + "epoch": 1.9, + "grad_norm": 0.34560001073508684, + "learning_rate": 1.3763688142429566e-06, + "loss": 1.0621, + "step": 19854 + }, + { + "epoch": 1.9, + "grad_norm": 0.34000355799070087, + "learning_rate": 1.3737544239890154e-06, + "loss": 0.9007, + "step": 19855 + }, + { + "epoch": 1.9, + "grad_norm": 0.2638557792211161, + "learning_rate": 1.371142501918843e-06, + "loss": 1.0512, + "step": 19856 + }, + { + "epoch": 1.9, + "grad_norm": 0.3477348849398422, + "learning_rate": 1.3685330480978197e-06, + "loss": 1.0249, + "step": 19857 + }, + { + "epoch": 1.9, + "grad_norm": 0.29308645596093147, + "learning_rate": 1.3659260625912495e-06, + "loss": 0.9419, + "step": 19858 + }, + { + "epoch": 1.9, + "grad_norm": 0.34384620442807823, + "learning_rate": 1.363321545464369e-06, + "loss": 0.9943, + "step": 19859 + }, + { + "epoch": 1.9, + "grad_norm": 0.3265981582268954, + "learning_rate": 1.360719496782359e-06, + "loss": 1.0017, + "step": 19860 + }, + { + "epoch": 1.9, + "grad_norm": 0.3121963003845591, + "learning_rate": 1.358119916610323e-06, + "loss": 0.9191, + "step": 19861 + }, + { + "epoch": 1.9, + "grad_norm": 0.30668486073695916, + "learning_rate": 1.355522805013354e-06, + "loss": 0.9681, + "step": 19862 + }, + { + "epoch": 1.9, + "grad_norm": 0.32114383210196523, + "learning_rate": 1.3529281620563994e-06, + "loss": 0.9251, + "step": 19863 + }, + { + "epoch": 1.9, + "grad_norm": 0.33316972797814776, + "learning_rate": 1.3503359878044185e-06, + "loss": 1.059, + "step": 19864 + }, + { + "epoch": 1.9, + "grad_norm": 0.33030590635310547, + "learning_rate": 1.347746282322282e-06, + "loss": 0.9726, + "step": 19865 + }, + { + "epoch": 1.9, + "grad_norm": 0.3313886165786226, + "learning_rate": 1.3451590456748043e-06, + "loss": 0.9041, + "step": 19866 + }, + { + "epoch": 1.9, + "grad_norm": 0.34466997152851003, + "learning_rate": 1.3425742779267115e-06, + "loss": 1.0538, + "step": 19867 + }, + { + "epoch": 1.9, + "grad_norm": 0.33033190277422514, + "learning_rate": 1.3399919791427184e-06, + "loss": 1.0682, + "step": 19868 + }, + { + "epoch": 1.9, + "grad_norm": 0.2920235072263877, + "learning_rate": 1.337412149387418e-06, + "loss": 0.9688, + "step": 19869 + }, + { + "epoch": 1.9, + "grad_norm": 0.31757442744414316, + "learning_rate": 1.334834788725392e-06, + "loss": 0.9573, + "step": 19870 + }, + { + "epoch": 1.9, + "grad_norm": 0.2771948385313479, + "learning_rate": 1.3322598972211108e-06, + "loss": 0.991, + "step": 19871 + }, + { + "epoch": 1.9, + "grad_norm": 0.3379129578354819, + "learning_rate": 1.3296874749390676e-06, + "loss": 0.9978, + "step": 19872 + }, + { + "epoch": 1.9, + "grad_norm": 0.33384065083673686, + "learning_rate": 1.3271175219435882e-06, + "loss": 0.9314, + "step": 19873 + }, + { + "epoch": 1.9, + "grad_norm": 0.32214332516789346, + "learning_rate": 1.3245500382990217e-06, + "loss": 1.0777, + "step": 19874 + }, + { + "epoch": 1.9, + "grad_norm": 0.32241664698365924, + "learning_rate": 1.321985024069594e-06, + "loss": 1.0014, + "step": 19875 + }, + { + "epoch": 1.9, + "grad_norm": 0.2965994883537121, + "learning_rate": 1.3194224793195099e-06, + "loss": 1.0976, + "step": 19876 + }, + { + "epoch": 1.9, + "grad_norm": 0.3218406378370473, + "learning_rate": 1.3168624041128951e-06, + "loss": 0.9111, + "step": 19877 + }, + { + "epoch": 1.9, + "grad_norm": 0.29974715917369993, + "learning_rate": 1.3143047985138213e-06, + "loss": 1.0952, + "step": 19878 + }, + { + "epoch": 1.9, + "grad_norm": 0.33665511904941453, + "learning_rate": 1.3117496625862923e-06, + "loss": 1.1052, + "step": 19879 + }, + { + "epoch": 1.9, + "grad_norm": 0.3093140128759133, + "learning_rate": 1.3091969963942464e-06, + "loss": 1.0418, + "step": 19880 + }, + { + "epoch": 1.9, + "grad_norm": 0.339372977851298, + "learning_rate": 1.3066468000015764e-06, + "loss": 0.9948, + "step": 19881 + }, + { + "epoch": 1.9, + "grad_norm": 0.32297386199168154, + "learning_rate": 1.304099073472087e-06, + "loss": 0.9957, + "step": 19882 + }, + { + "epoch": 1.9, + "grad_norm": 0.3646551846203165, + "learning_rate": 1.3015538168695608e-06, + "loss": 1.0761, + "step": 19883 + }, + { + "epoch": 1.9, + "grad_norm": 0.32062185010440336, + "learning_rate": 1.2990110302576687e-06, + "loss": 0.9754, + "step": 19884 + }, + { + "epoch": 1.9, + "grad_norm": 0.29304867493406694, + "learning_rate": 1.2964707137000598e-06, + "loss": 1.176, + "step": 19885 + }, + { + "epoch": 1.9, + "grad_norm": 0.3474736460609039, + "learning_rate": 1.2939328672603056e-06, + "loss": 0.9516, + "step": 19886 + }, + { + "epoch": 1.9, + "grad_norm": 0.3355891060258038, + "learning_rate": 1.2913974910019223e-06, + "loss": 0.9793, + "step": 19887 + }, + { + "epoch": 1.9, + "grad_norm": 0.3001830336064506, + "learning_rate": 1.2888645849883364e-06, + "loss": 0.9125, + "step": 19888 + }, + { + "epoch": 1.9, + "grad_norm": 0.3267114086727877, + "learning_rate": 1.286334149282975e-06, + "loss": 0.997, + "step": 19889 + }, + { + "epoch": 1.9, + "grad_norm": 0.3450379023073134, + "learning_rate": 1.2838061839491211e-06, + "loss": 1.0301, + "step": 19890 + }, + { + "epoch": 1.9, + "grad_norm": 0.31944738483796437, + "learning_rate": 1.2812806890500572e-06, + "loss": 0.9173, + "step": 19891 + }, + { + "epoch": 1.9, + "grad_norm": 0.30961866935779925, + "learning_rate": 1.2787576646489995e-06, + "loss": 1.1072, + "step": 19892 + }, + { + "epoch": 1.9, + "grad_norm": 0.3061448238766944, + "learning_rate": 1.2762371108090754e-06, + "loss": 1.0795, + "step": 19893 + }, + { + "epoch": 1.9, + "grad_norm": 0.32018374344366807, + "learning_rate": 1.2737190275933452e-06, + "loss": 0.9044, + "step": 19894 + }, + { + "epoch": 1.9, + "grad_norm": 0.2853315533968125, + "learning_rate": 1.27120341506487e-06, + "loss": 0.9999, + "step": 19895 + }, + { + "epoch": 1.9, + "grad_norm": 0.32116741506111307, + "learning_rate": 1.2686902732865546e-06, + "loss": 1.0439, + "step": 19896 + }, + { + "epoch": 1.9, + "grad_norm": 0.2972530980447627, + "learning_rate": 1.2661796023213268e-06, + "loss": 0.9839, + "step": 19897 + }, + { + "epoch": 1.9, + "grad_norm": 0.3240003163340138, + "learning_rate": 1.2636714022320028e-06, + "loss": 1.0122, + "step": 19898 + }, + { + "epoch": 1.9, + "grad_norm": 0.3284803565362821, + "learning_rate": 1.2611656730813547e-06, + "loss": 0.9892, + "step": 19899 + }, + { + "epoch": 1.9, + "grad_norm": 0.33306198401107656, + "learning_rate": 1.2586624149320881e-06, + "loss": 1.0143, + "step": 19900 + }, + { + "epoch": 1.9, + "grad_norm": 0.3222233311463827, + "learning_rate": 1.2561616278468635e-06, + "loss": 1.0889, + "step": 19901 + }, + { + "epoch": 1.9, + "grad_norm": 0.31666573740779635, + "learning_rate": 1.2536633118882423e-06, + "loss": 1.0637, + "step": 19902 + }, + { + "epoch": 1.9, + "grad_norm": 0.35631302842763785, + "learning_rate": 1.2511674671187523e-06, + "loss": 1.0354, + "step": 19903 + }, + { + "epoch": 1.9, + "grad_norm": 0.32828871888057165, + "learning_rate": 1.2486740936008766e-06, + "loss": 1.092, + "step": 19904 + }, + { + "epoch": 1.9, + "grad_norm": 0.3438828324124662, + "learning_rate": 1.2461831913969767e-06, + "loss": 0.965, + "step": 19905 + }, + { + "epoch": 1.9, + "grad_norm": 0.3009865509024931, + "learning_rate": 1.2436947605694138e-06, + "loss": 1.1351, + "step": 19906 + }, + { + "epoch": 1.9, + "grad_norm": 0.3176561666525948, + "learning_rate": 1.241208801180449e-06, + "loss": 1.0199, + "step": 19907 + }, + { + "epoch": 1.9, + "grad_norm": 0.3253937632946344, + "learning_rate": 1.2387253132923103e-06, + "loss": 1.0763, + "step": 19908 + }, + { + "epoch": 1.9, + "grad_norm": 0.33064727904459495, + "learning_rate": 1.2362442969671372e-06, + "loss": 0.9285, + "step": 19909 + }, + { + "epoch": 1.9, + "grad_norm": 0.3146988282089934, + "learning_rate": 1.2337657522670242e-06, + "loss": 0.9443, + "step": 19910 + }, + { + "epoch": 1.9, + "grad_norm": 0.3620297969613435, + "learning_rate": 1.2312896792539885e-06, + "loss": 0.9487, + "step": 19911 + }, + { + "epoch": 1.9, + "grad_norm": 0.3078870337022793, + "learning_rate": 1.2288160779900026e-06, + "loss": 0.8765, + "step": 19912 + }, + { + "epoch": 1.91, + "grad_norm": 0.30233545798709793, + "learning_rate": 1.2263449485369615e-06, + "loss": 1.0745, + "step": 19913 + }, + { + "epoch": 1.91, + "grad_norm": 0.3262039598344827, + "learning_rate": 1.2238762909567381e-06, + "loss": 1.0375, + "step": 19914 + }, + { + "epoch": 1.91, + "grad_norm": 0.3216918958436239, + "learning_rate": 1.2214101053110716e-06, + "loss": 1.0035, + "step": 19915 + }, + { + "epoch": 1.91, + "grad_norm": 0.35026334151683436, + "learning_rate": 1.2189463916617016e-06, + "loss": 0.9935, + "step": 19916 + }, + { + "epoch": 1.91, + "grad_norm": 0.31052426646111697, + "learning_rate": 1.216485150070279e-06, + "loss": 0.9616, + "step": 19917 + }, + { + "epoch": 1.91, + "grad_norm": 0.30527897416072874, + "learning_rate": 1.2140263805983986e-06, + "loss": 0.9344, + "step": 19918 + }, + { + "epoch": 1.91, + "grad_norm": 0.31370464114397095, + "learning_rate": 1.2115700833076004e-06, + "loss": 1.0554, + "step": 19919 + }, + { + "epoch": 1.91, + "grad_norm": 0.27780964698906996, + "learning_rate": 1.2091162582593351e-06, + "loss": 1.0303, + "step": 19920 + }, + { + "epoch": 1.91, + "grad_norm": 0.33138005410091514, + "learning_rate": 1.2066649055150315e-06, + "loss": 1.0866, + "step": 19921 + }, + { + "epoch": 1.91, + "grad_norm": 0.32525978423199353, + "learning_rate": 1.204216025136029e-06, + "loss": 0.9861, + "step": 19922 + }, + { + "epoch": 1.91, + "grad_norm": 0.34320760368875564, + "learning_rate": 1.2017696171836123e-06, + "loss": 1.057, + "step": 19923 + }, + { + "epoch": 1.91, + "grad_norm": 0.2749204651631677, + "learning_rate": 1.199325681718988e-06, + "loss": 0.8847, + "step": 19924 + }, + { + "epoch": 1.91, + "grad_norm": 0.34806022006550663, + "learning_rate": 1.1968842188033403e-06, + "loss": 1.0834, + "step": 19925 + }, + { + "epoch": 1.91, + "grad_norm": 0.3333341854582524, + "learning_rate": 1.1944452284977647e-06, + "loss": 1.0318, + "step": 19926 + }, + { + "epoch": 1.91, + "grad_norm": 0.3606961019060216, + "learning_rate": 1.1920087108633015e-06, + "loss": 0.9476, + "step": 19927 + }, + { + "epoch": 1.91, + "grad_norm": 0.37684832155835746, + "learning_rate": 1.189574665960902e-06, + "loss": 1.0899, + "step": 19928 + }, + { + "epoch": 1.91, + "grad_norm": 0.31774780186318613, + "learning_rate": 1.187143093851506e-06, + "loss": 0.9809, + "step": 19929 + }, + { + "epoch": 1.91, + "grad_norm": 0.3097552172272549, + "learning_rate": 1.184713994595954e-06, + "loss": 1.0574, + "step": 19930 + }, + { + "epoch": 1.91, + "grad_norm": 0.33475444702412893, + "learning_rate": 1.1822873682550416e-06, + "loss": 0.9951, + "step": 19931 + }, + { + "epoch": 1.91, + "grad_norm": 0.30854513723249816, + "learning_rate": 1.1798632148894872e-06, + "loss": 1.0622, + "step": 19932 + }, + { + "epoch": 1.91, + "grad_norm": 0.32642197977747917, + "learning_rate": 1.1774415345599642e-06, + "loss": 1.1509, + "step": 19933 + }, + { + "epoch": 1.91, + "grad_norm": 0.3086087099000515, + "learning_rate": 1.1750223273270688e-06, + "loss": 1.012, + "step": 19934 + }, + { + "epoch": 1.91, + "grad_norm": 0.2845204688892346, + "learning_rate": 1.1726055932513635e-06, + "loss": 1.0058, + "step": 19935 + }, + { + "epoch": 1.91, + "grad_norm": 0.3028015681803655, + "learning_rate": 1.1701913323933e-06, + "loss": 1.0155, + "step": 19936 + }, + { + "epoch": 1.91, + "grad_norm": 0.2920550023704409, + "learning_rate": 1.167779544813319e-06, + "loss": 1.0175, + "step": 19937 + }, + { + "epoch": 1.91, + "grad_norm": 0.29381935938021, + "learning_rate": 1.1653702305717607e-06, + "loss": 1.0234, + "step": 19938 + }, + { + "epoch": 1.91, + "grad_norm": 0.3110441088557386, + "learning_rate": 1.1629633897289216e-06, + "loss": 0.9271, + "step": 19939 + }, + { + "epoch": 1.91, + "grad_norm": 0.31887437152573017, + "learning_rate": 1.1605590223450534e-06, + "loss": 1.0804, + "step": 19940 + }, + { + "epoch": 1.91, + "grad_norm": 0.3103027352388874, + "learning_rate": 1.1581571284803195e-06, + "loss": 1.0612, + "step": 19941 + }, + { + "epoch": 1.91, + "grad_norm": 0.3034944168857489, + "learning_rate": 1.1557577081948046e-06, + "loss": 0.9818, + "step": 19942 + }, + { + "epoch": 1.91, + "grad_norm": 0.31566080178990263, + "learning_rate": 1.1533607615485831e-06, + "loss": 1.0605, + "step": 19943 + }, + { + "epoch": 1.91, + "grad_norm": 0.30757746659969365, + "learning_rate": 1.1509662886016403e-06, + "loss": 1.0879, + "step": 19944 + }, + { + "epoch": 1.91, + "grad_norm": 0.36215861419811496, + "learning_rate": 1.148574289413873e-06, + "loss": 0.9694, + "step": 19945 + }, + { + "epoch": 1.91, + "grad_norm": 0.3632498113242414, + "learning_rate": 1.1461847640451663e-06, + "loss": 0.9585, + "step": 19946 + }, + { + "epoch": 1.91, + "grad_norm": 0.33753590899932423, + "learning_rate": 1.1437977125553168e-06, + "loss": 0.9522, + "step": 19947 + }, + { + "epoch": 1.91, + "grad_norm": 0.31008717158171256, + "learning_rate": 1.1414131350040659e-06, + "loss": 1.0822, + "step": 19948 + }, + { + "epoch": 1.91, + "grad_norm": 0.309251450820328, + "learning_rate": 1.1390310314510654e-06, + "loss": 1.0293, + "step": 19949 + }, + { + "epoch": 1.91, + "grad_norm": 0.32931634442168956, + "learning_rate": 1.136651401955957e-06, + "loss": 1.0861, + "step": 19950 + }, + { + "epoch": 1.91, + "grad_norm": 0.2906999091492665, + "learning_rate": 1.134274246578282e-06, + "loss": 0.9413, + "step": 19951 + }, + { + "epoch": 1.91, + "grad_norm": 0.30720204960155845, + "learning_rate": 1.1318995653775145e-06, + "loss": 0.9239, + "step": 19952 + }, + { + "epoch": 1.91, + "grad_norm": 0.2833756502276844, + "learning_rate": 1.1295273584131072e-06, + "loss": 1.0089, + "step": 19953 + }, + { + "epoch": 1.91, + "grad_norm": 0.30854802922159225, + "learning_rate": 1.1271576257444127e-06, + "loss": 1.0386, + "step": 19954 + }, + { + "epoch": 1.91, + "grad_norm": 0.3057922738888097, + "learning_rate": 1.1247903674307392e-06, + "loss": 1.0343, + "step": 19955 + }, + { + "epoch": 1.91, + "grad_norm": 0.2937662953791558, + "learning_rate": 1.122425583531328e-06, + "loss": 0.9918, + "step": 19956 + }, + { + "epoch": 1.91, + "grad_norm": 0.2869501660382217, + "learning_rate": 1.1200632741053652e-06, + "loss": 0.999, + "step": 19957 + }, + { + "epoch": 1.91, + "grad_norm": 0.3116316288283081, + "learning_rate": 1.117703439211959e-06, + "loss": 1.0795, + "step": 19958 + }, + { + "epoch": 1.91, + "grad_norm": 0.32928803854919836, + "learning_rate": 1.115346078910151e-06, + "loss": 1.0728, + "step": 19959 + }, + { + "epoch": 1.91, + "grad_norm": 0.29811675434721335, + "learning_rate": 1.112991193258972e-06, + "loss": 1.0551, + "step": 19960 + }, + { + "epoch": 1.91, + "grad_norm": 0.3156851509161287, + "learning_rate": 1.1106387823173304e-06, + "loss": 0.9923, + "step": 19961 + }, + { + "epoch": 1.91, + "grad_norm": 0.28276663467403806, + "learning_rate": 1.108288846144112e-06, + "loss": 0.9244, + "step": 19962 + }, + { + "epoch": 1.91, + "grad_norm": 0.2997756465126296, + "learning_rate": 1.1059413847981147e-06, + "loss": 1.0663, + "step": 19963 + }, + { + "epoch": 1.91, + "grad_norm": 0.347277078445295, + "learning_rate": 1.1035963983380805e-06, + "loss": 0.9053, + "step": 19964 + }, + { + "epoch": 1.91, + "grad_norm": 0.29628546107330356, + "learning_rate": 1.1012538868227063e-06, + "loss": 0.9657, + "step": 19965 + }, + { + "epoch": 1.91, + "grad_norm": 0.31355792070289606, + "learning_rate": 1.098913850310601e-06, + "loss": 0.9398, + "step": 19966 + }, + { + "epoch": 1.91, + "grad_norm": 0.273330331899111, + "learning_rate": 1.0965762888603404e-06, + "loss": 1.1114, + "step": 19967 + }, + { + "epoch": 1.91, + "grad_norm": 0.3157727407660526, + "learning_rate": 1.0942412025304106e-06, + "loss": 0.9949, + "step": 19968 + }, + { + "epoch": 1.91, + "grad_norm": 0.3258111600130658, + "learning_rate": 1.0919085913792537e-06, + "loss": 0.9905, + "step": 19969 + }, + { + "epoch": 1.91, + "grad_norm": 0.28812632272197625, + "learning_rate": 1.0895784554652566e-06, + "loss": 1.0444, + "step": 19970 + }, + { + "epoch": 1.91, + "grad_norm": 0.3052363930143245, + "learning_rate": 1.087250794846717e-06, + "loss": 1.0272, + "step": 19971 + }, + { + "epoch": 1.91, + "grad_norm": 0.27173211490255816, + "learning_rate": 1.0849256095818884e-06, + "loss": 0.9409, + "step": 19972 + }, + { + "epoch": 1.91, + "grad_norm": 0.32412024264845624, + "learning_rate": 1.0826028997289572e-06, + "loss": 0.9949, + "step": 19973 + }, + { + "epoch": 1.91, + "grad_norm": 0.34466673773707746, + "learning_rate": 1.080282665346055e-06, + "loss": 1.0467, + "step": 19974 + }, + { + "epoch": 1.91, + "grad_norm": 0.3504073216382154, + "learning_rate": 1.0779649064912578e-06, + "loss": 0.9634, + "step": 19975 + }, + { + "epoch": 1.91, + "grad_norm": 0.30750913358220805, + "learning_rate": 1.0756496232225632e-06, + "loss": 1.0487, + "step": 19976 + }, + { + "epoch": 1.91, + "grad_norm": 0.33976019378374567, + "learning_rate": 1.073336815597903e-06, + "loss": 1.0435, + "step": 19977 + }, + { + "epoch": 1.91, + "grad_norm": 0.34510104802682373, + "learning_rate": 1.0710264836751526e-06, + "loss": 1.0043, + "step": 19978 + }, + { + "epoch": 1.91, + "grad_norm": 0.35077633498568683, + "learning_rate": 1.0687186275121442e-06, + "loss": 1.0145, + "step": 19979 + }, + { + "epoch": 1.91, + "grad_norm": 0.2650209781566024, + "learning_rate": 1.066413247166631e-06, + "loss": 1.0739, + "step": 19980 + }, + { + "epoch": 1.91, + "grad_norm": 0.3154618218037207, + "learning_rate": 1.0641103426963006e-06, + "loss": 1.0209, + "step": 19981 + }, + { + "epoch": 1.91, + "grad_norm": 0.3419095425248834, + "learning_rate": 1.0618099141587955e-06, + "loss": 1.0288, + "step": 19982 + }, + { + "epoch": 1.91, + "grad_norm": 0.3130022863051629, + "learning_rate": 1.05951196161167e-06, + "loss": 0.9743, + "step": 19983 + }, + { + "epoch": 1.91, + "grad_norm": 0.3248660627671977, + "learning_rate": 1.0572164851124445e-06, + "loss": 1.0708, + "step": 19984 + }, + { + "epoch": 1.91, + "grad_norm": 0.3060316991042003, + "learning_rate": 1.0549234847185507e-06, + "loss": 1.0053, + "step": 19985 + }, + { + "epoch": 1.91, + "grad_norm": 0.35273420256433946, + "learning_rate": 1.0526329604873874e-06, + "loss": 1.1043, + "step": 19986 + }, + { + "epoch": 1.91, + "grad_norm": 0.29576591131211327, + "learning_rate": 1.0503449124762754e-06, + "loss": 0.9447, + "step": 19987 + }, + { + "epoch": 1.91, + "grad_norm": 0.31462285294799, + "learning_rate": 1.0480593407424577e-06, + "loss": 1.1201, + "step": 19988 + }, + { + "epoch": 1.91, + "grad_norm": 0.2956193932435807, + "learning_rate": 1.0457762453431442e-06, + "loss": 1.04, + "step": 19989 + }, + { + "epoch": 1.91, + "grad_norm": 0.289691610121901, + "learning_rate": 1.0434956263354888e-06, + "loss": 1.113, + "step": 19990 + }, + { + "epoch": 1.91, + "grad_norm": 0.3219331945863588, + "learning_rate": 1.0412174837765243e-06, + "loss": 1.0736, + "step": 19991 + }, + { + "epoch": 1.91, + "grad_norm": 0.3134822384249118, + "learning_rate": 1.0389418177233046e-06, + "loss": 1.0629, + "step": 19992 + }, + { + "epoch": 1.91, + "grad_norm": 0.2868457822906323, + "learning_rate": 1.0366686282327397e-06, + "loss": 1.0757, + "step": 19993 + }, + { + "epoch": 1.91, + "grad_norm": 0.31663620071056015, + "learning_rate": 1.0343979153617622e-06, + "loss": 0.9953, + "step": 19994 + }, + { + "epoch": 1.91, + "grad_norm": 0.30192151855573623, + "learning_rate": 1.0321296791671596e-06, + "loss": 1.0851, + "step": 19995 + }, + { + "epoch": 1.91, + "grad_norm": 0.3084449097487815, + "learning_rate": 1.0298639197057203e-06, + "loss": 1.0348, + "step": 19996 + }, + { + "epoch": 1.91, + "grad_norm": 0.2988797869256231, + "learning_rate": 1.0276006370341206e-06, + "loss": 1.1218, + "step": 19997 + }, + { + "epoch": 1.91, + "grad_norm": 0.35918833771110015, + "learning_rate": 1.025339831209038e-06, + "loss": 1.0362, + "step": 19998 + }, + { + "epoch": 1.91, + "grad_norm": 0.33499319630494234, + "learning_rate": 1.0230815022870154e-06, + "loss": 1.0078, + "step": 19999 + }, + { + "epoch": 1.91, + "grad_norm": 0.3417736422076645, + "learning_rate": 1.0208256503245861e-06, + "loss": 1.0622, + "step": 20000 + }, + { + "epoch": 1.91, + "grad_norm": 0.32004251184233884, + "learning_rate": 1.0185722753781934e-06, + "loss": 0.9671, + "step": 20001 + }, + { + "epoch": 1.91, + "grad_norm": 0.2797512477154076, + "learning_rate": 1.0163213775042479e-06, + "loss": 1.0094, + "step": 20002 + }, + { + "epoch": 1.91, + "grad_norm": 0.3360231892982712, + "learning_rate": 1.0140729567590712e-06, + "loss": 0.9554, + "step": 20003 + }, + { + "epoch": 1.91, + "grad_norm": 0.30779789514750755, + "learning_rate": 1.0118270131989183e-06, + "loss": 0.964, + "step": 20004 + }, + { + "epoch": 1.91, + "grad_norm": 0.3293479689543207, + "learning_rate": 1.0095835468800108e-06, + "loss": 0.9663, + "step": 20005 + }, + { + "epoch": 1.91, + "grad_norm": 0.3316417955165355, + "learning_rate": 1.0073425578584926e-06, + "loss": 0.9581, + "step": 20006 + }, + { + "epoch": 1.91, + "grad_norm": 0.32506444978812227, + "learning_rate": 1.0051040461904304e-06, + "loss": 0.9471, + "step": 20007 + }, + { + "epoch": 1.91, + "grad_norm": 0.30517327939184935, + "learning_rate": 1.0028680119318568e-06, + "loss": 0.9813, + "step": 20008 + }, + { + "epoch": 1.91, + "grad_norm": 0.3076332369235528, + "learning_rate": 1.000634455138738e-06, + "loss": 1.0439, + "step": 20009 + }, + { + "epoch": 1.91, + "grad_norm": 0.30851514288348997, + "learning_rate": 9.984033758669631e-07, + "loss": 0.9767, + "step": 20010 + }, + { + "epoch": 1.91, + "grad_norm": 0.3111340755847164, + "learning_rate": 9.961747741723537e-07, + "loss": 1.0162, + "step": 20011 + }, + { + "epoch": 1.91, + "grad_norm": 0.2921312610748139, + "learning_rate": 9.939486501106987e-07, + "loss": 0.9243, + "step": 20012 + }, + { + "epoch": 1.91, + "grad_norm": 0.2997278954780802, + "learning_rate": 9.917250037376979e-07, + "loss": 0.993, + "step": 20013 + }, + { + "epoch": 1.91, + "grad_norm": 0.37814496063630365, + "learning_rate": 9.895038351090068e-07, + "loss": 0.9108, + "step": 20014 + }, + { + "epoch": 1.91, + "grad_norm": 0.36054961324988616, + "learning_rate": 9.872851442802033e-07, + "loss": 0.9719, + "step": 20015 + }, + { + "epoch": 1.91, + "grad_norm": 0.27614472589263095, + "learning_rate": 9.850689313068207e-07, + "loss": 0.8905, + "step": 20016 + }, + { + "epoch": 1.92, + "grad_norm": 0.33428218420663963, + "learning_rate": 9.828551962443144e-07, + "loss": 0.9645, + "step": 20017 + }, + { + "epoch": 1.92, + "grad_norm": 0.32132040735724277, + "learning_rate": 9.806439391480849e-07, + "loss": 1.1401, + "step": 20018 + }, + { + "epoch": 1.92, + "grad_norm": 0.28055067806539835, + "learning_rate": 9.784351600734764e-07, + "loss": 1.0141, + "step": 20019 + }, + { + "epoch": 1.92, + "grad_norm": 0.30398595658470695, + "learning_rate": 9.762288590757561e-07, + "loss": 0.9203, + "step": 20020 + }, + { + "epoch": 1.92, + "grad_norm": 0.34523673876001454, + "learning_rate": 9.740250362101466e-07, + "loss": 1.0401, + "step": 20021 + }, + { + "epoch": 1.92, + "grad_norm": 0.3128661834302497, + "learning_rate": 9.718236915317925e-07, + "loss": 1.0552, + "step": 20022 + }, + { + "epoch": 1.92, + "grad_norm": 0.3283437104557386, + "learning_rate": 9.69624825095794e-07, + "loss": 1.0345, + "step": 20023 + }, + { + "epoch": 1.92, + "grad_norm": 0.29797938255894046, + "learning_rate": 9.67428436957163e-07, + "loss": 1.1044, + "step": 20024 + }, + { + "epoch": 1.92, + "grad_norm": 0.3131534455955968, + "learning_rate": 9.652345271708773e-07, + "loss": 1.0534, + "step": 20025 + }, + { + "epoch": 1.92, + "grad_norm": 0.333600951338235, + "learning_rate": 9.63043095791849e-07, + "loss": 0.9841, + "step": 20026 + }, + { + "epoch": 1.92, + "grad_norm": 0.32418612292392884, + "learning_rate": 9.608541428749006e-07, + "loss": 1.0216, + "step": 20027 + }, + { + "epoch": 1.92, + "grad_norm": 0.323352931485495, + "learning_rate": 9.58667668474822e-07, + "loss": 1.143, + "step": 20028 + }, + { + "epoch": 1.92, + "grad_norm": 0.2896990979912183, + "learning_rate": 9.564836726463244e-07, + "loss": 1.0166, + "step": 20029 + }, + { + "epoch": 1.92, + "grad_norm": 0.34823159829971817, + "learning_rate": 9.543021554440757e-07, + "loss": 1.0255, + "step": 20030 + }, + { + "epoch": 1.92, + "grad_norm": 0.3080503182950541, + "learning_rate": 9.521231169226652e-07, + "loss": 0.9154, + "step": 20031 + }, + { + "epoch": 1.92, + "grad_norm": 0.3197545170927909, + "learning_rate": 9.499465571366273e-07, + "loss": 1.0302, + "step": 20032 + }, + { + "epoch": 1.92, + "grad_norm": 0.3105808507557536, + "learning_rate": 9.477724761404183e-07, + "loss": 1.0445, + "step": 20033 + }, + { + "epoch": 1.92, + "grad_norm": 0.2636939161220622, + "learning_rate": 9.456008739884503e-07, + "loss": 1.0494, + "step": 20034 + }, + { + "epoch": 1.92, + "grad_norm": 0.3585185716282579, + "learning_rate": 9.434317507350799e-07, + "loss": 1.1748, + "step": 20035 + }, + { + "epoch": 1.92, + "grad_norm": 0.3712670786075611, + "learning_rate": 9.412651064345968e-07, + "loss": 1.0126, + "step": 20036 + }, + { + "epoch": 1.92, + "grad_norm": 0.3244953499957118, + "learning_rate": 9.39100941141191e-07, + "loss": 1.0333, + "step": 20037 + }, + { + "epoch": 1.92, + "grad_norm": 0.3302840925842129, + "learning_rate": 9.369392549090639e-07, + "loss": 1.0184, + "step": 20038 + }, + { + "epoch": 1.92, + "grad_norm": 0.32485837251590544, + "learning_rate": 9.347800477922719e-07, + "loss": 1.0646, + "step": 20039 + }, + { + "epoch": 1.92, + "grad_norm": 0.2956057615542434, + "learning_rate": 9.326233198448719e-07, + "loss": 1.0676, + "step": 20040 + }, + { + "epoch": 1.92, + "grad_norm": 0.2970012556079481, + "learning_rate": 9.30469071120832e-07, + "loss": 0.9863, + "step": 20041 + }, + { + "epoch": 1.92, + "grad_norm": 0.29271475650351775, + "learning_rate": 9.283173016740754e-07, + "loss": 0.9245, + "step": 20042 + }, + { + "epoch": 1.92, + "grad_norm": 0.32910879803242044, + "learning_rate": 9.26168011558437e-07, + "loss": 0.9646, + "step": 20043 + }, + { + "epoch": 1.92, + "grad_norm": 0.31521330190745617, + "learning_rate": 9.240212008277183e-07, + "loss": 0.9918, + "step": 20044 + }, + { + "epoch": 1.92, + "grad_norm": 0.3092379164264637, + "learning_rate": 9.218768695356317e-07, + "loss": 1.0889, + "step": 20045 + }, + { + "epoch": 1.92, + "grad_norm": 0.36539079605674984, + "learning_rate": 9.197350177358344e-07, + "loss": 0.9415, + "step": 20046 + }, + { + "epoch": 1.92, + "grad_norm": 0.36258155857082813, + "learning_rate": 9.175956454819501e-07, + "loss": 1.0058, + "step": 20047 + }, + { + "epoch": 1.92, + "grad_norm": 0.3210423760706069, + "learning_rate": 9.154587528275027e-07, + "loss": 1.0163, + "step": 20048 + }, + { + "epoch": 1.92, + "grad_norm": 0.3229569617177565, + "learning_rate": 9.133243398259717e-07, + "loss": 1.0503, + "step": 20049 + }, + { + "epoch": 1.92, + "grad_norm": 0.3304578244006643, + "learning_rate": 9.111924065307697e-07, + "loss": 0.9809, + "step": 20050 + }, + { + "epoch": 1.92, + "grad_norm": 0.3445562936081138, + "learning_rate": 9.090629529952655e-07, + "loss": 1.0374, + "step": 20051 + }, + { + "epoch": 1.92, + "grad_norm": 0.31879230823593946, + "learning_rate": 9.069359792727161e-07, + "loss": 1.0843, + "step": 20052 + }, + { + "epoch": 1.92, + "grad_norm": 0.3117767812710101, + "learning_rate": 9.048114854163902e-07, + "loss": 1.0609, + "step": 20053 + }, + { + "epoch": 1.92, + "grad_norm": 0.3257745606429297, + "learning_rate": 9.026894714794232e-07, + "loss": 1.0205, + "step": 20054 + }, + { + "epoch": 1.92, + "grad_norm": 0.3051758994087623, + "learning_rate": 9.00569937514939e-07, + "loss": 1.0444, + "step": 20055 + }, + { + "epoch": 1.92, + "grad_norm": 0.3811419023430415, + "learning_rate": 8.984528835759731e-07, + "loss": 1.0897, + "step": 20056 + }, + { + "epoch": 1.92, + "grad_norm": 0.3246416193379085, + "learning_rate": 8.963383097155054e-07, + "loss": 1.0047, + "step": 20057 + }, + { + "epoch": 1.92, + "grad_norm": 0.27936567878958996, + "learning_rate": 8.9422621598646e-07, + "loss": 0.9852, + "step": 20058 + }, + { + "epoch": 1.92, + "grad_norm": 0.30725007462797177, + "learning_rate": 8.921166024416838e-07, + "loss": 0.9367, + "step": 20059 + }, + { + "epoch": 1.92, + "grad_norm": 0.3419017111854611, + "learning_rate": 8.900094691339789e-07, + "loss": 1.1018, + "step": 20060 + }, + { + "epoch": 1.92, + "grad_norm": 0.30720116468032177, + "learning_rate": 8.879048161160697e-07, + "loss": 1.04, + "step": 20061 + }, + { + "epoch": 1.92, + "grad_norm": 0.3054611291728878, + "learning_rate": 8.858026434406364e-07, + "loss": 1.0548, + "step": 20062 + }, + { + "epoch": 1.92, + "grad_norm": 0.3274169037301503, + "learning_rate": 8.837029511602923e-07, + "loss": 0.9991, + "step": 20063 + }, + { + "epoch": 1.92, + "grad_norm": 0.3368260492392844, + "learning_rate": 8.816057393275623e-07, + "loss": 1.0568, + "step": 20064 + }, + { + "epoch": 1.92, + "grad_norm": 0.2859404792357713, + "learning_rate": 8.795110079949486e-07, + "loss": 1.0828, + "step": 20065 + }, + { + "epoch": 1.92, + "grad_norm": 0.3152682672379959, + "learning_rate": 8.774187572148651e-07, + "loss": 0.9682, + "step": 20066 + }, + { + "epoch": 1.92, + "grad_norm": 0.29670408997658293, + "learning_rate": 8.753289870396698e-07, + "loss": 1.0373, + "step": 20067 + }, + { + "epoch": 1.92, + "grad_norm": 0.37802464022306437, + "learning_rate": 8.732416975216651e-07, + "loss": 1.0502, + "step": 20068 + }, + { + "epoch": 1.92, + "grad_norm": 0.3350476176827123, + "learning_rate": 8.711568887130872e-07, + "loss": 1.1857, + "step": 20069 + }, + { + "epoch": 1.92, + "grad_norm": 0.28549408636196355, + "learning_rate": 8.690745606661054e-07, + "loss": 0.9765, + "step": 20070 + }, + { + "epoch": 1.92, + "grad_norm": 0.30596896476579366, + "learning_rate": 8.669947134328337e-07, + "loss": 1.021, + "step": 20071 + }, + { + "epoch": 1.92, + "grad_norm": 0.3065764511521161, + "learning_rate": 8.649173470653305e-07, + "loss": 1.0611, + "step": 20072 + }, + { + "epoch": 1.92, + "grad_norm": 0.2634698365182604, + "learning_rate": 8.62842461615565e-07, + "loss": 1.0121, + "step": 20073 + }, + { + "epoch": 1.92, + "grad_norm": 0.31353497521411794, + "learning_rate": 8.607700571354738e-07, + "loss": 1.0342, + "step": 20074 + }, + { + "epoch": 1.92, + "grad_norm": 0.3104875914689427, + "learning_rate": 8.587001336769263e-07, + "loss": 0.9413, + "step": 20075 + }, + { + "epoch": 1.92, + "grad_norm": 0.3257910644584907, + "learning_rate": 8.566326912917144e-07, + "loss": 1.0739, + "step": 20076 + }, + { + "epoch": 1.92, + "grad_norm": 0.32817544601187365, + "learning_rate": 8.545677300315746e-07, + "loss": 1.1219, + "step": 20077 + }, + { + "epoch": 1.92, + "grad_norm": 0.31159701382458743, + "learning_rate": 8.525052499481878e-07, + "loss": 0.971, + "step": 20078 + }, + { + "epoch": 1.92, + "grad_norm": 0.30312631070928414, + "learning_rate": 8.504452510931682e-07, + "loss": 0.9779, + "step": 20079 + }, + { + "epoch": 1.92, + "grad_norm": 0.3194672656874994, + "learning_rate": 8.483877335180745e-07, + "loss": 0.9484, + "step": 20080 + }, + { + "epoch": 1.92, + "grad_norm": 0.36339664085637485, + "learning_rate": 8.463326972743879e-07, + "loss": 1.0265, + "step": 20081 + }, + { + "epoch": 1.92, + "grad_norm": 0.3499645389878617, + "learning_rate": 8.44280142413545e-07, + "loss": 1.0281, + "step": 20082 + }, + { + "epoch": 1.92, + "grad_norm": 0.27002117749098886, + "learning_rate": 8.422300689869045e-07, + "loss": 0.8705, + "step": 20083 + }, + { + "epoch": 1.92, + "grad_norm": 0.3423560118236019, + "learning_rate": 8.401824770457812e-07, + "loss": 1.0854, + "step": 20084 + }, + { + "epoch": 1.92, + "grad_norm": 0.3247173792341353, + "learning_rate": 8.381373666414005e-07, + "loss": 1.0546, + "step": 20085 + }, + { + "epoch": 1.92, + "grad_norm": 0.2802965548354041, + "learning_rate": 8.360947378249661e-07, + "loss": 1.0194, + "step": 20086 + }, + { + "epoch": 1.92, + "grad_norm": 0.37344089546379416, + "learning_rate": 8.340545906475706e-07, + "loss": 1.0458, + "step": 20087 + }, + { + "epoch": 1.92, + "grad_norm": 0.3219677638964098, + "learning_rate": 8.320169251602839e-07, + "loss": 1.1435, + "step": 20088 + }, + { + "epoch": 1.92, + "grad_norm": 0.2791295015350788, + "learning_rate": 8.299817414140987e-07, + "loss": 1.0408, + "step": 20089 + }, + { + "epoch": 1.92, + "grad_norm": 0.36651146987810124, + "learning_rate": 8.279490394599521e-07, + "loss": 1.0147, + "step": 20090 + }, + { + "epoch": 1.92, + "grad_norm": 0.2801816265476482, + "learning_rate": 8.259188193487033e-07, + "loss": 0.9535, + "step": 20091 + }, + { + "epoch": 1.92, + "grad_norm": 0.3139768392178993, + "learning_rate": 8.238910811311673e-07, + "loss": 0.9153, + "step": 20092 + }, + { + "epoch": 1.92, + "grad_norm": 0.31206260575450584, + "learning_rate": 8.218658248580924e-07, + "loss": 0.9818, + "step": 20093 + }, + { + "epoch": 1.92, + "grad_norm": 0.35039284502375256, + "learning_rate": 8.198430505801491e-07, + "loss": 0.9094, + "step": 20094 + }, + { + "epoch": 1.92, + "grad_norm": 0.30063047379358177, + "learning_rate": 8.178227583479637e-07, + "loss": 1.0215, + "step": 20095 + }, + { + "epoch": 1.92, + "grad_norm": 0.2960607922057935, + "learning_rate": 8.158049482121066e-07, + "loss": 1.0358, + "step": 20096 + }, + { + "epoch": 1.92, + "grad_norm": 0.29615429697093654, + "learning_rate": 8.1378962022306e-07, + "loss": 1.0251, + "step": 20097 + }, + { + "epoch": 1.92, + "grad_norm": 0.30490055686313067, + "learning_rate": 8.11776774431261e-07, + "loss": 1.0607, + "step": 20098 + }, + { + "epoch": 1.92, + "grad_norm": 0.26017096893101116, + "learning_rate": 8.097664108870918e-07, + "loss": 1.0501, + "step": 20099 + }, + { + "epoch": 1.92, + "grad_norm": 0.3053766392449266, + "learning_rate": 8.077585296408563e-07, + "loss": 0.996, + "step": 20100 + }, + { + "epoch": 1.92, + "grad_norm": 0.2917317973754063, + "learning_rate": 8.057531307427923e-07, + "loss": 1.0929, + "step": 20101 + }, + { + "epoch": 1.92, + "grad_norm": 0.3221224186122919, + "learning_rate": 8.037502142431041e-07, + "loss": 0.9851, + "step": 20102 + }, + { + "epoch": 1.92, + "grad_norm": 0.3119299845663148, + "learning_rate": 8.017497801919182e-07, + "loss": 1.1283, + "step": 20103 + }, + { + "epoch": 1.92, + "grad_norm": 0.27917461581061614, + "learning_rate": 7.997518286392835e-07, + "loss": 0.9883, + "step": 20104 + }, + { + "epoch": 1.92, + "grad_norm": 0.2710713827563456, + "learning_rate": 7.977563596352045e-07, + "loss": 0.9428, + "step": 20105 + }, + { + "epoch": 1.92, + "grad_norm": 0.3213292905272226, + "learning_rate": 7.957633732296188e-07, + "loss": 1.0527, + "step": 20106 + }, + { + "epoch": 1.92, + "grad_norm": 0.3685422634378021, + "learning_rate": 7.937728694723979e-07, + "loss": 0.9815, + "step": 20107 + }, + { + "epoch": 1.92, + "grad_norm": 0.3067794628529489, + "learning_rate": 7.917848484133572e-07, + "loss": 0.8545, + "step": 20108 + }, + { + "epoch": 1.92, + "grad_norm": 0.2986160151706887, + "learning_rate": 7.897993101022572e-07, + "loss": 0.9721, + "step": 20109 + }, + { + "epoch": 1.92, + "grad_norm": 0.3198076725978136, + "learning_rate": 7.878162545887802e-07, + "loss": 1.0982, + "step": 20110 + }, + { + "epoch": 1.92, + "grad_norm": 0.2936449414973806, + "learning_rate": 7.858356819225531e-07, + "loss": 0.9985, + "step": 20111 + }, + { + "epoch": 1.92, + "grad_norm": 0.36281008611693355, + "learning_rate": 7.838575921531366e-07, + "loss": 1.0764, + "step": 20112 + }, + { + "epoch": 1.92, + "grad_norm": 0.3286688336584109, + "learning_rate": 7.818819853300463e-07, + "loss": 0.9667, + "step": 20113 + }, + { + "epoch": 1.92, + "grad_norm": 0.32906849340953237, + "learning_rate": 7.799088615027206e-07, + "loss": 1.0575, + "step": 20114 + }, + { + "epoch": 1.92, + "grad_norm": 0.2979852287951642, + "learning_rate": 7.779382207205088e-07, + "loss": 0.9987, + "step": 20115 + }, + { + "epoch": 1.92, + "grad_norm": 0.3185117118364329, + "learning_rate": 7.759700630327826e-07, + "loss": 1.0394, + "step": 20116 + }, + { + "epoch": 1.92, + "grad_norm": 0.31533038288721954, + "learning_rate": 7.740043884887472e-07, + "loss": 1.0388, + "step": 20117 + }, + { + "epoch": 1.92, + "grad_norm": 0.33899468881132294, + "learning_rate": 7.720411971376184e-07, + "loss": 1.0066, + "step": 20118 + }, + { + "epoch": 1.92, + "grad_norm": 0.29360908935224134, + "learning_rate": 7.700804890285129e-07, + "loss": 1.1264, + "step": 20119 + }, + { + "epoch": 1.92, + "grad_norm": 0.3262975278911923, + "learning_rate": 7.681222642105246e-07, + "loss": 1.0125, + "step": 20120 + }, + { + "epoch": 1.92, + "grad_norm": 0.30934436327282155, + "learning_rate": 7.661665227326253e-07, + "loss": 1.0873, + "step": 20121 + }, + { + "epoch": 1.93, + "grad_norm": 0.3451944928666525, + "learning_rate": 7.64213264643776e-07, + "loss": 1.092, + "step": 20122 + }, + { + "epoch": 1.93, + "grad_norm": 0.3151041614795453, + "learning_rate": 7.622624899928599e-07, + "loss": 1.0565, + "step": 20123 + }, + { + "epoch": 1.93, + "grad_norm": 0.32163562351752245, + "learning_rate": 7.603141988287043e-07, + "loss": 1.0447, + "step": 20124 + }, + { + "epoch": 1.93, + "grad_norm": 0.33078943978447295, + "learning_rate": 7.583683912000483e-07, + "loss": 1.0233, + "step": 20125 + }, + { + "epoch": 1.93, + "grad_norm": 0.3449812387731525, + "learning_rate": 7.564250671555973e-07, + "loss": 0.9005, + "step": 20126 + }, + { + "epoch": 1.93, + "grad_norm": 0.3233861322310524, + "learning_rate": 7.544842267439678e-07, + "loss": 0.9777, + "step": 20127 + }, + { + "epoch": 1.93, + "grad_norm": 0.2966488388853607, + "learning_rate": 7.525458700137544e-07, + "loss": 1.0461, + "step": 20128 + }, + { + "epoch": 1.93, + "grad_norm": 0.3018975146321822, + "learning_rate": 7.506099970134517e-07, + "loss": 1.0037, + "step": 20129 + }, + { + "epoch": 1.93, + "grad_norm": 0.34177604120093796, + "learning_rate": 7.486766077915097e-07, + "loss": 1.0236, + "step": 20130 + }, + { + "epoch": 1.93, + "grad_norm": 0.3067874837832294, + "learning_rate": 7.467457023963121e-07, + "loss": 0.9339, + "step": 20131 + }, + { + "epoch": 1.93, + "grad_norm": 0.3441986617162722, + "learning_rate": 7.448172808761866e-07, + "loss": 1.0972, + "step": 20132 + }, + { + "epoch": 1.93, + "grad_norm": 0.3110580992964136, + "learning_rate": 7.428913432793949e-07, + "loss": 1.0771, + "step": 20133 + }, + { + "epoch": 1.93, + "grad_norm": 0.35341031353860813, + "learning_rate": 7.409678896541095e-07, + "loss": 1.0833, + "step": 20134 + }, + { + "epoch": 1.93, + "grad_norm": 0.3198568972916891, + "learning_rate": 7.390469200484918e-07, + "loss": 1.1443, + "step": 20135 + }, + { + "epoch": 1.93, + "grad_norm": 0.3067182448173822, + "learning_rate": 7.371284345106145e-07, + "loss": 0.9982, + "step": 20136 + }, + { + "epoch": 1.93, + "grad_norm": 0.2968476079872452, + "learning_rate": 7.352124330884836e-07, + "loss": 1.0059, + "step": 20137 + }, + { + "epoch": 1.93, + "grad_norm": 0.3261557938463781, + "learning_rate": 7.332989158300385e-07, + "loss": 1.0242, + "step": 20138 + }, + { + "epoch": 1.93, + "grad_norm": 0.34360416725575355, + "learning_rate": 7.313878827831855e-07, + "loss": 1.0176, + "step": 20139 + }, + { + "epoch": 1.93, + "grad_norm": 0.31650860670510106, + "learning_rate": 7.294793339957307e-07, + "loss": 0.9779, + "step": 20140 + }, + { + "epoch": 1.93, + "grad_norm": 0.28960607391846444, + "learning_rate": 7.275732695154469e-07, + "loss": 0.977, + "step": 20141 + }, + { + "epoch": 1.93, + "grad_norm": 0.28844879102488946, + "learning_rate": 7.256696893900184e-07, + "loss": 1.0964, + "step": 20142 + }, + { + "epoch": 1.93, + "grad_norm": 0.3386897095615172, + "learning_rate": 7.237685936671179e-07, + "loss": 1.0333, + "step": 20143 + }, + { + "epoch": 1.93, + "grad_norm": 0.2941152395868368, + "learning_rate": 7.218699823942853e-07, + "loss": 0.9555, + "step": 20144 + }, + { + "epoch": 1.93, + "grad_norm": 0.2861759892142702, + "learning_rate": 7.199738556190716e-07, + "loss": 1.009, + "step": 20145 + }, + { + "epoch": 1.93, + "grad_norm": 0.3816513157156722, + "learning_rate": 7.180802133888831e-07, + "loss": 0.9473, + "step": 20146 + }, + { + "epoch": 1.93, + "grad_norm": 0.31544229847428723, + "learning_rate": 7.161890557511486e-07, + "loss": 1.1014, + "step": 20147 + }, + { + "epoch": 1.93, + "grad_norm": 0.30710647484736764, + "learning_rate": 7.143003827531747e-07, + "loss": 0.9632, + "step": 20148 + }, + { + "epoch": 1.93, + "grad_norm": 0.31069158341677333, + "learning_rate": 7.124141944422347e-07, + "loss": 0.9885, + "step": 20149 + }, + { + "epoch": 1.93, + "grad_norm": 0.36179175498141314, + "learning_rate": 7.105304908655353e-07, + "loss": 1.025, + "step": 20150 + }, + { + "epoch": 1.93, + "grad_norm": 0.28272732175913784, + "learning_rate": 7.086492720702054e-07, + "loss": 1.0051, + "step": 20151 + }, + { + "epoch": 1.93, + "grad_norm": 0.3569102136317873, + "learning_rate": 7.067705381033296e-07, + "loss": 1.0756, + "step": 20152 + }, + { + "epoch": 1.93, + "grad_norm": 0.33907032051185493, + "learning_rate": 7.048942890119147e-07, + "loss": 0.9869, + "step": 20153 + }, + { + "epoch": 1.93, + "grad_norm": 0.29039313715883286, + "learning_rate": 7.030205248429456e-07, + "loss": 0.9987, + "step": 20154 + }, + { + "epoch": 1.93, + "grad_norm": 0.3042956309529747, + "learning_rate": 7.011492456432844e-07, + "loss": 0.919, + "step": 20155 + }, + { + "epoch": 1.93, + "grad_norm": 0.27892592646408765, + "learning_rate": 6.992804514597606e-07, + "loss": 0.9477, + "step": 20156 + }, + { + "epoch": 1.93, + "grad_norm": 0.33513532806621005, + "learning_rate": 6.974141423391589e-07, + "loss": 1.032, + "step": 20157 + }, + { + "epoch": 1.93, + "grad_norm": 0.30533547089880486, + "learning_rate": 6.955503183281863e-07, + "loss": 1.054, + "step": 20158 + }, + { + "epoch": 1.93, + "grad_norm": 0.3398274402485137, + "learning_rate": 6.936889794734613e-07, + "loss": 1.0255, + "step": 20159 + }, + { + "epoch": 1.93, + "grad_norm": 0.3271594509364889, + "learning_rate": 6.91830125821602e-07, + "loss": 1.026, + "step": 20160 + }, + { + "epoch": 1.93, + "grad_norm": 0.28419003458884223, + "learning_rate": 6.899737574190823e-07, + "loss": 1.0008, + "step": 20161 + }, + { + "epoch": 1.93, + "grad_norm": 0.34598660096632744, + "learning_rate": 6.881198743123985e-07, + "loss": 0.918, + "step": 20162 + }, + { + "epoch": 1.93, + "grad_norm": 0.32346103431556084, + "learning_rate": 6.862684765479243e-07, + "loss": 0.9331, + "step": 20163 + }, + { + "epoch": 1.93, + "grad_norm": 0.3504055380874177, + "learning_rate": 6.844195641720008e-07, + "loss": 1.1016, + "step": 20164 + }, + { + "epoch": 1.93, + "grad_norm": 0.2876058349345217, + "learning_rate": 6.825731372309019e-07, + "loss": 1.0596, + "step": 20165 + }, + { + "epoch": 1.93, + "grad_norm": 0.29528455710749674, + "learning_rate": 6.807291957708239e-07, + "loss": 0.954, + "step": 20166 + }, + { + "epoch": 1.93, + "grad_norm": 0.30451691281198984, + "learning_rate": 6.788877398379078e-07, + "loss": 1.0387, + "step": 20167 + }, + { + "epoch": 1.93, + "grad_norm": 0.31626798323527305, + "learning_rate": 6.770487694782612e-07, + "loss": 1.0037, + "step": 20168 + }, + { + "epoch": 1.93, + "grad_norm": 0.3102712720061002, + "learning_rate": 6.752122847378806e-07, + "loss": 1.1403, + "step": 20169 + }, + { + "epoch": 1.93, + "grad_norm": 0.2922669597751978, + "learning_rate": 6.733782856627402e-07, + "loss": 1.0223, + "step": 20170 + }, + { + "epoch": 1.93, + "grad_norm": 0.3371033753875054, + "learning_rate": 6.715467722987368e-07, + "loss": 0.9779, + "step": 20171 + }, + { + "epoch": 1.93, + "grad_norm": 0.3083213627029064, + "learning_rate": 6.697177446917002e-07, + "loss": 0.9131, + "step": 20172 + }, + { + "epoch": 1.93, + "grad_norm": 0.3040181058754586, + "learning_rate": 6.67891202887394e-07, + "loss": 0.9921, + "step": 20173 + }, + { + "epoch": 1.93, + "grad_norm": 0.3334482507549852, + "learning_rate": 6.660671469315482e-07, + "loss": 1.0648, + "step": 20174 + }, + { + "epoch": 1.93, + "grad_norm": 0.30068400478363966, + "learning_rate": 6.64245576869793e-07, + "loss": 0.9807, + "step": 20175 + }, + { + "epoch": 1.93, + "grad_norm": 0.3169811652843307, + "learning_rate": 6.624264927477253e-07, + "loss": 1.0677, + "step": 20176 + }, + { + "epoch": 1.93, + "grad_norm": 0.317125969741024, + "learning_rate": 6.60609894610864e-07, + "loss": 1.0089, + "step": 20177 + }, + { + "epoch": 1.93, + "grad_norm": 0.3048375839124364, + "learning_rate": 6.587957825046842e-07, + "loss": 0.9917, + "step": 20178 + }, + { + "epoch": 1.93, + "grad_norm": 0.30060576965078445, + "learning_rate": 6.569841564745604e-07, + "loss": 0.9216, + "step": 20179 + }, + { + "epoch": 1.93, + "grad_norm": 0.322136887438649, + "learning_rate": 6.551750165658454e-07, + "loss": 1.0208, + "step": 20180 + }, + { + "epoch": 1.93, + "grad_norm": 0.2998843204191531, + "learning_rate": 6.533683628238252e-07, + "loss": 0.994, + "step": 20181 + }, + { + "epoch": 1.93, + "grad_norm": 0.28589431099761325, + "learning_rate": 6.515641952936858e-07, + "loss": 0.9946, + "step": 20182 + }, + { + "epoch": 1.93, + "grad_norm": 0.2868562874167388, + "learning_rate": 6.497625140205909e-07, + "loss": 1.0775, + "step": 20183 + }, + { + "epoch": 1.93, + "grad_norm": 0.3660803681124193, + "learning_rate": 6.47963319049627e-07, + "loss": 1.0168, + "step": 20184 + }, + { + "epoch": 1.93, + "grad_norm": 0.3207722575229483, + "learning_rate": 6.461666104258246e-07, + "loss": 0.9685, + "step": 20185 + }, + { + "epoch": 1.93, + "grad_norm": 0.2861042324660412, + "learning_rate": 6.443723881941477e-07, + "loss": 1.0642, + "step": 20186 + }, + { + "epoch": 1.93, + "grad_norm": 0.3556394517760846, + "learning_rate": 6.425806523994937e-07, + "loss": 1.0193, + "step": 20187 + }, + { + "epoch": 1.93, + "grad_norm": 0.3062361208368377, + "learning_rate": 6.407914030866935e-07, + "loss": 0.9749, + "step": 20188 + }, + { + "epoch": 1.93, + "grad_norm": 0.34515349468362116, + "learning_rate": 6.390046403005445e-07, + "loss": 1.0746, + "step": 20189 + }, + { + "epoch": 1.93, + "grad_norm": 0.3259536905014176, + "learning_rate": 6.372203640857444e-07, + "loss": 1.065, + "step": 20190 + }, + { + "epoch": 1.93, + "grad_norm": 0.2833318246123132, + "learning_rate": 6.354385744869573e-07, + "loss": 0.971, + "step": 20191 + }, + { + "epoch": 1.93, + "grad_norm": 0.33197666987343366, + "learning_rate": 6.336592715487588e-07, + "loss": 1.044, + "step": 20192 + }, + { + "epoch": 1.93, + "grad_norm": 0.27529071803075417, + "learning_rate": 6.31882455315691e-07, + "loss": 1.0207, + "step": 20193 + }, + { + "epoch": 1.93, + "grad_norm": 0.32381112705153764, + "learning_rate": 6.301081258322184e-07, + "loss": 1.0093, + "step": 20194 + }, + { + "epoch": 1.93, + "grad_norm": 0.3804797743723991, + "learning_rate": 6.283362831427386e-07, + "loss": 1.0083, + "step": 20195 + }, + { + "epoch": 1.93, + "grad_norm": 0.28991212843143505, + "learning_rate": 6.265669272915941e-07, + "loss": 1.0673, + "step": 20196 + }, + { + "epoch": 1.93, + "grad_norm": 0.30646255150985846, + "learning_rate": 6.248000583230606e-07, + "loss": 0.9721, + "step": 20197 + }, + { + "epoch": 1.93, + "grad_norm": 0.2848602016397962, + "learning_rate": 6.230356762813805e-07, + "loss": 0.9534, + "step": 20198 + }, + { + "epoch": 1.93, + "grad_norm": 0.3534958782106643, + "learning_rate": 6.212737812106739e-07, + "loss": 0.9831, + "step": 20199 + }, + { + "epoch": 1.93, + "grad_norm": 0.2844546768564776, + "learning_rate": 6.195143731550501e-07, + "loss": 1.0586, + "step": 20200 + }, + { + "epoch": 1.93, + "grad_norm": 0.2895941103418737, + "learning_rate": 6.177574521585405e-07, + "loss": 0.9643, + "step": 20201 + }, + { + "epoch": 1.93, + "grad_norm": 0.33643487566662844, + "learning_rate": 6.160030182651099e-07, + "loss": 1.0644, + "step": 20202 + }, + { + "epoch": 1.93, + "grad_norm": 0.2933211294320333, + "learning_rate": 6.142510715186566e-07, + "loss": 0.9913, + "step": 20203 + }, + { + "epoch": 1.93, + "grad_norm": 0.30930515226245947, + "learning_rate": 6.125016119630345e-07, + "loss": 1.0794, + "step": 20204 + }, + { + "epoch": 1.93, + "grad_norm": 0.32886991869455806, + "learning_rate": 6.107546396420305e-07, + "loss": 1.0978, + "step": 20205 + }, + { + "epoch": 1.93, + "grad_norm": 0.3372399169428921, + "learning_rate": 6.090101545993543e-07, + "loss": 1.0215, + "step": 20206 + }, + { + "epoch": 1.93, + "grad_norm": 0.33557415318107847, + "learning_rate": 6.072681568786598e-07, + "loss": 0.9475, + "step": 20207 + }, + { + "epoch": 1.93, + "grad_norm": 0.30715188098597385, + "learning_rate": 6.055286465235455e-07, + "loss": 1.1129, + "step": 20208 + }, + { + "epoch": 1.93, + "grad_norm": 0.33740657982691613, + "learning_rate": 6.037916235775431e-07, + "loss": 1.0622, + "step": 20209 + }, + { + "epoch": 1.93, + "grad_norm": 0.3314134321135117, + "learning_rate": 6.020570880841181e-07, + "loss": 0.9867, + "step": 20210 + }, + { + "epoch": 1.93, + "grad_norm": 0.3886278733489299, + "learning_rate": 6.00325040086691e-07, + "loss": 1.0609, + "step": 20211 + }, + { + "epoch": 1.93, + "grad_norm": 0.3079091473309773, + "learning_rate": 5.985954796286053e-07, + "loss": 1.1557, + "step": 20212 + }, + { + "epoch": 1.93, + "grad_norm": 0.2852665869311275, + "learning_rate": 5.96868406753126e-07, + "loss": 0.9521, + "step": 20213 + }, + { + "epoch": 1.93, + "grad_norm": 0.30740113186713713, + "learning_rate": 5.951438215034966e-07, + "loss": 1.0673, + "step": 20214 + }, + { + "epoch": 1.93, + "grad_norm": 0.34303333393529745, + "learning_rate": 5.934217239228712e-07, + "loss": 1.0751, + "step": 20215 + }, + { + "epoch": 1.93, + "grad_norm": 0.29528516588074283, + "learning_rate": 5.917021140543266e-07, + "loss": 1.0716, + "step": 20216 + }, + { + "epoch": 1.93, + "grad_norm": 0.2947830041320994, + "learning_rate": 5.899849919409173e-07, + "loss": 0.9551, + "step": 20217 + }, + { + "epoch": 1.93, + "grad_norm": 0.29272507391232205, + "learning_rate": 5.882703576256199e-07, + "loss": 1.1047, + "step": 20218 + }, + { + "epoch": 1.93, + "grad_norm": 0.33487764898784783, + "learning_rate": 5.865582111513335e-07, + "loss": 1.004, + "step": 20219 + }, + { + "epoch": 1.93, + "grad_norm": 0.31530803006152386, + "learning_rate": 5.848485525609016e-07, + "loss": 0.8598, + "step": 20220 + }, + { + "epoch": 1.93, + "grad_norm": 0.37301251469724017, + "learning_rate": 5.831413818971121e-07, + "loss": 0.9526, + "step": 20221 + }, + { + "epoch": 1.93, + "grad_norm": 0.329854963648634, + "learning_rate": 5.814366992026976e-07, + "loss": 0.9079, + "step": 20222 + }, + { + "epoch": 1.93, + "grad_norm": 0.3359802653564421, + "learning_rate": 5.797345045203018e-07, + "loss": 0.9309, + "step": 20223 + }, + { + "epoch": 1.93, + "grad_norm": 0.3026427667588203, + "learning_rate": 5.780347978925348e-07, + "loss": 1.1127, + "step": 20224 + }, + { + "epoch": 1.93, + "grad_norm": 0.31960577397686013, + "learning_rate": 5.763375793619408e-07, + "loss": 1.075, + "step": 20225 + }, + { + "epoch": 1.94, + "grad_norm": 0.29711553698530996, + "learning_rate": 5.746428489709854e-07, + "loss": 0.9686, + "step": 20226 + }, + { + "epoch": 1.94, + "grad_norm": 0.34596679997151186, + "learning_rate": 5.729506067620683e-07, + "loss": 1.0675, + "step": 20227 + }, + { + "epoch": 1.94, + "grad_norm": 0.3222817268741538, + "learning_rate": 5.712608527775554e-07, + "loss": 1.0431, + "step": 20228 + }, + { + "epoch": 1.94, + "grad_norm": 0.32739886262469403, + "learning_rate": 5.695735870597352e-07, + "loss": 0.8924, + "step": 20229 + }, + { + "epoch": 1.94, + "grad_norm": 0.3361154313462858, + "learning_rate": 5.678888096508072e-07, + "loss": 0.9693, + "step": 20230 + }, + { + "epoch": 1.94, + "grad_norm": 0.30632880204969554, + "learning_rate": 5.662065205929711e-07, + "loss": 0.9218, + "step": 20231 + }, + { + "epoch": 1.94, + "grad_norm": 0.3335724829891256, + "learning_rate": 5.645267199283044e-07, + "loss": 0.944, + "step": 20232 + }, + { + "epoch": 1.94, + "grad_norm": 0.2864925187603438, + "learning_rate": 5.628494076988511e-07, + "loss": 1.0162, + "step": 20233 + }, + { + "epoch": 1.94, + "grad_norm": 0.30777059991078876, + "learning_rate": 5.611745839465776e-07, + "loss": 1.028, + "step": 20234 + }, + { + "epoch": 1.94, + "grad_norm": 0.3211446673085799, + "learning_rate": 5.595022487134061e-07, + "loss": 1.0007, + "step": 20235 + }, + { + "epoch": 1.94, + "grad_norm": 0.3002539218024416, + "learning_rate": 5.578324020411918e-07, + "loss": 1.047, + "step": 20236 + }, + { + "epoch": 1.94, + "grad_norm": 0.3889077661502927, + "learning_rate": 5.561650439717125e-07, + "loss": 1.0166, + "step": 20237 + }, + { + "epoch": 1.94, + "grad_norm": 0.3154686137562815, + "learning_rate": 5.545001745467016e-07, + "loss": 1.0004, + "step": 20238 + }, + { + "epoch": 1.94, + "grad_norm": 0.31096571241097604, + "learning_rate": 5.528377938078255e-07, + "loss": 0.9809, + "step": 20239 + }, + { + "epoch": 1.94, + "grad_norm": 0.2967803026621185, + "learning_rate": 5.511779017966845e-07, + "loss": 0.9773, + "step": 20240 + }, + { + "epoch": 1.94, + "grad_norm": 0.3215504580840157, + "learning_rate": 5.495204985548119e-07, + "loss": 1.1142, + "step": 20241 + }, + { + "epoch": 1.94, + "grad_norm": 0.33546628207956575, + "learning_rate": 5.47865584123708e-07, + "loss": 1.0957, + "step": 20242 + }, + { + "epoch": 1.94, + "grad_norm": 0.3274938082351406, + "learning_rate": 5.462131585447505e-07, + "loss": 0.9176, + "step": 20243 + }, + { + "epoch": 1.94, + "grad_norm": 0.3153511132608709, + "learning_rate": 5.445632218593288e-07, + "loss": 1.0152, + "step": 20244 + }, + { + "epoch": 1.94, + "grad_norm": 0.321343765957633, + "learning_rate": 5.429157741087099e-07, + "loss": 1.0273, + "step": 20245 + }, + { + "epoch": 1.94, + "grad_norm": 0.33530501475196184, + "learning_rate": 5.412708153341273e-07, + "loss": 1.0314, + "step": 20246 + }, + { + "epoch": 1.94, + "grad_norm": 0.29407818870185387, + "learning_rate": 5.396283455767592e-07, + "loss": 0.9543, + "step": 20247 + }, + { + "epoch": 1.94, + "grad_norm": 0.2851078719842078, + "learning_rate": 5.379883648776951e-07, + "loss": 0.9386, + "step": 20248 + }, + { + "epoch": 1.94, + "grad_norm": 0.3112723238663482, + "learning_rate": 5.363508732779799e-07, + "loss": 0.986, + "step": 20249 + }, + { + "epoch": 1.94, + "grad_norm": 0.32882726752404884, + "learning_rate": 5.347158708185917e-07, + "loss": 1.0323, + "step": 20250 + }, + { + "epoch": 1.94, + "grad_norm": 0.3450822346693115, + "learning_rate": 5.330833575404537e-07, + "loss": 1.0557, + "step": 20251 + }, + { + "epoch": 1.94, + "grad_norm": 0.3260276895310968, + "learning_rate": 5.314533334844329e-07, + "loss": 1.0283, + "step": 20252 + }, + { + "epoch": 1.94, + "grad_norm": 0.31167523038286493, + "learning_rate": 5.298257986912858e-07, + "loss": 1.0943, + "step": 20253 + }, + { + "epoch": 1.94, + "grad_norm": 0.26449616289597805, + "learning_rate": 5.282007532017685e-07, + "loss": 1.0538, + "step": 20254 + }, + { + "epoch": 1.94, + "grad_norm": 0.3666235552660005, + "learning_rate": 5.265781970565486e-07, + "loss": 0.9162, + "step": 20255 + }, + { + "epoch": 1.94, + "grad_norm": 0.2979971041755462, + "learning_rate": 5.249581302962159e-07, + "loss": 0.9684, + "step": 20256 + }, + { + "epoch": 1.94, + "grad_norm": 0.32013509370064663, + "learning_rate": 5.233405529613267e-07, + "loss": 1.1066, + "step": 20257 + }, + { + "epoch": 1.94, + "grad_norm": 0.32913632492055717, + "learning_rate": 5.217254650923708e-07, + "loss": 1.004, + "step": 20258 + }, + { + "epoch": 1.94, + "grad_norm": 0.33317916256272956, + "learning_rate": 5.201128667297383e-07, + "loss": 1.0392, + "step": 20259 + }, + { + "epoch": 1.94, + "grad_norm": 0.30698931979052324, + "learning_rate": 5.18502757913808e-07, + "loss": 0.9462, + "step": 20260 + }, + { + "epoch": 1.94, + "grad_norm": 0.30373492429655924, + "learning_rate": 5.168951386848697e-07, + "loss": 0.9979, + "step": 20261 + }, + { + "epoch": 1.94, + "grad_norm": 0.3397707315874904, + "learning_rate": 5.15290009083158e-07, + "loss": 1.19, + "step": 20262 + }, + { + "epoch": 1.94, + "grad_norm": 0.3943680735514357, + "learning_rate": 5.136873691488298e-07, + "loss": 0.9989, + "step": 20263 + }, + { + "epoch": 1.94, + "grad_norm": 0.3264043351180867, + "learning_rate": 5.120872189219972e-07, + "loss": 1.1036, + "step": 20264 + }, + { + "epoch": 1.94, + "grad_norm": 0.33839171227016734, + "learning_rate": 5.104895584427172e-07, + "loss": 0.9659, + "step": 20265 + }, + { + "epoch": 1.94, + "grad_norm": 0.32598802449999387, + "learning_rate": 5.088943877509578e-07, + "loss": 1.0991, + "step": 20266 + }, + { + "epoch": 1.94, + "grad_norm": 0.3203063372359266, + "learning_rate": 5.073017068866426e-07, + "loss": 0.8925, + "step": 20267 + }, + { + "epoch": 1.94, + "grad_norm": 0.3380047404232058, + "learning_rate": 5.057115158896397e-07, + "loss": 1.0936, + "step": 20268 + }, + { + "epoch": 1.94, + "grad_norm": 0.3086225977465178, + "learning_rate": 5.041238147997284e-07, + "loss": 1.0077, + "step": 20269 + }, + { + "epoch": 1.94, + "grad_norm": 0.33616215399718014, + "learning_rate": 5.025386036566548e-07, + "loss": 1.0447, + "step": 20270 + }, + { + "epoch": 1.94, + "grad_norm": 0.26417799037639594, + "learning_rate": 5.00955882500076e-07, + "loss": 0.8989, + "step": 20271 + }, + { + "epoch": 1.94, + "grad_norm": 0.33085171011551046, + "learning_rate": 4.993756513696158e-07, + "loss": 0.9771, + "step": 20272 + }, + { + "epoch": 1.94, + "grad_norm": 0.32384191290375863, + "learning_rate": 4.977979103048091e-07, + "loss": 1.0557, + "step": 20273 + }, + { + "epoch": 1.94, + "grad_norm": 0.314443116187827, + "learning_rate": 4.962226593451469e-07, + "loss": 1.0707, + "step": 20274 + }, + { + "epoch": 1.94, + "grad_norm": 0.341460869889988, + "learning_rate": 4.946498985300529e-07, + "loss": 1.119, + "step": 20275 + }, + { + "epoch": 1.94, + "grad_norm": 0.30297517048466804, + "learning_rate": 4.930796278988737e-07, + "loss": 1.0506, + "step": 20276 + }, + { + "epoch": 1.94, + "grad_norm": 0.32483427862857256, + "learning_rate": 4.915118474909108e-07, + "loss": 0.942, + "step": 20277 + }, + { + "epoch": 1.94, + "grad_norm": 0.3194790690090108, + "learning_rate": 4.899465573454109e-07, + "loss": 1.1029, + "step": 20278 + }, + { + "epoch": 1.94, + "grad_norm": 0.3339781971291216, + "learning_rate": 4.883837575015426e-07, + "loss": 0.8965, + "step": 20279 + }, + { + "epoch": 1.94, + "grad_norm": 0.35051512240294985, + "learning_rate": 4.86823447998408e-07, + "loss": 1.0494, + "step": 20280 + }, + { + "epoch": 1.94, + "grad_norm": 0.336140288115144, + "learning_rate": 4.852656288750534e-07, + "loss": 1.0337, + "step": 20281 + }, + { + "epoch": 1.94, + "grad_norm": 0.31484867885813533, + "learning_rate": 4.837103001704702e-07, + "loss": 1.0382, + "step": 20282 + }, + { + "epoch": 1.94, + "grad_norm": 0.26940112932589677, + "learning_rate": 4.821574619235825e-07, + "loss": 0.942, + "step": 20283 + }, + { + "epoch": 1.94, + "grad_norm": 0.310423662625401, + "learning_rate": 4.806071141732482e-07, + "loss": 1.0988, + "step": 20284 + }, + { + "epoch": 1.94, + "grad_norm": 0.34313033552603145, + "learning_rate": 4.790592569582697e-07, + "loss": 0.9064, + "step": 20285 + }, + { + "epoch": 1.94, + "grad_norm": 0.4048978825460347, + "learning_rate": 4.775138903173826e-07, + "loss": 1.0609, + "step": 20286 + }, + { + "epoch": 1.94, + "grad_norm": 0.303210791621742, + "learning_rate": 4.7597101428924483e-07, + "loss": 1.0273, + "step": 20287 + }, + { + "epoch": 1.94, + "grad_norm": 0.3150383220523822, + "learning_rate": 4.7443062891249223e-07, + "loss": 0.9958, + "step": 20288 + }, + { + "epoch": 1.94, + "grad_norm": 0.3282458643296684, + "learning_rate": 4.728927342256606e-07, + "loss": 0.9832, + "step": 20289 + }, + { + "epoch": 1.94, + "grad_norm": 0.3234858941040814, + "learning_rate": 4.713573302672414e-07, + "loss": 0.9829, + "step": 20290 + }, + { + "epoch": 1.94, + "grad_norm": 0.28823508370719286, + "learning_rate": 4.698244170756483e-07, + "loss": 0.8299, + "step": 20291 + }, + { + "epoch": 1.94, + "grad_norm": 0.31153335958405065, + "learning_rate": 4.682939946892506e-07, + "loss": 1.101, + "step": 20292 + }, + { + "epoch": 1.94, + "grad_norm": 0.35637174976020103, + "learning_rate": 4.667660631463511e-07, + "loss": 1.0254, + "step": 20293 + }, + { + "epoch": 1.94, + "grad_norm": 0.35780593344495076, + "learning_rate": 4.6524062248519687e-07, + "loss": 0.9108, + "step": 20294 + }, + { + "epoch": 1.94, + "grad_norm": 0.2801807586225428, + "learning_rate": 4.6371767274393517e-07, + "loss": 0.8702, + "step": 20295 + }, + { + "epoch": 1.94, + "grad_norm": 0.31770518423414856, + "learning_rate": 4.621972139607023e-07, + "loss": 1.1156, + "step": 20296 + }, + { + "epoch": 1.94, + "grad_norm": 0.29934375903282645, + "learning_rate": 4.606792461735454e-07, + "loss": 0.9706, + "step": 20297 + }, + { + "epoch": 1.94, + "grad_norm": 0.3566184118292018, + "learning_rate": 4.5916376942044535e-07, + "loss": 1.0455, + "step": 20298 + }, + { + "epoch": 1.94, + "grad_norm": 0.2788097652858448, + "learning_rate": 4.5765078373932737e-07, + "loss": 0.8525, + "step": 20299 + }, + { + "epoch": 1.94, + "grad_norm": 0.31667916618745245, + "learning_rate": 4.561402891680611e-07, + "loss": 1.0299, + "step": 20300 + }, + { + "epoch": 1.94, + "grad_norm": 0.29591378689360903, + "learning_rate": 4.546322857444496e-07, + "loss": 1.0054, + "step": 20301 + }, + { + "epoch": 1.94, + "grad_norm": 0.3251366487412229, + "learning_rate": 4.5312677350621836e-07, + "loss": 1.0647, + "step": 20302 + }, + { + "epoch": 1.94, + "grad_norm": 0.31714493849807807, + "learning_rate": 4.516237524910594e-07, + "loss": 1.0987, + "step": 20303 + }, + { + "epoch": 1.94, + "grad_norm": 0.30584333048864004, + "learning_rate": 4.5012322273657593e-07, + "loss": 1.0209, + "step": 20304 + }, + { + "epoch": 1.94, + "grad_norm": 0.28396181676710275, + "learning_rate": 4.4862518428032684e-07, + "loss": 1.0813, + "step": 20305 + }, + { + "epoch": 1.94, + "grad_norm": 0.3113360048830721, + "learning_rate": 4.4712963715979326e-07, + "loss": 0.9605, + "step": 20306 + }, + { + "epoch": 1.94, + "grad_norm": 0.3122660710798489, + "learning_rate": 4.456365814124119e-07, + "loss": 1.0237, + "step": 20307 + }, + { + "epoch": 1.94, + "grad_norm": 0.33816370133822554, + "learning_rate": 4.441460170755418e-07, + "loss": 1.0702, + "step": 20308 + }, + { + "epoch": 1.94, + "grad_norm": 0.3248870714305816, + "learning_rate": 4.4265794418648645e-07, + "loss": 1.0406, + "step": 20309 + }, + { + "epoch": 1.94, + "grad_norm": 0.32698851447851845, + "learning_rate": 4.4117236278248265e-07, + "loss": 1.0145, + "step": 20310 + }, + { + "epoch": 1.94, + "grad_norm": 0.3479181084554416, + "learning_rate": 4.396892729007118e-07, + "loss": 1.0113, + "step": 20311 + }, + { + "epoch": 1.94, + "grad_norm": 0.29857531512803204, + "learning_rate": 4.3820867457828873e-07, + "loss": 0.9193, + "step": 20312 + }, + { + "epoch": 1.94, + "grad_norm": 0.293182961264483, + "learning_rate": 4.3673056785226154e-07, + "loss": 0.9865, + "step": 20313 + }, + { + "epoch": 1.94, + "grad_norm": 0.3320039602329452, + "learning_rate": 4.352549527596339e-07, + "loss": 0.9007, + "step": 20314 + }, + { + "epoch": 1.94, + "grad_norm": 0.28559486581954063, + "learning_rate": 4.337818293373208e-07, + "loss": 1.0652, + "step": 20315 + }, + { + "epoch": 1.94, + "grad_norm": 0.3277309628943195, + "learning_rate": 4.3231119762219275e-07, + "loss": 1.0513, + "step": 20316 + }, + { + "epoch": 1.94, + "grad_norm": 0.3231448467587379, + "learning_rate": 4.308430576510536e-07, + "loss": 0.9267, + "step": 20317 + }, + { + "epoch": 1.94, + "grad_norm": 0.31389608952272835, + "learning_rate": 4.293774094606406e-07, + "loss": 1.0717, + "step": 20318 + }, + { + "epoch": 1.94, + "grad_norm": 0.34639874375618557, + "learning_rate": 4.2791425308763566e-07, + "loss": 0.9232, + "step": 20319 + }, + { + "epoch": 1.94, + "grad_norm": 0.3120721694691459, + "learning_rate": 4.264535885686538e-07, + "loss": 1.0271, + "step": 20320 + }, + { + "epoch": 1.94, + "grad_norm": 0.3449219522450688, + "learning_rate": 4.249954159402547e-07, + "loss": 1.0332, + "step": 20321 + }, + { + "epoch": 1.94, + "grad_norm": 0.3190370181545896, + "learning_rate": 4.235397352389314e-07, + "loss": 1.0163, + "step": 20322 + }, + { + "epoch": 1.94, + "grad_norm": 0.3174711949966816, + "learning_rate": 4.2208654650109925e-07, + "loss": 0.9714, + "step": 20323 + }, + { + "epoch": 1.94, + "grad_norm": 0.2730073407868398, + "learning_rate": 4.206358497631402e-07, + "loss": 0.9168, + "step": 20324 + }, + { + "epoch": 1.94, + "grad_norm": 0.34858817691803823, + "learning_rate": 4.1918764506134744e-07, + "loss": 0.9491, + "step": 20325 + }, + { + "epoch": 1.94, + "grad_norm": 0.33302952306021627, + "learning_rate": 4.177419324319698e-07, + "loss": 0.9899, + "step": 20326 + }, + { + "epoch": 1.94, + "grad_norm": 0.2799926701742036, + "learning_rate": 4.162987119111783e-07, + "loss": 0.8703, + "step": 20327 + }, + { + "epoch": 1.94, + "grad_norm": 0.31398489209314584, + "learning_rate": 4.148579835351107e-07, + "loss": 0.9651, + "step": 20328 + }, + { + "epoch": 1.94, + "grad_norm": 0.3131305998503574, + "learning_rate": 4.134197473397938e-07, + "loss": 1.0406, + "step": 20329 + }, + { + "epoch": 1.94, + "grad_norm": 0.307409739405903, + "learning_rate": 4.119840033612543e-07, + "loss": 1.0159, + "step": 20330 + }, + { + "epoch": 1.95, + "grad_norm": 0.2915556432224201, + "learning_rate": 4.105507516353857e-07, + "loss": 0.8923, + "step": 20331 + }, + { + "epoch": 1.95, + "grad_norm": 0.3088241287381776, + "learning_rate": 4.091199921980815e-07, + "loss": 1.0174, + "step": 20332 + }, + { + "epoch": 1.95, + "grad_norm": 0.33055561457345034, + "learning_rate": 4.076917250851353e-07, + "loss": 1.1038, + "step": 20333 + }, + { + "epoch": 1.95, + "grad_norm": 0.3048955049191091, + "learning_rate": 4.062659503323074e-07, + "loss": 1.1369, + "step": 20334 + }, + { + "epoch": 1.95, + "grad_norm": 0.31705919055312337, + "learning_rate": 4.048426679752582e-07, + "loss": 0.9679, + "step": 20335 + }, + { + "epoch": 1.95, + "grad_norm": 0.3629169804490354, + "learning_rate": 4.034218780496146e-07, + "loss": 1.0215, + "step": 20336 + }, + { + "epoch": 1.95, + "grad_norm": 0.3132316160978314, + "learning_rate": 4.0200358059093724e-07, + "loss": 0.9295, + "step": 20337 + }, + { + "epoch": 1.95, + "grad_norm": 0.28464435891886786, + "learning_rate": 4.005877756347087e-07, + "loss": 1.0656, + "step": 20338 + }, + { + "epoch": 1.95, + "grad_norm": 0.30682303104699893, + "learning_rate": 3.991744632163674e-07, + "loss": 1.0465, + "step": 20339 + }, + { + "epoch": 1.95, + "grad_norm": 0.3258311499151425, + "learning_rate": 3.97763643371285e-07, + "loss": 1.0404, + "step": 20340 + }, + { + "epoch": 1.95, + "grad_norm": 0.325899777764082, + "learning_rate": 3.9635531613476663e-07, + "loss": 1.0186, + "step": 20341 + }, + { + "epoch": 1.95, + "grad_norm": 0.3248497780445895, + "learning_rate": 3.949494815420507e-07, + "loss": 1.0276, + "step": 20342 + }, + { + "epoch": 1.95, + "grad_norm": 0.32096088385688165, + "learning_rate": 3.935461396283313e-07, + "loss": 1.0475, + "step": 20343 + }, + { + "epoch": 1.95, + "grad_norm": 0.35237911583403875, + "learning_rate": 3.9214529042870263e-07, + "loss": 1.0501, + "step": 20344 + }, + { + "epoch": 1.95, + "grad_norm": 0.37068349993091, + "learning_rate": 3.907469339782588e-07, + "loss": 0.9692, + "step": 20345 + }, + { + "epoch": 1.95, + "grad_norm": 0.3104981848961256, + "learning_rate": 3.8935107031196074e-07, + "loss": 1.1081, + "step": 20346 + }, + { + "epoch": 1.95, + "grad_norm": 0.2881010213716565, + "learning_rate": 3.8795769946476937e-07, + "loss": 1.0111, + "step": 20347 + }, + { + "epoch": 1.95, + "grad_norm": 0.2988561038812606, + "learning_rate": 3.8656682147152345e-07, + "loss": 1.0457, + "step": 20348 + }, + { + "epoch": 1.95, + "grad_norm": 0.28502005512068723, + "learning_rate": 3.8517843636706184e-07, + "loss": 0.9995, + "step": 20349 + }, + { + "epoch": 1.95, + "grad_norm": 0.3372924741802285, + "learning_rate": 3.8379254418611234e-07, + "loss": 0.9934, + "step": 20350 + }, + { + "epoch": 1.95, + "grad_norm": 0.2953411609104987, + "learning_rate": 3.824091449633582e-07, + "loss": 1.0289, + "step": 20351 + }, + { + "epoch": 1.95, + "grad_norm": 0.35302881623014765, + "learning_rate": 3.810282387334163e-07, + "loss": 1.0058, + "step": 20352 + }, + { + "epoch": 1.95, + "grad_norm": 0.3508001682428721, + "learning_rate": 3.7964982553085894e-07, + "loss": 0.9607, + "step": 20353 + }, + { + "epoch": 1.95, + "grad_norm": 0.2850551722729167, + "learning_rate": 3.782739053901696e-07, + "loss": 1.0213, + "step": 20354 + }, + { + "epoch": 1.95, + "grad_norm": 0.32559416685889214, + "learning_rate": 3.7690047834579853e-07, + "loss": 0.9507, + "step": 20355 + }, + { + "epoch": 1.95, + "grad_norm": 0.2981393487811312, + "learning_rate": 3.7552954443208497e-07, + "loss": 0.9381, + "step": 20356 + }, + { + "epoch": 1.95, + "grad_norm": 0.3326287172677537, + "learning_rate": 3.741611036833681e-07, + "loss": 0.944, + "step": 20357 + }, + { + "epoch": 1.95, + "grad_norm": 0.37224797310506597, + "learning_rate": 3.7279515613387605e-07, + "loss": 0.972, + "step": 20358 + }, + { + "epoch": 1.95, + "grad_norm": 0.3277747264054883, + "learning_rate": 3.7143170181780376e-07, + "loss": 1.0565, + "step": 20359 + }, + { + "epoch": 1.95, + "grad_norm": 0.3288202114346729, + "learning_rate": 3.700707407692683e-07, + "loss": 0.8876, + "step": 20360 + }, + { + "epoch": 1.95, + "grad_norm": 0.3141561477351356, + "learning_rate": 3.6871227302232023e-07, + "loss": 0.9704, + "step": 20361 + }, + { + "epoch": 1.95, + "grad_norm": 0.33555195164810336, + "learning_rate": 3.6735629861096576e-07, + "loss": 0.9862, + "step": 20362 + }, + { + "epoch": 1.95, + "grad_norm": 0.3299883684362604, + "learning_rate": 3.6600281756914433e-07, + "loss": 1.0823, + "step": 20363 + }, + { + "epoch": 1.95, + "grad_norm": 0.31891280858821935, + "learning_rate": 3.6465182993070666e-07, + "loss": 1.0402, + "step": 20364 + }, + { + "epoch": 1.95, + "grad_norm": 0.3091807098632681, + "learning_rate": 3.6330333572949236e-07, + "loss": 1.0916, + "step": 20365 + }, + { + "epoch": 1.95, + "grad_norm": 0.2725797193970479, + "learning_rate": 3.619573349992189e-07, + "loss": 0.9531, + "step": 20366 + }, + { + "epoch": 1.95, + "grad_norm": 0.3301743457788013, + "learning_rate": 3.6061382777358154e-07, + "loss": 0.965, + "step": 20367 + }, + { + "epoch": 1.95, + "grad_norm": 0.34980341517089597, + "learning_rate": 3.5927281408620895e-07, + "loss": 1.0417, + "step": 20368 + }, + { + "epoch": 1.95, + "grad_norm": 0.2660199408399253, + "learning_rate": 3.5793429397065204e-07, + "loss": 1.0497, + "step": 20369 + }, + { + "epoch": 1.95, + "grad_norm": 0.3224836389413272, + "learning_rate": 3.5659826746042844e-07, + "loss": 0.9468, + "step": 20370 + }, + { + "epoch": 1.95, + "grad_norm": 0.3242055879241137, + "learning_rate": 3.5526473458893374e-07, + "loss": 1.0814, + "step": 20371 + }, + { + "epoch": 1.95, + "grad_norm": 0.2844521812339122, + "learning_rate": 3.5393369538957444e-07, + "loss": 0.9758, + "step": 20372 + }, + { + "epoch": 1.95, + "grad_norm": 0.3069611240090361, + "learning_rate": 3.526051498956462e-07, + "loss": 1.027, + "step": 20373 + }, + { + "epoch": 1.95, + "grad_norm": 0.3027040568679486, + "learning_rate": 3.512790981404113e-07, + "loss": 1.0283, + "step": 20374 + }, + { + "epoch": 1.95, + "grad_norm": 0.29058113584022943, + "learning_rate": 3.4995554015703203e-07, + "loss": 0.995, + "step": 20375 + }, + { + "epoch": 1.95, + "grad_norm": 0.3383511580355953, + "learning_rate": 3.4863447597864863e-07, + "loss": 0.9561, + "step": 20376 + }, + { + "epoch": 1.95, + "grad_norm": 0.3693949169627847, + "learning_rate": 3.473159056383124e-07, + "loss": 0.9423, + "step": 20377 + }, + { + "epoch": 1.95, + "grad_norm": 0.2984526641507388, + "learning_rate": 3.459998291690303e-07, + "loss": 1.1253, + "step": 20378 + }, + { + "epoch": 1.95, + "grad_norm": 0.3299032815213594, + "learning_rate": 3.446862466037204e-07, + "loss": 1.0128, + "step": 20379 + }, + { + "epoch": 1.95, + "grad_norm": 0.2993996315793805, + "learning_rate": 3.433751579752897e-07, + "loss": 0.9224, + "step": 20380 + }, + { + "epoch": 1.95, + "grad_norm": 0.3328754324835419, + "learning_rate": 3.420665633165121e-07, + "loss": 0.9523, + "step": 20381 + }, + { + "epoch": 1.95, + "grad_norm": 0.35392423303659676, + "learning_rate": 3.4076046266015013e-07, + "loss": 1.0888, + "step": 20382 + }, + { + "epoch": 1.95, + "grad_norm": 0.3134629886815614, + "learning_rate": 3.3945685603889996e-07, + "loss": 0.8739, + "step": 20383 + }, + { + "epoch": 1.95, + "grad_norm": 0.302597987051284, + "learning_rate": 3.381557434853688e-07, + "loss": 0.9899, + "step": 20384 + }, + { + "epoch": 1.95, + "grad_norm": 0.3263964836862991, + "learning_rate": 3.368571250321306e-07, + "loss": 1.0076, + "step": 20385 + }, + { + "epoch": 1.95, + "grad_norm": 0.28418175423505376, + "learning_rate": 3.355610007116705e-07, + "loss": 0.9512, + "step": 20386 + }, + { + "epoch": 1.95, + "grad_norm": 0.3266369287304834, + "learning_rate": 3.342673705564403e-07, + "loss": 0.9293, + "step": 20387 + }, + { + "epoch": 1.95, + "grad_norm": 0.29363107422881696, + "learning_rate": 3.32976234598803e-07, + "loss": 1.0223, + "step": 20388 + }, + { + "epoch": 1.95, + "grad_norm": 0.35951401181666354, + "learning_rate": 3.316875928710772e-07, + "loss": 0.9543, + "step": 20389 + }, + { + "epoch": 1.95, + "grad_norm": 0.3127308846125641, + "learning_rate": 3.304014454055038e-07, + "loss": 0.9593, + "step": 20390 + }, + { + "epoch": 1.95, + "grad_norm": 0.33751532709279447, + "learning_rate": 3.291177922342792e-07, + "loss": 1.0695, + "step": 20391 + }, + { + "epoch": 1.95, + "grad_norm": 0.3132492737996188, + "learning_rate": 3.278366333895222e-07, + "loss": 0.9563, + "step": 20392 + }, + { + "epoch": 1.95, + "grad_norm": 0.3024344940502603, + "learning_rate": 3.26557968903296e-07, + "loss": 0.8752, + "step": 20393 + }, + { + "epoch": 1.95, + "grad_norm": 0.3354374415425633, + "learning_rate": 3.2528179880759734e-07, + "loss": 0.9995, + "step": 20394 + }, + { + "epoch": 1.95, + "grad_norm": 0.3564267069465302, + "learning_rate": 3.2400812313436724e-07, + "loss": 1.0568, + "step": 20395 + }, + { + "epoch": 1.95, + "grad_norm": 0.30280445476997486, + "learning_rate": 3.227369419154691e-07, + "loss": 0.9618, + "step": 20396 + }, + { + "epoch": 1.95, + "grad_norm": 0.31302675280539355, + "learning_rate": 3.2146825518273307e-07, + "loss": 1.0522, + "step": 20397 + }, + { + "epoch": 1.95, + "grad_norm": 0.31815831476924855, + "learning_rate": 3.2020206296790035e-07, + "loss": 1.0171, + "step": 20398 + }, + { + "epoch": 1.95, + "grad_norm": 0.313441996584062, + "learning_rate": 3.1893836530264563e-07, + "loss": 0.9815, + "step": 20399 + }, + { + "epoch": 1.95, + "grad_norm": 0.33852974822346965, + "learning_rate": 3.176771622186103e-07, + "loss": 1.0382, + "step": 20400 + }, + { + "epoch": 1.95, + "grad_norm": 0.3121047357088149, + "learning_rate": 3.16418453747358e-07, + "loss": 0.8846, + "step": 20401 + }, + { + "epoch": 1.95, + "grad_norm": 0.3118990892423256, + "learning_rate": 3.1516223992038573e-07, + "loss": 1.0538, + "step": 20402 + }, + { + "epoch": 1.95, + "grad_norm": 0.2880688037427415, + "learning_rate": 3.139085207691128e-07, + "loss": 1.0357, + "step": 20403 + }, + { + "epoch": 1.95, + "grad_norm": 0.2976784581166368, + "learning_rate": 3.1265729632494744e-07, + "loss": 1.0776, + "step": 20404 + }, + { + "epoch": 1.95, + "grad_norm": 0.35906073962759616, + "learning_rate": 3.1140856661917573e-07, + "loss": 1.1303, + "step": 20405 + }, + { + "epoch": 1.95, + "grad_norm": 0.3715334168972156, + "learning_rate": 3.1016233168305044e-07, + "loss": 0.9857, + "step": 20406 + }, + { + "epoch": 1.95, + "grad_norm": 0.33634816547479895, + "learning_rate": 3.089185915477688e-07, + "loss": 0.9546, + "step": 20407 + }, + { + "epoch": 1.95, + "grad_norm": 0.3282827983386865, + "learning_rate": 3.0767734624446156e-07, + "loss": 1.0948, + "step": 20408 + }, + { + "epoch": 1.95, + "grad_norm": 0.3240701762090972, + "learning_rate": 3.0643859580417045e-07, + "loss": 0.9614, + "step": 20409 + }, + { + "epoch": 1.95, + "grad_norm": 0.3101700524101439, + "learning_rate": 3.0520234025792627e-07, + "loss": 0.9905, + "step": 20410 + }, + { + "epoch": 1.95, + "grad_norm": 0.3643937935745227, + "learning_rate": 3.039685796366376e-07, + "loss": 0.9395, + "step": 20411 + }, + { + "epoch": 1.95, + "grad_norm": 0.3007752698532387, + "learning_rate": 3.027373139711909e-07, + "loss": 0.9676, + "step": 20412 + }, + { + "epoch": 1.95, + "grad_norm": 0.3486817608311987, + "learning_rate": 3.0150854329239473e-07, + "loss": 0.9658, + "step": 20413 + }, + { + "epoch": 1.95, + "grad_norm": 0.3138096994522684, + "learning_rate": 3.002822676310135e-07, + "loss": 1.0, + "step": 20414 + }, + { + "epoch": 1.95, + "grad_norm": 0.27979212331830333, + "learning_rate": 2.9905848701773374e-07, + "loss": 0.9424, + "step": 20415 + }, + { + "epoch": 1.95, + "grad_norm": 0.3052464917310288, + "learning_rate": 2.978372014831643e-07, + "loss": 1.0052, + "step": 20416 + }, + { + "epoch": 1.95, + "grad_norm": 0.30395708227566104, + "learning_rate": 2.966184110578807e-07, + "loss": 0.9923, + "step": 20417 + }, + { + "epoch": 1.95, + "grad_norm": 0.33903583393869585, + "learning_rate": 2.9540211577239183e-07, + "loss": 1.0266, + "step": 20418 + }, + { + "epoch": 1.95, + "grad_norm": 0.306405934699315, + "learning_rate": 2.941883156571179e-07, + "loss": 1.055, + "step": 20419 + }, + { + "epoch": 1.95, + "grad_norm": 0.3130101802956793, + "learning_rate": 2.9297701074244563e-07, + "loss": 1.0452, + "step": 20420 + }, + { + "epoch": 1.95, + "grad_norm": 0.31301341614067185, + "learning_rate": 2.917682010586842e-07, + "loss": 1.1243, + "step": 20421 + }, + { + "epoch": 1.95, + "grad_norm": 0.26306108934413464, + "learning_rate": 2.905618866360982e-07, + "loss": 0.8956, + "step": 20422 + }, + { + "epoch": 1.95, + "grad_norm": 0.3058354499552484, + "learning_rate": 2.893580675048524e-07, + "loss": 1.0281, + "step": 20423 + }, + { + "epoch": 1.95, + "grad_norm": 0.2889667357374862, + "learning_rate": 2.8815674369508937e-07, + "loss": 0.9952, + "step": 20424 + }, + { + "epoch": 1.95, + "grad_norm": 0.36199333670874495, + "learning_rate": 2.869579152368851e-07, + "loss": 1.1169, + "step": 20425 + }, + { + "epoch": 1.95, + "grad_norm": 0.3500196780878492, + "learning_rate": 2.8576158216020443e-07, + "loss": 1.0404, + "step": 20426 + }, + { + "epoch": 1.95, + "grad_norm": 0.294090982879717, + "learning_rate": 2.8456774449502345e-07, + "loss": 0.961, + "step": 20427 + }, + { + "epoch": 1.95, + "grad_norm": 0.30200824420578937, + "learning_rate": 2.83376402271196e-07, + "loss": 1.1123, + "step": 20428 + }, + { + "epoch": 1.95, + "grad_norm": 0.29356917325391213, + "learning_rate": 2.821875555185538e-07, + "loss": 0.96, + "step": 20429 + }, + { + "epoch": 1.95, + "grad_norm": 0.3224844762820428, + "learning_rate": 2.8100120426682863e-07, + "loss": 1.0386, + "step": 20430 + }, + { + "epoch": 1.95, + "grad_norm": 0.3124885453253642, + "learning_rate": 2.7981734854573003e-07, + "loss": 0.9934, + "step": 20431 + }, + { + "epoch": 1.95, + "grad_norm": 0.3016693807044563, + "learning_rate": 2.786359883848566e-07, + "loss": 1.0095, + "step": 20432 + }, + { + "epoch": 1.95, + "grad_norm": 0.29569433830282504, + "learning_rate": 2.7745712381380685e-07, + "loss": 1.0322, + "step": 20433 + }, + { + "epoch": 1.95, + "grad_norm": 0.33434255025117887, + "learning_rate": 2.7628075486205716e-07, + "loss": 1.01, + "step": 20434 + }, + { + "epoch": 1.96, + "grad_norm": 0.34916646354912734, + "learning_rate": 2.7510688155906186e-07, + "loss": 1.1197, + "step": 20435 + }, + { + "epoch": 1.96, + "grad_norm": 0.27487736634311605, + "learning_rate": 2.739355039341751e-07, + "loss": 0.9227, + "step": 20436 + }, + { + "epoch": 1.96, + "grad_norm": 0.3110705659204715, + "learning_rate": 2.727666220167513e-07, + "loss": 0.9559, + "step": 20437 + }, + { + "epoch": 1.96, + "grad_norm": 0.30439525864342937, + "learning_rate": 2.7160023583600037e-07, + "loss": 0.9197, + "step": 20438 + }, + { + "epoch": 1.96, + "grad_norm": 0.2900750505484709, + "learning_rate": 2.7043634542114336e-07, + "loss": 0.9412, + "step": 20439 + }, + { + "epoch": 1.96, + "grad_norm": 0.3170282788145645, + "learning_rate": 2.692749508012793e-07, + "loss": 1.0357, + "step": 20440 + }, + { + "epoch": 1.96, + "grad_norm": 0.33862108640236216, + "learning_rate": 2.6811605200550706e-07, + "loss": 1.094, + "step": 20441 + }, + { + "epoch": 1.96, + "grad_norm": 0.34364044594530524, + "learning_rate": 2.6695964906279235e-07, + "loss": 0.9982, + "step": 20442 + }, + { + "epoch": 1.96, + "grad_norm": 0.284149629323534, + "learning_rate": 2.6580574200210094e-07, + "loss": 0.9857, + "step": 20443 + }, + { + "epoch": 1.96, + "grad_norm": 0.3026187717446175, + "learning_rate": 2.646543308523097e-07, + "loss": 1.0555, + "step": 20444 + }, + { + "epoch": 1.96, + "grad_norm": 0.28141937155480595, + "learning_rate": 2.635054156422068e-07, + "loss": 1.1251, + "step": 20445 + }, + { + "epoch": 1.96, + "grad_norm": 0.29272309848732875, + "learning_rate": 2.6235899640058013e-07, + "loss": 1.0156, + "step": 20446 + }, + { + "epoch": 1.96, + "grad_norm": 0.3040073282426707, + "learning_rate": 2.612150731560958e-07, + "loss": 0.9726, + "step": 20447 + }, + { + "epoch": 1.96, + "grad_norm": 0.33042129560690836, + "learning_rate": 2.600736459373754e-07, + "loss": 1.0593, + "step": 20448 + }, + { + "epoch": 1.96, + "grad_norm": 0.3119188072170354, + "learning_rate": 2.5893471477300704e-07, + "loss": 0.8523, + "step": 20449 + }, + { + "epoch": 1.96, + "grad_norm": 0.2819895546840128, + "learning_rate": 2.5779827969149017e-07, + "loss": 0.9132, + "step": 20450 + }, + { + "epoch": 1.96, + "grad_norm": 0.29296878699795487, + "learning_rate": 2.566643407212466e-07, + "loss": 1.1118, + "step": 20451 + }, + { + "epoch": 1.96, + "grad_norm": 0.28928181937381253, + "learning_rate": 2.555328978906646e-07, + "loss": 1.072, + "step": 20452 + }, + { + "epoch": 1.96, + "grad_norm": 0.29090460038656957, + "learning_rate": 2.5440395122806604e-07, + "loss": 1.0366, + "step": 20453 + }, + { + "epoch": 1.96, + "grad_norm": 0.3015749191762644, + "learning_rate": 2.53277500761695e-07, + "loss": 0.9898, + "step": 20454 + }, + { + "epoch": 1.96, + "grad_norm": 0.30114075964655895, + "learning_rate": 2.5215354651972887e-07, + "loss": 1.075, + "step": 20455 + }, + { + "epoch": 1.96, + "grad_norm": 0.3128983783900397, + "learning_rate": 2.51032088530323e-07, + "loss": 0.9531, + "step": 20456 + }, + { + "epoch": 1.96, + "grad_norm": 0.3566338545542135, + "learning_rate": 2.4991312682153265e-07, + "loss": 0.9517, + "step": 20457 + }, + { + "epoch": 1.96, + "grad_norm": 0.3004693261054401, + "learning_rate": 2.487966614213466e-07, + "loss": 0.9572, + "step": 20458 + }, + { + "epoch": 1.96, + "grad_norm": 0.32484394703050873, + "learning_rate": 2.4768269235772025e-07, + "loss": 1.0268, + "step": 20459 + }, + { + "epoch": 1.96, + "grad_norm": 0.2760132845394392, + "learning_rate": 2.4657121965853126e-07, + "loss": 1.0638, + "step": 20460 + }, + { + "epoch": 1.96, + "grad_norm": 0.29110415104196163, + "learning_rate": 2.454622433515796e-07, + "loss": 1.1253, + "step": 20461 + }, + { + "epoch": 1.96, + "grad_norm": 0.2866953503359657, + "learning_rate": 2.4435576346463207e-07, + "loss": 0.9873, + "step": 20462 + }, + { + "epoch": 1.96, + "grad_norm": 0.3531830700225617, + "learning_rate": 2.4325178002537754e-07, + "loss": 0.9286, + "step": 20463 + }, + { + "epoch": 1.96, + "grad_norm": 0.35127737858458363, + "learning_rate": 2.421502930614383e-07, + "loss": 0.9771, + "step": 20464 + }, + { + "epoch": 1.96, + "grad_norm": 0.34293675333549356, + "learning_rate": 2.4105130260039243e-07, + "loss": 1.0732, + "step": 20465 + }, + { + "epoch": 1.96, + "grad_norm": 0.2984151703848057, + "learning_rate": 2.39954808669729e-07, + "loss": 0.9518, + "step": 20466 + }, + { + "epoch": 1.96, + "grad_norm": 0.2932479471231924, + "learning_rate": 2.3886081129688154e-07, + "loss": 0.9915, + "step": 20467 + }, + { + "epoch": 1.96, + "grad_norm": 0.3202066422444072, + "learning_rate": 2.3776931050926154e-07, + "loss": 1.0405, + "step": 20468 + }, + { + "epoch": 1.96, + "grad_norm": 0.3183787031510111, + "learning_rate": 2.3668030633414716e-07, + "loss": 1.0763, + "step": 20469 + }, + { + "epoch": 1.96, + "grad_norm": 0.3032493730740976, + "learning_rate": 2.355937987988055e-07, + "loss": 1.0413, + "step": 20470 + }, + { + "epoch": 1.96, + "grad_norm": 0.3428814872750879, + "learning_rate": 2.34509787930437e-07, + "loss": 0.946, + "step": 20471 + }, + { + "epoch": 1.96, + "grad_norm": 0.33273807828335245, + "learning_rate": 2.334282737561644e-07, + "loss": 0.9901, + "step": 20472 + }, + { + "epoch": 1.96, + "grad_norm": 0.2811087421157265, + "learning_rate": 2.3234925630304384e-07, + "loss": 1.0042, + "step": 20473 + }, + { + "epoch": 1.96, + "grad_norm": 0.2989755929481172, + "learning_rate": 2.3127273559808705e-07, + "loss": 1.1224, + "step": 20474 + }, + { + "epoch": 1.96, + "grad_norm": 0.29336925412092707, + "learning_rate": 2.3019871166822804e-07, + "loss": 1.073, + "step": 20475 + }, + { + "epoch": 1.96, + "grad_norm": 0.3387624250716984, + "learning_rate": 2.2912718454035643e-07, + "loss": 0.9752, + "step": 20476 + }, + { + "epoch": 1.96, + "grad_norm": 0.3630323100615506, + "learning_rate": 2.2805815424127296e-07, + "loss": 0.9815, + "step": 20477 + }, + { + "epoch": 1.96, + "grad_norm": 0.327809359922079, + "learning_rate": 2.2699162079774516e-07, + "loss": 1.0751, + "step": 20478 + }, + { + "epoch": 1.96, + "grad_norm": 0.29116001741429376, + "learning_rate": 2.2592758423645167e-07, + "loss": 1.0607, + "step": 20479 + }, + { + "epoch": 1.96, + "grad_norm": 0.34176511640667384, + "learning_rate": 2.2486604458402673e-07, + "loss": 1.0068, + "step": 20480 + }, + { + "epoch": 1.96, + "grad_norm": 0.3218440377116705, + "learning_rate": 2.2380700186703795e-07, + "loss": 1.0373, + "step": 20481 + }, + { + "epoch": 1.96, + "grad_norm": 0.3413727855279227, + "learning_rate": 2.227504561119864e-07, + "loss": 1.0681, + "step": 20482 + }, + { + "epoch": 1.96, + "grad_norm": 0.35672769839506, + "learning_rate": 2.216964073453065e-07, + "loss": 1.1357, + "step": 20483 + }, + { + "epoch": 1.96, + "grad_norm": 0.3408506173267495, + "learning_rate": 2.2064485559338822e-07, + "loss": 0.9602, + "step": 20484 + }, + { + "epoch": 1.96, + "grad_norm": 0.3240021736686785, + "learning_rate": 2.1959580088254384e-07, + "loss": 0.933, + "step": 20485 + }, + { + "epoch": 1.96, + "grad_norm": 0.3256947806663534, + "learning_rate": 2.1854924323901905e-07, + "loss": 0.9459, + "step": 20486 + }, + { + "epoch": 1.96, + "grad_norm": 0.3064820741220329, + "learning_rate": 2.175051826890151e-07, + "loss": 1.0151, + "step": 20487 + }, + { + "epoch": 1.96, + "grad_norm": 0.28099740773488624, + "learning_rate": 2.1646361925864444e-07, + "loss": 0.9624, + "step": 20488 + }, + { + "epoch": 1.96, + "grad_norm": 0.30059077499683584, + "learning_rate": 2.1542455297398622e-07, + "loss": 0.9638, + "step": 20489 + }, + { + "epoch": 1.96, + "grad_norm": 0.3917995251198406, + "learning_rate": 2.1438798386105298e-07, + "loss": 1.0444, + "step": 20490 + }, + { + "epoch": 1.96, + "grad_norm": 0.3171042774353573, + "learning_rate": 2.1335391194575727e-07, + "loss": 1.0237, + "step": 20491 + }, + { + "epoch": 1.96, + "grad_norm": 0.28967736317369264, + "learning_rate": 2.1232233725400064e-07, + "loss": 1.0696, + "step": 20492 + }, + { + "epoch": 1.96, + "grad_norm": 0.3192046730222756, + "learning_rate": 2.1129325981159576e-07, + "loss": 1.0244, + "step": 20493 + }, + { + "epoch": 1.96, + "grad_norm": 0.30495201666153693, + "learning_rate": 2.102666796442887e-07, + "loss": 0.9662, + "step": 20494 + }, + { + "epoch": 1.96, + "grad_norm": 0.30609514240337166, + "learning_rate": 2.0924259677777004e-07, + "loss": 0.9598, + "step": 20495 + }, + { + "epoch": 1.96, + "grad_norm": 0.31324549788007455, + "learning_rate": 2.0822101123767478e-07, + "loss": 1.1082, + "step": 20496 + }, + { + "epoch": 1.96, + "grad_norm": 0.30811035467137626, + "learning_rate": 2.072019230495603e-07, + "loss": 1.0683, + "step": 20497 + }, + { + "epoch": 1.96, + "grad_norm": 0.32702546809252786, + "learning_rate": 2.0618533223895064e-07, + "loss": 1.0554, + "step": 20498 + }, + { + "epoch": 1.96, + "grad_norm": 0.27297749934258586, + "learning_rate": 2.0517123883125878e-07, + "loss": 1.0564, + "step": 20499 + }, + { + "epoch": 1.96, + "grad_norm": 0.3163885954173541, + "learning_rate": 2.041596428518755e-07, + "loss": 0.947, + "step": 20500 + }, + { + "epoch": 1.96, + "grad_norm": 0.32093326863211896, + "learning_rate": 2.031505443261139e-07, + "loss": 1.0653, + "step": 20501 + }, + { + "epoch": 1.96, + "grad_norm": 0.29571509284678865, + "learning_rate": 2.0214394327923158e-07, + "loss": 0.8198, + "step": 20502 + }, + { + "epoch": 1.96, + "grad_norm": 0.29782737846336277, + "learning_rate": 2.0113983973641948e-07, + "loss": 0.9677, + "step": 20503 + }, + { + "epoch": 1.96, + "grad_norm": 0.28729207674307317, + "learning_rate": 2.0013823372281304e-07, + "loss": 1.0884, + "step": 20504 + }, + { + "epoch": 1.96, + "grad_norm": 0.31492913364964276, + "learning_rate": 1.991391252634589e-07, + "loss": 0.9743, + "step": 20505 + }, + { + "epoch": 1.96, + "grad_norm": 0.3267842481281692, + "learning_rate": 1.9814251438337038e-07, + "loss": 1.0285, + "step": 20506 + }, + { + "epoch": 1.96, + "grad_norm": 0.3446299767822215, + "learning_rate": 1.971484011074942e-07, + "loss": 1.0567, + "step": 20507 + }, + { + "epoch": 1.96, + "grad_norm": 0.30087620210288984, + "learning_rate": 1.9615678546069938e-07, + "loss": 0.9433, + "step": 20508 + }, + { + "epoch": 1.96, + "grad_norm": 0.29851433206263905, + "learning_rate": 1.9516766746779936e-07, + "loss": 1.0909, + "step": 20509 + }, + { + "epoch": 1.96, + "grad_norm": 0.3142086893077652, + "learning_rate": 1.9418104715356323e-07, + "loss": 0.961, + "step": 20510 + }, + { + "epoch": 1.96, + "grad_norm": 0.32699570451034327, + "learning_rate": 1.9319692454267124e-07, + "loss": 1.015, + "step": 20511 + }, + { + "epoch": 1.96, + "grad_norm": 0.3161056261940681, + "learning_rate": 1.9221529965974817e-07, + "loss": 1.1811, + "step": 20512 + }, + { + "epoch": 1.96, + "grad_norm": 0.3497840574422665, + "learning_rate": 1.9123617252936321e-07, + "loss": 0.9732, + "step": 20513 + }, + { + "epoch": 1.96, + "grad_norm": 0.27194687046741817, + "learning_rate": 1.9025954317601903e-07, + "loss": 1.0103, + "step": 20514 + }, + { + "epoch": 1.96, + "grad_norm": 0.28928833529282094, + "learning_rate": 1.892854116241627e-07, + "loss": 1.1341, + "step": 20515 + }, + { + "epoch": 1.96, + "grad_norm": 0.28775963543398936, + "learning_rate": 1.8831377789816363e-07, + "loss": 1.085, + "step": 20516 + }, + { + "epoch": 1.96, + "grad_norm": 0.3136003404374992, + "learning_rate": 1.873446420223468e-07, + "loss": 1.066, + "step": 20517 + }, + { + "epoch": 1.96, + "grad_norm": 0.35247874012495595, + "learning_rate": 1.863780040209595e-07, + "loss": 1.1178, + "step": 20518 + }, + { + "epoch": 1.96, + "grad_norm": 0.30931404639303695, + "learning_rate": 1.8541386391819348e-07, + "loss": 1.0764, + "step": 20519 + }, + { + "epoch": 1.96, + "grad_norm": 0.2878663423692716, + "learning_rate": 1.8445222173817388e-07, + "loss": 1.1345, + "step": 20520 + }, + { + "epoch": 1.96, + "grad_norm": 0.28419709411618377, + "learning_rate": 1.8349307750497037e-07, + "loss": 0.9975, + "step": 20521 + }, + { + "epoch": 1.96, + "grad_norm": 0.3682947747737957, + "learning_rate": 1.825364312425859e-07, + "loss": 1.1109, + "step": 20522 + }, + { + "epoch": 1.96, + "grad_norm": 0.3185465974640335, + "learning_rate": 1.8158228297495695e-07, + "loss": 0.957, + "step": 20523 + }, + { + "epoch": 1.96, + "grad_norm": 0.34006638907526804, + "learning_rate": 1.8063063272596437e-07, + "loss": 1.0607, + "step": 20524 + }, + { + "epoch": 1.96, + "grad_norm": 0.32901540162079346, + "learning_rate": 1.7968148051943356e-07, + "loss": 1.0171, + "step": 20525 + }, + { + "epoch": 1.96, + "grad_norm": 0.2888825599382331, + "learning_rate": 1.787348263791011e-07, + "loss": 0.9562, + "step": 20526 + }, + { + "epoch": 1.96, + "grad_norm": 0.3221520300821555, + "learning_rate": 1.7779067032865916e-07, + "loss": 0.9988, + "step": 20527 + }, + { + "epoch": 1.96, + "grad_norm": 0.2902303091386519, + "learning_rate": 1.7684901239175545e-07, + "loss": 1.0802, + "step": 20528 + }, + { + "epoch": 1.96, + "grad_norm": 0.2939547955046905, + "learning_rate": 1.7590985259192672e-07, + "loss": 1.1069, + "step": 20529 + }, + { + "epoch": 1.96, + "grad_norm": 0.3005791042054773, + "learning_rate": 1.749731909526986e-07, + "loss": 0.8911, + "step": 20530 + }, + { + "epoch": 1.96, + "grad_norm": 0.26954892792294105, + "learning_rate": 1.7403902749750788e-07, + "loss": 0.9686, + "step": 20531 + }, + { + "epoch": 1.96, + "grad_norm": 0.31699504598567524, + "learning_rate": 1.731073622497248e-07, + "loss": 1.0531, + "step": 20532 + }, + { + "epoch": 1.96, + "grad_norm": 0.32529000900695726, + "learning_rate": 1.7217819523266398e-07, + "loss": 1.0053, + "step": 20533 + }, + { + "epoch": 1.96, + "grad_norm": 0.3368318784940214, + "learning_rate": 1.7125152646959576e-07, + "loss": 1.0389, + "step": 20534 + }, + { + "epoch": 1.96, + "grad_norm": 0.29474943063248, + "learning_rate": 1.7032735598369044e-07, + "loss": 1.1043, + "step": 20535 + }, + { + "epoch": 1.96, + "grad_norm": 0.3391739348545085, + "learning_rate": 1.6940568379808507e-07, + "loss": 1.0574, + "step": 20536 + }, + { + "epoch": 1.96, + "grad_norm": 0.33881299861195496, + "learning_rate": 1.6848650993583904e-07, + "loss": 1.0446, + "step": 20537 + }, + { + "epoch": 1.96, + "grad_norm": 0.33927251176361606, + "learning_rate": 1.675698344199672e-07, + "loss": 1.0719, + "step": 20538 + }, + { + "epoch": 1.96, + "grad_norm": 0.2676704151985491, + "learning_rate": 1.6665565727339572e-07, + "loss": 0.8159, + "step": 20539 + }, + { + "epoch": 1.97, + "grad_norm": 0.2952676779267439, + "learning_rate": 1.6574397851900626e-07, + "loss": 0.9908, + "step": 20540 + }, + { + "epoch": 1.97, + "grad_norm": 0.2649699698255593, + "learning_rate": 1.64834798179625e-07, + "loss": 0.8655, + "step": 20541 + }, + { + "epoch": 1.97, + "grad_norm": 0.28451303443244147, + "learning_rate": 1.6392811627800043e-07, + "loss": 1.1223, + "step": 20542 + }, + { + "epoch": 1.97, + "grad_norm": 0.31112959691873493, + "learning_rate": 1.6302393283680329e-07, + "loss": 1.0492, + "step": 20543 + }, + { + "epoch": 1.97, + "grad_norm": 0.2928978827083986, + "learning_rate": 1.6212224787869324e-07, + "loss": 1.004, + "step": 20544 + }, + { + "epoch": 1.97, + "grad_norm": 0.3493822462478057, + "learning_rate": 1.612230614262189e-07, + "loss": 0.9816, + "step": 20545 + }, + { + "epoch": 1.97, + "grad_norm": 0.3413102687138707, + "learning_rate": 1.603263735018734e-07, + "loss": 1.0754, + "step": 20546 + }, + { + "epoch": 1.97, + "grad_norm": 0.33380117043466867, + "learning_rate": 1.5943218412811657e-07, + "loss": 1.0946, + "step": 20547 + }, + { + "epoch": 1.97, + "grad_norm": 0.28758801639296405, + "learning_rate": 1.5854049332730825e-07, + "loss": 1.0224, + "step": 20548 + }, + { + "epoch": 1.97, + "grad_norm": 0.2951187990427071, + "learning_rate": 1.5765130112177506e-07, + "loss": 0.8846, + "step": 20549 + }, + { + "epoch": 1.97, + "grad_norm": 0.3042776020681916, + "learning_rate": 1.5676460753377697e-07, + "loss": 1.0803, + "step": 20550 + }, + { + "epoch": 1.97, + "grad_norm": 0.3032287977064069, + "learning_rate": 1.5588041258548514e-07, + "loss": 0.9731, + "step": 20551 + }, + { + "epoch": 1.97, + "grad_norm": 0.2921920475105017, + "learning_rate": 1.5499871629903738e-07, + "loss": 0.9854, + "step": 20552 + }, + { + "epoch": 1.97, + "grad_norm": 0.31800217701467065, + "learning_rate": 1.541195186964939e-07, + "loss": 1.0568, + "step": 20553 + }, + { + "epoch": 1.97, + "grad_norm": 0.2845630752697774, + "learning_rate": 1.5324281979985922e-07, + "loss": 0.8993, + "step": 20554 + }, + { + "epoch": 1.97, + "grad_norm": 0.3223883168642679, + "learning_rate": 1.523686196310825e-07, + "loss": 0.9921, + "step": 20555 + }, + { + "epoch": 1.97, + "grad_norm": 0.32189451479672604, + "learning_rate": 1.5149691821202406e-07, + "loss": 0.963, + "step": 20556 + }, + { + "epoch": 1.97, + "grad_norm": 0.2798664490501485, + "learning_rate": 1.5062771556451084e-07, + "loss": 1.0231, + "step": 20557 + }, + { + "epoch": 1.97, + "grad_norm": 0.293064132294947, + "learning_rate": 1.497610117102921e-07, + "loss": 0.9589, + "step": 20558 + }, + { + "epoch": 1.97, + "grad_norm": 0.31844828612421167, + "learning_rate": 1.4889680667106164e-07, + "loss": 0.9595, + "step": 20559 + }, + { + "epoch": 1.97, + "grad_norm": 0.3131688295632413, + "learning_rate": 1.4803510046843549e-07, + "loss": 1.0029, + "step": 20560 + }, + { + "epoch": 1.97, + "grad_norm": 0.346699393991042, + "learning_rate": 1.471758931239964e-07, + "loss": 0.9943, + "step": 20561 + }, + { + "epoch": 1.97, + "grad_norm": 0.3209558756921424, + "learning_rate": 1.4631918465921602e-07, + "loss": 1.0605, + "step": 20562 + }, + { + "epoch": 1.97, + "grad_norm": 0.28395677784851425, + "learning_rate": 1.4546497509556612e-07, + "loss": 0.9952, + "step": 20563 + }, + { + "epoch": 1.97, + "grad_norm": 0.3113599094516055, + "learning_rate": 1.4461326445441846e-07, + "loss": 1.0926, + "step": 20564 + }, + { + "epoch": 1.97, + "grad_norm": 0.31013715297190053, + "learning_rate": 1.4376405275707828e-07, + "loss": 0.9569, + "step": 20565 + }, + { + "epoch": 1.97, + "grad_norm": 0.3576026503601545, + "learning_rate": 1.4291734002478408e-07, + "loss": 1.0006, + "step": 20566 + }, + { + "epoch": 1.97, + "grad_norm": 0.2840085077650111, + "learning_rate": 1.4207312627876335e-07, + "loss": 1.0012, + "step": 20567 + }, + { + "epoch": 1.97, + "grad_norm": 0.3268999072811751, + "learning_rate": 1.4123141154009921e-07, + "loss": 1.0616, + "step": 20568 + }, + { + "epoch": 1.97, + "grad_norm": 0.3083658046634938, + "learning_rate": 1.4039219582989704e-07, + "loss": 0.9413, + "step": 20569 + }, + { + "epoch": 1.97, + "grad_norm": 0.46167615888940855, + "learning_rate": 1.3955547916912892e-07, + "loss": 1.0109, + "step": 20570 + }, + { + "epoch": 1.97, + "grad_norm": 0.34682998104034196, + "learning_rate": 1.3872126157874477e-07, + "loss": 0.941, + "step": 20571 + }, + { + "epoch": 1.97, + "grad_norm": 0.3107972280172396, + "learning_rate": 1.3788954307962788e-07, + "loss": 1.0635, + "step": 20572 + }, + { + "epoch": 1.97, + "grad_norm": 0.34928330573730887, + "learning_rate": 1.370603236925838e-07, + "loss": 1.0222, + "step": 20573 + }, + { + "epoch": 1.97, + "grad_norm": 0.29931955618752065, + "learning_rate": 1.3623360343836267e-07, + "loss": 1.1024, + "step": 20574 + }, + { + "epoch": 1.97, + "grad_norm": 0.3337104906024839, + "learning_rate": 1.35409382337659e-07, + "loss": 1.0202, + "step": 20575 + }, + { + "epoch": 1.97, + "grad_norm": 0.36179341691340156, + "learning_rate": 1.3458766041110072e-07, + "loss": 1.0173, + "step": 20576 + }, + { + "epoch": 1.97, + "grad_norm": 0.3314092476448048, + "learning_rate": 1.3376843767924917e-07, + "loss": 1.0197, + "step": 20577 + }, + { + "epoch": 1.97, + "grad_norm": 0.30955277965095906, + "learning_rate": 1.329517141625991e-07, + "loss": 0.9646, + "step": 20578 + }, + { + "epoch": 1.97, + "grad_norm": 0.33724427565437054, + "learning_rate": 1.321374898816008e-07, + "loss": 0.9703, + "step": 20579 + }, + { + "epoch": 1.97, + "grad_norm": 0.3316845389350797, + "learning_rate": 1.3132576485662685e-07, + "loss": 1.0778, + "step": 20580 + }, + { + "epoch": 1.97, + "grad_norm": 0.31860425342234605, + "learning_rate": 1.3051653910799432e-07, + "loss": 1.0286, + "step": 20581 + }, + { + "epoch": 1.97, + "grad_norm": 0.2676558437599854, + "learning_rate": 1.2970981265595372e-07, + "loss": 0.9546, + "step": 20582 + }, + { + "epoch": 1.97, + "grad_norm": 0.3352643091513331, + "learning_rate": 1.2890558552067778e-07, + "loss": 0.9861, + "step": 20583 + }, + { + "epoch": 1.97, + "grad_norm": 0.31334541347180694, + "learning_rate": 1.2810385772231704e-07, + "loss": 1.0185, + "step": 20584 + }, + { + "epoch": 1.97, + "grad_norm": 0.3375888874015558, + "learning_rate": 1.2730462928092212e-07, + "loss": 1.0955, + "step": 20585 + }, + { + "epoch": 1.97, + "grad_norm": 0.3180115774760353, + "learning_rate": 1.2650790021649928e-07, + "loss": 0.9648, + "step": 20586 + }, + { + "epoch": 1.97, + "grad_norm": 0.306854331393969, + "learning_rate": 1.2571367054897699e-07, + "loss": 1.0072, + "step": 20587 + }, + { + "epoch": 1.97, + "grad_norm": 0.3406163183716173, + "learning_rate": 1.2492194029823933e-07, + "loss": 1.0152, + "step": 20588 + }, + { + "epoch": 1.97, + "grad_norm": 0.3082124604915374, + "learning_rate": 1.2413270948410383e-07, + "loss": 1.1016, + "step": 20589 + }, + { + "epoch": 1.97, + "grad_norm": 0.2847690945791855, + "learning_rate": 1.233459781263213e-07, + "loss": 1.0184, + "step": 20590 + }, + { + "epoch": 1.97, + "grad_norm": 0.33349488724222653, + "learning_rate": 1.2256174624456495e-07, + "loss": 0.9696, + "step": 20591 + }, + { + "epoch": 1.97, + "grad_norm": 0.3070904679995063, + "learning_rate": 1.217800138584746e-07, + "loss": 1.0798, + "step": 20592 + }, + { + "epoch": 1.97, + "grad_norm": 0.2943949296596036, + "learning_rate": 1.2100078098761237e-07, + "loss": 0.9663, + "step": 20593 + }, + { + "epoch": 1.97, + "grad_norm": 0.34699409849787444, + "learning_rate": 1.2022404765148487e-07, + "loss": 1.0476, + "step": 20594 + }, + { + "epoch": 1.97, + "grad_norm": 0.33062038313464837, + "learning_rate": 1.1944981386950993e-07, + "loss": 1.0515, + "step": 20595 + }, + { + "epoch": 1.97, + "grad_norm": 0.30175709646287807, + "learning_rate": 1.1867807966108312e-07, + "loss": 0.9734, + "step": 20596 + }, + { + "epoch": 1.97, + "grad_norm": 0.30060402217973703, + "learning_rate": 1.1790884504551125e-07, + "loss": 0.9285, + "step": 20597 + }, + { + "epoch": 1.97, + "grad_norm": 0.3161183187133331, + "learning_rate": 1.1714211004203445e-07, + "loss": 1.02, + "step": 20598 + }, + { + "epoch": 1.97, + "grad_norm": 0.31473720883447304, + "learning_rate": 1.1637787466985961e-07, + "loss": 1.0409, + "step": 20599 + }, + { + "epoch": 1.97, + "grad_norm": 0.3248297091565262, + "learning_rate": 1.1561613894810475e-07, + "loss": 0.9749, + "step": 20600 + }, + { + "epoch": 1.97, + "grad_norm": 0.31653371844265343, + "learning_rate": 1.148569028958213e-07, + "loss": 1.0515, + "step": 20601 + }, + { + "epoch": 1.97, + "grad_norm": 0.3160141234571062, + "learning_rate": 1.1410016653202738e-07, + "loss": 0.9559, + "step": 20602 + }, + { + "epoch": 1.97, + "grad_norm": 0.29765517775212785, + "learning_rate": 1.1334592987565229e-07, + "loss": 1.0532, + "step": 20603 + }, + { + "epoch": 1.97, + "grad_norm": 0.32531539937699444, + "learning_rate": 1.1259419294556983e-07, + "loss": 1.0858, + "step": 20604 + }, + { + "epoch": 1.97, + "grad_norm": 0.3423689510750839, + "learning_rate": 1.1184495576059828e-07, + "loss": 1.0798, + "step": 20605 + }, + { + "epoch": 1.97, + "grad_norm": 0.34862937888043827, + "learning_rate": 1.110982183394782e-07, + "loss": 0.9788, + "step": 20606 + }, + { + "epoch": 1.97, + "grad_norm": 0.33176279770544637, + "learning_rate": 1.1035398070090575e-07, + "loss": 1.0735, + "step": 20607 + }, + { + "epoch": 1.97, + "grad_norm": 0.31248143216153745, + "learning_rate": 1.0961224286349935e-07, + "loss": 0.9187, + "step": 20608 + }, + { + "epoch": 1.97, + "grad_norm": 0.3370518000113145, + "learning_rate": 1.0887300484583307e-07, + "loss": 0.9689, + "step": 20609 + }, + { + "epoch": 1.97, + "grad_norm": 0.2901159120700695, + "learning_rate": 1.0813626666639209e-07, + "loss": 0.9954, + "step": 20610 + }, + { + "epoch": 1.97, + "grad_norm": 0.31559341145528824, + "learning_rate": 1.0740202834361724e-07, + "loss": 0.9961, + "step": 20611 + }, + { + "epoch": 1.97, + "grad_norm": 0.2889710859062488, + "learning_rate": 1.0667028989589378e-07, + "loss": 1.1021, + "step": 20612 + }, + { + "epoch": 1.97, + "grad_norm": 0.3124421920480115, + "learning_rate": 1.0594105134152932e-07, + "loss": 1.0562, + "step": 20613 + }, + { + "epoch": 1.97, + "grad_norm": 0.28722453219663024, + "learning_rate": 1.052143126987648e-07, + "loss": 1.0877, + "step": 20614 + }, + { + "epoch": 1.97, + "grad_norm": 0.3096499182362944, + "learning_rate": 1.044900739857968e-07, + "loss": 1.0123, + "step": 20615 + }, + { + "epoch": 1.97, + "grad_norm": 0.3626678329918216, + "learning_rate": 1.0376833522074414e-07, + "loss": 1.0811, + "step": 20616 + }, + { + "epoch": 1.97, + "grad_norm": 0.313429921443796, + "learning_rate": 1.0304909642168126e-07, + "loss": 1.0804, + "step": 20617 + }, + { + "epoch": 1.97, + "grad_norm": 0.3012789460890628, + "learning_rate": 1.0233235760658266e-07, + "loss": 1.0293, + "step": 20618 + }, + { + "epoch": 1.97, + "grad_norm": 0.2895646612005743, + "learning_rate": 1.0161811879341176e-07, + "loss": 1.0334, + "step": 20619 + }, + { + "epoch": 1.97, + "grad_norm": 0.3228990628561729, + "learning_rate": 1.0090638000003205e-07, + "loss": 1.0944, + "step": 20620 + }, + { + "epoch": 1.97, + "grad_norm": 0.2826036586741376, + "learning_rate": 1.001971412442404e-07, + "loss": 1.0106, + "step": 20621 + }, + { + "epoch": 1.97, + "grad_norm": 0.3129395581112897, + "learning_rate": 9.949040254381147e-08, + "loss": 0.9829, + "step": 20622 + }, + { + "epoch": 1.97, + "grad_norm": 0.3407876098325084, + "learning_rate": 9.878616391642004e-08, + "loss": 1.0641, + "step": 20623 + }, + { + "epoch": 1.97, + "grad_norm": 0.34607715041913445, + "learning_rate": 9.808442537968532e-08, + "loss": 1.1006, + "step": 20624 + }, + { + "epoch": 1.97, + "grad_norm": 0.27503298726217446, + "learning_rate": 9.738518695118215e-08, + "loss": 0.9958, + "step": 20625 + }, + { + "epoch": 1.97, + "grad_norm": 0.30085509035017916, + "learning_rate": 9.668844864839654e-08, + "loss": 1.0219, + "step": 20626 + }, + { + "epoch": 1.97, + "grad_norm": 0.3117881570857931, + "learning_rate": 9.5994210488759e-08, + "loss": 0.9802, + "step": 20627 + }, + { + "epoch": 1.97, + "grad_norm": 0.3179455340566814, + "learning_rate": 9.530247248966672e-08, + "loss": 0.9777, + "step": 20628 + }, + { + "epoch": 1.97, + "grad_norm": 0.31183927519759963, + "learning_rate": 9.461323466841698e-08, + "loss": 1.054, + "step": 20629 + }, + { + "epoch": 1.97, + "grad_norm": 0.3078203763136594, + "learning_rate": 9.392649704225153e-08, + "loss": 1.0062, + "step": 20630 + }, + { + "epoch": 1.97, + "grad_norm": 0.33589893997896103, + "learning_rate": 9.324225962836775e-08, + "loss": 0.9934, + "step": 20631 + }, + { + "epoch": 1.97, + "grad_norm": 0.3380291040354226, + "learning_rate": 9.256052244389634e-08, + "loss": 1.043, + "step": 20632 + }, + { + "epoch": 1.97, + "grad_norm": 0.28681328437680226, + "learning_rate": 9.188128550586817e-08, + "loss": 1.0198, + "step": 20633 + }, + { + "epoch": 1.97, + "grad_norm": 0.2833407710299887, + "learning_rate": 9.120454883131402e-08, + "loss": 0.9371, + "step": 20634 + }, + { + "epoch": 1.97, + "grad_norm": 0.31027756941274387, + "learning_rate": 9.05303124371537e-08, + "loss": 1.0045, + "step": 20635 + }, + { + "epoch": 1.97, + "grad_norm": 0.3108956005152537, + "learning_rate": 8.985857634026262e-08, + "loss": 0.9939, + "step": 20636 + }, + { + "epoch": 1.97, + "grad_norm": 0.31444669178428963, + "learning_rate": 8.918934055744954e-08, + "loss": 1.0755, + "step": 20637 + }, + { + "epoch": 1.97, + "grad_norm": 0.312817133373912, + "learning_rate": 8.852260510546773e-08, + "loss": 0.9884, + "step": 20638 + }, + { + "epoch": 1.97, + "grad_norm": 0.3446907376043467, + "learning_rate": 8.785837000100384e-08, + "loss": 1.006, + "step": 20639 + }, + { + "epoch": 1.97, + "grad_norm": 0.3242630302673269, + "learning_rate": 8.719663526066679e-08, + "loss": 0.8797, + "step": 20640 + }, + { + "epoch": 1.97, + "grad_norm": 0.3029534243440211, + "learning_rate": 8.653740090103224e-08, + "loss": 0.9901, + "step": 20641 + }, + { + "epoch": 1.97, + "grad_norm": 0.3247604753516124, + "learning_rate": 8.588066693858698e-08, + "loss": 1.087, + "step": 20642 + }, + { + "epoch": 1.97, + "grad_norm": 0.3429524197002356, + "learning_rate": 8.522643338978453e-08, + "loss": 0.9957, + "step": 20643 + }, + { + "epoch": 1.98, + "grad_norm": 0.3321748219205923, + "learning_rate": 8.457470027096736e-08, + "loss": 0.9288, + "step": 20644 + }, + { + "epoch": 1.98, + "grad_norm": 0.33647692514786404, + "learning_rate": 8.392546759847796e-08, + "loss": 1.1404, + "step": 20645 + }, + { + "epoch": 1.98, + "grad_norm": 0.34480373537804576, + "learning_rate": 8.32787353885367e-08, + "loss": 0.9879, + "step": 20646 + }, + { + "epoch": 1.98, + "grad_norm": 0.32633251745788716, + "learning_rate": 8.263450365735282e-08, + "loss": 0.9759, + "step": 20647 + }, + { + "epoch": 1.98, + "grad_norm": 0.29753622939097474, + "learning_rate": 8.199277242102455e-08, + "loss": 0.9882, + "step": 20648 + }, + { + "epoch": 1.98, + "grad_norm": 0.2923211768538003, + "learning_rate": 8.135354169562792e-08, + "loss": 1.0393, + "step": 20649 + }, + { + "epoch": 1.98, + "grad_norm": 0.3388885740619906, + "learning_rate": 8.071681149716126e-08, + "loss": 0.9754, + "step": 20650 + }, + { + "epoch": 1.98, + "grad_norm": 0.3413945242659253, + "learning_rate": 8.008258184154516e-08, + "loss": 1.1353, + "step": 20651 + }, + { + "epoch": 1.98, + "grad_norm": 0.31111683073359275, + "learning_rate": 7.94508527446669e-08, + "loss": 1.1258, + "step": 20652 + }, + { + "epoch": 1.98, + "grad_norm": 0.3169165528907371, + "learning_rate": 7.882162422232497e-08, + "loss": 1.051, + "step": 20653 + }, + { + "epoch": 1.98, + "grad_norm": 0.2991928944879046, + "learning_rate": 7.819489629026234e-08, + "loss": 1.1318, + "step": 20654 + }, + { + "epoch": 1.98, + "grad_norm": 0.31779540003689155, + "learning_rate": 7.757066896417752e-08, + "loss": 0.9892, + "step": 20655 + }, + { + "epoch": 1.98, + "grad_norm": 0.3035541765862874, + "learning_rate": 7.69489422596914e-08, + "loss": 0.9809, + "step": 20656 + }, + { + "epoch": 1.98, + "grad_norm": 0.3162714024805696, + "learning_rate": 7.632971619234707e-08, + "loss": 1.0554, + "step": 20657 + }, + { + "epoch": 1.98, + "grad_norm": 0.29228844989163466, + "learning_rate": 7.571299077765437e-08, + "loss": 0.9712, + "step": 20658 + }, + { + "epoch": 1.98, + "grad_norm": 0.3291779297022572, + "learning_rate": 7.509876603104537e-08, + "loss": 1.1572, + "step": 20659 + }, + { + "epoch": 1.98, + "grad_norm": 0.30321818653578847, + "learning_rate": 7.448704196789669e-08, + "loss": 1.0955, + "step": 20660 + }, + { + "epoch": 1.98, + "grad_norm": 0.25859606775570565, + "learning_rate": 7.387781860350717e-08, + "loss": 1.0342, + "step": 20661 + }, + { + "epoch": 1.98, + "grad_norm": 0.3370782250679355, + "learning_rate": 7.32710959531202e-08, + "loss": 1.0956, + "step": 20662 + }, + { + "epoch": 1.98, + "grad_norm": 0.30181928123144625, + "learning_rate": 7.266687403193473e-08, + "loss": 1.0964, + "step": 20663 + }, + { + "epoch": 1.98, + "grad_norm": 0.3134583854283347, + "learning_rate": 7.206515285504978e-08, + "loss": 0.975, + "step": 20664 + }, + { + "epoch": 1.98, + "grad_norm": 0.36804659525406414, + "learning_rate": 7.146593243754218e-08, + "loss": 1.0022, + "step": 20665 + }, + { + "epoch": 1.98, + "grad_norm": 0.31059069374230036, + "learning_rate": 7.086921279441105e-08, + "loss": 0.9471, + "step": 20666 + }, + { + "epoch": 1.98, + "grad_norm": 0.30268963079376304, + "learning_rate": 7.02749939405778e-08, + "loss": 1.0748, + "step": 20667 + }, + { + "epoch": 1.98, + "grad_norm": 0.3504742566010096, + "learning_rate": 6.968327589090828e-08, + "loss": 0.9816, + "step": 20668 + }, + { + "epoch": 1.98, + "grad_norm": 0.30001115708219717, + "learning_rate": 6.909405866022401e-08, + "loss": 1.0412, + "step": 20669 + }, + { + "epoch": 1.98, + "grad_norm": 0.34588387142136184, + "learning_rate": 6.850734226326871e-08, + "loss": 1.0378, + "step": 20670 + }, + { + "epoch": 1.98, + "grad_norm": 0.3022110413513049, + "learning_rate": 6.792312671470847e-08, + "loss": 0.9471, + "step": 20671 + }, + { + "epoch": 1.98, + "grad_norm": 0.30527970893208145, + "learning_rate": 6.73414120291871e-08, + "loss": 1.0837, + "step": 20672 + }, + { + "epoch": 1.98, + "grad_norm": 0.3228384440586447, + "learning_rate": 6.676219822125962e-08, + "loss": 1.0167, + "step": 20673 + }, + { + "epoch": 1.98, + "grad_norm": 0.3287127474300913, + "learning_rate": 6.618548530540336e-08, + "loss": 0.9798, + "step": 20674 + }, + { + "epoch": 1.98, + "grad_norm": 0.29648347330149694, + "learning_rate": 6.561127329606232e-08, + "loss": 1.0145, + "step": 20675 + }, + { + "epoch": 1.98, + "grad_norm": 0.3175994161285185, + "learning_rate": 6.503956220760276e-08, + "loss": 0.9848, + "step": 20676 + }, + { + "epoch": 1.98, + "grad_norm": 0.35710901644919746, + "learning_rate": 6.44703520543466e-08, + "loss": 1.1021, + "step": 20677 + }, + { + "epoch": 1.98, + "grad_norm": 0.35216070348557826, + "learning_rate": 6.390364285051576e-08, + "loss": 1.0261, + "step": 20678 + }, + { + "epoch": 1.98, + "grad_norm": 0.32202418957481516, + "learning_rate": 6.333943461031e-08, + "loss": 1.0082, + "step": 20679 + }, + { + "epoch": 1.98, + "grad_norm": 0.30476104940682713, + "learning_rate": 6.277772734784027e-08, + "loss": 0.9862, + "step": 20680 + }, + { + "epoch": 1.98, + "grad_norm": 0.34716357883440496, + "learning_rate": 6.221852107716197e-08, + "loss": 1.1281, + "step": 20681 + }, + { + "epoch": 1.98, + "grad_norm": 0.300107397414897, + "learning_rate": 6.166181581227503e-08, + "loss": 1.0676, + "step": 20682 + }, + { + "epoch": 1.98, + "grad_norm": 0.29516752292661563, + "learning_rate": 6.110761156711275e-08, + "loss": 1.0099, + "step": 20683 + }, + { + "epoch": 1.98, + "grad_norm": 0.34197980280687507, + "learning_rate": 6.055590835554182e-08, + "loss": 0.9922, + "step": 20684 + }, + { + "epoch": 1.98, + "grad_norm": 0.2923269208555453, + "learning_rate": 6.000670619137339e-08, + "loss": 1.0805, + "step": 20685 + }, + { + "epoch": 1.98, + "grad_norm": 0.3477945564342247, + "learning_rate": 5.946000508834093e-08, + "loss": 1.0684, + "step": 20686 + }, + { + "epoch": 1.98, + "grad_norm": 0.3359080274906652, + "learning_rate": 5.89158050601335e-08, + "loss": 1.0017, + "step": 20687 + }, + { + "epoch": 1.98, + "grad_norm": 0.30453302203535254, + "learning_rate": 5.837410612036243e-08, + "loss": 1.1118, + "step": 20688 + }, + { + "epoch": 1.98, + "grad_norm": 0.3010682651781762, + "learning_rate": 5.783490828260574e-08, + "loss": 1.0681, + "step": 20689 + }, + { + "epoch": 1.98, + "grad_norm": 0.33582186671689035, + "learning_rate": 5.729821156033044e-08, + "loss": 1.0016, + "step": 20690 + }, + { + "epoch": 1.98, + "grad_norm": 0.2993290064264186, + "learning_rate": 5.6764015966981335e-08, + "loss": 1.0467, + "step": 20691 + }, + { + "epoch": 1.98, + "grad_norm": 0.32696911887553354, + "learning_rate": 5.62323215159255e-08, + "loss": 1.0414, + "step": 20692 + }, + { + "epoch": 1.98, + "grad_norm": 0.2862678683190878, + "learning_rate": 5.570312822046342e-08, + "loss": 0.9086, + "step": 20693 + }, + { + "epoch": 1.98, + "grad_norm": 0.2825968332049816, + "learning_rate": 5.517643609385115e-08, + "loss": 0.8892, + "step": 20694 + }, + { + "epoch": 1.98, + "grad_norm": 0.32284460396576126, + "learning_rate": 5.4652245149255934e-08, + "loss": 0.993, + "step": 20695 + }, + { + "epoch": 1.98, + "grad_norm": 0.32598066413130117, + "learning_rate": 5.413055539980061e-08, + "loss": 0.9475, + "step": 20696 + }, + { + "epoch": 1.98, + "grad_norm": 0.2730536371205291, + "learning_rate": 5.36113668585414e-08, + "loss": 0.9438, + "step": 20697 + }, + { + "epoch": 1.98, + "grad_norm": 0.3222861520195792, + "learning_rate": 5.309467953847902e-08, + "loss": 1.1009, + "step": 20698 + }, + { + "epoch": 1.98, + "grad_norm": 0.3226028624738142, + "learning_rate": 5.258049345252536e-08, + "loss": 1.0276, + "step": 20699 + }, + { + "epoch": 1.98, + "grad_norm": 0.3247174548342395, + "learning_rate": 5.2068808613570106e-08, + "loss": 1.0599, + "step": 20700 + }, + { + "epoch": 1.98, + "grad_norm": 0.2778946211202319, + "learning_rate": 5.155962503440303e-08, + "loss": 1.0196, + "step": 20701 + }, + { + "epoch": 1.98, + "grad_norm": 0.27156367012300475, + "learning_rate": 5.1052942727769506e-08, + "loss": 1.03, + "step": 20702 + }, + { + "epoch": 1.98, + "grad_norm": 0.3677008415239914, + "learning_rate": 5.054876170635936e-08, + "loss": 1.05, + "step": 20703 + }, + { + "epoch": 1.98, + "grad_norm": 0.33050626046664416, + "learning_rate": 5.004708198277364e-08, + "loss": 1.055, + "step": 20704 + }, + { + "epoch": 1.98, + "grad_norm": 0.3519532243877039, + "learning_rate": 4.954790356958006e-08, + "loss": 1.0493, + "step": 20705 + }, + { + "epoch": 1.98, + "grad_norm": 0.3573324931502153, + "learning_rate": 4.905122647926863e-08, + "loss": 1.0888, + "step": 20706 + }, + { + "epoch": 1.98, + "grad_norm": 0.33207934390075056, + "learning_rate": 4.855705072426275e-08, + "loss": 1.0813, + "step": 20707 + }, + { + "epoch": 1.98, + "grad_norm": 0.29631149041412014, + "learning_rate": 4.806537631694141e-08, + "loss": 1.0548, + "step": 20708 + }, + { + "epoch": 1.98, + "grad_norm": 0.2748126623494873, + "learning_rate": 4.7576203269594775e-08, + "loss": 0.9941, + "step": 20709 + }, + { + "epoch": 1.98, + "grad_norm": 0.30785496461931044, + "learning_rate": 4.7089531594468607e-08, + "loss": 1.161, + "step": 20710 + }, + { + "epoch": 1.98, + "grad_norm": 0.31116090647723804, + "learning_rate": 4.660536130375315e-08, + "loss": 1.0251, + "step": 20711 + }, + { + "epoch": 1.98, + "grad_norm": 0.3275061713831137, + "learning_rate": 4.612369240954984e-08, + "loss": 0.9884, + "step": 20712 + }, + { + "epoch": 1.98, + "grad_norm": 0.2863046312715887, + "learning_rate": 4.5644524923926793e-08, + "loss": 0.8492, + "step": 20713 + }, + { + "epoch": 1.98, + "grad_norm": 0.2957682209435248, + "learning_rate": 4.516785885886332e-08, + "loss": 1.06, + "step": 20714 + }, + { + "epoch": 1.98, + "grad_norm": 0.33214499413440646, + "learning_rate": 4.4693694226294326e-08, + "loss": 1.1028, + "step": 20715 + }, + { + "epoch": 1.98, + "grad_norm": 0.33433414167683007, + "learning_rate": 4.4222031038088084e-08, + "loss": 1.0632, + "step": 20716 + }, + { + "epoch": 1.98, + "grad_norm": 0.3163898268271869, + "learning_rate": 4.375286930603517e-08, + "loss": 1.1107, + "step": 20717 + }, + { + "epoch": 1.98, + "grad_norm": 0.3216460541883978, + "learning_rate": 4.3286209041892845e-08, + "loss": 0.9906, + "step": 20718 + }, + { + "epoch": 1.98, + "grad_norm": 0.3321606525713277, + "learning_rate": 4.2822050257340654e-08, + "loss": 1.0589, + "step": 20719 + }, + { + "epoch": 1.98, + "grad_norm": 0.2970241007404699, + "learning_rate": 4.236039296398042e-08, + "loss": 0.9278, + "step": 20720 + }, + { + "epoch": 1.98, + "grad_norm": 0.3214768397179136, + "learning_rate": 4.190123717336958e-08, + "loss": 1.0281, + "step": 20721 + }, + { + "epoch": 1.98, + "grad_norm": 0.2706956566542636, + "learning_rate": 4.144458289701003e-08, + "loss": 0.9068, + "step": 20722 + }, + { + "epoch": 1.98, + "grad_norm": 0.3345711284452542, + "learning_rate": 4.099043014631487e-08, + "loss": 0.9737, + "step": 20723 + }, + { + "epoch": 1.98, + "grad_norm": 0.3301766064484646, + "learning_rate": 4.053877893266389e-08, + "loss": 0.9334, + "step": 20724 + }, + { + "epoch": 1.98, + "grad_norm": 0.35770662224112015, + "learning_rate": 4.008962926734805e-08, + "loss": 0.9384, + "step": 20725 + }, + { + "epoch": 1.98, + "grad_norm": 0.35319205769299067, + "learning_rate": 3.964298116160281e-08, + "loss": 0.9903, + "step": 20726 + }, + { + "epoch": 1.98, + "grad_norm": 0.3414170263220684, + "learning_rate": 3.919883462661922e-08, + "loss": 0.9494, + "step": 20727 + }, + { + "epoch": 1.98, + "grad_norm": 0.3441568836606846, + "learning_rate": 3.875718967351061e-08, + "loss": 1.0283, + "step": 20728 + }, + { + "epoch": 1.98, + "grad_norm": 0.34486510104865664, + "learning_rate": 3.83180463133348e-08, + "loss": 0.9574, + "step": 20729 + }, + { + "epoch": 1.98, + "grad_norm": 0.3130429683045617, + "learning_rate": 3.7881404557060796e-08, + "loss": 1.0269, + "step": 20730 + }, + { + "epoch": 1.98, + "grad_norm": 0.31396237202110866, + "learning_rate": 3.74472644156354e-08, + "loss": 0.9574, + "step": 20731 + }, + { + "epoch": 1.98, + "grad_norm": 0.345858201477811, + "learning_rate": 3.7015625899916584e-08, + "loss": 1.0467, + "step": 20732 + }, + { + "epoch": 1.98, + "grad_norm": 0.28351303900266084, + "learning_rate": 3.658648902069572e-08, + "loss": 1.0941, + "step": 20733 + }, + { + "epoch": 1.98, + "grad_norm": 0.2870707018439127, + "learning_rate": 3.615985378873088e-08, + "loss": 0.8287, + "step": 20734 + }, + { + "epoch": 1.98, + "grad_norm": 0.3422908843135994, + "learning_rate": 3.5735720214691293e-08, + "loss": 0.998, + "step": 20735 + }, + { + "epoch": 1.98, + "grad_norm": 0.3001213215901377, + "learning_rate": 3.53140883091907e-08, + "loss": 1.0357, + "step": 20736 + }, + { + "epoch": 1.98, + "grad_norm": 0.31980663634436635, + "learning_rate": 3.489495808277621e-08, + "loss": 0.9886, + "step": 20737 + }, + { + "epoch": 1.98, + "grad_norm": 0.3467053058193493, + "learning_rate": 3.447832954595054e-08, + "loss": 0.991, + "step": 20738 + }, + { + "epoch": 1.98, + "grad_norm": 0.292826111191967, + "learning_rate": 3.406420270911648e-08, + "loss": 0.9574, + "step": 20739 + }, + { + "epoch": 1.98, + "grad_norm": 0.2889943969496846, + "learning_rate": 3.365257758266571e-08, + "loss": 0.9447, + "step": 20740 + }, + { + "epoch": 1.98, + "grad_norm": 0.285552278574632, + "learning_rate": 3.324345417687891e-08, + "loss": 1.0541, + "step": 20741 + }, + { + "epoch": 1.98, + "grad_norm": 0.2903556346434334, + "learning_rate": 3.2836832502003424e-08, + "loss": 0.9496, + "step": 20742 + }, + { + "epoch": 1.98, + "grad_norm": 0.31415080516177657, + "learning_rate": 3.24327125682089e-08, + "loss": 0.8968, + "step": 20743 + }, + { + "epoch": 1.98, + "grad_norm": 0.3343408341716518, + "learning_rate": 3.203109438562058e-08, + "loss": 1.0201, + "step": 20744 + }, + { + "epoch": 1.98, + "grad_norm": 0.31040856424164015, + "learning_rate": 3.1631977964274865e-08, + "loss": 1.0962, + "step": 20745 + }, + { + "epoch": 1.98, + "grad_norm": 0.30371771077963206, + "learning_rate": 3.123536331416377e-08, + "loss": 0.926, + "step": 20746 + }, + { + "epoch": 1.98, + "grad_norm": 0.29587958716894236, + "learning_rate": 3.084125044522379e-08, + "loss": 0.9924, + "step": 20747 + }, + { + "epoch": 1.98, + "grad_norm": 0.2968689328707841, + "learning_rate": 3.044963936730261e-08, + "loss": 1.0657, + "step": 20748 + }, + { + "epoch": 1.99, + "grad_norm": 0.3016608795715057, + "learning_rate": 3.006053009021459e-08, + "loss": 0.9985, + "step": 20749 + }, + { + "epoch": 1.99, + "grad_norm": 0.30646416995344355, + "learning_rate": 2.9673922623696392e-08, + "loss": 0.996, + "step": 20750 + }, + { + "epoch": 1.99, + "grad_norm": 0.2958041294582072, + "learning_rate": 2.928981697740696e-08, + "loss": 0.9607, + "step": 20751 + }, + { + "epoch": 1.99, + "grad_norm": 0.2938554127983297, + "learning_rate": 2.890821316097192e-08, + "loss": 0.9735, + "step": 20752 + }, + { + "epoch": 1.99, + "grad_norm": 0.34867710093949517, + "learning_rate": 2.8529111183928092e-08, + "loss": 0.9812, + "step": 20753 + }, + { + "epoch": 1.99, + "grad_norm": 0.3336504289086753, + "learning_rate": 2.8152511055790087e-08, + "loss": 0.9765, + "step": 20754 + }, + { + "epoch": 1.99, + "grad_norm": 0.35923834300777396, + "learning_rate": 2.7778412785950392e-08, + "loss": 0.962, + "step": 20755 + }, + { + "epoch": 1.99, + "grad_norm": 0.29754020679737014, + "learning_rate": 2.7406816383801494e-08, + "loss": 0.984, + "step": 20756 + }, + { + "epoch": 1.99, + "grad_norm": 0.3156752507793664, + "learning_rate": 2.703772185861375e-08, + "loss": 1.0514, + "step": 20757 + }, + { + "epoch": 1.99, + "grad_norm": 0.32401709339152496, + "learning_rate": 2.667112921964643e-08, + "loss": 0.9944, + "step": 20758 + }, + { + "epoch": 1.99, + "grad_norm": 0.3401136936833863, + "learning_rate": 2.6307038476058864e-08, + "loss": 1.0974, + "step": 20759 + }, + { + "epoch": 1.99, + "grad_norm": 0.30521101179830035, + "learning_rate": 2.5945449636977094e-08, + "loss": 0.9813, + "step": 20760 + }, + { + "epoch": 1.99, + "grad_norm": 0.2760100484268643, + "learning_rate": 2.5586362711427225e-08, + "loss": 1.0186, + "step": 20761 + }, + { + "epoch": 1.99, + "grad_norm": 0.3246208932518041, + "learning_rate": 2.5229777708424274e-08, + "loss": 1.0876, + "step": 20762 + }, + { + "epoch": 1.99, + "grad_norm": 0.3102332944200023, + "learning_rate": 2.4875694636872227e-08, + "loss": 1.0697, + "step": 20763 + }, + { + "epoch": 1.99, + "grad_norm": 0.3046436141491857, + "learning_rate": 2.4524113505641765e-08, + "loss": 1.0295, + "step": 20764 + }, + { + "epoch": 1.99, + "grad_norm": 0.31376737852727743, + "learning_rate": 2.417503432352586e-08, + "loss": 0.995, + "step": 20765 + }, + { + "epoch": 1.99, + "grad_norm": 0.33239787168311025, + "learning_rate": 2.3828457099250857e-08, + "loss": 1.0778, + "step": 20766 + }, + { + "epoch": 1.99, + "grad_norm": 0.3318009274833, + "learning_rate": 2.348438184152091e-08, + "loss": 1.0893, + "step": 20767 + }, + { + "epoch": 1.99, + "grad_norm": 0.3184571466519443, + "learning_rate": 2.3142808558918038e-08, + "loss": 1.039, + "step": 20768 + }, + { + "epoch": 1.99, + "grad_norm": 0.2905516995827711, + "learning_rate": 2.2803737259990965e-08, + "loss": 1.0177, + "step": 20769 + }, + { + "epoch": 1.99, + "grad_norm": 0.2964402204477705, + "learning_rate": 2.2467167953243996e-08, + "loss": 1.0874, + "step": 20770 + }, + { + "epoch": 1.99, + "grad_norm": 0.2838974274985829, + "learning_rate": 2.213310064709262e-08, + "loss": 1.0345, + "step": 20771 + }, + { + "epoch": 1.99, + "grad_norm": 0.3465566387793811, + "learning_rate": 2.1801535349885716e-08, + "loss": 1.0268, + "step": 20772 + }, + { + "epoch": 1.99, + "grad_norm": 0.3221747316316864, + "learning_rate": 2.147247206992775e-08, + "loss": 1.0578, + "step": 20773 + }, + { + "epoch": 1.99, + "grad_norm": 0.35946012724954596, + "learning_rate": 2.1145910815467683e-08, + "loss": 1.1184, + "step": 20774 + }, + { + "epoch": 1.99, + "grad_norm": 0.35924640791494816, + "learning_rate": 2.0821851594654552e-08, + "loss": 1.026, + "step": 20775 + }, + { + "epoch": 1.99, + "grad_norm": 0.3263992629113238, + "learning_rate": 2.0500294415615184e-08, + "loss": 0.9364, + "step": 20776 + }, + { + "epoch": 1.99, + "grad_norm": 0.2571854377731657, + "learning_rate": 2.0181239286387597e-08, + "loss": 1.1018, + "step": 20777 + }, + { + "epoch": 1.99, + "grad_norm": 0.29449887463781105, + "learning_rate": 1.98646862149543e-08, + "loss": 1.0076, + "step": 20778 + }, + { + "epoch": 1.99, + "grad_norm": 0.3186336131075811, + "learning_rate": 1.9550635209253375e-08, + "loss": 0.9721, + "step": 20779 + }, + { + "epoch": 1.99, + "grad_norm": 0.3119016734937588, + "learning_rate": 1.9239086277123007e-08, + "loss": 1.0619, + "step": 20780 + }, + { + "epoch": 1.99, + "grad_norm": 0.33452405251960127, + "learning_rate": 1.8930039426379164e-08, + "loss": 1.0852, + "step": 20781 + }, + { + "epoch": 1.99, + "grad_norm": 0.30413396120641134, + "learning_rate": 1.8623494664737894e-08, + "loss": 1.0808, + "step": 20782 + }, + { + "epoch": 1.99, + "grad_norm": 0.29545872377325627, + "learning_rate": 1.8319451999893044e-08, + "loss": 1.0712, + "step": 20783 + }, + { + "epoch": 1.99, + "grad_norm": 0.3303936457130071, + "learning_rate": 1.8017911439427436e-08, + "loss": 0.998, + "step": 20784 + }, + { + "epoch": 1.99, + "grad_norm": 0.31827542412472043, + "learning_rate": 1.7718872990901692e-08, + "loss": 0.9885, + "step": 20785 + }, + { + "epoch": 1.99, + "grad_norm": 0.28482048563297163, + "learning_rate": 1.742233666179871e-08, + "loss": 0.9298, + "step": 20786 + }, + { + "epoch": 1.99, + "grad_norm": 0.368399045805558, + "learning_rate": 1.7128302459545887e-08, + "loss": 0.9869, + "step": 20787 + }, + { + "epoch": 1.99, + "grad_norm": 0.36489687517162306, + "learning_rate": 1.6836770391481793e-08, + "loss": 0.997, + "step": 20788 + }, + { + "epoch": 1.99, + "grad_norm": 0.2973109800339607, + "learning_rate": 1.65477404649228e-08, + "loss": 1.0378, + "step": 20789 + }, + { + "epoch": 1.99, + "grad_norm": 0.3741436067683066, + "learning_rate": 1.626121268709646e-08, + "loss": 1.0081, + "step": 20790 + }, + { + "epoch": 1.99, + "grad_norm": 0.33014741996785046, + "learning_rate": 1.5977187065163712e-08, + "loss": 0.9363, + "step": 20791 + }, + { + "epoch": 1.99, + "grad_norm": 0.3096323838989413, + "learning_rate": 1.5695663606241083e-08, + "loss": 1.1023, + "step": 20792 + }, + { + "epoch": 1.99, + "grad_norm": 0.3143245461789224, + "learning_rate": 1.541664231736739e-08, + "loss": 1.0634, + "step": 20793 + }, + { + "epoch": 1.99, + "grad_norm": 0.33568206506095744, + "learning_rate": 1.514012320553704e-08, + "loss": 1.077, + "step": 20794 + }, + { + "epoch": 1.99, + "grad_norm": 0.3141345131515858, + "learning_rate": 1.4866106277666714e-08, + "loss": 1.0422, + "step": 20795 + }, + { + "epoch": 1.99, + "grad_norm": 0.32676483922448146, + "learning_rate": 1.4594591540606495e-08, + "loss": 1.1175, + "step": 20796 + }, + { + "epoch": 1.99, + "grad_norm": 0.30940245234090674, + "learning_rate": 1.4325579001150946e-08, + "loss": 1.0005, + "step": 20797 + }, + { + "epoch": 1.99, + "grad_norm": 0.3319572150445928, + "learning_rate": 1.4059068666039121e-08, + "loss": 0.9204, + "step": 20798 + }, + { + "epoch": 1.99, + "grad_norm": 0.2842846354968263, + "learning_rate": 1.379506054194346e-08, + "loss": 1.0403, + "step": 20799 + }, + { + "epoch": 1.99, + "grad_norm": 0.3470554315670575, + "learning_rate": 1.353355463545869e-08, + "loss": 1.0864, + "step": 20800 + }, + { + "epoch": 1.99, + "grad_norm": 0.28965929566971715, + "learning_rate": 1.3274550953135123e-08, + "loss": 1.0375, + "step": 20801 + }, + { + "epoch": 1.99, + "grad_norm": 0.30052492196545905, + "learning_rate": 1.3018049501456464e-08, + "loss": 1.0603, + "step": 20802 + }, + { + "epoch": 1.99, + "grad_norm": 0.31349110536906416, + "learning_rate": 1.2764050286839802e-08, + "loss": 0.8906, + "step": 20803 + }, + { + "epoch": 1.99, + "grad_norm": 0.3128030125455311, + "learning_rate": 1.2512553315646713e-08, + "loss": 1.066, + "step": 20804 + }, + { + "epoch": 1.99, + "grad_norm": 0.3342980920366819, + "learning_rate": 1.2263558594161063e-08, + "loss": 1.0436, + "step": 20805 + }, + { + "epoch": 1.99, + "grad_norm": 0.3622440870467158, + "learning_rate": 1.2017066128622301e-08, + "loss": 0.9159, + "step": 20806 + }, + { + "epoch": 1.99, + "grad_norm": 0.323884382240448, + "learning_rate": 1.177307592520327e-08, + "loss": 1.1146, + "step": 20807 + }, + { + "epoch": 1.99, + "grad_norm": 0.3590516027246436, + "learning_rate": 1.1531587989999093e-08, + "loss": 1.0644, + "step": 20808 + }, + { + "epoch": 1.99, + "grad_norm": 0.3068777600065667, + "learning_rate": 1.1292602329049385e-08, + "loss": 0.9722, + "step": 20809 + }, + { + "epoch": 1.99, + "grad_norm": 0.33802441655523807, + "learning_rate": 1.1056118948349348e-08, + "loss": 1.1074, + "step": 20810 + }, + { + "epoch": 1.99, + "grad_norm": 0.37663448173564007, + "learning_rate": 1.0822137853816472e-08, + "loss": 1.0393, + "step": 20811 + }, + { + "epoch": 1.99, + "grad_norm": 0.3031695364429374, + "learning_rate": 1.059065905129053e-08, + "loss": 0.9896, + "step": 20812 + }, + { + "epoch": 1.99, + "grad_norm": 0.3308343250517348, + "learning_rate": 1.036168254657799e-08, + "loss": 1.0453, + "step": 20813 + }, + { + "epoch": 1.99, + "grad_norm": 0.32880004095786125, + "learning_rate": 1.0135208345407598e-08, + "loss": 0.9607, + "step": 20814 + }, + { + "epoch": 1.99, + "grad_norm": 0.30964291891114387, + "learning_rate": 9.911236453441497e-09, + "loss": 0.9739, + "step": 20815 + }, + { + "epoch": 1.99, + "grad_norm": 0.36950368591272154, + "learning_rate": 9.68976687628631e-09, + "loss": 1.0599, + "step": 20816 + }, + { + "epoch": 1.99, + "grad_norm": 0.330502130308281, + "learning_rate": 9.470799619493153e-09, + "loss": 0.9632, + "step": 20817 + }, + { + "epoch": 1.99, + "grad_norm": 0.3458798980459582, + "learning_rate": 9.254334688535426e-09, + "loss": 0.9823, + "step": 20818 + }, + { + "epoch": 1.99, + "grad_norm": 0.3342767368920896, + "learning_rate": 9.040372088819914e-09, + "loss": 1.0728, + "step": 20819 + }, + { + "epoch": 1.99, + "grad_norm": 0.34892299379614594, + "learning_rate": 8.828911825720098e-09, + "loss": 1.0427, + "step": 20820 + }, + { + "epoch": 1.99, + "grad_norm": 0.3073456008379574, + "learning_rate": 8.619953904509536e-09, + "loss": 0.9295, + "step": 20821 + }, + { + "epoch": 1.99, + "grad_norm": 0.3050477454234244, + "learning_rate": 8.413498330439584e-09, + "loss": 1.0, + "step": 20822 + }, + { + "epoch": 1.99, + "grad_norm": 0.29674642190221173, + "learning_rate": 8.209545108650573e-09, + "loss": 0.9908, + "step": 20823 + }, + { + "epoch": 1.99, + "grad_norm": 0.29083667821763315, + "learning_rate": 8.008094244271735e-09, + "loss": 0.823, + "step": 20824 + }, + { + "epoch": 1.99, + "grad_norm": 0.3057456852443986, + "learning_rate": 7.80914574233238e-09, + "loss": 1.178, + "step": 20825 + }, + { + "epoch": 1.99, + "grad_norm": 0.3332352081032006, + "learning_rate": 7.612699607806306e-09, + "loss": 1.0363, + "step": 20826 + }, + { + "epoch": 1.99, + "grad_norm": 0.27308415675491365, + "learning_rate": 7.418755845611802e-09, + "loss": 1.0757, + "step": 20827 + }, + { + "epoch": 1.99, + "grad_norm": 0.3023310873675471, + "learning_rate": 7.227314460611645e-09, + "loss": 0.9906, + "step": 20828 + }, + { + "epoch": 1.99, + "grad_norm": 0.32565087004344345, + "learning_rate": 7.0383754575908955e-09, + "loss": 1.0569, + "step": 20829 + }, + { + "epoch": 1.99, + "grad_norm": 0.2910356487326149, + "learning_rate": 6.851938841279104e-09, + "loss": 1.0315, + "step": 20830 + }, + { + "epoch": 1.99, + "grad_norm": 0.3330093316315968, + "learning_rate": 6.668004616339207e-09, + "loss": 0.9834, + "step": 20831 + }, + { + "epoch": 1.99, + "grad_norm": 0.3015142033580944, + "learning_rate": 6.486572787378631e-09, + "loss": 1.0366, + "step": 20832 + }, + { + "epoch": 1.99, + "grad_norm": 0.3178896312488613, + "learning_rate": 6.307643358927084e-09, + "loss": 1.0074, + "step": 20833 + }, + { + "epoch": 1.99, + "grad_norm": 0.3002832951611161, + "learning_rate": 6.131216335480972e-09, + "loss": 1.0769, + "step": 20834 + }, + { + "epoch": 1.99, + "grad_norm": 0.3237563755118089, + "learning_rate": 5.9572917214367755e-09, + "loss": 1.0021, + "step": 20835 + }, + { + "epoch": 1.99, + "grad_norm": 0.28540943369211924, + "learning_rate": 5.785869521157672e-09, + "loss": 0.8606, + "step": 20836 + }, + { + "epoch": 1.99, + "grad_norm": 0.28421729407288737, + "learning_rate": 5.616949738929123e-09, + "loss": 0.9872, + "step": 20837 + }, + { + "epoch": 1.99, + "grad_norm": 0.3300884131144644, + "learning_rate": 5.450532378992179e-09, + "loss": 1.0937, + "step": 20838 + }, + { + "epoch": 1.99, + "grad_norm": 0.2999111608662199, + "learning_rate": 5.286617445487974e-09, + "loss": 0.8052, + "step": 20839 + }, + { + "epoch": 1.99, + "grad_norm": 0.32672183804878907, + "learning_rate": 5.125204942535433e-09, + "loss": 0.9746, + "step": 20840 + }, + { + "epoch": 1.99, + "grad_norm": 0.35925206289594425, + "learning_rate": 4.966294874164667e-09, + "loss": 1.127, + "step": 20841 + }, + { + "epoch": 1.99, + "grad_norm": 0.2989297842636852, + "learning_rate": 4.809887244361377e-09, + "loss": 0.9799, + "step": 20842 + }, + { + "epoch": 1.99, + "grad_norm": 0.3216760643650161, + "learning_rate": 4.655982057033548e-09, + "loss": 1.1124, + "step": 20843 + }, + { + "epoch": 1.99, + "grad_norm": 0.29657997281368326, + "learning_rate": 4.5045793160336525e-09, + "loss": 0.9157, + "step": 20844 + }, + { + "epoch": 1.99, + "grad_norm": 0.3361189796598653, + "learning_rate": 4.355679025147552e-09, + "loss": 1.0263, + "step": 20845 + }, + { + "epoch": 1.99, + "grad_norm": 0.26990525281284444, + "learning_rate": 4.209281188116699e-09, + "loss": 1.0365, + "step": 20846 + }, + { + "epoch": 1.99, + "grad_norm": 0.2963467442295381, + "learning_rate": 4.065385808582623e-09, + "loss": 0.9481, + "step": 20847 + }, + { + "epoch": 1.99, + "grad_norm": 0.32004739795401177, + "learning_rate": 3.923992890164652e-09, + "loss": 0.9474, + "step": 20848 + }, + { + "epoch": 1.99, + "grad_norm": 0.29304365229530144, + "learning_rate": 3.785102436382193e-09, + "loss": 1.049, + "step": 20849 + }, + { + "epoch": 1.99, + "grad_norm": 0.3099502598517248, + "learning_rate": 3.648714450732449e-09, + "loss": 1.0361, + "step": 20850 + }, + { + "epoch": 1.99, + "grad_norm": 0.30584993952326633, + "learning_rate": 3.514828936623804e-09, + "loss": 1.172, + "step": 20851 + }, + { + "epoch": 1.99, + "grad_norm": 0.3425779514623115, + "learning_rate": 3.38344589739803e-09, + "loss": 1.1308, + "step": 20852 + }, + { + "epoch": 2.0, + "grad_norm": 0.31840293970313965, + "learning_rate": 3.2545653363413865e-09, + "loss": 1.0331, + "step": 20853 + }, + { + "epoch": 2.0, + "grad_norm": 0.30475958025890254, + "learning_rate": 3.128187256684623e-09, + "loss": 0.9086, + "step": 20854 + }, + { + "epoch": 2.0, + "grad_norm": 0.33601885077791416, + "learning_rate": 3.0043116616029767e-09, + "loss": 1.007, + "step": 20855 + }, + { + "epoch": 2.0, + "grad_norm": 0.2782770685673901, + "learning_rate": 2.882938554171766e-09, + "loss": 0.9757, + "step": 20856 + }, + { + "epoch": 2.0, + "grad_norm": 0.33031283880430073, + "learning_rate": 2.7640679374441036e-09, + "loss": 1.1637, + "step": 20857 + }, + { + "epoch": 2.0, + "grad_norm": 0.2948553839780387, + "learning_rate": 2.647699814395388e-09, + "loss": 1.0203, + "step": 20858 + }, + { + "epoch": 2.0, + "grad_norm": 0.31179834224657377, + "learning_rate": 2.533834187934403e-09, + "loss": 1.0109, + "step": 20859 + }, + { + "epoch": 2.0, + "grad_norm": 0.3224327474340479, + "learning_rate": 2.4224710609033197e-09, + "loss": 1.0637, + "step": 20860 + }, + { + "epoch": 2.0, + "grad_norm": 0.3357175478459066, + "learning_rate": 2.3136104360999002e-09, + "loss": 0.9157, + "step": 20861 + }, + { + "epoch": 2.0, + "grad_norm": 0.3217031719591415, + "learning_rate": 2.2072523162441904e-09, + "loss": 1.0496, + "step": 20862 + }, + { + "epoch": 2.0, + "grad_norm": 0.37161605482160304, + "learning_rate": 2.1033967040007262e-09, + "loss": 1.0153, + "step": 20863 + }, + { + "epoch": 2.0, + "grad_norm": 0.34245468801566115, + "learning_rate": 2.002043601956327e-09, + "loss": 1.1166, + "step": 20864 + }, + { + "epoch": 2.0, + "grad_norm": 0.27809407698146954, + "learning_rate": 1.903193012664506e-09, + "loss": 0.8039, + "step": 20865 + }, + { + "epoch": 2.0, + "grad_norm": 0.33048151174501983, + "learning_rate": 1.80684493860106e-09, + "loss": 1.0527, + "step": 20866 + }, + { + "epoch": 2.0, + "grad_norm": 0.29174150311861163, + "learning_rate": 1.7129993821529689e-09, + "loss": 1.0513, + "step": 20867 + }, + { + "epoch": 2.0, + "grad_norm": 0.3113769998817027, + "learning_rate": 1.6216563456850076e-09, + "loss": 1.0422, + "step": 20868 + }, + { + "epoch": 2.0, + "grad_norm": 0.30062949081609747, + "learning_rate": 1.5328158314953377e-09, + "loss": 1.0904, + "step": 20869 + }, + { + "epoch": 2.0, + "grad_norm": 0.3468310803485738, + "learning_rate": 1.4464778417822012e-09, + "loss": 0.9818, + "step": 20870 + }, + { + "epoch": 2.0, + "grad_norm": 0.26678497338098084, + "learning_rate": 1.3626423787216348e-09, + "loss": 0.9218, + "step": 20871 + }, + { + "epoch": 2.0, + "grad_norm": 0.2657637724983373, + "learning_rate": 1.2813094444008577e-09, + "loss": 0.9409, + "step": 20872 + }, + { + "epoch": 2.0, + "grad_norm": 0.3277302115940635, + "learning_rate": 1.202479040873783e-09, + "loss": 0.9403, + "step": 20873 + }, + { + "epoch": 2.0, + "grad_norm": 0.33057161627396603, + "learning_rate": 1.1261511700944027e-09, + "loss": 0.9782, + "step": 20874 + }, + { + "epoch": 2.0, + "grad_norm": 0.3297319235423377, + "learning_rate": 1.0523258339723009e-09, + "loss": 1.0455, + "step": 20875 + }, + { + "epoch": 2.0, + "grad_norm": 0.344243839448284, + "learning_rate": 9.810030343726517e-10, + "loss": 1.015, + "step": 20876 + }, + { + "epoch": 2.0, + "grad_norm": 0.3031485945474143, + "learning_rate": 9.121827730718124e-10, + "loss": 0.9786, + "step": 20877 + }, + { + "epoch": 2.0, + "grad_norm": 0.32183840202854536, + "learning_rate": 8.458650517906285e-10, + "loss": 1.0449, + "step": 20878 + }, + { + "epoch": 2.0, + "grad_norm": 0.35362777570056986, + "learning_rate": 7.820498721833325e-10, + "loss": 1.0093, + "step": 20879 + }, + { + "epoch": 2.0, + "grad_norm": 0.31948069295431664, + "learning_rate": 7.207372358486452e-10, + "loss": 0.9505, + "step": 20880 + }, + { + "epoch": 2.0, + "grad_norm": 0.29381911861235815, + "learning_rate": 6.619271443408792e-10, + "loss": 0.8994, + "step": 20881 + }, + { + "epoch": 2.0, + "grad_norm": 0.28722896822284183, + "learning_rate": 6.056195991033242e-10, + "loss": 0.8341, + "step": 20882 + }, + { + "epoch": 2.0, + "grad_norm": 0.3176559113486959, + "learning_rate": 5.518146015570658e-10, + "loss": 1.0617, + "step": 20883 + }, + { + "epoch": 2.0, + "grad_norm": 0.3421126125088014, + "learning_rate": 5.005121530565759e-10, + "loss": 1.0209, + "step": 20884 + }, + { + "epoch": 2.0, + "grad_norm": 0.33636103155931574, + "learning_rate": 4.5171225486750903e-10, + "loss": 0.9165, + "step": 20885 + }, + { + "epoch": 2.0, + "grad_norm": 0.3455929664766023, + "learning_rate": 4.0541490823331473e-10, + "loss": 1.1375, + "step": 20886 + }, + { + "epoch": 2.0, + "grad_norm": 0.2711455817650343, + "learning_rate": 3.6162011428642063e-10, + "loss": 0.9855, + "step": 20887 + }, + { + "epoch": 2.0, + "grad_norm": 0.2950559266827984, + "learning_rate": 3.203278741481519e-10, + "loss": 0.9185, + "step": 20888 + }, + { + "epoch": 2.0, + "grad_norm": 0.2946877160975644, + "learning_rate": 2.8153818883991377e-10, + "loss": 0.9602, + "step": 20889 + }, + { + "epoch": 2.0, + "grad_norm": 0.30017334088704206, + "learning_rate": 2.452510593387025e-10, + "loss": 1.1406, + "step": 20890 + }, + { + "epoch": 2.0, + "grad_norm": 0.29525864121017376, + "learning_rate": 2.1146648653269653e-10, + "loss": 0.9903, + "step": 20891 + }, + { + "epoch": 2.0, + "grad_norm": 0.38665768014093277, + "learning_rate": 1.8018447129897197e-10, + "loss": 1.079, + "step": 20892 + }, + { + "epoch": 2.0, + "grad_norm": 0.3997233910643502, + "learning_rate": 1.5140501439248055e-10, + "loss": 0.936, + "step": 20893 + }, + { + "epoch": 2.0, + "grad_norm": 0.3233568942901715, + "learning_rate": 1.2512811654596946e-10, + "loss": 1.0108, + "step": 20894 + }, + { + "epoch": 2.0, + "grad_norm": 0.3109678606554696, + "learning_rate": 1.0135377842557247e-10, + "loss": 0.9199, + "step": 20895 + }, + { + "epoch": 2.0, + "grad_norm": 0.31174791257344026, + "learning_rate": 8.008200060860561e-11, + "loss": 1.0536, + "step": 20896 + }, + { + "epoch": 2.0, + "grad_norm": 0.31805106552648377, + "learning_rate": 6.131278362797588e-11, + "loss": 0.9641, + "step": 20897 + }, + { + "epoch": 2.0, + "grad_norm": 0.31602732720537613, + "learning_rate": 4.504612797218144e-11, + "loss": 0.962, + "step": 20898 + }, + { + "epoch": 2.0, + "grad_norm": 0.2506208224950041, + "learning_rate": 3.1282034029800344e-11, + "loss": 1.012, + "step": 20899 + }, + { + "epoch": 2.0, + "grad_norm": 0.3227623641876244, + "learning_rate": 2.002050215610396e-11, + "loss": 0.9707, + "step": 20900 + }, + { + "epoch": 2.0, + "grad_norm": 0.2974759428170827, + "learning_rate": 1.1261532628648041e-11, + "loss": 1.0837, + "step": 20901 + }, + { + "epoch": 2.0, + "grad_norm": 0.34056986023831465, + "learning_rate": 5.005125669477195e-12, + "loss": 1.0279, + "step": 20902 + }, + { + "epoch": 2.0, + "grad_norm": 0.2968971096908255, + "learning_rate": 1.251281422920414e-12, + "loss": 1.0098, + "step": 20903 + }, + { + "epoch": 2.0, + "grad_norm": 0.35579361993081154, + "learning_rate": 0.0, + "loss": 1.0677, + "step": 20904 + }, + { + "epoch": 2.0, + "eval_loss": 1.1242717504501343, + "eval_runtime": 4224.8941, + "eval_samples_per_second": 19.792, + "eval_steps_per_second": 2.474, + "step": 20904 + } + ], + "logging_steps": 1, + "max_steps": 20904, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10452, + "total_flos": 4.609128114683904e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}