diff --git "a/checkpoint-10452/trainer_state.json" "b/checkpoint-10452/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10452/trainer_state.json" @@ -0,0 +1,73225 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999521645539344, + "eval_steps": 2613, + "global_step": 10452, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.240769189720181, + "learning_rate": 1.9138755980861247e-07, + "loss": 1.9865, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 2.280837297439575, + "eval_runtime": 4185.9784, + "eval_samples_per_second": 19.976, + "eval_steps_per_second": 2.497, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 3.223090418187222, + "learning_rate": 3.8277511961722493e-07, + "loss": 1.9703, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 3.6145046678806407, + "learning_rate": 5.741626794258373e-07, + "loss": 2.1381, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.6296311605868885, + "learning_rate": 7.655502392344499e-07, + "loss": 2.456, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 3.8401650197500428, + "learning_rate": 9.569377990430622e-07, + "loss": 2.1077, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 3.683890691252437, + "learning_rate": 1.1483253588516746e-06, + "loss": 2.1413, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 3.6432534368591782, + "learning_rate": 1.339712918660287e-06, + "loss": 2.1568, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 4.663365277527622, + "learning_rate": 1.5311004784688997e-06, + "loss": 2.3463, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 3.6372978147069697, + "learning_rate": 1.722488038277512e-06, + "loss": 2.1712, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 3.852872878211269, + "learning_rate": 1.9138755980861244e-06, + "loss": 2.2722, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 3.6549109607459043, + "learning_rate": 2.105263157894737e-06, + "loss": 1.9729, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 3.2318002906734535, + "learning_rate": 2.2966507177033493e-06, + "loss": 2.0595, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.6991822490019413, + "learning_rate": 2.488038277511962e-06, + "loss": 2.0928, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 4.159355481440864, + "learning_rate": 2.679425837320574e-06, + "loss": 2.1411, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 4.061437005322893, + "learning_rate": 2.870813397129187e-06, + "loss": 2.3334, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 3.7145200953671766, + "learning_rate": 3.0622009569377995e-06, + "loss": 1.9779, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 4.3951391847417565, + "learning_rate": 3.2535885167464113e-06, + "loss": 2.24, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 4.225335307764433, + "learning_rate": 3.444976076555024e-06, + "loss": 2.2625, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 4.08014522806263, + "learning_rate": 3.636363636363636e-06, + "loss": 2.1072, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 4.750695114221436, + "learning_rate": 3.827751196172249e-06, + "loss": 2.2998, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 4.576295112434383, + "learning_rate": 4.019138755980861e-06, + "loss": 2.1414, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.5573739807898805, + "learning_rate": 4.210526315789474e-06, + "loss": 2.3373, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 4.369397245889477, + "learning_rate": 4.401913875598086e-06, + "loss": 2.1658, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 5.28624090773153, + "learning_rate": 4.5933014354066986e-06, + "loss": 2.2938, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 4.9534751396409655, + "learning_rate": 4.784688995215311e-06, + "loss": 2.2733, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 5.381089984082533, + "learning_rate": 4.976076555023924e-06, + "loss": 2.2244, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 6.226111694547378, + "learning_rate": 5.167464114832536e-06, + "loss": 2.2223, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.5666909272147245, + "learning_rate": 5.358851674641148e-06, + "loss": 1.7762, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 5.398756537107184, + "learning_rate": 5.550239234449761e-06, + "loss": 2.4073, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 4.315805192926013, + "learning_rate": 5.741626794258374e-06, + "loss": 2.1847, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.532277066830244, + "learning_rate": 5.933014354066986e-06, + "loss": 2.2245, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 3.413171467566141, + "learning_rate": 6.124401913875599e-06, + "loss": 2.0755, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.2310871639043746, + "learning_rate": 6.315789473684211e-06, + "loss": 2.2345, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 3.3848849049261207, + "learning_rate": 6.5071770334928226e-06, + "loss": 2.0588, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 3.0082260641673706, + "learning_rate": 6.698564593301436e-06, + "loss": 2.1575, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.45419363377176, + "learning_rate": 6.889952153110048e-06, + "loss": 2.2532, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.316572782291838, + "learning_rate": 7.081339712918661e-06, + "loss": 1.9725, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.2302956707592998, + "learning_rate": 7.272727272727272e-06, + "loss": 2.2166, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.2417025568909201, + "learning_rate": 7.464114832535886e-06, + "loss": 2.2785, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.0623452341947435, + "learning_rate": 7.655502392344498e-06, + "loss": 2.1188, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.9459508227302174, + "learning_rate": 7.846889952153112e-06, + "loss": 2.1225, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 0.9821381236389229, + "learning_rate": 8.038277511961722e-06, + "loss": 1.9644, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.8227720071849114, + "learning_rate": 8.229665071770336e-06, + "loss": 1.8818, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.9772589694714745, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1738, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.8288391194584817, + "learning_rate": 8.61244019138756e-06, + "loss": 2.0598, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.7365091983688079, + "learning_rate": 8.803827751196173e-06, + "loss": 2.0051, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 0.8202389292532828, + "learning_rate": 8.995215311004785e-06, + "loss": 2.0005, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 0.7517874679840845, + "learning_rate": 9.186602870813397e-06, + "loss": 1.9756, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.8041110653388241, + "learning_rate": 9.377990430622011e-06, + "loss": 2.1019, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 0.7364867668939584, + "learning_rate": 9.569377990430622e-06, + "loss": 1.8784, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.5933286532299948, + "learning_rate": 9.760765550239234e-06, + "loss": 1.743, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 0.792868413879173, + "learning_rate": 9.952153110047848e-06, + "loss": 1.8373, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.8312221240251918, + "learning_rate": 1.014354066985646e-05, + "loss": 1.9098, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 0.8539046550616419, + "learning_rate": 1.0334928229665072e-05, + "loss": 2.0752, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.7680076367144099, + "learning_rate": 1.0526315789473684e-05, + "loss": 2.0076, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.6386760911858004, + "learning_rate": 1.0717703349282297e-05, + "loss": 1.8095, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.7287884197298926, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.9806, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.7722026786294885, + "learning_rate": 1.1100478468899523e-05, + "loss": 1.9635, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.7231409481435274, + "learning_rate": 1.1291866028708133e-05, + "loss": 1.8613, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.7824727930965788, + "learning_rate": 1.1483253588516747e-05, + "loss": 2.1725, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.6677563917530093, + "learning_rate": 1.167464114832536e-05, + "loss": 1.9352, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.7009328416411565, + "learning_rate": 1.1866028708133972e-05, + "loss": 1.8884, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.9109348341862854, + "learning_rate": 1.2057416267942584e-05, + "loss": 2.3183, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.8411347289624691, + "learning_rate": 1.2248803827751198e-05, + "loss": 1.8075, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.7674394625955453, + "learning_rate": 1.2440191387559808e-05, + "loss": 1.9963, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.7848289715014719, + "learning_rate": 1.2631578947368422e-05, + "loss": 1.9276, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.7238409990110771, + "learning_rate": 1.2822966507177035e-05, + "loss": 1.9353, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.8235100929493389, + "learning_rate": 1.3014354066985645e-05, + "loss": 1.8938, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.6283052176051154, + "learning_rate": 1.320574162679426e-05, + "loss": 1.8028, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.654778519712797, + "learning_rate": 1.3397129186602871e-05, + "loss": 1.6469, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.7728600382414659, + "learning_rate": 1.3588516746411483e-05, + "loss": 1.7839, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.9160985506203067, + "learning_rate": 1.3779904306220096e-05, + "loss": 1.8154, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.8592513182940328, + "learning_rate": 1.397129186602871e-05, + "loss": 1.8178, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.0716389160468884, + "learning_rate": 1.4162679425837322e-05, + "loss": 1.9752, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.0526260733114412, + "learning_rate": 1.4354066985645934e-05, + "loss": 1.8998, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.9676822741503439, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.7514, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.2545561036432802, + "learning_rate": 1.4736842105263157e-05, + "loss": 1.8165, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.834675739891747, + "learning_rate": 1.4928229665071772e-05, + "loss": 1.8987, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.25568745378183, + "learning_rate": 1.5119617224880383e-05, + "loss": 1.8836, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.8547248072580093, + "learning_rate": 1.5311004784688995e-05, + "loss": 1.625, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.9130624521617219, + "learning_rate": 1.5502392344497607e-05, + "loss": 1.65, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.4571618225829908, + "learning_rate": 1.5693779904306223e-05, + "loss": 1.7592, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.5681812827811266, + "learning_rate": 1.5885167464114832e-05, + "loss": 1.7274, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.7707101692645586, + "learning_rate": 1.6076555023923444e-05, + "loss": 1.7496, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.4920166991759014, + "learning_rate": 1.6267942583732056e-05, + "loss": 1.815, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.765151570274179, + "learning_rate": 1.6459330143540672e-05, + "loss": 1.6131, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.7780322419019302, + "learning_rate": 1.6650717703349284e-05, + "loss": 1.6553, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.6971849026842456, + "learning_rate": 1.6842105263157896e-05, + "loss": 1.6212, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.6737627863204966, + "learning_rate": 1.7033492822966505e-05, + "loss": 1.6319, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.7581487338796276, + "learning_rate": 1.722488038277512e-05, + "loss": 1.5457, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.6884712501693219, + "learning_rate": 1.7416267942583733e-05, + "loss": 1.682, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.6472187205118166, + "learning_rate": 1.7607655502392345e-05, + "loss": 1.7893, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.5392317136449472, + "learning_rate": 1.7799043062200958e-05, + "loss": 1.5724, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.613083864173068, + "learning_rate": 1.799043062200957e-05, + "loss": 1.5933, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.554854239010525, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.4444, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.5895819501317506, + "learning_rate": 1.8373205741626794e-05, + "loss": 1.463, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.6834369586439761, + "learning_rate": 1.8564593301435407e-05, + "loss": 1.6334, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.5941595870489239, + "learning_rate": 1.8755980861244022e-05, + "loss": 1.6881, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.6765509171345243, + "learning_rate": 1.8947368421052634e-05, + "loss": 1.5896, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.5523045136213458, + "learning_rate": 1.9138755980861243e-05, + "loss": 1.4551, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.6537194645181346, + "learning_rate": 1.9330143540669855e-05, + "loss": 1.5513, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.5087498837023968, + "learning_rate": 1.9521531100478468e-05, + "loss": 1.4715, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.6335719742406299, + "learning_rate": 1.9712918660287083e-05, + "loss": 1.5313, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.647032541641365, + "learning_rate": 1.9904306220095696e-05, + "loss": 1.4794, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.5641054209541718, + "learning_rate": 2.0095693779904308e-05, + "loss": 1.5301, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.623411201252464, + "learning_rate": 2.028708133971292e-05, + "loss": 1.4618, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.6469566283028377, + "learning_rate": 2.0478468899521532e-05, + "loss": 1.5438, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.5328026134772941, + "learning_rate": 2.0669856459330144e-05, + "loss": 1.4587, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.5404945621124031, + "learning_rate": 2.0861244019138757e-05, + "loss": 1.3749, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.6198671572468233, + "learning_rate": 2.105263157894737e-05, + "loss": 1.5662, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.5344071404450242, + "learning_rate": 2.1244019138755985e-05, + "loss": 1.6258, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.5077736180550365, + "learning_rate": 2.1435406698564593e-05, + "loss": 1.4085, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.5916690974950262, + "learning_rate": 2.1626794258373206e-05, + "loss": 1.4315, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.4799015981708095, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.4183, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.4571232168458159, + "learning_rate": 2.2009569377990433e-05, + "loss": 1.3658, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.4917591648756044, + "learning_rate": 2.2200956937799046e-05, + "loss": 1.3724, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.4170683829952185, + "learning_rate": 2.2392344497607658e-05, + "loss": 1.4197, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.49893891430948417, + "learning_rate": 2.2583732057416267e-05, + "loss": 1.4005, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.4056789281529712, + "learning_rate": 2.2775119617224882e-05, + "loss": 1.4419, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.47274513283719816, + "learning_rate": 2.2966507177033495e-05, + "loss": 1.3163, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.4495315355981231, + "learning_rate": 2.3157894736842107e-05, + "loss": 1.3599, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.4142057365520808, + "learning_rate": 2.334928229665072e-05, + "loss": 1.4484, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.4421864791497608, + "learning_rate": 2.354066985645933e-05, + "loss": 1.3629, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.44850267009277406, + "learning_rate": 2.3732057416267943e-05, + "loss": 1.4036, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.49021107734394687, + "learning_rate": 2.3923444976076556e-05, + "loss": 1.3724, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.47738353199872885, + "learning_rate": 2.4114832535885168e-05, + "loss": 1.3304, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.4704616937001858, + "learning_rate": 2.4306220095693784e-05, + "loss": 1.376, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.4351806446488345, + "learning_rate": 2.4497607655502396e-05, + "loss": 1.3438, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.5275547600152736, + "learning_rate": 2.4688995215311005e-05, + "loss": 1.2731, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.3787982803600526, + "learning_rate": 2.4880382775119617e-05, + "loss": 1.3392, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.3850735556535156, + "learning_rate": 2.507177033492823e-05, + "loss": 1.2673, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.4339128757111585, + "learning_rate": 2.5263157894736845e-05, + "loss": 1.3284, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.39005187564392557, + "learning_rate": 2.5454545454545454e-05, + "loss": 1.3347, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.35602922226403455, + "learning_rate": 2.564593301435407e-05, + "loss": 1.2362, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.40601160369590406, + "learning_rate": 2.583732057416268e-05, + "loss": 1.3591, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.4408392546545607, + "learning_rate": 2.602870813397129e-05, + "loss": 1.2837, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.46362552165830334, + "learning_rate": 2.6220095693779906e-05, + "loss": 1.316, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.3504892333833388, + "learning_rate": 2.641148325358852e-05, + "loss": 1.1857, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.4362884443543931, + "learning_rate": 2.660287081339713e-05, + "loss": 1.2435, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.50850093231113, + "learning_rate": 2.6794258373205743e-05, + "loss": 1.3235, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.4187742784108479, + "learning_rate": 2.698564593301435e-05, + "loss": 1.3693, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 0.44107080428520234, + "learning_rate": 2.7177033492822967e-05, + "loss": 1.202, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.42167608964495673, + "learning_rate": 2.7368421052631583e-05, + "loss": 1.2693, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.4269486411389392, + "learning_rate": 2.755980861244019e-05, + "loss": 1.1825, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.4220464029961541, + "learning_rate": 2.7751196172248807e-05, + "loss": 1.3205, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.4620642882432513, + "learning_rate": 2.794258373205742e-05, + "loss": 1.2891, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.43743294942437483, + "learning_rate": 2.8133971291866028e-05, + "loss": 1.1969, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.4414019243397515, + "learning_rate": 2.8325358851674644e-05, + "loss": 1.1672, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.1295244665123245, + "learning_rate": 2.8516746411483253e-05, + "loss": 1.3175, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.5055922001428373, + "learning_rate": 2.8708133971291868e-05, + "loss": 1.2696, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.48319661000499564, + "learning_rate": 2.889952153110048e-05, + "loss": 1.3168, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.5720466946520338, + "learning_rate": 2.909090909090909e-05, + "loss": 1.2982, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.68753321360197, + "learning_rate": 2.9282296650717705e-05, + "loss": 1.2791, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 0.6221880802372888, + "learning_rate": 2.9473684210526314e-05, + "loss": 1.2647, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.5504261765445813, + "learning_rate": 2.966507177033493e-05, + "loss": 1.3102, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.6618113787746298, + "learning_rate": 2.9856459330143545e-05, + "loss": 1.292, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.6002633450221915, + "learning_rate": 3.0047846889952154e-05, + "loss": 1.33, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 0.7118508153621136, + "learning_rate": 3.0239234449760766e-05, + "loss": 1.2062, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 0.6591754857765905, + "learning_rate": 3.043062200956938e-05, + "loss": 1.2539, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 0.4714338006090391, + "learning_rate": 3.062200956937799e-05, + "loss": 1.2384, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 1.0300987484894157, + "learning_rate": 3.08133971291866e-05, + "loss": 1.2171, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 0.6469298394859133, + "learning_rate": 3.1004784688995215e-05, + "loss": 1.0836, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 0.5601908591127592, + "learning_rate": 3.119617224880383e-05, + "loss": 1.1867, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 0.7363177930575833, + "learning_rate": 3.1387559808612446e-05, + "loss": 1.3128, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 0.7107903409156414, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2501, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.464636577316954, + "learning_rate": 3.1770334928229664e-05, + "loss": 1.1671, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 0.8259444575108333, + "learning_rate": 3.196172248803828e-05, + "loss": 1.2928, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 0.7945572957938246, + "learning_rate": 3.215311004784689e-05, + "loss": 1.2516, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 0.5895739273419747, + "learning_rate": 3.234449760765551e-05, + "loss": 1.162, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 0.6927568627921579, + "learning_rate": 3.253588516746411e-05, + "loss": 1.2802, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.639241808296525, + "learning_rate": 3.272727272727273e-05, + "loss": 1.1999, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 0.8979192020036151, + "learning_rate": 3.2918660287081344e-05, + "loss": 1.1854, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.5314730657868201, + "learning_rate": 3.311004784688995e-05, + "loss": 1.2365, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 0.3835011610660225, + "learning_rate": 3.330143540669857e-05, + "loss": 1.3014, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 0.46600481942740146, + "learning_rate": 3.349282296650718e-05, + "loss": 1.2222, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.568331869349057, + "learning_rate": 3.368421052631579e-05, + "loss": 1.2219, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 0.5161782745938108, + "learning_rate": 3.3875598086124405e-05, + "loss": 1.0828, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 0.5678907934059946, + "learning_rate": 3.406698564593301e-05, + "loss": 1.241, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 0.40729089487435055, + "learning_rate": 3.425837320574163e-05, + "loss": 1.2201, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.5273474366443621, + "learning_rate": 3.444976076555024e-05, + "loss": 1.1262, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.4807276904108668, + "learning_rate": 3.4641148325358854e-05, + "loss": 1.1957, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 0.5372452514446321, + "learning_rate": 3.4832535885167466e-05, + "loss": 1.1621, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.6605408366116067, + "learning_rate": 3.502392344497608e-05, + "loss": 1.2327, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 0.5555779108688886, + "learning_rate": 3.521531100478469e-05, + "loss": 1.1938, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 0.5517608446497938, + "learning_rate": 3.54066985645933e-05, + "loss": 1.2651, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.4524359827250868, + "learning_rate": 3.5598086124401915e-05, + "loss": 1.2182, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.4495056117139968, + "learning_rate": 3.578947368421053e-05, + "loss": 1.2133, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 0.43420633627744476, + "learning_rate": 3.598086124401914e-05, + "loss": 1.078, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 0.4991578407930919, + "learning_rate": 3.617224880382775e-05, + "loss": 1.0683, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.46736895171034254, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.1533, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.48830760264155376, + "learning_rate": 3.6555023923444976e-05, + "loss": 1.2539, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 0.5148588474362115, + "learning_rate": 3.674641148325359e-05, + "loss": 1.0486, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 0.4669961130515612, + "learning_rate": 3.693779904306221e-05, + "loss": 1.1949, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 0.32879504352969363, + "learning_rate": 3.712918660287081e-05, + "loss": 1.189, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.35633398240141656, + "learning_rate": 3.7320574162679425e-05, + "loss": 1.2112, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.42378382756263683, + "learning_rate": 3.7511961722488044e-05, + "loss": 1.1999, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 0.41483450868577093, + "learning_rate": 3.770334928229665e-05, + "loss": 1.1283, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 0.4603179290827383, + "learning_rate": 3.789473684210527e-05, + "loss": 1.2353, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.5574543274654291, + "learning_rate": 3.8086124401913874e-05, + "loss": 1.1948, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 0.6846239185739854, + "learning_rate": 3.8277511961722486e-05, + "loss": 1.3231, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.520683870817622, + "learning_rate": 3.8468899521531105e-05, + "loss": 1.2201, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.42683935885681074, + "learning_rate": 3.866028708133971e-05, + "loss": 1.165, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 0.39470003294526096, + "learning_rate": 3.885167464114833e-05, + "loss": 1.1994, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 0.4391753396869876, + "learning_rate": 3.9043062200956935e-05, + "loss": 1.1813, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 0.4449276816002856, + "learning_rate": 3.9234449760765554e-05, + "loss": 1.1764, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.45472581470602813, + "learning_rate": 3.9425837320574167e-05, + "loss": 1.1766, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 0.37367376899076454, + "learning_rate": 3.961722488038277e-05, + "loss": 1.0581, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 0.5183447282360463, + "learning_rate": 3.980861244019139e-05, + "loss": 1.1351, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 0.5895054254055697, + "learning_rate": 4e-05, + "loss": 1.2017, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 0.39968348371397444, + "learning_rate": 4.0191387559808616e-05, + "loss": 1.2353, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.4089518582688394, + "learning_rate": 4.038277511961723e-05, + "loss": 1.2072, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.5490805672572786, + "learning_rate": 4.057416267942584e-05, + "loss": 1.2593, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.42397521041394676, + "learning_rate": 4.076555023923445e-05, + "loss": 1.155, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 0.4981026360180907, + "learning_rate": 4.0956937799043064e-05, + "loss": 1.2279, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 0.503200310794026, + "learning_rate": 4.114832535885168e-05, + "loss": 1.1134, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.5127306365983669, + "learning_rate": 4.133971291866029e-05, + "loss": 1.1993, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 0.4935452094233486, + "learning_rate": 4.15311004784689e-05, + "loss": 1.1962, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 0.38834813884933866, + "learning_rate": 4.172248803827751e-05, + "loss": 1.2529, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 0.4742710182667002, + "learning_rate": 4.1913875598086126e-05, + "loss": 1.1861, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 0.4762816225973413, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2221, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.5405607329270613, + "learning_rate": 4.229665071770335e-05, + "loss": 1.3099, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.38474621695351074, + "learning_rate": 4.248803827751197e-05, + "loss": 1.1669, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 0.47522706488556143, + "learning_rate": 4.2679425837320574e-05, + "loss": 1.1813, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 0.43816325603257117, + "learning_rate": 4.287081339712919e-05, + "loss": 1.1912, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 0.47579537819240086, + "learning_rate": 4.3062200956937806e-05, + "loss": 1.165, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.5148567080870619, + "learning_rate": 4.325358851674641e-05, + "loss": 1.1997, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 0.4019413785534228, + "learning_rate": 4.344497607655503e-05, + "loss": 1.2448, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.4766231281692962, + "learning_rate": 4.3636363636363636e-05, + "loss": 1.1374, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 0.4375975135050194, + "learning_rate": 4.382775119617225e-05, + "loss": 1.2484, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 0.4312959866778293, + "learning_rate": 4.401913875598087e-05, + "loss": 1.1143, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.4329834158025715, + "learning_rate": 4.421052631578947e-05, + "loss": 1.2335, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.37836691609878936, + "learning_rate": 4.440191387559809e-05, + "loss": 1.118, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 0.37526619539689987, + "learning_rate": 4.45933014354067e-05, + "loss": 1.2968, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 0.4175158591003783, + "learning_rate": 4.4784688995215316e-05, + "loss": 1.1954, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 0.37094222885704037, + "learning_rate": 4.497607655502393e-05, + "loss": 1.0976, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.4586856505320375, + "learning_rate": 4.5167464114832533e-05, + "loss": 1.1628, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.44587593738328374, + "learning_rate": 4.535885167464115e-05, + "loss": 1.1917, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 0.3594860823761438, + "learning_rate": 4.5550239234449765e-05, + "loss": 1.192, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 0.38283214874386773, + "learning_rate": 4.574162679425838e-05, + "loss": 1.2205, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 0.33492904991363276, + "learning_rate": 4.593301435406699e-05, + "loss": 1.2155, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.4113511012996176, + "learning_rate": 4.6124401913875595e-05, + "loss": 1.2222, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 0.4337572841707065, + "learning_rate": 4.6315789473684214e-05, + "loss": 1.0193, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.34650969754183997, + "learning_rate": 4.6507177033492826e-05, + "loss": 1.2571, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 0.4758935405313615, + "learning_rate": 4.669856459330144e-05, + "loss": 1.1967, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.39684085109310996, + "learning_rate": 4.688995215311005e-05, + "loss": 1.1169, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.4823470965603195, + "learning_rate": 4.708133971291866e-05, + "loss": 1.2347, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 0.37717028125791024, + "learning_rate": 4.7272727272727275e-05, + "loss": 1.094, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 0.3667379872819918, + "learning_rate": 4.746411483253589e-05, + "loss": 1.2391, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 0.3407975972338634, + "learning_rate": 4.76555023923445e-05, + "loss": 1.0632, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 0.49238987132507245, + "learning_rate": 4.784688995215311e-05, + "loss": 1.2121, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.46706882037804415, + "learning_rate": 4.8038277511961724e-05, + "loss": 1.2437, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 0.4088473587226235, + "learning_rate": 4.8229665071770336e-05, + "loss": 1.1572, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 0.3821462444529073, + "learning_rate": 4.842105263157895e-05, + "loss": 1.1603, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 0.4782990514183433, + "learning_rate": 4.861244019138757e-05, + "loss": 1.2409, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 0.45087555655452305, + "learning_rate": 4.880382775119617e-05, + "loss": 1.1556, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 0.3865676678179632, + "learning_rate": 4.899521531100479e-05, + "loss": 1.1945, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.4099982787948705, + "learning_rate": 4.91866028708134e-05, + "loss": 1.17, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 0.46016977600480446, + "learning_rate": 4.937799043062201e-05, + "loss": 1.2024, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 0.4712335677622079, + "learning_rate": 4.956937799043063e-05, + "loss": 1.1252, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 0.3955435220531026, + "learning_rate": 4.9760765550239234e-05, + "loss": 1.201, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.48382588647342273, + "learning_rate": 4.995215311004785e-05, + "loss": 1.3135, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 0.39028060459262043, + "learning_rate": 5.014354066985646e-05, + "loss": 1.2573, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 0.4094204023390644, + "learning_rate": 5.033492822966508e-05, + "loss": 1.1411, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 0.37405702073279246, + "learning_rate": 5.052631578947369e-05, + "loss": 1.1924, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 0.38656220760519894, + "learning_rate": 5.0717703349282295e-05, + "loss": 1.2062, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.4152109433386491, + "learning_rate": 5.090909090909091e-05, + "loss": 1.2412, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 0.3485732917182752, + "learning_rate": 5.1100478468899526e-05, + "loss": 1.2917, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 0.3551920352066169, + "learning_rate": 5.129186602870814e-05, + "loss": 1.1494, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 0.3817527119708618, + "learning_rate": 5.1483253588516744e-05, + "loss": 1.2689, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 0.35043446712727827, + "learning_rate": 5.167464114832536e-05, + "loss": 1.1263, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.4560107489672836, + "learning_rate": 5.1866028708133975e-05, + "loss": 1.2355, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 0.411519693827521, + "learning_rate": 5.205741626794258e-05, + "loss": 1.2213, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 0.4017892181431639, + "learning_rate": 5.22488038277512e-05, + "loss": 1.2103, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 0.46457066156222404, + "learning_rate": 5.244019138755981e-05, + "loss": 1.1638, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 0.41073989648253845, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2773, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 0.3730027098705776, + "learning_rate": 5.282296650717704e-05, + "loss": 1.1376, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 0.3860157617102252, + "learning_rate": 5.301435406698565e-05, + "loss": 1.2109, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 0.4349386298334119, + "learning_rate": 5.320574162679426e-05, + "loss": 1.1575, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 0.4640773330272392, + "learning_rate": 5.339712918660288e-05, + "loss": 1.1985, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 0.45865729558497004, + "learning_rate": 5.3588516746411485e-05, + "loss": 1.2145, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 0.37111837421497373, + "learning_rate": 5.37799043062201e-05, + "loss": 1.1538, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 0.40099943789483994, + "learning_rate": 5.39712918660287e-05, + "loss": 1.0804, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 0.42817755702563415, + "learning_rate": 5.416267942583733e-05, + "loss": 1.202, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 0.34712198940947375, + "learning_rate": 5.4354066985645934e-05, + "loss": 1.2341, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 0.39457404932238177, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1571, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 0.36930568124767427, + "learning_rate": 5.4736842105263165e-05, + "loss": 1.1275, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 0.4662030178725213, + "learning_rate": 5.492822966507177e-05, + "loss": 1.1811, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 0.40587674329213624, + "learning_rate": 5.511961722488038e-05, + "loss": 1.102, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 0.35779077893034156, + "learning_rate": 5.5311004784689e-05, + "loss": 1.2788, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 0.31159530938785, + "learning_rate": 5.5502392344497614e-05, + "loss": 1.1778, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 0.4160898781250681, + "learning_rate": 5.569377990430622e-05, + "loss": 1.2064, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 0.43660026046564104, + "learning_rate": 5.588516746411484e-05, + "loss": 1.1494, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 0.40119987919562433, + "learning_rate": 5.607655502392345e-05, + "loss": 1.1623, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 0.37717288362355916, + "learning_rate": 5.6267942583732056e-05, + "loss": 1.1194, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 0.4057229096001704, + "learning_rate": 5.645933014354067e-05, + "loss": 1.0581, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 0.4670818825975868, + "learning_rate": 5.665071770334929e-05, + "loss": 1.171, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 0.5482039795115164, + "learning_rate": 5.68421052631579e-05, + "loss": 1.1082, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 0.4468277071138632, + "learning_rate": 5.7033492822966505e-05, + "loss": 1.2785, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 0.3500497863626851, + "learning_rate": 5.7224880382775124e-05, + "loss": 1.1713, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 0.5457539009822924, + "learning_rate": 5.7416267942583736e-05, + "loss": 1.1259, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 0.3521279155346515, + "learning_rate": 5.760765550239234e-05, + "loss": 1.2143, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 0.38014559184795155, + "learning_rate": 5.779904306220096e-05, + "loss": 1.2169, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 0.4368587205355613, + "learning_rate": 5.799043062200957e-05, + "loss": 1.1354, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 0.3773750084543655, + "learning_rate": 5.818181818181818e-05, + "loss": 1.204, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 0.4661908984222087, + "learning_rate": 5.8373205741626804e-05, + "loss": 1.2339, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 0.4233656313317972, + "learning_rate": 5.856459330143541e-05, + "loss": 1.3069, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 0.4452233614591402, + "learning_rate": 5.875598086124402e-05, + "loss": 1.1966, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 0.379869871739064, + "learning_rate": 5.894736842105263e-05, + "loss": 1.1233, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 0.3634900550395493, + "learning_rate": 5.9138755980861246e-05, + "loss": 1.2106, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 0.38506818193045056, + "learning_rate": 5.933014354066986e-05, + "loss": 1.2411, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 0.41159846103302217, + "learning_rate": 5.9521531100478464e-05, + "loss": 1.3279, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 0.28921210577002654, + "learning_rate": 5.971291866028709e-05, + "loss": 1.0785, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 0.38616302251379914, + "learning_rate": 5.9904306220095695e-05, + "loss": 1.2536, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 0.4101883225822147, + "learning_rate": 6.009569377990431e-05, + "loss": 1.1666, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 0.3423514370886581, + "learning_rate": 6.028708133971293e-05, + "loss": 1.2331, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 0.41207456262790537, + "learning_rate": 6.047846889952153e-05, + "loss": 1.1403, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 0.40588734564935647, + "learning_rate": 6.0669856459330144e-05, + "loss": 1.1986, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 0.37104840888562607, + "learning_rate": 6.086124401913876e-05, + "loss": 1.169, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 0.39655898223668296, + "learning_rate": 6.105263157894737e-05, + "loss": 1.1296, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 0.5221245550161207, + "learning_rate": 6.124401913875598e-05, + "loss": 1.0809, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 0.4021445879644258, + "learning_rate": 6.143540669856461e-05, + "loss": 1.1897, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 0.38180254930918234, + "learning_rate": 6.16267942583732e-05, + "loss": 1.1859, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 0.3828401683172079, + "learning_rate": 6.181818181818182e-05, + "loss": 1.0985, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 0.43356536766643855, + "learning_rate": 6.200956937799043e-05, + "loss": 1.1524, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 0.39197554253988764, + "learning_rate": 6.220095693779904e-05, + "loss": 1.1607, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.3768192836960789, + "learning_rate": 6.239234449760765e-05, + "loss": 1.2115, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 0.4123016217273888, + "learning_rate": 6.258373205741627e-05, + "loss": 1.2046, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 0.38690332283125584, + "learning_rate": 6.277511961722489e-05, + "loss": 1.2207, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 0.3958359496615601, + "learning_rate": 6.296650717703349e-05, + "loss": 1.1334, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 0.36561214007046366, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1196, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 0.36953124111301633, + "learning_rate": 6.334928229665073e-05, + "loss": 1.1203, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 0.391180586728009, + "learning_rate": 6.354066985645933e-05, + "loss": 1.2479, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 0.483739471817649, + "learning_rate": 6.373205741626794e-05, + "loss": 1.1225, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 0.4233252985553476, + "learning_rate": 6.392344497607657e-05, + "loss": 1.1772, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 0.4261190391654762, + "learning_rate": 6.411483253588518e-05, + "loss": 1.1752, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 0.4217550233368759, + "learning_rate": 6.430622009569378e-05, + "loss": 1.2335, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 0.4126104400962645, + "learning_rate": 6.449760765550239e-05, + "loss": 1.1095, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 0.48994576863601885, + "learning_rate": 6.468899521531101e-05, + "loss": 1.2933, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 0.3928982582790676, + "learning_rate": 6.488038277511961e-05, + "loss": 1.1598, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 0.38476512783934874, + "learning_rate": 6.507177033492823e-05, + "loss": 1.3208, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 0.3810331248367921, + "learning_rate": 6.526315789473685e-05, + "loss": 1.1183, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 0.38985962649469763, + "learning_rate": 6.545454545454546e-05, + "loss": 1.2319, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 0.44280071804731985, + "learning_rate": 6.564593301435406e-05, + "loss": 1.2374, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 0.3221949054184306, + "learning_rate": 6.583732057416269e-05, + "loss": 1.182, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 0.3765883193637935, + "learning_rate": 6.60287081339713e-05, + "loss": 1.085, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 0.36077725709386577, + "learning_rate": 6.62200956937799e-05, + "loss": 1.1741, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 0.3844614909912839, + "learning_rate": 6.641148325358852e-05, + "loss": 1.1308, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 0.356160190962461, + "learning_rate": 6.660287081339714e-05, + "loss": 1.2303, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 0.40199466397728084, + "learning_rate": 6.679425837320575e-05, + "loss": 1.1404, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 0.40760457805411343, + "learning_rate": 6.698564593301436e-05, + "loss": 1.1441, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 0.36664650507067836, + "learning_rate": 6.717703349282297e-05, + "loss": 1.0822, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 0.3774008230280487, + "learning_rate": 6.736842105263159e-05, + "loss": 1.1882, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 0.3743284992535933, + "learning_rate": 6.755980861244018e-05, + "loss": 1.1174, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 0.3803668006858151, + "learning_rate": 6.775119617224881e-05, + "loss": 1.2074, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 0.3777040131335034, + "learning_rate": 6.794258373205742e-05, + "loss": 1.0946, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 0.3794465077922223, + "learning_rate": 6.813397129186602e-05, + "loss": 1.0683, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.3794400803152343, + "learning_rate": 6.832535885167465e-05, + "loss": 1.1377, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 0.3246160370634747, + "learning_rate": 6.851674641148326e-05, + "loss": 1.2539, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 0.3663994418682161, + "learning_rate": 6.870813397129187e-05, + "loss": 1.179, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 0.4165644814006045, + "learning_rate": 6.889952153110048e-05, + "loss": 1.2068, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 0.40002794816857074, + "learning_rate": 6.90909090909091e-05, + "loss": 1.236, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 0.36752801689828113, + "learning_rate": 6.928229665071771e-05, + "loss": 1.1806, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 0.4118641376720516, + "learning_rate": 6.947368421052632e-05, + "loss": 1.2099, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 0.342905855911731, + "learning_rate": 6.966507177033493e-05, + "loss": 1.1753, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 0.37601728725049927, + "learning_rate": 6.985645933014354e-05, + "loss": 1.2123, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 0.3996876187572624, + "learning_rate": 7.004784688995216e-05, + "loss": 1.2063, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 0.4995513430917594, + "learning_rate": 7.023923444976077e-05, + "loss": 1.1249, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 0.38011883982869116, + "learning_rate": 7.043062200956938e-05, + "loss": 1.091, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 0.40185997614954744, + "learning_rate": 7.0622009569378e-05, + "loss": 1.2879, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 0.37530412149005404, + "learning_rate": 7.08133971291866e-05, + "loss": 1.2832, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 0.4067556360842582, + "learning_rate": 7.100478468899522e-05, + "loss": 1.0513, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 0.4450195271251879, + "learning_rate": 7.119617224880383e-05, + "loss": 1.1781, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 0.3878038365086055, + "learning_rate": 7.138755980861244e-05, + "loss": 1.2654, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 0.34902250016212555, + "learning_rate": 7.157894736842105e-05, + "loss": 1.0807, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 0.4789371923266489, + "learning_rate": 7.177033492822967e-05, + "loss": 1.0782, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 0.3443621112474482, + "learning_rate": 7.196172248803828e-05, + "loss": 1.0874, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 0.4354953964559702, + "learning_rate": 7.215311004784689e-05, + "loss": 1.1893, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 0.3806981757319183, + "learning_rate": 7.23444976076555e-05, + "loss": 1.1033, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 0.3640708591532273, + "learning_rate": 7.253588516746413e-05, + "loss": 1.1008, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 0.40487824027906655, + "learning_rate": 7.272727272727273e-05, + "loss": 1.2676, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 0.3974099352341619, + "learning_rate": 7.291866028708134e-05, + "loss": 1.1297, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 0.4412922873426402, + "learning_rate": 7.311004784688995e-05, + "loss": 1.2022, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 0.4177424000999219, + "learning_rate": 7.330143540669856e-05, + "loss": 1.0769, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 0.37843676467423115, + "learning_rate": 7.349282296650718e-05, + "loss": 1.1895, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 0.37904784029757954, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0346, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 0.3678229762318383, + "learning_rate": 7.387559808612442e-05, + "loss": 1.1415, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 0.44631509974989175, + "learning_rate": 7.406698564593301e-05, + "loss": 1.1593, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 0.4904633395013365, + "learning_rate": 7.425837320574163e-05, + "loss": 1.1944, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 0.37152475181241584, + "learning_rate": 7.444976076555025e-05, + "loss": 1.1698, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 0.4011204212131284, + "learning_rate": 7.464114832535885e-05, + "loss": 1.205, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 0.36761682195572815, + "learning_rate": 7.483253588516746e-05, + "loss": 1.2973, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 0.40374546939981903, + "learning_rate": 7.502392344497609e-05, + "loss": 1.1507, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 0.4255006771988514, + "learning_rate": 7.52153110047847e-05, + "loss": 1.1891, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 0.3865518214834786, + "learning_rate": 7.54066985645933e-05, + "loss": 1.185, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 0.41827717268535636, + "learning_rate": 7.559808612440191e-05, + "loss": 1.0622, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 0.37110879057589674, + "learning_rate": 7.578947368421054e-05, + "loss": 1.1381, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 0.46618966480421226, + "learning_rate": 7.598086124401914e-05, + "loss": 1.2105, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 0.4266096767917638, + "learning_rate": 7.617224880382775e-05, + "loss": 1.2304, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 0.3951434711007925, + "learning_rate": 7.636363636363637e-05, + "loss": 1.1194, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 0.5249468851294161, + "learning_rate": 7.655502392344497e-05, + "loss": 1.1557, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 0.35849242393679126, + "learning_rate": 7.674641148325359e-05, + "loss": 1.1586, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 0.4163074752217252, + "learning_rate": 7.693779904306221e-05, + "loss": 1.1884, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 0.39359654231287206, + "learning_rate": 7.712918660287082e-05, + "loss": 1.2162, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 0.40661594470681456, + "learning_rate": 7.732057416267942e-05, + "loss": 1.1578, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 0.4060870336417891, + "learning_rate": 7.751196172248805e-05, + "loss": 1.2462, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 0.34776732343372285, + "learning_rate": 7.770334928229666e-05, + "loss": 1.1907, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 0.37012929061225097, + "learning_rate": 7.789473684210526e-05, + "loss": 1.084, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 0.3191687282557576, + "learning_rate": 7.808612440191387e-05, + "loss": 1.1337, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 0.428711696888013, + "learning_rate": 7.82775119617225e-05, + "loss": 1.2142, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 0.5197761387762115, + "learning_rate": 7.846889952153111e-05, + "loss": 1.2915, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 0.41273170978967216, + "learning_rate": 7.866028708133971e-05, + "loss": 1.1028, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 0.3568549043871766, + "learning_rate": 7.885167464114833e-05, + "loss": 1.1575, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 0.43257689825178397, + "learning_rate": 7.904306220095695e-05, + "loss": 1.1115, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 0.4355392516104201, + "learning_rate": 7.923444976076554e-05, + "loss": 1.208, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 0.48359013518775035, + "learning_rate": 7.942583732057417e-05, + "loss": 1.1864, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 0.3730295702560985, + "learning_rate": 7.961722488038278e-05, + "loss": 1.1261, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 0.3966685187279031, + "learning_rate": 7.98086124401914e-05, + "loss": 1.2786, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 0.3786523559458125, + "learning_rate": 8e-05, + "loss": 1.1127, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 0.4039420784784596, + "learning_rate": 8.019138755980862e-05, + "loss": 1.1653, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 0.3836563304661635, + "learning_rate": 8.038277511961723e-05, + "loss": 1.1504, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 0.3915882211031514, + "learning_rate": 8.057416267942584e-05, + "loss": 1.1881, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 0.3585833381861096, + "learning_rate": 8.076555023923446e-05, + "loss": 1.1215, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 0.34093033582588217, + "learning_rate": 8.095693779904307e-05, + "loss": 1.1502, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 0.35015224800454925, + "learning_rate": 8.114832535885168e-05, + "loss": 1.1377, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 0.36225302300603957, + "learning_rate": 8.133971291866029e-05, + "loss": 1.146, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 0.401290151391233, + "learning_rate": 8.15311004784689e-05, + "loss": 1.1257, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 0.42354295033157774, + "learning_rate": 8.172248803827752e-05, + "loss": 1.1861, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 0.47531433461725525, + "learning_rate": 8.191387559808613e-05, + "loss": 1.2387, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 0.3956478442168383, + "learning_rate": 8.210526315789474e-05, + "loss": 1.2046, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 0.40300702532566746, + "learning_rate": 8.229665071770335e-05, + "loss": 1.2882, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 0.4247965626832039, + "learning_rate": 8.248803827751197e-05, + "loss": 1.2151, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 0.3667653514802216, + "learning_rate": 8.267942583732058e-05, + "loss": 1.2397, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 0.368289021497025, + "learning_rate": 8.287081339712919e-05, + "loss": 1.1336, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 0.35548517154282255, + "learning_rate": 8.30622009569378e-05, + "loss": 1.0854, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 0.43281064565162874, + "learning_rate": 8.325358851674641e-05, + "loss": 1.1418, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 0.36804478901968624, + "learning_rate": 8.344497607655503e-05, + "loss": 1.1754, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 0.3680855445618144, + "learning_rate": 8.363636363636364e-05, + "loss": 1.1909, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 0.38659927520583764, + "learning_rate": 8.382775119617225e-05, + "loss": 1.2076, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 0.341767677446713, + "learning_rate": 8.401913875598086e-05, + "loss": 1.0057, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 0.4256342728148254, + "learning_rate": 8.421052631578948e-05, + "loss": 1.2104, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 0.37216536384041526, + "learning_rate": 8.440191387559809e-05, + "loss": 1.1144, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 0.3791754076001631, + "learning_rate": 8.45933014354067e-05, + "loss": 1.2319, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 0.42421700255160355, + "learning_rate": 8.478468899521531e-05, + "loss": 1.1934, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 0.3678445305043515, + "learning_rate": 8.497607655502394e-05, + "loss": 1.1886, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 0.4219847133147515, + "learning_rate": 8.516746411483254e-05, + "loss": 1.1748, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 0.33966219534041503, + "learning_rate": 8.535885167464115e-05, + "loss": 1.1416, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 0.3426950270667347, + "learning_rate": 8.555023923444977e-05, + "loss": 1.2117, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 0.31268235060757094, + "learning_rate": 8.574162679425837e-05, + "loss": 1.1099, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 0.4283403832969395, + "learning_rate": 8.593301435406699e-05, + "loss": 1.1204, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 0.360981174023238, + "learning_rate": 8.612440191387561e-05, + "loss": 1.1203, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 0.31441146897954253, + "learning_rate": 8.631578947368421e-05, + "loss": 1.1703, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 0.6665116249955306, + "learning_rate": 8.650717703349282e-05, + "loss": 1.1965, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 0.34282354864583103, + "learning_rate": 8.669856459330143e-05, + "loss": 1.1108, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 0.3353048313959663, + "learning_rate": 8.688995215311006e-05, + "loss": 1.0893, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 0.3851891541705526, + "learning_rate": 8.708133971291866e-05, + "loss": 1.065, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 0.40582359225585646, + "learning_rate": 8.727272727272727e-05, + "loss": 1.1551, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 0.3310055383140587, + "learning_rate": 8.74641148325359e-05, + "loss": 1.1374, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 0.3697503368052755, + "learning_rate": 8.76555023923445e-05, + "loss": 1.0852, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 0.3494933513359935, + "learning_rate": 8.784688995215311e-05, + "loss": 1.1092, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 0.359799849220072, + "learning_rate": 8.803827751196173e-05, + "loss": 1.0291, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 0.3692901471865079, + "learning_rate": 8.822966507177035e-05, + "loss": 1.1516, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 0.31771285202360866, + "learning_rate": 8.842105263157894e-05, + "loss": 1.0414, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 0.39382351019262535, + "learning_rate": 8.861244019138757e-05, + "loss": 1.1987, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 0.37948154502600623, + "learning_rate": 8.880382775119618e-05, + "loss": 1.0162, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 0.4454086610740419, + "learning_rate": 8.899521531100478e-05, + "loss": 1.1686, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 0.3992597027171503, + "learning_rate": 8.91866028708134e-05, + "loss": 1.1067, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 0.402787251335542, + "learning_rate": 8.937799043062202e-05, + "loss": 1.1733, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 0.3772514735030477, + "learning_rate": 8.956937799043063e-05, + "loss": 1.1926, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 0.38379331799775773, + "learning_rate": 8.976076555023923e-05, + "loss": 1.2207, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.37849416076396786, + "learning_rate": 8.995215311004786e-05, + "loss": 1.158, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 0.40932072449271345, + "learning_rate": 9.014354066985647e-05, + "loss": 1.124, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 0.34313554427354404, + "learning_rate": 9.033492822966507e-05, + "loss": 1.2141, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 0.2960457574671995, + "learning_rate": 9.052631578947369e-05, + "loss": 1.1269, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 0.3364724543703671, + "learning_rate": 9.07177033492823e-05, + "loss": 1.0963, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 0.4102740455894671, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9836, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 0.3576390479541009, + "learning_rate": 9.110047846889953e-05, + "loss": 1.088, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 0.4063740081724684, + "learning_rate": 9.129186602870814e-05, + "loss": 1.0916, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 0.3866688811135483, + "learning_rate": 9.148325358851675e-05, + "loss": 1.1582, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 0.35233155736688976, + "learning_rate": 9.167464114832537e-05, + "loss": 1.2166, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 0.3523742613417453, + "learning_rate": 9.186602870813398e-05, + "loss": 1.1492, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 0.4091175614367992, + "learning_rate": 9.205741626794259e-05, + "loss": 1.2015, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 0.40461705008021104, + "learning_rate": 9.224880382775119e-05, + "loss": 1.1021, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 0.3555229450892527, + "learning_rate": 9.244019138755981e-05, + "loss": 1.1677, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 0.39219146946985217, + "learning_rate": 9.263157894736843e-05, + "loss": 1.1448, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 0.3959881230046531, + "learning_rate": 9.282296650717704e-05, + "loss": 1.1031, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 0.37437224905160343, + "learning_rate": 9.301435406698565e-05, + "loss": 1.2172, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 0.34627056702655357, + "learning_rate": 9.320574162679426e-05, + "loss": 1.0524, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 0.34958866976468206, + "learning_rate": 9.339712918660288e-05, + "loss": 1.1962, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 0.3592293846050586, + "learning_rate": 9.358851674641149e-05, + "loss": 1.1292, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 0.4512710733697596, + "learning_rate": 9.37799043062201e-05, + "loss": 1.1657, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 0.3630522232972779, + "learning_rate": 9.397129186602871e-05, + "loss": 1.1852, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 0.37647580929450847, + "learning_rate": 9.416267942583733e-05, + "loss": 1.1968, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 0.4947723206359184, + "learning_rate": 9.435406698564594e-05, + "loss": 1.1535, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 0.384481476371926, + "learning_rate": 9.454545454545455e-05, + "loss": 1.0762, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 0.30759408641071373, + "learning_rate": 9.473684210526316e-05, + "loss": 1.2025, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 0.39262382714536653, + "learning_rate": 9.492822966507177e-05, + "loss": 1.1072, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 0.3707577579487458, + "learning_rate": 9.511961722488039e-05, + "loss": 1.201, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 0.37259701318014815, + "learning_rate": 9.5311004784689e-05, + "loss": 1.1808, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 0.3899497483331902, + "learning_rate": 9.550239234449761e-05, + "loss": 1.155, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 0.3056736695030619, + "learning_rate": 9.569377990430622e-05, + "loss": 1.1485, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 0.3771828447592661, + "learning_rate": 9.588516746411484e-05, + "loss": 1.1475, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 0.3512506645855738, + "learning_rate": 9.607655502392345e-05, + "loss": 1.137, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 0.44787972025939427, + "learning_rate": 9.626794258373206e-05, + "loss": 1.1137, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 0.43899468578920847, + "learning_rate": 9.645933014354067e-05, + "loss": 1.198, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 0.37520028378057024, + "learning_rate": 9.66507177033493e-05, + "loss": 1.12, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 0.44437839309325783, + "learning_rate": 9.68421052631579e-05, + "loss": 1.1266, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 0.3713684887370264, + "learning_rate": 9.703349282296651e-05, + "loss": 1.0703, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 0.3785759236620265, + "learning_rate": 9.722488038277513e-05, + "loss": 1.1825, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 0.3949986518868616, + "learning_rate": 9.741626794258373e-05, + "loss": 1.205, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 0.32820866590924014, + "learning_rate": 9.760765550239235e-05, + "loss": 1.0473, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 0.3438055422000543, + "learning_rate": 9.779904306220096e-05, + "loss": 1.2331, + "step": 511 + }, + { + "epoch": 0.05, + "grad_norm": 0.4782013192332416, + "learning_rate": 9.799043062200958e-05, + "loss": 1.0641, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 0.37127997208806374, + "learning_rate": 9.818181818181818e-05, + "loss": 1.1557, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 0.37810255782547136, + "learning_rate": 9.83732057416268e-05, + "loss": 1.1545, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 0.4105466030008024, + "learning_rate": 9.856459330143542e-05, + "loss": 1.2272, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 0.4517743884986355, + "learning_rate": 9.875598086124402e-05, + "loss": 1.1229, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 0.399218104135688, + "learning_rate": 9.894736842105263e-05, + "loss": 1.1467, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 0.3693460754043359, + "learning_rate": 9.913875598086126e-05, + "loss": 1.169, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 0.3446333300820591, + "learning_rate": 9.933014354066987e-05, + "loss": 1.0783, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 0.41719266096581403, + "learning_rate": 9.952153110047847e-05, + "loss": 1.211, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 0.4139824733210239, + "learning_rate": 9.97129186602871e-05, + "loss": 1.2271, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 0.3284583647624778, + "learning_rate": 9.99043062200957e-05, + "loss": 1.1363, + "step": 522 + }, + { + "epoch": 0.05, + "grad_norm": 0.3743628377274405, + "learning_rate": 0.0001000956937799043, + "loss": 1.1254, + "step": 523 + }, + { + "epoch": 0.05, + "grad_norm": 0.4187480747840408, + "learning_rate": 0.00010028708133971292, + "loss": 1.1542, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 0.35701214246846424, + "learning_rate": 0.00010047846889952153, + "loss": 1.1318, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 0.39966360139311247, + "learning_rate": 0.00010066985645933015, + "loss": 1.0679, + "step": 526 + }, + { + "epoch": 0.05, + "grad_norm": 0.42408986083219885, + "learning_rate": 0.00010086124401913877, + "loss": 1.086, + "step": 527 + }, + { + "epoch": 0.05, + "grad_norm": 0.33931985700987544, + "learning_rate": 0.00010105263157894738, + "loss": 1.1684, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 0.33040989829144524, + "learning_rate": 0.00010124401913875599, + "loss": 1.1301, + "step": 529 + }, + { + "epoch": 0.05, + "grad_norm": 0.3404311052002135, + "learning_rate": 0.00010143540669856459, + "loss": 1.0848, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 0.34611637641008364, + "learning_rate": 0.0001016267942583732, + "loss": 1.1635, + "step": 531 + }, + { + "epoch": 0.05, + "grad_norm": 0.3672822694004909, + "learning_rate": 0.00010181818181818181, + "loss": 1.0811, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 0.34761252015363225, + "learning_rate": 0.00010200956937799044, + "loss": 1.1502, + "step": 533 + }, + { + "epoch": 0.05, + "grad_norm": 0.27503858553543464, + "learning_rate": 0.00010220095693779905, + "loss": 1.1257, + "step": 534 + }, + { + "epoch": 0.05, + "grad_norm": 0.3368407495501332, + "learning_rate": 0.00010239234449760766, + "loss": 1.1195, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 0.3448997570516004, + "learning_rate": 0.00010258373205741628, + "loss": 1.1955, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 0.4366845539188124, + "learning_rate": 0.00010277511961722488, + "loss": 1.1175, + "step": 537 + }, + { + "epoch": 0.05, + "grad_norm": 0.35681716286224935, + "learning_rate": 0.00010296650717703349, + "loss": 1.1577, + "step": 538 + }, + { + "epoch": 0.05, + "grad_norm": 0.3359937686441125, + "learning_rate": 0.00010315789473684211, + "loss": 1.1318, + "step": 539 + }, + { + "epoch": 0.05, + "grad_norm": 0.3398927803425864, + "learning_rate": 0.00010334928229665073, + "loss": 1.1278, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 0.38532827109393014, + "learning_rate": 0.00010354066985645934, + "loss": 1.1273, + "step": 541 + }, + { + "epoch": 0.05, + "grad_norm": 0.3740363062511281, + "learning_rate": 0.00010373205741626795, + "loss": 1.0253, + "step": 542 + }, + { + "epoch": 0.05, + "grad_norm": 0.34342518179762227, + "learning_rate": 0.00010392344497607656, + "loss": 1.1462, + "step": 543 + }, + { + "epoch": 0.05, + "grad_norm": 0.3590449087862375, + "learning_rate": 0.00010411483253588516, + "loss": 1.1392, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 0.37655451333728496, + "learning_rate": 0.00010430622009569377, + "loss": 1.1218, + "step": 545 + }, + { + "epoch": 0.05, + "grad_norm": 0.3336387102623628, + "learning_rate": 0.0001044976076555024, + "loss": 1.2268, + "step": 546 + }, + { + "epoch": 0.05, + "grad_norm": 0.38550698369605846, + "learning_rate": 0.00010468899521531101, + "loss": 1.1752, + "step": 547 + }, + { + "epoch": 0.05, + "grad_norm": 0.39228288833022, + "learning_rate": 0.00010488038277511962, + "loss": 1.2562, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 0.34746795623865406, + "learning_rate": 0.00010507177033492824, + "loss": 1.2464, + "step": 549 + }, + { + "epoch": 0.05, + "grad_norm": 0.350081344824654, + "learning_rate": 0.00010526315789473685, + "loss": 1.0909, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 0.35258812822080665, + "learning_rate": 0.00010545454545454545, + "loss": 1.1647, + "step": 551 + }, + { + "epoch": 0.05, + "grad_norm": 0.3894512796451852, + "learning_rate": 0.00010564593301435409, + "loss": 1.0903, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 0.3560681821041678, + "learning_rate": 0.00010583732057416268, + "loss": 1.0612, + "step": 553 + }, + { + "epoch": 0.05, + "grad_norm": 0.37166266505359585, + "learning_rate": 0.0001060287081339713, + "loss": 1.2101, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 0.36530860942069354, + "learning_rate": 0.00010622009569377991, + "loss": 1.044, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 0.33955376662887854, + "learning_rate": 0.00010641148325358852, + "loss": 1.093, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 0.31114780875878933, + "learning_rate": 0.00010660287081339712, + "loss": 1.146, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 0.3676757652032558, + "learning_rate": 0.00010679425837320576, + "loss": 1.0386, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 0.32774291097340136, + "learning_rate": 0.00010698564593301437, + "loss": 1.0935, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 0.33490784632156506, + "learning_rate": 0.00010717703349282297, + "loss": 1.1692, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 0.3502773499002359, + "learning_rate": 0.00010736842105263158, + "loss": 1.1153, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 0.36351558175076165, + "learning_rate": 0.0001075598086124402, + "loss": 1.0359, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 0.32317295912626365, + "learning_rate": 0.00010775119617224881, + "loss": 1.0597, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 0.37457465032706805, + "learning_rate": 0.0001079425837320574, + "loss": 1.2346, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 0.3230784156429463, + "learning_rate": 0.00010813397129186604, + "loss": 1.0292, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 0.29938140614694264, + "learning_rate": 0.00010832535885167466, + "loss": 1.068, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 0.37830783204331137, + "learning_rate": 0.00010851674641148326, + "loss": 1.1449, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 0.36419764707025626, + "learning_rate": 0.00010870813397129187, + "loss": 1.0981, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 0.3748154030309419, + "learning_rate": 0.00010889952153110048, + "loss": 1.2252, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 0.3368617151990764, + "learning_rate": 0.00010909090909090909, + "loss": 1.2124, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 0.37988770907164504, + "learning_rate": 0.00010928229665071772, + "loss": 1.2043, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 0.31449121687746323, + "learning_rate": 0.00010947368421052633, + "loss": 1.1775, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 0.3568124415520435, + "learning_rate": 0.00010966507177033494, + "loss": 1.1014, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 0.3673211031508777, + "learning_rate": 0.00010985645933014354, + "loss": 1.2182, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 0.34202187757429514, + "learning_rate": 0.00011004784688995215, + "loss": 1.0369, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 0.3905415136176411, + "learning_rate": 0.00011023923444976077, + "loss": 1.1507, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 0.3468401555240628, + "learning_rate": 0.00011043062200956938, + "loss": 1.1642, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 0.4192072995004005, + "learning_rate": 0.000110622009569378, + "loss": 1.164, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 0.379758035778582, + "learning_rate": 0.00011081339712918662, + "loss": 1.1536, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 0.41419578240684846, + "learning_rate": 0.00011100478468899523, + "loss": 1.0945, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 0.3250523681546178, + "learning_rate": 0.00011119617224880383, + "loss": 1.099, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 0.38315025894656307, + "learning_rate": 0.00011138755980861244, + "loss": 1.1642, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 0.30382004794249146, + "learning_rate": 0.00011157894736842105, + "loss": 1.252, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 0.36873995920664776, + "learning_rate": 0.00011177033492822968, + "loss": 1.1613, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 0.40209606862367175, + "learning_rate": 0.00011196172248803829, + "loss": 1.2187, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 0.39972092223451644, + "learning_rate": 0.0001121531100478469, + "loss": 1.1339, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 0.3581792841100828, + "learning_rate": 0.00011234449760765551, + "loss": 1.1467, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 0.33476559121109767, + "learning_rate": 0.00011253588516746411, + "loss": 1.1809, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 0.3450568464002908, + "learning_rate": 0.00011272727272727272, + "loss": 1.1955, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 0.30713056981495374, + "learning_rate": 0.00011291866028708134, + "loss": 1.153, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 0.40056590609091713, + "learning_rate": 0.00011311004784688996, + "loss": 1.0824, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 0.32202083458180564, + "learning_rate": 0.00011330143540669858, + "loss": 1.1039, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 0.28281909786135145, + "learning_rate": 0.00011349282296650719, + "loss": 1.1722, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 0.2930441788185507, + "learning_rate": 0.0001136842105263158, + "loss": 1.1902, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 0.3491373061731604, + "learning_rate": 0.0001138755980861244, + "loss": 1.1247, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 0.3110622779886572, + "learning_rate": 0.00011406698564593301, + "loss": 1.2289, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 0.34519065720613423, + "learning_rate": 0.00011425837320574164, + "loss": 1.1169, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 0.3066625621843041, + "learning_rate": 0.00011444976076555025, + "loss": 1.1645, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 0.32116731229953854, + "learning_rate": 0.00011464114832535886, + "loss": 1.0933, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 0.3511568531959789, + "learning_rate": 0.00011483253588516747, + "loss": 1.1087, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 0.32112239871920967, + "learning_rate": 0.00011502392344497607, + "loss": 1.1406, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 0.39367325401303266, + "learning_rate": 0.00011521531100478468, + "loss": 1.1545, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 0.3392107735520774, + "learning_rate": 0.0001154066985645933, + "loss": 1.1566, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 0.35936783606471423, + "learning_rate": 0.00011559808612440192, + "loss": 1.1135, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 0.3453223570806925, + "learning_rate": 0.00011578947368421053, + "loss": 1.1679, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 0.3988207022091826, + "learning_rate": 0.00011598086124401915, + "loss": 1.1266, + "step": 606 + }, + { + "epoch": 0.06, + "grad_norm": 0.35616581701014133, + "learning_rate": 0.00011617224880382776, + "loss": 1.0747, + "step": 607 + }, + { + "epoch": 0.06, + "grad_norm": 0.34856430848542924, + "learning_rate": 0.00011636363636363636, + "loss": 1.1737, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 0.39749502570874873, + "learning_rate": 0.00011655502392344497, + "loss": 1.1367, + "step": 609 + }, + { + "epoch": 0.06, + "grad_norm": 0.3817892480214725, + "learning_rate": 0.00011674641148325361, + "loss": 1.1423, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 0.37169774084550616, + "learning_rate": 0.00011693779904306221, + "loss": 1.2363, + "step": 611 + }, + { + "epoch": 0.06, + "grad_norm": 0.36680842275104286, + "learning_rate": 0.00011712918660287082, + "loss": 1.1137, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 0.30862259202802894, + "learning_rate": 0.00011732057416267943, + "loss": 1.1156, + "step": 613 + }, + { + "epoch": 0.06, + "grad_norm": 0.3381253043590406, + "learning_rate": 0.00011751196172248804, + "loss": 1.1913, + "step": 614 + }, + { + "epoch": 0.06, + "grad_norm": 0.35640646013161875, + "learning_rate": 0.00011770334928229664, + "loss": 1.2953, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 0.3180351478003401, + "learning_rate": 0.00011789473684210525, + "loss": 1.1151, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 0.2838039245590443, + "learning_rate": 0.0001180861244019139, + "loss": 1.1178, + "step": 617 + }, + { + "epoch": 0.06, + "grad_norm": 0.32734113884885613, + "learning_rate": 0.00011827751196172249, + "loss": 1.1095, + "step": 618 + }, + { + "epoch": 0.06, + "grad_norm": 0.4976044214884747, + "learning_rate": 0.0001184688995215311, + "loss": 1.1425, + "step": 619 + }, + { + "epoch": 0.06, + "grad_norm": 0.32042584160221055, + "learning_rate": 0.00011866028708133972, + "loss": 1.0799, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 0.3258958466495425, + "learning_rate": 0.00011885167464114833, + "loss": 1.221, + "step": 621 + }, + { + "epoch": 0.06, + "grad_norm": 0.3112060174372619, + "learning_rate": 0.00011904306220095693, + "loss": 1.2115, + "step": 622 + }, + { + "epoch": 0.06, + "grad_norm": 0.30118176636144206, + "learning_rate": 0.00011923444976076557, + "loss": 1.1361, + "step": 623 + }, + { + "epoch": 0.06, + "grad_norm": 0.3361051300196263, + "learning_rate": 0.00011942583732057418, + "loss": 1.191, + "step": 624 + }, + { + "epoch": 0.06, + "grad_norm": 0.2931267938868614, + "learning_rate": 0.00011961722488038278, + "loss": 1.0851, + "step": 625 + }, + { + "epoch": 0.06, + "grad_norm": 0.32113885088697364, + "learning_rate": 0.00011980861244019139, + "loss": 1.1867, + "step": 626 + }, + { + "epoch": 0.06, + "grad_norm": 0.34101527118946584, + "learning_rate": 0.00012, + "loss": 1.1836, + "step": 627 + }, + { + "epoch": 0.06, + "grad_norm": 0.27834089475655605, + "learning_rate": 0.00012019138755980862, + "loss": 1.2618, + "step": 628 + }, + { + "epoch": 0.06, + "grad_norm": 0.3689617429853991, + "learning_rate": 0.00012038277511961724, + "loss": 1.1289, + "step": 629 + }, + { + "epoch": 0.06, + "grad_norm": 0.32141982153995574, + "learning_rate": 0.00012057416267942585, + "loss": 1.245, + "step": 630 + }, + { + "epoch": 0.06, + "grad_norm": 0.3577846092529067, + "learning_rate": 0.00012076555023923447, + "loss": 1.1214, + "step": 631 + }, + { + "epoch": 0.06, + "grad_norm": 0.3547093306453314, + "learning_rate": 0.00012095693779904306, + "loss": 1.1151, + "step": 632 + }, + { + "epoch": 0.06, + "grad_norm": 0.3235863986240357, + "learning_rate": 0.00012114832535885168, + "loss": 1.1431, + "step": 633 + }, + { + "epoch": 0.06, + "grad_norm": 0.3315145079366769, + "learning_rate": 0.00012133971291866029, + "loss": 1.1589, + "step": 634 + }, + { + "epoch": 0.06, + "grad_norm": 0.44616643435072917, + "learning_rate": 0.0001215311004784689, + "loss": 1.1899, + "step": 635 + }, + { + "epoch": 0.06, + "grad_norm": 0.34695172958733533, + "learning_rate": 0.00012172248803827753, + "loss": 1.2423, + "step": 636 + }, + { + "epoch": 0.06, + "grad_norm": 0.32832699890038897, + "learning_rate": 0.00012191387559808614, + "loss": 1.0669, + "step": 637 + }, + { + "epoch": 0.06, + "grad_norm": 0.34250522043183074, + "learning_rate": 0.00012210526315789474, + "loss": 1.1031, + "step": 638 + }, + { + "epoch": 0.06, + "grad_norm": 0.3536036436534466, + "learning_rate": 0.00012229665071770336, + "loss": 1.1833, + "step": 639 + }, + { + "epoch": 0.06, + "grad_norm": 0.38952860754869895, + "learning_rate": 0.00012248803827751196, + "loss": 1.2269, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 0.38372067281414196, + "learning_rate": 0.00012267942583732056, + "loss": 1.1696, + "step": 641 + }, + { + "epoch": 0.06, + "grad_norm": 0.36604610363956575, + "learning_rate": 0.00012287081339712921, + "loss": 1.143, + "step": 642 + }, + { + "epoch": 0.06, + "grad_norm": 0.29457638902628325, + "learning_rate": 0.0001230622009569378, + "loss": 1.1118, + "step": 643 + }, + { + "epoch": 0.06, + "grad_norm": 0.31971053516113995, + "learning_rate": 0.0001232535885167464, + "loss": 1.2257, + "step": 644 + }, + { + "epoch": 0.06, + "grad_norm": 0.3479548829091419, + "learning_rate": 0.00012344497607655504, + "loss": 1.2003, + "step": 645 + }, + { + "epoch": 0.06, + "grad_norm": 0.39895729607686864, + "learning_rate": 0.00012363636363636364, + "loss": 1.2362, + "step": 646 + }, + { + "epoch": 0.06, + "grad_norm": 0.32542242184693576, + "learning_rate": 0.00012382775119617226, + "loss": 1.1924, + "step": 647 + }, + { + "epoch": 0.06, + "grad_norm": 0.37946269414290873, + "learning_rate": 0.00012401913875598086, + "loss": 1.2259, + "step": 648 + }, + { + "epoch": 0.06, + "grad_norm": 0.37785595963877666, + "learning_rate": 0.00012421052631578949, + "loss": 1.252, + "step": 649 + }, + { + "epoch": 0.06, + "grad_norm": 0.39908004119966145, + "learning_rate": 0.00012440191387559808, + "loss": 1.1444, + "step": 650 + }, + { + "epoch": 0.06, + "grad_norm": 0.38865614676807153, + "learning_rate": 0.0001245933014354067, + "loss": 1.2314, + "step": 651 + }, + { + "epoch": 0.06, + "grad_norm": 0.3204842742106689, + "learning_rate": 0.0001247846889952153, + "loss": 1.0824, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 0.35369352498295387, + "learning_rate": 0.00012497607655502393, + "loss": 1.0264, + "step": 653 + }, + { + "epoch": 0.06, + "grad_norm": 0.3305618992525529, + "learning_rate": 0.00012516746411483253, + "loss": 1.1012, + "step": 654 + }, + { + "epoch": 0.06, + "grad_norm": 0.3757616845139893, + "learning_rate": 0.00012535885167464116, + "loss": 1.2916, + "step": 655 + }, + { + "epoch": 0.06, + "grad_norm": 0.32567276622705355, + "learning_rate": 0.00012555023923444978, + "loss": 1.218, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 0.30320222866051544, + "learning_rate": 0.00012574162679425838, + "loss": 1.0551, + "step": 657 + }, + { + "epoch": 0.06, + "grad_norm": 0.47041450898052456, + "learning_rate": 0.00012593301435406698, + "loss": 1.2101, + "step": 658 + }, + { + "epoch": 0.06, + "grad_norm": 0.3265512828583142, + "learning_rate": 0.0001261244019138756, + "loss": 1.0902, + "step": 659 + }, + { + "epoch": 0.06, + "grad_norm": 0.3095200661644063, + "learning_rate": 0.0001263157894736842, + "loss": 1.2483, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 0.39865694866961127, + "learning_rate": 0.0001265071770334928, + "loss": 1.0507, + "step": 661 + }, + { + "epoch": 0.06, + "grad_norm": 0.35606148522081404, + "learning_rate": 0.00012669856459330146, + "loss": 1.0753, + "step": 662 + }, + { + "epoch": 0.06, + "grad_norm": 0.3445720816931114, + "learning_rate": 0.00012688995215311006, + "loss": 1.143, + "step": 663 + }, + { + "epoch": 0.06, + "grad_norm": 0.3677160012348687, + "learning_rate": 0.00012708133971291866, + "loss": 1.0608, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 0.2980120877326159, + "learning_rate": 0.00012727272727272728, + "loss": 1.0872, + "step": 665 + }, + { + "epoch": 0.06, + "grad_norm": 0.2896118505469009, + "learning_rate": 0.00012746411483253588, + "loss": 1.1485, + "step": 666 + }, + { + "epoch": 0.06, + "grad_norm": 0.3350125319603418, + "learning_rate": 0.0001276555023923445, + "loss": 1.1236, + "step": 667 + }, + { + "epoch": 0.06, + "grad_norm": 0.3517268797460554, + "learning_rate": 0.00012784688995215313, + "loss": 1.1378, + "step": 668 + }, + { + "epoch": 0.06, + "grad_norm": 0.4220707921759215, + "learning_rate": 0.00012803827751196173, + "loss": 1.1656, + "step": 669 + }, + { + "epoch": 0.06, + "grad_norm": 0.3098050517214006, + "learning_rate": 0.00012822966507177036, + "loss": 1.0732, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 0.38036416406983276, + "learning_rate": 0.00012842105263157895, + "loss": 1.1597, + "step": 671 + }, + { + "epoch": 0.06, + "grad_norm": 0.32201151129472433, + "learning_rate": 0.00012861244019138755, + "loss": 1.1557, + "step": 672 + }, + { + "epoch": 0.06, + "grad_norm": 0.3477368553208273, + "learning_rate": 0.00012880382775119618, + "loss": 1.093, + "step": 673 + }, + { + "epoch": 0.06, + "grad_norm": 0.33206153473346633, + "learning_rate": 0.00012899521531100478, + "loss": 1.0872, + "step": 674 + }, + { + "epoch": 0.06, + "grad_norm": 0.3797973671348287, + "learning_rate": 0.0001291866028708134, + "loss": 1.1932, + "step": 675 + }, + { + "epoch": 0.06, + "grad_norm": 0.38021465107794655, + "learning_rate": 0.00012937799043062203, + "loss": 1.2037, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 0.40680529142131094, + "learning_rate": 0.00012956937799043063, + "loss": 1.13, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 0.3662653154346482, + "learning_rate": 0.00012976076555023923, + "loss": 1.137, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 0.389523749301837, + "learning_rate": 0.00012995215311004785, + "loss": 1.1714, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 0.33672031522727297, + "learning_rate": 0.00013014354066985645, + "loss": 1.08, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 0.3259489924855725, + "learning_rate": 0.00013033492822966508, + "loss": 1.1592, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 0.4310205006695421, + "learning_rate": 0.0001305263157894737, + "loss": 1.1125, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 0.3354208256542673, + "learning_rate": 0.0001307177033492823, + "loss": 1.1612, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 0.33633246177327786, + "learning_rate": 0.00013090909090909093, + "loss": 1.1075, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 0.31028161222188255, + "learning_rate": 0.00013110047846889953, + "loss": 1.1152, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 0.4524180110599837, + "learning_rate": 0.00013129186602870812, + "loss": 1.1259, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 0.3742481866033862, + "learning_rate": 0.00013148325358851675, + "loss": 1.2494, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 0.3619012526518613, + "learning_rate": 0.00013167464114832538, + "loss": 1.101, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 0.32394020898287806, + "learning_rate": 0.00013186602870813397, + "loss": 1.1392, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 0.33391715304609637, + "learning_rate": 0.0001320574162679426, + "loss": 1.1594, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 0.28785180948649514, + "learning_rate": 0.0001322488038277512, + "loss": 1.0496, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 0.38088138786466363, + "learning_rate": 0.0001324401913875598, + "loss": 1.065, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 0.3208927040153503, + "learning_rate": 0.00013263157894736842, + "loss": 1.1636, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 0.3033641718971694, + "learning_rate": 0.00013282296650717705, + "loss": 1.1452, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 0.2948469058966827, + "learning_rate": 0.00013301435406698565, + "loss": 1.1589, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 0.30477569159510964, + "learning_rate": 0.00013320574162679427, + "loss": 1.181, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 0.3410300113129024, + "learning_rate": 0.00013339712918660287, + "loss": 1.1628, + "step": 697 + }, + { + "epoch": 0.07, + "grad_norm": 0.3058561315902832, + "learning_rate": 0.0001335885167464115, + "loss": 1.084, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 0.34051994364928995, + "learning_rate": 0.0001337799043062201, + "loss": 1.0576, + "step": 699 + }, + { + "epoch": 0.07, + "grad_norm": 0.3096222099295184, + "learning_rate": 0.00013397129186602872, + "loss": 1.0741, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 0.28113955153682396, + "learning_rate": 0.00013416267942583732, + "loss": 1.1019, + "step": 701 + }, + { + "epoch": 0.07, + "grad_norm": 0.2956254577619277, + "learning_rate": 0.00013435406698564595, + "loss": 1.1044, + "step": 702 + }, + { + "epoch": 0.07, + "grad_norm": 0.31157316700478505, + "learning_rate": 0.00013454545454545455, + "loss": 1.1694, + "step": 703 + }, + { + "epoch": 0.07, + "grad_norm": 0.3072975606896904, + "learning_rate": 0.00013473684210526317, + "loss": 1.1256, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 0.35422315692846823, + "learning_rate": 0.00013492822966507177, + "loss": 1.202, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 0.3039926886966394, + "learning_rate": 0.00013511961722488037, + "loss": 1.1784, + "step": 706 + }, + { + "epoch": 0.07, + "grad_norm": 0.272337523028655, + "learning_rate": 0.00013531100478468902, + "loss": 1.126, + "step": 707 + }, + { + "epoch": 0.07, + "grad_norm": 0.3396634306821353, + "learning_rate": 0.00013550239234449762, + "loss": 1.1249, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 0.32221861659032364, + "learning_rate": 0.00013569377990430622, + "loss": 1.1312, + "step": 709 + }, + { + "epoch": 0.07, + "grad_norm": 0.30678629631733856, + "learning_rate": 0.00013588516746411485, + "loss": 1.1462, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 0.331787225449244, + "learning_rate": 0.00013607655502392344, + "loss": 1.1713, + "step": 711 + }, + { + "epoch": 0.07, + "grad_norm": 0.2617883116352453, + "learning_rate": 0.00013626794258373204, + "loss": 1.1893, + "step": 712 + }, + { + "epoch": 0.07, + "grad_norm": 0.34346153319515627, + "learning_rate": 0.0001364593301435407, + "loss": 1.1891, + "step": 713 + }, + { + "epoch": 0.07, + "grad_norm": 0.3606280112508664, + "learning_rate": 0.0001366507177033493, + "loss": 1.3581, + "step": 714 + }, + { + "epoch": 0.07, + "grad_norm": 0.30602332471532506, + "learning_rate": 0.0001368421052631579, + "loss": 1.2075, + "step": 715 + }, + { + "epoch": 0.07, + "grad_norm": 0.2966643308304905, + "learning_rate": 0.00013703349282296652, + "loss": 1.1248, + "step": 716 + }, + { + "epoch": 0.07, + "grad_norm": 0.3692677324708085, + "learning_rate": 0.00013722488038277512, + "loss": 1.1325, + "step": 717 + }, + { + "epoch": 0.07, + "grad_norm": 0.3153436398786279, + "learning_rate": 0.00013741626794258374, + "loss": 1.141, + "step": 718 + }, + { + "epoch": 0.07, + "grad_norm": 0.2993620796785782, + "learning_rate": 0.00013760765550239234, + "loss": 1.151, + "step": 719 + }, + { + "epoch": 0.07, + "grad_norm": 0.360199053324579, + "learning_rate": 0.00013779904306220097, + "loss": 1.1671, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 0.34616040756962774, + "learning_rate": 0.0001379904306220096, + "loss": 1.1314, + "step": 721 + }, + { + "epoch": 0.07, + "grad_norm": 0.32093543405134595, + "learning_rate": 0.0001381818181818182, + "loss": 1.1017, + "step": 722 + }, + { + "epoch": 0.07, + "grad_norm": 0.3072115942434, + "learning_rate": 0.0001383732057416268, + "loss": 1.2032, + "step": 723 + }, + { + "epoch": 0.07, + "grad_norm": 0.33680085828062, + "learning_rate": 0.00013856459330143542, + "loss": 1.191, + "step": 724 + }, + { + "epoch": 0.07, + "grad_norm": 0.27852291513017413, + "learning_rate": 0.00013875598086124402, + "loss": 1.2035, + "step": 725 + }, + { + "epoch": 0.07, + "grad_norm": 0.3269080941652961, + "learning_rate": 0.00013894736842105264, + "loss": 1.1417, + "step": 726 + }, + { + "epoch": 0.07, + "grad_norm": 0.2911551586198448, + "learning_rate": 0.00013913875598086127, + "loss": 1.2055, + "step": 727 + }, + { + "epoch": 0.07, + "grad_norm": 0.3478754660709439, + "learning_rate": 0.00013933014354066987, + "loss": 1.1967, + "step": 728 + }, + { + "epoch": 0.07, + "grad_norm": 0.31136552748186935, + "learning_rate": 0.00013952153110047846, + "loss": 1.1666, + "step": 729 + }, + { + "epoch": 0.07, + "grad_norm": 0.29853571203421375, + "learning_rate": 0.0001397129186602871, + "loss": 1.1213, + "step": 730 + }, + { + "epoch": 0.07, + "grad_norm": 0.29830710212889877, + "learning_rate": 0.0001399043062200957, + "loss": 1.2283, + "step": 731 + }, + { + "epoch": 0.07, + "grad_norm": 0.2988658722078936, + "learning_rate": 0.00014009569377990431, + "loss": 1.2391, + "step": 732 + }, + { + "epoch": 0.07, + "grad_norm": 0.3453871024753651, + "learning_rate": 0.00014028708133971294, + "loss": 1.2111, + "step": 733 + }, + { + "epoch": 0.07, + "grad_norm": 0.3495768486847103, + "learning_rate": 0.00014047846889952154, + "loss": 1.1027, + "step": 734 + }, + { + "epoch": 0.07, + "grad_norm": 0.3164024002188871, + "learning_rate": 0.00014066985645933016, + "loss": 1.1015, + "step": 735 + }, + { + "epoch": 0.07, + "grad_norm": 0.3044139633248399, + "learning_rate": 0.00014086124401913876, + "loss": 1.0509, + "step": 736 + }, + { + "epoch": 0.07, + "grad_norm": 0.281079329494108, + "learning_rate": 0.00014105263157894736, + "loss": 1.0826, + "step": 737 + }, + { + "epoch": 0.07, + "grad_norm": 0.30636375000054217, + "learning_rate": 0.000141244019138756, + "loss": 1.2202, + "step": 738 + }, + { + "epoch": 0.07, + "grad_norm": 0.29149180784544115, + "learning_rate": 0.0001414354066985646, + "loss": 1.1551, + "step": 739 + }, + { + "epoch": 0.07, + "grad_norm": 0.3073819307679817, + "learning_rate": 0.0001416267942583732, + "loss": 1.2248, + "step": 740 + }, + { + "epoch": 0.07, + "grad_norm": 0.3217985704338287, + "learning_rate": 0.00014181818181818184, + "loss": 1.2045, + "step": 741 + }, + { + "epoch": 0.07, + "grad_norm": 0.3369269229369114, + "learning_rate": 0.00014200956937799044, + "loss": 1.1671, + "step": 742 + }, + { + "epoch": 0.07, + "grad_norm": 0.2981155510935532, + "learning_rate": 0.00014220095693779904, + "loss": 1.1354, + "step": 743 + }, + { + "epoch": 0.07, + "grad_norm": 0.3002935893022973, + "learning_rate": 0.00014239234449760766, + "loss": 1.0369, + "step": 744 + }, + { + "epoch": 0.07, + "grad_norm": 0.3061234355072447, + "learning_rate": 0.00014258373205741626, + "loss": 1.0122, + "step": 745 + }, + { + "epoch": 0.07, + "grad_norm": 0.28759317860073835, + "learning_rate": 0.00014277511961722489, + "loss": 1.0997, + "step": 746 + }, + { + "epoch": 0.07, + "grad_norm": 0.3064828735905134, + "learning_rate": 0.0001429665071770335, + "loss": 1.1225, + "step": 747 + }, + { + "epoch": 0.07, + "grad_norm": 0.3604086247045263, + "learning_rate": 0.0001431578947368421, + "loss": 1.1635, + "step": 748 + }, + { + "epoch": 0.07, + "grad_norm": 0.2914789050629064, + "learning_rate": 0.0001433492822966507, + "loss": 1.1704, + "step": 749 + }, + { + "epoch": 0.07, + "grad_norm": 0.3105462532363453, + "learning_rate": 0.00014354066985645933, + "loss": 1.144, + "step": 750 + }, + { + "epoch": 0.07, + "grad_norm": 0.29242484393022483, + "learning_rate": 0.00014373205741626793, + "loss": 1.0991, + "step": 751 + }, + { + "epoch": 0.07, + "grad_norm": 0.3009843941043775, + "learning_rate": 0.00014392344497607656, + "loss": 1.1409, + "step": 752 + }, + { + "epoch": 0.07, + "grad_norm": 0.35368948528183997, + "learning_rate": 0.00014411483253588518, + "loss": 1.0839, + "step": 753 + }, + { + "epoch": 0.07, + "grad_norm": 0.2908104621427735, + "learning_rate": 0.00014430622009569378, + "loss": 1.0997, + "step": 754 + }, + { + "epoch": 0.07, + "grad_norm": 0.2761031765983028, + "learning_rate": 0.0001444976076555024, + "loss": 1.0389, + "step": 755 + }, + { + "epoch": 0.07, + "grad_norm": 0.34458640320872364, + "learning_rate": 0.000144688995215311, + "loss": 1.0666, + "step": 756 + }, + { + "epoch": 0.07, + "grad_norm": 0.3426791854461418, + "learning_rate": 0.0001448803827751196, + "loss": 1.0227, + "step": 757 + }, + { + "epoch": 0.07, + "grad_norm": 0.33484757936373594, + "learning_rate": 0.00014507177033492826, + "loss": 1.1276, + "step": 758 + }, + { + "epoch": 0.07, + "grad_norm": 0.31113235116750904, + "learning_rate": 0.00014526315789473686, + "loss": 1.1435, + "step": 759 + }, + { + "epoch": 0.07, + "grad_norm": 0.3187397115174627, + "learning_rate": 0.00014545454545454546, + "loss": 1.2091, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 0.322859636752832, + "learning_rate": 0.00014564593301435408, + "loss": 1.1463, + "step": 761 + }, + { + "epoch": 0.07, + "grad_norm": 0.3054079698873811, + "learning_rate": 0.00014583732057416268, + "loss": 1.1532, + "step": 762 + }, + { + "epoch": 0.07, + "grad_norm": 0.37827200804472255, + "learning_rate": 0.00014602870813397128, + "loss": 1.1536, + "step": 763 + }, + { + "epoch": 0.07, + "grad_norm": 0.33688004627148077, + "learning_rate": 0.0001462200956937799, + "loss": 1.1633, + "step": 764 + }, + { + "epoch": 0.07, + "grad_norm": 0.3057781806456222, + "learning_rate": 0.00014641148325358853, + "loss": 1.1336, + "step": 765 + }, + { + "epoch": 0.07, + "grad_norm": 0.3214472678202446, + "learning_rate": 0.00014660287081339713, + "loss": 1.178, + "step": 766 + }, + { + "epoch": 0.07, + "grad_norm": 0.3615283182183831, + "learning_rate": 0.00014679425837320576, + "loss": 1.1158, + "step": 767 + }, + { + "epoch": 0.07, + "grad_norm": 0.3147571028922824, + "learning_rate": 0.00014698564593301435, + "loss": 1.131, + "step": 768 + }, + { + "epoch": 0.07, + "grad_norm": 0.269304950091198, + "learning_rate": 0.00014717703349282298, + "loss": 1.1885, + "step": 769 + }, + { + "epoch": 0.07, + "grad_norm": 0.2817004499058875, + "learning_rate": 0.00014736842105263158, + "loss": 1.0634, + "step": 770 + }, + { + "epoch": 0.07, + "grad_norm": 0.34677018154047495, + "learning_rate": 0.0001475598086124402, + "loss": 1.2329, + "step": 771 + }, + { + "epoch": 0.07, + "grad_norm": 0.33187657643162116, + "learning_rate": 0.00014775119617224883, + "loss": 1.0673, + "step": 772 + }, + { + "epoch": 0.07, + "grad_norm": 0.33397275501257906, + "learning_rate": 0.00014794258373205743, + "loss": 1.0975, + "step": 773 + }, + { + "epoch": 0.07, + "grad_norm": 0.2977218730080119, + "learning_rate": 0.00014813397129186603, + "loss": 1.0581, + "step": 774 + }, + { + "epoch": 0.07, + "grad_norm": 0.3480266756123412, + "learning_rate": 0.00014832535885167465, + "loss": 1.2395, + "step": 775 + }, + { + "epoch": 0.07, + "grad_norm": 0.33681513429762355, + "learning_rate": 0.00014851674641148325, + "loss": 1.1306, + "step": 776 + }, + { + "epoch": 0.07, + "grad_norm": 0.31749818370425387, + "learning_rate": 0.00014870813397129185, + "loss": 1.0901, + "step": 777 + }, + { + "epoch": 0.07, + "grad_norm": 0.33455138564966774, + "learning_rate": 0.0001488995215311005, + "loss": 1.2032, + "step": 778 + }, + { + "epoch": 0.07, + "grad_norm": 0.3504419380990198, + "learning_rate": 0.0001490909090909091, + "loss": 1.0602, + "step": 779 + }, + { + "epoch": 0.07, + "grad_norm": 0.3023880222584541, + "learning_rate": 0.0001492822966507177, + "loss": 1.1374, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 0.5469036927255182, + "learning_rate": 0.00014947368421052633, + "loss": 1.1802, + "step": 781 + }, + { + "epoch": 0.07, + "grad_norm": 0.308842167335779, + "learning_rate": 0.00014966507177033493, + "loss": 1.0936, + "step": 782 + }, + { + "epoch": 0.07, + "grad_norm": 0.2986359721179498, + "learning_rate": 0.00014985645933014355, + "loss": 1.1307, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 0.31664348432490785, + "learning_rate": 0.00015004784688995218, + "loss": 1.2007, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 0.265541663614485, + "learning_rate": 0.00015023923444976078, + "loss": 1.1276, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 0.3204030653032886, + "learning_rate": 0.0001504306220095694, + "loss": 1.1439, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 0.2783784466928858, + "learning_rate": 0.000150622009569378, + "loss": 1.2329, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 0.30216230746794037, + "learning_rate": 0.0001508133971291866, + "loss": 1.1853, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 0.3085281413718923, + "learning_rate": 0.00015100478468899522, + "loss": 1.1631, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 0.3221875710439296, + "learning_rate": 0.00015119617224880382, + "loss": 1.0776, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 0.2846073440656778, + "learning_rate": 0.00015138755980861245, + "loss": 1.1563, + "step": 791 + }, + { + "epoch": 0.08, + "grad_norm": 0.26550401235533877, + "learning_rate": 0.00015157894736842108, + "loss": 1.0467, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 0.37881120245858113, + "learning_rate": 0.00015177033492822967, + "loss": 1.1699, + "step": 793 + }, + { + "epoch": 0.08, + "grad_norm": 0.33594596707038277, + "learning_rate": 0.00015196172248803827, + "loss": 1.1653, + "step": 794 + }, + { + "epoch": 0.08, + "grad_norm": 0.3233311908095246, + "learning_rate": 0.0001521531100478469, + "loss": 1.1507, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 0.267677768320064, + "learning_rate": 0.0001523444976076555, + "loss": 1.1356, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 0.29484155965355746, + "learning_rate": 0.00015253588516746412, + "loss": 1.0681, + "step": 797 + }, + { + "epoch": 0.08, + "grad_norm": 0.3284519128368135, + "learning_rate": 0.00015272727272727275, + "loss": 1.0024, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 0.28701234783478413, + "learning_rate": 0.00015291866028708135, + "loss": 1.0892, + "step": 799 + }, + { + "epoch": 0.08, + "grad_norm": 0.31864235047065265, + "learning_rate": 0.00015311004784688995, + "loss": 1.1727, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 0.2745618500591329, + "learning_rate": 0.00015330143540669857, + "loss": 1.0224, + "step": 801 + }, + { + "epoch": 0.08, + "grad_norm": 0.30047657670046785, + "learning_rate": 0.00015349282296650717, + "loss": 1.1017, + "step": 802 + }, + { + "epoch": 0.08, + "grad_norm": 0.2914367942025512, + "learning_rate": 0.0001536842105263158, + "loss": 1.0163, + "step": 803 + }, + { + "epoch": 0.08, + "grad_norm": 0.2816221100141218, + "learning_rate": 0.00015387559808612442, + "loss": 1.1372, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 0.34419073166680986, + "learning_rate": 0.00015406698564593302, + "loss": 1.1991, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 0.2888432316245811, + "learning_rate": 0.00015425837320574165, + "loss": 1.1627, + "step": 806 + }, + { + "epoch": 0.08, + "grad_norm": 0.3191302154072048, + "learning_rate": 0.00015444976076555024, + "loss": 1.2458, + "step": 807 + }, + { + "epoch": 0.08, + "grad_norm": 0.2727293598902053, + "learning_rate": 0.00015464114832535884, + "loss": 1.1085, + "step": 808 + }, + { + "epoch": 0.08, + "grad_norm": 0.3029996217533104, + "learning_rate": 0.00015483253588516747, + "loss": 1.1319, + "step": 809 + }, + { + "epoch": 0.08, + "grad_norm": 0.2874429714766323, + "learning_rate": 0.0001550239234449761, + "loss": 1.1123, + "step": 810 + }, + { + "epoch": 0.08, + "grad_norm": 0.2665739546686572, + "learning_rate": 0.0001552153110047847, + "loss": 1.143, + "step": 811 + }, + { + "epoch": 0.08, + "grad_norm": 0.2958054625397739, + "learning_rate": 0.00015540669856459332, + "loss": 1.135, + "step": 812 + }, + { + "epoch": 0.08, + "grad_norm": 0.255412895550101, + "learning_rate": 0.00015559808612440192, + "loss": 1.1859, + "step": 813 + }, + { + "epoch": 0.08, + "grad_norm": 0.31547097343732156, + "learning_rate": 0.00015578947368421052, + "loss": 1.1657, + "step": 814 + }, + { + "epoch": 0.08, + "grad_norm": 0.34867394487181774, + "learning_rate": 0.00015598086124401914, + "loss": 1.0777, + "step": 815 + }, + { + "epoch": 0.08, + "grad_norm": 0.2921678038171233, + "learning_rate": 0.00015617224880382774, + "loss": 1.0522, + "step": 816 + }, + { + "epoch": 0.08, + "grad_norm": 0.28919534963089716, + "learning_rate": 0.00015636363636363637, + "loss": 1.0779, + "step": 817 + }, + { + "epoch": 0.08, + "grad_norm": 0.3114505303709412, + "learning_rate": 0.000156555023923445, + "loss": 1.1119, + "step": 818 + }, + { + "epoch": 0.08, + "grad_norm": 0.3065682691442617, + "learning_rate": 0.0001567464114832536, + "loss": 1.1352, + "step": 819 + }, + { + "epoch": 0.08, + "grad_norm": 0.3614199195441891, + "learning_rate": 0.00015693779904306222, + "loss": 1.1612, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 0.3101608207788147, + "learning_rate": 0.00015712918660287082, + "loss": 1.252, + "step": 821 + }, + { + "epoch": 0.08, + "grad_norm": 0.2975075722366304, + "learning_rate": 0.00015732057416267941, + "loss": 1.0687, + "step": 822 + }, + { + "epoch": 0.08, + "grad_norm": 0.2664048730695144, + "learning_rate": 0.00015751196172248807, + "loss": 1.0832, + "step": 823 + }, + { + "epoch": 0.08, + "grad_norm": 0.2952527620974602, + "learning_rate": 0.00015770334928229667, + "loss": 1.1082, + "step": 824 + }, + { + "epoch": 0.08, + "grad_norm": 0.3295245234429144, + "learning_rate": 0.00015789473684210527, + "loss": 1.162, + "step": 825 + }, + { + "epoch": 0.08, + "grad_norm": 0.3102397113238992, + "learning_rate": 0.0001580861244019139, + "loss": 1.0738, + "step": 826 + }, + { + "epoch": 0.08, + "grad_norm": 0.2693269386909286, + "learning_rate": 0.0001582775119617225, + "loss": 1.028, + "step": 827 + }, + { + "epoch": 0.08, + "grad_norm": 0.3669225930993825, + "learning_rate": 0.0001584688995215311, + "loss": 1.15, + "step": 828 + }, + { + "epoch": 0.08, + "grad_norm": 0.29318593683220057, + "learning_rate": 0.00015866028708133974, + "loss": 1.2408, + "step": 829 + }, + { + "epoch": 0.08, + "grad_norm": 0.2894248048442511, + "learning_rate": 0.00015885167464114834, + "loss": 1.1752, + "step": 830 + }, + { + "epoch": 0.08, + "grad_norm": 0.29125115224083087, + "learning_rate": 0.00015904306220095694, + "loss": 0.9574, + "step": 831 + }, + { + "epoch": 0.08, + "grad_norm": 0.40878890760263803, + "learning_rate": 0.00015923444976076556, + "loss": 1.2104, + "step": 832 + }, + { + "epoch": 0.08, + "grad_norm": 0.27861032872082103, + "learning_rate": 0.00015942583732057416, + "loss": 1.1413, + "step": 833 + }, + { + "epoch": 0.08, + "grad_norm": 0.2982611167661862, + "learning_rate": 0.0001596172248803828, + "loss": 1.2593, + "step": 834 + }, + { + "epoch": 0.08, + "grad_norm": 0.29326214489772795, + "learning_rate": 0.0001598086124401914, + "loss": 1.18, + "step": 835 + }, + { + "epoch": 0.08, + "grad_norm": 0.27131394008530785, + "learning_rate": 0.00016, + "loss": 1.1674, + "step": 836 + }, + { + "epoch": 0.08, + "grad_norm": 0.26712034034407034, + "learning_rate": 0.00016019138755980864, + "loss": 1.0161, + "step": 837 + }, + { + "epoch": 0.08, + "grad_norm": 0.36369725154573823, + "learning_rate": 0.00016038277511961724, + "loss": 1.175, + "step": 838 + }, + { + "epoch": 0.08, + "grad_norm": 0.3085671724318983, + "learning_rate": 0.00016057416267942584, + "loss": 1.1461, + "step": 839 + }, + { + "epoch": 0.08, + "grad_norm": 0.28077141855727894, + "learning_rate": 0.00016076555023923446, + "loss": 1.0922, + "step": 840 + }, + { + "epoch": 0.08, + "grad_norm": 0.3270351461507469, + "learning_rate": 0.00016095693779904306, + "loss": 1.0463, + "step": 841 + }, + { + "epoch": 0.08, + "grad_norm": 0.23981764247780088, + "learning_rate": 0.0001611483253588517, + "loss": 0.9635, + "step": 842 + }, + { + "epoch": 0.08, + "grad_norm": 0.28201419160149344, + "learning_rate": 0.0001613397129186603, + "loss": 1.1173, + "step": 843 + }, + { + "epoch": 0.08, + "grad_norm": 0.26889491956006867, + "learning_rate": 0.0001615311004784689, + "loss": 1.1132, + "step": 844 + }, + { + "epoch": 0.08, + "grad_norm": 0.27688897066555573, + "learning_rate": 0.0001617224880382775, + "loss": 1.0963, + "step": 845 + }, + { + "epoch": 0.08, + "grad_norm": 0.24565660227717426, + "learning_rate": 0.00016191387559808614, + "loss": 1.0694, + "step": 846 + }, + { + "epoch": 0.08, + "grad_norm": 0.28311675225629357, + "learning_rate": 0.00016210526315789473, + "loss": 1.0727, + "step": 847 + }, + { + "epoch": 0.08, + "grad_norm": 0.25275425247450756, + "learning_rate": 0.00016229665071770336, + "loss": 1.1726, + "step": 848 + }, + { + "epoch": 0.08, + "grad_norm": 0.3073349091629191, + "learning_rate": 0.00016248803827751199, + "loss": 1.237, + "step": 849 + }, + { + "epoch": 0.08, + "grad_norm": 0.3698105893782691, + "learning_rate": 0.00016267942583732058, + "loss": 1.1529, + "step": 850 + }, + { + "epoch": 0.08, + "grad_norm": 0.3066504421764291, + "learning_rate": 0.00016287081339712918, + "loss": 1.1875, + "step": 851 + }, + { + "epoch": 0.08, + "grad_norm": 0.2853734077261547, + "learning_rate": 0.0001630622009569378, + "loss": 1.1232, + "step": 852 + }, + { + "epoch": 0.08, + "grad_norm": 0.27498683022213083, + "learning_rate": 0.0001632535885167464, + "loss": 1.2345, + "step": 853 + }, + { + "epoch": 0.08, + "grad_norm": 0.26436373680139863, + "learning_rate": 0.00016344497607655503, + "loss": 1.1443, + "step": 854 + }, + { + "epoch": 0.08, + "grad_norm": 0.29039546604591765, + "learning_rate": 0.00016363636363636366, + "loss": 1.1451, + "step": 855 + }, + { + "epoch": 0.08, + "grad_norm": 0.2845332734411919, + "learning_rate": 0.00016382775119617226, + "loss": 1.1658, + "step": 856 + }, + { + "epoch": 0.08, + "grad_norm": 0.3118984941168386, + "learning_rate": 0.00016401913875598088, + "loss": 1.133, + "step": 857 + }, + { + "epoch": 0.08, + "grad_norm": 0.2910324342007811, + "learning_rate": 0.00016421052631578948, + "loss": 1.138, + "step": 858 + }, + { + "epoch": 0.08, + "grad_norm": 0.3067211385198509, + "learning_rate": 0.00016440191387559808, + "loss": 1.1517, + "step": 859 + }, + { + "epoch": 0.08, + "grad_norm": 0.290740982507053, + "learning_rate": 0.0001645933014354067, + "loss": 1.0561, + "step": 860 + }, + { + "epoch": 0.08, + "grad_norm": 0.3144516777697552, + "learning_rate": 0.0001647846889952153, + "loss": 1.0661, + "step": 861 + }, + { + "epoch": 0.08, + "grad_norm": 0.2970636821654555, + "learning_rate": 0.00016497607655502393, + "loss": 1.1634, + "step": 862 + }, + { + "epoch": 0.08, + "grad_norm": 0.3146333025319219, + "learning_rate": 0.00016516746411483256, + "loss": 1.0652, + "step": 863 + }, + { + "epoch": 0.08, + "grad_norm": 0.2644767264588937, + "learning_rate": 0.00016535885167464116, + "loss": 1.1516, + "step": 864 + }, + { + "epoch": 0.08, + "grad_norm": 0.3006840203451009, + "learning_rate": 0.00016555023923444975, + "loss": 1.1175, + "step": 865 + }, + { + "epoch": 0.08, + "grad_norm": 0.2809420339644184, + "learning_rate": 0.00016574162679425838, + "loss": 1.1057, + "step": 866 + }, + { + "epoch": 0.08, + "grad_norm": 0.3769059520574524, + "learning_rate": 0.00016593301435406698, + "loss": 1.1713, + "step": 867 + }, + { + "epoch": 0.08, + "grad_norm": 0.33622542833176833, + "learning_rate": 0.0001661244019138756, + "loss": 1.2223, + "step": 868 + }, + { + "epoch": 0.08, + "grad_norm": 0.2715119578007493, + "learning_rate": 0.00016631578947368423, + "loss": 1.0926, + "step": 869 + }, + { + "epoch": 0.08, + "grad_norm": 0.2788006611781337, + "learning_rate": 0.00016650717703349283, + "loss": 1.0598, + "step": 870 + }, + { + "epoch": 0.08, + "grad_norm": 0.29918887354582546, + "learning_rate": 0.00016669856459330145, + "loss": 1.1363, + "step": 871 + }, + { + "epoch": 0.08, + "grad_norm": 0.27116956033088324, + "learning_rate": 0.00016688995215311005, + "loss": 1.1357, + "step": 872 + }, + { + "epoch": 0.08, + "grad_norm": 0.3651972053932287, + "learning_rate": 0.00016708133971291865, + "loss": 1.1862, + "step": 873 + }, + { + "epoch": 0.08, + "grad_norm": 0.2941314020229377, + "learning_rate": 0.00016727272727272728, + "loss": 1.229, + "step": 874 + }, + { + "epoch": 0.08, + "grad_norm": 0.29667386462622886, + "learning_rate": 0.0001674641148325359, + "loss": 1.105, + "step": 875 + }, + { + "epoch": 0.08, + "grad_norm": 0.2888327808151174, + "learning_rate": 0.0001676555023923445, + "loss": 1.1328, + "step": 876 + }, + { + "epoch": 0.08, + "grad_norm": 0.31564538771648376, + "learning_rate": 0.00016784688995215313, + "loss": 1.2407, + "step": 877 + }, + { + "epoch": 0.08, + "grad_norm": 0.31419100780147885, + "learning_rate": 0.00016803827751196173, + "loss": 1.0472, + "step": 878 + }, + { + "epoch": 0.08, + "grad_norm": 0.26956520480679047, + "learning_rate": 0.00016822966507177033, + "loss": 1.1524, + "step": 879 + }, + { + "epoch": 0.08, + "grad_norm": 0.2726391902939466, + "learning_rate": 0.00016842105263157895, + "loss": 1.11, + "step": 880 + }, + { + "epoch": 0.08, + "grad_norm": 0.2993169367221595, + "learning_rate": 0.00016861244019138758, + "loss": 1.2059, + "step": 881 + }, + { + "epoch": 0.08, + "grad_norm": 0.2936290798225595, + "learning_rate": 0.00016880382775119618, + "loss": 1.1026, + "step": 882 + }, + { + "epoch": 0.08, + "grad_norm": 0.25728140759420537, + "learning_rate": 0.0001689952153110048, + "loss": 1.136, + "step": 883 + }, + { + "epoch": 0.08, + "grad_norm": 0.2659884049250215, + "learning_rate": 0.0001691866028708134, + "loss": 1.0311, + "step": 884 + }, + { + "epoch": 0.08, + "grad_norm": 0.29849696827544475, + "learning_rate": 0.00016937799043062203, + "loss": 1.0995, + "step": 885 + }, + { + "epoch": 0.08, + "grad_norm": 0.28395796526200556, + "learning_rate": 0.00016956937799043062, + "loss": 1.0948, + "step": 886 + }, + { + "epoch": 0.08, + "grad_norm": 0.3445355283030851, + "learning_rate": 0.00016976076555023925, + "loss": 1.1103, + "step": 887 + }, + { + "epoch": 0.08, + "grad_norm": 0.2538735838657434, + "learning_rate": 0.00016995215311004788, + "loss": 1.0752, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 0.3026322031952384, + "learning_rate": 0.00017014354066985647, + "loss": 1.1359, + "step": 889 + }, + { + "epoch": 0.09, + "grad_norm": 0.3200296529545863, + "learning_rate": 0.00017033492822966507, + "loss": 1.1851, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 0.29333134950174405, + "learning_rate": 0.0001705263157894737, + "loss": 1.0547, + "step": 891 + }, + { + "epoch": 0.09, + "grad_norm": 0.3189167751421413, + "learning_rate": 0.0001707177033492823, + "loss": 1.0605, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 0.28038685627245685, + "learning_rate": 0.0001709090909090909, + "loss": 1.1087, + "step": 893 + }, + { + "epoch": 0.09, + "grad_norm": 0.27600976689482803, + "learning_rate": 0.00017110047846889955, + "loss": 1.1681, + "step": 894 + }, + { + "epoch": 0.09, + "grad_norm": 0.30930194872855393, + "learning_rate": 0.00017129186602870815, + "loss": 1.1325, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 0.31300532276472537, + "learning_rate": 0.00017148325358851675, + "loss": 1.1002, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 0.2991016495421357, + "learning_rate": 0.00017167464114832537, + "loss": 1.076, + "step": 897 + }, + { + "epoch": 0.09, + "grad_norm": 0.31594108017658423, + "learning_rate": 0.00017186602870813397, + "loss": 1.136, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 0.27497095491638146, + "learning_rate": 0.0001720574162679426, + "loss": 1.2323, + "step": 899 + }, + { + "epoch": 0.09, + "grad_norm": 0.310962024257193, + "learning_rate": 0.00017224880382775122, + "loss": 1.1398, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 0.29697512814616595, + "learning_rate": 0.00017244019138755982, + "loss": 1.1342, + "step": 901 + }, + { + "epoch": 0.09, + "grad_norm": 0.239946301235031, + "learning_rate": 0.00017263157894736842, + "loss": 1.2081, + "step": 902 + }, + { + "epoch": 0.09, + "grad_norm": 0.2874130680609666, + "learning_rate": 0.00017282296650717705, + "loss": 1.125, + "step": 903 + }, + { + "epoch": 0.09, + "grad_norm": 0.3210023421862061, + "learning_rate": 0.00017301435406698564, + "loss": 1.2268, + "step": 904 + }, + { + "epoch": 0.09, + "grad_norm": 0.25985019789372976, + "learning_rate": 0.00017320574162679427, + "loss": 1.0704, + "step": 905 + }, + { + "epoch": 0.09, + "grad_norm": 0.28932579761775323, + "learning_rate": 0.00017339712918660287, + "loss": 1.1939, + "step": 906 + }, + { + "epoch": 0.09, + "grad_norm": 0.2984436701321717, + "learning_rate": 0.0001735885167464115, + "loss": 1.1441, + "step": 907 + }, + { + "epoch": 0.09, + "grad_norm": 0.33279429925895665, + "learning_rate": 0.00017377990430622012, + "loss": 1.2299, + "step": 908 + }, + { + "epoch": 0.09, + "grad_norm": 0.28785481315035893, + "learning_rate": 0.00017397129186602872, + "loss": 1.118, + "step": 909 + }, + { + "epoch": 0.09, + "grad_norm": 0.31655385538112546, + "learning_rate": 0.00017416267942583732, + "loss": 1.189, + "step": 910 + }, + { + "epoch": 0.09, + "grad_norm": 0.3038855880357351, + "learning_rate": 0.00017435406698564594, + "loss": 1.0654, + "step": 911 + }, + { + "epoch": 0.09, + "grad_norm": 0.3042729440177485, + "learning_rate": 0.00017454545454545454, + "loss": 1.1553, + "step": 912 + }, + { + "epoch": 0.09, + "grad_norm": 0.266680706575244, + "learning_rate": 0.00017473684210526317, + "loss": 1.1006, + "step": 913 + }, + { + "epoch": 0.09, + "grad_norm": 0.2741425104907168, + "learning_rate": 0.0001749282296650718, + "loss": 1.2319, + "step": 914 + }, + { + "epoch": 0.09, + "grad_norm": 0.2520129822636353, + "learning_rate": 0.0001751196172248804, + "loss": 1.1394, + "step": 915 + }, + { + "epoch": 0.09, + "grad_norm": 0.28607836069753895, + "learning_rate": 0.000175311004784689, + "loss": 1.2108, + "step": 916 + }, + { + "epoch": 0.09, + "grad_norm": 0.28836296745411716, + "learning_rate": 0.00017550239234449762, + "loss": 1.0772, + "step": 917 + }, + { + "epoch": 0.09, + "grad_norm": 0.26291170956841414, + "learning_rate": 0.00017569377990430622, + "loss": 1.1532, + "step": 918 + }, + { + "epoch": 0.09, + "grad_norm": 0.27624746568071396, + "learning_rate": 0.00017588516746411484, + "loss": 1.1178, + "step": 919 + }, + { + "epoch": 0.09, + "grad_norm": 0.28200386776822395, + "learning_rate": 0.00017607655502392347, + "loss": 1.1105, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 0.25462518315632554, + "learning_rate": 0.00017626794258373207, + "loss": 1.0717, + "step": 921 + }, + { + "epoch": 0.09, + "grad_norm": 0.27932944411599797, + "learning_rate": 0.0001764593301435407, + "loss": 1.2486, + "step": 922 + }, + { + "epoch": 0.09, + "grad_norm": 0.29462379215808215, + "learning_rate": 0.0001766507177033493, + "loss": 1.226, + "step": 923 + }, + { + "epoch": 0.09, + "grad_norm": 0.2741976731865599, + "learning_rate": 0.0001768421052631579, + "loss": 1.1797, + "step": 924 + }, + { + "epoch": 0.09, + "grad_norm": 0.2532434659032646, + "learning_rate": 0.00017703349282296652, + "loss": 1.0828, + "step": 925 + }, + { + "epoch": 0.09, + "grad_norm": 0.329346060797211, + "learning_rate": 0.00017722488038277514, + "loss": 1.1125, + "step": 926 + }, + { + "epoch": 0.09, + "grad_norm": 0.2644644824352827, + "learning_rate": 0.00017741626794258374, + "loss": 1.1048, + "step": 927 + }, + { + "epoch": 0.09, + "grad_norm": 0.2617940651450908, + "learning_rate": 0.00017760765550239237, + "loss": 1.2178, + "step": 928 + }, + { + "epoch": 0.09, + "grad_norm": 0.29432756373678265, + "learning_rate": 0.00017779904306220096, + "loss": 1.1336, + "step": 929 + }, + { + "epoch": 0.09, + "grad_norm": 0.28911304731696175, + "learning_rate": 0.00017799043062200956, + "loss": 1.1578, + "step": 930 + }, + { + "epoch": 0.09, + "grad_norm": 0.3006870934673598, + "learning_rate": 0.0001781818181818182, + "loss": 1.0588, + "step": 931 + }, + { + "epoch": 0.09, + "grad_norm": 0.31210608325092193, + "learning_rate": 0.0001783732057416268, + "loss": 1.2426, + "step": 932 + }, + { + "epoch": 0.09, + "grad_norm": 0.27626145357478726, + "learning_rate": 0.0001785645933014354, + "loss": 1.1609, + "step": 933 + }, + { + "epoch": 0.09, + "grad_norm": 0.2683905399507039, + "learning_rate": 0.00017875598086124404, + "loss": 1.1457, + "step": 934 + }, + { + "epoch": 0.09, + "grad_norm": 0.2661353870666551, + "learning_rate": 0.00017894736842105264, + "loss": 1.2095, + "step": 935 + }, + { + "epoch": 0.09, + "grad_norm": 0.33062559297582395, + "learning_rate": 0.00017913875598086126, + "loss": 1.2396, + "step": 936 + }, + { + "epoch": 0.09, + "grad_norm": 0.26950737804952357, + "learning_rate": 0.00017933014354066986, + "loss": 1.1983, + "step": 937 + }, + { + "epoch": 0.09, + "grad_norm": 0.29499843784362234, + "learning_rate": 0.00017952153110047846, + "loss": 1.2442, + "step": 938 + }, + { + "epoch": 0.09, + "grad_norm": 0.31186904072609634, + "learning_rate": 0.00017971291866028709, + "loss": 1.1494, + "step": 939 + }, + { + "epoch": 0.09, + "grad_norm": 0.2514154775647367, + "learning_rate": 0.0001799043062200957, + "loss": 1.1061, + "step": 940 + }, + { + "epoch": 0.09, + "grad_norm": 0.28595401946483395, + "learning_rate": 0.0001800956937799043, + "loss": 1.0322, + "step": 941 + }, + { + "epoch": 0.09, + "grad_norm": 0.32459069445525873, + "learning_rate": 0.00018028708133971294, + "loss": 1.2007, + "step": 942 + }, + { + "epoch": 0.09, + "grad_norm": 0.31789984803696647, + "learning_rate": 0.00018047846889952154, + "loss": 1.1057, + "step": 943 + }, + { + "epoch": 0.09, + "grad_norm": 0.2893543986536651, + "learning_rate": 0.00018066985645933013, + "loss": 1.1108, + "step": 944 + }, + { + "epoch": 0.09, + "grad_norm": 0.225754938363265, + "learning_rate": 0.00018086124401913876, + "loss": 1.0842, + "step": 945 + }, + { + "epoch": 0.09, + "grad_norm": 0.3473860341063463, + "learning_rate": 0.00018105263157894739, + "loss": 1.0824, + "step": 946 + }, + { + "epoch": 0.09, + "grad_norm": 0.2922950981615233, + "learning_rate": 0.00018124401913875598, + "loss": 1.1143, + "step": 947 + }, + { + "epoch": 0.09, + "grad_norm": 0.29161352434420446, + "learning_rate": 0.0001814354066985646, + "loss": 1.2325, + "step": 948 + }, + { + "epoch": 0.09, + "grad_norm": 0.29481985803408, + "learning_rate": 0.0001816267942583732, + "loss": 1.1527, + "step": 949 + }, + { + "epoch": 0.09, + "grad_norm": 0.2874729386092549, + "learning_rate": 0.00018181818181818183, + "loss": 1.1476, + "step": 950 + }, + { + "epoch": 0.09, + "grad_norm": 0.24921692256090058, + "learning_rate": 0.00018200956937799043, + "loss": 1.1567, + "step": 951 + }, + { + "epoch": 0.09, + "grad_norm": 0.31742487463024494, + "learning_rate": 0.00018220095693779906, + "loss": 1.1013, + "step": 952 + }, + { + "epoch": 0.09, + "grad_norm": 0.27456172247325683, + "learning_rate": 0.00018239234449760766, + "loss": 1.2069, + "step": 953 + }, + { + "epoch": 0.09, + "grad_norm": 0.28859064993450634, + "learning_rate": 0.00018258373205741628, + "loss": 1.2123, + "step": 954 + }, + { + "epoch": 0.09, + "grad_norm": 0.2750397123362856, + "learning_rate": 0.00018277511961722488, + "loss": 1.1231, + "step": 955 + }, + { + "epoch": 0.09, + "grad_norm": 0.23915748062608722, + "learning_rate": 0.0001829665071770335, + "loss": 1.1612, + "step": 956 + }, + { + "epoch": 0.09, + "grad_norm": 0.2722479832588104, + "learning_rate": 0.0001831578947368421, + "loss": 1.1897, + "step": 957 + }, + { + "epoch": 0.09, + "grad_norm": 0.2863352675014705, + "learning_rate": 0.00018334928229665073, + "loss": 1.1062, + "step": 958 + }, + { + "epoch": 0.09, + "grad_norm": 0.25258807482282203, + "learning_rate": 0.00018354066985645936, + "loss": 1.1574, + "step": 959 + }, + { + "epoch": 0.09, + "grad_norm": 0.27455220282180104, + "learning_rate": 0.00018373205741626796, + "loss": 1.0886, + "step": 960 + }, + { + "epoch": 0.09, + "grad_norm": 0.2693559937731345, + "learning_rate": 0.00018392344497607656, + "loss": 1.1873, + "step": 961 + }, + { + "epoch": 0.09, + "grad_norm": 0.2487980816971801, + "learning_rate": 0.00018411483253588518, + "loss": 1.2245, + "step": 962 + }, + { + "epoch": 0.09, + "grad_norm": 0.6732577941816555, + "learning_rate": 0.00018430622009569378, + "loss": 1.1813, + "step": 963 + }, + { + "epoch": 0.09, + "grad_norm": 0.2912759304052633, + "learning_rate": 0.00018449760765550238, + "loss": 1.168, + "step": 964 + }, + { + "epoch": 0.09, + "grad_norm": 0.26989856763778836, + "learning_rate": 0.00018468899521531103, + "loss": 1.0137, + "step": 965 + }, + { + "epoch": 0.09, + "grad_norm": 0.25602835842131616, + "learning_rate": 0.00018488038277511963, + "loss": 1.1879, + "step": 966 + }, + { + "epoch": 0.09, + "grad_norm": 0.25725078226468107, + "learning_rate": 0.00018507177033492823, + "loss": 1.174, + "step": 967 + }, + { + "epoch": 0.09, + "grad_norm": 0.27889203556658276, + "learning_rate": 0.00018526315789473685, + "loss": 1.2013, + "step": 968 + }, + { + "epoch": 0.09, + "grad_norm": 0.28462011286220296, + "learning_rate": 0.00018545454545454545, + "loss": 1.0878, + "step": 969 + }, + { + "epoch": 0.09, + "grad_norm": 0.27682759364760257, + "learning_rate": 0.00018564593301435408, + "loss": 1.085, + "step": 970 + }, + { + "epoch": 0.09, + "grad_norm": 0.28136179421463786, + "learning_rate": 0.0001858373205741627, + "loss": 1.1807, + "step": 971 + }, + { + "epoch": 0.09, + "grad_norm": 0.2506017584700625, + "learning_rate": 0.0001860287081339713, + "loss": 1.1538, + "step": 972 + }, + { + "epoch": 0.09, + "grad_norm": 0.2681898339952538, + "learning_rate": 0.00018622009569377993, + "loss": 1.0615, + "step": 973 + }, + { + "epoch": 0.09, + "grad_norm": 0.2838246571307257, + "learning_rate": 0.00018641148325358853, + "loss": 1.1778, + "step": 974 + }, + { + "epoch": 0.09, + "grad_norm": 0.2758038504041395, + "learning_rate": 0.00018660287081339713, + "loss": 1.1038, + "step": 975 + }, + { + "epoch": 0.09, + "grad_norm": 0.28266068816982276, + "learning_rate": 0.00018679425837320575, + "loss": 1.1487, + "step": 976 + }, + { + "epoch": 0.09, + "grad_norm": 0.2655825547541941, + "learning_rate": 0.00018698564593301435, + "loss": 1.0846, + "step": 977 + }, + { + "epoch": 0.09, + "grad_norm": 0.2750864417199089, + "learning_rate": 0.00018717703349282298, + "loss": 1.0925, + "step": 978 + }, + { + "epoch": 0.09, + "grad_norm": 0.28328763891237363, + "learning_rate": 0.0001873684210526316, + "loss": 1.1602, + "step": 979 + }, + { + "epoch": 0.09, + "grad_norm": 0.274427495879147, + "learning_rate": 0.0001875598086124402, + "loss": 1.0184, + "step": 980 + }, + { + "epoch": 0.09, + "grad_norm": 0.29677822769592865, + "learning_rate": 0.0001877511961722488, + "loss": 1.1316, + "step": 981 + }, + { + "epoch": 0.09, + "grad_norm": 0.35675044865453487, + "learning_rate": 0.00018794258373205743, + "loss": 1.1299, + "step": 982 + }, + { + "epoch": 0.09, + "grad_norm": 0.27471990871455726, + "learning_rate": 0.00018813397129186602, + "loss": 1.0977, + "step": 983 + }, + { + "epoch": 0.09, + "grad_norm": 0.32677576558264015, + "learning_rate": 0.00018832535885167465, + "loss": 1.0416, + "step": 984 + }, + { + "epoch": 0.09, + "grad_norm": 0.3449420887466517, + "learning_rate": 0.00018851674641148328, + "loss": 1.218, + "step": 985 + }, + { + "epoch": 0.09, + "grad_norm": 0.3187055721961639, + "learning_rate": 0.00018870813397129187, + "loss": 1.1091, + "step": 986 + }, + { + "epoch": 0.09, + "grad_norm": 0.3143792697319127, + "learning_rate": 0.0001888995215311005, + "loss": 1.067, + "step": 987 + }, + { + "epoch": 0.09, + "grad_norm": 0.2742909947428014, + "learning_rate": 0.0001890909090909091, + "loss": 1.225, + "step": 988 + }, + { + "epoch": 0.09, + "grad_norm": 0.27319677319302543, + "learning_rate": 0.0001892822966507177, + "loss": 1.1487, + "step": 989 + }, + { + "epoch": 0.09, + "grad_norm": 0.2758157497549949, + "learning_rate": 0.00018947368421052632, + "loss": 1.1109, + "step": 990 + }, + { + "epoch": 0.09, + "grad_norm": 0.2574079506381213, + "learning_rate": 0.00018966507177033495, + "loss": 1.1476, + "step": 991 + }, + { + "epoch": 0.09, + "grad_norm": 0.332702187603211, + "learning_rate": 0.00018985645933014355, + "loss": 1.0896, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 0.2861721583962, + "learning_rate": 0.00019004784688995217, + "loss": 1.2234, + "step": 993 + }, + { + "epoch": 0.1, + "grad_norm": 0.2467936267051518, + "learning_rate": 0.00019023923444976077, + "loss": 1.1656, + "step": 994 + }, + { + "epoch": 0.1, + "grad_norm": 0.28781558903434595, + "learning_rate": 0.00019043062200956937, + "loss": 1.1853, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 0.2916455357271407, + "learning_rate": 0.000190622009569378, + "loss": 1.0269, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 0.3201870144576391, + "learning_rate": 0.00019081339712918662, + "loss": 1.1852, + "step": 997 + }, + { + "epoch": 0.1, + "grad_norm": 0.27545254213477577, + "learning_rate": 0.00019100478468899522, + "loss": 1.0957, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 0.2826496819385951, + "learning_rate": 0.00019119617224880385, + "loss": 1.2255, + "step": 999 + }, + { + "epoch": 0.1, + "grad_norm": 0.2967102485192698, + "learning_rate": 0.00019138755980861245, + "loss": 1.1536, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 0.29117608778714893, + "learning_rate": 0.00019157894736842104, + "loss": 1.0878, + "step": 1001 + }, + { + "epoch": 0.1, + "grad_norm": 0.28851304804169287, + "learning_rate": 0.00019177033492822967, + "loss": 1.0898, + "step": 1002 + }, + { + "epoch": 0.1, + "grad_norm": 0.27111717804566754, + "learning_rate": 0.00019196172248803827, + "loss": 1.2214, + "step": 1003 + }, + { + "epoch": 0.1, + "grad_norm": 0.29632228590140464, + "learning_rate": 0.0001921531100478469, + "loss": 1.1534, + "step": 1004 + }, + { + "epoch": 0.1, + "grad_norm": 0.30166486227944156, + "learning_rate": 0.00019234449760765552, + "loss": 1.1784, + "step": 1005 + }, + { + "epoch": 0.1, + "grad_norm": 0.261168294050402, + "learning_rate": 0.00019253588516746412, + "loss": 1.2274, + "step": 1006 + }, + { + "epoch": 0.1, + "grad_norm": 0.2696524388115216, + "learning_rate": 0.00019272727272727274, + "loss": 1.1256, + "step": 1007 + }, + { + "epoch": 0.1, + "grad_norm": 0.30883168940001077, + "learning_rate": 0.00019291866028708134, + "loss": 1.1804, + "step": 1008 + }, + { + "epoch": 0.1, + "grad_norm": 0.2901725454324794, + "learning_rate": 0.00019311004784688994, + "loss": 1.0656, + "step": 1009 + }, + { + "epoch": 0.1, + "grad_norm": 0.30050679633218647, + "learning_rate": 0.0001933014354066986, + "loss": 1.1217, + "step": 1010 + }, + { + "epoch": 0.1, + "grad_norm": 0.2763711001518656, + "learning_rate": 0.0001934928229665072, + "loss": 1.2114, + "step": 1011 + }, + { + "epoch": 0.1, + "grad_norm": 0.2676109407157463, + "learning_rate": 0.0001936842105263158, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.1, + "grad_norm": 0.2747480845011328, + "learning_rate": 0.00019387559808612442, + "loss": 1.038, + "step": 1013 + }, + { + "epoch": 0.1, + "grad_norm": 0.24960295337688276, + "learning_rate": 0.00019406698564593302, + "loss": 1.0625, + "step": 1014 + }, + { + "epoch": 0.1, + "grad_norm": 0.2721591800223072, + "learning_rate": 0.00019425837320574162, + "loss": 1.1327, + "step": 1015 + }, + { + "epoch": 0.1, + "grad_norm": 0.2877329511310855, + "learning_rate": 0.00019444976076555027, + "loss": 1.228, + "step": 1016 + }, + { + "epoch": 0.1, + "grad_norm": 0.2568028077694964, + "learning_rate": 0.00019464114832535887, + "loss": 1.0683, + "step": 1017 + }, + { + "epoch": 0.1, + "grad_norm": 0.2678405294971607, + "learning_rate": 0.00019483253588516747, + "loss": 1.1125, + "step": 1018 + }, + { + "epoch": 0.1, + "grad_norm": 0.2963652522200652, + "learning_rate": 0.0001950239234449761, + "loss": 1.0905, + "step": 1019 + }, + { + "epoch": 0.1, + "grad_norm": 0.26009393679319537, + "learning_rate": 0.0001952153110047847, + "loss": 1.1036, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 0.3049720818580699, + "learning_rate": 0.00019540669856459332, + "loss": 1.1964, + "step": 1021 + }, + { + "epoch": 0.1, + "grad_norm": 0.3050130613963167, + "learning_rate": 0.00019559808612440191, + "loss": 1.1293, + "step": 1022 + }, + { + "epoch": 0.1, + "grad_norm": 0.24297369971258104, + "learning_rate": 0.00019578947368421054, + "loss": 1.1143, + "step": 1023 + }, + { + "epoch": 0.1, + "grad_norm": 0.24077286684290172, + "learning_rate": 0.00019598086124401917, + "loss": 1.0764, + "step": 1024 + }, + { + "epoch": 0.1, + "grad_norm": 0.3113100418888948, + "learning_rate": 0.00019617224880382777, + "loss": 1.1246, + "step": 1025 + }, + { + "epoch": 0.1, + "grad_norm": 0.2784731985247703, + "learning_rate": 0.00019636363636363636, + "loss": 1.0998, + "step": 1026 + }, + { + "epoch": 0.1, + "grad_norm": 0.2542533680624268, + "learning_rate": 0.000196555023923445, + "loss": 1.114, + "step": 1027 + }, + { + "epoch": 0.1, + "grad_norm": 0.28332309977048276, + "learning_rate": 0.0001967464114832536, + "loss": 1.1719, + "step": 1028 + }, + { + "epoch": 0.1, + "grad_norm": 0.25261282572279636, + "learning_rate": 0.00019693779904306221, + "loss": 1.1069, + "step": 1029 + }, + { + "epoch": 0.1, + "grad_norm": 0.28908512950153364, + "learning_rate": 0.00019712918660287084, + "loss": 1.0939, + "step": 1030 + }, + { + "epoch": 0.1, + "grad_norm": 0.2624681443069945, + "learning_rate": 0.00019732057416267944, + "loss": 1.1426, + "step": 1031 + }, + { + "epoch": 0.1, + "grad_norm": 0.26954809036931093, + "learning_rate": 0.00019751196172248804, + "loss": 1.0777, + "step": 1032 + }, + { + "epoch": 0.1, + "grad_norm": 0.2921435580998635, + "learning_rate": 0.00019770334928229666, + "loss": 1.0915, + "step": 1033 + }, + { + "epoch": 0.1, + "grad_norm": 0.21257005701595452, + "learning_rate": 0.00019789473684210526, + "loss": 1.1055, + "step": 1034 + }, + { + "epoch": 0.1, + "grad_norm": 0.27523674720420943, + "learning_rate": 0.0001980861244019139, + "loss": 1.0515, + "step": 1035 + }, + { + "epoch": 0.1, + "grad_norm": 0.26415508019617007, + "learning_rate": 0.0001982775119617225, + "loss": 1.0515, + "step": 1036 + }, + { + "epoch": 0.1, + "grad_norm": 0.25592610307218705, + "learning_rate": 0.0001984688995215311, + "loss": 1.1751, + "step": 1037 + }, + { + "epoch": 0.1, + "grad_norm": 0.25430310175648296, + "learning_rate": 0.00019866028708133974, + "loss": 1.123, + "step": 1038 + }, + { + "epoch": 0.1, + "grad_norm": 0.2861528947212422, + "learning_rate": 0.00019885167464114834, + "loss": 1.0859, + "step": 1039 + }, + { + "epoch": 0.1, + "grad_norm": 0.2738046774076065, + "learning_rate": 0.00019904306220095693, + "loss": 1.1405, + "step": 1040 + }, + { + "epoch": 0.1, + "grad_norm": 0.2726143048105954, + "learning_rate": 0.00019923444976076556, + "loss": 1.264, + "step": 1041 + }, + { + "epoch": 0.1, + "grad_norm": 0.27872152562297303, + "learning_rate": 0.0001994258373205742, + "loss": 1.3155, + "step": 1042 + }, + { + "epoch": 0.1, + "grad_norm": 0.2852218650666301, + "learning_rate": 0.00019961722488038279, + "loss": 1.1655, + "step": 1043 + }, + { + "epoch": 0.1, + "grad_norm": 0.23588395214808744, + "learning_rate": 0.0001998086124401914, + "loss": 1.0397, + "step": 1044 + }, + { + "epoch": 0.1, + "grad_norm": 0.2666368869674148, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 1045 + }, + { + "epoch": 0.1, + "grad_norm": 0.2930022628207633, + "learning_rate": 0.00019999999874871857, + "loss": 1.0405, + "step": 1046 + }, + { + "epoch": 0.1, + "grad_norm": 0.2710774549293637, + "learning_rate": 0.00019999999499487433, + "loss": 1.1506, + "step": 1047 + }, + { + "epoch": 0.1, + "grad_norm": 0.25385995300238745, + "learning_rate": 0.00019999998873846737, + "loss": 1.267, + "step": 1048 + }, + { + "epoch": 0.1, + "grad_norm": 0.27774997441775784, + "learning_rate": 0.00019999997997949785, + "loss": 1.1939, + "step": 1049 + }, + { + "epoch": 0.1, + "grad_norm": 0.26076832391011084, + "learning_rate": 0.00019999996871796597, + "loss": 1.1117, + "step": 1050 + }, + { + "epoch": 0.1, + "grad_norm": 0.235868498103916, + "learning_rate": 0.00019999995495387202, + "loss": 1.1701, + "step": 1051 + }, + { + "epoch": 0.1, + "grad_norm": 0.24077986955464514, + "learning_rate": 0.00019999993868721638, + "loss": 1.137, + "step": 1052 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512166776788439, + "learning_rate": 0.0001999999199179994, + "loss": 1.1414, + "step": 1053 + }, + { + "epoch": 0.1, + "grad_norm": 0.2654407191870313, + "learning_rate": 0.00019999989864622159, + "loss": 1.0333, + "step": 1054 + }, + { + "epoch": 0.1, + "grad_norm": 0.2642901952276395, + "learning_rate": 0.00019999987487188348, + "loss": 1.1385, + "step": 1055 + }, + { + "epoch": 0.1, + "grad_norm": 0.23723222653673273, + "learning_rate": 0.00019999984859498562, + "loss": 1.1103, + "step": 1056 + }, + { + "epoch": 0.1, + "grad_norm": 0.2576508658140534, + "learning_rate": 0.00019999981981552872, + "loss": 1.0641, + "step": 1057 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512702002371694, + "learning_rate": 0.00019999978853351346, + "loss": 1.1742, + "step": 1058 + }, + { + "epoch": 0.1, + "grad_norm": 0.2544113883838849, + "learning_rate": 0.0001999997547489407, + "loss": 1.249, + "step": 1059 + }, + { + "epoch": 0.1, + "grad_norm": 0.29453181610522905, + "learning_rate": 0.00019999971846181117, + "loss": 1.0817, + "step": 1060 + }, + { + "epoch": 0.1, + "grad_norm": 0.3046660451518799, + "learning_rate": 0.00019999967967212587, + "loss": 1.1202, + "step": 1061 + }, + { + "epoch": 0.1, + "grad_norm": 0.2711240525076058, + "learning_rate": 0.0001999996383798857, + "loss": 1.1488, + "step": 1062 + }, + { + "epoch": 0.1, + "grad_norm": 0.26069781523913904, + "learning_rate": 0.0001999995945850918, + "loss": 1.1204, + "step": 1063 + }, + { + "epoch": 0.1, + "grad_norm": 0.2759936162347398, + "learning_rate": 0.00019999954828774514, + "loss": 1.2418, + "step": 1064 + }, + { + "epoch": 0.1, + "grad_norm": 0.32559445650786323, + "learning_rate": 0.00019999949948784696, + "loss": 1.0396, + "step": 1065 + }, + { + "epoch": 0.1, + "grad_norm": 0.29446236701732303, + "learning_rate": 0.00019999944818539843, + "loss": 0.9828, + "step": 1066 + }, + { + "epoch": 0.1, + "grad_norm": 0.2512414839667447, + "learning_rate": 0.00019999939438040092, + "loss": 1.0964, + "step": 1067 + }, + { + "epoch": 0.1, + "grad_norm": 0.2522166834487399, + "learning_rate": 0.00019999933807285567, + "loss": 1.1991, + "step": 1068 + }, + { + "epoch": 0.1, + "grad_norm": 0.2776488352071124, + "learning_rate": 0.00019999927926276417, + "loss": 1.2357, + "step": 1069 + }, + { + "epoch": 0.1, + "grad_norm": 0.3107756422517976, + "learning_rate": 0.00019999921795012783, + "loss": 1.1875, + "step": 1070 + }, + { + "epoch": 0.1, + "grad_norm": 0.23140904026513692, + "learning_rate": 0.00019999915413494823, + "loss": 1.0612, + "step": 1071 + }, + { + "epoch": 0.1, + "grad_norm": 0.28830876945790945, + "learning_rate": 0.00019999908781722693, + "loss": 1.0622, + "step": 1072 + }, + { + "epoch": 0.1, + "grad_norm": 0.24641652710238304, + "learning_rate": 0.00019999901899696564, + "loss": 1.1553, + "step": 1073 + }, + { + "epoch": 0.1, + "grad_norm": 0.3285726477920543, + "learning_rate": 0.00019999894767416603, + "loss": 1.1287, + "step": 1074 + }, + { + "epoch": 0.1, + "grad_norm": 0.2868313621923491, + "learning_rate": 0.00019999887384882992, + "loss": 1.1679, + "step": 1075 + }, + { + "epoch": 0.1, + "grad_norm": 0.2888935086026084, + "learning_rate": 0.00019999879752095914, + "loss": 1.078, + "step": 1076 + }, + { + "epoch": 0.1, + "grad_norm": 0.25581751198117825, + "learning_rate": 0.0001999987186905556, + "loss": 1.1637, + "step": 1077 + }, + { + "epoch": 0.1, + "grad_norm": 0.28668728467855203, + "learning_rate": 0.0001999986373576213, + "loss": 1.1557, + "step": 1078 + }, + { + "epoch": 0.1, + "grad_norm": 0.2858855815822476, + "learning_rate": 0.00019999855352215824, + "loss": 1.1543, + "step": 1079 + }, + { + "epoch": 0.1, + "grad_norm": 0.24866679351848656, + "learning_rate": 0.0001999984671841685, + "loss": 1.088, + "step": 1080 + }, + { + "epoch": 0.1, + "grad_norm": 0.2802988000108613, + "learning_rate": 0.00019999837834365432, + "loss": 1.1045, + "step": 1081 + }, + { + "epoch": 0.1, + "grad_norm": 0.28290564790646, + "learning_rate": 0.00019999828700061786, + "loss": 1.1013, + "step": 1082 + }, + { + "epoch": 0.1, + "grad_norm": 0.2670174309559056, + "learning_rate": 0.0001999981931550614, + "loss": 1.0202, + "step": 1083 + }, + { + "epoch": 0.1, + "grad_norm": 0.29742334914408336, + "learning_rate": 0.00019999809680698734, + "loss": 1.1634, + "step": 1084 + }, + { + "epoch": 0.1, + "grad_norm": 0.2613603430268145, + "learning_rate": 0.00019999799795639804, + "loss": 1.1906, + "step": 1085 + }, + { + "epoch": 0.1, + "grad_norm": 0.2368844788947555, + "learning_rate": 0.000199997896603296, + "loss": 1.1789, + "step": 1086 + }, + { + "epoch": 0.1, + "grad_norm": 0.28495588713062425, + "learning_rate": 0.00019999779274768376, + "loss": 1.1759, + "step": 1087 + }, + { + "epoch": 0.1, + "grad_norm": 0.2822715528001476, + "learning_rate": 0.0001999976863895639, + "loss": 1.0508, + "step": 1088 + }, + { + "epoch": 0.1, + "grad_norm": 0.28574997489486803, + "learning_rate": 0.0001999975775289391, + "loss": 1.1224, + "step": 1089 + }, + { + "epoch": 0.1, + "grad_norm": 0.261898017632014, + "learning_rate": 0.00019999746616581208, + "loss": 1.1035, + "step": 1090 + }, + { + "epoch": 0.1, + "grad_norm": 0.28859851706983464, + "learning_rate": 0.00019999735230018562, + "loss": 1.1726, + "step": 1091 + }, + { + "epoch": 0.1, + "grad_norm": 0.26017489671317706, + "learning_rate": 0.00019999723593206256, + "loss": 1.0777, + "step": 1092 + }, + { + "epoch": 0.1, + "grad_norm": 0.2808346316808804, + "learning_rate": 0.00019999711706144584, + "loss": 1.1169, + "step": 1093 + }, + { + "epoch": 0.1, + "grad_norm": 0.26961618081539596, + "learning_rate": 0.0001999969956883384, + "loss": 1.1525, + "step": 1094 + }, + { + "epoch": 0.1, + "grad_norm": 0.23998423085029977, + "learning_rate": 0.0001999968718127433, + "loss": 1.1088, + "step": 1095 + }, + { + "epoch": 0.1, + "grad_norm": 0.27133623294565795, + "learning_rate": 0.00019999674543466368, + "loss": 1.0187, + "step": 1096 + }, + { + "epoch": 0.1, + "grad_norm": 0.28888071684005906, + "learning_rate": 0.00019999661655410261, + "loss": 1.0476, + "step": 1097 + }, + { + "epoch": 0.11, + "grad_norm": 0.26874358976360063, + "learning_rate": 0.0001999964851710634, + "loss": 1.1429, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 0.2803133560421145, + "learning_rate": 0.0001999963512855493, + "loss": 1.2504, + "step": 1099 + }, + { + "epoch": 0.11, + "grad_norm": 0.2913063287806824, + "learning_rate": 0.00019999621489756364, + "loss": 1.1604, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 0.27440490495841235, + "learning_rate": 0.00019999607600710984, + "loss": 1.1517, + "step": 1101 + }, + { + "epoch": 0.11, + "grad_norm": 0.277572309820751, + "learning_rate": 0.00019999593461419144, + "loss": 1.0957, + "step": 1102 + }, + { + "epoch": 0.11, + "grad_norm": 0.3075924298382781, + "learning_rate": 0.0001999957907188119, + "loss": 1.142, + "step": 1103 + }, + { + "epoch": 0.11, + "grad_norm": 0.23985413219751897, + "learning_rate": 0.00019999564432097487, + "loss": 1.1932, + "step": 1104 + }, + { + "epoch": 0.11, + "grad_norm": 0.2408338302884486, + "learning_rate": 0.00019999549542068395, + "loss": 1.0735, + "step": 1105 + }, + { + "epoch": 0.11, + "grad_norm": 0.26874856387294116, + "learning_rate": 0.00019999534401794297, + "loss": 1.1553, + "step": 1106 + }, + { + "epoch": 0.11, + "grad_norm": 0.3014584853984502, + "learning_rate": 0.00019999519011275566, + "loss": 1.1655, + "step": 1107 + }, + { + "epoch": 0.11, + "grad_norm": 0.2843833242046219, + "learning_rate": 0.00019999503370512583, + "loss": 1.1877, + "step": 1108 + }, + { + "epoch": 0.11, + "grad_norm": 0.2512315616335756, + "learning_rate": 0.00019999487479505746, + "loss": 1.2, + "step": 1109 + }, + { + "epoch": 0.11, + "grad_norm": 0.3854687733857706, + "learning_rate": 0.00019999471338255452, + "loss": 1.1755, + "step": 1110 + }, + { + "epoch": 0.11, + "grad_norm": 0.2957638364283729, + "learning_rate": 0.00019999454946762103, + "loss": 1.1496, + "step": 1111 + }, + { + "epoch": 0.11, + "grad_norm": 0.2866505879252708, + "learning_rate": 0.00019999438305026108, + "loss": 0.9671, + "step": 1112 + }, + { + "epoch": 0.11, + "grad_norm": 0.31100005319009444, + "learning_rate": 0.00019999421413047886, + "loss": 1.1924, + "step": 1113 + }, + { + "epoch": 0.11, + "grad_norm": 0.2736868129625665, + "learning_rate": 0.00019999404270827856, + "loss": 1.0565, + "step": 1114 + }, + { + "epoch": 0.11, + "grad_norm": 0.3082559508155182, + "learning_rate": 0.00019999386878366454, + "loss": 1.1636, + "step": 1115 + }, + { + "epoch": 0.11, + "grad_norm": 0.2709734888315765, + "learning_rate": 0.0001999936923566411, + "loss": 1.1289, + "step": 1116 + }, + { + "epoch": 0.11, + "grad_norm": 0.32185710854614685, + "learning_rate": 0.00019999351342721262, + "loss": 1.1404, + "step": 1117 + }, + { + "epoch": 0.11, + "grad_norm": 0.31162451372291133, + "learning_rate": 0.0001999933319953837, + "loss": 1.112, + "step": 1118 + }, + { + "epoch": 0.11, + "grad_norm": 0.2752825720487004, + "learning_rate": 0.00019999314806115872, + "loss": 1.143, + "step": 1119 + }, + { + "epoch": 0.11, + "grad_norm": 0.2917340741765025, + "learning_rate": 0.0001999929616245424, + "loss": 1.0736, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 0.302518080441679, + "learning_rate": 0.0001999927726855394, + "loss": 1.0372, + "step": 1121 + }, + { + "epoch": 0.11, + "grad_norm": 0.25312327730893897, + "learning_rate": 0.00019999258124415442, + "loss": 1.1355, + "step": 1122 + }, + { + "epoch": 0.11, + "grad_norm": 0.2656439197184839, + "learning_rate": 0.00019999238730039222, + "loss": 1.0496, + "step": 1123 + }, + { + "epoch": 0.11, + "grad_norm": 0.24862847164472834, + "learning_rate": 0.00019999219085425768, + "loss": 1.0786, + "step": 1124 + }, + { + "epoch": 0.11, + "grad_norm": 0.28410932219305585, + "learning_rate": 0.00019999199190575575, + "loss": 1.0904, + "step": 1125 + }, + { + "epoch": 0.11, + "grad_norm": 0.2720824714159536, + "learning_rate": 0.00019999179045489135, + "loss": 1.0153, + "step": 1126 + }, + { + "epoch": 0.11, + "grad_norm": 0.2759096778009793, + "learning_rate": 0.00019999158650166958, + "loss": 1.1001, + "step": 1127 + }, + { + "epoch": 0.11, + "grad_norm": 0.6106436469666682, + "learning_rate": 0.0001999913800460955, + "loss": 1.1342, + "step": 1128 + }, + { + "epoch": 0.11, + "grad_norm": 0.23702252854532238, + "learning_rate": 0.00019999117108817428, + "loss": 1.0916, + "step": 1129 + }, + { + "epoch": 0.11, + "grad_norm": 0.25849383000967896, + "learning_rate": 0.0001999909596279112, + "loss": 1.1749, + "step": 1130 + }, + { + "epoch": 0.11, + "grad_norm": 0.28015440839970107, + "learning_rate": 0.0001999907456653115, + "loss": 1.1269, + "step": 1131 + }, + { + "epoch": 0.11, + "grad_norm": 0.26460543807236786, + "learning_rate": 0.00019999052920038053, + "loss": 1.1749, + "step": 1132 + }, + { + "epoch": 0.11, + "grad_norm": 0.27199116543714963, + "learning_rate": 0.0001999903102331237, + "loss": 1.0885, + "step": 1133 + }, + { + "epoch": 0.11, + "grad_norm": 0.22326728149419828, + "learning_rate": 0.00019999008876354658, + "loss": 1.106, + "step": 1134 + }, + { + "epoch": 0.11, + "grad_norm": 0.2957916257263048, + "learning_rate": 0.0001999898647916546, + "loss": 1.0432, + "step": 1135 + }, + { + "epoch": 0.11, + "grad_norm": 0.2926664019325837, + "learning_rate": 0.00019998963831745344, + "loss": 1.0905, + "step": 1136 + }, + { + "epoch": 0.11, + "grad_norm": 0.2560158671609372, + "learning_rate": 0.00019998940934094872, + "loss": 1.0585, + "step": 1137 + }, + { + "epoch": 0.11, + "grad_norm": 0.26739898061958195, + "learning_rate": 0.00019998917786214618, + "loss": 1.0375, + "step": 1138 + }, + { + "epoch": 0.11, + "grad_norm": 0.28363806649148315, + "learning_rate": 0.00019998894388105164, + "loss": 1.1372, + "step": 1139 + }, + { + "epoch": 0.11, + "grad_norm": 0.24811695289905492, + "learning_rate": 0.00019998870739767094, + "loss": 0.963, + "step": 1140 + }, + { + "epoch": 0.11, + "grad_norm": 0.28924944955268556, + "learning_rate": 0.00019998846841201, + "loss": 1.084, + "step": 1141 + }, + { + "epoch": 0.11, + "grad_norm": 0.2636078259266071, + "learning_rate": 0.00019998822692407478, + "loss": 1.0698, + "step": 1142 + }, + { + "epoch": 0.11, + "grad_norm": 0.2960715052303091, + "learning_rate": 0.0001999879829338714, + "loss": 1.2418, + "step": 1143 + }, + { + "epoch": 0.11, + "grad_norm": 0.23055523423338184, + "learning_rate": 0.00019998773644140584, + "loss": 1.169, + "step": 1144 + }, + { + "epoch": 0.11, + "grad_norm": 0.3043989132237114, + "learning_rate": 0.00019998748744668436, + "loss": 1.1707, + "step": 1145 + }, + { + "epoch": 0.11, + "grad_norm": 0.2652220574427321, + "learning_rate": 0.00019998723594971316, + "loss": 1.0908, + "step": 1146 + }, + { + "epoch": 0.11, + "grad_norm": 0.3564678490888735, + "learning_rate": 0.00019998698195049857, + "loss": 1.2161, + "step": 1147 + }, + { + "epoch": 0.11, + "grad_norm": 0.31147303496629464, + "learning_rate": 0.0001999867254490469, + "loss": 1.0795, + "step": 1148 + }, + { + "epoch": 0.11, + "grad_norm": 0.33889241778400275, + "learning_rate": 0.00019998646644536457, + "loss": 1.2739, + "step": 1149 + }, + { + "epoch": 0.11, + "grad_norm": 0.27545717862181845, + "learning_rate": 0.00019998620493945807, + "loss": 1.074, + "step": 1150 + }, + { + "epoch": 0.11, + "grad_norm": 0.2576593733645889, + "learning_rate": 0.00019998594093133395, + "loss": 1.1171, + "step": 1151 + }, + { + "epoch": 0.11, + "grad_norm": 0.25688278200109543, + "learning_rate": 0.00019998567442099888, + "loss": 1.081, + "step": 1152 + }, + { + "epoch": 0.11, + "grad_norm": 0.3149168655482506, + "learning_rate": 0.0001999854054084594, + "loss": 1.1463, + "step": 1153 + }, + { + "epoch": 0.11, + "grad_norm": 0.28337563224538714, + "learning_rate": 0.00019998513389372233, + "loss": 1.1503, + "step": 1154 + }, + { + "epoch": 0.11, + "grad_norm": 0.28770905143410885, + "learning_rate": 0.00019998485987679447, + "loss": 1.0847, + "step": 1155 + }, + { + "epoch": 0.11, + "grad_norm": 0.2606963858756736, + "learning_rate": 0.00019998458335768264, + "loss": 1.2108, + "step": 1156 + }, + { + "epoch": 0.11, + "grad_norm": 0.2764798228490211, + "learning_rate": 0.00019998430433639376, + "loss": 1.1206, + "step": 1157 + }, + { + "epoch": 0.11, + "grad_norm": 0.3028071620221027, + "learning_rate": 0.00019998402281293484, + "loss": 1.1628, + "step": 1158 + }, + { + "epoch": 0.11, + "grad_norm": 0.23132033284887418, + "learning_rate": 0.00019998373878731291, + "loss": 1.0603, + "step": 1159 + }, + { + "epoch": 0.11, + "grad_norm": 0.3197463940127305, + "learning_rate": 0.0001999834522595351, + "loss": 1.1337, + "step": 1160 + }, + { + "epoch": 0.11, + "grad_norm": 0.258332321698546, + "learning_rate": 0.00019998316322960853, + "loss": 1.1347, + "step": 1161 + }, + { + "epoch": 0.11, + "grad_norm": 0.37002001593224093, + "learning_rate": 0.00019998287169754045, + "loss": 1.0973, + "step": 1162 + }, + { + "epoch": 0.11, + "grad_norm": 0.35455352761567094, + "learning_rate": 0.00019998257766333822, + "loss": 1.0645, + "step": 1163 + }, + { + "epoch": 0.11, + "grad_norm": 0.25846010518779355, + "learning_rate": 0.00019998228112700912, + "loss": 1.099, + "step": 1164 + }, + { + "epoch": 0.11, + "grad_norm": 0.45574094165617823, + "learning_rate": 0.00019998198208856058, + "loss": 1.2218, + "step": 1165 + }, + { + "epoch": 0.11, + "grad_norm": 0.2806569349689396, + "learning_rate": 0.0001999816805480001, + "loss": 1.163, + "step": 1166 + }, + { + "epoch": 0.11, + "grad_norm": 0.3230556556910955, + "learning_rate": 0.00019998137650533527, + "loss": 1.0275, + "step": 1167 + }, + { + "epoch": 0.11, + "grad_norm": 0.295834882980768, + "learning_rate": 0.0001999810699605736, + "loss": 1.0928, + "step": 1168 + }, + { + "epoch": 0.11, + "grad_norm": 0.2838870309959414, + "learning_rate": 0.0001999807609137229, + "loss": 1.1008, + "step": 1169 + }, + { + "epoch": 0.11, + "grad_norm": 0.3164419453755688, + "learning_rate": 0.00019998044936479076, + "loss": 1.1307, + "step": 1170 + }, + { + "epoch": 0.11, + "grad_norm": 0.22581223994903243, + "learning_rate": 0.00019998013531378504, + "loss": 1.1228, + "step": 1171 + }, + { + "epoch": 0.11, + "grad_norm": 0.2611545463660394, + "learning_rate": 0.00019997981876071364, + "loss": 1.1299, + "step": 1172 + }, + { + "epoch": 0.11, + "grad_norm": 0.2531091040846973, + "learning_rate": 0.00019997949970558437, + "loss": 1.2127, + "step": 1173 + }, + { + "epoch": 0.11, + "grad_norm": 0.3026109003824534, + "learning_rate": 0.00019997917814840537, + "loss": 1.1661, + "step": 1174 + }, + { + "epoch": 0.11, + "grad_norm": 0.3246636144590807, + "learning_rate": 0.00019997885408918454, + "loss": 1.0933, + "step": 1175 + }, + { + "epoch": 0.11, + "grad_norm": 0.2574562133107501, + "learning_rate": 0.0001999785275279301, + "loss": 1.1417, + "step": 1176 + }, + { + "epoch": 0.11, + "grad_norm": 0.2774322857021015, + "learning_rate": 0.00019997819846465014, + "loss": 1.2012, + "step": 1177 + }, + { + "epoch": 0.11, + "grad_norm": 0.2616318983640859, + "learning_rate": 0.00019997786689935292, + "loss": 1.064, + "step": 1178 + }, + { + "epoch": 0.11, + "grad_norm": 0.28254029261985597, + "learning_rate": 0.00019997753283204677, + "loss": 1.0777, + "step": 1179 + }, + { + "epoch": 0.11, + "grad_norm": 0.27452271119130867, + "learning_rate": 0.00019997719626274, + "loss": 1.2698, + "step": 1180 + }, + { + "epoch": 0.11, + "grad_norm": 0.28281251810785174, + "learning_rate": 0.0001999768571914411, + "loss": 1.0866, + "step": 1181 + }, + { + "epoch": 0.11, + "grad_norm": 0.28877654959328175, + "learning_rate": 0.00019997651561815848, + "loss": 1.0607, + "step": 1182 + }, + { + "epoch": 0.11, + "grad_norm": 0.31037965805002504, + "learning_rate": 0.00019997617154290077, + "loss": 1.1267, + "step": 1183 + }, + { + "epoch": 0.11, + "grad_norm": 0.28875914132128616, + "learning_rate": 0.0001999758249656765, + "loss": 1.159, + "step": 1184 + }, + { + "epoch": 0.11, + "grad_norm": 0.26263865798211755, + "learning_rate": 0.00019997547588649438, + "loss": 1.1947, + "step": 1185 + }, + { + "epoch": 0.11, + "grad_norm": 0.25456732695723555, + "learning_rate": 0.00019997512430536314, + "loss": 1.1032, + "step": 1186 + }, + { + "epoch": 0.11, + "grad_norm": 0.32332294350656676, + "learning_rate": 0.00019997477022229158, + "loss": 1.1283, + "step": 1187 + }, + { + "epoch": 0.11, + "grad_norm": 0.3066913219600098, + "learning_rate": 0.00019997441363728857, + "loss": 1.2178, + "step": 1188 + }, + { + "epoch": 0.11, + "grad_norm": 0.2982903122596879, + "learning_rate": 0.00019997405455036304, + "loss": 1.1613, + "step": 1189 + }, + { + "epoch": 0.11, + "grad_norm": 0.3072555573162715, + "learning_rate": 0.00019997369296152396, + "loss": 1.1927, + "step": 1190 + }, + { + "epoch": 0.11, + "grad_norm": 0.27576655968710867, + "learning_rate": 0.00019997332887078034, + "loss": 1.148, + "step": 1191 + }, + { + "epoch": 0.11, + "grad_norm": 0.2651145511693654, + "learning_rate": 0.0001999729622781414, + "loss": 1.061, + "step": 1192 + }, + { + "epoch": 0.11, + "grad_norm": 0.26492079668590324, + "learning_rate": 0.00019997259318361622, + "loss": 1.0942, + "step": 1193 + }, + { + "epoch": 0.11, + "grad_norm": 0.268256812103858, + "learning_rate": 0.00019997222158721405, + "loss": 1.0478, + "step": 1194 + }, + { + "epoch": 0.11, + "grad_norm": 0.29544097416774406, + "learning_rate": 0.00019997184748894422, + "loss": 1.084, + "step": 1195 + }, + { + "epoch": 0.11, + "grad_norm": 0.24315111594316274, + "learning_rate": 0.00019997147088881607, + "loss": 1.1187, + "step": 1196 + }, + { + "epoch": 0.11, + "grad_norm": 0.2887864253869539, + "learning_rate": 0.00019997109178683905, + "loss": 1.1425, + "step": 1197 + }, + { + "epoch": 0.11, + "grad_norm": 0.243613837120699, + "learning_rate": 0.0001999707101830226, + "loss": 1.2192, + "step": 1198 + }, + { + "epoch": 0.11, + "grad_norm": 0.2670339437152679, + "learning_rate": 0.00019997032607737633, + "loss": 0.9346, + "step": 1199 + }, + { + "epoch": 0.11, + "grad_norm": 0.286415306705152, + "learning_rate": 0.0001999699394699098, + "loss": 1.2044, + "step": 1200 + }, + { + "epoch": 0.11, + "grad_norm": 0.2649888516882499, + "learning_rate": 0.0001999695503606327, + "loss": 1.1028, + "step": 1201 + }, + { + "epoch": 0.11, + "grad_norm": 0.2784005327190465, + "learning_rate": 0.00019996915874955477, + "loss": 1.1883, + "step": 1202 + }, + { + "epoch": 0.12, + "grad_norm": 0.2827618352465213, + "learning_rate": 0.00019996876463668586, + "loss": 1.1373, + "step": 1203 + }, + { + "epoch": 0.12, + "grad_norm": 0.27252281665016315, + "learning_rate": 0.00019996836802203575, + "loss": 1.1434, + "step": 1204 + }, + { + "epoch": 0.12, + "grad_norm": 0.2829042974144935, + "learning_rate": 0.00019996796890561438, + "loss": 1.1242, + "step": 1205 + }, + { + "epoch": 0.12, + "grad_norm": 0.25919288560265524, + "learning_rate": 0.0001999675672874318, + "loss": 1.0836, + "step": 1206 + }, + { + "epoch": 0.12, + "grad_norm": 0.2462264710662166, + "learning_rate": 0.00019996716316749802, + "loss": 1.0824, + "step": 1207 + }, + { + "epoch": 0.12, + "grad_norm": 0.24248848464047051, + "learning_rate": 0.00019996675654582313, + "loss": 1.1398, + "step": 1208 + }, + { + "epoch": 0.12, + "grad_norm": 0.2542004323802939, + "learning_rate": 0.00019996634742241732, + "loss": 1.0721, + "step": 1209 + }, + { + "epoch": 0.12, + "grad_norm": 0.2665303881003603, + "learning_rate": 0.0001999659357972909, + "loss": 1.1183, + "step": 1210 + }, + { + "epoch": 0.12, + "grad_norm": 0.2776270813403137, + "learning_rate": 0.00019996552167045407, + "loss": 1.2601, + "step": 1211 + }, + { + "epoch": 0.12, + "grad_norm": 0.3169789236787061, + "learning_rate": 0.00019996510504191722, + "loss": 1.2331, + "step": 1212 + }, + { + "epoch": 0.12, + "grad_norm": 0.247880062307769, + "learning_rate": 0.00019996468591169082, + "loss": 1.1088, + "step": 1213 + }, + { + "epoch": 0.12, + "grad_norm": 0.2743889662351891, + "learning_rate": 0.00019996426427978532, + "loss": 1.0541, + "step": 1214 + }, + { + "epoch": 0.12, + "grad_norm": 0.2911803551557875, + "learning_rate": 0.00019996384014621128, + "loss": 1.0826, + "step": 1215 + }, + { + "epoch": 0.12, + "grad_norm": 0.26095098186965116, + "learning_rate": 0.0001999634135109793, + "loss": 1.3256, + "step": 1216 + }, + { + "epoch": 0.12, + "grad_norm": 0.2856385392323691, + "learning_rate": 0.0001999629843741001, + "loss": 1.1093, + "step": 1217 + }, + { + "epoch": 0.12, + "grad_norm": 0.2530173370522601, + "learning_rate": 0.00019996255273558436, + "loss": 1.0579, + "step": 1218 + }, + { + "epoch": 0.12, + "grad_norm": 0.2549728915097247, + "learning_rate": 0.00019996211859544296, + "loss": 0.9691, + "step": 1219 + }, + { + "epoch": 0.12, + "grad_norm": 0.2799442188519433, + "learning_rate": 0.00019996168195368668, + "loss": 1.0273, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 0.30580035630418173, + "learning_rate": 0.0001999612428103265, + "loss": 1.2193, + "step": 1221 + }, + { + "epoch": 0.12, + "grad_norm": 0.2773508640042361, + "learning_rate": 0.00019996080116537339, + "loss": 1.059, + "step": 1222 + }, + { + "epoch": 0.12, + "grad_norm": 0.2917592835447524, + "learning_rate": 0.0001999603570188384, + "loss": 1.1454, + "step": 1223 + }, + { + "epoch": 0.12, + "grad_norm": 0.27511773778172, + "learning_rate": 0.00019995991037073267, + "loss": 1.0708, + "step": 1224 + }, + { + "epoch": 0.12, + "grad_norm": 0.2299812326773386, + "learning_rate": 0.00019995946122106735, + "loss": 0.9796, + "step": 1225 + }, + { + "epoch": 0.12, + "grad_norm": 0.28024776222182823, + "learning_rate": 0.00019995900956985369, + "loss": 1.1255, + "step": 1226 + }, + { + "epoch": 0.12, + "grad_norm": 0.26814475354624795, + "learning_rate": 0.000199958555417103, + "loss": 1.0957, + "step": 1227 + }, + { + "epoch": 0.12, + "grad_norm": 0.2872677212088015, + "learning_rate": 0.00019995809876282664, + "loss": 1.035, + "step": 1228 + }, + { + "epoch": 0.12, + "grad_norm": 0.2521840908110662, + "learning_rate": 0.00019995763960703605, + "loss": 1.0637, + "step": 1229 + }, + { + "epoch": 0.12, + "grad_norm": 0.40673410489243833, + "learning_rate": 0.00019995717794974268, + "loss": 1.0153, + "step": 1230 + }, + { + "epoch": 0.12, + "grad_norm": 0.31023442713481986, + "learning_rate": 0.0001999567137909581, + "loss": 1.1233, + "step": 1231 + }, + { + "epoch": 0.12, + "grad_norm": 0.2712959430519531, + "learning_rate": 0.000199956247130694, + "loss": 1.1379, + "step": 1232 + }, + { + "epoch": 0.12, + "grad_norm": 0.28625127223535557, + "learning_rate": 0.0001999557779689619, + "loss": 1.2184, + "step": 1233 + }, + { + "epoch": 0.12, + "grad_norm": 0.30076306094072325, + "learning_rate": 0.0001999553063057737, + "loss": 1.1837, + "step": 1234 + }, + { + "epoch": 0.12, + "grad_norm": 0.2653252660175731, + "learning_rate": 0.00019995483214114114, + "loss": 1.0155, + "step": 1235 + }, + { + "epoch": 0.12, + "grad_norm": 0.2752439328777632, + "learning_rate": 0.0001999543554750761, + "loss": 1.1711, + "step": 1236 + }, + { + "epoch": 0.12, + "grad_norm": 0.27211117879122465, + "learning_rate": 0.00019995387630759046, + "loss": 1.1123, + "step": 1237 + }, + { + "epoch": 0.12, + "grad_norm": 0.26487361645343366, + "learning_rate": 0.00019995339463869626, + "loss": 1.074, + "step": 1238 + }, + { + "epoch": 0.12, + "grad_norm": 0.3021321790418319, + "learning_rate": 0.00019995291046840554, + "loss": 1.119, + "step": 1239 + }, + { + "epoch": 0.12, + "grad_norm": 0.2361755130550678, + "learning_rate": 0.00019995242379673041, + "loss": 1.1614, + "step": 1240 + }, + { + "epoch": 0.12, + "grad_norm": 0.254336538394881, + "learning_rate": 0.00019995193462368308, + "loss": 1.0516, + "step": 1241 + }, + { + "epoch": 0.12, + "grad_norm": 0.26752053595877906, + "learning_rate": 0.00019995144294927575, + "loss": 1.0767, + "step": 1242 + }, + { + "epoch": 0.12, + "grad_norm": 0.2597082485713151, + "learning_rate": 0.00019995094877352075, + "loss": 1.0225, + "step": 1243 + }, + { + "epoch": 0.12, + "grad_norm": 0.2455539656845191, + "learning_rate": 0.00019995045209643042, + "loss": 1.1359, + "step": 1244 + }, + { + "epoch": 0.12, + "grad_norm": 0.27400725474036985, + "learning_rate": 0.00019994995291801725, + "loss": 1.1361, + "step": 1245 + }, + { + "epoch": 0.12, + "grad_norm": 0.2924655478762353, + "learning_rate": 0.00019994945123829366, + "loss": 1.1821, + "step": 1246 + }, + { + "epoch": 0.12, + "grad_norm": 0.29710840841514063, + "learning_rate": 0.00019994894705727224, + "loss": 1.2383, + "step": 1247 + }, + { + "epoch": 0.12, + "grad_norm": 0.2813768369686393, + "learning_rate": 0.0001999484403749656, + "loss": 1.0407, + "step": 1248 + }, + { + "epoch": 0.12, + "grad_norm": 0.28144059564025686, + "learning_rate": 0.00019994793119138644, + "loss": 1.2073, + "step": 1249 + }, + { + "epoch": 0.12, + "grad_norm": 0.24328115129403632, + "learning_rate": 0.00019994741950654746, + "loss": 1.1233, + "step": 1250 + }, + { + "epoch": 0.12, + "grad_norm": 0.2779013414605166, + "learning_rate": 0.00019994690532046155, + "loss": 1.1908, + "step": 1251 + }, + { + "epoch": 0.12, + "grad_norm": 0.2939081176965555, + "learning_rate": 0.00019994638863314146, + "loss": 1.0965, + "step": 1252 + }, + { + "epoch": 0.12, + "grad_norm": 0.267694708686987, + "learning_rate": 0.0001999458694446002, + "loss": 1.0733, + "step": 1253 + }, + { + "epoch": 0.12, + "grad_norm": 0.31511116622951807, + "learning_rate": 0.00019994534775485075, + "loss": 1.0266, + "step": 1254 + }, + { + "epoch": 0.12, + "grad_norm": 0.2718983181420745, + "learning_rate": 0.00019994482356390617, + "loss": 1.1895, + "step": 1255 + }, + { + "epoch": 0.12, + "grad_norm": 0.2558317058922546, + "learning_rate": 0.00019994429687177957, + "loss": 1.1239, + "step": 1256 + }, + { + "epoch": 0.12, + "grad_norm": 0.2744763799941594, + "learning_rate": 0.00019994376767848407, + "loss": 1.2122, + "step": 1257 + }, + { + "epoch": 0.12, + "grad_norm": 0.2643964783968129, + "learning_rate": 0.00019994323598403302, + "loss": 1.0028, + "step": 1258 + }, + { + "epoch": 0.12, + "grad_norm": 0.25268007251056396, + "learning_rate": 0.0001999427017884397, + "loss": 1.1376, + "step": 1259 + }, + { + "epoch": 0.12, + "grad_norm": 0.230312980862343, + "learning_rate": 0.0001999421650917174, + "loss": 1.0732, + "step": 1260 + }, + { + "epoch": 0.12, + "grad_norm": 0.2740089140496981, + "learning_rate": 0.00019994162589387964, + "loss": 0.9982, + "step": 1261 + }, + { + "epoch": 0.12, + "grad_norm": 0.2726058611551938, + "learning_rate": 0.0001999410841949399, + "loss": 1.039, + "step": 1262 + }, + { + "epoch": 0.12, + "grad_norm": 0.3233351751878856, + "learning_rate": 0.00019994053999491167, + "loss": 1.2084, + "step": 1263 + }, + { + "epoch": 0.12, + "grad_norm": 0.24718142863026807, + "learning_rate": 0.00019993999329380864, + "loss": 0.9947, + "step": 1264 + }, + { + "epoch": 0.12, + "grad_norm": 0.2877635724046095, + "learning_rate": 0.00019993944409164448, + "loss": 1.1189, + "step": 1265 + }, + { + "epoch": 0.12, + "grad_norm": 0.3002448997033507, + "learning_rate": 0.00019993889238843288, + "loss": 1.0936, + "step": 1266 + }, + { + "epoch": 0.12, + "grad_norm": 0.24948366812390463, + "learning_rate": 0.00019993833818418772, + "loss": 1.1574, + "step": 1267 + }, + { + "epoch": 0.12, + "grad_norm": 0.24037766838141317, + "learning_rate": 0.00019993778147892285, + "loss": 1.1475, + "step": 1268 + }, + { + "epoch": 0.12, + "grad_norm": 0.24578207537112048, + "learning_rate": 0.00019993722227265218, + "loss": 1.1365, + "step": 1269 + }, + { + "epoch": 0.12, + "grad_norm": 0.24088318104194462, + "learning_rate": 0.00019993666056538972, + "loss": 1.0947, + "step": 1270 + }, + { + "epoch": 0.12, + "grad_norm": 0.2921571119742658, + "learning_rate": 0.0001999360963571495, + "loss": 1.0772, + "step": 1271 + }, + { + "epoch": 0.12, + "grad_norm": 0.30049773628170273, + "learning_rate": 0.00019993552964794566, + "loss": 1.2072, + "step": 1272 + }, + { + "epoch": 0.12, + "grad_norm": 0.3160778257013834, + "learning_rate": 0.0001999349604377924, + "loss": 1.0676, + "step": 1273 + }, + { + "epoch": 0.12, + "grad_norm": 0.23884600224412095, + "learning_rate": 0.00019993438872670396, + "loss": 1.0855, + "step": 1274 + }, + { + "epoch": 0.12, + "grad_norm": 0.2615500721708398, + "learning_rate": 0.0001999338145146946, + "loss": 1.1958, + "step": 1275 + }, + { + "epoch": 0.12, + "grad_norm": 0.2591847182045251, + "learning_rate": 0.00019993323780177874, + "loss": 1.0991, + "step": 1276 + }, + { + "epoch": 0.12, + "grad_norm": 0.2800525884700228, + "learning_rate": 0.00019993265858797083, + "loss": 1.0018, + "step": 1277 + }, + { + "epoch": 0.12, + "grad_norm": 0.25703108671920066, + "learning_rate": 0.0001999320768732853, + "loss": 1.0842, + "step": 1278 + }, + { + "epoch": 0.12, + "grad_norm": 0.2814109826464174, + "learning_rate": 0.00019993149265773674, + "loss": 1.1056, + "step": 1279 + }, + { + "epoch": 0.12, + "grad_norm": 0.26560101203311826, + "learning_rate": 0.0001999309059413398, + "loss": 1.1028, + "step": 1280 + }, + { + "epoch": 0.12, + "grad_norm": 0.2592301570333206, + "learning_rate": 0.00019993031672410912, + "loss": 1.2395, + "step": 1281 + }, + { + "epoch": 0.12, + "grad_norm": 0.2903887529235589, + "learning_rate": 0.00019992972500605945, + "loss": 1.2269, + "step": 1282 + }, + { + "epoch": 0.12, + "grad_norm": 0.30985749070799845, + "learning_rate": 0.00019992913078720559, + "loss": 1.0394, + "step": 1283 + }, + { + "epoch": 0.12, + "grad_norm": 0.2427582461596586, + "learning_rate": 0.00019992853406756246, + "loss": 1.0323, + "step": 1284 + }, + { + "epoch": 0.12, + "grad_norm": 0.2674764639506977, + "learning_rate": 0.00019992793484714495, + "loss": 1.0569, + "step": 1285 + }, + { + "epoch": 0.12, + "grad_norm": 0.27159902681019893, + "learning_rate": 0.00019992733312596808, + "loss": 1.0051, + "step": 1286 + }, + { + "epoch": 0.12, + "grad_norm": 0.27222923746834743, + "learning_rate": 0.00019992672890404689, + "loss": 1.1311, + "step": 1287 + }, + { + "epoch": 0.12, + "grad_norm": 0.2557430982813261, + "learning_rate": 0.0001999261221813965, + "loss": 1.121, + "step": 1288 + }, + { + "epoch": 0.12, + "grad_norm": 0.2927850888283984, + "learning_rate": 0.0001999255129580321, + "loss": 1.1497, + "step": 1289 + }, + { + "epoch": 0.12, + "grad_norm": 0.2859310412286254, + "learning_rate": 0.00019992490123396897, + "loss": 1.1786, + "step": 1290 + }, + { + "epoch": 0.12, + "grad_norm": 0.25665923491793874, + "learning_rate": 0.00019992428700922236, + "loss": 1.0947, + "step": 1291 + }, + { + "epoch": 0.12, + "grad_norm": 0.24916615055802435, + "learning_rate": 0.00019992367028380764, + "loss": 1.1687, + "step": 1292 + }, + { + "epoch": 0.12, + "grad_norm": 0.2731992285743491, + "learning_rate": 0.00019992305105774033, + "loss": 1.1253, + "step": 1293 + }, + { + "epoch": 0.12, + "grad_norm": 0.26082758101857634, + "learning_rate": 0.0001999224293310358, + "loss": 1.1551, + "step": 1294 + }, + { + "epoch": 0.12, + "grad_norm": 0.2677306810392585, + "learning_rate": 0.00019992180510370976, + "loss": 1.2005, + "step": 1295 + }, + { + "epoch": 0.12, + "grad_norm": 0.23143531134792353, + "learning_rate": 0.00019992117837577768, + "loss": 1.0862, + "step": 1296 + }, + { + "epoch": 0.12, + "grad_norm": 0.2655840524523936, + "learning_rate": 0.00019992054914725533, + "loss": 1.1536, + "step": 1297 + }, + { + "epoch": 0.12, + "grad_norm": 0.2640787870858716, + "learning_rate": 0.00019991991741815849, + "loss": 1.1011, + "step": 1298 + }, + { + "epoch": 0.12, + "grad_norm": 0.26001791207694314, + "learning_rate": 0.00019991928318850285, + "loss": 1.191, + "step": 1299 + }, + { + "epoch": 0.12, + "grad_norm": 0.34274104078637135, + "learning_rate": 0.0001999186464583044, + "loss": 1.1189, + "step": 1300 + }, + { + "epoch": 0.12, + "grad_norm": 0.25164999546725786, + "learning_rate": 0.000199918007227579, + "loss": 1.1755, + "step": 1301 + }, + { + "epoch": 0.12, + "grad_norm": 0.2619736635468605, + "learning_rate": 0.00019991736549634267, + "loss": 1.0093, + "step": 1302 + }, + { + "epoch": 0.12, + "grad_norm": 0.2610328681252208, + "learning_rate": 0.00019991672126461147, + "loss": 1.2091, + "step": 1303 + }, + { + "epoch": 0.12, + "grad_norm": 0.27609879841972346, + "learning_rate": 0.00019991607453240153, + "loss": 1.1545, + "step": 1304 + }, + { + "epoch": 0.12, + "grad_norm": 0.2438919416348471, + "learning_rate": 0.00019991542529972905, + "loss": 1.0793, + "step": 1305 + }, + { + "epoch": 0.12, + "grad_norm": 0.34769105624706653, + "learning_rate": 0.00019991477356661022, + "loss": 1.0689, + "step": 1306 + }, + { + "epoch": 0.13, + "grad_norm": 0.24988099416617487, + "learning_rate": 0.0001999141193330614, + "loss": 0.9971, + "step": 1307 + }, + { + "epoch": 0.13, + "grad_norm": 0.26587548843318687, + "learning_rate": 0.00019991346259909897, + "loss": 1.0847, + "step": 1308 + }, + { + "epoch": 0.13, + "grad_norm": 0.27946757540245054, + "learning_rate": 0.00019991280336473935, + "loss": 1.137, + "step": 1309 + }, + { + "epoch": 0.13, + "grad_norm": 0.2928663604215012, + "learning_rate": 0.000199912141629999, + "loss": 1.157, + "step": 1310 + }, + { + "epoch": 0.13, + "grad_norm": 0.24871198294790245, + "learning_rate": 0.00019991147739489455, + "loss": 1.0734, + "step": 1311 + }, + { + "epoch": 0.13, + "grad_norm": 0.2384639920280004, + "learning_rate": 0.00019991081065944254, + "loss": 1.0737, + "step": 1312 + }, + { + "epoch": 0.13, + "grad_norm": 0.2656285356842293, + "learning_rate": 0.00019991014142365976, + "loss": 1.0513, + "step": 1313 + }, + { + "epoch": 0.13, + "grad_norm": 0.28562521186701684, + "learning_rate": 0.00019990946968756286, + "loss": 1.1639, + "step": 1314 + }, + { + "epoch": 0.13, + "grad_norm": 0.2648382753477716, + "learning_rate": 0.0001999087954511687, + "loss": 1.1691, + "step": 1315 + }, + { + "epoch": 0.13, + "grad_norm": 0.3183214028731789, + "learning_rate": 0.00019990811871449412, + "loss": 1.1523, + "step": 1316 + }, + { + "epoch": 0.13, + "grad_norm": 0.27420941387779174, + "learning_rate": 0.0001999074394775561, + "loss": 1.0882, + "step": 1317 + }, + { + "epoch": 0.13, + "grad_norm": 0.24879969547328998, + "learning_rate": 0.00019990675774037164, + "loss": 1.0498, + "step": 1318 + }, + { + "epoch": 0.13, + "grad_norm": 0.2738940434550004, + "learning_rate": 0.00019990607350295776, + "loss": 1.145, + "step": 1319 + }, + { + "epoch": 0.13, + "grad_norm": 0.25098756631874913, + "learning_rate": 0.0001999053867653316, + "loss": 1.0748, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 0.285510083056974, + "learning_rate": 0.00019990469752751032, + "loss": 1.0964, + "step": 1321 + }, + { + "epoch": 0.13, + "grad_norm": 0.2528776643597391, + "learning_rate": 0.00019990400578951125, + "loss": 1.1219, + "step": 1322 + }, + { + "epoch": 0.13, + "grad_norm": 0.2737172954753307, + "learning_rate": 0.0001999033115513516, + "loss": 1.144, + "step": 1323 + }, + { + "epoch": 0.13, + "grad_norm": 0.2788543402132724, + "learning_rate": 0.00019990261481304882, + "loss": 1.2348, + "step": 1324 + }, + { + "epoch": 0.13, + "grad_norm": 0.26612476420962583, + "learning_rate": 0.00019990191557462032, + "loss": 1.1158, + "step": 1325 + }, + { + "epoch": 0.13, + "grad_norm": 0.27339420148068405, + "learning_rate": 0.00019990121383608357, + "loss": 1.193, + "step": 1326 + }, + { + "epoch": 0.13, + "grad_norm": 0.2660334700185283, + "learning_rate": 0.0001999005095974562, + "loss": 1.0692, + "step": 1327 + }, + { + "epoch": 0.13, + "grad_norm": 0.2779459146879295, + "learning_rate": 0.00019989980285875576, + "loss": 1.1296, + "step": 1328 + }, + { + "epoch": 0.13, + "grad_norm": 0.30588845858874797, + "learning_rate": 0.00019989909361999998, + "loss": 1.102, + "step": 1329 + }, + { + "epoch": 0.13, + "grad_norm": 0.27026409826467807, + "learning_rate": 0.0001998983818812066, + "loss": 1.0786, + "step": 1330 + }, + { + "epoch": 0.13, + "grad_norm": 0.23833256583341253, + "learning_rate": 0.00019989766764239342, + "loss": 1.1167, + "step": 1331 + }, + { + "epoch": 0.13, + "grad_norm": 0.21010010951243135, + "learning_rate": 0.00019989695090357832, + "loss": 0.9995, + "step": 1332 + }, + { + "epoch": 0.13, + "grad_norm": 0.28853987325226016, + "learning_rate": 0.00019989623166477926, + "loss": 0.9722, + "step": 1333 + }, + { + "epoch": 0.13, + "grad_norm": 0.27581828527816954, + "learning_rate": 0.0001998955099260142, + "loss": 1.1367, + "step": 1334 + }, + { + "epoch": 0.13, + "grad_norm": 0.3266981204077468, + "learning_rate": 0.00019989478568730124, + "loss": 1.0954, + "step": 1335 + }, + { + "epoch": 0.13, + "grad_norm": 0.26634585407279676, + "learning_rate": 0.00019989405894865848, + "loss": 1.0318, + "step": 1336 + }, + { + "epoch": 0.13, + "grad_norm": 0.2862324766372512, + "learning_rate": 0.0001998933297101041, + "loss": 1.2006, + "step": 1337 + }, + { + "epoch": 0.13, + "grad_norm": 0.3219899860952693, + "learning_rate": 0.0001998925979716564, + "loss": 1.2428, + "step": 1338 + }, + { + "epoch": 0.13, + "grad_norm": 0.2593333848699128, + "learning_rate": 0.0001998918637333336, + "loss": 1.0661, + "step": 1339 + }, + { + "epoch": 0.13, + "grad_norm": 0.25460421291647545, + "learning_rate": 0.00019989112699515417, + "loss": 1.1549, + "step": 1340 + }, + { + "epoch": 0.13, + "grad_norm": 0.2806093932847469, + "learning_rate": 0.0001998903877571365, + "loss": 1.1396, + "step": 1341 + }, + { + "epoch": 0.13, + "grad_norm": 0.28375827740366566, + "learning_rate": 0.00019988964601929911, + "loss": 1.1933, + "step": 1342 + }, + { + "epoch": 0.13, + "grad_norm": 0.25635075031051086, + "learning_rate": 0.00019988890178166053, + "loss": 1.0908, + "step": 1343 + }, + { + "epoch": 0.13, + "grad_norm": 0.2888250115582476, + "learning_rate": 0.00019988815504423942, + "loss": 1.1556, + "step": 1344 + }, + { + "epoch": 0.13, + "grad_norm": 0.23088975772081866, + "learning_rate": 0.00019988740580705443, + "loss": 1.1304, + "step": 1345 + }, + { + "epoch": 0.13, + "grad_norm": 0.27596330498232263, + "learning_rate": 0.00019988665407012435, + "loss": 1.11, + "step": 1346 + }, + { + "epoch": 0.13, + "grad_norm": 0.24512542720942831, + "learning_rate": 0.00019988589983346798, + "loss": 1.1509, + "step": 1347 + }, + { + "epoch": 0.13, + "grad_norm": 0.25605979489959907, + "learning_rate": 0.00019988514309710417, + "loss": 1.0923, + "step": 1348 + }, + { + "epoch": 0.13, + "grad_norm": 0.2532094527125873, + "learning_rate": 0.0001998843838610519, + "loss": 1.0768, + "step": 1349 + }, + { + "epoch": 0.13, + "grad_norm": 0.2951636432930729, + "learning_rate": 0.00019988362212533013, + "loss": 1.1431, + "step": 1350 + }, + { + "epoch": 0.13, + "grad_norm": 0.25484076775146086, + "learning_rate": 0.000199882857889958, + "loss": 1.1345, + "step": 1351 + }, + { + "epoch": 0.13, + "grad_norm": 0.23726743985931845, + "learning_rate": 0.0001998820911549545, + "loss": 1.106, + "step": 1352 + }, + { + "epoch": 0.13, + "grad_norm": 0.2587569140280319, + "learning_rate": 0.00019988132192033892, + "loss": 1.1131, + "step": 1353 + }, + { + "epoch": 0.13, + "grad_norm": 0.26259253430908974, + "learning_rate": 0.0001998805501861305, + "loss": 1.1066, + "step": 1354 + }, + { + "epoch": 0.13, + "grad_norm": 0.24656763039460808, + "learning_rate": 0.00019987977595234852, + "loss": 1.1207, + "step": 1355 + }, + { + "epoch": 0.13, + "grad_norm": 0.27728609474508775, + "learning_rate": 0.0001998789992190124, + "loss": 1.0683, + "step": 1356 + }, + { + "epoch": 0.13, + "grad_norm": 0.266917547376331, + "learning_rate": 0.00019987821998614154, + "loss": 1.1693, + "step": 1357 + }, + { + "epoch": 0.13, + "grad_norm": 0.2915661042761893, + "learning_rate": 0.00019987743825375544, + "loss": 1.1064, + "step": 1358 + }, + { + "epoch": 0.13, + "grad_norm": 0.24772664636633338, + "learning_rate": 0.00019987665402187367, + "loss": 0.9948, + "step": 1359 + }, + { + "epoch": 0.13, + "grad_norm": 0.2855087767458927, + "learning_rate": 0.0001998758672905159, + "loss": 1.1449, + "step": 1360 + }, + { + "epoch": 0.13, + "grad_norm": 0.2772392094730354, + "learning_rate": 0.00019987507805970176, + "loss": 1.1349, + "step": 1361 + }, + { + "epoch": 0.13, + "grad_norm": 0.2612704349828971, + "learning_rate": 0.000199874286329451, + "loss": 1.0995, + "step": 1362 + }, + { + "epoch": 0.13, + "grad_norm": 0.24900155104384822, + "learning_rate": 0.00019987349209978352, + "loss": 1.1157, + "step": 1363 + }, + { + "epoch": 0.13, + "grad_norm": 0.26136521449937644, + "learning_rate": 0.0001998726953707191, + "loss": 1.1324, + "step": 1364 + }, + { + "epoch": 0.13, + "grad_norm": 0.27010231238155247, + "learning_rate": 0.0001998718961422777, + "loss": 1.1295, + "step": 1365 + }, + { + "epoch": 0.13, + "grad_norm": 0.28056706441584167, + "learning_rate": 0.00019987109441447934, + "loss": 1.1236, + "step": 1366 + }, + { + "epoch": 0.13, + "grad_norm": 0.24673474376997923, + "learning_rate": 0.00019987029018734407, + "loss": 1.1493, + "step": 1367 + }, + { + "epoch": 0.13, + "grad_norm": 0.25553237785153865, + "learning_rate": 0.00019986948346089201, + "loss": 1.1698, + "step": 1368 + }, + { + "epoch": 0.13, + "grad_norm": 0.2370749606006542, + "learning_rate": 0.0001998686742351434, + "loss": 1.03, + "step": 1369 + }, + { + "epoch": 0.13, + "grad_norm": 0.2786623699017042, + "learning_rate": 0.00019986786251011842, + "loss": 1.0002, + "step": 1370 + }, + { + "epoch": 0.13, + "grad_norm": 0.2582710457594854, + "learning_rate": 0.0001998670482858374, + "loss": 1.1957, + "step": 1371 + }, + { + "epoch": 0.13, + "grad_norm": 0.2414613658655144, + "learning_rate": 0.00019986623156232076, + "loss": 1.1471, + "step": 1372 + }, + { + "epoch": 0.13, + "grad_norm": 0.28959428619565936, + "learning_rate": 0.0001998654123395889, + "loss": 1.1716, + "step": 1373 + }, + { + "epoch": 0.13, + "grad_norm": 0.22250465518264687, + "learning_rate": 0.00019986459061766234, + "loss": 1.146, + "step": 1374 + }, + { + "epoch": 0.13, + "grad_norm": 0.26717391829997156, + "learning_rate": 0.00019986376639656163, + "loss": 1.1858, + "step": 1375 + }, + { + "epoch": 0.13, + "grad_norm": 0.2580258297035148, + "learning_rate": 0.00019986293967630742, + "loss": 1.1115, + "step": 1376 + }, + { + "epoch": 0.13, + "grad_norm": 0.2678948286097016, + "learning_rate": 0.0001998621104569204, + "loss": 1.1202, + "step": 1377 + }, + { + "epoch": 0.13, + "grad_norm": 0.291208905746566, + "learning_rate": 0.00019986127873842128, + "loss": 1.1424, + "step": 1378 + }, + { + "epoch": 0.13, + "grad_norm": 0.21902207523941578, + "learning_rate": 0.00019986044452083087, + "loss": 1.1687, + "step": 1379 + }, + { + "epoch": 0.13, + "grad_norm": 0.2907782173595465, + "learning_rate": 0.00019985960780417012, + "loss": 1.068, + "step": 1380 + }, + { + "epoch": 0.13, + "grad_norm": 0.2629226322026941, + "learning_rate": 0.0001998587685884599, + "loss": 1.0816, + "step": 1381 + }, + { + "epoch": 0.13, + "grad_norm": 0.2828450761974692, + "learning_rate": 0.00019985792687372126, + "loss": 1.0958, + "step": 1382 + }, + { + "epoch": 0.13, + "grad_norm": 0.26457153514550896, + "learning_rate": 0.00019985708265997523, + "loss": 1.1454, + "step": 1383 + }, + { + "epoch": 0.13, + "grad_norm": 0.25857136554982324, + "learning_rate": 0.00019985623594724294, + "loss": 1.074, + "step": 1384 + }, + { + "epoch": 0.13, + "grad_norm": 0.24297032009355968, + "learning_rate": 0.00019985538673554558, + "loss": 1.2311, + "step": 1385 + }, + { + "epoch": 0.13, + "grad_norm": 0.29342141362537133, + "learning_rate": 0.00019985453502490447, + "loss": 1.0765, + "step": 1386 + }, + { + "epoch": 0.13, + "grad_norm": 0.27688151236884495, + "learning_rate": 0.0001998536808153408, + "loss": 1.1203, + "step": 1387 + }, + { + "epoch": 0.13, + "grad_norm": 0.288243343301449, + "learning_rate": 0.000199852824106876, + "loss": 1.0883, + "step": 1388 + }, + { + "epoch": 0.13, + "grad_norm": 0.252259697983258, + "learning_rate": 0.00019985196489953158, + "loss": 1.2147, + "step": 1389 + }, + { + "epoch": 0.13, + "grad_norm": 0.26515095142272627, + "learning_rate": 0.00019985110319332896, + "loss": 1.0793, + "step": 1390 + }, + { + "epoch": 0.13, + "grad_norm": 0.28200900869959167, + "learning_rate": 0.00019985023898828972, + "loss": 1.0851, + "step": 1391 + }, + { + "epoch": 0.13, + "grad_norm": 0.22101241462701582, + "learning_rate": 0.0001998493722844355, + "loss": 1.0172, + "step": 1392 + }, + { + "epoch": 0.13, + "grad_norm": 0.29717507654200653, + "learning_rate": 0.000199848503081788, + "loss": 1.1634, + "step": 1393 + }, + { + "epoch": 0.13, + "grad_norm": 0.2734078285310837, + "learning_rate": 0.00019984763138036893, + "loss": 1.206, + "step": 1394 + }, + { + "epoch": 0.13, + "grad_norm": 0.31430473861156777, + "learning_rate": 0.00019984675718020016, + "loss": 1.1355, + "step": 1395 + }, + { + "epoch": 0.13, + "grad_norm": 0.25927920918090913, + "learning_rate": 0.00019984588048130352, + "loss": 1.0166, + "step": 1396 + }, + { + "epoch": 0.13, + "grad_norm": 0.2943475145072853, + "learning_rate": 0.000199845001283701, + "loss": 1.0848, + "step": 1397 + }, + { + "epoch": 0.13, + "grad_norm": 0.26595828195937876, + "learning_rate": 0.0001998441195874145, + "loss": 1.0692, + "step": 1398 + }, + { + "epoch": 0.13, + "grad_norm": 0.27821823753565383, + "learning_rate": 0.00019984323539246624, + "loss": 1.2192, + "step": 1399 + }, + { + "epoch": 0.13, + "grad_norm": 0.2678637821368645, + "learning_rate": 0.00019984234869887825, + "loss": 1.121, + "step": 1400 + }, + { + "epoch": 0.13, + "grad_norm": 0.25399068189204, + "learning_rate": 0.0001998414595066727, + "loss": 1.0925, + "step": 1401 + }, + { + "epoch": 0.13, + "grad_norm": 0.23859094414183193, + "learning_rate": 0.00019984056781587191, + "loss": 1.0955, + "step": 1402 + }, + { + "epoch": 0.13, + "grad_norm": 0.23969916810545017, + "learning_rate": 0.00019983967362649814, + "loss": 1.0125, + "step": 1403 + }, + { + "epoch": 0.13, + "grad_norm": 0.2542524563052129, + "learning_rate": 0.0001998387769385738, + "loss": 1.0373, + "step": 1404 + }, + { + "epoch": 0.13, + "grad_norm": 0.27487621829351494, + "learning_rate": 0.0001998378777521213, + "loss": 1.0952, + "step": 1405 + }, + { + "epoch": 0.13, + "grad_norm": 0.2654518507826389, + "learning_rate": 0.0001998369760671632, + "loss": 1.1369, + "step": 1406 + }, + { + "epoch": 0.13, + "grad_norm": 0.273289916929377, + "learning_rate": 0.000199836071883722, + "loss": 1.1703, + "step": 1407 + }, + { + "epoch": 0.13, + "grad_norm": 0.26544162174771085, + "learning_rate": 0.0001998351652018204, + "loss": 1.1784, + "step": 1408 + }, + { + "epoch": 0.13, + "grad_norm": 0.2746593440350384, + "learning_rate": 0.000199834256021481, + "loss": 1.0443, + "step": 1409 + }, + { + "epoch": 0.13, + "grad_norm": 0.2501215693772626, + "learning_rate": 0.00019983334434272662, + "loss": 1.089, + "step": 1410 + }, + { + "epoch": 0.13, + "grad_norm": 0.268614062421362, + "learning_rate": 0.00019983243016558007, + "loss": 0.9597, + "step": 1411 + }, + { + "epoch": 0.14, + "grad_norm": 0.25030986160663105, + "learning_rate": 0.00019983151349006417, + "loss": 1.2056, + "step": 1412 + }, + { + "epoch": 0.14, + "grad_norm": 0.2603381688146406, + "learning_rate": 0.00019983059431620195, + "loss": 1.1547, + "step": 1413 + }, + { + "epoch": 0.14, + "grad_norm": 0.2421120755485211, + "learning_rate": 0.0001998296726440163, + "loss": 1.1084, + "step": 1414 + }, + { + "epoch": 0.14, + "grad_norm": 0.2521346105354822, + "learning_rate": 0.00019982874847353043, + "loss": 1.0624, + "step": 1415 + }, + { + "epoch": 0.14, + "grad_norm": 0.27862535658172966, + "learning_rate": 0.00019982782180476733, + "loss": 1.1186, + "step": 1416 + }, + { + "epoch": 0.14, + "grad_norm": 0.2947914398618809, + "learning_rate": 0.00019982689263775026, + "loss": 1.0843, + "step": 1417 + }, + { + "epoch": 0.14, + "grad_norm": 0.28536726878487906, + "learning_rate": 0.0001998259609725025, + "loss": 1.0653, + "step": 1418 + }, + { + "epoch": 0.14, + "grad_norm": 0.2854261291390814, + "learning_rate": 0.00019982502680904732, + "loss": 1.1266, + "step": 1419 + }, + { + "epoch": 0.14, + "grad_norm": 0.2788285385425596, + "learning_rate": 0.0001998240901474081, + "loss": 1.1592, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 0.2919295150525652, + "learning_rate": 0.00019982315098760825, + "loss": 1.23, + "step": 1421 + }, + { + "epoch": 0.14, + "grad_norm": 0.2667438613342299, + "learning_rate": 0.00019982220932967135, + "loss": 1.1653, + "step": 1422 + }, + { + "epoch": 0.14, + "grad_norm": 0.2937856653774384, + "learning_rate": 0.00019982126517362092, + "loss": 1.054, + "step": 1423 + }, + { + "epoch": 0.14, + "grad_norm": 0.2787585191603891, + "learning_rate": 0.0001998203185194806, + "loss": 1.1195, + "step": 1424 + }, + { + "epoch": 0.14, + "grad_norm": 0.2808724466906523, + "learning_rate": 0.00019981936936727402, + "loss": 1.1324, + "step": 1425 + }, + { + "epoch": 0.14, + "grad_norm": 0.24401430651001996, + "learning_rate": 0.00019981841771702505, + "loss": 1.138, + "step": 1426 + }, + { + "epoch": 0.14, + "grad_norm": 0.25174511987554116, + "learning_rate": 0.00019981746356875744, + "loss": 1.1906, + "step": 1427 + }, + { + "epoch": 0.14, + "grad_norm": 0.2538503054933492, + "learning_rate": 0.00019981650692249504, + "loss": 1.1029, + "step": 1428 + }, + { + "epoch": 0.14, + "grad_norm": 0.2764211181550846, + "learning_rate": 0.00019981554777826185, + "loss": 1.058, + "step": 1429 + }, + { + "epoch": 0.14, + "grad_norm": 0.2952219050552689, + "learning_rate": 0.00019981458613608182, + "loss": 1.0941, + "step": 1430 + }, + { + "epoch": 0.14, + "grad_norm": 0.2787064009838231, + "learning_rate": 0.00019981362199597907, + "loss": 1.1565, + "step": 1431 + }, + { + "epoch": 0.14, + "grad_norm": 0.2737728015545912, + "learning_rate": 0.00019981265535797766, + "loss": 1.181, + "step": 1432 + }, + { + "epoch": 0.14, + "grad_norm": 0.24759285916434062, + "learning_rate": 0.00019981168622210184, + "loss": 1.1008, + "step": 1433 + }, + { + "epoch": 0.14, + "grad_norm": 0.25095588755500636, + "learning_rate": 0.00019981071458837586, + "loss": 1.1312, + "step": 1434 + }, + { + "epoch": 0.14, + "grad_norm": 0.24482769279016886, + "learning_rate": 0.00019980974045682399, + "loss": 1.0652, + "step": 1435 + }, + { + "epoch": 0.14, + "grad_norm": 0.3060859989141741, + "learning_rate": 0.00019980876382747064, + "loss": 1.157, + "step": 1436 + }, + { + "epoch": 0.14, + "grad_norm": 0.2880153757669946, + "learning_rate": 0.00019980778470034025, + "loss": 1.2203, + "step": 1437 + }, + { + "epoch": 0.14, + "grad_norm": 0.27454359894274216, + "learning_rate": 0.00019980680307545733, + "loss": 1.0434, + "step": 1438 + }, + { + "epoch": 0.14, + "grad_norm": 0.2883237368727595, + "learning_rate": 0.00019980581895284646, + "loss": 1.1272, + "step": 1439 + }, + { + "epoch": 0.14, + "grad_norm": 0.2703448143010634, + "learning_rate": 0.0001998048323325322, + "loss": 1.0698, + "step": 1440 + }, + { + "epoch": 0.14, + "grad_norm": 0.2583337496657767, + "learning_rate": 0.00019980384321453931, + "loss": 1.0424, + "step": 1441 + }, + { + "epoch": 0.14, + "grad_norm": 0.2698951572905667, + "learning_rate": 0.00019980285159889251, + "loss": 1.1694, + "step": 1442 + }, + { + "epoch": 0.14, + "grad_norm": 0.3289970000039555, + "learning_rate": 0.00019980185748561663, + "loss": 1.1242, + "step": 1443 + }, + { + "epoch": 0.14, + "grad_norm": 0.33181434848256175, + "learning_rate": 0.00019980086087473655, + "loss": 1.2397, + "step": 1444 + }, + { + "epoch": 0.14, + "grad_norm": 0.3011135638992476, + "learning_rate": 0.0001997998617662772, + "loss": 1.1714, + "step": 1445 + }, + { + "epoch": 0.14, + "grad_norm": 0.26023630385034113, + "learning_rate": 0.0001997988601602636, + "loss": 1.1072, + "step": 1446 + }, + { + "epoch": 0.14, + "grad_norm": 0.27251940674998737, + "learning_rate": 0.00019979785605672078, + "loss": 1.0825, + "step": 1447 + }, + { + "epoch": 0.14, + "grad_norm": 0.2554656785010738, + "learning_rate": 0.0001997968494556739, + "loss": 1.0752, + "step": 1448 + }, + { + "epoch": 0.14, + "grad_norm": 0.23684498161731513, + "learning_rate": 0.00019979584035714813, + "loss": 1.06, + "step": 1449 + }, + { + "epoch": 0.14, + "grad_norm": 0.31872148847175163, + "learning_rate": 0.00019979482876116876, + "loss": 1.1667, + "step": 1450 + }, + { + "epoch": 0.14, + "grad_norm": 0.25697407458502153, + "learning_rate": 0.00019979381466776107, + "loss": 1.0822, + "step": 1451 + }, + { + "epoch": 0.14, + "grad_norm": 0.2680749561732018, + "learning_rate": 0.00019979279807695046, + "loss": 1.1473, + "step": 1452 + }, + { + "epoch": 0.14, + "grad_norm": 0.23738996733574236, + "learning_rate": 0.00019979177898876233, + "loss": 1.1218, + "step": 1453 + }, + { + "epoch": 0.14, + "grad_norm": 0.2518660152482727, + "learning_rate": 0.00019979075740322224, + "loss": 1.1364, + "step": 1454 + }, + { + "epoch": 0.14, + "grad_norm": 0.2843340075762548, + "learning_rate": 0.00019978973332035574, + "loss": 1.1052, + "step": 1455 + }, + { + "epoch": 0.14, + "grad_norm": 0.23668848969056627, + "learning_rate": 0.0001997887067401884, + "loss": 1.1692, + "step": 1456 + }, + { + "epoch": 0.14, + "grad_norm": 0.28502539435405605, + "learning_rate": 0.000199787677662746, + "loss": 1.2374, + "step": 1457 + }, + { + "epoch": 0.14, + "grad_norm": 0.25655559740700473, + "learning_rate": 0.00019978664608805423, + "loss": 1.1088, + "step": 1458 + }, + { + "epoch": 0.14, + "grad_norm": 0.2585840843019671, + "learning_rate": 0.00019978561201613895, + "loss": 1.1601, + "step": 1459 + }, + { + "epoch": 0.14, + "grad_norm": 0.2550422041426665, + "learning_rate": 0.00019978457544702602, + "loss": 1.1033, + "step": 1460 + }, + { + "epoch": 0.14, + "grad_norm": 0.2899278429933367, + "learning_rate": 0.00019978353638074137, + "loss": 1.1611, + "step": 1461 + }, + { + "epoch": 0.14, + "grad_norm": 0.234358263645623, + "learning_rate": 0.000199782494817311, + "loss": 1.0151, + "step": 1462 + }, + { + "epoch": 0.14, + "grad_norm": 0.24241424067059672, + "learning_rate": 0.00019978145075676097, + "loss": 1.1854, + "step": 1463 + }, + { + "epoch": 0.14, + "grad_norm": 0.2685159364940143, + "learning_rate": 0.00019978040419911744, + "loss": 1.0691, + "step": 1464 + }, + { + "epoch": 0.14, + "grad_norm": 0.31415333163030407, + "learning_rate": 0.0001997793551444066, + "loss": 1.0344, + "step": 1465 + }, + { + "epoch": 0.14, + "grad_norm": 0.2366061685991993, + "learning_rate": 0.00019977830359265472, + "loss": 1.1004, + "step": 1466 + }, + { + "epoch": 0.14, + "grad_norm": 0.2485341188969311, + "learning_rate": 0.00019977724954388802, + "loss": 1.1078, + "step": 1467 + }, + { + "epoch": 0.14, + "grad_norm": 0.29387809184755664, + "learning_rate": 0.00019977619299813297, + "loss": 1.1114, + "step": 1468 + }, + { + "epoch": 0.14, + "grad_norm": 0.2652962832837201, + "learning_rate": 0.00019977513395541598, + "loss": 1.1429, + "step": 1469 + }, + { + "epoch": 0.14, + "grad_norm": 0.296485600586474, + "learning_rate": 0.00019977407241576355, + "loss": 1.1725, + "step": 1470 + }, + { + "epoch": 0.14, + "grad_norm": 0.25206336596375595, + "learning_rate": 0.00019977300837920227, + "loss": 1.1611, + "step": 1471 + }, + { + "epoch": 0.14, + "grad_norm": 0.2550564024652486, + "learning_rate": 0.00019977194184575873, + "loss": 1.1339, + "step": 1472 + }, + { + "epoch": 0.14, + "grad_norm": 0.27675386051052386, + "learning_rate": 0.00019977087281545966, + "loss": 1.1179, + "step": 1473 + }, + { + "epoch": 0.14, + "grad_norm": 0.2615226908183446, + "learning_rate": 0.00019976980128833178, + "loss": 1.0797, + "step": 1474 + }, + { + "epoch": 0.14, + "grad_norm": 0.268113224944456, + "learning_rate": 0.00019976872726440193, + "loss": 1.1163, + "step": 1475 + }, + { + "epoch": 0.14, + "grad_norm": 0.23912506420391394, + "learning_rate": 0.00019976765074369697, + "loss": 1.099, + "step": 1476 + }, + { + "epoch": 0.14, + "grad_norm": 0.28513459887618126, + "learning_rate": 0.00019976657172624383, + "loss": 1.0811, + "step": 1477 + }, + { + "epoch": 0.14, + "grad_norm": 0.28114711060770275, + "learning_rate": 0.00019976549021206958, + "loss": 1.2168, + "step": 1478 + }, + { + "epoch": 0.14, + "grad_norm": 0.2572724456046117, + "learning_rate": 0.0001997644062012012, + "loss": 1.0752, + "step": 1479 + }, + { + "epoch": 0.14, + "grad_norm": 0.3019482336468289, + "learning_rate": 0.00019976331969366587, + "loss": 1.1129, + "step": 1480 + }, + { + "epoch": 0.14, + "grad_norm": 0.2566195486989078, + "learning_rate": 0.00019976223068949076, + "loss": 1.1825, + "step": 1481 + }, + { + "epoch": 0.14, + "grad_norm": 0.27980667259100733, + "learning_rate": 0.00019976113918870314, + "loss": 1.1828, + "step": 1482 + }, + { + "epoch": 0.14, + "grad_norm": 0.30150809201942436, + "learning_rate": 0.0001997600451913303, + "loss": 1.1247, + "step": 1483 + }, + { + "epoch": 0.14, + "grad_norm": 0.2716498140574145, + "learning_rate": 0.0001997589486973996, + "loss": 1.1254, + "step": 1484 + }, + { + "epoch": 0.14, + "grad_norm": 0.2625386839698671, + "learning_rate": 0.00019975784970693855, + "loss": 1.1081, + "step": 1485 + }, + { + "epoch": 0.14, + "grad_norm": 0.29792002537853113, + "learning_rate": 0.00019975674821997463, + "loss": 1.1597, + "step": 1486 + }, + { + "epoch": 0.14, + "grad_norm": 0.2301276867707596, + "learning_rate": 0.00019975564423653538, + "loss": 1.1342, + "step": 1487 + }, + { + "epoch": 0.14, + "grad_norm": 0.27119741051691004, + "learning_rate": 0.00019975453775664844, + "loss": 1.0453, + "step": 1488 + }, + { + "epoch": 0.14, + "grad_norm": 0.25640967867486597, + "learning_rate": 0.0001997534287803415, + "loss": 1.0959, + "step": 1489 + }, + { + "epoch": 0.14, + "grad_norm": 0.2817881208330018, + "learning_rate": 0.00019975231730764227, + "loss": 1.1004, + "step": 1490 + }, + { + "epoch": 0.14, + "grad_norm": 0.2632972390210799, + "learning_rate": 0.00019975120333857866, + "loss": 1.0682, + "step": 1491 + }, + { + "epoch": 0.14, + "grad_norm": 0.256251122254461, + "learning_rate": 0.0001997500868731785, + "loss": 1.1663, + "step": 1492 + }, + { + "epoch": 0.14, + "grad_norm": 0.26242859708220295, + "learning_rate": 0.0001997489679114697, + "loss": 1.0843, + "step": 1493 + }, + { + "epoch": 0.14, + "grad_norm": 0.28009382446859793, + "learning_rate": 0.0001997478464534803, + "loss": 1.1744, + "step": 1494 + }, + { + "epoch": 0.14, + "grad_norm": 0.2448192879010323, + "learning_rate": 0.0001997467224992383, + "loss": 1.1268, + "step": 1495 + }, + { + "epoch": 0.14, + "grad_norm": 0.23428824139945728, + "learning_rate": 0.00019974559604877195, + "loss": 1.0997, + "step": 1496 + }, + { + "epoch": 0.14, + "grad_norm": 0.2833166934983789, + "learning_rate": 0.00019974446710210934, + "loss": 1.0867, + "step": 1497 + }, + { + "epoch": 0.14, + "grad_norm": 0.25022212744952455, + "learning_rate": 0.00019974333565927878, + "loss": 1.0903, + "step": 1498 + }, + { + "epoch": 0.14, + "grad_norm": 0.26855972657496696, + "learning_rate": 0.00019974220172030852, + "loss": 1.0304, + "step": 1499 + }, + { + "epoch": 0.14, + "grad_norm": 0.26473122846509034, + "learning_rate": 0.000199741065285227, + "loss": 1.0981, + "step": 1500 + }, + { + "epoch": 0.14, + "grad_norm": 0.2915333904654244, + "learning_rate": 0.00019973992635406265, + "loss": 1.157, + "step": 1501 + }, + { + "epoch": 0.14, + "grad_norm": 0.37217781009539613, + "learning_rate": 0.00019973878492684393, + "loss": 1.1401, + "step": 1502 + }, + { + "epoch": 0.14, + "grad_norm": 0.2745430144825149, + "learning_rate": 0.00019973764100359942, + "loss": 1.1318, + "step": 1503 + }, + { + "epoch": 0.14, + "grad_norm": 0.2755977808247509, + "learning_rate": 0.0001997364945843578, + "loss": 1.1609, + "step": 1504 + }, + { + "epoch": 0.14, + "grad_norm": 0.2915733276431069, + "learning_rate": 0.00019973534566914772, + "loss": 1.086, + "step": 1505 + }, + { + "epoch": 0.14, + "grad_norm": 0.2599783190286603, + "learning_rate": 0.00019973419425799792, + "loss": 1.1108, + "step": 1506 + }, + { + "epoch": 0.14, + "grad_norm": 0.280753933923634, + "learning_rate": 0.0001997330403509372, + "loss": 1.147, + "step": 1507 + }, + { + "epoch": 0.14, + "grad_norm": 0.27055165527737557, + "learning_rate": 0.0001997318839479945, + "loss": 1.0952, + "step": 1508 + }, + { + "epoch": 0.14, + "grad_norm": 0.2491475388699138, + "learning_rate": 0.00019973072504919875, + "loss": 1.0893, + "step": 1509 + }, + { + "epoch": 0.14, + "grad_norm": 0.22529007248240643, + "learning_rate": 0.00019972956365457887, + "loss": 0.9453, + "step": 1510 + }, + { + "epoch": 0.14, + "grad_norm": 0.29669549846388793, + "learning_rate": 0.000199728399764164, + "loss": 1.2225, + "step": 1511 + }, + { + "epoch": 0.14, + "grad_norm": 0.27716026577726227, + "learning_rate": 0.00019972723337798327, + "loss": 1.2158, + "step": 1512 + }, + { + "epoch": 0.14, + "grad_norm": 0.2615727080933855, + "learning_rate": 0.00019972606449606583, + "loss": 1.1745, + "step": 1513 + }, + { + "epoch": 0.14, + "grad_norm": 0.2613301650841691, + "learning_rate": 0.00019972489311844097, + "loss": 1.0105, + "step": 1514 + }, + { + "epoch": 0.14, + "grad_norm": 0.2637350666912871, + "learning_rate": 0.00019972371924513796, + "loss": 1.2445, + "step": 1515 + }, + { + "epoch": 0.15, + "grad_norm": 0.23442111345358183, + "learning_rate": 0.0001997225428761862, + "loss": 1.0897, + "step": 1516 + }, + { + "epoch": 0.15, + "grad_norm": 0.2790398743404188, + "learning_rate": 0.00019972136401161516, + "loss": 1.0976, + "step": 1517 + }, + { + "epoch": 0.15, + "grad_norm": 0.24161573374816125, + "learning_rate": 0.00019972018265145428, + "loss": 1.0778, + "step": 1518 + }, + { + "epoch": 0.15, + "grad_norm": 0.2599798006860897, + "learning_rate": 0.00019971899879573317, + "loss": 1.0797, + "step": 1519 + }, + { + "epoch": 0.15, + "grad_norm": 0.24106641987255334, + "learning_rate": 0.00019971781244448145, + "loss": 1.0863, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 0.28462310372801436, + "learning_rate": 0.0001997166235977288, + "loss": 1.2261, + "step": 1521 + }, + { + "epoch": 0.15, + "grad_norm": 0.2715980154291507, + "learning_rate": 0.00019971543225550498, + "loss": 1.074, + "step": 1522 + }, + { + "epoch": 0.15, + "grad_norm": 0.24961681655649778, + "learning_rate": 0.0001997142384178398, + "loss": 1.102, + "step": 1523 + }, + { + "epoch": 0.15, + "grad_norm": 0.27808524258122086, + "learning_rate": 0.00019971304208476313, + "loss": 1.0456, + "step": 1524 + }, + { + "epoch": 0.15, + "grad_norm": 0.272393710564596, + "learning_rate": 0.0001997118432563049, + "loss": 0.9979, + "step": 1525 + }, + { + "epoch": 0.15, + "grad_norm": 0.2985225834683353, + "learning_rate": 0.00019971064193249517, + "loss": 1.1062, + "step": 1526 + }, + { + "epoch": 0.15, + "grad_norm": 0.31751577230140715, + "learning_rate": 0.0001997094381133639, + "loss": 1.1574, + "step": 1527 + }, + { + "epoch": 0.15, + "grad_norm": 0.2311208784177862, + "learning_rate": 0.00019970823179894134, + "loss": 1.0605, + "step": 1528 + }, + { + "epoch": 0.15, + "grad_norm": 0.27395265358450693, + "learning_rate": 0.00019970702298925756, + "loss": 1.0394, + "step": 1529 + }, + { + "epoch": 0.15, + "grad_norm": 0.31050969831593717, + "learning_rate": 0.0001997058116843429, + "loss": 1.21, + "step": 1530 + }, + { + "epoch": 0.15, + "grad_norm": 0.29740261342037966, + "learning_rate": 0.00019970459788422762, + "loss": 1.0973, + "step": 1531 + }, + { + "epoch": 0.15, + "grad_norm": 0.30710726934897853, + "learning_rate": 0.00019970338158894213, + "loss": 1.0899, + "step": 1532 + }, + { + "epoch": 0.15, + "grad_norm": 0.38054688041895546, + "learning_rate": 0.00019970216279851686, + "loss": 1.0608, + "step": 1533 + }, + { + "epoch": 0.15, + "grad_norm": 0.26727890820952926, + "learning_rate": 0.0001997009415129823, + "loss": 1.1018, + "step": 1534 + }, + { + "epoch": 0.15, + "grad_norm": 0.29384471950665425, + "learning_rate": 0.000199699717732369, + "loss": 1.088, + "step": 1535 + }, + { + "epoch": 0.15, + "grad_norm": 0.245541461264911, + "learning_rate": 0.00019969849145670763, + "loss": 1.0829, + "step": 1536 + }, + { + "epoch": 0.15, + "grad_norm": 0.3031137108785239, + "learning_rate": 0.0001996972626860288, + "loss": 1.138, + "step": 1537 + }, + { + "epoch": 0.15, + "grad_norm": 0.27477763946901834, + "learning_rate": 0.0001996960314203634, + "loss": 1.1612, + "step": 1538 + }, + { + "epoch": 0.15, + "grad_norm": 0.2970092386217553, + "learning_rate": 0.0001996947976597421, + "loss": 1.0688, + "step": 1539 + }, + { + "epoch": 0.15, + "grad_norm": 0.3258862801086761, + "learning_rate": 0.00019969356140419584, + "loss": 1.1302, + "step": 1540 + }, + { + "epoch": 0.15, + "grad_norm": 0.2995257919569518, + "learning_rate": 0.00019969232265375556, + "loss": 1.0475, + "step": 1541 + }, + { + "epoch": 0.15, + "grad_norm": 0.26050135512792294, + "learning_rate": 0.00019969108140845224, + "loss": 1.1356, + "step": 1542 + }, + { + "epoch": 0.15, + "grad_norm": 0.258577882146825, + "learning_rate": 0.00019968983766831695, + "loss": 1.0545, + "step": 1543 + }, + { + "epoch": 0.15, + "grad_norm": 0.2627007677252112, + "learning_rate": 0.00019968859143338084, + "loss": 1.1386, + "step": 1544 + }, + { + "epoch": 0.15, + "grad_norm": 0.2633365267212909, + "learning_rate": 0.00019968734270367505, + "loss": 1.1784, + "step": 1545 + }, + { + "epoch": 0.15, + "grad_norm": 0.28411120690910896, + "learning_rate": 0.0001996860914792309, + "loss": 1.0888, + "step": 1546 + }, + { + "epoch": 0.15, + "grad_norm": 0.26646479947544477, + "learning_rate": 0.00019968483776007962, + "loss": 1.1659, + "step": 1547 + }, + { + "epoch": 0.15, + "grad_norm": 0.24786146098955286, + "learning_rate": 0.00019968358154625265, + "loss": 1.2185, + "step": 1548 + }, + { + "epoch": 0.15, + "grad_norm": 0.23607348527883257, + "learning_rate": 0.0001996823228377814, + "loss": 1.0735, + "step": 1549 + }, + { + "epoch": 0.15, + "grad_norm": 0.2570468565891483, + "learning_rate": 0.00019968106163469735, + "loss": 1.0616, + "step": 1550 + }, + { + "epoch": 0.15, + "grad_norm": 0.25852020457845143, + "learning_rate": 0.00019967979793703212, + "loss": 1.093, + "step": 1551 + }, + { + "epoch": 0.15, + "grad_norm": 0.28278864970040046, + "learning_rate": 0.00019967853174481727, + "loss": 1.0388, + "step": 1552 + }, + { + "epoch": 0.15, + "grad_norm": 0.2941535073545967, + "learning_rate": 0.00019967726305808453, + "loss": 1.0706, + "step": 1553 + }, + { + "epoch": 0.15, + "grad_norm": 0.2622528773036849, + "learning_rate": 0.00019967599187686562, + "loss": 1.0883, + "step": 1554 + }, + { + "epoch": 0.15, + "grad_norm": 0.27742024001398574, + "learning_rate": 0.00019967471820119242, + "loss": 1.0728, + "step": 1555 + }, + { + "epoch": 0.15, + "grad_norm": 0.3052593236911406, + "learning_rate": 0.00019967344203109671, + "loss": 1.1789, + "step": 1556 + }, + { + "epoch": 0.15, + "grad_norm": 0.28599434696605697, + "learning_rate": 0.0001996721633666105, + "loss": 1.1562, + "step": 1557 + }, + { + "epoch": 0.15, + "grad_norm": 0.24494683600954692, + "learning_rate": 0.00019967088220776573, + "loss": 1.2048, + "step": 1558 + }, + { + "epoch": 0.15, + "grad_norm": 0.25067138267095485, + "learning_rate": 0.0001996695985545945, + "loss": 1.1256, + "step": 1559 + }, + { + "epoch": 0.15, + "grad_norm": 0.3027501173185624, + "learning_rate": 0.00019966831240712893, + "loss": 1.1089, + "step": 1560 + }, + { + "epoch": 0.15, + "grad_norm": 0.29832312031178204, + "learning_rate": 0.0001996670237654012, + "loss": 1.1435, + "step": 1561 + }, + { + "epoch": 0.15, + "grad_norm": 0.2602686000520101, + "learning_rate": 0.00019966573262944357, + "loss": 1.2332, + "step": 1562 + }, + { + "epoch": 0.15, + "grad_norm": 0.2805057631189005, + "learning_rate": 0.00019966443899928831, + "loss": 1.0837, + "step": 1563 + }, + { + "epoch": 0.15, + "grad_norm": 0.38166616945412846, + "learning_rate": 0.00019966314287496787, + "loss": 1.059, + "step": 1564 + }, + { + "epoch": 0.15, + "grad_norm": 0.2764938914379305, + "learning_rate": 0.00019966184425651464, + "loss": 1.1139, + "step": 1565 + }, + { + "epoch": 0.15, + "grad_norm": 0.24932842836394434, + "learning_rate": 0.0001996605431439611, + "loss": 1.1142, + "step": 1566 + }, + { + "epoch": 0.15, + "grad_norm": 0.29196882597427254, + "learning_rate": 0.00019965923953733987, + "loss": 1.1607, + "step": 1567 + }, + { + "epoch": 0.15, + "grad_norm": 0.2709530754699249, + "learning_rate": 0.00019965793343668347, + "loss": 1.0495, + "step": 1568 + }, + { + "epoch": 0.15, + "grad_norm": 0.2598432982826659, + "learning_rate": 0.0001996566248420247, + "loss": 1.1288, + "step": 1569 + }, + { + "epoch": 0.15, + "grad_norm": 0.46312766196550453, + "learning_rate": 0.00019965531375339628, + "loss": 1.1307, + "step": 1570 + }, + { + "epoch": 0.15, + "grad_norm": 0.26262906846828266, + "learning_rate": 0.00019965400017083097, + "loss": 1.1543, + "step": 1571 + }, + { + "epoch": 0.15, + "grad_norm": 0.25314149624310883, + "learning_rate": 0.00019965268409436168, + "loss": 1.0466, + "step": 1572 + }, + { + "epoch": 0.15, + "grad_norm": 0.2687356408453828, + "learning_rate": 0.00019965136552402136, + "loss": 1.1159, + "step": 1573 + }, + { + "epoch": 0.15, + "grad_norm": 0.2708428789609682, + "learning_rate": 0.00019965004445984298, + "loss": 1.055, + "step": 1574 + }, + { + "epoch": 0.15, + "grad_norm": 0.3839155117397319, + "learning_rate": 0.0001996487209018596, + "loss": 1.0844, + "step": 1575 + }, + { + "epoch": 0.15, + "grad_norm": 0.7734934544648255, + "learning_rate": 0.00019964739485010436, + "loss": 1.1704, + "step": 1576 + }, + { + "epoch": 0.15, + "grad_norm": 0.2985808342595769, + "learning_rate": 0.00019964606630461042, + "loss": 1.1233, + "step": 1577 + }, + { + "epoch": 0.15, + "grad_norm": 0.28199357321934904, + "learning_rate": 0.00019964473526541107, + "loss": 1.1306, + "step": 1578 + }, + { + "epoch": 0.15, + "grad_norm": 0.2708719269403245, + "learning_rate": 0.0001996434017325396, + "loss": 1.2283, + "step": 1579 + }, + { + "epoch": 0.15, + "grad_norm": 0.2792985547025937, + "learning_rate": 0.00019964206570602936, + "loss": 1.1385, + "step": 1580 + }, + { + "epoch": 0.15, + "grad_norm": 0.23415050247231725, + "learning_rate": 0.0001996407271859138, + "loss": 1.0389, + "step": 1581 + }, + { + "epoch": 0.15, + "grad_norm": 0.2740160527199919, + "learning_rate": 0.00019963938617222643, + "loss": 1.1243, + "step": 1582 + }, + { + "epoch": 0.15, + "grad_norm": 0.27399387661110913, + "learning_rate": 0.0001996380426650008, + "loss": 1.1087, + "step": 1583 + }, + { + "epoch": 0.15, + "grad_norm": 0.2692759658009343, + "learning_rate": 0.0001996366966642705, + "loss": 1.1163, + "step": 1584 + }, + { + "epoch": 0.15, + "grad_norm": 0.9607464989916183, + "learning_rate": 0.0001996353481700693, + "loss": 1.0442, + "step": 1585 + }, + { + "epoch": 0.15, + "grad_norm": 0.24459484067831805, + "learning_rate": 0.00019963399718243084, + "loss": 1.0841, + "step": 1586 + }, + { + "epoch": 0.15, + "grad_norm": 0.2762521737160385, + "learning_rate": 0.00019963264370138903, + "loss": 1.1496, + "step": 1587 + }, + { + "epoch": 0.15, + "grad_norm": 0.25933505174178323, + "learning_rate": 0.0001996312877269777, + "loss": 1.2522, + "step": 1588 + }, + { + "epoch": 0.15, + "grad_norm": 0.27189774149978774, + "learning_rate": 0.00019962992925923073, + "loss": 1.1413, + "step": 1589 + }, + { + "epoch": 0.15, + "grad_norm": 3.576402306240423, + "learning_rate": 0.00019962856829818223, + "loss": 1.1702, + "step": 1590 + }, + { + "epoch": 0.15, + "grad_norm": 0.28151915869781446, + "learning_rate": 0.00019962720484386614, + "loss": 1.1608, + "step": 1591 + }, + { + "epoch": 0.15, + "grad_norm": 3.487170407807799, + "learning_rate": 0.00019962583889631663, + "loss": 1.1038, + "step": 1592 + }, + { + "epoch": 0.15, + "grad_norm": 0.25611542436265444, + "learning_rate": 0.00019962447045556792, + "loss": 1.0629, + "step": 1593 + }, + { + "epoch": 0.15, + "grad_norm": 0.24414987846237798, + "learning_rate": 0.00019962309952165425, + "loss": 1.0264, + "step": 1594 + }, + { + "epoch": 0.15, + "grad_norm": 0.26011148045017146, + "learning_rate": 0.00019962172609460982, + "loss": 1.0993, + "step": 1595 + }, + { + "epoch": 0.15, + "grad_norm": 0.2618849192631391, + "learning_rate": 0.00019962035017446916, + "loss": 1.1054, + "step": 1596 + }, + { + "epoch": 0.15, + "grad_norm": 0.24456979964789494, + "learning_rate": 0.0001996189717612666, + "loss": 1.1605, + "step": 1597 + }, + { + "epoch": 0.15, + "grad_norm": 0.2832924711395636, + "learning_rate": 0.00019961759085503666, + "loss": 1.1245, + "step": 1598 + }, + { + "epoch": 0.15, + "grad_norm": 0.24882178646084718, + "learning_rate": 0.00019961620745581387, + "loss": 1.0725, + "step": 1599 + }, + { + "epoch": 0.15, + "grad_norm": 0.22925300602806323, + "learning_rate": 0.00019961482156363296, + "loss": 1.0953, + "step": 1600 + }, + { + "epoch": 0.15, + "grad_norm": 0.2531531358177607, + "learning_rate": 0.00019961343317852846, + "loss": 1.0204, + "step": 1601 + }, + { + "epoch": 0.15, + "grad_norm": 0.2738032960535575, + "learning_rate": 0.00019961204230053525, + "loss": 1.1092, + "step": 1602 + }, + { + "epoch": 0.15, + "grad_norm": 0.2731400838916497, + "learning_rate": 0.00019961064892968806, + "loss": 1.1173, + "step": 1603 + }, + { + "epoch": 0.15, + "grad_norm": 0.2493741546517925, + "learning_rate": 0.00019960925306602176, + "loss": 1.0769, + "step": 1604 + }, + { + "epoch": 0.15, + "grad_norm": 0.25050950474007155, + "learning_rate": 0.0001996078547095713, + "loss": 1.133, + "step": 1605 + }, + { + "epoch": 0.15, + "grad_norm": 0.29817105037980673, + "learning_rate": 0.0001996064538603717, + "loss": 1.1855, + "step": 1606 + }, + { + "epoch": 0.15, + "grad_norm": 0.28263632425868207, + "learning_rate": 0.00019960505051845796, + "loss": 1.136, + "step": 1607 + }, + { + "epoch": 0.15, + "grad_norm": 0.23133396852491225, + "learning_rate": 0.00019960364468386526, + "loss": 0.9476, + "step": 1608 + }, + { + "epoch": 0.15, + "grad_norm": 0.2775442108767544, + "learning_rate": 0.00019960223635662874, + "loss": 1.1606, + "step": 1609 + }, + { + "epoch": 0.15, + "grad_norm": 0.2805090162171342, + "learning_rate": 0.00019960082553678365, + "loss": 1.143, + "step": 1610 + }, + { + "epoch": 0.15, + "grad_norm": 0.2537997461349602, + "learning_rate": 0.0001995994122243653, + "loss": 1.1834, + "step": 1611 + }, + { + "epoch": 0.15, + "grad_norm": 0.32560983069595756, + "learning_rate": 0.00019959799641940907, + "loss": 0.9919, + "step": 1612 + }, + { + "epoch": 0.15, + "grad_norm": 0.2777624913037504, + "learning_rate": 0.0001995965781219504, + "loss": 1.1872, + "step": 1613 + }, + { + "epoch": 0.15, + "grad_norm": 0.2758857420110973, + "learning_rate": 0.00019959515733202477, + "loss": 1.137, + "step": 1614 + }, + { + "epoch": 0.15, + "grad_norm": 0.256755743287946, + "learning_rate": 0.0001995937340496677, + "loss": 1.0922, + "step": 1615 + }, + { + "epoch": 0.15, + "grad_norm": 0.2517079524462437, + "learning_rate": 0.00019959230827491488, + "loss": 1.0859, + "step": 1616 + }, + { + "epoch": 0.15, + "grad_norm": 0.2724013994730525, + "learning_rate": 0.00019959088000780193, + "loss": 1.0288, + "step": 1617 + }, + { + "epoch": 0.15, + "grad_norm": 0.3008600441450719, + "learning_rate": 0.00019958944924836463, + "loss": 1.044, + "step": 1618 + }, + { + "epoch": 0.15, + "grad_norm": 0.29350285017468836, + "learning_rate": 0.00019958801599663877, + "loss": 1.1309, + "step": 1619 + }, + { + "epoch": 0.15, + "grad_norm": 0.26935905493782025, + "learning_rate": 0.0001995865802526602, + "loss": 1.1497, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 0.2619749727571851, + "learning_rate": 0.0001995851420164649, + "loss": 1.226, + "step": 1621 + }, + { + "epoch": 0.16, + "grad_norm": 0.2566559504175784, + "learning_rate": 0.00019958370128808883, + "loss": 1.126, + "step": 1622 + }, + { + "epoch": 0.16, + "grad_norm": 0.2950868154557528, + "learning_rate": 0.00019958225806756806, + "loss": 1.1165, + "step": 1623 + }, + { + "epoch": 0.16, + "grad_norm": 0.262703858238673, + "learning_rate": 0.00019958081235493867, + "loss": 1.1535, + "step": 1624 + }, + { + "epoch": 0.16, + "grad_norm": 0.27043560750628914, + "learning_rate": 0.00019957936415023687, + "loss": 1.1192, + "step": 1625 + }, + { + "epoch": 0.16, + "grad_norm": 0.263242304281921, + "learning_rate": 0.00019957791345349892, + "loss": 1.0326, + "step": 1626 + }, + { + "epoch": 0.16, + "grad_norm": 0.2815449484727248, + "learning_rate": 0.0001995764602647611, + "loss": 1.0835, + "step": 1627 + }, + { + "epoch": 0.16, + "grad_norm": 0.2868280654211211, + "learning_rate": 0.00019957500458405976, + "loss": 1.1983, + "step": 1628 + }, + { + "epoch": 0.16, + "grad_norm": 0.31775561559603943, + "learning_rate": 0.00019957354641143136, + "loss": 1.163, + "step": 1629 + }, + { + "epoch": 0.16, + "grad_norm": 0.2962143104484358, + "learning_rate": 0.00019957208574691238, + "loss": 1.2085, + "step": 1630 + }, + { + "epoch": 0.16, + "grad_norm": 0.33134737467969283, + "learning_rate": 0.0001995706225905394, + "loss": 1.1736, + "step": 1631 + }, + { + "epoch": 0.16, + "grad_norm": 0.2973914160477879, + "learning_rate": 0.00019956915694234895, + "loss": 1.1877, + "step": 1632 + }, + { + "epoch": 0.16, + "grad_norm": 0.23797305266943983, + "learning_rate": 0.00019956768880237781, + "loss": 1.13, + "step": 1633 + }, + { + "epoch": 0.16, + "grad_norm": 0.28430135940967705, + "learning_rate": 0.0001995662181706627, + "loss": 1.1628, + "step": 1634 + }, + { + "epoch": 0.16, + "grad_norm": 0.2606274355294148, + "learning_rate": 0.00019956474504724038, + "loss": 1.1124, + "step": 1635 + }, + { + "epoch": 0.16, + "grad_norm": 0.26540643513755174, + "learning_rate": 0.00019956326943214775, + "loss": 1.0509, + "step": 1636 + }, + { + "epoch": 0.16, + "grad_norm": 0.25782459518811457, + "learning_rate": 0.00019956179132542173, + "loss": 1.0932, + "step": 1637 + }, + { + "epoch": 0.16, + "grad_norm": 0.24697183220142635, + "learning_rate": 0.00019956031072709932, + "loss": 1.2274, + "step": 1638 + }, + { + "epoch": 0.16, + "grad_norm": 0.2609501474414402, + "learning_rate": 0.0001995588276372175, + "loss": 1.0552, + "step": 1639 + }, + { + "epoch": 0.16, + "grad_norm": 0.2649049635321031, + "learning_rate": 0.00019955734205581352, + "loss": 0.9704, + "step": 1640 + }, + { + "epoch": 0.16, + "grad_norm": 0.2593017975988553, + "learning_rate": 0.00019955585398292447, + "loss": 1.1551, + "step": 1641 + }, + { + "epoch": 0.16, + "grad_norm": 0.25956189155243276, + "learning_rate": 0.0001995543634185876, + "loss": 1.1989, + "step": 1642 + }, + { + "epoch": 0.16, + "grad_norm": 0.2560637809135954, + "learning_rate": 0.0001995528703628402, + "loss": 1.1312, + "step": 1643 + }, + { + "epoch": 0.16, + "grad_norm": 0.27697911347482446, + "learning_rate": 0.00019955137481571968, + "loss": 1.2054, + "step": 1644 + }, + { + "epoch": 0.16, + "grad_norm": 0.24778928635339068, + "learning_rate": 0.00019954987677726343, + "loss": 1.1358, + "step": 1645 + }, + { + "epoch": 0.16, + "grad_norm": 0.26610781310481363, + "learning_rate": 0.00019954837624750895, + "loss": 1.1007, + "step": 1646 + }, + { + "epoch": 0.16, + "grad_norm": 0.2799296986376897, + "learning_rate": 0.0001995468732264938, + "loss": 1.0621, + "step": 1647 + }, + { + "epoch": 0.16, + "grad_norm": 0.26342475793330683, + "learning_rate": 0.00019954536771425556, + "loss": 1.1325, + "step": 1648 + }, + { + "epoch": 0.16, + "grad_norm": 0.2809166549010256, + "learning_rate": 0.00019954385971083193, + "loss": 1.0778, + "step": 1649 + }, + { + "epoch": 0.16, + "grad_norm": 0.28395265589643537, + "learning_rate": 0.00019954234921626068, + "loss": 0.9792, + "step": 1650 + }, + { + "epoch": 0.16, + "grad_norm": 0.3037040271080715, + "learning_rate": 0.00019954083623057955, + "loss": 1.1754, + "step": 1651 + }, + { + "epoch": 0.16, + "grad_norm": 0.23645934008105318, + "learning_rate": 0.00019953932075382646, + "loss": 1.1307, + "step": 1652 + }, + { + "epoch": 0.16, + "grad_norm": 0.2723890058669645, + "learning_rate": 0.00019953780278603932, + "loss": 1.1161, + "step": 1653 + }, + { + "epoch": 0.16, + "grad_norm": 0.2607964386960627, + "learning_rate": 0.00019953628232725608, + "loss": 1.1741, + "step": 1654 + }, + { + "epoch": 0.16, + "grad_norm": 0.27487552426540823, + "learning_rate": 0.0001995347593775148, + "loss": 1.131, + "step": 1655 + }, + { + "epoch": 0.16, + "grad_norm": 0.24819637701868438, + "learning_rate": 0.00019953323393685367, + "loss": 1.1246, + "step": 1656 + }, + { + "epoch": 0.16, + "grad_norm": 0.2880503601951672, + "learning_rate": 0.00019953170600531074, + "loss": 1.1414, + "step": 1657 + }, + { + "epoch": 0.16, + "grad_norm": 0.2661262998136285, + "learning_rate": 0.00019953017558292438, + "loss": 1.1857, + "step": 1658 + }, + { + "epoch": 0.16, + "grad_norm": 0.2660986350757843, + "learning_rate": 0.00019952864266973278, + "loss": 1.1092, + "step": 1659 + }, + { + "epoch": 0.16, + "grad_norm": 0.2713659116488214, + "learning_rate": 0.00019952710726577435, + "loss": 1.0772, + "step": 1660 + }, + { + "epoch": 0.16, + "grad_norm": 0.29223182156019084, + "learning_rate": 0.00019952556937108753, + "loss": 1.0789, + "step": 1661 + }, + { + "epoch": 0.16, + "grad_norm": 0.2582755115758311, + "learning_rate": 0.00019952402898571077, + "loss": 1.1875, + "step": 1662 + }, + { + "epoch": 0.16, + "grad_norm": 0.2654852860667591, + "learning_rate": 0.00019952248610968264, + "loss": 1.0323, + "step": 1663 + }, + { + "epoch": 0.16, + "grad_norm": 0.2935124336052021, + "learning_rate": 0.00019952094074304175, + "loss": 0.9859, + "step": 1664 + }, + { + "epoch": 0.16, + "grad_norm": 0.2546178333136953, + "learning_rate": 0.00019951939288582676, + "loss": 1.0756, + "step": 1665 + }, + { + "epoch": 0.16, + "grad_norm": 0.3002327247319587, + "learning_rate": 0.0001995178425380764, + "loss": 1.1356, + "step": 1666 + }, + { + "epoch": 0.16, + "grad_norm": 0.2792232404834516, + "learning_rate": 0.00019951628969982953, + "loss": 1.1251, + "step": 1667 + }, + { + "epoch": 0.16, + "grad_norm": 0.28042254510601033, + "learning_rate": 0.00019951473437112495, + "loss": 1.0406, + "step": 1668 + }, + { + "epoch": 0.16, + "grad_norm": 0.2869293910747367, + "learning_rate": 0.0001995131765520016, + "loss": 1.0321, + "step": 1669 + }, + { + "epoch": 0.16, + "grad_norm": 0.26731773733638436, + "learning_rate": 0.00019951161624249844, + "loss": 1.1865, + "step": 1670 + }, + { + "epoch": 0.16, + "grad_norm": 0.26494146088310505, + "learning_rate": 0.00019951005344265462, + "loss": 1.105, + "step": 1671 + }, + { + "epoch": 0.16, + "grad_norm": 0.26633126397973417, + "learning_rate": 0.0001995084881525091, + "loss": 1.0366, + "step": 1672 + }, + { + "epoch": 0.16, + "grad_norm": 0.27563381490550426, + "learning_rate": 0.00019950692037210113, + "loss": 1.1346, + "step": 1673 + }, + { + "epoch": 0.16, + "grad_norm": 0.2808572907767721, + "learning_rate": 0.00019950535010146994, + "loss": 1.1304, + "step": 1674 + }, + { + "epoch": 0.16, + "grad_norm": 0.249082820217972, + "learning_rate": 0.00019950377734065486, + "loss": 1.1375, + "step": 1675 + }, + { + "epoch": 0.16, + "grad_norm": 0.242030163260417, + "learning_rate": 0.00019950220208969519, + "loss": 1.0647, + "step": 1676 + }, + { + "epoch": 0.16, + "grad_norm": 0.23873021173771233, + "learning_rate": 0.00019950062434863038, + "loss": 1.1427, + "step": 1677 + }, + { + "epoch": 0.16, + "grad_norm": 0.2526857334515431, + "learning_rate": 0.00019949904411749995, + "loss": 1.0652, + "step": 1678 + }, + { + "epoch": 0.16, + "grad_norm": 0.22005647549892257, + "learning_rate": 0.00019949746139634336, + "loss": 1.2141, + "step": 1679 + }, + { + "epoch": 0.16, + "grad_norm": 0.29830008364083893, + "learning_rate": 0.0001994958761852003, + "loss": 0.9724, + "step": 1680 + }, + { + "epoch": 0.16, + "grad_norm": 0.2336810810860809, + "learning_rate": 0.00019949428848411036, + "loss": 1.0718, + "step": 1681 + }, + { + "epoch": 0.16, + "grad_norm": 0.2596500357999934, + "learning_rate": 0.00019949269829311336, + "loss": 1.1449, + "step": 1682 + }, + { + "epoch": 0.16, + "grad_norm": 0.28287000384537997, + "learning_rate": 0.00019949110561224905, + "loss": 1.1513, + "step": 1683 + }, + { + "epoch": 0.16, + "grad_norm": 0.27286503590527955, + "learning_rate": 0.00019948951044155728, + "loss": 1.095, + "step": 1684 + }, + { + "epoch": 0.16, + "grad_norm": 0.26619367233558344, + "learning_rate": 0.000199487912781078, + "loss": 1.0861, + "step": 1685 + }, + { + "epoch": 0.16, + "grad_norm": 0.29588681836665753, + "learning_rate": 0.0001994863126308512, + "loss": 1.0361, + "step": 1686 + }, + { + "epoch": 0.16, + "grad_norm": 0.27036346205815504, + "learning_rate": 0.00019948470999091685, + "loss": 1.1066, + "step": 1687 + }, + { + "epoch": 0.16, + "grad_norm": 0.24182695997990206, + "learning_rate": 0.00019948310486131513, + "loss": 0.9643, + "step": 1688 + }, + { + "epoch": 0.16, + "grad_norm": 0.26214221449152814, + "learning_rate": 0.0001994814972420862, + "loss": 1.1246, + "step": 1689 + }, + { + "epoch": 0.16, + "grad_norm": 0.2804122217034823, + "learning_rate": 0.0001994798871332703, + "loss": 1.1795, + "step": 1690 + }, + { + "epoch": 0.16, + "grad_norm": 0.2594172377288309, + "learning_rate": 0.00019947827453490767, + "loss": 1.1388, + "step": 1691 + }, + { + "epoch": 0.16, + "grad_norm": 0.2893954998932536, + "learning_rate": 0.0001994766594470387, + "loss": 1.0079, + "step": 1692 + }, + { + "epoch": 0.16, + "grad_norm": 0.2936972888903826, + "learning_rate": 0.0001994750418697038, + "loss": 1.1258, + "step": 1693 + }, + { + "epoch": 0.16, + "grad_norm": 0.26900132587121695, + "learning_rate": 0.00019947342180294346, + "loss": 1.2546, + "step": 1694 + }, + { + "epoch": 0.16, + "grad_norm": 0.26577176120374074, + "learning_rate": 0.00019947179924679825, + "loss": 1.1571, + "step": 1695 + }, + { + "epoch": 0.16, + "grad_norm": 0.2734445337941885, + "learning_rate": 0.00019947017420130872, + "loss": 1.0859, + "step": 1696 + }, + { + "epoch": 0.16, + "grad_norm": 0.24000154406623206, + "learning_rate": 0.0001994685466665156, + "loss": 1.1031, + "step": 1697 + }, + { + "epoch": 0.16, + "grad_norm": 0.22779641522136282, + "learning_rate": 0.00019946691664245956, + "loss": 1.0854, + "step": 1698 + }, + { + "epoch": 0.16, + "grad_norm": 0.2479814495549998, + "learning_rate": 0.0001994652841291814, + "loss": 1.1121, + "step": 1699 + }, + { + "epoch": 0.16, + "grad_norm": 0.250268222311568, + "learning_rate": 0.00019946364912672203, + "loss": 1.0779, + "step": 1700 + }, + { + "epoch": 0.16, + "grad_norm": 0.24912184436362922, + "learning_rate": 0.00019946201163512233, + "loss": 1.0067, + "step": 1701 + }, + { + "epoch": 0.16, + "grad_norm": 0.3207294751787716, + "learning_rate": 0.00019946037165442327, + "loss": 1.303, + "step": 1702 + }, + { + "epoch": 0.16, + "grad_norm": 0.28907607252287926, + "learning_rate": 0.0001994587291846659, + "loss": 1.1877, + "step": 1703 + }, + { + "epoch": 0.16, + "grad_norm": 0.25848051240176273, + "learning_rate": 0.0001994570842258913, + "loss": 1.0081, + "step": 1704 + }, + { + "epoch": 0.16, + "grad_norm": 0.2756315421016595, + "learning_rate": 0.00019945543677814067, + "loss": 1.0329, + "step": 1705 + }, + { + "epoch": 0.16, + "grad_norm": 0.2505304319664102, + "learning_rate": 0.00019945378684145526, + "loss": 1.1502, + "step": 1706 + }, + { + "epoch": 0.16, + "grad_norm": 0.3140138337651653, + "learning_rate": 0.00019945213441587633, + "loss": 1.108, + "step": 1707 + }, + { + "epoch": 0.16, + "grad_norm": 0.2650198799790111, + "learning_rate": 0.0001994504795014452, + "loss": 1.0843, + "step": 1708 + }, + { + "epoch": 0.16, + "grad_norm": 0.27741715785219834, + "learning_rate": 0.00019944882209820333, + "loss": 1.0471, + "step": 1709 + }, + { + "epoch": 0.16, + "grad_norm": 0.28688576394081394, + "learning_rate": 0.0001994471622061922, + "loss": 1.0745, + "step": 1710 + }, + { + "epoch": 0.16, + "grad_norm": 0.3160625561308828, + "learning_rate": 0.0001994454998254533, + "loss": 1.0412, + "step": 1711 + }, + { + "epoch": 0.16, + "grad_norm": 0.29390242181574067, + "learning_rate": 0.0001994438349560283, + "loss": 1.0297, + "step": 1712 + }, + { + "epoch": 0.16, + "grad_norm": 0.2662667067917202, + "learning_rate": 0.00019944216759795885, + "loss": 1.0189, + "step": 1713 + }, + { + "epoch": 0.16, + "grad_norm": 0.2652127225373576, + "learning_rate": 0.00019944049775128661, + "loss": 1.0433, + "step": 1714 + }, + { + "epoch": 0.16, + "grad_norm": 0.28198238645107127, + "learning_rate": 0.00019943882541605343, + "loss": 1.1984, + "step": 1715 + }, + { + "epoch": 0.16, + "grad_norm": 0.2930815699661024, + "learning_rate": 0.00019943715059230117, + "loss": 1.0741, + "step": 1716 + }, + { + "epoch": 0.16, + "grad_norm": 0.32027322958606264, + "learning_rate": 0.0001994354732800717, + "loss": 1.097, + "step": 1717 + }, + { + "epoch": 0.16, + "grad_norm": 0.23663660962171493, + "learning_rate": 0.00019943379347940704, + "loss": 1.0879, + "step": 1718 + }, + { + "epoch": 0.16, + "grad_norm": 0.26819927151645284, + "learning_rate": 0.0001994321111903492, + "loss": 1.1009, + "step": 1719 + }, + { + "epoch": 0.16, + "grad_norm": 0.23575422949387753, + "learning_rate": 0.00019943042641294028, + "loss": 1.1155, + "step": 1720 + }, + { + "epoch": 0.16, + "grad_norm": 0.23615429167267799, + "learning_rate": 0.00019942873914722243, + "loss": 1.1978, + "step": 1721 + }, + { + "epoch": 0.16, + "grad_norm": 0.23075710530915816, + "learning_rate": 0.00019942704939323794, + "loss": 1.1802, + "step": 1722 + }, + { + "epoch": 0.16, + "grad_norm": 0.26619797669604706, + "learning_rate": 0.00019942535715102903, + "loss": 1.2049, + "step": 1723 + }, + { + "epoch": 0.16, + "grad_norm": 0.2766557794492051, + "learning_rate": 0.00019942366242063807, + "loss": 1.1062, + "step": 1724 + }, + { + "epoch": 0.17, + "grad_norm": 0.2650480870677421, + "learning_rate": 0.00019942196520210748, + "loss": 1.2134, + "step": 1725 + }, + { + "epoch": 0.17, + "grad_norm": 0.25567802198203454, + "learning_rate": 0.00019942026549547973, + "loss": 1.1747, + "step": 1726 + }, + { + "epoch": 0.17, + "grad_norm": 0.27010898900806307, + "learning_rate": 0.00019941856330079732, + "loss": 1.2341, + "step": 1727 + }, + { + "epoch": 0.17, + "grad_norm": 0.2672243888343421, + "learning_rate": 0.0001994168586181029, + "loss": 1.038, + "step": 1728 + }, + { + "epoch": 0.17, + "grad_norm": 0.27037675730748634, + "learning_rate": 0.00019941515144743913, + "loss": 1.1484, + "step": 1729 + }, + { + "epoch": 0.17, + "grad_norm": 0.27423511003272355, + "learning_rate": 0.00019941344178884868, + "loss": 1.1925, + "step": 1730 + }, + { + "epoch": 0.17, + "grad_norm": 0.25846818596543564, + "learning_rate": 0.0001994117296423744, + "loss": 1.0063, + "step": 1731 + }, + { + "epoch": 0.17, + "grad_norm": 0.2686589532404364, + "learning_rate": 0.0001994100150080591, + "loss": 1.0285, + "step": 1732 + }, + { + "epoch": 0.17, + "grad_norm": 0.2766088906498271, + "learning_rate": 0.00019940829788594569, + "loss": 1.0765, + "step": 1733 + }, + { + "epoch": 0.17, + "grad_norm": 0.26339315358801624, + "learning_rate": 0.00019940657827607715, + "loss": 1.1692, + "step": 1734 + }, + { + "epoch": 0.17, + "grad_norm": 0.2603560134384319, + "learning_rate": 0.0001994048561784965, + "loss": 1.0406, + "step": 1735 + }, + { + "epoch": 0.17, + "grad_norm": 0.2904525258177709, + "learning_rate": 0.0001994031315932469, + "loss": 0.9866, + "step": 1736 + }, + { + "epoch": 0.17, + "grad_norm": 0.254967597751732, + "learning_rate": 0.00019940140452037142, + "loss": 1.1711, + "step": 1737 + }, + { + "epoch": 0.17, + "grad_norm": 0.2587539173182572, + "learning_rate": 0.00019939967495991332, + "loss": 1.1377, + "step": 1738 + }, + { + "epoch": 0.17, + "grad_norm": 0.26239329017748414, + "learning_rate": 0.0001993979429119159, + "loss": 1.0846, + "step": 1739 + }, + { + "epoch": 0.17, + "grad_norm": 0.2641030306092387, + "learning_rate": 0.00019939620837642247, + "loss": 1.1515, + "step": 1740 + }, + { + "epoch": 0.17, + "grad_norm": 0.29629448536301656, + "learning_rate": 0.00019939447135347647, + "loss": 1.1464, + "step": 1741 + }, + { + "epoch": 0.17, + "grad_norm": 0.259586106165695, + "learning_rate": 0.00019939273184312137, + "loss": 1.0899, + "step": 1742 + }, + { + "epoch": 0.17, + "grad_norm": 0.26296382387837713, + "learning_rate": 0.0001993909898454007, + "loss": 1.0482, + "step": 1743 + }, + { + "epoch": 0.17, + "grad_norm": 0.28051523053769833, + "learning_rate": 0.000199389245360358, + "loss": 1.1547, + "step": 1744 + }, + { + "epoch": 0.17, + "grad_norm": 0.2671245457129155, + "learning_rate": 0.00019938749838803696, + "loss": 1.0592, + "step": 1745 + }, + { + "epoch": 0.17, + "grad_norm": 0.3181482392262407, + "learning_rate": 0.00019938574892848135, + "loss": 1.0635, + "step": 1746 + }, + { + "epoch": 0.17, + "grad_norm": 0.2899294422265618, + "learning_rate": 0.0001993839969817349, + "loss": 1.0768, + "step": 1747 + }, + { + "epoch": 0.17, + "grad_norm": 0.25002771184882505, + "learning_rate": 0.00019938224254784147, + "loss": 1.1619, + "step": 1748 + }, + { + "epoch": 0.17, + "grad_norm": 0.2568140049401059, + "learning_rate": 0.00019938048562684495, + "loss": 1.0229, + "step": 1749 + }, + { + "epoch": 0.17, + "grad_norm": 0.36292519666598, + "learning_rate": 0.00019937872621878934, + "loss": 1.1794, + "step": 1750 + }, + { + "epoch": 0.17, + "grad_norm": 0.2932029976728667, + "learning_rate": 0.0001993769643237186, + "loss": 1.1902, + "step": 1751 + }, + { + "epoch": 0.17, + "grad_norm": 0.2771533942225469, + "learning_rate": 0.00019937519994167694, + "loss": 1.0731, + "step": 1752 + }, + { + "epoch": 0.17, + "grad_norm": 0.30070233491956233, + "learning_rate": 0.00019937343307270842, + "loss": 1.1186, + "step": 1753 + }, + { + "epoch": 0.17, + "grad_norm": 0.23520740536849988, + "learning_rate": 0.00019937166371685727, + "loss": 1.0768, + "step": 1754 + }, + { + "epoch": 0.17, + "grad_norm": 0.2689615596060478, + "learning_rate": 0.0001993698918741678, + "loss": 1.0889, + "step": 1755 + }, + { + "epoch": 0.17, + "grad_norm": 0.2667096465377454, + "learning_rate": 0.0001993681175446843, + "loss": 1.1273, + "step": 1756 + }, + { + "epoch": 0.17, + "grad_norm": 0.2774024784883662, + "learning_rate": 0.00019936634072845126, + "loss": 1.0687, + "step": 1757 + }, + { + "epoch": 0.17, + "grad_norm": 0.2948555113393165, + "learning_rate": 0.00019936456142551306, + "loss": 1.1369, + "step": 1758 + }, + { + "epoch": 0.17, + "grad_norm": 0.2783762471401322, + "learning_rate": 0.00019936277963591428, + "loss": 1.1218, + "step": 1759 + }, + { + "epoch": 0.17, + "grad_norm": 0.24719261245618435, + "learning_rate": 0.00019936099535969946, + "loss": 1.1024, + "step": 1760 + }, + { + "epoch": 0.17, + "grad_norm": 0.3182614470412637, + "learning_rate": 0.00019935920859691332, + "loss": 1.1849, + "step": 1761 + }, + { + "epoch": 0.17, + "grad_norm": 0.23710818113640875, + "learning_rate": 0.00019935741934760053, + "loss": 1.0466, + "step": 1762 + }, + { + "epoch": 0.17, + "grad_norm": 0.2638247973270966, + "learning_rate": 0.00019935562761180586, + "loss": 1.046, + "step": 1763 + }, + { + "epoch": 0.17, + "grad_norm": 0.29051740614485994, + "learning_rate": 0.0001993538333895742, + "loss": 1.0124, + "step": 1764 + }, + { + "epoch": 0.17, + "grad_norm": 0.344576484247007, + "learning_rate": 0.0001993520366809504, + "loss": 1.0553, + "step": 1765 + }, + { + "epoch": 0.17, + "grad_norm": 0.2734046389788186, + "learning_rate": 0.00019935023748597942, + "loss": 1.1175, + "step": 1766 + }, + { + "epoch": 0.17, + "grad_norm": 0.2658344263426795, + "learning_rate": 0.00019934843580470633, + "loss": 1.1288, + "step": 1767 + }, + { + "epoch": 0.17, + "grad_norm": 0.24386557141888807, + "learning_rate": 0.0001993466316371762, + "loss": 1.0556, + "step": 1768 + }, + { + "epoch": 0.17, + "grad_norm": 0.2903532438407926, + "learning_rate": 0.00019934482498343417, + "loss": 1.1214, + "step": 1769 + }, + { + "epoch": 0.17, + "grad_norm": 0.23273429637205256, + "learning_rate": 0.00019934301584352543, + "loss": 1.1819, + "step": 1770 + }, + { + "epoch": 0.17, + "grad_norm": 0.26436150903465644, + "learning_rate": 0.0001993412042174953, + "loss": 1.2129, + "step": 1771 + }, + { + "epoch": 0.17, + "grad_norm": 0.272699055168821, + "learning_rate": 0.00019933939010538914, + "loss": 1.0845, + "step": 1772 + }, + { + "epoch": 0.17, + "grad_norm": 0.24313514050010182, + "learning_rate": 0.00019933757350725227, + "loss": 1.0947, + "step": 1773 + }, + { + "epoch": 0.17, + "grad_norm": 0.2743855476662608, + "learning_rate": 0.00019933575442313022, + "loss": 1.0721, + "step": 1774 + }, + { + "epoch": 0.17, + "grad_norm": 0.27484803850628725, + "learning_rate": 0.00019933393285306847, + "loss": 1.086, + "step": 1775 + }, + { + "epoch": 0.17, + "grad_norm": 0.2862674448464463, + "learning_rate": 0.0001993321087971126, + "loss": 1.1687, + "step": 1776 + }, + { + "epoch": 0.17, + "grad_norm": 0.2654433158884731, + "learning_rate": 0.00019933028225530832, + "loss": 1.1524, + "step": 1777 + }, + { + "epoch": 0.17, + "grad_norm": 0.28570985555687595, + "learning_rate": 0.00019932845322770127, + "loss": 1.1032, + "step": 1778 + }, + { + "epoch": 0.17, + "grad_norm": 0.2398384669725776, + "learning_rate": 0.00019932662171433726, + "loss": 1.1805, + "step": 1779 + }, + { + "epoch": 0.17, + "grad_norm": 0.27132648388882097, + "learning_rate": 0.00019932478771526212, + "loss": 1.1706, + "step": 1780 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490907055488986, + "learning_rate": 0.00019932295123052175, + "loss": 1.0303, + "step": 1781 + }, + { + "epoch": 0.17, + "grad_norm": 0.2483810784888273, + "learning_rate": 0.0001993211122601621, + "loss": 1.183, + "step": 1782 + }, + { + "epoch": 0.17, + "grad_norm": 0.23864167216181773, + "learning_rate": 0.00019931927080422921, + "loss": 1.0438, + "step": 1783 + }, + { + "epoch": 0.17, + "grad_norm": 0.27530929268313675, + "learning_rate": 0.0001993174268627691, + "loss": 1.0515, + "step": 1784 + }, + { + "epoch": 0.17, + "grad_norm": 0.24646180451212257, + "learning_rate": 0.00019931558043582802, + "loss": 1.1064, + "step": 1785 + }, + { + "epoch": 0.17, + "grad_norm": 0.3072867379708468, + "learning_rate": 0.00019931373152345206, + "loss": 1.0433, + "step": 1786 + }, + { + "epoch": 0.17, + "grad_norm": 0.2699984246251364, + "learning_rate": 0.0001993118801256876, + "loss": 1.2135, + "step": 1787 + }, + { + "epoch": 0.17, + "grad_norm": 0.29491554910347884, + "learning_rate": 0.00019931002624258093, + "loss": 1.0451, + "step": 1788 + }, + { + "epoch": 0.17, + "grad_norm": 0.2592446364845811, + "learning_rate": 0.00019930816987417843, + "loss": 1.0678, + "step": 1789 + }, + { + "epoch": 0.17, + "grad_norm": 0.27551160462797714, + "learning_rate": 0.00019930631102052656, + "loss": 1.099, + "step": 1790 + }, + { + "epoch": 0.17, + "grad_norm": 0.3004312874610158, + "learning_rate": 0.00019930444968167184, + "loss": 1.2349, + "step": 1791 + }, + { + "epoch": 0.17, + "grad_norm": 0.2902276940371733, + "learning_rate": 0.00019930258585766083, + "loss": 1.0191, + "step": 1792 + }, + { + "epoch": 0.17, + "grad_norm": 0.2880266394681308, + "learning_rate": 0.00019930071954854026, + "loss": 1.0946, + "step": 1793 + }, + { + "epoch": 0.17, + "grad_norm": 0.27739961453298684, + "learning_rate": 0.00019929885075435673, + "loss": 1.0883, + "step": 1794 + }, + { + "epoch": 0.17, + "grad_norm": 0.26237306161853147, + "learning_rate": 0.00019929697947515705, + "loss": 1.13, + "step": 1795 + }, + { + "epoch": 0.17, + "grad_norm": 0.25732915392100997, + "learning_rate": 0.0001992951057109881, + "loss": 1.0223, + "step": 1796 + }, + { + "epoch": 0.17, + "grad_norm": 0.26231079106015476, + "learning_rate": 0.00019929322946189669, + "loss": 1.2334, + "step": 1797 + }, + { + "epoch": 0.17, + "grad_norm": 0.2689310820422752, + "learning_rate": 0.00019929135072792979, + "loss": 1.0859, + "step": 1798 + }, + { + "epoch": 0.17, + "grad_norm": 0.28231810159636933, + "learning_rate": 0.00019928946950913446, + "loss": 1.1499, + "step": 1799 + }, + { + "epoch": 0.17, + "grad_norm": 0.26114756115035764, + "learning_rate": 0.00019928758580555777, + "loss": 1.0692, + "step": 1800 + }, + { + "epoch": 0.17, + "grad_norm": 0.21499560470777954, + "learning_rate": 0.00019928569961724684, + "loss": 1.0246, + "step": 1801 + }, + { + "epoch": 0.17, + "grad_norm": 0.2499670112727543, + "learning_rate": 0.00019928381094424887, + "loss": 1.0571, + "step": 1802 + }, + { + "epoch": 0.17, + "grad_norm": 0.2668951326275685, + "learning_rate": 0.00019928191978661112, + "loss": 1.0914, + "step": 1803 + }, + { + "epoch": 0.17, + "grad_norm": 0.2436534609761875, + "learning_rate": 0.00019928002614438096, + "loss": 1.115, + "step": 1804 + }, + { + "epoch": 0.17, + "grad_norm": 0.24543054408218462, + "learning_rate": 0.00019927813001760573, + "loss": 1.1398, + "step": 1805 + }, + { + "epoch": 0.17, + "grad_norm": 0.275192488837919, + "learning_rate": 0.0001992762314063329, + "loss": 1.157, + "step": 1806 + }, + { + "epoch": 0.17, + "grad_norm": 0.24910350712903606, + "learning_rate": 0.00019927433031061, + "loss": 1.1897, + "step": 1807 + }, + { + "epoch": 0.17, + "grad_norm": 0.2440628864265639, + "learning_rate": 0.0001992724267304846, + "loss": 1.1515, + "step": 1808 + }, + { + "epoch": 0.17, + "grad_norm": 0.2878309034150722, + "learning_rate": 0.0001992705206660043, + "loss": 1.2264, + "step": 1809 + }, + { + "epoch": 0.17, + "grad_norm": 0.2910816442764025, + "learning_rate": 0.00019926861211721684, + "loss": 1.0837, + "step": 1810 + }, + { + "epoch": 0.17, + "grad_norm": 0.26250435365108804, + "learning_rate": 0.00019926670108416997, + "loss": 1.2048, + "step": 1811 + }, + { + "epoch": 0.17, + "grad_norm": 0.2870815342486622, + "learning_rate": 0.00019926478756691153, + "loss": 1.1559, + "step": 1812 + }, + { + "epoch": 0.17, + "grad_norm": 0.27090400410537946, + "learning_rate": 0.0001992628715654894, + "loss": 1.2068, + "step": 1813 + }, + { + "epoch": 0.17, + "grad_norm": 0.28120763276720057, + "learning_rate": 0.0001992609530799515, + "loss": 1.0354, + "step": 1814 + }, + { + "epoch": 0.17, + "grad_norm": 0.2894660442297703, + "learning_rate": 0.0001992590321103459, + "loss": 1.0584, + "step": 1815 + }, + { + "epoch": 0.17, + "grad_norm": 0.2780933045178876, + "learning_rate": 0.00019925710865672063, + "loss": 1.2506, + "step": 1816 + }, + { + "epoch": 0.17, + "grad_norm": 0.28692036245587266, + "learning_rate": 0.0001992551827191238, + "loss": 1.1621, + "step": 1817 + }, + { + "epoch": 0.17, + "grad_norm": 0.2982135909302484, + "learning_rate": 0.00019925325429760368, + "loss": 1.0948, + "step": 1818 + }, + { + "epoch": 0.17, + "grad_norm": 0.2820173648746567, + "learning_rate": 0.0001992513233922085, + "loss": 1.1477, + "step": 1819 + }, + { + "epoch": 0.17, + "grad_norm": 0.2639471711581256, + "learning_rate": 0.00019924939000298656, + "loss": 1.0992, + "step": 1820 + }, + { + "epoch": 0.17, + "grad_norm": 0.3258628877911058, + "learning_rate": 0.00019924745412998625, + "loss": 1.0337, + "step": 1821 + }, + { + "epoch": 0.17, + "grad_norm": 0.32533873777010336, + "learning_rate": 0.00019924551577325605, + "loss": 1.036, + "step": 1822 + }, + { + "epoch": 0.17, + "grad_norm": 0.29923641265792245, + "learning_rate": 0.00019924357493284443, + "loss": 1.1199, + "step": 1823 + }, + { + "epoch": 0.17, + "grad_norm": 0.2871942608589218, + "learning_rate": 0.00019924163160879997, + "loss": 1.111, + "step": 1824 + }, + { + "epoch": 0.17, + "grad_norm": 0.32570726041176123, + "learning_rate": 0.0001992396858011713, + "loss": 1.208, + "step": 1825 + }, + { + "epoch": 0.17, + "grad_norm": 0.2745227229675084, + "learning_rate": 0.00019923773751000714, + "loss": 1.0936, + "step": 1826 + }, + { + "epoch": 0.17, + "grad_norm": 0.29321444725480744, + "learning_rate": 0.00019923578673535622, + "loss": 1.1939, + "step": 1827 + }, + { + "epoch": 0.17, + "grad_norm": 0.31320266548147835, + "learning_rate": 0.0001992338334772674, + "loss": 1.092, + "step": 1828 + }, + { + "epoch": 0.17, + "grad_norm": 0.27134903395742416, + "learning_rate": 0.0001992318777357895, + "loss": 1.1936, + "step": 1829 + }, + { + "epoch": 0.18, + "grad_norm": 0.25932835062459336, + "learning_rate": 0.0001992299195109715, + "loss": 1.1709, + "step": 1830 + }, + { + "epoch": 0.18, + "grad_norm": 0.2821426140168247, + "learning_rate": 0.0001992279588028624, + "loss": 1.0848, + "step": 1831 + }, + { + "epoch": 0.18, + "grad_norm": 0.27780532527873364, + "learning_rate": 0.00019922599561151126, + "loss": 1.0701, + "step": 1832 + }, + { + "epoch": 0.18, + "grad_norm": 0.3213413443644143, + "learning_rate": 0.00019922402993696725, + "loss": 1.2066, + "step": 1833 + }, + { + "epoch": 0.18, + "grad_norm": 0.24152642775118963, + "learning_rate": 0.00019922206177927948, + "loss": 1.0779, + "step": 1834 + }, + { + "epoch": 0.18, + "grad_norm": 0.26358149683834353, + "learning_rate": 0.00019922009113849728, + "loss": 1.0631, + "step": 1835 + }, + { + "epoch": 0.18, + "grad_norm": 0.2771938797580255, + "learning_rate": 0.00019921811801466995, + "loss": 1.0627, + "step": 1836 + }, + { + "epoch": 0.18, + "grad_norm": 0.24390434386104548, + "learning_rate": 0.00019921614240784688, + "loss": 1.0826, + "step": 1837 + }, + { + "epoch": 0.18, + "grad_norm": 0.2883536207722251, + "learning_rate": 0.00019921416431807748, + "loss": 1.0587, + "step": 1838 + }, + { + "epoch": 0.18, + "grad_norm": 0.26058794322850115, + "learning_rate": 0.00019921218374541124, + "loss": 1.0926, + "step": 1839 + }, + { + "epoch": 0.18, + "grad_norm": 0.253199761945334, + "learning_rate": 0.00019921020068989776, + "loss": 1.0659, + "step": 1840 + }, + { + "epoch": 0.18, + "grad_norm": 0.27605817699682705, + "learning_rate": 0.00019920821515158666, + "loss": 1.0807, + "step": 1841 + }, + { + "epoch": 0.18, + "grad_norm": 0.34296685061041043, + "learning_rate": 0.0001992062271305276, + "loss": 1.0399, + "step": 1842 + }, + { + "epoch": 0.18, + "grad_norm": 0.24791123495573145, + "learning_rate": 0.0001992042366267704, + "loss": 1.0986, + "step": 1843 + }, + { + "epoch": 0.18, + "grad_norm": 0.2730759255331824, + "learning_rate": 0.0001992022436403648, + "loss": 1.1249, + "step": 1844 + }, + { + "epoch": 0.18, + "grad_norm": 0.25825382605152414, + "learning_rate": 0.0001992002481713607, + "loss": 1.1515, + "step": 1845 + }, + { + "epoch": 0.18, + "grad_norm": 0.2709947998255462, + "learning_rate": 0.0001991982502198081, + "loss": 1.0644, + "step": 1846 + }, + { + "epoch": 0.18, + "grad_norm": 0.3041130032344396, + "learning_rate": 0.0001991962497857569, + "loss": 1.2027, + "step": 1847 + }, + { + "epoch": 0.18, + "grad_norm": 0.2514054383827712, + "learning_rate": 0.00019919424686925722, + "loss": 1.1181, + "step": 1848 + }, + { + "epoch": 0.18, + "grad_norm": 0.2671075622096655, + "learning_rate": 0.00019919224147035914, + "loss": 1.0748, + "step": 1849 + }, + { + "epoch": 0.18, + "grad_norm": 0.2563779371324541, + "learning_rate": 0.00019919023358911292, + "loss": 1.1708, + "step": 1850 + }, + { + "epoch": 0.18, + "grad_norm": 0.27518701288612535, + "learning_rate": 0.00019918822322556877, + "loss": 1.068, + "step": 1851 + }, + { + "epoch": 0.18, + "grad_norm": 0.23464980973730304, + "learning_rate": 0.00019918621037977693, + "loss": 1.115, + "step": 1852 + }, + { + "epoch": 0.18, + "grad_norm": 0.22754310999457095, + "learning_rate": 0.0001991841950517879, + "loss": 1.143, + "step": 1853 + }, + { + "epoch": 0.18, + "grad_norm": 0.2402610477626132, + "learning_rate": 0.00019918217724165205, + "loss": 1.045, + "step": 1854 + }, + { + "epoch": 0.18, + "grad_norm": 0.2466867783180613, + "learning_rate": 0.00019918015694941988, + "loss": 1.1413, + "step": 1855 + }, + { + "epoch": 0.18, + "grad_norm": 0.27021081362108734, + "learning_rate": 0.00019917813417514194, + "loss": 1.1366, + "step": 1856 + }, + { + "epoch": 0.18, + "grad_norm": 0.2290266939618748, + "learning_rate": 0.00019917610891886884, + "loss": 1.077, + "step": 1857 + }, + { + "epoch": 0.18, + "grad_norm": 0.2912099506203328, + "learning_rate": 0.0001991740811806513, + "loss": 1.2381, + "step": 1858 + }, + { + "epoch": 0.18, + "grad_norm": 0.25710098640782725, + "learning_rate": 0.00019917205096054005, + "loss": 1.1494, + "step": 1859 + }, + { + "epoch": 0.18, + "grad_norm": 0.23444875824179745, + "learning_rate": 0.00019917001825858592, + "loss": 1.1993, + "step": 1860 + }, + { + "epoch": 0.18, + "grad_norm": 0.27972502638604985, + "learning_rate": 0.00019916798307483973, + "loss": 1.0881, + "step": 1861 + }, + { + "epoch": 0.18, + "grad_norm": 0.22927317304767958, + "learning_rate": 0.00019916594540935246, + "loss": 1.1226, + "step": 1862 + }, + { + "epoch": 0.18, + "grad_norm": 0.2777824230056472, + "learning_rate": 0.00019916390526217507, + "loss": 1.2791, + "step": 1863 + }, + { + "epoch": 0.18, + "grad_norm": 0.29153525342371706, + "learning_rate": 0.0001991618626333586, + "loss": 1.1707, + "step": 1864 + }, + { + "epoch": 0.18, + "grad_norm": 0.26506907160815607, + "learning_rate": 0.00019915981752295422, + "loss": 1.1309, + "step": 1865 + }, + { + "epoch": 0.18, + "grad_norm": 0.24281279217203466, + "learning_rate": 0.00019915776993101311, + "loss": 1.117, + "step": 1866 + }, + { + "epoch": 0.18, + "grad_norm": 0.2915237131661395, + "learning_rate": 0.00019915571985758645, + "loss": 1.1615, + "step": 1867 + }, + { + "epoch": 0.18, + "grad_norm": 0.26598319618634586, + "learning_rate": 0.00019915366730272562, + "loss": 1.2443, + "step": 1868 + }, + { + "epoch": 0.18, + "grad_norm": 0.25083927497077174, + "learning_rate": 0.00019915161226648193, + "loss": 1.091, + "step": 1869 + }, + { + "epoch": 0.18, + "grad_norm": 0.24871843858798745, + "learning_rate": 0.00019914955474890683, + "loss": 1.2225, + "step": 1870 + }, + { + "epoch": 0.18, + "grad_norm": 0.2807205864271564, + "learning_rate": 0.00019914749475005182, + "loss": 1.0856, + "step": 1871 + }, + { + "epoch": 0.18, + "grad_norm": 0.2564207640831039, + "learning_rate": 0.00019914543226996846, + "loss": 1.1381, + "step": 1872 + }, + { + "epoch": 0.18, + "grad_norm": 0.2711542979084927, + "learning_rate": 0.00019914336730870828, + "loss": 1.1482, + "step": 1873 + }, + { + "epoch": 0.18, + "grad_norm": 0.24605707766197607, + "learning_rate": 0.00019914129986632308, + "loss": 1.0468, + "step": 1874 + }, + { + "epoch": 0.18, + "grad_norm": 0.25895981792303296, + "learning_rate": 0.00019913922994286453, + "loss": 1.1124, + "step": 1875 + }, + { + "epoch": 0.18, + "grad_norm": 0.26071698636999885, + "learning_rate": 0.00019913715753838444, + "loss": 1.0977, + "step": 1876 + }, + { + "epoch": 0.18, + "grad_norm": 0.26270406723403217, + "learning_rate": 0.00019913508265293468, + "loss": 0.9724, + "step": 1877 + }, + { + "epoch": 0.18, + "grad_norm": 0.2729621283459525, + "learning_rate": 0.00019913300528656718, + "loss": 1.0379, + "step": 1878 + }, + { + "epoch": 0.18, + "grad_norm": 0.2667009672221466, + "learning_rate": 0.00019913092543933392, + "loss": 1.201, + "step": 1879 + }, + { + "epoch": 0.18, + "grad_norm": 0.3062682088725694, + "learning_rate": 0.00019912884311128692, + "loss": 1.2133, + "step": 1880 + }, + { + "epoch": 0.18, + "grad_norm": 0.2471603219169982, + "learning_rate": 0.00019912675830247834, + "loss": 1.0426, + "step": 1881 + }, + { + "epoch": 0.18, + "grad_norm": 0.2706944754253154, + "learning_rate": 0.00019912467101296035, + "loss": 1.0739, + "step": 1882 + }, + { + "epoch": 0.18, + "grad_norm": 0.23877771124529687, + "learning_rate": 0.00019912258124278517, + "loss": 1.1371, + "step": 1883 + }, + { + "epoch": 0.18, + "grad_norm": 0.2501800385911424, + "learning_rate": 0.00019912048899200507, + "loss": 0.985, + "step": 1884 + }, + { + "epoch": 0.18, + "grad_norm": 0.2543490544283289, + "learning_rate": 0.00019911839426067245, + "loss": 1.0922, + "step": 1885 + }, + { + "epoch": 0.18, + "grad_norm": 0.2610356534075215, + "learning_rate": 0.0001991162970488397, + "loss": 1.062, + "step": 1886 + }, + { + "epoch": 0.18, + "grad_norm": 0.31809853336105126, + "learning_rate": 0.0001991141973565594, + "loss": 1.0172, + "step": 1887 + }, + { + "epoch": 0.18, + "grad_norm": 0.3037942696710212, + "learning_rate": 0.00019911209518388393, + "loss": 1.2201, + "step": 1888 + }, + { + "epoch": 0.18, + "grad_norm": 0.2612653091466676, + "learning_rate": 0.00019910999053086604, + "loss": 1.1529, + "step": 1889 + }, + { + "epoch": 0.18, + "grad_norm": 0.25927362339534166, + "learning_rate": 0.00019910788339755833, + "loss": 1.1727, + "step": 1890 + }, + { + "epoch": 0.18, + "grad_norm": 0.26339036209636246, + "learning_rate": 0.00019910577378401355, + "loss": 1.0759, + "step": 1891 + }, + { + "epoch": 0.18, + "grad_norm": 0.26180032269848985, + "learning_rate": 0.00019910366169028452, + "loss": 1.0782, + "step": 1892 + }, + { + "epoch": 0.18, + "grad_norm": 0.27889254421508974, + "learning_rate": 0.00019910154711642403, + "loss": 1.1011, + "step": 1893 + }, + { + "epoch": 0.18, + "grad_norm": 0.2549984542896029, + "learning_rate": 0.00019909943006248505, + "loss": 1.15, + "step": 1894 + }, + { + "epoch": 0.18, + "grad_norm": 0.2274655301809786, + "learning_rate": 0.0001990973105285206, + "loss": 1.1843, + "step": 1895 + }, + { + "epoch": 0.18, + "grad_norm": 0.259196981448463, + "learning_rate": 0.00019909518851458363, + "loss": 1.0451, + "step": 1896 + }, + { + "epoch": 0.18, + "grad_norm": 0.37452396398591586, + "learning_rate": 0.0001990930640207273, + "loss": 1.1319, + "step": 1897 + }, + { + "epoch": 0.18, + "grad_norm": 0.30511870306850003, + "learning_rate": 0.00019909093704700473, + "loss": 1.1613, + "step": 1898 + }, + { + "epoch": 0.18, + "grad_norm": 0.29479478078757926, + "learning_rate": 0.00019908880759346925, + "loss": 1.1725, + "step": 1899 + }, + { + "epoch": 0.18, + "grad_norm": 0.3086910524148668, + "learning_rate": 0.00019908667566017406, + "loss": 1.1686, + "step": 1900 + }, + { + "epoch": 0.18, + "grad_norm": 0.2974264651570825, + "learning_rate": 0.0001990845412471725, + "loss": 1.1257, + "step": 1901 + }, + { + "epoch": 0.18, + "grad_norm": 0.256728492572231, + "learning_rate": 0.00019908240435451805, + "loss": 1.0166, + "step": 1902 + }, + { + "epoch": 0.18, + "grad_norm": 0.23937868610540455, + "learning_rate": 0.00019908026498226418, + "loss": 1.1205, + "step": 1903 + }, + { + "epoch": 0.18, + "grad_norm": 0.272693028176549, + "learning_rate": 0.00019907812313046437, + "loss": 1.1055, + "step": 1904 + }, + { + "epoch": 0.18, + "grad_norm": 0.27354401730685546, + "learning_rate": 0.00019907597879917227, + "loss": 1.1253, + "step": 1905 + }, + { + "epoch": 0.18, + "grad_norm": 0.27305416183963116, + "learning_rate": 0.00019907383198844157, + "loss": 1.0841, + "step": 1906 + }, + { + "epoch": 0.18, + "grad_norm": 0.31852024294594067, + "learning_rate": 0.00019907168269832592, + "loss": 1.1546, + "step": 1907 + }, + { + "epoch": 0.18, + "grad_norm": 0.29709692567412604, + "learning_rate": 0.00019906953092887916, + "loss": 1.2313, + "step": 1908 + }, + { + "epoch": 0.18, + "grad_norm": 0.2617821076298669, + "learning_rate": 0.00019906737668015515, + "loss": 0.943, + "step": 1909 + }, + { + "epoch": 0.18, + "grad_norm": 0.2636246176724674, + "learning_rate": 0.00019906521995220774, + "loss": 1.0627, + "step": 1910 + }, + { + "epoch": 0.18, + "grad_norm": 0.2679413444345284, + "learning_rate": 0.00019906306074509095, + "loss": 1.1503, + "step": 1911 + }, + { + "epoch": 0.18, + "grad_norm": 0.24358831048061988, + "learning_rate": 0.0001990608990588588, + "loss": 1.0467, + "step": 1912 + }, + { + "epoch": 0.18, + "grad_norm": 0.2873239577441084, + "learning_rate": 0.0001990587348935654, + "loss": 1.2148, + "step": 1913 + }, + { + "epoch": 0.18, + "grad_norm": 0.2970349427468681, + "learning_rate": 0.00019905656824926492, + "loss": 1.1718, + "step": 1914 + }, + { + "epoch": 0.18, + "grad_norm": 0.328114785718262, + "learning_rate": 0.00019905439912601156, + "loss": 1.0894, + "step": 1915 + }, + { + "epoch": 0.18, + "grad_norm": 0.2768655282500371, + "learning_rate": 0.00019905222752385958, + "loss": 0.9798, + "step": 1916 + }, + { + "epoch": 0.18, + "grad_norm": 0.24581149927304233, + "learning_rate": 0.00019905005344286338, + "loss": 1.1947, + "step": 1917 + }, + { + "epoch": 0.18, + "grad_norm": 0.24905142815716402, + "learning_rate": 0.00019904787688307735, + "loss": 1.0603, + "step": 1918 + }, + { + "epoch": 0.18, + "grad_norm": 0.25006568481196073, + "learning_rate": 0.00019904569784455592, + "loss": 1.1451, + "step": 1919 + }, + { + "epoch": 0.18, + "grad_norm": 0.24640239497730002, + "learning_rate": 0.0001990435163273537, + "loss": 1.1513, + "step": 1920 + }, + { + "epoch": 0.18, + "grad_norm": 0.3158678665771197, + "learning_rate": 0.00019904133233152518, + "loss": 1.1675, + "step": 1921 + }, + { + "epoch": 0.18, + "grad_norm": 0.2925864535264506, + "learning_rate": 0.0001990391458571251, + "loss": 1.0303, + "step": 1922 + }, + { + "epoch": 0.18, + "grad_norm": 0.2919025168815423, + "learning_rate": 0.00019903695690420817, + "loss": 1.2033, + "step": 1923 + }, + { + "epoch": 0.18, + "grad_norm": 0.2537949728039277, + "learning_rate": 0.00019903476547282914, + "loss": 1.144, + "step": 1924 + }, + { + "epoch": 0.18, + "grad_norm": 0.2583763269969566, + "learning_rate": 0.00019903257156304285, + "loss": 1.1037, + "step": 1925 + }, + { + "epoch": 0.18, + "grad_norm": 0.2613319587588986, + "learning_rate": 0.00019903037517490422, + "loss": 1.0958, + "step": 1926 + }, + { + "epoch": 0.18, + "grad_norm": 0.2595089644687432, + "learning_rate": 0.00019902817630846822, + "loss": 1.1155, + "step": 1927 + }, + { + "epoch": 0.18, + "grad_norm": 0.25297594336763907, + "learning_rate": 0.00019902597496378985, + "loss": 1.1028, + "step": 1928 + }, + { + "epoch": 0.18, + "grad_norm": 0.30513489135176775, + "learning_rate": 0.00019902377114092425, + "loss": 1.1394, + "step": 1929 + }, + { + "epoch": 0.18, + "grad_norm": 0.3021684093102426, + "learning_rate": 0.00019902156483992653, + "loss": 0.9847, + "step": 1930 + }, + { + "epoch": 0.18, + "grad_norm": 0.2590031991887478, + "learning_rate": 0.00019901935606085193, + "loss": 1.036, + "step": 1931 + }, + { + "epoch": 0.18, + "grad_norm": 0.27555119779022413, + "learning_rate": 0.00019901714480375572, + "loss": 1.0828, + "step": 1932 + }, + { + "epoch": 0.18, + "grad_norm": 0.2590125935515052, + "learning_rate": 0.0001990149310686932, + "loss": 1.0109, + "step": 1933 + }, + { + "epoch": 0.19, + "grad_norm": 0.27535539380394297, + "learning_rate": 0.0001990127148557198, + "loss": 1.0645, + "step": 1934 + }, + { + "epoch": 0.19, + "grad_norm": 0.2604038154667027, + "learning_rate": 0.000199010496164891, + "loss": 1.1085, + "step": 1935 + }, + { + "epoch": 0.19, + "grad_norm": 0.24979308709369744, + "learning_rate": 0.0001990082749962623, + "loss": 1.084, + "step": 1936 + }, + { + "epoch": 0.19, + "grad_norm": 0.3032327587240259, + "learning_rate": 0.0001990060513498893, + "loss": 1.1131, + "step": 1937 + }, + { + "epoch": 0.19, + "grad_norm": 0.2678255255634505, + "learning_rate": 0.00019900382522582765, + "loss": 1.1368, + "step": 1938 + }, + { + "epoch": 0.19, + "grad_norm": 0.2180195498119384, + "learning_rate": 0.00019900159662413305, + "loss": 1.0595, + "step": 1939 + }, + { + "epoch": 0.19, + "grad_norm": 0.23825147073097672, + "learning_rate": 0.00019899936554486128, + "loss": 1.1574, + "step": 1940 + }, + { + "epoch": 0.19, + "grad_norm": 0.23730357351239173, + "learning_rate": 0.00019899713198806812, + "loss": 1.1184, + "step": 1941 + }, + { + "epoch": 0.19, + "grad_norm": 0.2805064829854749, + "learning_rate": 0.00019899489595380957, + "loss": 1.2007, + "step": 1942 + }, + { + "epoch": 0.19, + "grad_norm": 0.2473758737702457, + "learning_rate": 0.00019899265744214152, + "loss": 1.0602, + "step": 1943 + }, + { + "epoch": 0.19, + "grad_norm": 0.2718909446342109, + "learning_rate": 0.00019899041645312, + "loss": 1.1384, + "step": 1944 + }, + { + "epoch": 0.19, + "grad_norm": 0.2541735443801485, + "learning_rate": 0.0001989881729868011, + "loss": 1.0676, + "step": 1945 + }, + { + "epoch": 0.19, + "grad_norm": 0.2500138203165472, + "learning_rate": 0.00019898592704324094, + "loss": 1.0983, + "step": 1946 + }, + { + "epoch": 0.19, + "grad_norm": 0.2593377151457589, + "learning_rate": 0.00019898367862249575, + "loss": 1.0257, + "step": 1947 + }, + { + "epoch": 0.19, + "grad_norm": 0.2495745540042679, + "learning_rate": 0.00019898142772462182, + "loss": 1.0384, + "step": 1948 + }, + { + "epoch": 0.19, + "grad_norm": 0.2969538903134302, + "learning_rate": 0.00019897917434967544, + "loss": 1.1127, + "step": 1949 + }, + { + "epoch": 0.19, + "grad_norm": 0.26632242247343, + "learning_rate": 0.00019897691849771301, + "loss": 1.1186, + "step": 1950 + }, + { + "epoch": 0.19, + "grad_norm": 0.29353632758131, + "learning_rate": 0.00019897466016879098, + "loss": 1.0999, + "step": 1951 + }, + { + "epoch": 0.19, + "grad_norm": 0.2888464592603483, + "learning_rate": 0.00019897239936296588, + "loss": 1.0546, + "step": 1952 + }, + { + "epoch": 0.19, + "grad_norm": 0.2576028803170492, + "learning_rate": 0.00019897013608029428, + "loss": 1.0409, + "step": 1953 + }, + { + "epoch": 0.19, + "grad_norm": 0.310447192134622, + "learning_rate": 0.00019896787032083285, + "loss": 1.1755, + "step": 1954 + }, + { + "epoch": 0.19, + "grad_norm": 0.28394813161935395, + "learning_rate": 0.00019896560208463825, + "loss": 1.071, + "step": 1955 + }, + { + "epoch": 0.19, + "grad_norm": 0.23857662880995645, + "learning_rate": 0.00019896333137176726, + "loss": 0.9972, + "step": 1956 + }, + { + "epoch": 0.19, + "grad_norm": 0.27471986726786446, + "learning_rate": 0.00019896105818227673, + "loss": 1.1453, + "step": 1957 + }, + { + "epoch": 0.19, + "grad_norm": 0.26890257495666114, + "learning_rate": 0.00019895878251622348, + "loss": 1.0331, + "step": 1958 + }, + { + "epoch": 0.19, + "grad_norm": 0.2663478542354145, + "learning_rate": 0.00019895650437366452, + "loss": 1.0474, + "step": 1959 + }, + { + "epoch": 0.19, + "grad_norm": 0.2604279634361445, + "learning_rate": 0.00019895422375465686, + "loss": 1.1096, + "step": 1960 + }, + { + "epoch": 0.19, + "grad_norm": 0.2606384507083339, + "learning_rate": 0.00019895194065925754, + "loss": 1.0248, + "step": 1961 + }, + { + "epoch": 0.19, + "grad_norm": 0.27935688700196437, + "learning_rate": 0.00019894965508752375, + "loss": 1.2211, + "step": 1962 + }, + { + "epoch": 0.19, + "grad_norm": 0.2745748469246835, + "learning_rate": 0.00019894736703951263, + "loss": 1.0072, + "step": 1963 + }, + { + "epoch": 0.19, + "grad_norm": 0.23091984266360946, + "learning_rate": 0.00019894507651528148, + "loss": 1.043, + "step": 1964 + }, + { + "epoch": 0.19, + "grad_norm": 0.22878065896236086, + "learning_rate": 0.00019894278351488757, + "loss": 1.0798, + "step": 1965 + }, + { + "epoch": 0.19, + "grad_norm": 0.2554600035367144, + "learning_rate": 0.00019894048803838834, + "loss": 1.1627, + "step": 1966 + }, + { + "epoch": 0.19, + "grad_norm": 0.2737920223861705, + "learning_rate": 0.00019893819008584123, + "loss": 1.1778, + "step": 1967 + }, + { + "epoch": 0.19, + "grad_norm": 0.2708368675723523, + "learning_rate": 0.0001989358896573037, + "loss": 0.9277, + "step": 1968 + }, + { + "epoch": 0.19, + "grad_norm": 0.24550243352794962, + "learning_rate": 0.00019893358675283337, + "loss": 1.1226, + "step": 1969 + }, + { + "epoch": 0.19, + "grad_norm": 0.2642424699796039, + "learning_rate": 0.00019893128137248787, + "loss": 1.1078, + "step": 1970 + }, + { + "epoch": 0.19, + "grad_norm": 0.2573419222534839, + "learning_rate": 0.00019892897351632484, + "loss": 1.2793, + "step": 1971 + }, + { + "epoch": 0.19, + "grad_norm": 0.27224735814088064, + "learning_rate": 0.00019892666318440213, + "loss": 1.0788, + "step": 1972 + }, + { + "epoch": 0.19, + "grad_norm": 0.2766376630314068, + "learning_rate": 0.00019892435037677746, + "loss": 1.1132, + "step": 1973 + }, + { + "epoch": 0.19, + "grad_norm": 0.3067049268603965, + "learning_rate": 0.00019892203509350875, + "loss": 1.0906, + "step": 1974 + }, + { + "epoch": 0.19, + "grad_norm": 0.27698131540064885, + "learning_rate": 0.00019891971733465395, + "loss": 1.1791, + "step": 1975 + }, + { + "epoch": 0.19, + "grad_norm": 0.2874873201823343, + "learning_rate": 0.00019891739710027105, + "loss": 1.1604, + "step": 1976 + }, + { + "epoch": 0.19, + "grad_norm": 0.28015375821188654, + "learning_rate": 0.00019891507439041814, + "loss": 1.1313, + "step": 1977 + }, + { + "epoch": 0.19, + "grad_norm": 0.2690366043454793, + "learning_rate": 0.0001989127492051533, + "loss": 1.199, + "step": 1978 + }, + { + "epoch": 0.19, + "grad_norm": 0.23644122244331564, + "learning_rate": 0.00019891042154453477, + "loss": 1.0604, + "step": 1979 + }, + { + "epoch": 0.19, + "grad_norm": 0.23968738721131738, + "learning_rate": 0.00019890809140862077, + "loss": 1.0409, + "step": 1980 + }, + { + "epoch": 0.19, + "grad_norm": 0.24461539048772374, + "learning_rate": 0.0001989057587974696, + "loss": 1.103, + "step": 1981 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826392500904235, + "learning_rate": 0.0001989034237111397, + "loss": 1.1352, + "step": 1982 + }, + { + "epoch": 0.19, + "grad_norm": 0.2772470869290075, + "learning_rate": 0.0001989010861496894, + "loss": 1.1124, + "step": 1983 + }, + { + "epoch": 0.19, + "grad_norm": 0.2848620471084776, + "learning_rate": 0.00019889874611317732, + "loss": 1.0845, + "step": 1984 + }, + { + "epoch": 0.19, + "grad_norm": 0.2797268047479781, + "learning_rate": 0.00019889640360166194, + "loss": 1.1135, + "step": 1985 + }, + { + "epoch": 0.19, + "grad_norm": 0.2569875278431773, + "learning_rate": 0.00019889405861520188, + "loss": 1.1096, + "step": 1986 + }, + { + "epoch": 0.19, + "grad_norm": 0.2617687295928765, + "learning_rate": 0.0001988917111538559, + "loss": 1.0682, + "step": 1987 + }, + { + "epoch": 0.19, + "grad_norm": 0.28290160215390237, + "learning_rate": 0.00019888936121768266, + "loss": 1.1322, + "step": 1988 + }, + { + "epoch": 0.19, + "grad_norm": 0.2626538158195342, + "learning_rate": 0.00019888700880674103, + "loss": 1.1404, + "step": 1989 + }, + { + "epoch": 0.19, + "grad_norm": 0.27468104620820544, + "learning_rate": 0.00019888465392108986, + "loss": 1.211, + "step": 1990 + }, + { + "epoch": 0.19, + "grad_norm": 0.2684528257690631, + "learning_rate": 0.00019888229656078808, + "loss": 1.086, + "step": 1991 + }, + { + "epoch": 0.19, + "grad_norm": 0.2701652519028749, + "learning_rate": 0.00019887993672589466, + "loss": 1.1998, + "step": 1992 + }, + { + "epoch": 0.19, + "grad_norm": 0.26483806146239974, + "learning_rate": 0.00019887757441646868, + "loss": 1.0015, + "step": 1993 + }, + { + "epoch": 0.19, + "grad_norm": 0.25776041537869526, + "learning_rate": 0.00019887520963256927, + "loss": 1.1646, + "step": 1994 + }, + { + "epoch": 0.19, + "grad_norm": 0.2857416415267135, + "learning_rate": 0.00019887284237425558, + "loss": 1.1295, + "step": 1995 + }, + { + "epoch": 0.19, + "grad_norm": 0.2720535216646917, + "learning_rate": 0.00019887047264158692, + "loss": 1.0362, + "step": 1996 + }, + { + "epoch": 0.19, + "grad_norm": 0.2556899383886345, + "learning_rate": 0.0001988681004346225, + "loss": 1.239, + "step": 1997 + }, + { + "epoch": 0.19, + "grad_norm": 0.29555721414719655, + "learning_rate": 0.00019886572575342174, + "loss": 1.1347, + "step": 1998 + }, + { + "epoch": 0.19, + "grad_norm": 0.28921149656501194, + "learning_rate": 0.00019886334859804406, + "loss": 1.1826, + "step": 1999 + }, + { + "epoch": 0.19, + "grad_norm": 0.274012668267724, + "learning_rate": 0.00019886096896854896, + "loss": 1.0865, + "step": 2000 + }, + { + "epoch": 0.19, + "grad_norm": 0.2422334744946127, + "learning_rate": 0.00019885858686499594, + "loss": 1.0813, + "step": 2001 + }, + { + "epoch": 0.19, + "grad_norm": 0.30780112818652483, + "learning_rate": 0.00019885620228744468, + "loss": 1.0997, + "step": 2002 + }, + { + "epoch": 0.19, + "grad_norm": 0.24632043466385958, + "learning_rate": 0.00019885381523595484, + "loss": 1.0984, + "step": 2003 + }, + { + "epoch": 0.19, + "grad_norm": 0.27662537602831366, + "learning_rate": 0.00019885142571058614, + "loss": 1.1465, + "step": 2004 + }, + { + "epoch": 0.19, + "grad_norm": 0.24285561219836202, + "learning_rate": 0.00019884903371139838, + "loss": 1.0971, + "step": 2005 + }, + { + "epoch": 0.19, + "grad_norm": 0.24656395139808884, + "learning_rate": 0.00019884663923845142, + "loss": 1.143, + "step": 2006 + }, + { + "epoch": 0.19, + "grad_norm": 0.2629422784962355, + "learning_rate": 0.0001988442422918052, + "loss": 1.1407, + "step": 2007 + }, + { + "epoch": 0.19, + "grad_norm": 0.2811274440605523, + "learning_rate": 0.0001988418428715197, + "loss": 1.159, + "step": 2008 + }, + { + "epoch": 0.19, + "grad_norm": 0.28599512163427754, + "learning_rate": 0.00019883944097765497, + "loss": 1.2293, + "step": 2009 + }, + { + "epoch": 0.19, + "grad_norm": 0.2637511197971286, + "learning_rate": 0.0001988370366102711, + "loss": 1.0897, + "step": 2010 + }, + { + "epoch": 0.19, + "grad_norm": 0.2621878190951607, + "learning_rate": 0.00019883462976942826, + "loss": 1.0737, + "step": 2011 + }, + { + "epoch": 0.19, + "grad_norm": 0.2762514383867885, + "learning_rate": 0.0001988322204551867, + "loss": 1.1701, + "step": 2012 + }, + { + "epoch": 0.19, + "grad_norm": 0.24133383394031283, + "learning_rate": 0.00019882980866760673, + "loss": 1.0147, + "step": 2013 + }, + { + "epoch": 0.19, + "grad_norm": 0.26354770911760284, + "learning_rate": 0.00019882739440674863, + "loss": 1.1734, + "step": 2014 + }, + { + "epoch": 0.19, + "grad_norm": 0.28989400649087416, + "learning_rate": 0.00019882497767267294, + "loss": 0.9902, + "step": 2015 + }, + { + "epoch": 0.19, + "grad_norm": 0.27932170163937037, + "learning_rate": 0.00019882255846544005, + "loss": 1.1016, + "step": 2016 + }, + { + "epoch": 0.19, + "grad_norm": 0.25887068615609143, + "learning_rate": 0.00019882013678511052, + "loss": 1.0908, + "step": 2017 + }, + { + "epoch": 0.19, + "grad_norm": 0.28911454498855693, + "learning_rate": 0.000198817712631745, + "loss": 1.1229, + "step": 2018 + }, + { + "epoch": 0.19, + "grad_norm": 0.2546367208219804, + "learning_rate": 0.00019881528600540404, + "loss": 1.1906, + "step": 2019 + }, + { + "epoch": 0.19, + "grad_norm": 0.2736344196999536, + "learning_rate": 0.0001988128569061485, + "loss": 1.1621, + "step": 2020 + }, + { + "epoch": 0.19, + "grad_norm": 0.2818744152091652, + "learning_rate": 0.0001988104253340391, + "loss": 1.0773, + "step": 2021 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826631297854352, + "learning_rate": 0.00019880799128913672, + "loss": 1.043, + "step": 2022 + }, + { + "epoch": 0.19, + "grad_norm": 0.22842355541316772, + "learning_rate": 0.00019880555477150223, + "loss": 0.8834, + "step": 2023 + }, + { + "epoch": 0.19, + "grad_norm": 0.3137419246856175, + "learning_rate": 0.00019880311578119667, + "loss": 1.1337, + "step": 2024 + }, + { + "epoch": 0.19, + "grad_norm": 0.2733714490698032, + "learning_rate": 0.00019880067431828102, + "loss": 1.183, + "step": 2025 + }, + { + "epoch": 0.19, + "grad_norm": 0.26580523535512374, + "learning_rate": 0.00019879823038281642, + "loss": 1.121, + "step": 2026 + }, + { + "epoch": 0.19, + "grad_norm": 0.26858458548658276, + "learning_rate": 0.000198795783974864, + "loss": 1.0527, + "step": 2027 + }, + { + "epoch": 0.19, + "grad_norm": 0.2763698286676362, + "learning_rate": 0.00019879333509448496, + "loss": 1.0042, + "step": 2028 + }, + { + "epoch": 0.19, + "grad_norm": 0.24269742693449786, + "learning_rate": 0.00019879088374174066, + "loss": 1.1615, + "step": 2029 + }, + { + "epoch": 0.19, + "grad_norm": 0.30073652024639724, + "learning_rate": 0.0001987884299166924, + "loss": 1.121, + "step": 2030 + }, + { + "epoch": 0.19, + "grad_norm": 0.2556748923041058, + "learning_rate": 0.00019878597361940161, + "loss": 0.8961, + "step": 2031 + }, + { + "epoch": 0.19, + "grad_norm": 0.2591634855819998, + "learning_rate": 0.00019878351484992974, + "loss": 1.1487, + "step": 2032 + }, + { + "epoch": 0.19, + "grad_norm": 0.2600632642281181, + "learning_rate": 0.00019878105360833832, + "loss": 1.1916, + "step": 2033 + }, + { + "epoch": 0.19, + "grad_norm": 0.28116821489552873, + "learning_rate": 0.00019877858989468894, + "loss": 1.0512, + "step": 2034 + }, + { + "epoch": 0.19, + "grad_norm": 0.20261666938146597, + "learning_rate": 0.0001987761237090433, + "loss": 1.1838, + "step": 2035 + }, + { + "epoch": 0.19, + "grad_norm": 0.29561896466102555, + "learning_rate": 0.00019877365505146304, + "loss": 1.0852, + "step": 2036 + }, + { + "epoch": 0.19, + "grad_norm": 0.29527708211041853, + "learning_rate": 0.00019877118392201, + "loss": 1.1186, + "step": 2037 + }, + { + "epoch": 0.19, + "grad_norm": 0.250583627663694, + "learning_rate": 0.00019876871032074603, + "loss": 1.045, + "step": 2038 + }, + { + "epoch": 0.2, + "grad_norm": 0.3097749132789377, + "learning_rate": 0.000198766234247733, + "loss": 1.1162, + "step": 2039 + }, + { + "epoch": 0.2, + "grad_norm": 0.2557816245578032, + "learning_rate": 0.0001987637557030329, + "loss": 1.0323, + "step": 2040 + }, + { + "epoch": 0.2, + "grad_norm": 0.28034027348239304, + "learning_rate": 0.00019876127468670772, + "loss": 1.1111, + "step": 2041 + }, + { + "epoch": 0.2, + "grad_norm": 0.27069634417343275, + "learning_rate": 0.00019875879119881957, + "loss": 1.0432, + "step": 2042 + }, + { + "epoch": 0.2, + "grad_norm": 0.2579239339061907, + "learning_rate": 0.00019875630523943062, + "loss": 1.0104, + "step": 2043 + }, + { + "epoch": 0.2, + "grad_norm": 0.28379405935029695, + "learning_rate": 0.00019875381680860304, + "loss": 1.1044, + "step": 2044 + }, + { + "epoch": 0.2, + "grad_norm": 0.25276024384054346, + "learning_rate": 0.00019875132590639917, + "loss": 1.0816, + "step": 2045 + }, + { + "epoch": 0.2, + "grad_norm": 0.2529581499654312, + "learning_rate": 0.00019874883253288126, + "loss": 1.0982, + "step": 2046 + }, + { + "epoch": 0.2, + "grad_norm": 0.27524747771098035, + "learning_rate": 0.00019874633668811177, + "loss": 1.1365, + "step": 2047 + }, + { + "epoch": 0.2, + "grad_norm": 0.28979122397869284, + "learning_rate": 0.00019874383837215314, + "loss": 1.1472, + "step": 2048 + }, + { + "epoch": 0.2, + "grad_norm": 0.26208338201886544, + "learning_rate": 0.00019874133758506792, + "loss": 1.0844, + "step": 2049 + }, + { + "epoch": 0.2, + "grad_norm": 0.23244493332423305, + "learning_rate": 0.00019873883432691868, + "loss": 1.0652, + "step": 2050 + }, + { + "epoch": 0.2, + "grad_norm": 0.26326298210214855, + "learning_rate": 0.000198736328597768, + "loss": 1.1114, + "step": 2051 + }, + { + "epoch": 0.2, + "grad_norm": 0.25775993234870526, + "learning_rate": 0.0001987338203976787, + "loss": 0.9868, + "step": 2052 + }, + { + "epoch": 0.2, + "grad_norm": 0.2558696856240754, + "learning_rate": 0.00019873130972671347, + "loss": 1.0485, + "step": 2053 + }, + { + "epoch": 0.2, + "grad_norm": 0.25810577145871305, + "learning_rate": 0.00019872879658493515, + "loss": 1.0948, + "step": 2054 + }, + { + "epoch": 0.2, + "grad_norm": 0.26764993308160495, + "learning_rate": 0.00019872628097240667, + "loss": 1.1752, + "step": 2055 + }, + { + "epoch": 0.2, + "grad_norm": 0.2844642341098131, + "learning_rate": 0.00019872376288919093, + "loss": 1.1397, + "step": 2056 + }, + { + "epoch": 0.2, + "grad_norm": 0.27934363473211593, + "learning_rate": 0.00019872124233535102, + "loss": 1.2909, + "step": 2057 + }, + { + "epoch": 0.2, + "grad_norm": 0.28305414844226917, + "learning_rate": 0.00019871871931094996, + "loss": 1.1307, + "step": 2058 + }, + { + "epoch": 0.2, + "grad_norm": 0.2547448994013944, + "learning_rate": 0.0001987161938160509, + "loss": 1.0631, + "step": 2059 + }, + { + "epoch": 0.2, + "grad_norm": 0.2457309364326485, + "learning_rate": 0.00019871366585071706, + "loss": 1.0862, + "step": 2060 + }, + { + "epoch": 0.2, + "grad_norm": 0.2892837695062467, + "learning_rate": 0.00019871113541501168, + "loss": 1.1139, + "step": 2061 + }, + { + "epoch": 0.2, + "grad_norm": 0.297033511165508, + "learning_rate": 0.0001987086025089981, + "loss": 1.158, + "step": 2062 + }, + { + "epoch": 0.2, + "grad_norm": 0.28710128200720647, + "learning_rate": 0.00019870606713273968, + "loss": 1.0244, + "step": 2063 + }, + { + "epoch": 0.2, + "grad_norm": 0.29989733418311626, + "learning_rate": 0.00019870352928629993, + "loss": 1.0888, + "step": 2064 + }, + { + "epoch": 0.2, + "grad_norm": 0.2698108293542839, + "learning_rate": 0.00019870098896974234, + "loss": 1.0765, + "step": 2065 + }, + { + "epoch": 0.2, + "grad_norm": 0.2833643780662451, + "learning_rate": 0.00019869844618313046, + "loss": 1.0251, + "step": 2066 + }, + { + "epoch": 0.2, + "grad_norm": 0.3349028897187998, + "learning_rate": 0.00019869590092652791, + "loss": 1.1527, + "step": 2067 + }, + { + "epoch": 0.2, + "grad_norm": 0.26736402567126216, + "learning_rate": 0.0001986933531999984, + "loss": 1.1222, + "step": 2068 + }, + { + "epoch": 0.2, + "grad_norm": 0.24629779305771238, + "learning_rate": 0.00019869080300360576, + "loss": 1.0542, + "step": 2069 + }, + { + "epoch": 0.2, + "grad_norm": 0.2659353555548773, + "learning_rate": 0.00019868825033741373, + "loss": 1.1196, + "step": 2070 + }, + { + "epoch": 0.2, + "grad_norm": 0.2880977765710642, + "learning_rate": 0.00019868569520148618, + "loss": 1.1662, + "step": 2071 + }, + { + "epoch": 0.2, + "grad_norm": 0.26777705281811776, + "learning_rate": 0.0001986831375958871, + "loss": 1.1153, + "step": 2072 + }, + { + "epoch": 0.2, + "grad_norm": 0.2446161703167875, + "learning_rate": 0.0001986805775206805, + "loss": 1.1845, + "step": 2073 + }, + { + "epoch": 0.2, + "grad_norm": 0.2873479752588241, + "learning_rate": 0.00019867801497593042, + "loss": 1.19, + "step": 2074 + }, + { + "epoch": 0.2, + "grad_norm": 0.2574173866976346, + "learning_rate": 0.000198675449961701, + "loss": 1.1004, + "step": 2075 + }, + { + "epoch": 0.2, + "grad_norm": 0.31037055152728826, + "learning_rate": 0.00019867288247805642, + "loss": 1.1266, + "step": 2076 + }, + { + "epoch": 0.2, + "grad_norm": 0.25230875618755544, + "learning_rate": 0.00019867031252506095, + "loss": 1.0861, + "step": 2077 + }, + { + "epoch": 0.2, + "grad_norm": 0.28027907388788925, + "learning_rate": 0.0001986677401027789, + "loss": 1.0899, + "step": 2078 + }, + { + "epoch": 0.2, + "grad_norm": 0.2629017191349244, + "learning_rate": 0.00019866516521127462, + "loss": 1.1268, + "step": 2079 + }, + { + "epoch": 0.2, + "grad_norm": 0.2858944012914975, + "learning_rate": 0.0001986625878506126, + "loss": 1.1248, + "step": 2080 + }, + { + "epoch": 0.2, + "grad_norm": 0.3010750896726883, + "learning_rate": 0.00019866000802085728, + "loss": 1.109, + "step": 2081 + }, + { + "epoch": 0.2, + "grad_norm": 0.27335673435624314, + "learning_rate": 0.0001986574257220733, + "loss": 1.0929, + "step": 2082 + }, + { + "epoch": 0.2, + "grad_norm": 0.25277983760592904, + "learning_rate": 0.0001986548409543252, + "loss": 1.0946, + "step": 2083 + }, + { + "epoch": 0.2, + "grad_norm": 0.2522955007250379, + "learning_rate": 0.00019865225371767773, + "loss": 1.1279, + "step": 2084 + }, + { + "epoch": 0.2, + "grad_norm": 0.2876473859106391, + "learning_rate": 0.00019864966401219559, + "loss": 1.044, + "step": 2085 + }, + { + "epoch": 0.2, + "grad_norm": 0.5481907916561444, + "learning_rate": 0.00019864707183794362, + "loss": 1.3456, + "step": 2086 + }, + { + "epoch": 0.2, + "grad_norm": 0.26281784160346944, + "learning_rate": 0.00019864447719498667, + "loss": 1.2029, + "step": 2087 + }, + { + "epoch": 0.2, + "grad_norm": 0.27656267392775247, + "learning_rate": 0.00019864188008338968, + "loss": 1.1244, + "step": 2088 + }, + { + "epoch": 0.2, + "grad_norm": 0.2712507307178155, + "learning_rate": 0.00019863928050321765, + "loss": 1.2326, + "step": 2089 + }, + { + "epoch": 0.2, + "grad_norm": 0.27024578206691424, + "learning_rate": 0.00019863667845453563, + "loss": 1.1642, + "step": 2090 + }, + { + "epoch": 0.2, + "grad_norm": 0.305876067098806, + "learning_rate": 0.00019863407393740876, + "loss": 1.2, + "step": 2091 + }, + { + "epoch": 0.2, + "grad_norm": 0.2665786067215833, + "learning_rate": 0.00019863146695190217, + "loss": 1.1217, + "step": 2092 + }, + { + "epoch": 0.2, + "grad_norm": 0.23808439538640014, + "learning_rate": 0.00019862885749808115, + "loss": 1.089, + "step": 2093 + }, + { + "epoch": 0.2, + "grad_norm": 0.27236917331109767, + "learning_rate": 0.00019862624557601103, + "loss": 1.1333, + "step": 2094 + }, + { + "epoch": 0.2, + "grad_norm": 0.26305710425726253, + "learning_rate": 0.00019862363118575705, + "loss": 1.1396, + "step": 2095 + }, + { + "epoch": 0.2, + "grad_norm": 0.2302881958735561, + "learning_rate": 0.00019862101432738475, + "loss": 1.0263, + "step": 2096 + }, + { + "epoch": 0.2, + "grad_norm": 0.2954108631439019, + "learning_rate": 0.0001986183950009596, + "loss": 1.1058, + "step": 2097 + }, + { + "epoch": 0.2, + "grad_norm": 0.2948895500433461, + "learning_rate": 0.00019861577320654712, + "loss": 1.1621, + "step": 2098 + }, + { + "epoch": 0.2, + "grad_norm": 0.24612354772654924, + "learning_rate": 0.00019861314894421294, + "loss": 1.1682, + "step": 2099 + }, + { + "epoch": 0.2, + "grad_norm": 0.2770485117180903, + "learning_rate": 0.00019861052221402275, + "loss": 1.0537, + "step": 2100 + }, + { + "epoch": 0.2, + "grad_norm": 0.2803003393924788, + "learning_rate": 0.00019860789301604222, + "loss": 1.1575, + "step": 2101 + }, + { + "epoch": 0.2, + "grad_norm": 0.263398275519541, + "learning_rate": 0.00019860526135033723, + "loss": 1.1161, + "step": 2102 + }, + { + "epoch": 0.2, + "grad_norm": 0.2735697531308213, + "learning_rate": 0.0001986026272169736, + "loss": 1.1304, + "step": 2103 + }, + { + "epoch": 0.2, + "grad_norm": 0.2837690209815238, + "learning_rate": 0.00019859999061601726, + "loss": 0.9939, + "step": 2104 + }, + { + "epoch": 0.2, + "grad_norm": 0.2611549781543971, + "learning_rate": 0.00019859735154753418, + "loss": 1.0968, + "step": 2105 + }, + { + "epoch": 0.2, + "grad_norm": 0.2858960886543411, + "learning_rate": 0.0001985947100115904, + "loss": 1.1623, + "step": 2106 + }, + { + "epoch": 0.2, + "grad_norm": 0.3657978801967696, + "learning_rate": 0.00019859206600825207, + "loss": 1.2114, + "step": 2107 + }, + { + "epoch": 0.2, + "grad_norm": 0.24528859351726237, + "learning_rate": 0.0001985894195375853, + "loss": 1.1096, + "step": 2108 + }, + { + "epoch": 0.2, + "grad_norm": 0.309781272595587, + "learning_rate": 0.00019858677059965632, + "loss": 1.1382, + "step": 2109 + }, + { + "epoch": 0.2, + "grad_norm": 0.3015108916795954, + "learning_rate": 0.0001985841191945315, + "loss": 1.0789, + "step": 2110 + }, + { + "epoch": 0.2, + "grad_norm": 0.27510018422236365, + "learning_rate": 0.0001985814653222771, + "loss": 1.1214, + "step": 2111 + }, + { + "epoch": 0.2, + "grad_norm": 0.2504556220073607, + "learning_rate": 0.0001985788089829596, + "loss": 1.1829, + "step": 2112 + }, + { + "epoch": 0.2, + "grad_norm": 0.27607247184581263, + "learning_rate": 0.00019857615017664543, + "loss": 1.2014, + "step": 2113 + }, + { + "epoch": 0.2, + "grad_norm": 0.28257879262143415, + "learning_rate": 0.00019857348890340117, + "loss": 1.1302, + "step": 2114 + }, + { + "epoch": 0.2, + "grad_norm": 0.2961265516298664, + "learning_rate": 0.0001985708251632934, + "loss": 1.0324, + "step": 2115 + }, + { + "epoch": 0.2, + "grad_norm": 0.2942307299808682, + "learning_rate": 0.00019856815895638876, + "loss": 1.0799, + "step": 2116 + }, + { + "epoch": 0.2, + "grad_norm": 0.2541933617332, + "learning_rate": 0.000198565490282754, + "loss": 1.1498, + "step": 2117 + }, + { + "epoch": 0.2, + "grad_norm": 0.28011641730308906, + "learning_rate": 0.0001985628191424559, + "loss": 1.1392, + "step": 2118 + }, + { + "epoch": 0.2, + "grad_norm": 0.2807759455450216, + "learning_rate": 0.0001985601455355613, + "loss": 1.1776, + "step": 2119 + }, + { + "epoch": 0.2, + "grad_norm": 0.32430654597893255, + "learning_rate": 0.00019855746946213714, + "loss": 1.1778, + "step": 2120 + }, + { + "epoch": 0.2, + "grad_norm": 0.2525816278621571, + "learning_rate": 0.00019855479092225037, + "loss": 1.1537, + "step": 2121 + }, + { + "epoch": 0.2, + "grad_norm": 0.26640266147857056, + "learning_rate": 0.00019855210991596796, + "loss": 1.096, + "step": 2122 + }, + { + "epoch": 0.2, + "grad_norm": 0.25884771414681745, + "learning_rate": 0.00019854942644335712, + "loss": 1.1562, + "step": 2123 + }, + { + "epoch": 0.2, + "grad_norm": 0.27617724462201587, + "learning_rate": 0.00019854674050448493, + "loss": 1.1385, + "step": 2124 + }, + { + "epoch": 0.2, + "grad_norm": 0.2919548651872331, + "learning_rate": 0.00019854405209941863, + "loss": 1.0791, + "step": 2125 + }, + { + "epoch": 0.2, + "grad_norm": 0.24993403620332835, + "learning_rate": 0.00019854136122822547, + "loss": 1.0431, + "step": 2126 + }, + { + "epoch": 0.2, + "grad_norm": 0.23940290308480794, + "learning_rate": 0.0001985386678909728, + "loss": 1.0944, + "step": 2127 + }, + { + "epoch": 0.2, + "grad_norm": 0.2852028804707256, + "learning_rate": 0.00019853597208772808, + "loss": 1.0735, + "step": 2128 + }, + { + "epoch": 0.2, + "grad_norm": 0.2799825280793891, + "learning_rate": 0.0001985332738185587, + "loss": 1.1108, + "step": 2129 + }, + { + "epoch": 0.2, + "grad_norm": 0.2803961566058768, + "learning_rate": 0.00019853057308353225, + "loss": 1.1428, + "step": 2130 + }, + { + "epoch": 0.2, + "grad_norm": 0.2681024606858511, + "learning_rate": 0.00019852786988271628, + "loss": 1.1777, + "step": 2131 + }, + { + "epoch": 0.2, + "grad_norm": 0.28971370065149094, + "learning_rate": 0.0001985251642161784, + "loss": 1.1166, + "step": 2132 + }, + { + "epoch": 0.2, + "grad_norm": 0.2809462172886824, + "learning_rate": 0.0001985224560839864, + "loss": 1.1337, + "step": 2133 + }, + { + "epoch": 0.2, + "grad_norm": 0.2662105547019178, + "learning_rate": 0.00019851974548620803, + "loss": 1.2131, + "step": 2134 + }, + { + "epoch": 0.2, + "grad_norm": 0.2689850661970803, + "learning_rate": 0.0001985170324229111, + "loss": 1.1857, + "step": 2135 + }, + { + "epoch": 0.2, + "grad_norm": 0.2831472805779883, + "learning_rate": 0.00019851431689416353, + "loss": 1.1575, + "step": 2136 + }, + { + "epoch": 0.2, + "grad_norm": 0.2877033555483126, + "learning_rate": 0.00019851159890003323, + "loss": 1.0868, + "step": 2137 + }, + { + "epoch": 0.2, + "grad_norm": 0.29781126767542937, + "learning_rate": 0.00019850887844058827, + "loss": 1.1535, + "step": 2138 + }, + { + "epoch": 0.2, + "grad_norm": 0.2528619996193506, + "learning_rate": 0.00019850615551589672, + "loss": 1.0632, + "step": 2139 + }, + { + "epoch": 0.2, + "grad_norm": 0.2605060917972941, + "learning_rate": 0.00019850343012602672, + "loss": 1.1709, + "step": 2140 + }, + { + "epoch": 0.2, + "grad_norm": 0.2773145775379898, + "learning_rate": 0.0001985007022710465, + "loss": 1.1957, + "step": 2141 + }, + { + "epoch": 0.2, + "grad_norm": 0.28927051493387645, + "learning_rate": 0.00019849797195102426, + "loss": 1.0608, + "step": 2142 + }, + { + "epoch": 0.21, + "grad_norm": 0.31331821900541645, + "learning_rate": 0.0001984952391660284, + "loss": 1.1371, + "step": 2143 + }, + { + "epoch": 0.21, + "grad_norm": 0.2785760487723139, + "learning_rate": 0.00019849250391612726, + "loss": 1.1553, + "step": 2144 + }, + { + "epoch": 0.21, + "grad_norm": 0.28557512665641493, + "learning_rate": 0.0001984897662013893, + "loss": 1.1505, + "step": 2145 + }, + { + "epoch": 0.21, + "grad_norm": 0.2640270431957491, + "learning_rate": 0.00019848702602188304, + "loss": 1.0196, + "step": 2146 + }, + { + "epoch": 0.21, + "grad_norm": 0.26301343020312196, + "learning_rate": 0.00019848428337767708, + "loss": 1.0716, + "step": 2147 + }, + { + "epoch": 0.21, + "grad_norm": 0.24955497958144957, + "learning_rate": 0.00019848153826884004, + "loss": 1.1068, + "step": 2148 + }, + { + "epoch": 0.21, + "grad_norm": 0.25592481094445924, + "learning_rate": 0.00019847879069544058, + "loss": 1.0493, + "step": 2149 + }, + { + "epoch": 0.21, + "grad_norm": 0.2690607872687816, + "learning_rate": 0.0001984760406575475, + "loss": 1.1645, + "step": 2150 + }, + { + "epoch": 0.21, + "grad_norm": 0.29603325449239903, + "learning_rate": 0.00019847328815522964, + "loss": 1.0333, + "step": 2151 + }, + { + "epoch": 0.21, + "grad_norm": 0.25742762041890327, + "learning_rate": 0.00019847053318855582, + "loss": 1.2017, + "step": 2152 + }, + { + "epoch": 0.21, + "grad_norm": 0.30645008656891, + "learning_rate": 0.00019846777575759504, + "loss": 1.1346, + "step": 2153 + }, + { + "epoch": 0.21, + "grad_norm": 0.27044205667054494, + "learning_rate": 0.00019846501586241627, + "loss": 1.097, + "step": 2154 + }, + { + "epoch": 0.21, + "grad_norm": 0.23220679441493658, + "learning_rate": 0.00019846225350308864, + "loss": 1.0664, + "step": 2155 + }, + { + "epoch": 0.21, + "grad_norm": 0.26546624754158665, + "learning_rate": 0.00019845948867968117, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 0.21, + "grad_norm": 0.2798970718045841, + "learning_rate": 0.00019845672139226316, + "loss": 1.0244, + "step": 2157 + }, + { + "epoch": 0.21, + "grad_norm": 0.2784787690728781, + "learning_rate": 0.00019845395164090382, + "loss": 1.1114, + "step": 2158 + }, + { + "epoch": 0.21, + "grad_norm": 0.24443020869424956, + "learning_rate": 0.00019845117942567244, + "loss": 1.1341, + "step": 2159 + }, + { + "epoch": 0.21, + "grad_norm": 0.244756739484968, + "learning_rate": 0.00019844840474663843, + "loss": 1.0807, + "step": 2160 + }, + { + "epoch": 0.21, + "grad_norm": 0.2702201314078314, + "learning_rate": 0.00019844562760387122, + "loss": 1.1269, + "step": 2161 + }, + { + "epoch": 0.21, + "grad_norm": 0.29077240998538223, + "learning_rate": 0.00019844284799744032, + "loss": 1.1688, + "step": 2162 + }, + { + "epoch": 0.21, + "grad_norm": 0.2683927419703879, + "learning_rate": 0.00019844006592741525, + "loss": 1.0173, + "step": 2163 + }, + { + "epoch": 0.21, + "grad_norm": 0.3109361300962534, + "learning_rate": 0.0001984372813938657, + "loss": 1.148, + "step": 2164 + }, + { + "epoch": 0.21, + "grad_norm": 0.2501468531327423, + "learning_rate": 0.00019843449439686128, + "loss": 1.1907, + "step": 2165 + }, + { + "epoch": 0.21, + "grad_norm": 0.2664858040953975, + "learning_rate": 0.0001984317049364718, + "loss": 1.1097, + "step": 2166 + }, + { + "epoch": 0.21, + "grad_norm": 0.2549104447589198, + "learning_rate": 0.00019842891301276704, + "loss": 1.0737, + "step": 2167 + }, + { + "epoch": 0.21, + "grad_norm": 0.2908504505180112, + "learning_rate": 0.00019842611862581685, + "loss": 1.0539, + "step": 2168 + }, + { + "epoch": 0.21, + "grad_norm": 0.2959941920542236, + "learning_rate": 0.00019842332177569122, + "loss": 1.1418, + "step": 2169 + }, + { + "epoch": 0.21, + "grad_norm": 0.26475147358616613, + "learning_rate": 0.00019842052246246008, + "loss": 1.0866, + "step": 2170 + }, + { + "epoch": 0.21, + "grad_norm": 0.28262317137702664, + "learning_rate": 0.0001984177206861935, + "loss": 1.1431, + "step": 2171 + }, + { + "epoch": 0.21, + "grad_norm": 0.2615494504849684, + "learning_rate": 0.00019841491644696164, + "loss": 1.1576, + "step": 2172 + }, + { + "epoch": 0.21, + "grad_norm": 0.27419165343889973, + "learning_rate": 0.00019841210974483464, + "loss": 1.1325, + "step": 2173 + }, + { + "epoch": 0.21, + "grad_norm": 0.24040329342282296, + "learning_rate": 0.0001984093005798827, + "loss": 1.1437, + "step": 2174 + }, + { + "epoch": 0.21, + "grad_norm": 0.29409408115598895, + "learning_rate": 0.00019840648895217623, + "loss": 1.1064, + "step": 2175 + }, + { + "epoch": 0.21, + "grad_norm": 0.2523665722905447, + "learning_rate": 0.00019840367486178548, + "loss": 1.07, + "step": 2176 + }, + { + "epoch": 0.21, + "grad_norm": 0.25402746178920604, + "learning_rate": 0.00019840085830878095, + "loss": 1.0573, + "step": 2177 + }, + { + "epoch": 0.21, + "grad_norm": 0.24450222623833068, + "learning_rate": 0.00019839803929323305, + "loss": 1.1127, + "step": 2178 + }, + { + "epoch": 0.21, + "grad_norm": 0.24554337014527297, + "learning_rate": 0.00019839521781521245, + "loss": 1.0781, + "step": 2179 + }, + { + "epoch": 0.21, + "grad_norm": 0.29644405786503714, + "learning_rate": 0.00019839239387478962, + "loss": 1.1072, + "step": 2180 + }, + { + "epoch": 0.21, + "grad_norm": 0.23532298664328116, + "learning_rate": 0.00019838956747203533, + "loss": 0.9529, + "step": 2181 + }, + { + "epoch": 0.21, + "grad_norm": 0.255567234762623, + "learning_rate": 0.00019838673860702027, + "loss": 1.2165, + "step": 2182 + }, + { + "epoch": 0.21, + "grad_norm": 0.2565392465769596, + "learning_rate": 0.00019838390727981527, + "loss": 1.049, + "step": 2183 + }, + { + "epoch": 0.21, + "grad_norm": 0.28869850717775036, + "learning_rate": 0.00019838107349049111, + "loss": 1.2043, + "step": 2184 + }, + { + "epoch": 0.21, + "grad_norm": 0.2717398071388341, + "learning_rate": 0.0001983782372391188, + "loss": 1.1689, + "step": 2185 + }, + { + "epoch": 0.21, + "grad_norm": 0.2714997167115452, + "learning_rate": 0.00019837539852576923, + "loss": 1.0412, + "step": 2186 + }, + { + "epoch": 0.21, + "grad_norm": 0.2528223715764014, + "learning_rate": 0.0001983725573505135, + "loss": 1.0636, + "step": 2187 + }, + { + "epoch": 0.21, + "grad_norm": 0.24457509080188328, + "learning_rate": 0.0001983697137134227, + "loss": 1.0427, + "step": 2188 + }, + { + "epoch": 0.21, + "grad_norm": 0.2647502073171626, + "learning_rate": 0.00019836686761456803, + "loss": 1.1109, + "step": 2189 + }, + { + "epoch": 0.21, + "grad_norm": 0.25621160412291943, + "learning_rate": 0.00019836401905402062, + "loss": 1.1426, + "step": 2190 + }, + { + "epoch": 0.21, + "grad_norm": 0.2875669800942636, + "learning_rate": 0.00019836116803185184, + "loss": 1.0843, + "step": 2191 + }, + { + "epoch": 0.21, + "grad_norm": 0.26793735322362255, + "learning_rate": 0.000198358314548133, + "loss": 1.2198, + "step": 2192 + }, + { + "epoch": 0.21, + "grad_norm": 0.30265207091393975, + "learning_rate": 0.00019835545860293551, + "loss": 0.9996, + "step": 2193 + }, + { + "epoch": 0.21, + "grad_norm": 0.25384247043679864, + "learning_rate": 0.0001983526001963309, + "loss": 1.1222, + "step": 2194 + }, + { + "epoch": 0.21, + "grad_norm": 0.25800061296353, + "learning_rate": 0.00019834973932839062, + "loss": 1.0905, + "step": 2195 + }, + { + "epoch": 0.21, + "grad_norm": 0.27769824178444574, + "learning_rate": 0.00019834687599918632, + "loss": 1.0538, + "step": 2196 + }, + { + "epoch": 0.21, + "grad_norm": 0.2772132245071213, + "learning_rate": 0.00019834401020878963, + "loss": 1.0624, + "step": 2197 + }, + { + "epoch": 0.21, + "grad_norm": 0.26363087570930127, + "learning_rate": 0.0001983411419572723, + "loss": 0.9887, + "step": 2198 + }, + { + "epoch": 0.21, + "grad_norm": 0.2815107276805014, + "learning_rate": 0.00019833827124470608, + "loss": 1.1811, + "step": 2199 + }, + { + "epoch": 0.21, + "grad_norm": 0.2785544171403854, + "learning_rate": 0.0001983353980711628, + "loss": 1.1437, + "step": 2200 + }, + { + "epoch": 0.21, + "grad_norm": 0.3011117821316356, + "learning_rate": 0.0001983325224367144, + "loss": 1.0398, + "step": 2201 + }, + { + "epoch": 0.21, + "grad_norm": 0.2445670553372607, + "learning_rate": 0.00019832964434143282, + "loss": 1.101, + "step": 2202 + }, + { + "epoch": 0.21, + "grad_norm": 0.25914004062255874, + "learning_rate": 0.00019832676378539005, + "loss": 1.1808, + "step": 2203 + }, + { + "epoch": 0.21, + "grad_norm": 0.2754672779595424, + "learning_rate": 0.00019832388076865826, + "loss": 1.0929, + "step": 2204 + }, + { + "epoch": 0.21, + "grad_norm": 0.2565507943348922, + "learning_rate": 0.00019832099529130959, + "loss": 1.0699, + "step": 2205 + }, + { + "epoch": 0.21, + "grad_norm": 0.3343174008427606, + "learning_rate": 0.00019831810735341618, + "loss": 1.0145, + "step": 2206 + }, + { + "epoch": 0.21, + "grad_norm": 0.2804796337948149, + "learning_rate": 0.00019831521695505035, + "loss": 1.0897, + "step": 2207 + }, + { + "epoch": 0.21, + "grad_norm": 0.2525345630451486, + "learning_rate": 0.00019831232409628445, + "loss": 1.0794, + "step": 2208 + }, + { + "epoch": 0.21, + "grad_norm": 0.24602578478108195, + "learning_rate": 0.0001983094287771908, + "loss": 1.1439, + "step": 2209 + }, + { + "epoch": 0.21, + "grad_norm": 0.26742569319862003, + "learning_rate": 0.00019830653099784195, + "loss": 1.1399, + "step": 2210 + }, + { + "epoch": 0.21, + "grad_norm": 0.30626650715688947, + "learning_rate": 0.00019830363075831037, + "loss": 1.2276, + "step": 2211 + }, + { + "epoch": 0.21, + "grad_norm": 0.2978427240509176, + "learning_rate": 0.00019830072805866866, + "loss": 1.215, + "step": 2212 + }, + { + "epoch": 0.21, + "grad_norm": 0.2795761909559458, + "learning_rate": 0.00019829782289898943, + "loss": 1.2044, + "step": 2213 + }, + { + "epoch": 0.21, + "grad_norm": 0.28310847050083876, + "learning_rate": 0.0001982949152793454, + "loss": 1.1433, + "step": 2214 + }, + { + "epoch": 0.21, + "grad_norm": 0.2796955122824297, + "learning_rate": 0.00019829200519980937, + "loss": 1.0606, + "step": 2215 + }, + { + "epoch": 0.21, + "grad_norm": 0.2528523703698838, + "learning_rate": 0.0001982890926604541, + "loss": 1.068, + "step": 2216 + }, + { + "epoch": 0.21, + "grad_norm": 0.2523940552498862, + "learning_rate": 0.00019828617766135255, + "loss": 1.0647, + "step": 2217 + }, + { + "epoch": 0.21, + "grad_norm": 0.2970231511295705, + "learning_rate": 0.0001982832602025776, + "loss": 1.2357, + "step": 2218 + }, + { + "epoch": 0.21, + "grad_norm": 0.27974233774133495, + "learning_rate": 0.00019828034028420232, + "loss": 1.0735, + "step": 2219 + }, + { + "epoch": 0.21, + "grad_norm": 0.269451164229955, + "learning_rate": 0.00019827741790629975, + "loss": 1.0784, + "step": 2220 + }, + { + "epoch": 0.21, + "grad_norm": 0.2658199878337128, + "learning_rate": 0.00019827449306894304, + "loss": 1.0841, + "step": 2221 + }, + { + "epoch": 0.21, + "grad_norm": 0.257731802421506, + "learning_rate": 0.00019827156577220537, + "loss": 1.2333, + "step": 2222 + }, + { + "epoch": 0.21, + "grad_norm": 0.32039613850942644, + "learning_rate": 0.00019826863601616, + "loss": 1.1436, + "step": 2223 + }, + { + "epoch": 0.21, + "grad_norm": 0.23336247900235474, + "learning_rate": 0.00019826570380088025, + "loss": 1.1719, + "step": 2224 + }, + { + "epoch": 0.21, + "grad_norm": 0.28395673225685364, + "learning_rate": 0.0001982627691264395, + "loss": 1.183, + "step": 2225 + }, + { + "epoch": 0.21, + "grad_norm": 0.2848669382303132, + "learning_rate": 0.00019825983199291122, + "loss": 1.1098, + "step": 2226 + }, + { + "epoch": 0.21, + "grad_norm": 0.27510914460004676, + "learning_rate": 0.0001982568924003689, + "loss": 1.1285, + "step": 2227 + }, + { + "epoch": 0.21, + "grad_norm": 0.2921265346308141, + "learning_rate": 0.00019825395034888605, + "loss": 1.1692, + "step": 2228 + }, + { + "epoch": 0.21, + "grad_norm": 0.2761631545730766, + "learning_rate": 0.00019825100583853637, + "loss": 1.1872, + "step": 2229 + }, + { + "epoch": 0.21, + "grad_norm": 0.2592595323871488, + "learning_rate": 0.00019824805886939353, + "loss": 1.0289, + "step": 2230 + }, + { + "epoch": 0.21, + "grad_norm": 0.29309277372861997, + "learning_rate": 0.00019824510944153125, + "loss": 1.1123, + "step": 2231 + }, + { + "epoch": 0.21, + "grad_norm": 0.27306258003376527, + "learning_rate": 0.00019824215755502337, + "loss": 1.1453, + "step": 2232 + }, + { + "epoch": 0.21, + "grad_norm": 0.263424668387752, + "learning_rate": 0.00019823920320994373, + "loss": 1.1002, + "step": 2233 + }, + { + "epoch": 0.21, + "grad_norm": 0.2938351047841059, + "learning_rate": 0.00019823624640636633, + "loss": 1.127, + "step": 2234 + }, + { + "epoch": 0.21, + "grad_norm": 0.24927930137531826, + "learning_rate": 0.0001982332871443651, + "loss": 1.0708, + "step": 2235 + }, + { + "epoch": 0.21, + "grad_norm": 0.2786877424279345, + "learning_rate": 0.00019823032542401413, + "loss": 1.0868, + "step": 2236 + }, + { + "epoch": 0.21, + "grad_norm": 0.2798063724399594, + "learning_rate": 0.00019822736124538754, + "loss": 1.1573, + "step": 2237 + }, + { + "epoch": 0.21, + "grad_norm": 0.2633597183766863, + "learning_rate": 0.00019822439460855947, + "loss": 1.1058, + "step": 2238 + }, + { + "epoch": 0.21, + "grad_norm": 0.24785877289941977, + "learning_rate": 0.00019822142551360422, + "loss": 1.0471, + "step": 2239 + }, + { + "epoch": 0.21, + "grad_norm": 0.2713868952406667, + "learning_rate": 0.00019821845396059606, + "loss": 1.0428, + "step": 2240 + }, + { + "epoch": 0.21, + "grad_norm": 0.2811381014767392, + "learning_rate": 0.0001982154799496094, + "loss": 1.0762, + "step": 2241 + }, + { + "epoch": 0.21, + "grad_norm": 0.2725029043198944, + "learning_rate": 0.00019821250348071856, + "loss": 1.1293, + "step": 2242 + }, + { + "epoch": 0.21, + "grad_norm": 0.2827100509016029, + "learning_rate": 0.00019820952455399814, + "loss": 1.1447, + "step": 2243 + }, + { + "epoch": 0.21, + "grad_norm": 0.2758022374658988, + "learning_rate": 0.00019820654316952263, + "loss": 1.1659, + "step": 2244 + }, + { + "epoch": 0.21, + "grad_norm": 0.2633396491372797, + "learning_rate": 0.00019820355932736666, + "loss": 1.0462, + "step": 2245 + }, + { + "epoch": 0.21, + "grad_norm": 0.2731226166685037, + "learning_rate": 0.00019820057302760488, + "loss": 0.9548, + "step": 2246 + }, + { + "epoch": 0.21, + "grad_norm": 0.25556114167089006, + "learning_rate": 0.00019819758427031206, + "loss": 1.2312, + "step": 2247 + }, + { + "epoch": 0.22, + "grad_norm": 0.28848997593382414, + "learning_rate": 0.00019819459305556297, + "loss": 1.0739, + "step": 2248 + }, + { + "epoch": 0.22, + "grad_norm": 0.23351405165987268, + "learning_rate": 0.0001981915993834325, + "loss": 1.0641, + "step": 2249 + }, + { + "epoch": 0.22, + "grad_norm": 0.26791227089364905, + "learning_rate": 0.00019818860325399552, + "loss": 1.1015, + "step": 2250 + }, + { + "epoch": 0.22, + "grad_norm": 0.2820213434051579, + "learning_rate": 0.00019818560466732706, + "loss": 1.063, + "step": 2251 + }, + { + "epoch": 0.22, + "grad_norm": 0.24527078074648306, + "learning_rate": 0.00019818260362350213, + "loss": 1.1702, + "step": 2252 + }, + { + "epoch": 0.22, + "grad_norm": 0.2720420411260554, + "learning_rate": 0.0001981796001225958, + "loss": 1.0912, + "step": 2253 + }, + { + "epoch": 0.22, + "grad_norm": 0.2713012314046693, + "learning_rate": 0.00019817659416468332, + "loss": 1.0524, + "step": 2254 + }, + { + "epoch": 0.22, + "grad_norm": 0.26924822640795093, + "learning_rate": 0.00019817358574983983, + "loss": 1.0871, + "step": 2255 + }, + { + "epoch": 0.22, + "grad_norm": 0.27363733386951783, + "learning_rate": 0.0001981705748781407, + "loss": 1.0598, + "step": 2256 + }, + { + "epoch": 0.22, + "grad_norm": 0.24189809697792714, + "learning_rate": 0.0001981675615496612, + "loss": 1.084, + "step": 2257 + }, + { + "epoch": 0.22, + "grad_norm": 0.2706773118754261, + "learning_rate": 0.0001981645457644768, + "loss": 1.0637, + "step": 2258 + }, + { + "epoch": 0.22, + "grad_norm": 0.27820113281091335, + "learning_rate": 0.00019816152752266292, + "loss": 1.1624, + "step": 2259 + }, + { + "epoch": 0.22, + "grad_norm": 0.23093374468477146, + "learning_rate": 0.00019815850682429516, + "loss": 1.1735, + "step": 2260 + }, + { + "epoch": 0.22, + "grad_norm": 0.26705391917092386, + "learning_rate": 0.00019815548366944904, + "loss": 1.049, + "step": 2261 + }, + { + "epoch": 0.22, + "grad_norm": 0.28355313423369083, + "learning_rate": 0.00019815245805820028, + "loss": 1.0949, + "step": 2262 + }, + { + "epoch": 0.22, + "grad_norm": 0.2395712965169708, + "learning_rate": 0.00019814942999062457, + "loss": 1.05, + "step": 2263 + }, + { + "epoch": 0.22, + "grad_norm": 0.285804478616941, + "learning_rate": 0.00019814639946679768, + "loss": 1.1369, + "step": 2264 + }, + { + "epoch": 0.22, + "grad_norm": 0.25061529704124386, + "learning_rate": 0.00019814336648679546, + "loss": 1.0655, + "step": 2265 + }, + { + "epoch": 0.22, + "grad_norm": 0.2909795934470434, + "learning_rate": 0.0001981403310506938, + "loss": 1.1807, + "step": 2266 + }, + { + "epoch": 0.22, + "grad_norm": 0.23297851873356334, + "learning_rate": 0.00019813729315856869, + "loss": 1.152, + "step": 2267 + }, + { + "epoch": 0.22, + "grad_norm": 0.25889655866668293, + "learning_rate": 0.00019813425281049613, + "loss": 1.1054, + "step": 2268 + }, + { + "epoch": 0.22, + "grad_norm": 0.23977654506120644, + "learning_rate": 0.00019813121000655223, + "loss": 1.1002, + "step": 2269 + }, + { + "epoch": 0.22, + "grad_norm": 0.2549715877517098, + "learning_rate": 0.00019812816474681314, + "loss": 1.057, + "step": 2270 + }, + { + "epoch": 0.22, + "grad_norm": 0.26247547673776234, + "learning_rate": 0.00019812511703135504, + "loss": 1.0619, + "step": 2271 + }, + { + "epoch": 0.22, + "grad_norm": 0.2867669044848128, + "learning_rate": 0.00019812206686025424, + "loss": 1.1794, + "step": 2272 + }, + { + "epoch": 0.22, + "grad_norm": 0.27304725331072943, + "learning_rate": 0.000198119014233587, + "loss": 1.1895, + "step": 2273 + }, + { + "epoch": 0.22, + "grad_norm": 0.25831673662414345, + "learning_rate": 0.00019811595915142979, + "loss": 1.088, + "step": 2274 + }, + { + "epoch": 0.22, + "grad_norm": 0.23021022674148214, + "learning_rate": 0.00019811290161385906, + "loss": 1.0841, + "step": 2275 + }, + { + "epoch": 0.22, + "grad_norm": 0.2575794159303839, + "learning_rate": 0.00019810984162095129, + "loss": 1.0906, + "step": 2276 + }, + { + "epoch": 0.22, + "grad_norm": 0.2609400179291492, + "learning_rate": 0.00019810677917278305, + "loss": 1.1717, + "step": 2277 + }, + { + "epoch": 0.22, + "grad_norm": 0.28398870543915045, + "learning_rate": 0.00019810371426943105, + "loss": 1.1347, + "step": 2278 + }, + { + "epoch": 0.22, + "grad_norm": 0.26309231855699067, + "learning_rate": 0.0001981006469109719, + "loss": 1.1804, + "step": 2279 + }, + { + "epoch": 0.22, + "grad_norm": 0.2902283356657511, + "learning_rate": 0.00019809757709748243, + "loss": 1.1167, + "step": 2280 + }, + { + "epoch": 0.22, + "grad_norm": 0.25748170041372764, + "learning_rate": 0.00019809450482903942, + "loss": 1.1476, + "step": 2281 + }, + { + "epoch": 0.22, + "grad_norm": 0.2667597093950612, + "learning_rate": 0.0001980914301057198, + "loss": 1.1277, + "step": 2282 + }, + { + "epoch": 0.22, + "grad_norm": 0.27836946786170796, + "learning_rate": 0.0001980883529276005, + "loss": 1.1525, + "step": 2283 + }, + { + "epoch": 0.22, + "grad_norm": 0.3083167991873422, + "learning_rate": 0.0001980852732947585, + "loss": 1.1216, + "step": 2284 + }, + { + "epoch": 0.22, + "grad_norm": 0.2676745480686396, + "learning_rate": 0.00019808219120727086, + "loss": 1.1328, + "step": 2285 + }, + { + "epoch": 0.22, + "grad_norm": 0.25527058852259726, + "learning_rate": 0.0001980791066652148, + "loss": 1.065, + "step": 2286 + }, + { + "epoch": 0.22, + "grad_norm": 0.28337351811282757, + "learning_rate": 0.00019807601966866746, + "loss": 1.1723, + "step": 2287 + }, + { + "epoch": 0.22, + "grad_norm": 0.27319098266987507, + "learning_rate": 0.00019807293021770604, + "loss": 1.0549, + "step": 2288 + }, + { + "epoch": 0.22, + "grad_norm": 0.30841736311542484, + "learning_rate": 0.00019806983831240795, + "loss": 1.1445, + "step": 2289 + }, + { + "epoch": 0.22, + "grad_norm": 0.2555510965247522, + "learning_rate": 0.0001980667439528505, + "loss": 1.1424, + "step": 2290 + }, + { + "epoch": 0.22, + "grad_norm": 0.2641571799003314, + "learning_rate": 0.00019806364713911116, + "loss": 1.033, + "step": 2291 + }, + { + "epoch": 0.22, + "grad_norm": 0.2838900082793651, + "learning_rate": 0.0001980605478712674, + "loss": 1.0774, + "step": 2292 + }, + { + "epoch": 0.22, + "grad_norm": 0.31407713147896055, + "learning_rate": 0.00019805744614939682, + "loss": 1.2683, + "step": 2293 + }, + { + "epoch": 0.22, + "grad_norm": 0.27082803879903133, + "learning_rate": 0.00019805434197357703, + "loss": 1.1711, + "step": 2294 + }, + { + "epoch": 0.22, + "grad_norm": 0.27007517574821516, + "learning_rate": 0.0001980512353438857, + "loss": 1.1142, + "step": 2295 + }, + { + "epoch": 0.22, + "grad_norm": 0.25200965101215933, + "learning_rate": 0.00019804812626040056, + "loss": 1.1365, + "step": 2296 + }, + { + "epoch": 0.22, + "grad_norm": 0.2482335861309017, + "learning_rate": 0.00019804501472319946, + "loss": 1.0387, + "step": 2297 + }, + { + "epoch": 0.22, + "grad_norm": 0.27093750047305093, + "learning_rate": 0.0001980419007323602, + "loss": 1.0562, + "step": 2298 + }, + { + "epoch": 0.22, + "grad_norm": 0.2823581467965368, + "learning_rate": 0.00019803878428796082, + "loss": 1.2542, + "step": 2299 + }, + { + "epoch": 0.22, + "grad_norm": 0.27114630287941716, + "learning_rate": 0.00019803566539007924, + "loss": 1.1863, + "step": 2300 + }, + { + "epoch": 0.22, + "grad_norm": 0.27533847009087203, + "learning_rate": 0.0001980325440387935, + "loss": 0.9211, + "step": 2301 + }, + { + "epoch": 0.22, + "grad_norm": 0.24736978230602902, + "learning_rate": 0.00019802942023418175, + "loss": 1.1474, + "step": 2302 + }, + { + "epoch": 0.22, + "grad_norm": 0.24528908482065118, + "learning_rate": 0.00019802629397632212, + "loss": 1.0203, + "step": 2303 + }, + { + "epoch": 0.22, + "grad_norm": 0.3102766690223985, + "learning_rate": 0.00019802316526529293, + "loss": 1.1166, + "step": 2304 + }, + { + "epoch": 0.22, + "grad_norm": 0.2645211126197188, + "learning_rate": 0.00019802003410117238, + "loss": 1.09, + "step": 2305 + }, + { + "epoch": 0.22, + "grad_norm": 0.24314341601375852, + "learning_rate": 0.0001980169004840389, + "loss": 1.067, + "step": 2306 + }, + { + "epoch": 0.22, + "grad_norm": 0.28901370914218866, + "learning_rate": 0.00019801376441397087, + "loss": 1.125, + "step": 2307 + }, + { + "epoch": 0.22, + "grad_norm": 0.22977734256634133, + "learning_rate": 0.00019801062589104676, + "loss": 1.1017, + "step": 2308 + }, + { + "epoch": 0.22, + "grad_norm": 0.2763909341056602, + "learning_rate": 0.00019800748491534517, + "loss": 1.1466, + "step": 2309 + }, + { + "epoch": 0.22, + "grad_norm": 0.2638965585187832, + "learning_rate": 0.00019800434148694468, + "loss": 1.0884, + "step": 2310 + }, + { + "epoch": 0.22, + "grad_norm": 0.25365372028597144, + "learning_rate": 0.00019800119560592393, + "loss": 1.1063, + "step": 2311 + }, + { + "epoch": 0.22, + "grad_norm": 0.2946212685821378, + "learning_rate": 0.0001979980472723617, + "loss": 1.0209, + "step": 2312 + }, + { + "epoch": 0.22, + "grad_norm": 0.28135433865494547, + "learning_rate": 0.00019799489648633675, + "loss": 1.1626, + "step": 2313 + }, + { + "epoch": 0.22, + "grad_norm": 0.3064233206742547, + "learning_rate": 0.00019799174324792787, + "loss": 1.1433, + "step": 2314 + }, + { + "epoch": 0.22, + "grad_norm": 0.2800872851032662, + "learning_rate": 0.00019798858755721405, + "loss": 1.039, + "step": 2315 + }, + { + "epoch": 0.22, + "grad_norm": 0.25898676357853834, + "learning_rate": 0.00019798542941427426, + "loss": 1.1401, + "step": 2316 + }, + { + "epoch": 0.22, + "grad_norm": 0.26001355286806555, + "learning_rate": 0.00019798226881918753, + "loss": 1.0741, + "step": 2317 + }, + { + "epoch": 0.22, + "grad_norm": 0.31195060460939816, + "learning_rate": 0.00019797910577203293, + "loss": 1.155, + "step": 2318 + }, + { + "epoch": 0.22, + "grad_norm": 0.2853360314799912, + "learning_rate": 0.00019797594027288963, + "loss": 1.1006, + "step": 2319 + }, + { + "epoch": 0.22, + "grad_norm": 0.2282844809122414, + "learning_rate": 0.00019797277232183684, + "loss": 1.0532, + "step": 2320 + }, + { + "epoch": 0.22, + "grad_norm": 0.28551517768089857, + "learning_rate": 0.00019796960191895385, + "loss": 1.0486, + "step": 2321 + }, + { + "epoch": 0.22, + "grad_norm": 0.2590261403859847, + "learning_rate": 0.00019796642906432004, + "loss": 1.0397, + "step": 2322 + }, + { + "epoch": 0.22, + "grad_norm": 0.23480527152036285, + "learning_rate": 0.0001979632537580147, + "loss": 1.0072, + "step": 2323 + }, + { + "epoch": 0.22, + "grad_norm": 0.26973107870500546, + "learning_rate": 0.00019796007600011742, + "loss": 1.1077, + "step": 2324 + }, + { + "epoch": 0.22, + "grad_norm": 0.26492201462618237, + "learning_rate": 0.0001979568957907077, + "loss": 1.1157, + "step": 2325 + }, + { + "epoch": 0.22, + "grad_norm": 0.279425391462633, + "learning_rate": 0.00019795371312986504, + "loss": 1.0199, + "step": 2326 + }, + { + "epoch": 0.22, + "grad_norm": 0.2836141793634087, + "learning_rate": 0.00019795052801766915, + "loss": 1.1172, + "step": 2327 + }, + { + "epoch": 0.22, + "grad_norm": 0.2544431831518797, + "learning_rate": 0.0001979473404541998, + "loss": 1.0867, + "step": 2328 + }, + { + "epoch": 0.22, + "grad_norm": 0.2402666227462647, + "learning_rate": 0.0001979441504395366, + "loss": 1.0139, + "step": 2329 + }, + { + "epoch": 0.22, + "grad_norm": 0.2503197635519736, + "learning_rate": 0.00019794095797375953, + "loss": 1.0556, + "step": 2330 + }, + { + "epoch": 0.22, + "grad_norm": 0.2581127509866034, + "learning_rate": 0.00019793776305694846, + "loss": 0.9451, + "step": 2331 + }, + { + "epoch": 0.22, + "grad_norm": 0.24403281634446966, + "learning_rate": 0.0001979345656891833, + "loss": 1.1031, + "step": 2332 + }, + { + "epoch": 0.22, + "grad_norm": 0.2532440703626388, + "learning_rate": 0.00019793136587054405, + "loss": 1.0738, + "step": 2333 + }, + { + "epoch": 0.22, + "grad_norm": 0.2597097391411967, + "learning_rate": 0.00019792816360111087, + "loss": 1.1359, + "step": 2334 + }, + { + "epoch": 0.22, + "grad_norm": 0.27991032105875546, + "learning_rate": 0.00019792495888096382, + "loss": 0.9373, + "step": 2335 + }, + { + "epoch": 0.22, + "grad_norm": 0.2713405323422737, + "learning_rate": 0.00019792175171018313, + "loss": 1.0818, + "step": 2336 + }, + { + "epoch": 0.22, + "grad_norm": 0.2942256218211258, + "learning_rate": 0.00019791854208884907, + "loss": 1.1304, + "step": 2337 + }, + { + "epoch": 0.22, + "grad_norm": 0.24670357822710387, + "learning_rate": 0.00019791533001704194, + "loss": 1.0164, + "step": 2338 + }, + { + "epoch": 0.22, + "grad_norm": 0.25797879496591175, + "learning_rate": 0.00019791211549484216, + "loss": 1.0922, + "step": 2339 + }, + { + "epoch": 0.22, + "grad_norm": 0.28874847418524446, + "learning_rate": 0.00019790889852233016, + "loss": 1.2126, + "step": 2340 + }, + { + "epoch": 0.22, + "grad_norm": 0.2831045631432414, + "learning_rate": 0.00019790567909958644, + "loss": 1.1913, + "step": 2341 + }, + { + "epoch": 0.22, + "grad_norm": 0.25052218126412557, + "learning_rate": 0.00019790245722669153, + "loss": 1.1242, + "step": 2342 + }, + { + "epoch": 0.22, + "grad_norm": 0.2980993557396919, + "learning_rate": 0.00019789923290372614, + "loss": 1.0499, + "step": 2343 + }, + { + "epoch": 0.22, + "grad_norm": 0.26321626908133683, + "learning_rate": 0.00019789600613077092, + "loss": 1.0864, + "step": 2344 + }, + { + "epoch": 0.22, + "grad_norm": 0.26596294842052304, + "learning_rate": 0.0001978927769079066, + "loss": 1.0538, + "step": 2345 + }, + { + "epoch": 0.22, + "grad_norm": 0.25422065022309154, + "learning_rate": 0.00019788954523521402, + "loss": 1.2115, + "step": 2346 + }, + { + "epoch": 0.22, + "grad_norm": 0.23598171182692104, + "learning_rate": 0.00019788631111277406, + "loss": 1.0686, + "step": 2347 + }, + { + "epoch": 0.22, + "grad_norm": 0.2847052874921601, + "learning_rate": 0.00019788307454066763, + "loss": 1.0641, + "step": 2348 + }, + { + "epoch": 0.22, + "grad_norm": 0.25357051014186355, + "learning_rate": 0.00019787983551897576, + "loss": 1.0484, + "step": 2349 + }, + { + "epoch": 0.22, + "grad_norm": 0.2411514518029808, + "learning_rate": 0.00019787659404777946, + "loss": 1.133, + "step": 2350 + }, + { + "epoch": 0.22, + "grad_norm": 0.25446854039811434, + "learning_rate": 0.0001978733501271599, + "loss": 1.1672, + "step": 2351 + }, + { + "epoch": 0.23, + "grad_norm": 0.26777410057209505, + "learning_rate": 0.00019787010375719826, + "loss": 1.138, + "step": 2352 + }, + { + "epoch": 0.23, + "grad_norm": 0.27808638420934434, + "learning_rate": 0.0001978668549379757, + "loss": 1.1782, + "step": 2353 + }, + { + "epoch": 0.23, + "grad_norm": 0.2592972824957828, + "learning_rate": 0.00019786360366957367, + "loss": 1.1013, + "step": 2354 + }, + { + "epoch": 0.23, + "grad_norm": 0.25628836959733703, + "learning_rate": 0.0001978603499520734, + "loss": 0.9967, + "step": 2355 + }, + { + "epoch": 0.23, + "grad_norm": 0.2636173075017777, + "learning_rate": 0.0001978570937855564, + "loss": 1.1131, + "step": 2356 + }, + { + "epoch": 0.23, + "grad_norm": 0.2965203958377406, + "learning_rate": 0.0001978538351701041, + "loss": 1.1412, + "step": 2357 + }, + { + "epoch": 0.23, + "grad_norm": 0.2551730645828324, + "learning_rate": 0.0001978505741057981, + "loss": 1.1347, + "step": 2358 + }, + { + "epoch": 0.23, + "grad_norm": 0.3173189097283464, + "learning_rate": 0.00019784731059271996, + "loss": 1.1354, + "step": 2359 + }, + { + "epoch": 0.23, + "grad_norm": 0.287566565828461, + "learning_rate": 0.00019784404463095144, + "loss": 1.0052, + "step": 2360 + }, + { + "epoch": 0.23, + "grad_norm": 0.24804179463343937, + "learning_rate": 0.00019784077622057416, + "loss": 1.1699, + "step": 2361 + }, + { + "epoch": 0.23, + "grad_norm": 0.3073578708099378, + "learning_rate": 0.00019783750536166993, + "loss": 1.1715, + "step": 2362 + }, + { + "epoch": 0.23, + "grad_norm": 0.22734700217911738, + "learning_rate": 0.0001978342320543207, + "loss": 0.9563, + "step": 2363 + }, + { + "epoch": 0.23, + "grad_norm": 0.2590258564685437, + "learning_rate": 0.0001978309562986083, + "loss": 1.0816, + "step": 2364 + }, + { + "epoch": 0.23, + "grad_norm": 0.27557855270603093, + "learning_rate": 0.00019782767809461475, + "loss": 1.077, + "step": 2365 + }, + { + "epoch": 0.23, + "grad_norm": 0.25603874246559705, + "learning_rate": 0.00019782439744242205, + "loss": 1.146, + "step": 2366 + }, + { + "epoch": 0.23, + "grad_norm": 0.2761244679810728, + "learning_rate": 0.00019782111434211235, + "loss": 1.107, + "step": 2367 + }, + { + "epoch": 0.23, + "grad_norm": 0.26568572466382623, + "learning_rate": 0.00019781782879376775, + "loss": 1.1306, + "step": 2368 + }, + { + "epoch": 0.23, + "grad_norm": 0.2723644753582144, + "learning_rate": 0.00019781454079747054, + "loss": 1.1705, + "step": 2369 + }, + { + "epoch": 0.23, + "grad_norm": 0.26902633260641967, + "learning_rate": 0.00019781125035330297, + "loss": 1.0754, + "step": 2370 + }, + { + "epoch": 0.23, + "grad_norm": 0.2711384423626026, + "learning_rate": 0.0001978079574613474, + "loss": 1.1202, + "step": 2371 + }, + { + "epoch": 0.23, + "grad_norm": 0.28204033368789444, + "learning_rate": 0.0001978046621216862, + "loss": 1.1706, + "step": 2372 + }, + { + "epoch": 0.23, + "grad_norm": 0.26064152152322406, + "learning_rate": 0.00019780136433440184, + "loss": 1.0596, + "step": 2373 + }, + { + "epoch": 0.23, + "grad_norm": 0.2907116807628749, + "learning_rate": 0.00019779806409957692, + "loss": 1.1855, + "step": 2374 + }, + { + "epoch": 0.23, + "grad_norm": 0.2739091926383725, + "learning_rate": 0.00019779476141729396, + "loss": 1.1093, + "step": 2375 + }, + { + "epoch": 0.23, + "grad_norm": 0.24773766770558472, + "learning_rate": 0.00019779145628763564, + "loss": 1.0423, + "step": 2376 + }, + { + "epoch": 0.23, + "grad_norm": 0.31039070403873764, + "learning_rate": 0.00019778814871068465, + "loss": 1.125, + "step": 2377 + }, + { + "epoch": 0.23, + "grad_norm": 0.25711704080946696, + "learning_rate": 0.0001977848386865238, + "loss": 1.1752, + "step": 2378 + }, + { + "epoch": 0.23, + "grad_norm": 0.24583579381340756, + "learning_rate": 0.0001977815262152359, + "loss": 1.0188, + "step": 2379 + }, + { + "epoch": 0.23, + "grad_norm": 0.28912054275819865, + "learning_rate": 0.00019777821129690387, + "loss": 1.0406, + "step": 2380 + }, + { + "epoch": 0.23, + "grad_norm": 0.27568971892193483, + "learning_rate": 0.0001977748939316106, + "loss": 1.1377, + "step": 2381 + }, + { + "epoch": 0.23, + "grad_norm": 0.2703312754633334, + "learning_rate": 0.0001977715741194392, + "loss": 1.0889, + "step": 2382 + }, + { + "epoch": 0.23, + "grad_norm": 0.3244168867345363, + "learning_rate": 0.00019776825186047268, + "loss": 1.2365, + "step": 2383 + }, + { + "epoch": 0.23, + "grad_norm": 0.26694652233140037, + "learning_rate": 0.00019776492715479428, + "loss": 0.9792, + "step": 2384 + }, + { + "epoch": 0.23, + "grad_norm": 0.25774010566880007, + "learning_rate": 0.00019776160000248706, + "loss": 1.0835, + "step": 2385 + }, + { + "epoch": 0.23, + "grad_norm": 0.24706317723666119, + "learning_rate": 0.0001977582704036344, + "loss": 1.0586, + "step": 2386 + }, + { + "epoch": 0.23, + "grad_norm": 0.2990804163988819, + "learning_rate": 0.00019775493835831959, + "loss": 1.0996, + "step": 2387 + }, + { + "epoch": 0.23, + "grad_norm": 0.25726590762789603, + "learning_rate": 0.00019775160386662597, + "loss": 1.117, + "step": 2388 + }, + { + "epoch": 0.23, + "grad_norm": 0.2650577582941275, + "learning_rate": 0.00019774826692863705, + "loss": 1.0128, + "step": 2389 + }, + { + "epoch": 0.23, + "grad_norm": 0.2668462355680707, + "learning_rate": 0.00019774492754443635, + "loss": 1.0804, + "step": 2390 + }, + { + "epoch": 0.23, + "grad_norm": 0.2992539843834909, + "learning_rate": 0.00019774158571410737, + "loss": 1.2372, + "step": 2391 + }, + { + "epoch": 0.23, + "grad_norm": 0.2864988762111991, + "learning_rate": 0.00019773824143773377, + "loss": 1.0984, + "step": 2392 + }, + { + "epoch": 0.23, + "grad_norm": 0.304882593233717, + "learning_rate": 0.00019773489471539926, + "loss": 1.1334, + "step": 2393 + }, + { + "epoch": 0.23, + "grad_norm": 0.28019107709650354, + "learning_rate": 0.00019773154554718762, + "loss": 1.0708, + "step": 2394 + }, + { + "epoch": 0.23, + "grad_norm": 0.22866457976217683, + "learning_rate": 0.00019772819393318262, + "loss": 1.0887, + "step": 2395 + }, + { + "epoch": 0.23, + "grad_norm": 0.26481598326652184, + "learning_rate": 0.00019772483987346812, + "loss": 1.1488, + "step": 2396 + }, + { + "epoch": 0.23, + "grad_norm": 0.27476032047452265, + "learning_rate": 0.0001977214833681281, + "loss": 1.0884, + "step": 2397 + }, + { + "epoch": 0.23, + "grad_norm": 0.2671685597445549, + "learning_rate": 0.00019771812441724652, + "loss": 1.021, + "step": 2398 + }, + { + "epoch": 0.23, + "grad_norm": 0.2777768402686966, + "learning_rate": 0.00019771476302090754, + "loss": 1.0786, + "step": 2399 + }, + { + "epoch": 0.23, + "grad_norm": 0.25286519332057394, + "learning_rate": 0.00019771139917919512, + "loss": 0.9957, + "step": 2400 + }, + { + "epoch": 0.23, + "grad_norm": 0.25899496107583564, + "learning_rate": 0.00019770803289219355, + "loss": 1.232, + "step": 2401 + }, + { + "epoch": 0.23, + "grad_norm": 0.2554893399049988, + "learning_rate": 0.00019770466415998706, + "loss": 1.1702, + "step": 2402 + }, + { + "epoch": 0.23, + "grad_norm": 0.26615765059825036, + "learning_rate": 0.00019770129298265994, + "loss": 1.1589, + "step": 2403 + }, + { + "epoch": 0.23, + "grad_norm": 0.2867731760890007, + "learning_rate": 0.00019769791936029657, + "loss": 1.0231, + "step": 2404 + }, + { + "epoch": 0.23, + "grad_norm": 0.25865612120055026, + "learning_rate": 0.00019769454329298134, + "loss": 1.0742, + "step": 2405 + }, + { + "epoch": 0.23, + "grad_norm": 0.27168406194912653, + "learning_rate": 0.00019769116478079876, + "loss": 1.11, + "step": 2406 + }, + { + "epoch": 0.23, + "grad_norm": 0.29872484561116197, + "learning_rate": 0.00019768778382383344, + "loss": 1.0637, + "step": 2407 + }, + { + "epoch": 0.23, + "grad_norm": 0.27164584328580743, + "learning_rate": 0.0001976844004221699, + "loss": 1.0909, + "step": 2408 + }, + { + "epoch": 0.23, + "grad_norm": 0.2739762871753536, + "learning_rate": 0.00019768101457589283, + "loss": 1.0961, + "step": 2409 + }, + { + "epoch": 0.23, + "grad_norm": 0.25357273039005795, + "learning_rate": 0.00019767762628508702, + "loss": 1.0625, + "step": 2410 + }, + { + "epoch": 0.23, + "grad_norm": 0.26504750469407357, + "learning_rate": 0.00019767423554983718, + "loss": 1.0843, + "step": 2411 + }, + { + "epoch": 0.23, + "grad_norm": 0.274182183625254, + "learning_rate": 0.00019767084237022823, + "loss": 1.144, + "step": 2412 + }, + { + "epoch": 0.23, + "grad_norm": 0.282485486193323, + "learning_rate": 0.00019766744674634508, + "loss": 1.1495, + "step": 2413 + }, + { + "epoch": 0.23, + "grad_norm": 0.24729551379445136, + "learning_rate": 0.00019766404867827269, + "loss": 1.1148, + "step": 2414 + }, + { + "epoch": 0.23, + "grad_norm": 0.23229018534328089, + "learning_rate": 0.00019766064816609607, + "loss": 1.0309, + "step": 2415 + }, + { + "epoch": 0.23, + "grad_norm": 0.252259979313865, + "learning_rate": 0.00019765724520990038, + "loss": 1.1207, + "step": 2416 + }, + { + "epoch": 0.23, + "grad_norm": 0.23700561147186552, + "learning_rate": 0.00019765383980977074, + "loss": 1.1039, + "step": 2417 + }, + { + "epoch": 0.23, + "grad_norm": 0.26556810321631696, + "learning_rate": 0.0001976504319657924, + "loss": 1.1749, + "step": 2418 + }, + { + "epoch": 0.23, + "grad_norm": 0.23143332904854963, + "learning_rate": 0.00019764702167805064, + "loss": 1.1775, + "step": 2419 + }, + { + "epoch": 0.23, + "grad_norm": 0.27696732231432797, + "learning_rate": 0.00019764360894663076, + "loss": 1.0399, + "step": 2420 + }, + { + "epoch": 0.23, + "grad_norm": 0.27237322850717344, + "learning_rate": 0.00019764019377161823, + "loss": 1.0703, + "step": 2421 + }, + { + "epoch": 0.23, + "grad_norm": 0.2535297853960644, + "learning_rate": 0.00019763677615309847, + "loss": 1.0938, + "step": 2422 + }, + { + "epoch": 0.23, + "grad_norm": 0.26018058934136795, + "learning_rate": 0.00019763335609115703, + "loss": 1.0601, + "step": 2423 + }, + { + "epoch": 0.23, + "grad_norm": 0.3439807499205691, + "learning_rate": 0.0001976299335858795, + "loss": 1.04, + "step": 2424 + }, + { + "epoch": 0.23, + "grad_norm": 0.24885428737327994, + "learning_rate": 0.0001976265086373515, + "loss": 1.0878, + "step": 2425 + }, + { + "epoch": 0.23, + "grad_norm": 0.2518594237134699, + "learning_rate": 0.0001976230812456588, + "loss": 1.1054, + "step": 2426 + }, + { + "epoch": 0.23, + "grad_norm": 0.27045653240229783, + "learning_rate": 0.0001976196514108871, + "loss": 0.9974, + "step": 2427 + }, + { + "epoch": 0.23, + "grad_norm": 0.27897290427116067, + "learning_rate": 0.0001976162191331223, + "loss": 1.1238, + "step": 2428 + }, + { + "epoch": 0.23, + "grad_norm": 0.26488970207228696, + "learning_rate": 0.00019761278441245023, + "loss": 1.0545, + "step": 2429 + }, + { + "epoch": 0.23, + "grad_norm": 0.2676698182906196, + "learning_rate": 0.00019760934724895692, + "loss": 1.1392, + "step": 2430 + }, + { + "epoch": 0.23, + "grad_norm": 0.2554504815510488, + "learning_rate": 0.00019760590764272834, + "loss": 1.1628, + "step": 2431 + }, + { + "epoch": 0.23, + "grad_norm": 0.23803751811537502, + "learning_rate": 0.0001976024655938506, + "loss": 1.1082, + "step": 2432 + }, + { + "epoch": 0.23, + "grad_norm": 0.2843747176664285, + "learning_rate": 0.00019759902110240977, + "loss": 1.0482, + "step": 2433 + }, + { + "epoch": 0.23, + "grad_norm": 0.23996308884375614, + "learning_rate": 0.00019759557416849214, + "loss": 1.0972, + "step": 2434 + }, + { + "epoch": 0.23, + "grad_norm": 0.26431108924886854, + "learning_rate": 0.00019759212479218393, + "loss": 1.1708, + "step": 2435 + }, + { + "epoch": 0.23, + "grad_norm": 0.26205638413878823, + "learning_rate": 0.0001975886729735714, + "loss": 1.1489, + "step": 2436 + }, + { + "epoch": 0.23, + "grad_norm": 0.26602482241211184, + "learning_rate": 0.00019758521871274107, + "loss": 1.1432, + "step": 2437 + }, + { + "epoch": 0.23, + "grad_norm": 0.2611287789955635, + "learning_rate": 0.00019758176200977928, + "loss": 1.0747, + "step": 2438 + }, + { + "epoch": 0.23, + "grad_norm": 0.2512098131069008, + "learning_rate": 0.00019757830286477258, + "loss": 1.1516, + "step": 2439 + }, + { + "epoch": 0.23, + "grad_norm": 0.300485183834668, + "learning_rate": 0.0001975748412778075, + "loss": 1.0355, + "step": 2440 + }, + { + "epoch": 0.23, + "grad_norm": 0.24888115485358228, + "learning_rate": 0.00019757137724897073, + "loss": 1.1842, + "step": 2441 + }, + { + "epoch": 0.23, + "grad_norm": 0.24322023911969542, + "learning_rate": 0.0001975679107783489, + "loss": 1.1434, + "step": 2442 + }, + { + "epoch": 0.23, + "grad_norm": 0.25278550537075395, + "learning_rate": 0.00019756444186602877, + "loss": 1.1414, + "step": 2443 + }, + { + "epoch": 0.23, + "grad_norm": 0.2629822500133049, + "learning_rate": 0.0001975609705120972, + "loss": 1.1144, + "step": 2444 + }, + { + "epoch": 0.23, + "grad_norm": 0.25330431801210734, + "learning_rate": 0.00019755749671664102, + "loss": 1.1006, + "step": 2445 + }, + { + "epoch": 0.23, + "grad_norm": 0.27277917321975415, + "learning_rate": 0.00019755402047974717, + "loss": 1.1218, + "step": 2446 + }, + { + "epoch": 0.23, + "grad_norm": 0.24954891050559894, + "learning_rate": 0.00019755054180150262, + "loss": 1.168, + "step": 2447 + }, + { + "epoch": 0.23, + "grad_norm": 0.24441850562430112, + "learning_rate": 0.00019754706068199446, + "loss": 0.9717, + "step": 2448 + }, + { + "epoch": 0.23, + "grad_norm": 0.26980275249400415, + "learning_rate": 0.00019754357712130984, + "loss": 1.0781, + "step": 2449 + }, + { + "epoch": 0.23, + "grad_norm": 0.2644012899065647, + "learning_rate": 0.00019754009111953586, + "loss": 1.2219, + "step": 2450 + }, + { + "epoch": 0.23, + "grad_norm": 0.28502445348284167, + "learning_rate": 0.00019753660267675982, + "loss": 1.1411, + "step": 2451 + }, + { + "epoch": 0.23, + "grad_norm": 0.24105312734962844, + "learning_rate": 0.000197533111793069, + "loss": 1.0395, + "step": 2452 + }, + { + "epoch": 0.23, + "grad_norm": 0.28402705301365877, + "learning_rate": 0.0001975296184685507, + "loss": 1.1438, + "step": 2453 + }, + { + "epoch": 0.23, + "grad_norm": 0.251590448441616, + "learning_rate": 0.00019752612270329247, + "loss": 1.0017, + "step": 2454 + }, + { + "epoch": 0.23, + "grad_norm": 0.27385681060181105, + "learning_rate": 0.0001975226244973817, + "loss": 1.2012, + "step": 2455 + }, + { + "epoch": 0.23, + "grad_norm": 0.24837745278204337, + "learning_rate": 0.000197519123850906, + "loss": 1.1111, + "step": 2456 + }, + { + "epoch": 0.24, + "grad_norm": 0.25263755603908267, + "learning_rate": 0.0001975156207639529, + "loss": 1.1382, + "step": 2457 + }, + { + "epoch": 0.24, + "grad_norm": 0.2913062612133686, + "learning_rate": 0.0001975121152366101, + "loss": 1.144, + "step": 2458 + }, + { + "epoch": 0.24, + "grad_norm": 0.279969123492799, + "learning_rate": 0.00019750860726896536, + "loss": 1.1385, + "step": 2459 + }, + { + "epoch": 0.24, + "grad_norm": 0.24114428531190943, + "learning_rate": 0.00019750509686110643, + "loss": 1.0758, + "step": 2460 + }, + { + "epoch": 0.24, + "grad_norm": 0.24865099555297668, + "learning_rate": 0.00019750158401312117, + "loss": 1.0137, + "step": 2461 + }, + { + "epoch": 0.24, + "grad_norm": 0.30049591909461865, + "learning_rate": 0.0001974980687250975, + "loss": 1.324, + "step": 2462 + }, + { + "epoch": 0.24, + "grad_norm": 0.2535091796023302, + "learning_rate": 0.00019749455099712332, + "loss": 1.1444, + "step": 2463 + }, + { + "epoch": 0.24, + "grad_norm": 0.24893191882319649, + "learning_rate": 0.00019749103082928682, + "loss": 0.8933, + "step": 2464 + }, + { + "epoch": 0.24, + "grad_norm": 0.2930991676695541, + "learning_rate": 0.00019748750822167594, + "loss": 1.0118, + "step": 2465 + }, + { + "epoch": 0.24, + "grad_norm": 0.2436760336989602, + "learning_rate": 0.00019748398317437894, + "loss": 1.0733, + "step": 2466 + }, + { + "epoch": 0.24, + "grad_norm": 0.2815078161913315, + "learning_rate": 0.00019748045568748396, + "loss": 1.1311, + "step": 2467 + }, + { + "epoch": 0.24, + "grad_norm": 0.2707645703704046, + "learning_rate": 0.00019747692576107935, + "loss": 1.1313, + "step": 2468 + }, + { + "epoch": 0.24, + "grad_norm": 0.2605533882651966, + "learning_rate": 0.00019747339339525337, + "loss": 1.0691, + "step": 2469 + }, + { + "epoch": 0.24, + "grad_norm": 0.25404802860541503, + "learning_rate": 0.00019746985859009448, + "loss": 1.1801, + "step": 2470 + }, + { + "epoch": 0.24, + "grad_norm": 0.2769596865608125, + "learning_rate": 0.00019746632134569114, + "loss": 1.0646, + "step": 2471 + }, + { + "epoch": 0.24, + "grad_norm": 0.28755610236991974, + "learning_rate": 0.0001974627816621318, + "loss": 1.0567, + "step": 2472 + }, + { + "epoch": 0.24, + "grad_norm": 0.2561327873358053, + "learning_rate": 0.00019745923953950516, + "loss": 1.1097, + "step": 2473 + }, + { + "epoch": 0.24, + "grad_norm": 0.27123359888401705, + "learning_rate": 0.00019745569497789975, + "loss": 1.0804, + "step": 2474 + }, + { + "epoch": 0.24, + "grad_norm": 0.3011360941789264, + "learning_rate": 0.00019745214797740437, + "loss": 0.9762, + "step": 2475 + }, + { + "epoch": 0.24, + "grad_norm": 0.23621703283724582, + "learning_rate": 0.00019744859853810772, + "loss": 1.2314, + "step": 2476 + }, + { + "epoch": 0.24, + "grad_norm": 0.2585948578333519, + "learning_rate": 0.00019744504666009864, + "loss": 1.1219, + "step": 2477 + }, + { + "epoch": 0.24, + "grad_norm": 0.23886114739760986, + "learning_rate": 0.00019744149234346604, + "loss": 1.0854, + "step": 2478 + }, + { + "epoch": 0.24, + "grad_norm": 0.27090142697493064, + "learning_rate": 0.00019743793558829885, + "loss": 1.1247, + "step": 2479 + }, + { + "epoch": 0.24, + "grad_norm": 0.26214502250978616, + "learning_rate": 0.00019743437639468606, + "loss": 1.0928, + "step": 2480 + }, + { + "epoch": 0.24, + "grad_norm": 0.26399673853102246, + "learning_rate": 0.00019743081476271675, + "loss": 1.229, + "step": 2481 + }, + { + "epoch": 0.24, + "grad_norm": 0.3147492977324059, + "learning_rate": 0.00019742725069248014, + "loss": 1.1473, + "step": 2482 + }, + { + "epoch": 0.24, + "grad_norm": 0.26237262208219775, + "learning_rate": 0.0001974236841840653, + "loss": 1.0796, + "step": 2483 + }, + { + "epoch": 0.24, + "grad_norm": 0.2556663951288371, + "learning_rate": 0.00019742011523756154, + "loss": 1.0103, + "step": 2484 + }, + { + "epoch": 0.24, + "grad_norm": 0.2780031870200213, + "learning_rate": 0.0001974165438530582, + "loss": 1.1362, + "step": 2485 + }, + { + "epoch": 0.24, + "grad_norm": 0.27186052495109936, + "learning_rate": 0.0001974129700306446, + "loss": 1.0304, + "step": 2486 + }, + { + "epoch": 0.24, + "grad_norm": 0.28105904722620484, + "learning_rate": 0.0001974093937704102, + "loss": 1.1844, + "step": 2487 + }, + { + "epoch": 0.24, + "grad_norm": 0.25863957400723964, + "learning_rate": 0.00019740581507244449, + "loss": 1.1119, + "step": 2488 + }, + { + "epoch": 0.24, + "grad_norm": 0.28394214465783835, + "learning_rate": 0.00019740223393683706, + "loss": 1.1119, + "step": 2489 + }, + { + "epoch": 0.24, + "grad_norm": 0.26485094517848373, + "learning_rate": 0.00019739865036367751, + "loss": 1.1412, + "step": 2490 + }, + { + "epoch": 0.24, + "grad_norm": 0.28470688440068453, + "learning_rate": 0.0001973950643530555, + "loss": 1.0705, + "step": 2491 + }, + { + "epoch": 0.24, + "grad_norm": 0.26331398514634713, + "learning_rate": 0.00019739147590506085, + "loss": 1.052, + "step": 2492 + }, + { + "epoch": 0.24, + "grad_norm": 0.25097699443135785, + "learning_rate": 0.00019738788501978325, + "loss": 1.0758, + "step": 2493 + }, + { + "epoch": 0.24, + "grad_norm": 0.24766896766086546, + "learning_rate": 0.00019738429169731262, + "loss": 1.0952, + "step": 2494 + }, + { + "epoch": 0.24, + "grad_norm": 0.28022175227433616, + "learning_rate": 0.00019738069593773893, + "loss": 1.0738, + "step": 2495 + }, + { + "epoch": 0.24, + "grad_norm": 0.2522932834715685, + "learning_rate": 0.0001973770977411521, + "loss": 1.1787, + "step": 2496 + }, + { + "epoch": 0.24, + "grad_norm": 0.2926767224083834, + "learning_rate": 0.0001973734971076422, + "loss": 1.1114, + "step": 2497 + }, + { + "epoch": 0.24, + "grad_norm": 0.2915842765013931, + "learning_rate": 0.00019736989403729935, + "loss": 1.0235, + "step": 2498 + }, + { + "epoch": 0.24, + "grad_norm": 0.2462394398490279, + "learning_rate": 0.0001973662885302137, + "loss": 1.0616, + "step": 2499 + }, + { + "epoch": 0.24, + "grad_norm": 0.2608600216455592, + "learning_rate": 0.00019736268058647547, + "loss": 1.2348, + "step": 2500 + }, + { + "epoch": 0.24, + "grad_norm": 0.27127694754468956, + "learning_rate": 0.000197359070206175, + "loss": 1.2243, + "step": 2501 + }, + { + "epoch": 0.24, + "grad_norm": 0.27181980704622394, + "learning_rate": 0.00019735545738940258, + "loss": 1.0685, + "step": 2502 + }, + { + "epoch": 0.24, + "grad_norm": 0.26463599414343286, + "learning_rate": 0.00019735184213624866, + "loss": 1.083, + "step": 2503 + }, + { + "epoch": 0.24, + "grad_norm": 0.2858932841784855, + "learning_rate": 0.00019734822444680372, + "loss": 1.1853, + "step": 2504 + }, + { + "epoch": 0.24, + "grad_norm": 0.29334417625161935, + "learning_rate": 0.00019734460432115826, + "loss": 1.149, + "step": 2505 + }, + { + "epoch": 0.24, + "grad_norm": 0.30013981243424426, + "learning_rate": 0.00019734098175940292, + "loss": 1.1454, + "step": 2506 + }, + { + "epoch": 0.24, + "grad_norm": 0.25388848360987426, + "learning_rate": 0.00019733735676162833, + "loss": 1.0328, + "step": 2507 + }, + { + "epoch": 0.24, + "grad_norm": 0.2911267465827057, + "learning_rate": 0.0001973337293279252, + "loss": 1.0848, + "step": 2508 + }, + { + "epoch": 0.24, + "grad_norm": 0.3184503213445064, + "learning_rate": 0.00019733009945838435, + "loss": 1.1812, + "step": 2509 + }, + { + "epoch": 0.24, + "grad_norm": 0.24860610501636035, + "learning_rate": 0.00019732646715309656, + "loss": 1.0944, + "step": 2510 + }, + { + "epoch": 0.24, + "grad_norm": 0.2523314374067835, + "learning_rate": 0.00019732283241215276, + "loss": 1.0563, + "step": 2511 + }, + { + "epoch": 0.24, + "grad_norm": 0.26493511638695577, + "learning_rate": 0.00019731919523564395, + "loss": 1.0797, + "step": 2512 + }, + { + "epoch": 0.24, + "grad_norm": 0.34257964691618786, + "learning_rate": 0.00019731555562366108, + "loss": 1.0442, + "step": 2513 + }, + { + "epoch": 0.24, + "grad_norm": 0.278969034097831, + "learning_rate": 0.0001973119135762953, + "loss": 1.085, + "step": 2514 + }, + { + "epoch": 0.24, + "grad_norm": 0.2821218023861396, + "learning_rate": 0.00019730826909363771, + "loss": 1.1072, + "step": 2515 + }, + { + "epoch": 0.24, + "grad_norm": 0.2550975254765698, + "learning_rate": 0.00019730462217577955, + "loss": 1.0796, + "step": 2516 + }, + { + "epoch": 0.24, + "grad_norm": 0.24932942154107635, + "learning_rate": 0.00019730097282281202, + "loss": 1.0744, + "step": 2517 + }, + { + "epoch": 0.24, + "grad_norm": 0.278741555046221, + "learning_rate": 0.00019729732103482652, + "loss": 1.2485, + "step": 2518 + }, + { + "epoch": 0.24, + "grad_norm": 0.28093405939217236, + "learning_rate": 0.0001972936668119144, + "loss": 1.107, + "step": 2519 + }, + { + "epoch": 0.24, + "grad_norm": 0.25703049143132695, + "learning_rate": 0.00019729001015416714, + "loss": 1.1391, + "step": 2520 + }, + { + "epoch": 0.24, + "grad_norm": 0.26010024870942694, + "learning_rate": 0.00019728635106167622, + "loss": 1.0808, + "step": 2521 + }, + { + "epoch": 0.24, + "grad_norm": 0.2640446473647651, + "learning_rate": 0.00019728268953453324, + "loss": 1.0537, + "step": 2522 + }, + { + "epoch": 0.24, + "grad_norm": 0.26949297336392625, + "learning_rate": 0.0001972790255728298, + "loss": 1.0032, + "step": 2523 + }, + { + "epoch": 0.24, + "grad_norm": 0.27078666739320834, + "learning_rate": 0.00019727535917665764, + "loss": 1.0726, + "step": 2524 + }, + { + "epoch": 0.24, + "grad_norm": 0.2270614158675183, + "learning_rate": 0.00019727169034610843, + "loss": 1.084, + "step": 2525 + }, + { + "epoch": 0.24, + "grad_norm": 0.2772909155669702, + "learning_rate": 0.00019726801908127403, + "loss": 1.0104, + "step": 2526 + }, + { + "epoch": 0.24, + "grad_norm": 0.26805528081435065, + "learning_rate": 0.00019726434538224638, + "loss": 1.0985, + "step": 2527 + }, + { + "epoch": 0.24, + "grad_norm": 0.3021364267615302, + "learning_rate": 0.00019726066924911732, + "loss": 1.1961, + "step": 2528 + }, + { + "epoch": 0.24, + "grad_norm": 0.2742459883930204, + "learning_rate": 0.0001972569906819789, + "loss": 0.8326, + "step": 2529 + }, + { + "epoch": 0.24, + "grad_norm": 0.2965953559002404, + "learning_rate": 0.00019725330968092315, + "loss": 1.107, + "step": 2530 + }, + { + "epoch": 0.24, + "grad_norm": 0.2684835136930391, + "learning_rate": 0.0001972496262460422, + "loss": 1.0751, + "step": 2531 + }, + { + "epoch": 0.24, + "grad_norm": 0.2718707210872007, + "learning_rate": 0.00019724594037742824, + "loss": 1.0515, + "step": 2532 + }, + { + "epoch": 0.24, + "grad_norm": 0.25725745750943513, + "learning_rate": 0.00019724225207517354, + "loss": 1.1485, + "step": 2533 + }, + { + "epoch": 0.24, + "grad_norm": 0.2890461500320697, + "learning_rate": 0.0001972385613393703, + "loss": 1.1419, + "step": 2534 + }, + { + "epoch": 0.24, + "grad_norm": 0.2440614465912653, + "learning_rate": 0.000197234868170111, + "loss": 1.0766, + "step": 2535 + }, + { + "epoch": 0.24, + "grad_norm": 0.2631890740774793, + "learning_rate": 0.00019723117256748802, + "loss": 1.0583, + "step": 2536 + }, + { + "epoch": 0.24, + "grad_norm": 0.2654794914552144, + "learning_rate": 0.0001972274745315938, + "loss": 1.103, + "step": 2537 + }, + { + "epoch": 0.24, + "grad_norm": 0.25772820159308557, + "learning_rate": 0.00019722377406252095, + "loss": 1.1091, + "step": 2538 + }, + { + "epoch": 0.24, + "grad_norm": 0.2759349478590902, + "learning_rate": 0.00019722007116036204, + "loss": 1.0997, + "step": 2539 + }, + { + "epoch": 0.24, + "grad_norm": 0.29182935407079413, + "learning_rate": 0.00019721636582520978, + "loss": 0.9975, + "step": 2540 + }, + { + "epoch": 0.24, + "grad_norm": 0.2706259485506027, + "learning_rate": 0.00019721265805715686, + "loss": 1.0848, + "step": 2541 + }, + { + "epoch": 0.24, + "grad_norm": 0.27778012967187665, + "learning_rate": 0.00019720894785629604, + "loss": 1.1421, + "step": 2542 + }, + { + "epoch": 0.24, + "grad_norm": 0.24224758551615272, + "learning_rate": 0.00019720523522272023, + "loss": 1.1094, + "step": 2543 + }, + { + "epoch": 0.24, + "grad_norm": 0.27897485799673416, + "learning_rate": 0.0001972015201565223, + "loss": 0.9736, + "step": 2544 + }, + { + "epoch": 0.24, + "grad_norm": 0.25755355735125685, + "learning_rate": 0.00019719780265779527, + "loss": 1.1512, + "step": 2545 + }, + { + "epoch": 0.24, + "grad_norm": 0.26839822619910775, + "learning_rate": 0.00019719408272663211, + "loss": 1.0875, + "step": 2546 + }, + { + "epoch": 0.24, + "grad_norm": 0.3230859730143896, + "learning_rate": 0.00019719036036312595, + "loss": 1.1185, + "step": 2547 + }, + { + "epoch": 0.24, + "grad_norm": 0.3000203530525595, + "learning_rate": 0.00019718663556736997, + "loss": 0.9699, + "step": 2548 + }, + { + "epoch": 0.24, + "grad_norm": 0.2673877046628615, + "learning_rate": 0.00019718290833945732, + "loss": 1.0584, + "step": 2549 + }, + { + "epoch": 0.24, + "grad_norm": 0.28638413313248523, + "learning_rate": 0.00019717917867948136, + "loss": 1.1215, + "step": 2550 + }, + { + "epoch": 0.24, + "grad_norm": 0.27932539137807955, + "learning_rate": 0.00019717544658753533, + "loss": 1.0187, + "step": 2551 + }, + { + "epoch": 0.24, + "grad_norm": 0.29577216947674934, + "learning_rate": 0.00019717171206371268, + "loss": 1.0782, + "step": 2552 + }, + { + "epoch": 0.24, + "grad_norm": 0.29132618922289466, + "learning_rate": 0.00019716797510810688, + "loss": 1.1716, + "step": 2553 + }, + { + "epoch": 0.24, + "grad_norm": 0.30978702723114215, + "learning_rate": 0.00019716423572081144, + "loss": 1.176, + "step": 2554 + }, + { + "epoch": 0.24, + "grad_norm": 0.26767419284947913, + "learning_rate": 0.0001971604939019199, + "loss": 0.9729, + "step": 2555 + }, + { + "epoch": 0.24, + "grad_norm": 0.24950178862061026, + "learning_rate": 0.000197156749651526, + "loss": 1.1032, + "step": 2556 + }, + { + "epoch": 0.24, + "grad_norm": 0.2676153474407372, + "learning_rate": 0.00019715300296972333, + "loss": 1.0011, + "step": 2557 + }, + { + "epoch": 0.24, + "grad_norm": 0.24233870876913635, + "learning_rate": 0.00019714925385660572, + "loss": 1.0203, + "step": 2558 + }, + { + "epoch": 0.24, + "grad_norm": 0.26922786191679665, + "learning_rate": 0.00019714550231226697, + "loss": 1.0598, + "step": 2559 + }, + { + "epoch": 0.24, + "grad_norm": 0.25818023471817164, + "learning_rate": 0.000197141748336801, + "loss": 1.1347, + "step": 2560 + }, + { + "epoch": 0.25, + "grad_norm": 0.2618807690441127, + "learning_rate": 0.00019713799193030166, + "loss": 1.1211, + "step": 2561 + }, + { + "epoch": 0.25, + "grad_norm": 0.29741704565126115, + "learning_rate": 0.00019713423309286309, + "loss": 1.0597, + "step": 2562 + }, + { + "epoch": 0.25, + "grad_norm": 0.28884125218560003, + "learning_rate": 0.00019713047182457928, + "loss": 1.0358, + "step": 2563 + }, + { + "epoch": 0.25, + "grad_norm": 0.26649025648335334, + "learning_rate": 0.00019712670812554434, + "loss": 1.162, + "step": 2564 + }, + { + "epoch": 0.25, + "grad_norm": 0.2947735363660406, + "learning_rate": 0.00019712294199585248, + "loss": 1.0586, + "step": 2565 + }, + { + "epoch": 0.25, + "grad_norm": 0.25677075754484635, + "learning_rate": 0.000197119173435598, + "loss": 1.1841, + "step": 2566 + }, + { + "epoch": 0.25, + "grad_norm": 0.2637205544668129, + "learning_rate": 0.00019711540244487515, + "loss": 1.0261, + "step": 2567 + }, + { + "epoch": 0.25, + "grad_norm": 0.24823082695362084, + "learning_rate": 0.0001971116290237783, + "loss": 1.0277, + "step": 2568 + }, + { + "epoch": 0.25, + "grad_norm": 0.25229952638255043, + "learning_rate": 0.0001971078531724019, + "loss": 1.0751, + "step": 2569 + }, + { + "epoch": 0.25, + "grad_norm": 0.30254391196407815, + "learning_rate": 0.00019710407489084047, + "loss": 1.1231, + "step": 2570 + }, + { + "epoch": 0.25, + "grad_norm": 0.3021948667516261, + "learning_rate": 0.00019710029417918854, + "loss": 1.0033, + "step": 2571 + }, + { + "epoch": 0.25, + "grad_norm": 0.23588733965722716, + "learning_rate": 0.00019709651103754067, + "loss": 1.1007, + "step": 2572 + }, + { + "epoch": 0.25, + "grad_norm": 0.24382613129524622, + "learning_rate": 0.00019709272546599164, + "loss": 1.0672, + "step": 2573 + }, + { + "epoch": 0.25, + "grad_norm": 0.2733353690652961, + "learning_rate": 0.00019708893746463613, + "loss": 1.1596, + "step": 2574 + }, + { + "epoch": 0.25, + "grad_norm": 0.26912681562116336, + "learning_rate": 0.00019708514703356894, + "loss": 1.1428, + "step": 2575 + }, + { + "epoch": 0.25, + "grad_norm": 0.2956110006161434, + "learning_rate": 0.00019708135417288491, + "loss": 1.0804, + "step": 2576 + }, + { + "epoch": 0.25, + "grad_norm": 0.2630689064650718, + "learning_rate": 0.000197077558882679, + "loss": 1.0586, + "step": 2577 + }, + { + "epoch": 0.25, + "grad_norm": 0.2689950863610693, + "learning_rate": 0.00019707376116304617, + "loss": 1.1105, + "step": 2578 + }, + { + "epoch": 0.25, + "grad_norm": 0.2910234057476756, + "learning_rate": 0.00019706996101408146, + "loss": 1.0988, + "step": 2579 + }, + { + "epoch": 0.25, + "grad_norm": 0.27854677846223636, + "learning_rate": 0.00019706615843587995, + "loss": 1.1806, + "step": 2580 + }, + { + "epoch": 0.25, + "grad_norm": 0.24191714015622726, + "learning_rate": 0.00019706235342853683, + "loss": 1.0358, + "step": 2581 + }, + { + "epoch": 0.25, + "grad_norm": 0.2654110236015743, + "learning_rate": 0.00019705854599214734, + "loss": 1.1535, + "step": 2582 + }, + { + "epoch": 0.25, + "grad_norm": 0.24454579378976135, + "learning_rate": 0.0001970547361268067, + "loss": 1.0781, + "step": 2583 + }, + { + "epoch": 0.25, + "grad_norm": 0.24819542572155914, + "learning_rate": 0.00019705092383261028, + "loss": 1.1531, + "step": 2584 + }, + { + "epoch": 0.25, + "grad_norm": 0.31712338038836446, + "learning_rate": 0.00019704710910965352, + "loss": 1.2182, + "step": 2585 + }, + { + "epoch": 0.25, + "grad_norm": 0.2770504057135462, + "learning_rate": 0.00019704329195803188, + "loss": 0.9587, + "step": 2586 + }, + { + "epoch": 0.25, + "grad_norm": 0.2381187276476534, + "learning_rate": 0.00019703947237784087, + "loss": 0.9862, + "step": 2587 + }, + { + "epoch": 0.25, + "grad_norm": 0.29302550861588544, + "learning_rate": 0.00019703565036917605, + "loss": 1.0806, + "step": 2588 + }, + { + "epoch": 0.25, + "grad_norm": 0.26005985628816053, + "learning_rate": 0.0001970318259321331, + "loss": 0.9799, + "step": 2589 + }, + { + "epoch": 0.25, + "grad_norm": 0.27882886267706336, + "learning_rate": 0.0001970279990668077, + "loss": 1.1686, + "step": 2590 + }, + { + "epoch": 0.25, + "grad_norm": 0.26205774855080516, + "learning_rate": 0.0001970241697732957, + "loss": 1.1234, + "step": 2591 + }, + { + "epoch": 0.25, + "grad_norm": 0.26668517374001244, + "learning_rate": 0.00019702033805169285, + "loss": 1.0884, + "step": 2592 + }, + { + "epoch": 0.25, + "grad_norm": 0.25715862286863395, + "learning_rate": 0.00019701650390209504, + "loss": 1.2244, + "step": 2593 + }, + { + "epoch": 0.25, + "grad_norm": 0.3058673752020222, + "learning_rate": 0.00019701266732459827, + "loss": 1.0458, + "step": 2594 + }, + { + "epoch": 0.25, + "grad_norm": 0.2529497741528999, + "learning_rate": 0.00019700882831929852, + "loss": 1.3244, + "step": 2595 + }, + { + "epoch": 0.25, + "grad_norm": 0.2943742754766903, + "learning_rate": 0.0001970049868862919, + "loss": 1.0521, + "step": 2596 + }, + { + "epoch": 0.25, + "grad_norm": 0.25522996903552037, + "learning_rate": 0.0001970011430256745, + "loss": 1.0895, + "step": 2597 + }, + { + "epoch": 0.25, + "grad_norm": 0.2671836556503752, + "learning_rate": 0.00019699729673754255, + "loss": 1.1154, + "step": 2598 + }, + { + "epoch": 0.25, + "grad_norm": 0.3007590432393228, + "learning_rate": 0.00019699344802199224, + "loss": 1.2871, + "step": 2599 + }, + { + "epoch": 0.25, + "grad_norm": 0.2991338944551984, + "learning_rate": 0.00019698959687911998, + "loss": 1.0738, + "step": 2600 + }, + { + "epoch": 0.25, + "grad_norm": 0.26091410208452626, + "learning_rate": 0.00019698574330902208, + "loss": 1.2443, + "step": 2601 + }, + { + "epoch": 0.25, + "grad_norm": 0.27563048614294183, + "learning_rate": 0.00019698188731179502, + "loss": 1.0806, + "step": 2602 + }, + { + "epoch": 0.25, + "grad_norm": 0.26585816626226483, + "learning_rate": 0.00019697802888753526, + "loss": 0.9916, + "step": 2603 + }, + { + "epoch": 0.25, + "grad_norm": 0.27522846970536974, + "learning_rate": 0.0001969741680363394, + "loss": 1.0998, + "step": 2604 + }, + { + "epoch": 0.25, + "grad_norm": 0.2602657313030757, + "learning_rate": 0.00019697030475830402, + "loss": 1.0419, + "step": 2605 + }, + { + "epoch": 0.25, + "grad_norm": 0.2453226312120246, + "learning_rate": 0.00019696643905352582, + "loss": 0.9631, + "step": 2606 + }, + { + "epoch": 0.25, + "grad_norm": 0.29471825123420187, + "learning_rate": 0.00019696257092210155, + "loss": 1.0666, + "step": 2607 + }, + { + "epoch": 0.25, + "grad_norm": 0.2902443561039826, + "learning_rate": 0.000196958700364128, + "loss": 1.0988, + "step": 2608 + }, + { + "epoch": 0.25, + "grad_norm": 0.2538912185129694, + "learning_rate": 0.00019695482737970202, + "loss": 1.1586, + "step": 2609 + }, + { + "epoch": 0.25, + "grad_norm": 0.28579871153918274, + "learning_rate": 0.0001969509519689206, + "loss": 1.0839, + "step": 2610 + }, + { + "epoch": 0.25, + "grad_norm": 0.23553918792552284, + "learning_rate": 0.00019694707413188062, + "loss": 1.0165, + "step": 2611 + }, + { + "epoch": 0.25, + "grad_norm": 0.28381053637003834, + "learning_rate": 0.0001969431938686792, + "loss": 1.0899, + "step": 2612 + }, + { + "epoch": 0.25, + "grad_norm": 0.2605318408738508, + "learning_rate": 0.00019693931117941346, + "loss": 1.1033, + "step": 2613 + }, + { + "epoch": 0.25, + "eval_loss": 1.1338403224945068, + "eval_runtime": 4230.6339, + "eval_samples_per_second": 19.765, + "eval_steps_per_second": 2.471, + "step": 2613 + }, + { + "epoch": 0.25, + "grad_norm": 0.255252333620048, + "learning_rate": 0.00019693542606418052, + "loss": 1.1488, + "step": 2614 + }, + { + "epoch": 0.25, + "grad_norm": 0.30633153587103285, + "learning_rate": 0.00019693153852307757, + "loss": 1.0757, + "step": 2615 + }, + { + "epoch": 0.25, + "grad_norm": 0.27505703870757664, + "learning_rate": 0.000196927648556202, + "loss": 1.1299, + "step": 2616 + }, + { + "epoch": 0.25, + "grad_norm": 0.2713935955775416, + "learning_rate": 0.00019692375616365112, + "loss": 1.0189, + "step": 2617 + }, + { + "epoch": 0.25, + "grad_norm": 0.2684321316986978, + "learning_rate": 0.00019691986134552227, + "loss": 1.1241, + "step": 2618 + }, + { + "epoch": 0.25, + "grad_norm": 0.25621412932428106, + "learning_rate": 0.00019691596410191303, + "loss": 1.0671, + "step": 2619 + }, + { + "epoch": 0.25, + "grad_norm": 0.26289530369330816, + "learning_rate": 0.00019691206443292085, + "loss": 0.9654, + "step": 2620 + }, + { + "epoch": 0.25, + "grad_norm": 0.2650174904695066, + "learning_rate": 0.00019690816233864337, + "loss": 1.0996, + "step": 2621 + }, + { + "epoch": 0.25, + "grad_norm": 0.2575327507563281, + "learning_rate": 0.0001969042578191782, + "loss": 1.1166, + "step": 2622 + }, + { + "epoch": 0.25, + "grad_norm": 0.26454046915240054, + "learning_rate": 0.00019690035087462307, + "loss": 0.9949, + "step": 2623 + }, + { + "epoch": 0.25, + "grad_norm": 0.2805971262563047, + "learning_rate": 0.0001968964415050758, + "loss": 1.0859, + "step": 2624 + }, + { + "epoch": 0.25, + "grad_norm": 0.21973703426240196, + "learning_rate": 0.00019689252971063416, + "loss": 1.1679, + "step": 2625 + }, + { + "epoch": 0.25, + "grad_norm": 0.3139171336564937, + "learning_rate": 0.00019688861549139607, + "loss": 1.2573, + "step": 2626 + }, + { + "epoch": 0.25, + "grad_norm": 0.3076875510868648, + "learning_rate": 0.0001968846988474595, + "loss": 1.1237, + "step": 2627 + }, + { + "epoch": 0.25, + "grad_norm": 0.28812414941928133, + "learning_rate": 0.00019688077977892245, + "loss": 1.0934, + "step": 2628 + }, + { + "epoch": 0.25, + "grad_norm": 0.31182412232961415, + "learning_rate": 0.00019687685828588297, + "loss": 1.0891, + "step": 2629 + }, + { + "epoch": 0.25, + "grad_norm": 0.2625520463054526, + "learning_rate": 0.00019687293436843926, + "loss": 1.1112, + "step": 2630 + }, + { + "epoch": 0.25, + "grad_norm": 0.25837520805297015, + "learning_rate": 0.00019686900802668946, + "loss": 0.9673, + "step": 2631 + }, + { + "epoch": 0.25, + "grad_norm": 0.27460647995152365, + "learning_rate": 0.00019686507926073188, + "loss": 1.1495, + "step": 2632 + }, + { + "epoch": 0.25, + "grad_norm": 0.26861650854795094, + "learning_rate": 0.00019686114807066478, + "loss": 1.0432, + "step": 2633 + }, + { + "epoch": 0.25, + "grad_norm": 0.28385341495610805, + "learning_rate": 0.0001968572144565866, + "loss": 1.1461, + "step": 2634 + }, + { + "epoch": 0.25, + "grad_norm": 0.271570918418974, + "learning_rate": 0.00019685327841859572, + "loss": 1.185, + "step": 2635 + }, + { + "epoch": 0.25, + "grad_norm": 0.2673269371926348, + "learning_rate": 0.00019684933995679074, + "loss": 1.1282, + "step": 2636 + }, + { + "epoch": 0.25, + "grad_norm": 0.2403632191882647, + "learning_rate": 0.0001968453990712701, + "loss": 1.0608, + "step": 2637 + }, + { + "epoch": 0.25, + "grad_norm": 0.2586849785633008, + "learning_rate": 0.00019684145576213252, + "loss": 1.1015, + "step": 2638 + }, + { + "epoch": 0.25, + "grad_norm": 0.2649189546870879, + "learning_rate": 0.00019683751002947663, + "loss": 1.1428, + "step": 2639 + }, + { + "epoch": 0.25, + "grad_norm": 0.2524600423422356, + "learning_rate": 0.0001968335618734012, + "loss": 1.0497, + "step": 2640 + }, + { + "epoch": 0.25, + "grad_norm": 0.25016994107064466, + "learning_rate": 0.00019682961129400503, + "loss": 1.1158, + "step": 2641 + }, + { + "epoch": 0.25, + "grad_norm": 0.26421571146347217, + "learning_rate": 0.000196825658291387, + "loss": 1.0628, + "step": 2642 + }, + { + "epoch": 0.25, + "grad_norm": 0.282173839472887, + "learning_rate": 0.00019682170286564597, + "loss": 0.998, + "step": 2643 + }, + { + "epoch": 0.25, + "grad_norm": 0.2323812037769926, + "learning_rate": 0.00019681774501688102, + "loss": 1.0137, + "step": 2644 + }, + { + "epoch": 0.25, + "grad_norm": 0.2901381801505397, + "learning_rate": 0.0001968137847451911, + "loss": 1.0866, + "step": 2645 + }, + { + "epoch": 0.25, + "grad_norm": 0.26576724197777407, + "learning_rate": 0.0001968098220506754, + "loss": 1.0927, + "step": 2646 + }, + { + "epoch": 0.25, + "grad_norm": 0.2696230704622509, + "learning_rate": 0.0001968058569334331, + "loss": 1.1894, + "step": 2647 + }, + { + "epoch": 0.25, + "grad_norm": 0.26017036741055427, + "learning_rate": 0.00019680188939356336, + "loss": 0.9647, + "step": 2648 + }, + { + "epoch": 0.25, + "grad_norm": 0.2677987402758377, + "learning_rate": 0.0001967979194311655, + "loss": 1.0862, + "step": 2649 + }, + { + "epoch": 0.25, + "grad_norm": 0.28711631837028534, + "learning_rate": 0.00019679394704633888, + "loss": 1.2582, + "step": 2650 + }, + { + "epoch": 0.25, + "grad_norm": 0.26765916334917605, + "learning_rate": 0.00019678997223918288, + "loss": 1.0659, + "step": 2651 + }, + { + "epoch": 0.25, + "grad_norm": 0.26777059535455733, + "learning_rate": 0.000196785995009797, + "loss": 1.1978, + "step": 2652 + }, + { + "epoch": 0.25, + "grad_norm": 0.22518302167201712, + "learning_rate": 0.00019678201535828076, + "loss": 1.0836, + "step": 2653 + }, + { + "epoch": 0.25, + "grad_norm": 0.2639486825941676, + "learning_rate": 0.00019677803328473377, + "loss": 1.1509, + "step": 2654 + }, + { + "epoch": 0.25, + "grad_norm": 0.23538041430872067, + "learning_rate": 0.00019677404878925566, + "loss": 0.9895, + "step": 2655 + }, + { + "epoch": 0.25, + "grad_norm": 0.295994436523556, + "learning_rate": 0.00019677006187194618, + "loss": 1.0979, + "step": 2656 + }, + { + "epoch": 0.25, + "grad_norm": 0.28307792530010506, + "learning_rate": 0.00019676607253290508, + "loss": 1.0768, + "step": 2657 + }, + { + "epoch": 0.25, + "grad_norm": 0.2761519436955159, + "learning_rate": 0.0001967620807722322, + "loss": 1.0224, + "step": 2658 + }, + { + "epoch": 0.25, + "grad_norm": 0.2879025567486724, + "learning_rate": 0.00019675808659002744, + "loss": 1.1083, + "step": 2659 + }, + { + "epoch": 0.25, + "grad_norm": 0.3004898697546929, + "learning_rate": 0.00019675408998639076, + "loss": 1.1356, + "step": 2660 + }, + { + "epoch": 0.25, + "grad_norm": 0.31015042588546826, + "learning_rate": 0.00019675009096142214, + "loss": 1.1486, + "step": 2661 + }, + { + "epoch": 0.25, + "grad_norm": 0.2811558967690104, + "learning_rate": 0.0001967460895152217, + "loss": 1.0928, + "step": 2662 + }, + { + "epoch": 0.25, + "grad_norm": 0.2633365965205071, + "learning_rate": 0.00019674208564788957, + "loss": 1.1153, + "step": 2663 + }, + { + "epoch": 0.25, + "grad_norm": 0.2860499750304183, + "learning_rate": 0.00019673807935952596, + "loss": 1.1082, + "step": 2664 + }, + { + "epoch": 0.25, + "grad_norm": 0.29758088608843136, + "learning_rate": 0.0001967340706502311, + "loss": 1.2912, + "step": 2665 + }, + { + "epoch": 0.26, + "grad_norm": 0.2568418533290371, + "learning_rate": 0.00019673005952010534, + "loss": 1.1485, + "step": 2666 + }, + { + "epoch": 0.26, + "grad_norm": 0.26096215810194917, + "learning_rate": 0.00019672604596924904, + "loss": 1.0748, + "step": 2667 + }, + { + "epoch": 0.26, + "grad_norm": 0.28867342954233294, + "learning_rate": 0.00019672202999776266, + "loss": 1.0794, + "step": 2668 + }, + { + "epoch": 0.26, + "grad_norm": 0.21142091669081053, + "learning_rate": 0.0001967180116057467, + "loss": 1.0324, + "step": 2669 + }, + { + "epoch": 0.26, + "grad_norm": 0.2592661813230898, + "learning_rate": 0.00019671399079330168, + "loss": 1.1463, + "step": 2670 + }, + { + "epoch": 0.26, + "grad_norm": 0.23960804899506707, + "learning_rate": 0.00019670996756052827, + "loss": 1.0908, + "step": 2671 + }, + { + "epoch": 0.26, + "grad_norm": 0.24684475939341896, + "learning_rate": 0.00019670594190752713, + "loss": 1.036, + "step": 2672 + }, + { + "epoch": 0.26, + "grad_norm": 0.28350271509951525, + "learning_rate": 0.00019670191383439907, + "loss": 1.0251, + "step": 2673 + }, + { + "epoch": 0.26, + "grad_norm": 0.5535529462100653, + "learning_rate": 0.00019669788334124476, + "loss": 1.052, + "step": 2674 + }, + { + "epoch": 0.26, + "grad_norm": 0.3077728064262205, + "learning_rate": 0.0001966938504281652, + "loss": 1.116, + "step": 2675 + }, + { + "epoch": 0.26, + "grad_norm": 0.2656350026116821, + "learning_rate": 0.00019668981509526128, + "loss": 1.2018, + "step": 2676 + }, + { + "epoch": 0.26, + "grad_norm": 0.2479927609636305, + "learning_rate": 0.00019668577734263394, + "loss": 1.104, + "step": 2677 + }, + { + "epoch": 0.26, + "grad_norm": 0.2777215172332179, + "learning_rate": 0.00019668173717038426, + "loss": 1.1844, + "step": 2678 + }, + { + "epoch": 0.26, + "grad_norm": 0.2829530059947927, + "learning_rate": 0.00019667769457861335, + "loss": 1.0842, + "step": 2679 + }, + { + "epoch": 0.26, + "grad_norm": 0.2550816083760124, + "learning_rate": 0.00019667364956742236, + "loss": 1.1394, + "step": 2680 + }, + { + "epoch": 0.26, + "grad_norm": 0.2753775575550518, + "learning_rate": 0.00019666960213691255, + "loss": 1.1293, + "step": 2681 + }, + { + "epoch": 0.26, + "grad_norm": 0.2767717867669593, + "learning_rate": 0.0001966655522871852, + "loss": 1.0783, + "step": 2682 + }, + { + "epoch": 0.26, + "grad_norm": 0.26993060617287373, + "learning_rate": 0.00019666150001834164, + "loss": 1.0235, + "step": 2683 + }, + { + "epoch": 0.26, + "grad_norm": 0.2870195128020576, + "learning_rate": 0.00019665744533048328, + "loss": 1.0422, + "step": 2684 + }, + { + "epoch": 0.26, + "grad_norm": 0.27640315301309837, + "learning_rate": 0.0001966533882237116, + "loss": 1.1165, + "step": 2685 + }, + { + "epoch": 0.26, + "grad_norm": 0.2694095090611699, + "learning_rate": 0.00019664932869812814, + "loss": 1.0239, + "step": 2686 + }, + { + "epoch": 0.26, + "grad_norm": 0.32106995191546245, + "learning_rate": 0.0001966452667538345, + "loss": 1.1492, + "step": 2687 + }, + { + "epoch": 0.26, + "grad_norm": 0.27424915699699154, + "learning_rate": 0.00019664120239093233, + "loss": 1.1685, + "step": 2688 + }, + { + "epoch": 0.26, + "grad_norm": 0.29135122016294623, + "learning_rate": 0.0001966371356095233, + "loss": 1.1547, + "step": 2689 + }, + { + "epoch": 0.26, + "grad_norm": 0.26527031382403177, + "learning_rate": 0.00019663306640970926, + "loss": 1.1219, + "step": 2690 + }, + { + "epoch": 0.26, + "grad_norm": 0.27127353395064424, + "learning_rate": 0.00019662899479159197, + "loss": 0.9899, + "step": 2691 + }, + { + "epoch": 0.26, + "grad_norm": 0.3033029254082761, + "learning_rate": 0.00019662492075527336, + "loss": 1.1249, + "step": 2692 + }, + { + "epoch": 0.26, + "grad_norm": 0.26657961186508666, + "learning_rate": 0.00019662084430085538, + "loss": 1.1842, + "step": 2693 + }, + { + "epoch": 0.26, + "grad_norm": 0.28599030101340955, + "learning_rate": 0.00019661676542844007, + "loss": 1.1839, + "step": 2694 + }, + { + "epoch": 0.26, + "grad_norm": 0.30087762030890813, + "learning_rate": 0.00019661268413812946, + "loss": 1.0863, + "step": 2695 + }, + { + "epoch": 0.26, + "grad_norm": 0.325088281089176, + "learning_rate": 0.00019660860043002574, + "loss": 1.049, + "step": 2696 + }, + { + "epoch": 0.26, + "grad_norm": 0.2828097114488758, + "learning_rate": 0.00019660451430423103, + "loss": 1.1857, + "step": 2697 + }, + { + "epoch": 0.26, + "grad_norm": 0.2701302351874178, + "learning_rate": 0.00019660042576084767, + "loss": 1.1144, + "step": 2698 + }, + { + "epoch": 0.26, + "grad_norm": 0.2835386842245424, + "learning_rate": 0.00019659633479997794, + "loss": 1.1066, + "step": 2699 + }, + { + "epoch": 0.26, + "grad_norm": 0.243379840423929, + "learning_rate": 0.00019659224142172424, + "loss": 1.0204, + "step": 2700 + }, + { + "epoch": 0.26, + "grad_norm": 0.24266059096127535, + "learning_rate": 0.00019658814562618896, + "loss": 1.1088, + "step": 2701 + }, + { + "epoch": 0.26, + "grad_norm": 0.2617930016670921, + "learning_rate": 0.00019658404741347462, + "loss": 1.1113, + "step": 2702 + }, + { + "epoch": 0.26, + "grad_norm": 0.2547097198503651, + "learning_rate": 0.00019657994678368385, + "loss": 1.0444, + "step": 2703 + }, + { + "epoch": 0.26, + "grad_norm": 0.2625250549016416, + "learning_rate": 0.00019657584373691917, + "loss": 1.1396, + "step": 2704 + }, + { + "epoch": 0.26, + "grad_norm": 0.26689206957202594, + "learning_rate": 0.0001965717382732833, + "loss": 1.1596, + "step": 2705 + }, + { + "epoch": 0.26, + "grad_norm": 0.2903595268276037, + "learning_rate": 0.000196567630392879, + "loss": 1.0474, + "step": 2706 + }, + { + "epoch": 0.26, + "grad_norm": 0.2572495572058499, + "learning_rate": 0.00019656352009580908, + "loss": 1.0761, + "step": 2707 + }, + { + "epoch": 0.26, + "grad_norm": 0.2470130498530773, + "learning_rate": 0.00019655940738217635, + "loss": 1.0546, + "step": 2708 + }, + { + "epoch": 0.26, + "grad_norm": 0.2930008567798905, + "learning_rate": 0.00019655529225208378, + "loss": 1.1107, + "step": 2709 + }, + { + "epoch": 0.26, + "grad_norm": 0.25989997031388734, + "learning_rate": 0.00019655117470563434, + "loss": 1.0839, + "step": 2710 + }, + { + "epoch": 0.26, + "grad_norm": 0.30098329989662703, + "learning_rate": 0.00019654705474293107, + "loss": 1.1056, + "step": 2711 + }, + { + "epoch": 0.26, + "grad_norm": 0.2424595664908627, + "learning_rate": 0.00019654293236407707, + "loss": 1.0923, + "step": 2712 + }, + { + "epoch": 0.26, + "grad_norm": 0.28519149075066125, + "learning_rate": 0.00019653880756917552, + "loss": 1.0625, + "step": 2713 + }, + { + "epoch": 0.26, + "grad_norm": 0.31144165591888, + "learning_rate": 0.00019653468035832965, + "loss": 1.2098, + "step": 2714 + }, + { + "epoch": 0.26, + "grad_norm": 0.2714581157840239, + "learning_rate": 0.0001965305507316427, + "loss": 1.092, + "step": 2715 + }, + { + "epoch": 0.26, + "grad_norm": 0.28440142688999087, + "learning_rate": 0.0001965264186892181, + "loss": 1.0165, + "step": 2716 + }, + { + "epoch": 0.26, + "grad_norm": 0.24262222631275346, + "learning_rate": 0.00019652228423115917, + "loss": 1.0364, + "step": 2717 + }, + { + "epoch": 0.26, + "grad_norm": 0.2743500722654148, + "learning_rate": 0.00019651814735756942, + "loss": 1.0864, + "step": 2718 + }, + { + "epoch": 0.26, + "grad_norm": 0.28526861621648486, + "learning_rate": 0.00019651400806855237, + "loss": 1.1124, + "step": 2719 + }, + { + "epoch": 0.26, + "grad_norm": 0.31205779575841586, + "learning_rate": 0.00019650986636421164, + "loss": 1.102, + "step": 2720 + }, + { + "epoch": 0.26, + "grad_norm": 0.2661120177899195, + "learning_rate": 0.00019650572224465084, + "loss": 1.1081, + "step": 2721 + }, + { + "epoch": 0.26, + "grad_norm": 0.29878434646252183, + "learning_rate": 0.00019650157570997364, + "loss": 1.1004, + "step": 2722 + }, + { + "epoch": 0.26, + "grad_norm": 0.2612778406780732, + "learning_rate": 0.00019649742676028394, + "loss": 1.2379, + "step": 2723 + }, + { + "epoch": 0.26, + "grad_norm": 0.2933566914172725, + "learning_rate": 0.00019649327539568543, + "loss": 1.162, + "step": 2724 + }, + { + "epoch": 0.26, + "grad_norm": 0.24094136996286655, + "learning_rate": 0.0001964891216162821, + "loss": 1.0807, + "step": 2725 + }, + { + "epoch": 0.26, + "grad_norm": 0.2943459316325484, + "learning_rate": 0.00019648496542217783, + "loss": 0.998, + "step": 2726 + }, + { + "epoch": 0.26, + "grad_norm": 0.22961741180973028, + "learning_rate": 0.00019648080681347664, + "loss": 1.1305, + "step": 2727 + }, + { + "epoch": 0.26, + "grad_norm": 0.2701859846638287, + "learning_rate": 0.00019647664579028267, + "loss": 1.0889, + "step": 2728 + }, + { + "epoch": 0.26, + "grad_norm": 0.26857093567006685, + "learning_rate": 0.0001964724823527, + "loss": 1.2013, + "step": 2729 + }, + { + "epoch": 0.26, + "grad_norm": 0.31949382152840944, + "learning_rate": 0.0001964683165008328, + "loss": 1.1718, + "step": 2730 + }, + { + "epoch": 0.26, + "grad_norm": 0.28367090867227224, + "learning_rate": 0.00019646414823478535, + "loss": 1.1491, + "step": 2731 + }, + { + "epoch": 0.26, + "grad_norm": 0.2504701281061607, + "learning_rate": 0.000196459977554662, + "loss": 1.1017, + "step": 2732 + }, + { + "epoch": 0.26, + "grad_norm": 0.25221421853430986, + "learning_rate": 0.00019645580446056706, + "loss": 1.1185, + "step": 2733 + }, + { + "epoch": 0.26, + "grad_norm": 0.28143888782957394, + "learning_rate": 0.000196451628952605, + "loss": 1.1759, + "step": 2734 + }, + { + "epoch": 0.26, + "grad_norm": 0.2578191227270372, + "learning_rate": 0.00019644745103088033, + "loss": 1.1787, + "step": 2735 + }, + { + "epoch": 0.26, + "grad_norm": 0.26056470647250435, + "learning_rate": 0.00019644327069549754, + "loss": 1.1649, + "step": 2736 + }, + { + "epoch": 0.26, + "grad_norm": 0.27176880852893454, + "learning_rate": 0.00019643908794656135, + "loss": 1.1057, + "step": 2737 + }, + { + "epoch": 0.26, + "grad_norm": 0.2882235242635738, + "learning_rate": 0.00019643490278417632, + "loss": 1.1081, + "step": 2738 + }, + { + "epoch": 0.26, + "grad_norm": 0.27263458978440536, + "learning_rate": 0.00019643071520844725, + "loss": 1.0712, + "step": 2739 + }, + { + "epoch": 0.26, + "grad_norm": 0.25866965428198324, + "learning_rate": 0.00019642652521947894, + "loss": 1.1159, + "step": 2740 + }, + { + "epoch": 0.26, + "grad_norm": 0.2630902327301495, + "learning_rate": 0.00019642233281737625, + "loss": 1.0603, + "step": 2741 + }, + { + "epoch": 0.26, + "grad_norm": 0.25820714973330133, + "learning_rate": 0.00019641813800224406, + "loss": 0.9922, + "step": 2742 + }, + { + "epoch": 0.26, + "grad_norm": 0.2724772043679332, + "learning_rate": 0.00019641394077418736, + "loss": 1.0461, + "step": 2743 + }, + { + "epoch": 0.26, + "grad_norm": 0.27088818855216434, + "learning_rate": 0.00019640974113331123, + "loss": 1.1959, + "step": 2744 + }, + { + "epoch": 0.26, + "grad_norm": 0.2733607733632018, + "learning_rate": 0.00019640553907972072, + "loss": 1.1886, + "step": 2745 + }, + { + "epoch": 0.26, + "grad_norm": 0.24676149904011196, + "learning_rate": 0.000196401334613521, + "loss": 1.0969, + "step": 2746 + }, + { + "epoch": 0.26, + "grad_norm": 0.27834584403561474, + "learning_rate": 0.00019639712773481728, + "loss": 1.1597, + "step": 2747 + }, + { + "epoch": 0.26, + "grad_norm": 0.25755736145056124, + "learning_rate": 0.0001963929184437149, + "loss": 0.9978, + "step": 2748 + }, + { + "epoch": 0.26, + "grad_norm": 0.31079398376305944, + "learning_rate": 0.00019638870674031913, + "loss": 1.1268, + "step": 2749 + }, + { + "epoch": 0.26, + "grad_norm": 0.2774704968716273, + "learning_rate": 0.0001963844926247354, + "loss": 1.1641, + "step": 2750 + }, + { + "epoch": 0.26, + "grad_norm": 0.2781649290866802, + "learning_rate": 0.00019638027609706916, + "loss": 1.0707, + "step": 2751 + }, + { + "epoch": 0.26, + "grad_norm": 0.2552137884918176, + "learning_rate": 0.00019637605715742593, + "loss": 1.2287, + "step": 2752 + }, + { + "epoch": 0.26, + "grad_norm": 0.2599068780978502, + "learning_rate": 0.00019637183580591133, + "loss": 1.0331, + "step": 2753 + }, + { + "epoch": 0.26, + "grad_norm": 0.27220701541938597, + "learning_rate": 0.00019636761204263093, + "loss": 1.0395, + "step": 2754 + }, + { + "epoch": 0.26, + "grad_norm": 0.24397827889832657, + "learning_rate": 0.0001963633858676905, + "loss": 1.1166, + "step": 2755 + }, + { + "epoch": 0.26, + "grad_norm": 0.2497024411579182, + "learning_rate": 0.00019635915728119575, + "loss": 1.0847, + "step": 2756 + }, + { + "epoch": 0.26, + "grad_norm": 0.2589117765444782, + "learning_rate": 0.00019635492628325256, + "loss": 1.0086, + "step": 2757 + }, + { + "epoch": 0.26, + "grad_norm": 0.29313802184742693, + "learning_rate": 0.00019635069287396678, + "loss": 1.1229, + "step": 2758 + }, + { + "epoch": 0.26, + "grad_norm": 0.2618259520734273, + "learning_rate": 0.00019634645705344435, + "loss": 1.1459, + "step": 2759 + }, + { + "epoch": 0.26, + "grad_norm": 0.2454773691105509, + "learning_rate": 0.0001963422188217913, + "loss": 0.968, + "step": 2760 + }, + { + "epoch": 0.26, + "grad_norm": 0.28372490824875446, + "learning_rate": 0.00019633797817911365, + "loss": 1.1127, + "step": 2761 + }, + { + "epoch": 0.26, + "grad_norm": 0.3273752692424284, + "learning_rate": 0.00019633373512551754, + "loss": 1.1669, + "step": 2762 + }, + { + "epoch": 0.26, + "grad_norm": 0.2366079234217832, + "learning_rate": 0.0001963294896611092, + "loss": 1.0848, + "step": 2763 + }, + { + "epoch": 0.26, + "grad_norm": 0.5261995752384835, + "learning_rate": 0.00019632524178599483, + "loss": 1.1333, + "step": 2764 + }, + { + "epoch": 0.26, + "grad_norm": 0.26665251541131973, + "learning_rate": 0.00019632099150028074, + "loss": 1.0651, + "step": 2765 + }, + { + "epoch": 0.26, + "grad_norm": 0.2584540431624957, + "learning_rate": 0.0001963167388040733, + "loss": 1.107, + "step": 2766 + }, + { + "epoch": 0.26, + "grad_norm": 0.2634462477386324, + "learning_rate": 0.00019631248369747893, + "loss": 1.124, + "step": 2767 + }, + { + "epoch": 0.26, + "grad_norm": 0.29816863160086016, + "learning_rate": 0.00019630822618060413, + "loss": 1.1173, + "step": 2768 + }, + { + "epoch": 0.26, + "grad_norm": 0.25963330208582736, + "learning_rate": 0.00019630396625355546, + "loss": 1.0285, + "step": 2769 + }, + { + "epoch": 0.27, + "grad_norm": 0.2887065947096526, + "learning_rate": 0.00019629970391643947, + "loss": 1.1987, + "step": 2770 + }, + { + "epoch": 0.27, + "grad_norm": 0.27628742653188076, + "learning_rate": 0.0001962954391693629, + "loss": 1.067, + "step": 2771 + }, + { + "epoch": 0.27, + "grad_norm": 0.2707072531440452, + "learning_rate": 0.00019629117201243242, + "loss": 0.9484, + "step": 2772 + }, + { + "epoch": 0.27, + "grad_norm": 0.2973518749444581, + "learning_rate": 0.0001962869024457549, + "loss": 1.0383, + "step": 2773 + }, + { + "epoch": 0.27, + "grad_norm": 0.26365187418455077, + "learning_rate": 0.0001962826304694371, + "loss": 1.0947, + "step": 2774 + }, + { + "epoch": 0.27, + "grad_norm": 0.2591589967435732, + "learning_rate": 0.00019627835608358596, + "loss": 1.07, + "step": 2775 + }, + { + "epoch": 0.27, + "grad_norm": 0.26492792550338157, + "learning_rate": 0.00019627407928830842, + "loss": 1.1614, + "step": 2776 + }, + { + "epoch": 0.27, + "grad_norm": 0.25376976035220183, + "learning_rate": 0.00019626980008371158, + "loss": 1.0263, + "step": 2777 + }, + { + "epoch": 0.27, + "grad_norm": 0.23447258204438867, + "learning_rate": 0.0001962655184699025, + "loss": 1.1122, + "step": 2778 + }, + { + "epoch": 0.27, + "grad_norm": 0.2562093782282185, + "learning_rate": 0.00019626123444698828, + "loss": 1.0457, + "step": 2779 + }, + { + "epoch": 0.27, + "grad_norm": 0.2723192435370688, + "learning_rate": 0.00019625694801507618, + "loss": 1.0636, + "step": 2780 + }, + { + "epoch": 0.27, + "grad_norm": 0.29084143851467165, + "learning_rate": 0.00019625265917427346, + "loss": 1.1492, + "step": 2781 + }, + { + "epoch": 0.27, + "grad_norm": 0.2759691568719512, + "learning_rate": 0.00019624836792468746, + "loss": 1.0947, + "step": 2782 + }, + { + "epoch": 0.27, + "grad_norm": 0.2847571657532783, + "learning_rate": 0.00019624407426642557, + "loss": 1.0998, + "step": 2783 + }, + { + "epoch": 0.27, + "grad_norm": 0.2821857453688257, + "learning_rate": 0.00019623977819959522, + "loss": 1.0525, + "step": 2784 + }, + { + "epoch": 0.27, + "grad_norm": 0.2710683595054374, + "learning_rate": 0.00019623547972430394, + "loss": 1.1189, + "step": 2785 + }, + { + "epoch": 0.27, + "grad_norm": 0.2650861466646489, + "learning_rate": 0.00019623117884065932, + "loss": 1.0532, + "step": 2786 + }, + { + "epoch": 0.27, + "grad_norm": 0.28632313517563235, + "learning_rate": 0.00019622687554876893, + "loss": 1.0432, + "step": 2787 + }, + { + "epoch": 0.27, + "grad_norm": 0.2927818066954993, + "learning_rate": 0.00019622256984874053, + "loss": 1.0847, + "step": 2788 + }, + { + "epoch": 0.27, + "grad_norm": 0.2631443348972877, + "learning_rate": 0.00019621826174068185, + "loss": 1.1038, + "step": 2789 + }, + { + "epoch": 0.27, + "grad_norm": 0.26694650397049624, + "learning_rate": 0.00019621395122470066, + "loss": 0.9954, + "step": 2790 + }, + { + "epoch": 0.27, + "grad_norm": 0.27909721930965364, + "learning_rate": 0.00019620963830090492, + "loss": 0.9486, + "step": 2791 + }, + { + "epoch": 0.27, + "grad_norm": 0.24079022570344805, + "learning_rate": 0.0001962053229694025, + "loss": 0.9676, + "step": 2792 + }, + { + "epoch": 0.27, + "grad_norm": 0.2559668456069455, + "learning_rate": 0.0001962010052303014, + "loss": 1.1104, + "step": 2793 + }, + { + "epoch": 0.27, + "grad_norm": 0.25016354937216784, + "learning_rate": 0.0001961966850837097, + "loss": 1.1744, + "step": 2794 + }, + { + "epoch": 0.27, + "grad_norm": 0.25412464453761213, + "learning_rate": 0.0001961923625297355, + "loss": 1.1563, + "step": 2795 + }, + { + "epoch": 0.27, + "grad_norm": 0.25700418834836664, + "learning_rate": 0.00019618803756848695, + "loss": 1.155, + "step": 2796 + }, + { + "epoch": 0.27, + "grad_norm": 0.29832287781586436, + "learning_rate": 0.0001961837102000723, + "loss": 1.0929, + "step": 2797 + }, + { + "epoch": 0.27, + "grad_norm": 0.24969396689469361, + "learning_rate": 0.00019617938042459988, + "loss": 1.0692, + "step": 2798 + }, + { + "epoch": 0.27, + "grad_norm": 0.27702787637233217, + "learning_rate": 0.00019617504824217803, + "loss": 1.0812, + "step": 2799 + }, + { + "epoch": 0.27, + "grad_norm": 0.2900431406784246, + "learning_rate": 0.00019617071365291512, + "loss": 1.1213, + "step": 2800 + }, + { + "epoch": 0.27, + "grad_norm": 0.2650773606437207, + "learning_rate": 0.0001961663766569197, + "loss": 1.0438, + "step": 2801 + }, + { + "epoch": 0.27, + "grad_norm": 0.23346920952561745, + "learning_rate": 0.00019616203725430023, + "loss": 0.9972, + "step": 2802 + }, + { + "epoch": 0.27, + "grad_norm": 0.2739900839005771, + "learning_rate": 0.00019615769544516532, + "loss": 1.0435, + "step": 2803 + }, + { + "epoch": 0.27, + "grad_norm": 0.3190423287534823, + "learning_rate": 0.00019615335122962372, + "loss": 1.1342, + "step": 2804 + }, + { + "epoch": 0.27, + "grad_norm": 0.2830709781599123, + "learning_rate": 0.00019614900460778403, + "loss": 1.0853, + "step": 2805 + }, + { + "epoch": 0.27, + "grad_norm": 0.26452593616904946, + "learning_rate": 0.00019614465557975507, + "loss": 1.1323, + "step": 2806 + }, + { + "epoch": 0.27, + "grad_norm": 0.2592995682321865, + "learning_rate": 0.00019614030414564568, + "loss": 1.068, + "step": 2807 + }, + { + "epoch": 0.27, + "grad_norm": 0.2827474115219191, + "learning_rate": 0.00019613595030556477, + "loss": 1.1488, + "step": 2808 + }, + { + "epoch": 0.27, + "grad_norm": 0.2649963078693095, + "learning_rate": 0.0001961315940596213, + "loss": 0.996, + "step": 2809 + }, + { + "epoch": 0.27, + "grad_norm": 0.26346285838705263, + "learning_rate": 0.00019612723540792426, + "loss": 1.1175, + "step": 2810 + }, + { + "epoch": 0.27, + "grad_norm": 0.28954499176521176, + "learning_rate": 0.00019612287435058273, + "loss": 1.1733, + "step": 2811 + }, + { + "epoch": 0.27, + "grad_norm": 0.2761795119898941, + "learning_rate": 0.00019611851088770585, + "loss": 1.1321, + "step": 2812 + }, + { + "epoch": 0.27, + "grad_norm": 0.277378324643611, + "learning_rate": 0.00019611414501940284, + "loss": 1.291, + "step": 2813 + }, + { + "epoch": 0.27, + "grad_norm": 0.29898305586569235, + "learning_rate": 0.00019610977674578296, + "loss": 1.1056, + "step": 2814 + }, + { + "epoch": 0.27, + "grad_norm": 0.2728332232087226, + "learning_rate": 0.00019610540606695547, + "loss": 1.1051, + "step": 2815 + }, + { + "epoch": 0.27, + "grad_norm": 0.3020216145302391, + "learning_rate": 0.0001961010329830298, + "loss": 0.9909, + "step": 2816 + }, + { + "epoch": 0.27, + "grad_norm": 0.2614179979610706, + "learning_rate": 0.00019609665749411543, + "loss": 1.0927, + "step": 2817 + }, + { + "epoch": 0.27, + "grad_norm": 0.2486123416932773, + "learning_rate": 0.00019609227960032177, + "loss": 1.1096, + "step": 2818 + }, + { + "epoch": 0.27, + "grad_norm": 0.31135834548170077, + "learning_rate": 0.00019608789930175845, + "loss": 1.1178, + "step": 2819 + }, + { + "epoch": 0.27, + "grad_norm": 0.277755259399668, + "learning_rate": 0.00019608351659853503, + "loss": 1.1473, + "step": 2820 + }, + { + "epoch": 0.27, + "grad_norm": 0.2584745367908558, + "learning_rate": 0.00019607913149076125, + "loss": 1.0122, + "step": 2821 + }, + { + "epoch": 0.27, + "grad_norm": 0.2619607556881495, + "learning_rate": 0.0001960747439785468, + "loss": 1.128, + "step": 2822 + }, + { + "epoch": 0.27, + "grad_norm": 0.29050519695473265, + "learning_rate": 0.00019607035406200152, + "loss": 1.0491, + "step": 2823 + }, + { + "epoch": 0.27, + "grad_norm": 0.21812110204389035, + "learning_rate": 0.00019606596174123525, + "loss": 1.1075, + "step": 2824 + }, + { + "epoch": 0.27, + "grad_norm": 0.2839251249015717, + "learning_rate": 0.00019606156701635792, + "loss": 1.1837, + "step": 2825 + }, + { + "epoch": 0.27, + "grad_norm": 0.2958308386052112, + "learning_rate": 0.0001960571698874795, + "loss": 1.0984, + "step": 2826 + }, + { + "epoch": 0.27, + "grad_norm": 0.2671536081657146, + "learning_rate": 0.00019605277035470998, + "loss": 1.0137, + "step": 2827 + }, + { + "epoch": 0.27, + "grad_norm": 0.260771384058964, + "learning_rate": 0.00019604836841815958, + "loss": 1.0675, + "step": 2828 + }, + { + "epoch": 0.27, + "grad_norm": 0.2615419961091573, + "learning_rate": 0.00019604396407793835, + "loss": 1.055, + "step": 2829 + }, + { + "epoch": 0.27, + "grad_norm": 0.27494047722891274, + "learning_rate": 0.0001960395573341566, + "loss": 1.1479, + "step": 2830 + }, + { + "epoch": 0.27, + "grad_norm": 0.3121477873782516, + "learning_rate": 0.00019603514818692454, + "loss": 1.0033, + "step": 2831 + }, + { + "epoch": 0.27, + "grad_norm": 0.26737638033087213, + "learning_rate": 0.00019603073663635256, + "loss": 1.1077, + "step": 2832 + }, + { + "epoch": 0.27, + "grad_norm": 0.24631307751232157, + "learning_rate": 0.00019602632268255103, + "loss": 1.0545, + "step": 2833 + }, + { + "epoch": 0.27, + "grad_norm": 0.3013621704790624, + "learning_rate": 0.00019602190632563043, + "loss": 1.0969, + "step": 2834 + }, + { + "epoch": 0.27, + "grad_norm": 0.2783637530154318, + "learning_rate": 0.00019601748756570126, + "loss": 1.0622, + "step": 2835 + }, + { + "epoch": 0.27, + "grad_norm": 0.2574957740041804, + "learning_rate": 0.00019601306640287415, + "loss": 1.101, + "step": 2836 + }, + { + "epoch": 0.27, + "grad_norm": 0.26247517047321034, + "learning_rate": 0.00019600864283725967, + "loss": 1.0651, + "step": 2837 + }, + { + "epoch": 0.27, + "grad_norm": 0.25876279601882096, + "learning_rate": 0.0001960042168689686, + "loss": 1.062, + "step": 2838 + }, + { + "epoch": 0.27, + "grad_norm": 0.25523429862063185, + "learning_rate": 0.00019599978849811164, + "loss": 1.1267, + "step": 2839 + }, + { + "epoch": 0.27, + "grad_norm": 0.23537001704505256, + "learning_rate": 0.00019599535772479968, + "loss": 1.0823, + "step": 2840 + }, + { + "epoch": 0.27, + "grad_norm": 0.2462574908056951, + "learning_rate": 0.00019599092454914351, + "loss": 1.0301, + "step": 2841 + }, + { + "epoch": 0.27, + "grad_norm": 0.2721311709805547, + "learning_rate": 0.00019598648897125416, + "loss": 1.067, + "step": 2842 + }, + { + "epoch": 0.27, + "grad_norm": 0.3069559448268241, + "learning_rate": 0.0001959820509912426, + "loss": 1.013, + "step": 2843 + }, + { + "epoch": 0.27, + "grad_norm": 0.2702912452683172, + "learning_rate": 0.00019597761060921985, + "loss": 1.1137, + "step": 2844 + }, + { + "epoch": 0.27, + "grad_norm": 0.2797213146710235, + "learning_rate": 0.00019597316782529715, + "loss": 1.0939, + "step": 2845 + }, + { + "epoch": 0.27, + "grad_norm": 0.2787776466517135, + "learning_rate": 0.00019596872263958552, + "loss": 1.1058, + "step": 2846 + }, + { + "epoch": 0.27, + "grad_norm": 0.26116233626175234, + "learning_rate": 0.00019596427505219635, + "loss": 1.0094, + "step": 2847 + }, + { + "epoch": 0.27, + "grad_norm": 0.2693281423942913, + "learning_rate": 0.0001959598250632409, + "loss": 1.1635, + "step": 2848 + }, + { + "epoch": 0.27, + "grad_norm": 0.2506743951314745, + "learning_rate": 0.00019595537267283047, + "loss": 1.1075, + "step": 2849 + }, + { + "epoch": 0.27, + "grad_norm": 0.24106080467021557, + "learning_rate": 0.00019595091788107656, + "loss": 0.9761, + "step": 2850 + }, + { + "epoch": 0.27, + "grad_norm": 0.2860007338075784, + "learning_rate": 0.0001959464606880906, + "loss": 1.0474, + "step": 2851 + }, + { + "epoch": 0.27, + "grad_norm": 0.2359433024977078, + "learning_rate": 0.00019594200109398417, + "loss": 1.0804, + "step": 2852 + }, + { + "epoch": 0.27, + "grad_norm": 0.24715768704519478, + "learning_rate": 0.0001959375390988689, + "loss": 1.1157, + "step": 2853 + }, + { + "epoch": 0.27, + "grad_norm": 0.2586351713329031, + "learning_rate": 0.0001959330747028564, + "loss": 1.0448, + "step": 2854 + }, + { + "epoch": 0.27, + "grad_norm": 0.25499580779707726, + "learning_rate": 0.00019592860790605842, + "loss": 1.1657, + "step": 2855 + }, + { + "epoch": 0.27, + "grad_norm": 0.29734674634840746, + "learning_rate": 0.0001959241387085867, + "loss": 1.1446, + "step": 2856 + }, + { + "epoch": 0.27, + "grad_norm": 0.2647465723279589, + "learning_rate": 0.00019591966711055315, + "loss": 1.1668, + "step": 2857 + }, + { + "epoch": 0.27, + "grad_norm": 0.26368966136330935, + "learning_rate": 0.00019591519311206964, + "loss": 1.1992, + "step": 2858 + }, + { + "epoch": 0.27, + "grad_norm": 0.3126849078518487, + "learning_rate": 0.00019591071671324817, + "loss": 1.141, + "step": 2859 + }, + { + "epoch": 0.27, + "grad_norm": 0.25583665345229606, + "learning_rate": 0.00019590623791420071, + "loss": 1.1441, + "step": 2860 + }, + { + "epoch": 0.27, + "grad_norm": 0.23927224607942188, + "learning_rate": 0.00019590175671503938, + "loss": 1.1212, + "step": 2861 + }, + { + "epoch": 0.27, + "grad_norm": 0.28570558886340036, + "learning_rate": 0.00019589727311587632, + "loss": 1.0623, + "step": 2862 + }, + { + "epoch": 0.27, + "grad_norm": 0.23141130803687024, + "learning_rate": 0.00019589278711682373, + "loss": 1.1051, + "step": 2863 + }, + { + "epoch": 0.27, + "grad_norm": 0.31126189932366843, + "learning_rate": 0.00019588829871799388, + "loss": 1.098, + "step": 2864 + }, + { + "epoch": 0.27, + "grad_norm": 0.2368691173217574, + "learning_rate": 0.00019588380791949906, + "loss": 0.937, + "step": 2865 + }, + { + "epoch": 0.27, + "grad_norm": 0.2905469407114906, + "learning_rate": 0.0001958793147214517, + "loss": 1.1837, + "step": 2866 + }, + { + "epoch": 0.27, + "grad_norm": 0.25681129260211033, + "learning_rate": 0.00019587481912396426, + "loss": 1.0659, + "step": 2867 + }, + { + "epoch": 0.27, + "grad_norm": 0.3115969168588661, + "learning_rate": 0.0001958703211271492, + "loss": 1.1245, + "step": 2868 + }, + { + "epoch": 0.27, + "grad_norm": 0.2629500967052983, + "learning_rate": 0.0001958658207311191, + "loss": 1.0451, + "step": 2869 + }, + { + "epoch": 0.27, + "grad_norm": 0.26897588144338785, + "learning_rate": 0.0001958613179359866, + "loss": 1.0232, + "step": 2870 + }, + { + "epoch": 0.27, + "grad_norm": 0.2745821063034373, + "learning_rate": 0.00019585681274186434, + "loss": 1.1058, + "step": 2871 + }, + { + "epoch": 0.27, + "grad_norm": 0.28134411874328125, + "learning_rate": 0.00019585230514886513, + "loss": 1.0646, + "step": 2872 + }, + { + "epoch": 0.27, + "grad_norm": 0.2739086818209478, + "learning_rate": 0.0001958477951571017, + "loss": 1.0774, + "step": 2873 + }, + { + "epoch": 0.27, + "grad_norm": 0.24039032053442347, + "learning_rate": 0.000195843282766687, + "loss": 1.1367, + "step": 2874 + }, + { + "epoch": 0.28, + "grad_norm": 0.2729224048445481, + "learning_rate": 0.00019583876797773391, + "loss": 1.0894, + "step": 2875 + }, + { + "epoch": 0.28, + "grad_norm": 0.2548563216514712, + "learning_rate": 0.0001958342507903554, + "loss": 1.1571, + "step": 2876 + }, + { + "epoch": 0.28, + "grad_norm": 0.2611894266017868, + "learning_rate": 0.00019582973120466454, + "loss": 1.1219, + "step": 2877 + }, + { + "epoch": 0.28, + "grad_norm": 0.2517135134023402, + "learning_rate": 0.00019582520922077444, + "loss": 1.1457, + "step": 2878 + }, + { + "epoch": 0.28, + "grad_norm": 0.25685092446562596, + "learning_rate": 0.00019582068483879822, + "loss": 1.0947, + "step": 2879 + }, + { + "epoch": 0.28, + "grad_norm": 0.2531800123875422, + "learning_rate": 0.00019581615805884918, + "loss": 1.0824, + "step": 2880 + }, + { + "epoch": 0.28, + "grad_norm": 0.29746440693794585, + "learning_rate": 0.00019581162888104056, + "loss": 1.127, + "step": 2881 + }, + { + "epoch": 0.28, + "grad_norm": 0.2951013215212664, + "learning_rate": 0.0001958070973054857, + "loss": 0.9708, + "step": 2882 + }, + { + "epoch": 0.28, + "grad_norm": 0.26703604465532904, + "learning_rate": 0.00019580256333229804, + "loss": 1.0996, + "step": 2883 + }, + { + "epoch": 0.28, + "grad_norm": 0.3054149559628321, + "learning_rate": 0.00019579802696159098, + "loss": 1.1972, + "step": 2884 + }, + { + "epoch": 0.28, + "grad_norm": 0.2554844575267877, + "learning_rate": 0.00019579348819347814, + "loss": 1.1815, + "step": 2885 + }, + { + "epoch": 0.28, + "grad_norm": 0.27725302476518116, + "learning_rate": 0.00019578894702807303, + "loss": 1.1779, + "step": 2886 + }, + { + "epoch": 0.28, + "grad_norm": 0.2812121223614431, + "learning_rate": 0.0001957844034654893, + "loss": 1.0931, + "step": 2887 + }, + { + "epoch": 0.28, + "grad_norm": 0.28657879183155893, + "learning_rate": 0.0001957798575058407, + "loss": 1.209, + "step": 2888 + }, + { + "epoch": 0.28, + "grad_norm": 0.32607385402952277, + "learning_rate": 0.00019577530914924096, + "loss": 1.083, + "step": 2889 + }, + { + "epoch": 0.28, + "grad_norm": 0.2622053283912726, + "learning_rate": 0.00019577075839580395, + "loss": 1.1331, + "step": 2890 + }, + { + "epoch": 0.28, + "grad_norm": 0.27330452773103864, + "learning_rate": 0.00019576620524564347, + "loss": 1.1223, + "step": 2891 + }, + { + "epoch": 0.28, + "grad_norm": 0.28722568397660336, + "learning_rate": 0.00019576164969887353, + "loss": 1.0848, + "step": 2892 + }, + { + "epoch": 0.28, + "grad_norm": 0.25111702949361947, + "learning_rate": 0.00019575709175560815, + "loss": 1.1015, + "step": 2893 + }, + { + "epoch": 0.28, + "grad_norm": 0.27784956399299426, + "learning_rate": 0.00019575253141596136, + "loss": 1.0712, + "step": 2894 + }, + { + "epoch": 0.28, + "grad_norm": 0.29027995612195606, + "learning_rate": 0.00019574796868004728, + "loss": 1.0522, + "step": 2895 + }, + { + "epoch": 0.28, + "grad_norm": 0.2662180849874586, + "learning_rate": 0.00019574340354798012, + "loss": 1.0711, + "step": 2896 + }, + { + "epoch": 0.28, + "grad_norm": 0.272996240282476, + "learning_rate": 0.00019573883601987409, + "loss": 1.1081, + "step": 2897 + }, + { + "epoch": 0.28, + "grad_norm": 0.2500926481674787, + "learning_rate": 0.00019573426609584353, + "loss": 1.0818, + "step": 2898 + }, + { + "epoch": 0.28, + "grad_norm": 0.2690599907288768, + "learning_rate": 0.00019572969377600278, + "loss": 1.1512, + "step": 2899 + }, + { + "epoch": 0.28, + "grad_norm": 0.26895706721452967, + "learning_rate": 0.00019572511906046632, + "loss": 1.106, + "step": 2900 + }, + { + "epoch": 0.28, + "grad_norm": 0.28870985265507426, + "learning_rate": 0.00019572054194934855, + "loss": 1.0406, + "step": 2901 + }, + { + "epoch": 0.28, + "grad_norm": 0.2569564435809099, + "learning_rate": 0.00019571596244276408, + "loss": 1.1162, + "step": 2902 + }, + { + "epoch": 0.28, + "grad_norm": 0.26799107456705956, + "learning_rate": 0.0001957113805408275, + "loss": 1.0301, + "step": 2903 + }, + { + "epoch": 0.28, + "grad_norm": 0.2476133822775531, + "learning_rate": 0.00019570679624365348, + "loss": 1.1889, + "step": 2904 + }, + { + "epoch": 0.28, + "grad_norm": 0.28688265263367885, + "learning_rate": 0.00019570220955135673, + "loss": 1.0879, + "step": 2905 + }, + { + "epoch": 0.28, + "grad_norm": 0.279647668676218, + "learning_rate": 0.000195697620464052, + "loss": 1.1741, + "step": 2906 + }, + { + "epoch": 0.28, + "grad_norm": 0.26565186704575483, + "learning_rate": 0.0001956930289818542, + "loss": 1.0533, + "step": 2907 + }, + { + "epoch": 0.28, + "grad_norm": 0.27848478850209946, + "learning_rate": 0.00019568843510487822, + "loss": 1.0685, + "step": 2908 + }, + { + "epoch": 0.28, + "grad_norm": 0.27824963257277385, + "learning_rate": 0.00019568383883323902, + "loss": 1.154, + "step": 2909 + }, + { + "epoch": 0.28, + "grad_norm": 0.2889721158688345, + "learning_rate": 0.0001956792401670516, + "loss": 1.0229, + "step": 2910 + }, + { + "epoch": 0.28, + "grad_norm": 0.2433224132508536, + "learning_rate": 0.00019567463910643106, + "loss": 1.0934, + "step": 2911 + }, + { + "epoch": 0.28, + "grad_norm": 0.2558852060337151, + "learning_rate": 0.00019567003565149256, + "loss": 1.035, + "step": 2912 + }, + { + "epoch": 0.28, + "grad_norm": 0.25619363373049336, + "learning_rate": 0.0001956654298023513, + "loss": 1.1415, + "step": 2913 + }, + { + "epoch": 0.28, + "grad_norm": 0.2913829589793719, + "learning_rate": 0.0001956608215591225, + "loss": 1.0902, + "step": 2914 + }, + { + "epoch": 0.28, + "grad_norm": 0.25636509940637525, + "learning_rate": 0.00019565621092192156, + "loss": 1.0804, + "step": 2915 + }, + { + "epoch": 0.28, + "grad_norm": 0.2911863913856473, + "learning_rate": 0.00019565159789086377, + "loss": 1.0234, + "step": 2916 + }, + { + "epoch": 0.28, + "grad_norm": 0.3098448701698118, + "learning_rate": 0.00019564698246606467, + "loss": 1.018, + "step": 2917 + }, + { + "epoch": 0.28, + "grad_norm": 0.28894296908926365, + "learning_rate": 0.00019564236464763971, + "loss": 1.0444, + "step": 2918 + }, + { + "epoch": 0.28, + "grad_norm": 0.2969290728405071, + "learning_rate": 0.00019563774443570448, + "loss": 1.0826, + "step": 2919 + }, + { + "epoch": 0.28, + "grad_norm": 0.26737166147445435, + "learning_rate": 0.00019563312183037458, + "loss": 1.1668, + "step": 2920 + }, + { + "epoch": 0.28, + "grad_norm": 0.28206692002567496, + "learning_rate": 0.0001956284968317657, + "loss": 1.0922, + "step": 2921 + }, + { + "epoch": 0.28, + "grad_norm": 0.24812211567721631, + "learning_rate": 0.0001956238694399936, + "loss": 1.1432, + "step": 2922 + }, + { + "epoch": 0.28, + "grad_norm": 0.2599840736277116, + "learning_rate": 0.00019561923965517405, + "loss": 1.0521, + "step": 2923 + }, + { + "epoch": 0.28, + "grad_norm": 0.2543940271199818, + "learning_rate": 0.00019561460747742295, + "loss": 1.1435, + "step": 2924 + }, + { + "epoch": 0.28, + "grad_norm": 0.27672619084132033, + "learning_rate": 0.0001956099729068562, + "loss": 1.0804, + "step": 2925 + }, + { + "epoch": 0.28, + "grad_norm": 0.257274251896411, + "learning_rate": 0.0001956053359435898, + "loss": 1.1605, + "step": 2926 + }, + { + "epoch": 0.28, + "grad_norm": 0.2700086089277536, + "learning_rate": 0.00019560069658773976, + "loss": 1.1006, + "step": 2927 + }, + { + "epoch": 0.28, + "grad_norm": 0.2729066392922699, + "learning_rate": 0.00019559605483942223, + "loss": 0.9848, + "step": 2928 + }, + { + "epoch": 0.28, + "grad_norm": 0.2819343853836847, + "learning_rate": 0.0001955914106987533, + "loss": 1.0987, + "step": 2929 + }, + { + "epoch": 0.28, + "grad_norm": 0.28151027604586293, + "learning_rate": 0.00019558676416584929, + "loss": 1.1282, + "step": 2930 + }, + { + "epoch": 0.28, + "grad_norm": 0.26677614820796297, + "learning_rate": 0.0001955821152408264, + "loss": 1.1034, + "step": 2931 + }, + { + "epoch": 0.28, + "grad_norm": 0.3059666303412851, + "learning_rate": 0.00019557746392380104, + "loss": 1.1612, + "step": 2932 + }, + { + "epoch": 0.28, + "grad_norm": 0.2819684123621462, + "learning_rate": 0.00019557281021488957, + "loss": 1.0681, + "step": 2933 + }, + { + "epoch": 0.28, + "grad_norm": 0.23809460047261669, + "learning_rate": 0.00019556815411420842, + "loss": 1.0539, + "step": 2934 + }, + { + "epoch": 0.28, + "grad_norm": 0.26195198259872626, + "learning_rate": 0.0001955634956218742, + "loss": 1.0663, + "step": 2935 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776958285854396, + "learning_rate": 0.00019555883473800344, + "loss": 1.2042, + "step": 2936 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776878056257854, + "learning_rate": 0.00019555417146271275, + "loss": 1.0723, + "step": 2937 + }, + { + "epoch": 0.28, + "grad_norm": 0.22289840395374008, + "learning_rate": 0.00019554950579611888, + "loss": 1.101, + "step": 2938 + }, + { + "epoch": 0.28, + "grad_norm": 0.2765820638955678, + "learning_rate": 0.00019554483773833855, + "loss": 1.1081, + "step": 2939 + }, + { + "epoch": 0.28, + "grad_norm": 0.313800298123316, + "learning_rate": 0.00019554016728948865, + "loss": 1.1322, + "step": 2940 + }, + { + "epoch": 0.28, + "grad_norm": 0.25205600427022146, + "learning_rate": 0.00019553549444968602, + "loss": 1.0419, + "step": 2941 + }, + { + "epoch": 0.28, + "grad_norm": 0.271795514200843, + "learning_rate": 0.00019553081921904757, + "loss": 1.1375, + "step": 2942 + }, + { + "epoch": 0.28, + "grad_norm": 0.28101236125294443, + "learning_rate": 0.00019552614159769034, + "loss": 1.0403, + "step": 2943 + }, + { + "epoch": 0.28, + "grad_norm": 0.25207664771282795, + "learning_rate": 0.0001955214615857314, + "loss": 1.0785, + "step": 2944 + }, + { + "epoch": 0.28, + "grad_norm": 0.27380543511172994, + "learning_rate": 0.00019551677918328784, + "loss": 1.1187, + "step": 2945 + }, + { + "epoch": 0.28, + "grad_norm": 0.2732262776480482, + "learning_rate": 0.00019551209439047683, + "loss": 1.1426, + "step": 2946 + }, + { + "epoch": 0.28, + "grad_norm": 0.25773244127240097, + "learning_rate": 0.00019550740720741564, + "loss": 1.019, + "step": 2947 + }, + { + "epoch": 0.28, + "grad_norm": 0.2649249368780316, + "learning_rate": 0.0001955027176342216, + "loss": 1.0553, + "step": 2948 + }, + { + "epoch": 0.28, + "grad_norm": 0.2778435224973774, + "learning_rate": 0.00019549802567101198, + "loss": 1.1484, + "step": 2949 + }, + { + "epoch": 0.28, + "grad_norm": 0.2835832433902629, + "learning_rate": 0.00019549333131790427, + "loss": 1.1626, + "step": 2950 + }, + { + "epoch": 0.28, + "grad_norm": 0.2684901729578864, + "learning_rate": 0.00019548863457501592, + "loss": 1.0469, + "step": 2951 + }, + { + "epoch": 0.28, + "grad_norm": 0.2917738619766324, + "learning_rate": 0.0001954839354424645, + "loss": 1.0706, + "step": 2952 + }, + { + "epoch": 0.28, + "grad_norm": 0.2719180117243129, + "learning_rate": 0.00019547923392036756, + "loss": 1.069, + "step": 2953 + }, + { + "epoch": 0.28, + "grad_norm": 0.32365658454747653, + "learning_rate": 0.00019547453000884278, + "loss": 1.2248, + "step": 2954 + }, + { + "epoch": 0.28, + "grad_norm": 0.22919570649820376, + "learning_rate": 0.0001954698237080079, + "loss": 0.9762, + "step": 2955 + }, + { + "epoch": 0.28, + "grad_norm": 0.27049554069580856, + "learning_rate": 0.00019546511501798068, + "loss": 1.0445, + "step": 2956 + }, + { + "epoch": 0.28, + "grad_norm": 0.2321876330280462, + "learning_rate": 0.00019546040393887896, + "loss": 0.9582, + "step": 2957 + }, + { + "epoch": 0.28, + "grad_norm": 0.2866826620321833, + "learning_rate": 0.00019545569047082063, + "loss": 1.0803, + "step": 2958 + }, + { + "epoch": 0.28, + "grad_norm": 0.2513540583580118, + "learning_rate": 0.00019545097461392364, + "loss": 1.083, + "step": 2959 + }, + { + "epoch": 0.28, + "grad_norm": 0.26654262390528605, + "learning_rate": 0.00019544625636830606, + "loss": 1.0319, + "step": 2960 + }, + { + "epoch": 0.28, + "grad_norm": 0.2413056145935159, + "learning_rate": 0.00019544153573408592, + "loss": 1.158, + "step": 2961 + }, + { + "epoch": 0.28, + "grad_norm": 0.26314675374807356, + "learning_rate": 0.00019543681271138135, + "loss": 1.1581, + "step": 2962 + }, + { + "epoch": 0.28, + "grad_norm": 0.302120035744808, + "learning_rate": 0.00019543208730031056, + "loss": 1.0621, + "step": 2963 + }, + { + "epoch": 0.28, + "grad_norm": 0.2784231038369151, + "learning_rate": 0.0001954273595009918, + "loss": 1.1521, + "step": 2964 + }, + { + "epoch": 0.28, + "grad_norm": 0.3301489966327534, + "learning_rate": 0.00019542262931354342, + "loss": 1.1991, + "step": 2965 + }, + { + "epoch": 0.28, + "grad_norm": 0.2625920950709, + "learning_rate": 0.00019541789673808378, + "loss": 1.1439, + "step": 2966 + }, + { + "epoch": 0.28, + "grad_norm": 0.2799876119710994, + "learning_rate": 0.00019541316177473127, + "loss": 1.2343, + "step": 2967 + }, + { + "epoch": 0.28, + "grad_norm": 0.25861702387999425, + "learning_rate": 0.00019540842442360444, + "loss": 1.0334, + "step": 2968 + }, + { + "epoch": 0.28, + "grad_norm": 0.26861453164120885, + "learning_rate": 0.00019540368468482183, + "loss": 1.0876, + "step": 2969 + }, + { + "epoch": 0.28, + "grad_norm": 0.2790820406297911, + "learning_rate": 0.00019539894255850203, + "loss": 1.2192, + "step": 2970 + }, + { + "epoch": 0.28, + "grad_norm": 0.26958726146743006, + "learning_rate": 0.00019539419804476377, + "loss": 1.071, + "step": 2971 + }, + { + "epoch": 0.28, + "grad_norm": 0.2677639222097805, + "learning_rate": 0.00019538945114372573, + "loss": 1.2223, + "step": 2972 + }, + { + "epoch": 0.28, + "grad_norm": 0.26245494649962503, + "learning_rate": 0.00019538470185550674, + "loss": 1.105, + "step": 2973 + }, + { + "epoch": 0.28, + "grad_norm": 0.2693830042475195, + "learning_rate": 0.00019537995018022563, + "loss": 1.1118, + "step": 2974 + }, + { + "epoch": 0.28, + "grad_norm": 0.24631907921671092, + "learning_rate": 0.0001953751961180013, + "loss": 1.0035, + "step": 2975 + }, + { + "epoch": 0.28, + "grad_norm": 0.30613365197670445, + "learning_rate": 0.00019537043966895277, + "loss": 1.0775, + "step": 2976 + }, + { + "epoch": 0.28, + "grad_norm": 0.2776873480898374, + "learning_rate": 0.00019536568083319903, + "loss": 1.0197, + "step": 2977 + }, + { + "epoch": 0.28, + "grad_norm": 0.28885110017090304, + "learning_rate": 0.00019536091961085922, + "loss": 1.1091, + "step": 2978 + }, + { + "epoch": 0.29, + "grad_norm": 0.2567410212869977, + "learning_rate": 0.00019535615600205247, + "loss": 1.0443, + "step": 2979 + }, + { + "epoch": 0.29, + "grad_norm": 0.28945345981693216, + "learning_rate": 0.00019535139000689795, + "loss": 1.0625, + "step": 2980 + }, + { + "epoch": 0.29, + "grad_norm": 0.2774312492094651, + "learning_rate": 0.000195346621625515, + "loss": 1.1077, + "step": 2981 + }, + { + "epoch": 0.29, + "grad_norm": 0.26395699634012787, + "learning_rate": 0.00019534185085802293, + "loss": 1.0201, + "step": 2982 + }, + { + "epoch": 0.29, + "grad_norm": 0.28496537247736714, + "learning_rate": 0.0001953370777045411, + "loss": 1.1266, + "step": 2983 + }, + { + "epoch": 0.29, + "grad_norm": 0.3243637973540252, + "learning_rate": 0.00019533230216518897, + "loss": 1.1888, + "step": 2984 + }, + { + "epoch": 0.29, + "grad_norm": 0.29081576092276984, + "learning_rate": 0.00019532752424008607, + "loss": 1.0315, + "step": 2985 + }, + { + "epoch": 0.29, + "grad_norm": 0.2850948882845963, + "learning_rate": 0.00019532274392935198, + "loss": 1.0013, + "step": 2986 + }, + { + "epoch": 0.29, + "grad_norm": 0.2923430922930697, + "learning_rate": 0.0001953179612331063, + "loss": 0.9897, + "step": 2987 + }, + { + "epoch": 0.29, + "grad_norm": 0.29321038881690914, + "learning_rate": 0.00019531317615146873, + "loss": 1.1548, + "step": 2988 + }, + { + "epoch": 0.29, + "grad_norm": 0.28328990204826776, + "learning_rate": 0.00019530838868455906, + "loss": 1.0857, + "step": 2989 + }, + { + "epoch": 0.29, + "grad_norm": 0.25769647168620946, + "learning_rate": 0.00019530359883249701, + "loss": 1.025, + "step": 2990 + }, + { + "epoch": 0.29, + "grad_norm": 0.2796837930311345, + "learning_rate": 0.00019529880659540256, + "loss": 1.1879, + "step": 2991 + }, + { + "epoch": 0.29, + "grad_norm": 0.2787744820467792, + "learning_rate": 0.00019529401197339557, + "loss": 1.1248, + "step": 2992 + }, + { + "epoch": 0.29, + "grad_norm": 0.269889282961234, + "learning_rate": 0.00019528921496659603, + "loss": 1.0331, + "step": 2993 + }, + { + "epoch": 0.29, + "grad_norm": 0.28222433704753624, + "learning_rate": 0.00019528441557512398, + "loss": 1.0523, + "step": 2994 + }, + { + "epoch": 0.29, + "grad_norm": 0.27863870289129955, + "learning_rate": 0.00019527961379909957, + "loss": 1.1745, + "step": 2995 + }, + { + "epoch": 0.29, + "grad_norm": 0.31883034906636215, + "learning_rate": 0.00019527480963864294, + "loss": 1.0541, + "step": 2996 + }, + { + "epoch": 0.29, + "grad_norm": 0.27111306738362173, + "learning_rate": 0.0001952700030938743, + "loss": 1.1796, + "step": 2997 + }, + { + "epoch": 0.29, + "grad_norm": 0.27408759182730913, + "learning_rate": 0.00019526519416491401, + "loss": 1.0041, + "step": 2998 + }, + { + "epoch": 0.29, + "grad_norm": 0.24522235103299234, + "learning_rate": 0.0001952603828518823, + "loss": 0.8971, + "step": 2999 + }, + { + "epoch": 0.29, + "grad_norm": 0.2583032592016034, + "learning_rate": 0.00019525556915489967, + "loss": 0.9294, + "step": 3000 + }, + { + "epoch": 0.29, + "grad_norm": 0.25624359117015355, + "learning_rate": 0.00019525075307408655, + "loss": 0.9701, + "step": 3001 + }, + { + "epoch": 0.29, + "grad_norm": 0.28720231190552714, + "learning_rate": 0.0001952459346095635, + "loss": 1.0356, + "step": 3002 + }, + { + "epoch": 0.29, + "grad_norm": 0.29696988241107114, + "learning_rate": 0.00019524111376145105, + "loss": 1.0428, + "step": 3003 + }, + { + "epoch": 0.29, + "grad_norm": 0.26750283784091977, + "learning_rate": 0.00019523629052986988, + "loss": 1.0176, + "step": 3004 + }, + { + "epoch": 0.29, + "grad_norm": 0.28397737724131106, + "learning_rate": 0.00019523146491494067, + "loss": 1.0977, + "step": 3005 + }, + { + "epoch": 0.29, + "grad_norm": 0.30294167237041875, + "learning_rate": 0.0001952266369167842, + "loss": 1.1875, + "step": 3006 + }, + { + "epoch": 0.29, + "grad_norm": 0.2580269760608724, + "learning_rate": 0.00019522180653552132, + "loss": 1.0923, + "step": 3007 + }, + { + "epoch": 0.29, + "grad_norm": 0.2788977172196587, + "learning_rate": 0.00019521697377127285, + "loss": 1.1057, + "step": 3008 + }, + { + "epoch": 0.29, + "grad_norm": 0.26291500633347537, + "learning_rate": 0.00019521213862415979, + "loss": 1.1551, + "step": 3009 + }, + { + "epoch": 0.29, + "grad_norm": 0.2760993256298351, + "learning_rate": 0.00019520730109430314, + "loss": 1.0357, + "step": 3010 + }, + { + "epoch": 0.29, + "grad_norm": 0.2851223724038372, + "learning_rate": 0.0001952024611818239, + "loss": 1.1088, + "step": 3011 + }, + { + "epoch": 0.29, + "grad_norm": 0.2613411081969399, + "learning_rate": 0.00019519761888684326, + "loss": 1.0058, + "step": 3012 + }, + { + "epoch": 0.29, + "grad_norm": 0.2374352044099212, + "learning_rate": 0.0001951927742094824, + "loss": 1.0545, + "step": 3013 + }, + { + "epoch": 0.29, + "grad_norm": 0.27080671804410106, + "learning_rate": 0.00019518792714986254, + "loss": 1.1475, + "step": 3014 + }, + { + "epoch": 0.29, + "grad_norm": 0.2993708318879525, + "learning_rate": 0.00019518307770810496, + "loss": 1.0931, + "step": 3015 + }, + { + "epoch": 0.29, + "grad_norm": 0.29400732138156965, + "learning_rate": 0.00019517822588433102, + "loss": 1.0799, + "step": 3016 + }, + { + "epoch": 0.29, + "grad_norm": 0.25464534887846263, + "learning_rate": 0.0001951733716786622, + "loss": 0.993, + "step": 3017 + }, + { + "epoch": 0.29, + "grad_norm": 0.2652448178700676, + "learning_rate": 0.0001951685150912199, + "loss": 1.1835, + "step": 3018 + }, + { + "epoch": 0.29, + "grad_norm": 0.3077147987981465, + "learning_rate": 0.00019516365612212572, + "loss": 1.0706, + "step": 3019 + }, + { + "epoch": 0.29, + "grad_norm": 0.30685934393160413, + "learning_rate": 0.00019515879477150123, + "loss": 1.0244, + "step": 3020 + }, + { + "epoch": 0.29, + "grad_norm": 0.2856291230251649, + "learning_rate": 0.00019515393103946812, + "loss": 1.0963, + "step": 3021 + }, + { + "epoch": 0.29, + "grad_norm": 0.2767228605351276, + "learning_rate": 0.00019514906492614805, + "loss": 0.9945, + "step": 3022 + }, + { + "epoch": 0.29, + "grad_norm": 0.27035783848571304, + "learning_rate": 0.00019514419643166283, + "loss": 1.1075, + "step": 3023 + }, + { + "epoch": 0.29, + "grad_norm": 0.2990769279659998, + "learning_rate": 0.0001951393255561343, + "loss": 1.1556, + "step": 3024 + }, + { + "epoch": 0.29, + "grad_norm": 0.2844357764929915, + "learning_rate": 0.00019513445229968438, + "loss": 0.9933, + "step": 3025 + }, + { + "epoch": 0.29, + "grad_norm": 0.3070197133609208, + "learning_rate": 0.000195129576662435, + "loss": 1.0885, + "step": 3026 + }, + { + "epoch": 0.29, + "grad_norm": 0.2820008198156176, + "learning_rate": 0.0001951246986445082, + "loss": 1.0819, + "step": 3027 + }, + { + "epoch": 0.29, + "grad_norm": 0.27229352040640303, + "learning_rate": 0.00019511981824602598, + "loss": 1.046, + "step": 3028 + }, + { + "epoch": 0.29, + "grad_norm": 0.2739544259171004, + "learning_rate": 0.00019511493546711054, + "loss": 1.0647, + "step": 3029 + }, + { + "epoch": 0.29, + "grad_norm": 0.2620879572449313, + "learning_rate": 0.00019511005030788407, + "loss": 1.1027, + "step": 3030 + }, + { + "epoch": 0.29, + "grad_norm": 0.251134914749705, + "learning_rate": 0.00019510516276846884, + "loss": 1.0464, + "step": 3031 + }, + { + "epoch": 0.29, + "grad_norm": 0.23480951173895892, + "learning_rate": 0.0001951002728489871, + "loss": 1.0258, + "step": 3032 + }, + { + "epoch": 0.29, + "grad_norm": 0.2637238517781191, + "learning_rate": 0.0001950953805495613, + "loss": 1.0609, + "step": 3033 + }, + { + "epoch": 0.29, + "grad_norm": 0.23941970843056096, + "learning_rate": 0.0001950904858703138, + "loss": 0.9995, + "step": 3034 + }, + { + "epoch": 0.29, + "grad_norm": 0.2674317195338645, + "learning_rate": 0.00019508558881136716, + "loss": 1.1166, + "step": 3035 + }, + { + "epoch": 0.29, + "grad_norm": 0.2671056081371556, + "learning_rate": 0.0001950806893728439, + "loss": 1.1134, + "step": 3036 + }, + { + "epoch": 0.29, + "grad_norm": 0.25637597126512307, + "learning_rate": 0.0001950757875548666, + "loss": 1.0847, + "step": 3037 + }, + { + "epoch": 0.29, + "grad_norm": 0.26941442259152115, + "learning_rate": 0.000195070883357558, + "loss": 1.1351, + "step": 3038 + }, + { + "epoch": 0.29, + "grad_norm": 0.26139367123922513, + "learning_rate": 0.00019506597678104078, + "loss": 1.1819, + "step": 3039 + }, + { + "epoch": 0.29, + "grad_norm": 0.2730769424356869, + "learning_rate": 0.00019506106782543774, + "loss": 1.0862, + "step": 3040 + }, + { + "epoch": 0.29, + "grad_norm": 0.2564183571077773, + "learning_rate": 0.00019505615649087173, + "loss": 1.057, + "step": 3041 + }, + { + "epoch": 0.29, + "grad_norm": 0.27496154581521765, + "learning_rate": 0.00019505124277746568, + "loss": 1.0365, + "step": 3042 + }, + { + "epoch": 0.29, + "grad_norm": 0.2906578614460428, + "learning_rate": 0.00019504632668534253, + "loss": 1.0765, + "step": 3043 + }, + { + "epoch": 0.29, + "grad_norm": 0.24889624819261374, + "learning_rate": 0.00019504140821462534, + "loss": 1.0847, + "step": 3044 + }, + { + "epoch": 0.29, + "grad_norm": 0.26592584153440635, + "learning_rate": 0.00019503648736543715, + "loss": 1.0803, + "step": 3045 + }, + { + "epoch": 0.29, + "grad_norm": 0.2944881481822344, + "learning_rate": 0.00019503156413790113, + "loss": 1.0591, + "step": 3046 + }, + { + "epoch": 0.29, + "grad_norm": 0.2918642575968384, + "learning_rate": 0.00019502663853214052, + "loss": 1.0976, + "step": 3047 + }, + { + "epoch": 0.29, + "grad_norm": 0.26006791100294435, + "learning_rate": 0.00019502171054827856, + "loss": 1.1608, + "step": 3048 + }, + { + "epoch": 0.29, + "grad_norm": 0.3177927171422205, + "learning_rate": 0.00019501678018643854, + "loss": 1.1429, + "step": 3049 + }, + { + "epoch": 0.29, + "grad_norm": 0.26535744168889774, + "learning_rate": 0.0001950118474467439, + "loss": 1.0564, + "step": 3050 + }, + { + "epoch": 0.29, + "grad_norm": 0.30516176813295376, + "learning_rate": 0.00019500691232931806, + "loss": 1.085, + "step": 3051 + }, + { + "epoch": 0.29, + "grad_norm": 0.3062397007408206, + "learning_rate": 0.00019500197483428454, + "loss": 1.202, + "step": 3052 + }, + { + "epoch": 0.29, + "grad_norm": 0.30600809859823863, + "learning_rate": 0.0001949970349617669, + "loss": 1.0306, + "step": 3053 + }, + { + "epoch": 0.29, + "grad_norm": 0.26553989672712847, + "learning_rate": 0.00019499209271188874, + "loss": 1.1253, + "step": 3054 + }, + { + "epoch": 0.29, + "grad_norm": 0.3115655362592651, + "learning_rate": 0.00019498714808477375, + "loss": 1.0844, + "step": 3055 + }, + { + "epoch": 0.29, + "grad_norm": 0.2733439826555348, + "learning_rate": 0.00019498220108054573, + "loss": 1.0594, + "step": 3056 + }, + { + "epoch": 0.29, + "grad_norm": 0.27824607858191935, + "learning_rate": 0.00019497725169932839, + "loss": 1.1842, + "step": 3057 + }, + { + "epoch": 0.29, + "grad_norm": 0.29361249395554595, + "learning_rate": 0.00019497229994124563, + "loss": 1.156, + "step": 3058 + }, + { + "epoch": 0.29, + "grad_norm": 0.2909096666085275, + "learning_rate": 0.00019496734580642139, + "loss": 1.0713, + "step": 3059 + }, + { + "epoch": 0.29, + "grad_norm": 0.3008406748557583, + "learning_rate": 0.00019496238929497968, + "loss": 0.9974, + "step": 3060 + }, + { + "epoch": 0.29, + "grad_norm": 0.2557208115014124, + "learning_rate": 0.00019495743040704445, + "loss": 1.1056, + "step": 3061 + }, + { + "epoch": 0.29, + "grad_norm": 0.2619314513826558, + "learning_rate": 0.00019495246914273985, + "loss": 1.121, + "step": 3062 + }, + { + "epoch": 0.29, + "grad_norm": 0.2777726951083541, + "learning_rate": 0.00019494750550219, + "loss": 1.1167, + "step": 3063 + }, + { + "epoch": 0.29, + "grad_norm": 0.27592070412234426, + "learning_rate": 0.00019494253948551922, + "loss": 1.1754, + "step": 3064 + }, + { + "epoch": 0.29, + "grad_norm": 0.27823194441497656, + "learning_rate": 0.0001949375710928517, + "loss": 1.0839, + "step": 3065 + }, + { + "epoch": 0.29, + "grad_norm": 0.2982548750310373, + "learning_rate": 0.00019493260032431176, + "loss": 1.1597, + "step": 3066 + }, + { + "epoch": 0.29, + "grad_norm": 0.2859522245926003, + "learning_rate": 0.00019492762718002386, + "loss": 1.1175, + "step": 3067 + }, + { + "epoch": 0.29, + "grad_norm": 0.25703939560705324, + "learning_rate": 0.00019492265166011244, + "loss": 1.0775, + "step": 3068 + }, + { + "epoch": 0.29, + "grad_norm": 0.28439233978147976, + "learning_rate": 0.000194917673764702, + "loss": 1.1447, + "step": 3069 + }, + { + "epoch": 0.29, + "grad_norm": 0.2794544690967338, + "learning_rate": 0.00019491269349391712, + "loss": 1.1776, + "step": 3070 + }, + { + "epoch": 0.29, + "grad_norm": 0.2438574837351192, + "learning_rate": 0.00019490771084788242, + "loss": 1.1019, + "step": 3071 + }, + { + "epoch": 0.29, + "grad_norm": 0.24023569609710485, + "learning_rate": 0.00019490272582672262, + "loss": 1.1135, + "step": 3072 + }, + { + "epoch": 0.29, + "grad_norm": 0.29972615964686367, + "learning_rate": 0.00019489773843056244, + "loss": 1.069, + "step": 3073 + }, + { + "epoch": 0.29, + "grad_norm": 0.2589384956974427, + "learning_rate": 0.00019489274865952676, + "loss": 1.2025, + "step": 3074 + }, + { + "epoch": 0.29, + "grad_norm": 0.2597338399690944, + "learning_rate": 0.00019488775651374038, + "loss": 1.0932, + "step": 3075 + }, + { + "epoch": 0.29, + "grad_norm": 0.31065520260111457, + "learning_rate": 0.00019488276199332825, + "loss": 1.2195, + "step": 3076 + }, + { + "epoch": 0.29, + "grad_norm": 0.25453729205708414, + "learning_rate": 0.0001948777650984154, + "loss": 1.107, + "step": 3077 + }, + { + "epoch": 0.29, + "grad_norm": 0.29862291936090024, + "learning_rate": 0.00019487276582912683, + "loss": 1.1301, + "step": 3078 + }, + { + "epoch": 0.29, + "grad_norm": 0.3221818146284146, + "learning_rate": 0.00019486776418558766, + "loss": 1.1191, + "step": 3079 + }, + { + "epoch": 0.29, + "grad_norm": 0.3782095978448149, + "learning_rate": 0.0001948627601679231, + "loss": 1.0482, + "step": 3080 + }, + { + "epoch": 0.29, + "grad_norm": 0.28378395622830654, + "learning_rate": 0.0001948577537762583, + "loss": 1.1376, + "step": 3081 + }, + { + "epoch": 0.29, + "grad_norm": 0.3319042029552207, + "learning_rate": 0.00019485274501071864, + "loss": 1.0665, + "step": 3082 + }, + { + "epoch": 0.29, + "grad_norm": 0.26347510714336875, + "learning_rate": 0.00019484773387142942, + "loss": 1.1515, + "step": 3083 + }, + { + "epoch": 0.3, + "grad_norm": 0.3101715271811368, + "learning_rate": 0.000194842720358516, + "loss": 1.1255, + "step": 3084 + }, + { + "epoch": 0.3, + "grad_norm": 0.25293432322802967, + "learning_rate": 0.00019483770447210397, + "loss": 1.0296, + "step": 3085 + }, + { + "epoch": 0.3, + "grad_norm": 0.28621886274041924, + "learning_rate": 0.00019483268621231875, + "loss": 0.9487, + "step": 3086 + }, + { + "epoch": 0.3, + "grad_norm": 0.2747791927139919, + "learning_rate": 0.00019482766557928592, + "loss": 1.0543, + "step": 3087 + }, + { + "epoch": 0.3, + "grad_norm": 0.26223503943348814, + "learning_rate": 0.00019482264257313122, + "loss": 1.0122, + "step": 3088 + }, + { + "epoch": 0.3, + "grad_norm": 0.27709490802994835, + "learning_rate": 0.00019481761719398027, + "loss": 1.119, + "step": 3089 + }, + { + "epoch": 0.3, + "grad_norm": 0.26196843932694963, + "learning_rate": 0.00019481258944195886, + "loss": 1.0707, + "step": 3090 + }, + { + "epoch": 0.3, + "grad_norm": 0.2782018365147699, + "learning_rate": 0.00019480755931719281, + "loss": 1.038, + "step": 3091 + }, + { + "epoch": 0.3, + "grad_norm": 0.24361390545083103, + "learning_rate": 0.00019480252681980802, + "loss": 1.056, + "step": 3092 + }, + { + "epoch": 0.3, + "grad_norm": 0.2674864730406895, + "learning_rate": 0.0001947974919499304, + "loss": 1.1546, + "step": 3093 + }, + { + "epoch": 0.3, + "grad_norm": 0.25038494984812143, + "learning_rate": 0.00019479245470768595, + "loss": 1.0509, + "step": 3094 + }, + { + "epoch": 0.3, + "grad_norm": 0.27600843103065575, + "learning_rate": 0.00019478741509320076, + "loss": 1.1192, + "step": 3095 + }, + { + "epoch": 0.3, + "grad_norm": 0.2736297017538167, + "learning_rate": 0.00019478237310660093, + "loss": 1.1505, + "step": 3096 + }, + { + "epoch": 0.3, + "grad_norm": 0.2545876917992688, + "learning_rate": 0.00019477732874801265, + "loss": 1.0992, + "step": 3097 + }, + { + "epoch": 0.3, + "grad_norm": 0.2539192301177348, + "learning_rate": 0.0001947722820175622, + "loss": 1.044, + "step": 3098 + }, + { + "epoch": 0.3, + "grad_norm": 0.25814263130227105, + "learning_rate": 0.00019476723291537575, + "loss": 1.1827, + "step": 3099 + }, + { + "epoch": 0.3, + "grad_norm": 0.2976356116278242, + "learning_rate": 0.0001947621814415798, + "loss": 0.9778, + "step": 3100 + }, + { + "epoch": 0.3, + "grad_norm": 0.2625055643669641, + "learning_rate": 0.00019475712759630068, + "loss": 1.0887, + "step": 3101 + }, + { + "epoch": 0.3, + "grad_norm": 0.26312176570889856, + "learning_rate": 0.00019475207137966487, + "loss": 1.0807, + "step": 3102 + }, + { + "epoch": 0.3, + "grad_norm": 0.2618322867889844, + "learning_rate": 0.00019474701279179895, + "loss": 1.2045, + "step": 3103 + }, + { + "epoch": 0.3, + "grad_norm": 0.2891550174279668, + "learning_rate": 0.00019474195183282947, + "loss": 1.0771, + "step": 3104 + }, + { + "epoch": 0.3, + "grad_norm": 0.3202983567379544, + "learning_rate": 0.00019473688850288312, + "loss": 1.1852, + "step": 3105 + }, + { + "epoch": 0.3, + "grad_norm": 0.25021772062444586, + "learning_rate": 0.0001947318228020866, + "loss": 1.1832, + "step": 3106 + }, + { + "epoch": 0.3, + "grad_norm": 0.2930599815597174, + "learning_rate": 0.00019472675473056666, + "loss": 1.0763, + "step": 3107 + }, + { + "epoch": 0.3, + "grad_norm": 0.30676853669698495, + "learning_rate": 0.00019472168428845014, + "loss": 1.0405, + "step": 3108 + }, + { + "epoch": 0.3, + "grad_norm": 0.2648409443563144, + "learning_rate": 0.00019471661147586395, + "loss": 1.1125, + "step": 3109 + }, + { + "epoch": 0.3, + "grad_norm": 0.26584314244912965, + "learning_rate": 0.00019471153629293503, + "loss": 1.0697, + "step": 3110 + }, + { + "epoch": 0.3, + "grad_norm": 0.37140190776674137, + "learning_rate": 0.0001947064587397904, + "loss": 1.0939, + "step": 3111 + }, + { + "epoch": 0.3, + "grad_norm": 0.259017588096064, + "learning_rate": 0.00019470137881655712, + "loss": 1.0809, + "step": 3112 + }, + { + "epoch": 0.3, + "grad_norm": 0.26998271747435276, + "learning_rate": 0.00019469629652336232, + "loss": 1.0425, + "step": 3113 + }, + { + "epoch": 0.3, + "grad_norm": 0.25718343306878455, + "learning_rate": 0.0001946912118603332, + "loss": 1.1163, + "step": 3114 + }, + { + "epoch": 0.3, + "grad_norm": 0.2568678014483129, + "learning_rate": 0.00019468612482759695, + "loss": 0.9441, + "step": 3115 + }, + { + "epoch": 0.3, + "grad_norm": 0.2765085464241682, + "learning_rate": 0.00019468103542528094, + "loss": 1.0876, + "step": 3116 + }, + { + "epoch": 0.3, + "grad_norm": 0.23905715647271397, + "learning_rate": 0.0001946759436535125, + "loss": 1.1495, + "step": 3117 + }, + { + "epoch": 0.3, + "grad_norm": 0.23414722392686665, + "learning_rate": 0.00019467084951241907, + "loss": 1.0045, + "step": 3118 + }, + { + "epoch": 0.3, + "grad_norm": 0.2825675728276391, + "learning_rate": 0.00019466575300212816, + "loss": 1.0469, + "step": 3119 + }, + { + "epoch": 0.3, + "grad_norm": 0.27456256708811555, + "learning_rate": 0.00019466065412276727, + "loss": 0.952, + "step": 3120 + }, + { + "epoch": 0.3, + "grad_norm": 0.23902984443073147, + "learning_rate": 0.00019465555287446402, + "loss": 1.1261, + "step": 3121 + }, + { + "epoch": 0.3, + "grad_norm": 0.2860878827274362, + "learning_rate": 0.00019465044925734605, + "loss": 0.9592, + "step": 3122 + }, + { + "epoch": 0.3, + "grad_norm": 0.27511877405249713, + "learning_rate": 0.00019464534327154112, + "loss": 1.0913, + "step": 3123 + }, + { + "epoch": 0.3, + "grad_norm": 0.25798267940905334, + "learning_rate": 0.000194640234917177, + "loss": 1.0157, + "step": 3124 + }, + { + "epoch": 0.3, + "grad_norm": 0.284100415728136, + "learning_rate": 0.00019463512419438153, + "loss": 1.1027, + "step": 3125 + }, + { + "epoch": 0.3, + "grad_norm": 0.2382259658744238, + "learning_rate": 0.00019463001110328257, + "loss": 1.1828, + "step": 3126 + }, + { + "epoch": 0.3, + "grad_norm": 0.31222857704200785, + "learning_rate": 0.0001946248956440081, + "loss": 1.1124, + "step": 3127 + }, + { + "epoch": 0.3, + "grad_norm": 0.2625066692419136, + "learning_rate": 0.00019461977781668618, + "loss": 1.0737, + "step": 3128 + }, + { + "epoch": 0.3, + "grad_norm": 0.2793854323707662, + "learning_rate": 0.00019461465762144487, + "loss": 1.1363, + "step": 3129 + }, + { + "epoch": 0.3, + "grad_norm": 0.27108476180470265, + "learning_rate": 0.00019460953505841223, + "loss": 1.1485, + "step": 3130 + }, + { + "epoch": 0.3, + "grad_norm": 0.26903383341011894, + "learning_rate": 0.0001946044101277166, + "loss": 1.0214, + "step": 3131 + }, + { + "epoch": 0.3, + "grad_norm": 0.30317173630025673, + "learning_rate": 0.00019459928282948607, + "loss": 1.0941, + "step": 3132 + }, + { + "epoch": 0.3, + "grad_norm": 0.2833482336806812, + "learning_rate": 0.00019459415316384906, + "loss": 1.1549, + "step": 3133 + }, + { + "epoch": 0.3, + "grad_norm": 0.2652521067523786, + "learning_rate": 0.00019458902113093395, + "loss": 1.0997, + "step": 3134 + }, + { + "epoch": 0.3, + "grad_norm": 0.2796165666023849, + "learning_rate": 0.0001945838867308691, + "loss": 1.1974, + "step": 3135 + }, + { + "epoch": 0.3, + "grad_norm": 0.2795419726491581, + "learning_rate": 0.00019457874996378304, + "loss": 1.0421, + "step": 3136 + }, + { + "epoch": 0.3, + "grad_norm": 0.27461888378734867, + "learning_rate": 0.00019457361082980432, + "loss": 1.0375, + "step": 3137 + }, + { + "epoch": 0.3, + "grad_norm": 0.2751564840738016, + "learning_rate": 0.00019456846932906156, + "loss": 0.9755, + "step": 3138 + }, + { + "epoch": 0.3, + "grad_norm": 0.27520201040938214, + "learning_rate": 0.00019456332546168343, + "loss": 0.9982, + "step": 3139 + }, + { + "epoch": 0.3, + "grad_norm": 0.28743296092030146, + "learning_rate": 0.00019455817922779868, + "loss": 0.8786, + "step": 3140 + }, + { + "epoch": 0.3, + "grad_norm": 0.2791638788893473, + "learning_rate": 0.000194553030627536, + "loss": 1.1424, + "step": 3141 + }, + { + "epoch": 0.3, + "grad_norm": 0.2605528838298395, + "learning_rate": 0.00019454787966102435, + "loss": 1.0785, + "step": 3142 + }, + { + "epoch": 0.3, + "grad_norm": 0.2652776678480821, + "learning_rate": 0.00019454272632839255, + "loss": 1.0047, + "step": 3143 + }, + { + "epoch": 0.3, + "grad_norm": 0.2770395767937961, + "learning_rate": 0.00019453757062976964, + "loss": 1.1224, + "step": 3144 + }, + { + "epoch": 0.3, + "grad_norm": 0.2774545055078794, + "learning_rate": 0.00019453241256528462, + "loss": 1.0218, + "step": 3145 + }, + { + "epoch": 0.3, + "grad_norm": 0.29292827749120715, + "learning_rate": 0.00019452725213506654, + "loss": 1.1559, + "step": 3146 + }, + { + "epoch": 0.3, + "grad_norm": 0.2769766053696637, + "learning_rate": 0.00019452208933924459, + "loss": 1.0685, + "step": 3147 + }, + { + "epoch": 0.3, + "grad_norm": 0.24168206829639038, + "learning_rate": 0.00019451692417794792, + "loss": 1.092, + "step": 3148 + }, + { + "epoch": 0.3, + "grad_norm": 0.3243097289376712, + "learning_rate": 0.00019451175665130584, + "loss": 1.1109, + "step": 3149 + }, + { + "epoch": 0.3, + "grad_norm": 0.2851118834219872, + "learning_rate": 0.00019450658675944764, + "loss": 1.0859, + "step": 3150 + }, + { + "epoch": 0.3, + "grad_norm": 0.26646697928523183, + "learning_rate": 0.00019450141450250272, + "loss": 1.092, + "step": 3151 + }, + { + "epoch": 0.3, + "grad_norm": 0.31149135380894666, + "learning_rate": 0.0001944962398806005, + "loss": 1.0533, + "step": 3152 + }, + { + "epoch": 0.3, + "grad_norm": 0.31641860365957436, + "learning_rate": 0.00019449106289387048, + "loss": 1.0906, + "step": 3153 + }, + { + "epoch": 0.3, + "grad_norm": 0.2861114243921771, + "learning_rate": 0.00019448588354244227, + "loss": 1.1436, + "step": 3154 + }, + { + "epoch": 0.3, + "grad_norm": 0.2678359187873902, + "learning_rate": 0.0001944807018264454, + "loss": 1.0751, + "step": 3155 + }, + { + "epoch": 0.3, + "grad_norm": 0.2795381844318716, + "learning_rate": 0.00019447551774600958, + "loss": 1.0243, + "step": 3156 + }, + { + "epoch": 0.3, + "grad_norm": 0.2646096339194622, + "learning_rate": 0.00019447033130126458, + "loss": 1.0279, + "step": 3157 + }, + { + "epoch": 0.3, + "grad_norm": 0.295589829682441, + "learning_rate": 0.00019446514249234017, + "loss": 1.0735, + "step": 3158 + }, + { + "epoch": 0.3, + "grad_norm": 0.2980645819323912, + "learning_rate": 0.0001944599513193662, + "loss": 1.145, + "step": 3159 + }, + { + "epoch": 0.3, + "grad_norm": 0.3115102825868531, + "learning_rate": 0.00019445475778247256, + "loss": 1.1983, + "step": 3160 + }, + { + "epoch": 0.3, + "grad_norm": 0.2813320240238785, + "learning_rate": 0.00019444956188178927, + "loss": 1.1677, + "step": 3161 + }, + { + "epoch": 0.3, + "grad_norm": 0.23562367846120347, + "learning_rate": 0.00019444436361744632, + "loss": 1.1973, + "step": 3162 + }, + { + "epoch": 0.3, + "grad_norm": 0.2928636477767414, + "learning_rate": 0.0001944391629895738, + "loss": 1.1904, + "step": 3163 + }, + { + "epoch": 0.3, + "grad_norm": 0.2938628446043182, + "learning_rate": 0.0001944339599983019, + "loss": 1.1338, + "step": 3164 + }, + { + "epoch": 0.3, + "grad_norm": 0.2698493025290344, + "learning_rate": 0.00019442875464376077, + "loss": 1.085, + "step": 3165 + }, + { + "epoch": 0.3, + "grad_norm": 0.30254516206760285, + "learning_rate": 0.00019442354692608075, + "loss": 1.21, + "step": 3166 + }, + { + "epoch": 0.3, + "grad_norm": 0.26210762133120497, + "learning_rate": 0.0001944183368453921, + "loss": 1.0347, + "step": 3167 + }, + { + "epoch": 0.3, + "grad_norm": 0.2307520308777141, + "learning_rate": 0.00019441312440182524, + "loss": 1.0734, + "step": 3168 + }, + { + "epoch": 0.3, + "grad_norm": 0.256456644213979, + "learning_rate": 0.0001944079095955106, + "loss": 1.1074, + "step": 3169 + }, + { + "epoch": 0.3, + "grad_norm": 0.27238535013463366, + "learning_rate": 0.00019440269242657868, + "loss": 1.1605, + "step": 3170 + }, + { + "epoch": 0.3, + "grad_norm": 0.27536976098604626, + "learning_rate": 0.00019439747289516009, + "loss": 1.2405, + "step": 3171 + }, + { + "epoch": 0.3, + "grad_norm": 0.2537359030538808, + "learning_rate": 0.00019439225100138536, + "loss": 1.0907, + "step": 3172 + }, + { + "epoch": 0.3, + "grad_norm": 0.27291761429286016, + "learning_rate": 0.00019438702674538525, + "loss": 1.0893, + "step": 3173 + }, + { + "epoch": 0.3, + "grad_norm": 0.2650287732828565, + "learning_rate": 0.00019438180012729047, + "loss": 1.0594, + "step": 3174 + }, + { + "epoch": 0.3, + "grad_norm": 0.2566176933125399, + "learning_rate": 0.00019437657114723184, + "loss": 1.0371, + "step": 3175 + }, + { + "epoch": 0.3, + "grad_norm": 0.27936305636263997, + "learning_rate": 0.0001943713398053402, + "loss": 1.1312, + "step": 3176 + }, + { + "epoch": 0.3, + "grad_norm": 0.29167361844632245, + "learning_rate": 0.00019436610610174646, + "loss": 1.116, + "step": 3177 + }, + { + "epoch": 0.3, + "grad_norm": 0.28843098771577746, + "learning_rate": 0.00019436087003658163, + "loss": 1.0541, + "step": 3178 + }, + { + "epoch": 0.3, + "grad_norm": 0.32516725974523086, + "learning_rate": 0.0001943556316099767, + "loss": 1.1834, + "step": 3179 + }, + { + "epoch": 0.3, + "grad_norm": 0.2541028278026109, + "learning_rate": 0.0001943503908220628, + "loss": 1.1216, + "step": 3180 + }, + { + "epoch": 0.3, + "grad_norm": 0.2834034653405795, + "learning_rate": 0.00019434514767297108, + "loss": 1.2544, + "step": 3181 + }, + { + "epoch": 0.3, + "grad_norm": 0.30931710915213423, + "learning_rate": 0.00019433990216283274, + "loss": 1.0865, + "step": 3182 + }, + { + "epoch": 0.3, + "grad_norm": 0.2777726505717253, + "learning_rate": 0.00019433465429177904, + "loss": 1.0133, + "step": 3183 + }, + { + "epoch": 0.3, + "grad_norm": 0.2893309338829881, + "learning_rate": 0.00019432940405994135, + "loss": 1.1005, + "step": 3184 + }, + { + "epoch": 0.3, + "grad_norm": 0.2718000624288272, + "learning_rate": 0.00019432415146745103, + "loss": 0.9954, + "step": 3185 + }, + { + "epoch": 0.3, + "grad_norm": 0.2963971328585791, + "learning_rate": 0.00019431889651443953, + "loss": 1.0576, + "step": 3186 + }, + { + "epoch": 0.3, + "grad_norm": 0.2767798358909039, + "learning_rate": 0.00019431363920103837, + "loss": 1.1268, + "step": 3187 + }, + { + "epoch": 0.3, + "grad_norm": 0.31700853184697214, + "learning_rate": 0.00019430837952737914, + "loss": 1.061, + "step": 3188 + }, + { + "epoch": 0.31, + "grad_norm": 0.25763423615478015, + "learning_rate": 0.0001943031174935934, + "loss": 1.0927, + "step": 3189 + }, + { + "epoch": 0.31, + "grad_norm": 0.2610139408028157, + "learning_rate": 0.00019429785309981292, + "loss": 1.0666, + "step": 3190 + }, + { + "epoch": 0.31, + "grad_norm": 0.2884216831748455, + "learning_rate": 0.00019429258634616941, + "loss": 1.064, + "step": 3191 + }, + { + "epoch": 0.31, + "grad_norm": 0.2502879838444048, + "learning_rate": 0.00019428731723279463, + "loss": 1.1431, + "step": 3192 + }, + { + "epoch": 0.31, + "grad_norm": 0.25880669047725313, + "learning_rate": 0.0001942820457598205, + "loss": 1.0648, + "step": 3193 + }, + { + "epoch": 0.31, + "grad_norm": 0.251763418883559, + "learning_rate": 0.0001942767719273789, + "loss": 1.095, + "step": 3194 + }, + { + "epoch": 0.31, + "grad_norm": 0.28454723290611905, + "learning_rate": 0.00019427149573560183, + "loss": 1.0639, + "step": 3195 + }, + { + "epoch": 0.31, + "grad_norm": 0.2740113209524548, + "learning_rate": 0.00019426621718462137, + "loss": 1.0383, + "step": 3196 + }, + { + "epoch": 0.31, + "grad_norm": 0.2894651427378075, + "learning_rate": 0.00019426093627456954, + "loss": 1.0393, + "step": 3197 + }, + { + "epoch": 0.31, + "grad_norm": 0.272356851821048, + "learning_rate": 0.00019425565300557857, + "loss": 1.0492, + "step": 3198 + }, + { + "epoch": 0.31, + "grad_norm": 0.2834745514772499, + "learning_rate": 0.00019425036737778063, + "loss": 1.1115, + "step": 3199 + }, + { + "epoch": 0.31, + "grad_norm": 0.290735855354118, + "learning_rate": 0.00019424507939130802, + "loss": 1.1519, + "step": 3200 + }, + { + "epoch": 0.31, + "grad_norm": 0.26309247215034526, + "learning_rate": 0.00019423978904629303, + "loss": 1.1589, + "step": 3201 + }, + { + "epoch": 0.31, + "grad_norm": 0.3034788416160877, + "learning_rate": 0.00019423449634286812, + "loss": 1.0927, + "step": 3202 + }, + { + "epoch": 0.31, + "grad_norm": 0.24953378202378695, + "learning_rate": 0.00019422920128116573, + "loss": 1.0734, + "step": 3203 + }, + { + "epoch": 0.31, + "grad_norm": 0.30391403166399206, + "learning_rate": 0.00019422390386131835, + "loss": 1.1223, + "step": 3204 + }, + { + "epoch": 0.31, + "grad_norm": 0.26768113646614333, + "learning_rate": 0.00019421860408345856, + "loss": 1.074, + "step": 3205 + }, + { + "epoch": 0.31, + "grad_norm": 0.26968925174933783, + "learning_rate": 0.000194213301947719, + "loss": 1.1126, + "step": 3206 + }, + { + "epoch": 0.31, + "grad_norm": 0.25338870451527706, + "learning_rate": 0.0001942079974542323, + "loss": 1.2239, + "step": 3207 + }, + { + "epoch": 0.31, + "grad_norm": 0.27878381417056575, + "learning_rate": 0.0001942026906031313, + "loss": 1.184, + "step": 3208 + }, + { + "epoch": 0.31, + "grad_norm": 0.25893716244148623, + "learning_rate": 0.00019419738139454874, + "loss": 1.1045, + "step": 3209 + }, + { + "epoch": 0.31, + "grad_norm": 0.2657479758737288, + "learning_rate": 0.0001941920698286175, + "loss": 1.0898, + "step": 3210 + }, + { + "epoch": 0.31, + "grad_norm": 0.274528853814968, + "learning_rate": 0.00019418675590547054, + "loss": 1.2649, + "step": 3211 + }, + { + "epoch": 0.31, + "grad_norm": 0.28015720519786214, + "learning_rate": 0.00019418143962524084, + "loss": 1.1167, + "step": 3212 + }, + { + "epoch": 0.31, + "grad_norm": 0.26974290585106064, + "learning_rate": 0.00019417612098806137, + "loss": 1.0439, + "step": 3213 + }, + { + "epoch": 0.31, + "grad_norm": 0.26730225585717254, + "learning_rate": 0.00019417079999406532, + "loss": 1.1091, + "step": 3214 + }, + { + "epoch": 0.31, + "grad_norm": 0.24578940286887485, + "learning_rate": 0.0001941654766433858, + "loss": 1.1115, + "step": 3215 + }, + { + "epoch": 0.31, + "grad_norm": 0.24916795650749093, + "learning_rate": 0.00019416015093615604, + "loss": 0.9763, + "step": 3216 + }, + { + "epoch": 0.31, + "grad_norm": 0.3032785764886797, + "learning_rate": 0.00019415482287250935, + "loss": 1.0748, + "step": 3217 + }, + { + "epoch": 0.31, + "grad_norm": 0.2840877489578721, + "learning_rate": 0.00019414949245257903, + "loss": 1.0943, + "step": 3218 + }, + { + "epoch": 0.31, + "grad_norm": 0.2721658275584308, + "learning_rate": 0.0001941441596764985, + "loss": 1.0798, + "step": 3219 + }, + { + "epoch": 0.31, + "grad_norm": 0.27711342122729105, + "learning_rate": 0.00019413882454440118, + "loss": 1.0857, + "step": 3220 + }, + { + "epoch": 0.31, + "grad_norm": 0.264240156198682, + "learning_rate": 0.00019413348705642065, + "loss": 1.1476, + "step": 3221 + }, + { + "epoch": 0.31, + "grad_norm": 0.2672728525407875, + "learning_rate": 0.00019412814721269042, + "loss": 1.1006, + "step": 3222 + }, + { + "epoch": 0.31, + "grad_norm": 0.2790227898377365, + "learning_rate": 0.00019412280501334418, + "loss": 1.0214, + "step": 3223 + }, + { + "epoch": 0.31, + "grad_norm": 0.26686766116618904, + "learning_rate": 0.00019411746045851553, + "loss": 1.0939, + "step": 3224 + }, + { + "epoch": 0.31, + "grad_norm": 0.26689843862224927, + "learning_rate": 0.00019411211354833832, + "loss": 1.1118, + "step": 3225 + }, + { + "epoch": 0.31, + "grad_norm": 0.2645315881506633, + "learning_rate": 0.00019410676428294633, + "loss": 1.064, + "step": 3226 + }, + { + "epoch": 0.31, + "grad_norm": 0.27750130260616634, + "learning_rate": 0.00019410141266247338, + "loss": 1.1626, + "step": 3227 + }, + { + "epoch": 0.31, + "grad_norm": 0.2739213945232073, + "learning_rate": 0.0001940960586870535, + "loss": 1.0854, + "step": 3228 + }, + { + "epoch": 0.31, + "grad_norm": 0.28287550537217365, + "learning_rate": 0.00019409070235682055, + "loss": 1.0474, + "step": 3229 + }, + { + "epoch": 0.31, + "grad_norm": 0.2731751302387, + "learning_rate": 0.0001940853436719087, + "loss": 0.9977, + "step": 3230 + }, + { + "epoch": 0.31, + "grad_norm": 0.2950771929234802, + "learning_rate": 0.00019407998263245194, + "loss": 1.1031, + "step": 3231 + }, + { + "epoch": 0.31, + "grad_norm": 0.274205168506372, + "learning_rate": 0.0001940746192385845, + "loss": 1.0148, + "step": 3232 + }, + { + "epoch": 0.31, + "grad_norm": 0.24694847878806808, + "learning_rate": 0.0001940692534904406, + "loss": 1.0189, + "step": 3233 + }, + { + "epoch": 0.31, + "grad_norm": 0.2884458695976643, + "learning_rate": 0.00019406388538815454, + "loss": 1.0534, + "step": 3234 + }, + { + "epoch": 0.31, + "grad_norm": 0.25614325988090403, + "learning_rate": 0.0001940585149318606, + "loss": 1.1938, + "step": 3235 + }, + { + "epoch": 0.31, + "grad_norm": 0.28634252057649984, + "learning_rate": 0.0001940531421216932, + "loss": 1.1215, + "step": 3236 + }, + { + "epoch": 0.31, + "grad_norm": 0.26353357452427006, + "learning_rate": 0.00019404776695778684, + "loss": 1.0671, + "step": 3237 + }, + { + "epoch": 0.31, + "grad_norm": 0.252301189841136, + "learning_rate": 0.00019404238944027596, + "loss": 1.0318, + "step": 3238 + }, + { + "epoch": 0.31, + "grad_norm": 0.2921004823006027, + "learning_rate": 0.0001940370095692952, + "loss": 1.1743, + "step": 3239 + }, + { + "epoch": 0.31, + "grad_norm": 0.3010241530589875, + "learning_rate": 0.0001940316273449792, + "loss": 1.1167, + "step": 3240 + }, + { + "epoch": 0.31, + "grad_norm": 0.26470231728020255, + "learning_rate": 0.00019402624276746263, + "loss": 1.1322, + "step": 3241 + }, + { + "epoch": 0.31, + "grad_norm": 0.26518892722531195, + "learning_rate": 0.00019402085583688022, + "loss": 1.043, + "step": 3242 + }, + { + "epoch": 0.31, + "grad_norm": 0.28690580948021155, + "learning_rate": 0.0001940154665533668, + "loss": 1.0667, + "step": 3243 + }, + { + "epoch": 0.31, + "grad_norm": 0.2810395515843624, + "learning_rate": 0.00019401007491705725, + "loss": 0.9801, + "step": 3244 + }, + { + "epoch": 0.31, + "grad_norm": 0.2572499963401515, + "learning_rate": 0.00019400468092808647, + "loss": 1.159, + "step": 3245 + }, + { + "epoch": 0.31, + "grad_norm": 0.2644492737063085, + "learning_rate": 0.00019399928458658952, + "loss": 1.119, + "step": 3246 + }, + { + "epoch": 0.31, + "grad_norm": 0.2693774572143553, + "learning_rate": 0.00019399388589270134, + "loss": 1.1763, + "step": 3247 + }, + { + "epoch": 0.31, + "grad_norm": 0.3074129041443745, + "learning_rate": 0.00019398848484655714, + "loss": 1.1109, + "step": 3248 + }, + { + "epoch": 0.31, + "grad_norm": 0.22885608780661518, + "learning_rate": 0.00019398308144829202, + "loss": 1.1484, + "step": 3249 + }, + { + "epoch": 0.31, + "grad_norm": 0.2431406554688736, + "learning_rate": 0.0001939776756980412, + "loss": 1.1024, + "step": 3250 + }, + { + "epoch": 0.31, + "grad_norm": 0.2508813769274511, + "learning_rate": 0.00019397226759594003, + "loss": 1.1161, + "step": 3251 + }, + { + "epoch": 0.31, + "grad_norm": 0.259498174018731, + "learning_rate": 0.00019396685714212378, + "loss": 1.1121, + "step": 3252 + }, + { + "epoch": 0.31, + "grad_norm": 0.25690080144023086, + "learning_rate": 0.00019396144433672787, + "loss": 1.1951, + "step": 3253 + }, + { + "epoch": 0.31, + "grad_norm": 0.2703955152553523, + "learning_rate": 0.00019395602917988774, + "loss": 1.0848, + "step": 3254 + }, + { + "epoch": 0.31, + "grad_norm": 0.3248488191794343, + "learning_rate": 0.00019395061167173895, + "loss": 1.1507, + "step": 3255 + }, + { + "epoch": 0.31, + "grad_norm": 0.28991024777070024, + "learning_rate": 0.00019394519181241705, + "loss": 1.1015, + "step": 3256 + }, + { + "epoch": 0.31, + "grad_norm": 0.2764459982278797, + "learning_rate": 0.00019393976960205772, + "loss": 0.9972, + "step": 3257 + }, + { + "epoch": 0.31, + "grad_norm": 0.26862234853504874, + "learning_rate": 0.00019393434504079657, + "loss": 1.1453, + "step": 3258 + }, + { + "epoch": 0.31, + "grad_norm": 0.27971969886015396, + "learning_rate": 0.00019392891812876944, + "loss": 1.0825, + "step": 3259 + }, + { + "epoch": 0.31, + "grad_norm": 0.2770468588755897, + "learning_rate": 0.00019392348886611207, + "loss": 1.0536, + "step": 3260 + }, + { + "epoch": 0.31, + "grad_norm": 0.271609510880968, + "learning_rate": 0.00019391805725296038, + "loss": 1.0481, + "step": 3261 + }, + { + "epoch": 0.31, + "grad_norm": 0.24387465904724714, + "learning_rate": 0.00019391262328945027, + "loss": 1.0953, + "step": 3262 + }, + { + "epoch": 0.31, + "grad_norm": 0.306543464049848, + "learning_rate": 0.00019390718697571776, + "loss": 1.1486, + "step": 3263 + }, + { + "epoch": 0.31, + "grad_norm": 0.30439534530685314, + "learning_rate": 0.00019390174831189887, + "loss": 1.0251, + "step": 3264 + }, + { + "epoch": 0.31, + "grad_norm": 0.2702567545507524, + "learning_rate": 0.0001938963072981297, + "loss": 1.0219, + "step": 3265 + }, + { + "epoch": 0.31, + "grad_norm": 0.24990979733983004, + "learning_rate": 0.00019389086393454644, + "loss": 0.9841, + "step": 3266 + }, + { + "epoch": 0.31, + "grad_norm": 0.30401365328102387, + "learning_rate": 0.0001938854182212853, + "loss": 1.0608, + "step": 3267 + }, + { + "epoch": 0.31, + "grad_norm": 0.3051517353842027, + "learning_rate": 0.00019387997015848254, + "loss": 1.0624, + "step": 3268 + }, + { + "epoch": 0.31, + "grad_norm": 0.28665306616118164, + "learning_rate": 0.00019387451974627455, + "loss": 1.0742, + "step": 3269 + }, + { + "epoch": 0.31, + "grad_norm": 0.3231516980435964, + "learning_rate": 0.0001938690669847977, + "loss": 1.1872, + "step": 3270 + }, + { + "epoch": 0.31, + "grad_norm": 0.2956462625808931, + "learning_rate": 0.00019386361187418848, + "loss": 1.1729, + "step": 3271 + }, + { + "epoch": 0.31, + "grad_norm": 0.2657542153528369, + "learning_rate": 0.00019385815441458335, + "loss": 1.0359, + "step": 3272 + }, + { + "epoch": 0.31, + "grad_norm": 0.3012539639681833, + "learning_rate": 0.0001938526946061189, + "loss": 1.224, + "step": 3273 + }, + { + "epoch": 0.31, + "grad_norm": 0.23858077659378998, + "learning_rate": 0.00019384723244893182, + "loss": 0.9866, + "step": 3274 + }, + { + "epoch": 0.31, + "grad_norm": 0.28237967640012435, + "learning_rate": 0.00019384176794315876, + "loss": 1.1095, + "step": 3275 + }, + { + "epoch": 0.31, + "grad_norm": 0.2599923150922761, + "learning_rate": 0.0001938363010889365, + "loss": 1.1204, + "step": 3276 + }, + { + "epoch": 0.31, + "grad_norm": 0.2869091154710219, + "learning_rate": 0.00019383083188640178, + "loss": 1.0668, + "step": 3277 + }, + { + "epoch": 0.31, + "grad_norm": 0.31710794291071975, + "learning_rate": 0.00019382536033569155, + "loss": 1.0612, + "step": 3278 + }, + { + "epoch": 0.31, + "grad_norm": 0.29559893780311114, + "learning_rate": 0.0001938198864369427, + "loss": 1.077, + "step": 3279 + }, + { + "epoch": 0.31, + "grad_norm": 0.2898590799989514, + "learning_rate": 0.00019381441019029224, + "loss": 0.9822, + "step": 3280 + }, + { + "epoch": 0.31, + "grad_norm": 0.2653384099576511, + "learning_rate": 0.00019380893159587722, + "loss": 1.0328, + "step": 3281 + }, + { + "epoch": 0.31, + "grad_norm": 0.3059551368813422, + "learning_rate": 0.00019380345065383468, + "loss": 1.1349, + "step": 3282 + }, + { + "epoch": 0.31, + "grad_norm": 0.24886086837418994, + "learning_rate": 0.0001937979673643019, + "loss": 1.068, + "step": 3283 + }, + { + "epoch": 0.31, + "grad_norm": 0.23919413973427292, + "learning_rate": 0.000193792481727416, + "loss": 1.1181, + "step": 3284 + }, + { + "epoch": 0.31, + "grad_norm": 0.25404095588714376, + "learning_rate": 0.0001937869937433143, + "loss": 1.045, + "step": 3285 + }, + { + "epoch": 0.31, + "grad_norm": 0.223637661799907, + "learning_rate": 0.00019378150341213416, + "loss": 1.0104, + "step": 3286 + }, + { + "epoch": 0.31, + "grad_norm": 0.2779252243873626, + "learning_rate": 0.00019377601073401293, + "loss": 1.1353, + "step": 3287 + }, + { + "epoch": 0.31, + "grad_norm": 0.27508596502440386, + "learning_rate": 0.0001937705157090881, + "loss": 1.0028, + "step": 3288 + }, + { + "epoch": 0.31, + "grad_norm": 0.2829961966551457, + "learning_rate": 0.0001937650183374972, + "loss": 1.1252, + "step": 3289 + }, + { + "epoch": 0.31, + "grad_norm": 0.2547718896278451, + "learning_rate": 0.00019375951861937775, + "loss": 1.1001, + "step": 3290 + }, + { + "epoch": 0.31, + "grad_norm": 0.2386176095295127, + "learning_rate": 0.00019375401655486745, + "loss": 0.9711, + "step": 3291 + }, + { + "epoch": 0.31, + "grad_norm": 0.26234578344158516, + "learning_rate": 0.00019374851214410397, + "loss": 1.0339, + "step": 3292 + }, + { + "epoch": 0.32, + "grad_norm": 0.2919799926949464, + "learning_rate": 0.00019374300538722503, + "loss": 1.0436, + "step": 3293 + }, + { + "epoch": 0.32, + "grad_norm": 0.26522986184686614, + "learning_rate": 0.00019373749628436848, + "loss": 1.1032, + "step": 3294 + }, + { + "epoch": 0.32, + "grad_norm": 0.27747724141686647, + "learning_rate": 0.00019373198483567215, + "loss": 1.1199, + "step": 3295 + }, + { + "epoch": 0.32, + "grad_norm": 0.251558053545879, + "learning_rate": 0.00019372647104127401, + "loss": 1.0908, + "step": 3296 + }, + { + "epoch": 0.32, + "grad_norm": 0.2866203569622034, + "learning_rate": 0.00019372095490131206, + "loss": 1.1136, + "step": 3297 + }, + { + "epoch": 0.32, + "grad_norm": 0.26519901166294035, + "learning_rate": 0.00019371543641592427, + "loss": 1.0744, + "step": 3298 + }, + { + "epoch": 0.32, + "grad_norm": 0.2670922090903857, + "learning_rate": 0.0001937099155852488, + "loss": 1.0004, + "step": 3299 + }, + { + "epoch": 0.32, + "grad_norm": 0.2582252671213613, + "learning_rate": 0.0001937043924094238, + "loss": 1.0158, + "step": 3300 + }, + { + "epoch": 0.32, + "grad_norm": 0.2780671456060699, + "learning_rate": 0.00019369886688858746, + "loss": 1.0033, + "step": 3301 + }, + { + "epoch": 0.32, + "grad_norm": 0.26115496930707605, + "learning_rate": 0.00019369333902287812, + "loss": 1.1021, + "step": 3302 + }, + { + "epoch": 0.32, + "grad_norm": 0.2806226279994436, + "learning_rate": 0.00019368780881243408, + "loss": 1.0112, + "step": 3303 + }, + { + "epoch": 0.32, + "grad_norm": 0.2544906876251208, + "learning_rate": 0.00019368227625739376, + "loss": 1.1054, + "step": 3304 + }, + { + "epoch": 0.32, + "grad_norm": 0.2856721128498456, + "learning_rate": 0.00019367674135789559, + "loss": 1.1403, + "step": 3305 + }, + { + "epoch": 0.32, + "grad_norm": 0.28275774405141135, + "learning_rate": 0.00019367120411407807, + "loss": 1.1926, + "step": 3306 + }, + { + "epoch": 0.32, + "grad_norm": 0.30945016209372866, + "learning_rate": 0.00019366566452607984, + "loss": 1.0632, + "step": 3307 + }, + { + "epoch": 0.32, + "grad_norm": 0.28757906013236484, + "learning_rate": 0.00019366012259403945, + "loss": 1.0334, + "step": 3308 + }, + { + "epoch": 0.32, + "grad_norm": 0.28623289799348706, + "learning_rate": 0.00019365457831809564, + "loss": 1.0065, + "step": 3309 + }, + { + "epoch": 0.32, + "grad_norm": 0.31283779658466215, + "learning_rate": 0.00019364903169838714, + "loss": 1.1444, + "step": 3310 + }, + { + "epoch": 0.32, + "grad_norm": 0.25353872879290523, + "learning_rate": 0.0001936434827350528, + "loss": 1.1551, + "step": 3311 + }, + { + "epoch": 0.32, + "grad_norm": 0.28420385173740026, + "learning_rate": 0.00019363793142823142, + "loss": 1.0866, + "step": 3312 + }, + { + "epoch": 0.32, + "grad_norm": 0.2756468967799023, + "learning_rate": 0.00019363237777806193, + "loss": 1.1621, + "step": 3313 + }, + { + "epoch": 0.32, + "grad_norm": 0.2725051208626741, + "learning_rate": 0.0001936268217846834, + "loss": 1.0359, + "step": 3314 + }, + { + "epoch": 0.32, + "grad_norm": 0.2785478077945499, + "learning_rate": 0.0001936212634482348, + "loss": 1.1558, + "step": 3315 + }, + { + "epoch": 0.32, + "grad_norm": 0.2506394562927538, + "learning_rate": 0.00019361570276885522, + "loss": 1.1897, + "step": 3316 + }, + { + "epoch": 0.32, + "grad_norm": 0.2606058706252523, + "learning_rate": 0.00019361013974668385, + "loss": 1.1776, + "step": 3317 + }, + { + "epoch": 0.32, + "grad_norm": 0.2496048710743572, + "learning_rate": 0.0001936045743818599, + "loss": 0.9721, + "step": 3318 + }, + { + "epoch": 0.32, + "grad_norm": 0.26993436748357164, + "learning_rate": 0.00019359900667452264, + "loss": 1.104, + "step": 3319 + }, + { + "epoch": 0.32, + "grad_norm": 0.255141548047229, + "learning_rate": 0.0001935934366248114, + "loss": 1.0802, + "step": 3320 + }, + { + "epoch": 0.32, + "grad_norm": 0.27647316322378807, + "learning_rate": 0.00019358786423286564, + "loss": 1.0241, + "step": 3321 + }, + { + "epoch": 0.32, + "grad_norm": 0.2923601566113196, + "learning_rate": 0.00019358228949882474, + "loss": 1.0406, + "step": 3322 + }, + { + "epoch": 0.32, + "grad_norm": 0.24263042595969897, + "learning_rate": 0.00019357671242282821, + "loss": 0.9864, + "step": 3323 + }, + { + "epoch": 0.32, + "grad_norm": 0.27774846850692353, + "learning_rate": 0.00019357113300501566, + "loss": 1.0937, + "step": 3324 + }, + { + "epoch": 0.32, + "grad_norm": 0.27923854319931557, + "learning_rate": 0.0001935655512455267, + "loss": 1.0601, + "step": 3325 + }, + { + "epoch": 0.32, + "grad_norm": 0.3061443475173794, + "learning_rate": 0.000193559967144501, + "loss": 1.2078, + "step": 3326 + }, + { + "epoch": 0.32, + "grad_norm": 0.2846307312850083, + "learning_rate": 0.00019355438070207834, + "loss": 1.0301, + "step": 3327 + }, + { + "epoch": 0.32, + "grad_norm": 0.27792518753199547, + "learning_rate": 0.0001935487919183985, + "loss": 1.0656, + "step": 3328 + }, + { + "epoch": 0.32, + "grad_norm": 0.2845450991292036, + "learning_rate": 0.00019354320079360132, + "loss": 1.0507, + "step": 3329 + }, + { + "epoch": 0.32, + "grad_norm": 0.31509060671206557, + "learning_rate": 0.0001935376073278268, + "loss": 1.1854, + "step": 3330 + }, + { + "epoch": 0.32, + "grad_norm": 0.27538788262531033, + "learning_rate": 0.00019353201152121484, + "loss": 1.0761, + "step": 3331 + }, + { + "epoch": 0.32, + "grad_norm": 0.3085113970962017, + "learning_rate": 0.00019352641337390552, + "loss": 1.0149, + "step": 3332 + }, + { + "epoch": 0.32, + "grad_norm": 0.2649187417930221, + "learning_rate": 0.00019352081288603895, + "loss": 0.9846, + "step": 3333 + }, + { + "epoch": 0.32, + "grad_norm": 0.2702951480669255, + "learning_rate": 0.0001935152100577552, + "loss": 1.1289, + "step": 3334 + }, + { + "epoch": 0.32, + "grad_norm": 0.2587123130961769, + "learning_rate": 0.00019350960488919458, + "loss": 1.1603, + "step": 3335 + }, + { + "epoch": 0.32, + "grad_norm": 0.26093692625675025, + "learning_rate": 0.00019350399738049735, + "loss": 1.189, + "step": 3336 + }, + { + "epoch": 0.32, + "grad_norm": 0.2571900526712086, + "learning_rate": 0.0001934983875318038, + "loss": 1.1219, + "step": 3337 + }, + { + "epoch": 0.32, + "grad_norm": 0.2823069020729405, + "learning_rate": 0.0001934927753432543, + "loss": 1.0871, + "step": 3338 + }, + { + "epoch": 0.32, + "grad_norm": 0.29397830877053815, + "learning_rate": 0.00019348716081498942, + "loss": 1.0944, + "step": 3339 + }, + { + "epoch": 0.32, + "grad_norm": 0.29592501153858697, + "learning_rate": 0.00019348154394714952, + "loss": 1.0847, + "step": 3340 + }, + { + "epoch": 0.32, + "grad_norm": 0.27462052813299775, + "learning_rate": 0.00019347592473987528, + "loss": 1.1752, + "step": 3341 + }, + { + "epoch": 0.32, + "grad_norm": 0.24616995149088777, + "learning_rate": 0.00019347030319330727, + "loss": 1.1025, + "step": 3342 + }, + { + "epoch": 0.32, + "grad_norm": 0.27129435980716926, + "learning_rate": 0.00019346467930758614, + "loss": 1.1367, + "step": 3343 + }, + { + "epoch": 0.32, + "grad_norm": 0.27336188709510006, + "learning_rate": 0.0001934590530828527, + "loss": 1.1193, + "step": 3344 + }, + { + "epoch": 0.32, + "grad_norm": 0.27826604163520996, + "learning_rate": 0.0001934534245192477, + "loss": 1.1751, + "step": 3345 + }, + { + "epoch": 0.32, + "grad_norm": 0.2998027219687268, + "learning_rate": 0.00019344779361691203, + "loss": 1.1368, + "step": 3346 + }, + { + "epoch": 0.32, + "grad_norm": 0.28056799541807836, + "learning_rate": 0.0001934421603759866, + "loss": 1.0334, + "step": 3347 + }, + { + "epoch": 0.32, + "grad_norm": 0.24723826046732345, + "learning_rate": 0.00019343652479661237, + "loss": 1.0881, + "step": 3348 + }, + { + "epoch": 0.32, + "grad_norm": 0.3324987905537479, + "learning_rate": 0.00019343088687893037, + "loss": 1.0454, + "step": 3349 + }, + { + "epoch": 0.32, + "grad_norm": 0.2612587061661008, + "learning_rate": 0.00019342524662308174, + "loss": 1.1285, + "step": 3350 + }, + { + "epoch": 0.32, + "grad_norm": 0.3175396328178754, + "learning_rate": 0.0001934196040292076, + "loss": 1.1002, + "step": 3351 + }, + { + "epoch": 0.32, + "grad_norm": 0.2982419959527945, + "learning_rate": 0.00019341395909744914, + "loss": 1.1169, + "step": 3352 + }, + { + "epoch": 0.32, + "grad_norm": 0.2665630466583381, + "learning_rate": 0.00019340831182794763, + "loss": 1.0946, + "step": 3353 + }, + { + "epoch": 0.32, + "grad_norm": 0.2747715419393994, + "learning_rate": 0.00019340266222084445, + "loss": 1.0806, + "step": 3354 + }, + { + "epoch": 0.32, + "grad_norm": 0.26678051390620816, + "learning_rate": 0.00019339701027628093, + "loss": 1.0541, + "step": 3355 + }, + { + "epoch": 0.32, + "grad_norm": 0.2598902398444097, + "learning_rate": 0.00019339135599439852, + "loss": 1.0699, + "step": 3356 + }, + { + "epoch": 0.32, + "grad_norm": 0.254933790104563, + "learning_rate": 0.00019338569937533872, + "loss": 1.174, + "step": 3357 + }, + { + "epoch": 0.32, + "grad_norm": 0.3128276440616595, + "learning_rate": 0.00019338004041924314, + "loss": 1.1016, + "step": 3358 + }, + { + "epoch": 0.32, + "grad_norm": 0.296430797427781, + "learning_rate": 0.00019337437912625332, + "loss": 1.108, + "step": 3359 + }, + { + "epoch": 0.32, + "grad_norm": 0.2970936872836418, + "learning_rate": 0.00019336871549651102, + "loss": 1.0936, + "step": 3360 + }, + { + "epoch": 0.32, + "grad_norm": 0.30049312775342935, + "learning_rate": 0.0001933630495301579, + "loss": 1.1077, + "step": 3361 + }, + { + "epoch": 0.32, + "grad_norm": 0.25869029149150324, + "learning_rate": 0.0001933573812273358, + "loss": 1.1524, + "step": 3362 + }, + { + "epoch": 0.32, + "grad_norm": 0.25180878439069226, + "learning_rate": 0.00019335171058818657, + "loss": 0.9934, + "step": 3363 + }, + { + "epoch": 0.32, + "grad_norm": 0.2582499973976482, + "learning_rate": 0.0001933460376128521, + "loss": 1.0993, + "step": 3364 + }, + { + "epoch": 0.32, + "grad_norm": 0.27195751975656457, + "learning_rate": 0.0001933403623014744, + "loss": 1.0789, + "step": 3365 + }, + { + "epoch": 0.32, + "grad_norm": 0.2725473211575861, + "learning_rate": 0.00019333468465419545, + "loss": 1.1782, + "step": 3366 + }, + { + "epoch": 0.32, + "grad_norm": 0.2696499000977069, + "learning_rate": 0.00019332900467115735, + "loss": 1.079, + "step": 3367 + }, + { + "epoch": 0.32, + "grad_norm": 0.26468748026222183, + "learning_rate": 0.00019332332235250227, + "loss": 1.0032, + "step": 3368 + }, + { + "epoch": 0.32, + "grad_norm": 0.27136388855246457, + "learning_rate": 0.00019331763769837239, + "loss": 1.1859, + "step": 3369 + }, + { + "epoch": 0.32, + "grad_norm": 0.3147503114706967, + "learning_rate": 0.00019331195070890997, + "loss": 0.9721, + "step": 3370 + }, + { + "epoch": 0.32, + "grad_norm": 0.26623718570346866, + "learning_rate": 0.00019330626138425733, + "loss": 1.1342, + "step": 3371 + }, + { + "epoch": 0.32, + "grad_norm": 0.238352475784514, + "learning_rate": 0.0001933005697245569, + "loss": 1.0439, + "step": 3372 + }, + { + "epoch": 0.32, + "grad_norm": 0.28164020535606377, + "learning_rate": 0.00019329487572995104, + "loss": 1.0531, + "step": 3373 + }, + { + "epoch": 0.32, + "grad_norm": 0.2798281524042237, + "learning_rate": 0.0001932891794005823, + "loss": 1.0947, + "step": 3374 + }, + { + "epoch": 0.32, + "grad_norm": 0.2593240400068639, + "learning_rate": 0.0001932834807365932, + "loss": 1.3053, + "step": 3375 + }, + { + "epoch": 0.32, + "grad_norm": 0.2956714093140169, + "learning_rate": 0.0001932777797381264, + "loss": 1.1146, + "step": 3376 + }, + { + "epoch": 0.32, + "grad_norm": 0.24933340729423376, + "learning_rate": 0.0001932720764053245, + "loss": 1.0169, + "step": 3377 + }, + { + "epoch": 0.32, + "grad_norm": 0.2746804152121392, + "learning_rate": 0.0001932663707383303, + "loss": 1.2153, + "step": 3378 + }, + { + "epoch": 0.32, + "grad_norm": 0.2438039762627365, + "learning_rate": 0.00019326066273728652, + "loss": 1.1174, + "step": 3379 + }, + { + "epoch": 0.32, + "grad_norm": 0.2667417249557865, + "learning_rate": 0.00019325495240233608, + "loss": 1.1442, + "step": 3380 + }, + { + "epoch": 0.32, + "grad_norm": 0.282105093224216, + "learning_rate": 0.0001932492397336218, + "loss": 1.0328, + "step": 3381 + }, + { + "epoch": 0.32, + "grad_norm": 0.2700271902826537, + "learning_rate": 0.00019324352473128675, + "loss": 1.1098, + "step": 3382 + }, + { + "epoch": 0.32, + "grad_norm": 0.2810010796311137, + "learning_rate": 0.00019323780739547382, + "loss": 1.1228, + "step": 3383 + }, + { + "epoch": 0.32, + "grad_norm": 0.27543586143634324, + "learning_rate": 0.00019323208772632623, + "loss": 1.0899, + "step": 3384 + }, + { + "epoch": 0.32, + "grad_norm": 0.28148476530118366, + "learning_rate": 0.00019322636572398705, + "loss": 1.0954, + "step": 3385 + }, + { + "epoch": 0.32, + "grad_norm": 0.24691162930671992, + "learning_rate": 0.00019322064138859943, + "loss": 1.0638, + "step": 3386 + }, + { + "epoch": 0.32, + "grad_norm": 0.2766771037796505, + "learning_rate": 0.0001932149147203067, + "loss": 1.0988, + "step": 3387 + }, + { + "epoch": 0.32, + "grad_norm": 0.27879746565463825, + "learning_rate": 0.00019320918571925214, + "loss": 1.1285, + "step": 3388 + }, + { + "epoch": 0.32, + "grad_norm": 0.22878400180565586, + "learning_rate": 0.00019320345438557913, + "loss": 1.0721, + "step": 3389 + }, + { + "epoch": 0.32, + "grad_norm": 0.28209510083017475, + "learning_rate": 0.0001931977207194311, + "loss": 1.0092, + "step": 3390 + }, + { + "epoch": 0.32, + "grad_norm": 0.2718261063085623, + "learning_rate": 0.00019319198472095154, + "loss": 1.118, + "step": 3391 + }, + { + "epoch": 0.32, + "grad_norm": 0.29842791503503757, + "learning_rate": 0.00019318624639028397, + "loss": 1.0923, + "step": 3392 + }, + { + "epoch": 0.32, + "grad_norm": 0.24518361819708703, + "learning_rate": 0.00019318050572757206, + "loss": 1.0292, + "step": 3393 + }, + { + "epoch": 0.32, + "grad_norm": 0.24792628965630142, + "learning_rate": 0.00019317476273295937, + "loss": 1.0833, + "step": 3394 + }, + { + "epoch": 0.32, + "grad_norm": 0.29078999381758364, + "learning_rate": 0.00019316901740658974, + "loss": 1.1031, + "step": 3395 + }, + { + "epoch": 0.32, + "grad_norm": 0.2567096843477097, + "learning_rate": 0.00019316326974860688, + "loss": 1.0013, + "step": 3396 + }, + { + "epoch": 0.32, + "grad_norm": 0.2554741348697831, + "learning_rate": 0.00019315751975915464, + "loss": 1.0173, + "step": 3397 + }, + { + "epoch": 0.33, + "grad_norm": 0.28242380704250564, + "learning_rate": 0.00019315176743837692, + "loss": 1.1378, + "step": 3398 + }, + { + "epoch": 0.33, + "grad_norm": 0.31001384373991414, + "learning_rate": 0.00019314601278641767, + "loss": 1.1999, + "step": 3399 + }, + { + "epoch": 0.33, + "grad_norm": 0.2833937019206038, + "learning_rate": 0.0001931402558034209, + "loss": 1.1491, + "step": 3400 + }, + { + "epoch": 0.33, + "grad_norm": 0.3026457744535161, + "learning_rate": 0.00019313449648953075, + "loss": 1.2268, + "step": 3401 + }, + { + "epoch": 0.33, + "grad_norm": 0.26414264616917044, + "learning_rate": 0.00019312873484489122, + "loss": 1.0955, + "step": 3402 + }, + { + "epoch": 0.33, + "grad_norm": 0.24532958588409323, + "learning_rate": 0.0001931229708696466, + "loss": 1.0478, + "step": 3403 + }, + { + "epoch": 0.33, + "grad_norm": 0.30331695478379483, + "learning_rate": 0.00019311720456394115, + "loss": 1.0953, + "step": 3404 + }, + { + "epoch": 0.33, + "grad_norm": 0.28074250597379036, + "learning_rate": 0.00019311143592791908, + "loss": 1.1166, + "step": 3405 + }, + { + "epoch": 0.33, + "grad_norm": 0.2409279290445585, + "learning_rate": 0.00019310566496172482, + "loss": 1.1022, + "step": 3406 + }, + { + "epoch": 0.33, + "grad_norm": 0.27337622876374895, + "learning_rate": 0.00019309989166550276, + "loss": 1.0369, + "step": 3407 + }, + { + "epoch": 0.33, + "grad_norm": 0.27239368467190694, + "learning_rate": 0.00019309411603939746, + "loss": 1.0825, + "step": 3408 + }, + { + "epoch": 0.33, + "grad_norm": 0.2510203565258273, + "learning_rate": 0.00019308833808355335, + "loss": 1.1297, + "step": 3409 + }, + { + "epoch": 0.33, + "grad_norm": 0.3019608047256465, + "learning_rate": 0.0001930825577981151, + "loss": 1.0537, + "step": 3410 + }, + { + "epoch": 0.33, + "grad_norm": 0.27782305027611853, + "learning_rate": 0.00019307677518322732, + "loss": 1.0568, + "step": 3411 + }, + { + "epoch": 0.33, + "grad_norm": 0.27292941976306373, + "learning_rate": 0.00019307099023903475, + "loss": 1.1049, + "step": 3412 + }, + { + "epoch": 0.33, + "grad_norm": 0.31644950630512886, + "learning_rate": 0.00019306520296568213, + "loss": 0.9, + "step": 3413 + }, + { + "epoch": 0.33, + "grad_norm": 0.25546779805105374, + "learning_rate": 0.00019305941336331437, + "loss": 1.1242, + "step": 3414 + }, + { + "epoch": 0.33, + "grad_norm": 0.2792316426100012, + "learning_rate": 0.00019305362143207629, + "loss": 1.0101, + "step": 3415 + }, + { + "epoch": 0.33, + "grad_norm": 0.28856119813600223, + "learning_rate": 0.00019304782717211282, + "loss": 1.0683, + "step": 3416 + }, + { + "epoch": 0.33, + "grad_norm": 0.2538864469655074, + "learning_rate": 0.00019304203058356903, + "loss": 1.0736, + "step": 3417 + }, + { + "epoch": 0.33, + "grad_norm": 0.27697511534173397, + "learning_rate": 0.00019303623166658994, + "loss": 1.0237, + "step": 3418 + }, + { + "epoch": 0.33, + "grad_norm": 0.2816777125591762, + "learning_rate": 0.00019303043042132067, + "loss": 0.9735, + "step": 3419 + }, + { + "epoch": 0.33, + "grad_norm": 0.270336650155053, + "learning_rate": 0.00019302462684790643, + "loss": 1.0452, + "step": 3420 + }, + { + "epoch": 0.33, + "grad_norm": 0.3171677773770914, + "learning_rate": 0.00019301882094649244, + "loss": 1.0691, + "step": 3421 + }, + { + "epoch": 0.33, + "grad_norm": 0.30382729652830914, + "learning_rate": 0.00019301301271722397, + "loss": 1.1365, + "step": 3422 + }, + { + "epoch": 0.33, + "grad_norm": 0.2811965071217192, + "learning_rate": 0.00019300720216024642, + "loss": 1.0391, + "step": 3423 + }, + { + "epoch": 0.33, + "grad_norm": 0.2525300639055227, + "learning_rate": 0.00019300138927570517, + "loss": 1.1168, + "step": 3424 + }, + { + "epoch": 0.33, + "grad_norm": 0.2566849695783432, + "learning_rate": 0.00019299557406374574, + "loss": 1.0444, + "step": 3425 + }, + { + "epoch": 0.33, + "grad_norm": 0.29423746734781697, + "learning_rate": 0.00019298975652451357, + "loss": 1.1018, + "step": 3426 + }, + { + "epoch": 0.33, + "grad_norm": 0.27988383125452326, + "learning_rate": 0.00019298393665815434, + "loss": 1.0843, + "step": 3427 + }, + { + "epoch": 0.33, + "grad_norm": 0.2914047802879343, + "learning_rate": 0.00019297811446481364, + "loss": 1.1238, + "step": 3428 + }, + { + "epoch": 0.33, + "grad_norm": 0.26554271237108157, + "learning_rate": 0.0001929722899446372, + "loss": 1.0654, + "step": 3429 + }, + { + "epoch": 0.33, + "grad_norm": 0.2626759072606683, + "learning_rate": 0.00019296646309777078, + "loss": 1.0299, + "step": 3430 + }, + { + "epoch": 0.33, + "grad_norm": 0.24310810286273438, + "learning_rate": 0.00019296063392436016, + "loss": 1.1926, + "step": 3431 + }, + { + "epoch": 0.33, + "grad_norm": 0.2681389299412346, + "learning_rate": 0.0001929548024245513, + "loss": 1.0535, + "step": 3432 + }, + { + "epoch": 0.33, + "grad_norm": 0.2705554100674985, + "learning_rate": 0.00019294896859849007, + "loss": 1.1006, + "step": 3433 + }, + { + "epoch": 0.33, + "grad_norm": 0.29956584728704405, + "learning_rate": 0.00019294313244632246, + "loss": 1.1618, + "step": 3434 + }, + { + "epoch": 0.33, + "grad_norm": 0.25979972995620976, + "learning_rate": 0.00019293729396819455, + "loss": 1.1341, + "step": 3435 + }, + { + "epoch": 0.33, + "grad_norm": 0.31840805453945864, + "learning_rate": 0.0001929314531642525, + "loss": 1.0505, + "step": 3436 + }, + { + "epoch": 0.33, + "grad_norm": 0.30601505649557453, + "learning_rate": 0.0001929256100346424, + "loss": 1.1247, + "step": 3437 + }, + { + "epoch": 0.33, + "grad_norm": 0.255162069191932, + "learning_rate": 0.0001929197645795105, + "loss": 1.0914, + "step": 3438 + }, + { + "epoch": 0.33, + "grad_norm": 0.27220081852340894, + "learning_rate": 0.00019291391679900308, + "loss": 1.1204, + "step": 3439 + }, + { + "epoch": 0.33, + "grad_norm": 0.2680103064629152, + "learning_rate": 0.00019290806669326651, + "loss": 1.1374, + "step": 3440 + }, + { + "epoch": 0.33, + "grad_norm": 0.26907374179829274, + "learning_rate": 0.0001929022142624472, + "loss": 1.104, + "step": 3441 + }, + { + "epoch": 0.33, + "grad_norm": 0.25948281052343475, + "learning_rate": 0.00019289635950669158, + "loss": 1.1315, + "step": 3442 + }, + { + "epoch": 0.33, + "grad_norm": 0.25383699970065504, + "learning_rate": 0.00019289050242614616, + "loss": 1.0563, + "step": 3443 + }, + { + "epoch": 0.33, + "grad_norm": 0.28630705685350616, + "learning_rate": 0.00019288464302095757, + "loss": 1.0699, + "step": 3444 + }, + { + "epoch": 0.33, + "grad_norm": 0.2622608246535195, + "learning_rate": 0.00019287878129127238, + "loss": 1.0172, + "step": 3445 + }, + { + "epoch": 0.33, + "grad_norm": 0.27001089784818566, + "learning_rate": 0.00019287291723723735, + "loss": 0.9808, + "step": 3446 + }, + { + "epoch": 0.33, + "grad_norm": 0.3040076339208099, + "learning_rate": 0.00019286705085899916, + "loss": 1.2147, + "step": 3447 + }, + { + "epoch": 0.33, + "grad_norm": 0.2996187859781569, + "learning_rate": 0.00019286118215670471, + "loss": 1.1517, + "step": 3448 + }, + { + "epoch": 0.33, + "grad_norm": 0.33317490146125284, + "learning_rate": 0.00019285531113050075, + "loss": 1.0603, + "step": 3449 + }, + { + "epoch": 0.33, + "grad_norm": 0.2900875599565296, + "learning_rate": 0.00019284943778053433, + "loss": 1.1299, + "step": 3450 + }, + { + "epoch": 0.33, + "grad_norm": 0.28251347088813633, + "learning_rate": 0.00019284356210695234, + "loss": 1.1064, + "step": 3451 + }, + { + "epoch": 0.33, + "grad_norm": 0.28341004906992046, + "learning_rate": 0.00019283768410990185, + "loss": 1.0892, + "step": 3452 + }, + { + "epoch": 0.33, + "grad_norm": 0.2883977918976394, + "learning_rate": 0.00019283180378953, + "loss": 1.1733, + "step": 3453 + }, + { + "epoch": 0.33, + "grad_norm": 0.260390677587717, + "learning_rate": 0.0001928259211459839, + "loss": 0.9542, + "step": 3454 + }, + { + "epoch": 0.33, + "grad_norm": 0.29136611867774026, + "learning_rate": 0.0001928200361794108, + "loss": 1.0044, + "step": 3455 + }, + { + "epoch": 0.33, + "grad_norm": 0.26446944983636694, + "learning_rate": 0.00019281414888995795, + "loss": 1.083, + "step": 3456 + }, + { + "epoch": 0.33, + "grad_norm": 0.2483864576667101, + "learning_rate": 0.0001928082592777727, + "loss": 1.0629, + "step": 3457 + }, + { + "epoch": 0.33, + "grad_norm": 0.24787693382866996, + "learning_rate": 0.00019280236734300243, + "loss": 1.0367, + "step": 3458 + }, + { + "epoch": 0.33, + "grad_norm": 0.3038463165988747, + "learning_rate": 0.00019279647308579457, + "loss": 1.0523, + "step": 3459 + }, + { + "epoch": 0.33, + "grad_norm": 0.278602231033513, + "learning_rate": 0.00019279057650629667, + "loss": 1.168, + "step": 3460 + }, + { + "epoch": 0.33, + "grad_norm": 0.2488098065303038, + "learning_rate": 0.0001927846776046563, + "loss": 1.0209, + "step": 3461 + }, + { + "epoch": 0.33, + "grad_norm": 0.2758374803041767, + "learning_rate": 0.00019277877638102103, + "loss": 1.2443, + "step": 3462 + }, + { + "epoch": 0.33, + "grad_norm": 0.26309237485251713, + "learning_rate": 0.00019277287283553856, + "loss": 0.9914, + "step": 3463 + }, + { + "epoch": 0.33, + "grad_norm": 0.23795915412482324, + "learning_rate": 0.00019276696696835668, + "loss": 1.1086, + "step": 3464 + }, + { + "epoch": 0.33, + "grad_norm": 0.3224730190795291, + "learning_rate": 0.0001927610587796231, + "loss": 1.2036, + "step": 3465 + }, + { + "epoch": 0.33, + "grad_norm": 0.282804393013691, + "learning_rate": 0.00019275514826948577, + "loss": 1.137, + "step": 3466 + }, + { + "epoch": 0.33, + "grad_norm": 0.26066639065374003, + "learning_rate": 0.00019274923543809253, + "loss": 1.127, + "step": 3467 + }, + { + "epoch": 0.33, + "grad_norm": 0.24788428251360412, + "learning_rate": 0.00019274332028559142, + "loss": 1.0519, + "step": 3468 + }, + { + "epoch": 0.33, + "grad_norm": 0.290526276121983, + "learning_rate": 0.0001927374028121304, + "loss": 1.1116, + "step": 3469 + }, + { + "epoch": 0.33, + "grad_norm": 0.2673170033387275, + "learning_rate": 0.00019273148301785759, + "loss": 1.1649, + "step": 3470 + }, + { + "epoch": 0.33, + "grad_norm": 0.264616160691623, + "learning_rate": 0.00019272556090292115, + "loss": 1.1035, + "step": 3471 + }, + { + "epoch": 0.33, + "grad_norm": 0.22386779170530668, + "learning_rate": 0.00019271963646746927, + "loss": 1.0443, + "step": 3472 + }, + { + "epoch": 0.33, + "grad_norm": 0.29588656285029913, + "learning_rate": 0.00019271370971165022, + "loss": 1.1571, + "step": 3473 + }, + { + "epoch": 0.33, + "grad_norm": 0.26869130366990424, + "learning_rate": 0.00019270778063561233, + "loss": 0.8483, + "step": 3474 + }, + { + "epoch": 0.33, + "grad_norm": 0.2442917412236581, + "learning_rate": 0.00019270184923950395, + "loss": 1.1119, + "step": 3475 + }, + { + "epoch": 0.33, + "grad_norm": 0.27434823182121076, + "learning_rate": 0.00019269591552347352, + "loss": 1.0498, + "step": 3476 + }, + { + "epoch": 0.33, + "grad_norm": 0.26614974020743387, + "learning_rate": 0.00019268997948766956, + "loss": 1.0791, + "step": 3477 + }, + { + "epoch": 0.33, + "grad_norm": 0.2959411757891701, + "learning_rate": 0.00019268404113224059, + "loss": 1.1704, + "step": 3478 + }, + { + "epoch": 0.33, + "grad_norm": 0.26576214735143877, + "learning_rate": 0.00019267810045733527, + "loss": 1.0451, + "step": 3479 + }, + { + "epoch": 0.33, + "grad_norm": 0.26797549883523514, + "learning_rate": 0.00019267215746310222, + "loss": 1.1066, + "step": 3480 + }, + { + "epoch": 0.33, + "grad_norm": 0.26415504530857153, + "learning_rate": 0.0001926662121496902, + "loss": 0.9906, + "step": 3481 + }, + { + "epoch": 0.33, + "grad_norm": 0.2811191242637275, + "learning_rate": 0.000192660264517248, + "loss": 1.2062, + "step": 3482 + }, + { + "epoch": 0.33, + "grad_norm": 0.26967275386109973, + "learning_rate": 0.0001926543145659244, + "loss": 1.1001, + "step": 3483 + }, + { + "epoch": 0.33, + "grad_norm": 0.24658725442432267, + "learning_rate": 0.00019264836229586837, + "loss": 1.0698, + "step": 3484 + }, + { + "epoch": 0.33, + "grad_norm": 0.2716734887510684, + "learning_rate": 0.00019264240770722885, + "loss": 1.0129, + "step": 3485 + }, + { + "epoch": 0.33, + "grad_norm": 0.24634587706914243, + "learning_rate": 0.00019263645080015485, + "loss": 0.9294, + "step": 3486 + }, + { + "epoch": 0.33, + "grad_norm": 0.26665979653082733, + "learning_rate": 0.00019263049157479544, + "loss": 1.0361, + "step": 3487 + }, + { + "epoch": 0.33, + "grad_norm": 0.262098888665953, + "learning_rate": 0.0001926245300312998, + "loss": 1.0617, + "step": 3488 + }, + { + "epoch": 0.33, + "grad_norm": 0.27380325184122906, + "learning_rate": 0.00019261856616981703, + "loss": 1.0235, + "step": 3489 + }, + { + "epoch": 0.33, + "grad_norm": 0.25372809019534937, + "learning_rate": 0.00019261259999049646, + "loss": 1.0733, + "step": 3490 + }, + { + "epoch": 0.33, + "grad_norm": 0.2703271495420422, + "learning_rate": 0.00019260663149348736, + "loss": 1.1621, + "step": 3491 + }, + { + "epoch": 0.33, + "grad_norm": 0.285631485877341, + "learning_rate": 0.00019260066067893915, + "loss": 1.119, + "step": 3492 + }, + { + "epoch": 0.33, + "grad_norm": 0.28067090620266755, + "learning_rate": 0.00019259468754700114, + "loss": 1.1283, + "step": 3493 + }, + { + "epoch": 0.33, + "grad_norm": 0.25704313202950074, + "learning_rate": 0.00019258871209782292, + "loss": 1.1286, + "step": 3494 + }, + { + "epoch": 0.33, + "grad_norm": 0.29208835332994176, + "learning_rate": 0.00019258273433155399, + "loss": 1.1172, + "step": 3495 + }, + { + "epoch": 0.33, + "grad_norm": 0.26926728161587604, + "learning_rate": 0.00019257675424834395, + "loss": 1.0684, + "step": 3496 + }, + { + "epoch": 0.33, + "grad_norm": 0.2732931154751818, + "learning_rate": 0.00019257077184834244, + "loss": 1.1041, + "step": 3497 + }, + { + "epoch": 0.33, + "grad_norm": 0.26420765126059936, + "learning_rate": 0.00019256478713169917, + "loss": 1.0886, + "step": 3498 + }, + { + "epoch": 0.33, + "grad_norm": 0.26082579867661154, + "learning_rate": 0.00019255880009856396, + "loss": 1.0275, + "step": 3499 + }, + { + "epoch": 0.33, + "grad_norm": 0.28137431081329883, + "learning_rate": 0.0001925528107490866, + "loss": 1.0764, + "step": 3500 + }, + { + "epoch": 0.33, + "grad_norm": 0.26907207440559944, + "learning_rate": 0.00019254681908341696, + "loss": 1.0703, + "step": 3501 + }, + { + "epoch": 0.34, + "grad_norm": 0.31906228924539476, + "learning_rate": 0.00019254082510170503, + "loss": 1.0448, + "step": 3502 + }, + { + "epoch": 0.34, + "grad_norm": 0.27170761831755913, + "learning_rate": 0.0001925348288041008, + "loss": 1.1028, + "step": 3503 + }, + { + "epoch": 0.34, + "grad_norm": 0.2855022956911831, + "learning_rate": 0.00019252883019075433, + "loss": 1.0736, + "step": 3504 + }, + { + "epoch": 0.34, + "grad_norm": 0.2857537178954334, + "learning_rate": 0.0001925228292618157, + "loss": 1.0172, + "step": 3505 + }, + { + "epoch": 0.34, + "grad_norm": 0.26288096894393326, + "learning_rate": 0.0001925168260174351, + "loss": 1.0262, + "step": 3506 + }, + { + "epoch": 0.34, + "grad_norm": 0.2682150517518947, + "learning_rate": 0.00019251082045776283, + "loss": 1.0227, + "step": 3507 + }, + { + "epoch": 0.34, + "grad_norm": 0.28175497164571117, + "learning_rate": 0.00019250481258294911, + "loss": 1.0507, + "step": 3508 + }, + { + "epoch": 0.34, + "grad_norm": 0.28227600468816066, + "learning_rate": 0.00019249880239314435, + "loss": 1.1972, + "step": 3509 + }, + { + "epoch": 0.34, + "grad_norm": 0.2933560845393136, + "learning_rate": 0.0001924927898884989, + "loss": 1.0141, + "step": 3510 + }, + { + "epoch": 0.34, + "grad_norm": 0.2659979972720397, + "learning_rate": 0.0001924867750691633, + "loss": 1.1605, + "step": 3511 + }, + { + "epoch": 0.34, + "grad_norm": 0.28458190404309464, + "learning_rate": 0.00019248075793528794, + "loss": 1.1147, + "step": 3512 + }, + { + "epoch": 0.34, + "grad_norm": 0.30482251375794783, + "learning_rate": 0.00019247473848702358, + "loss": 1.2108, + "step": 3513 + }, + { + "epoch": 0.34, + "grad_norm": 0.30589628132765245, + "learning_rate": 0.00019246871672452072, + "loss": 1.1377, + "step": 3514 + }, + { + "epoch": 0.34, + "grad_norm": 0.2515859551958094, + "learning_rate": 0.00019246269264793013, + "loss": 1.0792, + "step": 3515 + }, + { + "epoch": 0.34, + "grad_norm": 0.2822974779571469, + "learning_rate": 0.00019245666625740252, + "loss": 1.1263, + "step": 3516 + }, + { + "epoch": 0.34, + "grad_norm": 0.26320433675688415, + "learning_rate": 0.00019245063755308873, + "loss": 1.0395, + "step": 3517 + }, + { + "epoch": 0.34, + "grad_norm": 0.27747602120555126, + "learning_rate": 0.00019244460653513966, + "loss": 1.0819, + "step": 3518 + }, + { + "epoch": 0.34, + "grad_norm": 0.27754855559046226, + "learning_rate": 0.00019243857320370622, + "loss": 1.1293, + "step": 3519 + }, + { + "epoch": 0.34, + "grad_norm": 0.2718497648773705, + "learning_rate": 0.00019243253755893934, + "loss": 1.0904, + "step": 3520 + }, + { + "epoch": 0.34, + "grad_norm": 0.24497317555871573, + "learning_rate": 0.00019242649960099018, + "loss": 1.0817, + "step": 3521 + }, + { + "epoch": 0.34, + "grad_norm": 0.2832235216362736, + "learning_rate": 0.00019242045933000974, + "loss": 1.1896, + "step": 3522 + }, + { + "epoch": 0.34, + "grad_norm": 0.2804650893498078, + "learning_rate": 0.00019241441674614925, + "loss": 1.0998, + "step": 3523 + }, + { + "epoch": 0.34, + "grad_norm": 0.2746480898368063, + "learning_rate": 0.00019240837184955986, + "loss": 1.1329, + "step": 3524 + }, + { + "epoch": 0.34, + "grad_norm": 0.27811869745054746, + "learning_rate": 0.0001924023246403929, + "loss": 1.2432, + "step": 3525 + }, + { + "epoch": 0.34, + "grad_norm": 0.2438034625107248, + "learning_rate": 0.0001923962751187997, + "loss": 1.0454, + "step": 3526 + }, + { + "epoch": 0.34, + "grad_norm": 0.29872712215291863, + "learning_rate": 0.00019239022328493166, + "loss": 1.0986, + "step": 3527 + }, + { + "epoch": 0.34, + "grad_norm": 0.28352993745178234, + "learning_rate": 0.00019238416913894022, + "loss": 1.0937, + "step": 3528 + }, + { + "epoch": 0.34, + "grad_norm": 0.2573940016691036, + "learning_rate": 0.00019237811268097685, + "loss": 1.061, + "step": 3529 + }, + { + "epoch": 0.34, + "grad_norm": 0.26547291167111237, + "learning_rate": 0.00019237205391119317, + "loss": 1.1062, + "step": 3530 + }, + { + "epoch": 0.34, + "grad_norm": 0.2434569652472095, + "learning_rate": 0.0001923659928297408, + "loss": 0.994, + "step": 3531 + }, + { + "epoch": 0.34, + "grad_norm": 0.24604489384038644, + "learning_rate": 0.00019235992943677138, + "loss": 1.0407, + "step": 3532 + }, + { + "epoch": 0.34, + "grad_norm": 0.2566897641461854, + "learning_rate": 0.0001923538637324367, + "loss": 1.0814, + "step": 3533 + }, + { + "epoch": 0.34, + "grad_norm": 0.2833118574755522, + "learning_rate": 0.00019234779571688856, + "loss": 1.0133, + "step": 3534 + }, + { + "epoch": 0.34, + "grad_norm": 0.2641589772291687, + "learning_rate": 0.00019234172539027875, + "loss": 1.0779, + "step": 3535 + }, + { + "epoch": 0.34, + "grad_norm": 0.26348102788720923, + "learning_rate": 0.00019233565275275926, + "loss": 1.0349, + "step": 3536 + }, + { + "epoch": 0.34, + "grad_norm": 0.26442584979280237, + "learning_rate": 0.00019232957780448203, + "loss": 1.0908, + "step": 3537 + }, + { + "epoch": 0.34, + "grad_norm": 0.21497138036257077, + "learning_rate": 0.00019232350054559908, + "loss": 1.0206, + "step": 3538 + }, + { + "epoch": 0.34, + "grad_norm": 0.28584115929317455, + "learning_rate": 0.00019231742097626248, + "loss": 1.0948, + "step": 3539 + }, + { + "epoch": 0.34, + "grad_norm": 0.28169323923794704, + "learning_rate": 0.00019231133909662442, + "loss": 1.1113, + "step": 3540 + }, + { + "epoch": 0.34, + "grad_norm": 0.27099757848452277, + "learning_rate": 0.0001923052549068371, + "loss": 1.1069, + "step": 3541 + }, + { + "epoch": 0.34, + "grad_norm": 0.2576793427814868, + "learning_rate": 0.00019229916840705276, + "loss": 0.9917, + "step": 3542 + }, + { + "epoch": 0.34, + "grad_norm": 0.261636637446303, + "learning_rate": 0.0001922930795974237, + "loss": 1.0584, + "step": 3543 + }, + { + "epoch": 0.34, + "grad_norm": 0.27616031671725766, + "learning_rate": 0.0001922869884781023, + "loss": 1.0674, + "step": 3544 + }, + { + "epoch": 0.34, + "grad_norm": 0.2603326493271169, + "learning_rate": 0.0001922808950492411, + "loss": 0.9925, + "step": 3545 + }, + { + "epoch": 0.34, + "grad_norm": 0.2611098162609211, + "learning_rate": 0.00019227479931099243, + "loss": 1.1144, + "step": 3546 + }, + { + "epoch": 0.34, + "grad_norm": 0.28213224868364634, + "learning_rate": 0.00019226870126350893, + "loss": 0.9999, + "step": 3547 + }, + { + "epoch": 0.34, + "grad_norm": 0.26909330897869277, + "learning_rate": 0.00019226260090694322, + "loss": 1.1827, + "step": 3548 + }, + { + "epoch": 0.34, + "grad_norm": 0.2204883513538407, + "learning_rate": 0.00019225649824144788, + "loss": 1.0633, + "step": 3549 + }, + { + "epoch": 0.34, + "grad_norm": 0.2800738248678148, + "learning_rate": 0.00019225039326717575, + "loss": 1.0618, + "step": 3550 + }, + { + "epoch": 0.34, + "grad_norm": 0.3026024017055852, + "learning_rate": 0.0001922442859842795, + "loss": 1.0372, + "step": 3551 + }, + { + "epoch": 0.34, + "grad_norm": 0.25052538631469556, + "learning_rate": 0.00019223817639291206, + "loss": 1.1017, + "step": 3552 + }, + { + "epoch": 0.34, + "grad_norm": 0.2534940085440102, + "learning_rate": 0.00019223206449322627, + "loss": 1.063, + "step": 3553 + }, + { + "epoch": 0.34, + "grad_norm": 0.2829769188702781, + "learning_rate": 0.0001922259502853751, + "loss": 1.0918, + "step": 3554 + }, + { + "epoch": 0.34, + "grad_norm": 0.2515647647546932, + "learning_rate": 0.0001922198337695116, + "loss": 1.1084, + "step": 3555 + }, + { + "epoch": 0.34, + "grad_norm": 0.2716176574803204, + "learning_rate": 0.00019221371494578874, + "loss": 1.1048, + "step": 3556 + }, + { + "epoch": 0.34, + "grad_norm": 0.2544586203933426, + "learning_rate": 0.00019220759381435976, + "loss": 0.9334, + "step": 3557 + }, + { + "epoch": 0.34, + "grad_norm": 0.25686564637963666, + "learning_rate": 0.00019220147037537775, + "loss": 1.1342, + "step": 3558 + }, + { + "epoch": 0.34, + "grad_norm": 0.2856418475083659, + "learning_rate": 0.00019219534462899603, + "loss": 1.0993, + "step": 3559 + }, + { + "epoch": 0.34, + "grad_norm": 0.25892037090159264, + "learning_rate": 0.00019218921657536785, + "loss": 1.1242, + "step": 3560 + }, + { + "epoch": 0.34, + "grad_norm": 0.22637468596243374, + "learning_rate": 0.00019218308621464657, + "loss": 0.9695, + "step": 3561 + }, + { + "epoch": 0.34, + "grad_norm": 0.24205716240527825, + "learning_rate": 0.00019217695354698566, + "loss": 0.9977, + "step": 3562 + }, + { + "epoch": 0.34, + "grad_norm": 0.28419771649107817, + "learning_rate": 0.00019217081857253855, + "loss": 1.1305, + "step": 3563 + }, + { + "epoch": 0.34, + "grad_norm": 0.2997025561180733, + "learning_rate": 0.00019216468129145878, + "loss": 1.1392, + "step": 3564 + }, + { + "epoch": 0.34, + "grad_norm": 0.2575883782828923, + "learning_rate": 0.00019215854170389992, + "loss": 1.1146, + "step": 3565 + }, + { + "epoch": 0.34, + "grad_norm": 0.2378156355843756, + "learning_rate": 0.00019215239981001565, + "loss": 1.1623, + "step": 3566 + }, + { + "epoch": 0.34, + "grad_norm": 0.22766346459594225, + "learning_rate": 0.00019214625560995963, + "loss": 1.1813, + "step": 3567 + }, + { + "epoch": 0.34, + "grad_norm": 0.2815971957249664, + "learning_rate": 0.0001921401091038857, + "loss": 1.0946, + "step": 3568 + }, + { + "epoch": 0.34, + "grad_norm": 0.24854801853444775, + "learning_rate": 0.0001921339602919476, + "loss": 1.1996, + "step": 3569 + }, + { + "epoch": 0.34, + "grad_norm": 0.28777374446942766, + "learning_rate": 0.00019212780917429923, + "loss": 1.2163, + "step": 3570 + }, + { + "epoch": 0.34, + "grad_norm": 0.2755310826907456, + "learning_rate": 0.00019212165575109452, + "loss": 1.2163, + "step": 3571 + }, + { + "epoch": 0.34, + "grad_norm": 0.23202223527427898, + "learning_rate": 0.00019211550002248755, + "loss": 1.0259, + "step": 3572 + }, + { + "epoch": 0.34, + "grad_norm": 0.25126040011653317, + "learning_rate": 0.00019210934198863225, + "loss": 1.1719, + "step": 3573 + }, + { + "epoch": 0.34, + "grad_norm": 0.31182851203253326, + "learning_rate": 0.00019210318164968276, + "loss": 0.9726, + "step": 3574 + }, + { + "epoch": 0.34, + "grad_norm": 0.30037163533010824, + "learning_rate": 0.00019209701900579332, + "loss": 1.1423, + "step": 3575 + }, + { + "epoch": 0.34, + "grad_norm": 0.2769715350587386, + "learning_rate": 0.00019209085405711806, + "loss": 1.023, + "step": 3576 + }, + { + "epoch": 0.34, + "grad_norm": 0.2737139517166542, + "learning_rate": 0.0001920846868038113, + "loss": 1.1156, + "step": 3577 + }, + { + "epoch": 0.34, + "grad_norm": 0.27215926972723625, + "learning_rate": 0.00019207851724602738, + "loss": 1.2292, + "step": 3578 + }, + { + "epoch": 0.34, + "grad_norm": 0.25589677103292885, + "learning_rate": 0.0001920723453839207, + "loss": 1.066, + "step": 3579 + }, + { + "epoch": 0.34, + "grad_norm": 0.27063989588024306, + "learning_rate": 0.00019206617121764573, + "loss": 1.1828, + "step": 3580 + }, + { + "epoch": 0.34, + "grad_norm": 0.2621987437844738, + "learning_rate": 0.00019205999474735695, + "loss": 1.113, + "step": 3581 + }, + { + "epoch": 0.34, + "grad_norm": 0.27664343721303936, + "learning_rate": 0.00019205381597320895, + "loss": 1.0935, + "step": 3582 + }, + { + "epoch": 0.34, + "grad_norm": 0.2521436526457091, + "learning_rate": 0.00019204763489535633, + "loss": 1.0474, + "step": 3583 + }, + { + "epoch": 0.34, + "grad_norm": 0.29893749568707256, + "learning_rate": 0.00019204145151395383, + "loss": 1.0565, + "step": 3584 + }, + { + "epoch": 0.34, + "grad_norm": 0.2865188185320406, + "learning_rate": 0.00019203526582915615, + "loss": 1.0099, + "step": 3585 + }, + { + "epoch": 0.34, + "grad_norm": 0.30840096394075645, + "learning_rate": 0.0001920290778411181, + "loss": 1.0744, + "step": 3586 + }, + { + "epoch": 0.34, + "grad_norm": 0.27719489340614456, + "learning_rate": 0.00019202288754999454, + "loss": 1.1818, + "step": 3587 + }, + { + "epoch": 0.34, + "grad_norm": 0.26038266306240404, + "learning_rate": 0.00019201669495594036, + "loss": 1.1681, + "step": 3588 + }, + { + "epoch": 0.34, + "grad_norm": 0.2535864656959433, + "learning_rate": 0.00019201050005911057, + "loss": 1.0594, + "step": 3589 + }, + { + "epoch": 0.34, + "grad_norm": 0.26996214011774267, + "learning_rate": 0.0001920043028596602, + "loss": 1.0319, + "step": 3590 + }, + { + "epoch": 0.34, + "grad_norm": 0.293644543486573, + "learning_rate": 0.00019199810335774432, + "loss": 1.0956, + "step": 3591 + }, + { + "epoch": 0.34, + "grad_norm": 0.24802640655532923, + "learning_rate": 0.0001919919015535181, + "loss": 1.028, + "step": 3592 + }, + { + "epoch": 0.34, + "grad_norm": 0.288494054514753, + "learning_rate": 0.0001919856974471367, + "loss": 1.2163, + "step": 3593 + }, + { + "epoch": 0.34, + "grad_norm": 0.28835626422546046, + "learning_rate": 0.00019197949103875542, + "loss": 1.1595, + "step": 3594 + }, + { + "epoch": 0.34, + "grad_norm": 0.2834452761231032, + "learning_rate": 0.00019197328232852957, + "loss": 1.2283, + "step": 3595 + }, + { + "epoch": 0.34, + "grad_norm": 0.2712391659872608, + "learning_rate": 0.00019196707131661456, + "loss": 0.9865, + "step": 3596 + }, + { + "epoch": 0.34, + "grad_norm": 0.28960322854085857, + "learning_rate": 0.00019196085800316577, + "loss": 1.0389, + "step": 3597 + }, + { + "epoch": 0.34, + "grad_norm": 0.26819478685827985, + "learning_rate": 0.00019195464238833872, + "loss": 1.0676, + "step": 3598 + }, + { + "epoch": 0.34, + "grad_norm": 0.27909258767693773, + "learning_rate": 0.00019194842447228894, + "loss": 1.1414, + "step": 3599 + }, + { + "epoch": 0.34, + "grad_norm": 0.29782335526237375, + "learning_rate": 0.00019194220425517203, + "loss": 1.0944, + "step": 3600 + }, + { + "epoch": 0.34, + "grad_norm": 0.2990681773116163, + "learning_rate": 0.00019193598173714368, + "loss": 1.2351, + "step": 3601 + }, + { + "epoch": 0.34, + "grad_norm": 0.25309601233066126, + "learning_rate": 0.00019192975691835967, + "loss": 0.9833, + "step": 3602 + }, + { + "epoch": 0.34, + "grad_norm": 0.2530498249236964, + "learning_rate": 0.00019192352979897564, + "loss": 1.1116, + "step": 3603 + }, + { + "epoch": 0.34, + "grad_norm": 0.261372825515641, + "learning_rate": 0.00019191730037914755, + "loss": 1.1451, + "step": 3604 + }, + { + "epoch": 0.34, + "grad_norm": 0.2224404725086543, + "learning_rate": 0.00019191106865903125, + "loss": 1.1346, + "step": 3605 + }, + { + "epoch": 0.34, + "grad_norm": 0.24745135431170612, + "learning_rate": 0.00019190483463878266, + "loss": 1.0474, + "step": 3606 + }, + { + "epoch": 0.35, + "grad_norm": 0.2680064496452726, + "learning_rate": 0.00019189859831855786, + "loss": 1.0726, + "step": 3607 + }, + { + "epoch": 0.35, + "grad_norm": 0.2728343814388248, + "learning_rate": 0.00019189235969851285, + "loss": 1.143, + "step": 3608 + }, + { + "epoch": 0.35, + "grad_norm": 0.2758236497986754, + "learning_rate": 0.0001918861187788038, + "loss": 1.1073, + "step": 3609 + }, + { + "epoch": 0.35, + "grad_norm": 0.24891564132409086, + "learning_rate": 0.00019187987555958688, + "loss": 0.9501, + "step": 3610 + }, + { + "epoch": 0.35, + "grad_norm": 0.28175698358764867, + "learning_rate": 0.00019187363004101834, + "loss": 1.119, + "step": 3611 + }, + { + "epoch": 0.35, + "grad_norm": 0.29465454747771924, + "learning_rate": 0.00019186738222325446, + "loss": 1.1071, + "step": 3612 + }, + { + "epoch": 0.35, + "grad_norm": 0.28563894659906613, + "learning_rate": 0.00019186113210645158, + "loss": 0.9567, + "step": 3613 + }, + { + "epoch": 0.35, + "grad_norm": 0.2665261670344539, + "learning_rate": 0.00019185487969076618, + "loss": 1.0447, + "step": 3614 + }, + { + "epoch": 0.35, + "grad_norm": 0.29057112065403007, + "learning_rate": 0.00019184862497635466, + "loss": 1.094, + "step": 3615 + }, + { + "epoch": 0.35, + "grad_norm": 0.24917921323756612, + "learning_rate": 0.0001918423679633736, + "loss": 1.0332, + "step": 3616 + }, + { + "epoch": 0.35, + "grad_norm": 0.25724912043463755, + "learning_rate": 0.0001918361086519795, + "loss": 1.1526, + "step": 3617 + }, + { + "epoch": 0.35, + "grad_norm": 0.30458714457974434, + "learning_rate": 0.00019182984704232912, + "loss": 1.1059, + "step": 3618 + }, + { + "epoch": 0.35, + "grad_norm": 0.233003803525938, + "learning_rate": 0.00019182358313457907, + "loss": 1.0059, + "step": 3619 + }, + { + "epoch": 0.35, + "grad_norm": 0.3095013736710616, + "learning_rate": 0.00019181731692888615, + "loss": 1.2226, + "step": 3620 + }, + { + "epoch": 0.35, + "grad_norm": 0.3112580093485402, + "learning_rate": 0.0001918110484254072, + "loss": 1.0111, + "step": 3621 + }, + { + "epoch": 0.35, + "grad_norm": 0.2615583637323915, + "learning_rate": 0.00019180477762429905, + "loss": 1.0417, + "step": 3622 + }, + { + "epoch": 0.35, + "grad_norm": 0.25331226852742855, + "learning_rate": 0.00019179850452571864, + "loss": 1.1169, + "step": 3623 + }, + { + "epoch": 0.35, + "grad_norm": 0.2653623351810319, + "learning_rate": 0.00019179222912982295, + "loss": 1.0001, + "step": 3624 + }, + { + "epoch": 0.35, + "grad_norm": 0.3027616299847975, + "learning_rate": 0.00019178595143676903, + "loss": 1.0122, + "step": 3625 + }, + { + "epoch": 0.35, + "grad_norm": 0.2596463436944122, + "learning_rate": 0.00019177967144671403, + "loss": 1.0603, + "step": 3626 + }, + { + "epoch": 0.35, + "grad_norm": 0.26322578302738187, + "learning_rate": 0.00019177338915981503, + "loss": 1.1179, + "step": 3627 + }, + { + "epoch": 0.35, + "grad_norm": 0.26304450867502877, + "learning_rate": 0.00019176710457622932, + "loss": 1.1117, + "step": 3628 + }, + { + "epoch": 0.35, + "grad_norm": 0.26025672008441436, + "learning_rate": 0.00019176081769611413, + "loss": 1.0988, + "step": 3629 + }, + { + "epoch": 0.35, + "grad_norm": 0.264676786106681, + "learning_rate": 0.00019175452851962678, + "loss": 1.1449, + "step": 3630 + }, + { + "epoch": 0.35, + "grad_norm": 0.28753505819216246, + "learning_rate": 0.00019174823704692473, + "loss": 0.9785, + "step": 3631 + }, + { + "epoch": 0.35, + "grad_norm": 0.27286886659302656, + "learning_rate": 0.00019174194327816534, + "loss": 1.0712, + "step": 3632 + }, + { + "epoch": 0.35, + "grad_norm": 0.2851788136509597, + "learning_rate": 0.0001917356472135062, + "loss": 1.1066, + "step": 3633 + }, + { + "epoch": 0.35, + "grad_norm": 0.2596390703916512, + "learning_rate": 0.00019172934885310484, + "loss": 1.03, + "step": 3634 + }, + { + "epoch": 0.35, + "grad_norm": 0.27793648005875066, + "learning_rate": 0.00019172304819711886, + "loss": 1.0643, + "step": 3635 + }, + { + "epoch": 0.35, + "grad_norm": 0.2785181884043944, + "learning_rate": 0.00019171674524570595, + "loss": 1.1464, + "step": 3636 + }, + { + "epoch": 0.35, + "grad_norm": 0.30776612944819687, + "learning_rate": 0.00019171043999902386, + "loss": 1.0927, + "step": 3637 + }, + { + "epoch": 0.35, + "grad_norm": 0.2477006043242119, + "learning_rate": 0.00019170413245723036, + "loss": 1.0933, + "step": 3638 + }, + { + "epoch": 0.35, + "grad_norm": 0.28543848973925623, + "learning_rate": 0.00019169782262048332, + "loss": 1.0738, + "step": 3639 + }, + { + "epoch": 0.35, + "grad_norm": 0.23369616317344152, + "learning_rate": 0.00019169151048894066, + "loss": 1.0539, + "step": 3640 + }, + { + "epoch": 0.35, + "grad_norm": 0.26607663383639485, + "learning_rate": 0.00019168519606276027, + "loss": 1.0068, + "step": 3641 + }, + { + "epoch": 0.35, + "grad_norm": 0.27489969296500305, + "learning_rate": 0.00019167887934210028, + "loss": 1.1252, + "step": 3642 + }, + { + "epoch": 0.35, + "grad_norm": 0.26853165269525164, + "learning_rate": 0.00019167256032711868, + "loss": 1.1099, + "step": 3643 + }, + { + "epoch": 0.35, + "grad_norm": 0.26605252525945594, + "learning_rate": 0.00019166623901797366, + "loss": 1.063, + "step": 3644 + }, + { + "epoch": 0.35, + "grad_norm": 0.2857813746118553, + "learning_rate": 0.0001916599154148234, + "loss": 1.1668, + "step": 3645 + }, + { + "epoch": 0.35, + "grad_norm": 0.3004731926385603, + "learning_rate": 0.00019165358951782612, + "loss": 1.1223, + "step": 3646 + }, + { + "epoch": 0.35, + "grad_norm": 0.30149091959544416, + "learning_rate": 0.0001916472613271402, + "loss": 1.0124, + "step": 3647 + }, + { + "epoch": 0.35, + "grad_norm": 0.24733679580665724, + "learning_rate": 0.00019164093084292393, + "loss": 1.1462, + "step": 3648 + }, + { + "epoch": 0.35, + "grad_norm": 0.21990278220565118, + "learning_rate": 0.00019163459806533582, + "loss": 1.0147, + "step": 3649 + }, + { + "epoch": 0.35, + "grad_norm": 0.26246347077998894, + "learning_rate": 0.00019162826299453427, + "loss": 1.0042, + "step": 3650 + }, + { + "epoch": 0.35, + "grad_norm": 0.2552863623874917, + "learning_rate": 0.00019162192563067785, + "loss": 1.0509, + "step": 3651 + }, + { + "epoch": 0.35, + "grad_norm": 0.26792837644378786, + "learning_rate": 0.00019161558597392516, + "loss": 1.0793, + "step": 3652 + }, + { + "epoch": 0.35, + "grad_norm": 0.26602863063633964, + "learning_rate": 0.00019160924402443488, + "loss": 1.0786, + "step": 3653 + }, + { + "epoch": 0.35, + "grad_norm": 0.2748453626108522, + "learning_rate": 0.00019160289978236567, + "loss": 1.1711, + "step": 3654 + }, + { + "epoch": 0.35, + "grad_norm": 0.2892103795799046, + "learning_rate": 0.00019159655324787634, + "loss": 1.0855, + "step": 3655 + }, + { + "epoch": 0.35, + "grad_norm": 0.29356881676510077, + "learning_rate": 0.00019159020442112567, + "loss": 1.0765, + "step": 3656 + }, + { + "epoch": 0.35, + "grad_norm": 0.25797959221770866, + "learning_rate": 0.0001915838533022726, + "loss": 1.0762, + "step": 3657 + }, + { + "epoch": 0.35, + "grad_norm": 0.24086139476910498, + "learning_rate": 0.00019157749989147602, + "loss": 1.0265, + "step": 3658 + }, + { + "epoch": 0.35, + "grad_norm": 0.29161814627285754, + "learning_rate": 0.00019157114418889498, + "loss": 1.0909, + "step": 3659 + }, + { + "epoch": 0.35, + "grad_norm": 0.2899831061438026, + "learning_rate": 0.0001915647861946885, + "loss": 1.0793, + "step": 3660 + }, + { + "epoch": 0.35, + "grad_norm": 0.2444771404216185, + "learning_rate": 0.00019155842590901564, + "loss": 1.0686, + "step": 3661 + }, + { + "epoch": 0.35, + "grad_norm": 0.2757863183788381, + "learning_rate": 0.0001915520633320357, + "loss": 1.1698, + "step": 3662 + }, + { + "epoch": 0.35, + "grad_norm": 0.2850756152744979, + "learning_rate": 0.00019154569846390783, + "loss": 1.2098, + "step": 3663 + }, + { + "epoch": 0.35, + "grad_norm": 0.2229292059572456, + "learning_rate": 0.00019153933130479128, + "loss": 1.1241, + "step": 3664 + }, + { + "epoch": 0.35, + "grad_norm": 0.2763336437474556, + "learning_rate": 0.00019153296185484545, + "loss": 1.0689, + "step": 3665 + }, + { + "epoch": 0.35, + "grad_norm": 0.26355818754319077, + "learning_rate": 0.00019152659011422975, + "loss": 1.1286, + "step": 3666 + }, + { + "epoch": 0.35, + "grad_norm": 0.2834332018069187, + "learning_rate": 0.0001915202160831036, + "loss": 1.1524, + "step": 3667 + }, + { + "epoch": 0.35, + "grad_norm": 0.2829528091832065, + "learning_rate": 0.0001915138397616265, + "loss": 1.1356, + "step": 3668 + }, + { + "epoch": 0.35, + "grad_norm": 0.2549124521154805, + "learning_rate": 0.00019150746114995809, + "loss": 1.0091, + "step": 3669 + }, + { + "epoch": 0.35, + "grad_norm": 0.2787455502134115, + "learning_rate": 0.00019150108024825792, + "loss": 1.1973, + "step": 3670 + }, + { + "epoch": 0.35, + "grad_norm": 0.3045237462568005, + "learning_rate": 0.00019149469705668574, + "loss": 1.0021, + "step": 3671 + }, + { + "epoch": 0.35, + "grad_norm": 0.2563873686787024, + "learning_rate": 0.00019148831157540122, + "loss": 1.0652, + "step": 3672 + }, + { + "epoch": 0.35, + "grad_norm": 0.26639470855690445, + "learning_rate": 0.0001914819238045642, + "loss": 1.0402, + "step": 3673 + }, + { + "epoch": 0.35, + "grad_norm": 0.24615772856913087, + "learning_rate": 0.00019147553374433457, + "loss": 1.1036, + "step": 3674 + }, + { + "epoch": 0.35, + "grad_norm": 0.26050952114504417, + "learning_rate": 0.00019146914139487223, + "loss": 1.082, + "step": 3675 + }, + { + "epoch": 0.35, + "grad_norm": 0.2711202214111707, + "learning_rate": 0.00019146274675633712, + "loss": 1.022, + "step": 3676 + }, + { + "epoch": 0.35, + "grad_norm": 0.27642386341556874, + "learning_rate": 0.0001914563498288893, + "loss": 1.1196, + "step": 3677 + }, + { + "epoch": 0.35, + "grad_norm": 0.26017602194644773, + "learning_rate": 0.00019144995061268886, + "loss": 1.0743, + "step": 3678 + }, + { + "epoch": 0.35, + "grad_norm": 0.2802611591626233, + "learning_rate": 0.00019144354910789592, + "loss": 1.1542, + "step": 3679 + }, + { + "epoch": 0.35, + "grad_norm": 0.30109865674446373, + "learning_rate": 0.00019143714531467067, + "loss": 1.0761, + "step": 3680 + }, + { + "epoch": 0.35, + "grad_norm": 0.29232637101985814, + "learning_rate": 0.00019143073923317343, + "loss": 1.1051, + "step": 3681 + }, + { + "epoch": 0.35, + "grad_norm": 0.26726588819259994, + "learning_rate": 0.00019142433086356445, + "loss": 1.1052, + "step": 3682 + }, + { + "epoch": 0.35, + "grad_norm": 0.3170937940560179, + "learning_rate": 0.00019141792020600416, + "loss": 1.0991, + "step": 3683 + }, + { + "epoch": 0.35, + "grad_norm": 0.26117278170269886, + "learning_rate": 0.00019141150726065292, + "loss": 1.1537, + "step": 3684 + }, + { + "epoch": 0.35, + "grad_norm": 0.23933035001939557, + "learning_rate": 0.00019140509202767128, + "loss": 1.0034, + "step": 3685 + }, + { + "epoch": 0.35, + "grad_norm": 0.261470828936781, + "learning_rate": 0.00019139867450721978, + "loss": 1.0356, + "step": 3686 + }, + { + "epoch": 0.35, + "grad_norm": 0.28669585869759606, + "learning_rate": 0.000191392254699459, + "loss": 1.1138, + "step": 3687 + }, + { + "epoch": 0.35, + "grad_norm": 0.28522961501456195, + "learning_rate": 0.00019138583260454962, + "loss": 1.1261, + "step": 3688 + }, + { + "epoch": 0.35, + "grad_norm": 0.24515845894099997, + "learning_rate": 0.00019137940822265234, + "loss": 1.0393, + "step": 3689 + }, + { + "epoch": 0.35, + "grad_norm": 0.30181790008831666, + "learning_rate": 0.00019137298155392794, + "loss": 1.1297, + "step": 3690 + }, + { + "epoch": 0.35, + "grad_norm": 0.23025123557623256, + "learning_rate": 0.0001913665525985372, + "loss": 0.9475, + "step": 3691 + }, + { + "epoch": 0.35, + "grad_norm": 0.2916644640665306, + "learning_rate": 0.0001913601213566411, + "loss": 1.0052, + "step": 3692 + }, + { + "epoch": 0.35, + "grad_norm": 0.2914800928299035, + "learning_rate": 0.00019135368782840058, + "loss": 1.0656, + "step": 3693 + }, + { + "epoch": 0.35, + "grad_norm": 0.2638508110242151, + "learning_rate": 0.00019134725201397655, + "loss": 1.1046, + "step": 3694 + }, + { + "epoch": 0.35, + "grad_norm": 0.2862216325349102, + "learning_rate": 0.00019134081391353018, + "loss": 1.1441, + "step": 3695 + }, + { + "epoch": 0.35, + "grad_norm": 0.31225707110177847, + "learning_rate": 0.00019133437352722253, + "loss": 1.143, + "step": 3696 + }, + { + "epoch": 0.35, + "grad_norm": 0.30593521073577173, + "learning_rate": 0.00019132793085521477, + "loss": 1.1625, + "step": 3697 + }, + { + "epoch": 0.35, + "grad_norm": 0.29258514429430604, + "learning_rate": 0.00019132148589766812, + "loss": 1.1549, + "step": 3698 + }, + { + "epoch": 0.35, + "grad_norm": 0.25691353350560925, + "learning_rate": 0.00019131503865474388, + "loss": 1.086, + "step": 3699 + }, + { + "epoch": 0.35, + "grad_norm": 0.28413967582202954, + "learning_rate": 0.00019130858912660346, + "loss": 1.0516, + "step": 3700 + }, + { + "epoch": 0.35, + "grad_norm": 0.2486682279993249, + "learning_rate": 0.0001913021373134082, + "loss": 1.144, + "step": 3701 + }, + { + "epoch": 0.35, + "grad_norm": 0.2799664249634028, + "learning_rate": 0.00019129568321531957, + "loss": 1.203, + "step": 3702 + }, + { + "epoch": 0.35, + "grad_norm": 0.2360258518505738, + "learning_rate": 0.00019128922683249905, + "loss": 1.0592, + "step": 3703 + }, + { + "epoch": 0.35, + "grad_norm": 0.28969445762878065, + "learning_rate": 0.0001912827681651083, + "loss": 1.2039, + "step": 3704 + }, + { + "epoch": 0.35, + "grad_norm": 0.2925284783203515, + "learning_rate": 0.0001912763072133089, + "loss": 1.0849, + "step": 3705 + }, + { + "epoch": 0.35, + "grad_norm": 0.33511999769210443, + "learning_rate": 0.00019126984397726252, + "loss": 1.0263, + "step": 3706 + }, + { + "epoch": 0.35, + "grad_norm": 0.2898231101162137, + "learning_rate": 0.00019126337845713098, + "loss": 1.042, + "step": 3707 + }, + { + "epoch": 0.35, + "grad_norm": 0.2787807763701908, + "learning_rate": 0.000191256910653076, + "loss": 0.9801, + "step": 3708 + }, + { + "epoch": 0.35, + "grad_norm": 0.2655169550180212, + "learning_rate": 0.0001912504405652595, + "loss": 1.024, + "step": 3709 + }, + { + "epoch": 0.35, + "grad_norm": 0.23484914814404617, + "learning_rate": 0.00019124396819384336, + "loss": 1.083, + "step": 3710 + }, + { + "epoch": 0.36, + "grad_norm": 0.2399809754737731, + "learning_rate": 0.00019123749353898957, + "loss": 0.8806, + "step": 3711 + }, + { + "epoch": 0.36, + "grad_norm": 0.25071481306083293, + "learning_rate": 0.00019123101660086018, + "loss": 1.0832, + "step": 3712 + }, + { + "epoch": 0.36, + "grad_norm": 0.26019964031949383, + "learning_rate": 0.00019122453737961724, + "loss": 1.105, + "step": 3713 + }, + { + "epoch": 0.36, + "grad_norm": 0.27184915954233857, + "learning_rate": 0.00019121805587542294, + "loss": 1.1295, + "step": 3714 + }, + { + "epoch": 0.36, + "grad_norm": 0.3232217248521111, + "learning_rate": 0.00019121157208843947, + "loss": 1.0362, + "step": 3715 + }, + { + "epoch": 0.36, + "grad_norm": 0.2484485517327626, + "learning_rate": 0.00019120508601882906, + "loss": 1.0417, + "step": 3716 + }, + { + "epoch": 0.36, + "grad_norm": 0.28882594941774437, + "learning_rate": 0.00019119859766675407, + "loss": 1.0759, + "step": 3717 + }, + { + "epoch": 0.36, + "grad_norm": 0.26746773610204805, + "learning_rate": 0.00019119210703237685, + "loss": 0.9896, + "step": 3718 + }, + { + "epoch": 0.36, + "grad_norm": 0.3030903175749709, + "learning_rate": 0.00019118561411585986, + "loss": 1.0477, + "step": 3719 + }, + { + "epoch": 0.36, + "grad_norm": 0.2647502528125401, + "learning_rate": 0.00019117911891736552, + "loss": 1.0581, + "step": 3720 + }, + { + "epoch": 0.36, + "grad_norm": 0.31861248281328974, + "learning_rate": 0.00019117262143705647, + "loss": 1.144, + "step": 3721 + }, + { + "epoch": 0.36, + "grad_norm": 0.25783949986971005, + "learning_rate": 0.00019116612167509526, + "loss": 1.2301, + "step": 3722 + }, + { + "epoch": 0.36, + "grad_norm": 0.24441169216486786, + "learning_rate": 0.00019115961963164454, + "loss": 1.118, + "step": 3723 + }, + { + "epoch": 0.36, + "grad_norm": 0.2659364840495592, + "learning_rate": 0.00019115311530686706, + "loss": 1.0523, + "step": 3724 + }, + { + "epoch": 0.36, + "grad_norm": 0.2522530337190297, + "learning_rate": 0.00019114660870092558, + "loss": 0.9005, + "step": 3725 + }, + { + "epoch": 0.36, + "grad_norm": 0.2647152274884048, + "learning_rate": 0.00019114009981398294, + "loss": 1.0931, + "step": 3726 + }, + { + "epoch": 0.36, + "grad_norm": 0.2702016934536367, + "learning_rate": 0.000191133588646202, + "loss": 1.0832, + "step": 3727 + }, + { + "epoch": 0.36, + "grad_norm": 0.24668859082298314, + "learning_rate": 0.00019112707519774576, + "loss": 0.9758, + "step": 3728 + }, + { + "epoch": 0.36, + "grad_norm": 0.2672446623933551, + "learning_rate": 0.0001911205594687772, + "loss": 1.149, + "step": 3729 + }, + { + "epoch": 0.36, + "grad_norm": 0.2600484573038422, + "learning_rate": 0.00019111404145945933, + "loss": 1.057, + "step": 3730 + }, + { + "epoch": 0.36, + "grad_norm": 0.31076021763085576, + "learning_rate": 0.00019110752116995535, + "loss": 1.1161, + "step": 3731 + }, + { + "epoch": 0.36, + "grad_norm": 0.2626503992229002, + "learning_rate": 0.00019110099860042835, + "loss": 1.1429, + "step": 3732 + }, + { + "epoch": 0.36, + "grad_norm": 0.2550015497297485, + "learning_rate": 0.00019109447375104165, + "loss": 1.207, + "step": 3733 + }, + { + "epoch": 0.36, + "grad_norm": 0.29381829699592593, + "learning_rate": 0.00019108794662195847, + "loss": 1.071, + "step": 3734 + }, + { + "epoch": 0.36, + "grad_norm": 0.2582343525213059, + "learning_rate": 0.00019108141721334217, + "loss": 1.0302, + "step": 3735 + }, + { + "epoch": 0.36, + "grad_norm": 0.3033664358520454, + "learning_rate": 0.00019107488552535617, + "loss": 1.0559, + "step": 3736 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804498013242838, + "learning_rate": 0.00019106835155816395, + "loss": 1.1039, + "step": 3737 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804541814983532, + "learning_rate": 0.00019106181531192894, + "loss": 1.1296, + "step": 3738 + }, + { + "epoch": 0.36, + "grad_norm": 0.30659613447065276, + "learning_rate": 0.0001910552767868148, + "loss": 1.1612, + "step": 3739 + }, + { + "epoch": 0.36, + "grad_norm": 0.2891921914653932, + "learning_rate": 0.00019104873598298517, + "loss": 1.0669, + "step": 3740 + }, + { + "epoch": 0.36, + "grad_norm": 0.28558383598746206, + "learning_rate": 0.00019104219290060366, + "loss": 1.2211, + "step": 3741 + }, + { + "epoch": 0.36, + "grad_norm": 0.29494252698624446, + "learning_rate": 0.00019103564753983405, + "loss": 0.9863, + "step": 3742 + }, + { + "epoch": 0.36, + "grad_norm": 0.2556255730742099, + "learning_rate": 0.00019102909990084018, + "loss": 1.0618, + "step": 3743 + }, + { + "epoch": 0.36, + "grad_norm": 0.2506965201053014, + "learning_rate": 0.00019102254998378584, + "loss": 0.9738, + "step": 3744 + }, + { + "epoch": 0.36, + "grad_norm": 0.29913681320353497, + "learning_rate": 0.00019101599778883498, + "loss": 1.0876, + "step": 3745 + }, + { + "epoch": 0.36, + "grad_norm": 0.2773312706953958, + "learning_rate": 0.0001910094433161516, + "loss": 1.0741, + "step": 3746 + }, + { + "epoch": 0.36, + "grad_norm": 0.2917033659607459, + "learning_rate": 0.0001910028865658997, + "loss": 1.1748, + "step": 3747 + }, + { + "epoch": 0.36, + "grad_norm": 0.2296887419120594, + "learning_rate": 0.00019099632753824335, + "loss": 0.9938, + "step": 3748 + }, + { + "epoch": 0.36, + "grad_norm": 0.2704584908164929, + "learning_rate": 0.00019098976623334673, + "loss": 1.0166, + "step": 3749 + }, + { + "epoch": 0.36, + "grad_norm": 0.21917669304416104, + "learning_rate": 0.00019098320265137402, + "loss": 1.0756, + "step": 3750 + }, + { + "epoch": 0.36, + "grad_norm": 0.26280707594546593, + "learning_rate": 0.0001909766367924895, + "loss": 1.2615, + "step": 3751 + }, + { + "epoch": 0.36, + "grad_norm": 0.29377384953319624, + "learning_rate": 0.00019097006865685743, + "loss": 1.0886, + "step": 3752 + }, + { + "epoch": 0.36, + "grad_norm": 0.23301855169248875, + "learning_rate": 0.00019096349824464226, + "loss": 0.9808, + "step": 3753 + }, + { + "epoch": 0.36, + "grad_norm": 0.253520755622324, + "learning_rate": 0.00019095692555600832, + "loss": 1.1587, + "step": 3754 + }, + { + "epoch": 0.36, + "grad_norm": 0.2754796183392769, + "learning_rate": 0.00019095035059112024, + "loss": 1.1064, + "step": 3755 + }, + { + "epoch": 0.36, + "grad_norm": 0.2747255493731151, + "learning_rate": 0.00019094377335014242, + "loss": 1.055, + "step": 3756 + }, + { + "epoch": 0.36, + "grad_norm": 0.28773064135087933, + "learning_rate": 0.00019093719383323952, + "loss": 1.1881, + "step": 3757 + }, + { + "epoch": 0.36, + "grad_norm": 0.2806691047427897, + "learning_rate": 0.0001909306120405762, + "loss": 1.1801, + "step": 3758 + }, + { + "epoch": 0.36, + "grad_norm": 0.3132693357401042, + "learning_rate": 0.00019092402797231715, + "loss": 1.0493, + "step": 3759 + }, + { + "epoch": 0.36, + "grad_norm": 0.25209467298228705, + "learning_rate": 0.00019091744162862717, + "loss": 1.03, + "step": 3760 + }, + { + "epoch": 0.36, + "grad_norm": 0.31060570827727507, + "learning_rate": 0.0001909108530096711, + "loss": 1.21, + "step": 3761 + }, + { + "epoch": 0.36, + "grad_norm": 0.25830756717657666, + "learning_rate": 0.00019090426211561376, + "loss": 1.0167, + "step": 3762 + }, + { + "epoch": 0.36, + "grad_norm": 0.2535702002191736, + "learning_rate": 0.00019089766894662014, + "loss": 1.0515, + "step": 3763 + }, + { + "epoch": 0.36, + "grad_norm": 0.278969068298096, + "learning_rate": 0.00019089107350285522, + "loss": 1.1331, + "step": 3764 + }, + { + "epoch": 0.36, + "grad_norm": 0.2807324463593699, + "learning_rate": 0.00019088447578448407, + "loss": 1.0715, + "step": 3765 + }, + { + "epoch": 0.36, + "grad_norm": 0.28955522038757464, + "learning_rate": 0.0001908778757916718, + "loss": 1.0668, + "step": 3766 + }, + { + "epoch": 0.36, + "grad_norm": 0.2595700503045936, + "learning_rate": 0.00019087127352458358, + "loss": 1.1446, + "step": 3767 + }, + { + "epoch": 0.36, + "grad_norm": 0.28444433757950754, + "learning_rate": 0.00019086466898338462, + "loss": 1.0789, + "step": 3768 + }, + { + "epoch": 0.36, + "grad_norm": 0.2787597254903445, + "learning_rate": 0.00019085806216824017, + "loss": 0.9789, + "step": 3769 + }, + { + "epoch": 0.36, + "grad_norm": 0.2941116656712117, + "learning_rate": 0.0001908514530793157, + "loss": 1.0751, + "step": 3770 + }, + { + "epoch": 0.36, + "grad_norm": 0.22941662757128714, + "learning_rate": 0.00019084484171677646, + "loss": 1.1191, + "step": 3771 + }, + { + "epoch": 0.36, + "grad_norm": 0.24999713030985823, + "learning_rate": 0.00019083822808078798, + "loss": 1.1661, + "step": 3772 + }, + { + "epoch": 0.36, + "grad_norm": 0.28806317138126275, + "learning_rate": 0.00019083161217151574, + "loss": 1.138, + "step": 3773 + }, + { + "epoch": 0.36, + "grad_norm": 0.26401411481347825, + "learning_rate": 0.00019082499398912533, + "loss": 1.0933, + "step": 3774 + }, + { + "epoch": 0.36, + "grad_norm": 0.27780628237715865, + "learning_rate": 0.00019081837353378237, + "loss": 1.1479, + "step": 3775 + }, + { + "epoch": 0.36, + "grad_norm": 0.2567041443182185, + "learning_rate": 0.00019081175080565253, + "loss": 0.9848, + "step": 3776 + }, + { + "epoch": 0.36, + "grad_norm": 0.33134929043752975, + "learning_rate": 0.00019080512580490154, + "loss": 1.005, + "step": 3777 + }, + { + "epoch": 0.36, + "grad_norm": 0.25790224327167643, + "learning_rate": 0.0001907984985316952, + "loss": 1.063, + "step": 3778 + }, + { + "epoch": 0.36, + "grad_norm": 0.2542673197030738, + "learning_rate": 0.0001907918689861994, + "loss": 1.1054, + "step": 3779 + }, + { + "epoch": 0.36, + "grad_norm": 0.29841436435986246, + "learning_rate": 0.00019078523716858, + "loss": 1.0715, + "step": 3780 + }, + { + "epoch": 0.36, + "grad_norm": 0.2607650355473665, + "learning_rate": 0.000190778603079003, + "loss": 1.0562, + "step": 3781 + }, + { + "epoch": 0.36, + "grad_norm": 0.27320429258836315, + "learning_rate": 0.00019077196671763436, + "loss": 1.0277, + "step": 3782 + }, + { + "epoch": 0.36, + "grad_norm": 0.2548164654391468, + "learning_rate": 0.00019076532808464026, + "loss": 1.0822, + "step": 3783 + }, + { + "epoch": 0.36, + "grad_norm": 0.3085428982977869, + "learning_rate": 0.00019075868718018677, + "loss": 1.1456, + "step": 3784 + }, + { + "epoch": 0.36, + "grad_norm": 0.31897081017855045, + "learning_rate": 0.0001907520440044401, + "loss": 1.0907, + "step": 3785 + }, + { + "epoch": 0.36, + "grad_norm": 0.2491436372725657, + "learning_rate": 0.00019074539855756646, + "loss": 1.0872, + "step": 3786 + }, + { + "epoch": 0.36, + "grad_norm": 0.2844533490165419, + "learning_rate": 0.00019073875083973222, + "loss": 1.0052, + "step": 3787 + }, + { + "epoch": 0.36, + "grad_norm": 0.3018741634697304, + "learning_rate": 0.0001907321008511037, + "loss": 0.9965, + "step": 3788 + }, + { + "epoch": 0.36, + "grad_norm": 0.2423698565672897, + "learning_rate": 0.0001907254485918473, + "loss": 1.0812, + "step": 3789 + }, + { + "epoch": 0.36, + "grad_norm": 0.2975822751648389, + "learning_rate": 0.0001907187940621296, + "loss": 0.9894, + "step": 3790 + }, + { + "epoch": 0.36, + "grad_norm": 0.29200730112195994, + "learning_rate": 0.000190712137262117, + "loss": 1.0714, + "step": 3791 + }, + { + "epoch": 0.36, + "grad_norm": 0.28370419210343656, + "learning_rate": 0.0001907054781919762, + "loss": 1.077, + "step": 3792 + }, + { + "epoch": 0.36, + "grad_norm": 0.2722830323949, + "learning_rate": 0.0001906988168518738, + "loss": 1.0713, + "step": 3793 + }, + { + "epoch": 0.36, + "grad_norm": 0.24874541049005577, + "learning_rate": 0.00019069215324197646, + "loss": 1.0738, + "step": 3794 + }, + { + "epoch": 0.36, + "grad_norm": 0.28546406332135904, + "learning_rate": 0.00019068548736245102, + "loss": 1.1608, + "step": 3795 + }, + { + "epoch": 0.36, + "grad_norm": 0.29760842933931714, + "learning_rate": 0.00019067881921346427, + "loss": 1.0152, + "step": 3796 + }, + { + "epoch": 0.36, + "grad_norm": 0.2656364460449629, + "learning_rate": 0.00019067214879518306, + "loss": 1.1331, + "step": 3797 + }, + { + "epoch": 0.36, + "grad_norm": 0.2807658825912433, + "learning_rate": 0.00019066547610777437, + "loss": 1.0905, + "step": 3798 + }, + { + "epoch": 0.36, + "grad_norm": 0.2821813716967829, + "learning_rate": 0.00019065880115140513, + "loss": 1.0497, + "step": 3799 + }, + { + "epoch": 0.36, + "grad_norm": 0.26730901537371593, + "learning_rate": 0.00019065212392624243, + "loss": 1.0252, + "step": 3800 + }, + { + "epoch": 0.36, + "grad_norm": 0.26067574808177124, + "learning_rate": 0.00019064544443245335, + "loss": 0.9627, + "step": 3801 + }, + { + "epoch": 0.36, + "grad_norm": 0.26226327026433305, + "learning_rate": 0.00019063876267020507, + "loss": 1.0757, + "step": 3802 + }, + { + "epoch": 0.36, + "grad_norm": 0.2804595303992103, + "learning_rate": 0.00019063207863966478, + "loss": 1.1859, + "step": 3803 + }, + { + "epoch": 0.36, + "grad_norm": 0.265318805788959, + "learning_rate": 0.00019062539234099973, + "loss": 1.0262, + "step": 3804 + }, + { + "epoch": 0.36, + "grad_norm": 0.2530307087999173, + "learning_rate": 0.00019061870377437733, + "loss": 1.0954, + "step": 3805 + }, + { + "epoch": 0.36, + "grad_norm": 0.2734697145712888, + "learning_rate": 0.00019061201293996488, + "loss": 1.0704, + "step": 3806 + }, + { + "epoch": 0.36, + "grad_norm": 0.24462035828995993, + "learning_rate": 0.00019060531983792987, + "loss": 1.1139, + "step": 3807 + }, + { + "epoch": 0.36, + "grad_norm": 0.2894517260409347, + "learning_rate": 0.00019059862446843982, + "loss": 1.0643, + "step": 3808 + }, + { + "epoch": 0.36, + "grad_norm": 0.28221499051147075, + "learning_rate": 0.00019059192683166222, + "loss": 1.0379, + "step": 3809 + }, + { + "epoch": 0.36, + "grad_norm": 0.2596597115877193, + "learning_rate": 0.00019058522692776473, + "loss": 1.1256, + "step": 3810 + }, + { + "epoch": 0.36, + "grad_norm": 0.26056578947550624, + "learning_rate": 0.00019057852475691498, + "loss": 1.002, + "step": 3811 + }, + { + "epoch": 0.36, + "grad_norm": 0.27913230085219015, + "learning_rate": 0.00019057182031928074, + "loss": 1.1348, + "step": 3812 + }, + { + "epoch": 0.36, + "grad_norm": 0.27929429057142885, + "learning_rate": 0.00019056511361502975, + "loss": 1.1987, + "step": 3813 + }, + { + "epoch": 0.36, + "grad_norm": 0.235530597825584, + "learning_rate": 0.0001905584046443299, + "loss": 1.1361, + "step": 3814 + }, + { + "epoch": 0.36, + "grad_norm": 0.2522023187837088, + "learning_rate": 0.00019055169340734908, + "loss": 1.0713, + "step": 3815 + }, + { + "epoch": 0.37, + "grad_norm": 0.280427344983831, + "learning_rate": 0.0001905449799042552, + "loss": 1.0557, + "step": 3816 + }, + { + "epoch": 0.37, + "grad_norm": 0.2969054368145568, + "learning_rate": 0.0001905382641352163, + "loss": 1.1357, + "step": 3817 + }, + { + "epoch": 0.37, + "grad_norm": 0.2440950945037354, + "learning_rate": 0.00019053154610040044, + "loss": 0.9862, + "step": 3818 + }, + { + "epoch": 0.37, + "grad_norm": 0.26826804773960794, + "learning_rate": 0.0001905248257999757, + "loss": 1.1239, + "step": 3819 + }, + { + "epoch": 0.37, + "grad_norm": 0.2505950276337422, + "learning_rate": 0.00019051810323411034, + "loss": 1.0886, + "step": 3820 + }, + { + "epoch": 0.37, + "grad_norm": 0.27676795581151226, + "learning_rate": 0.00019051137840297256, + "loss": 1.1414, + "step": 3821 + }, + { + "epoch": 0.37, + "grad_norm": 0.2571438975717034, + "learning_rate": 0.00019050465130673067, + "loss": 1.0289, + "step": 3822 + }, + { + "epoch": 0.37, + "grad_norm": 0.25111218618944997, + "learning_rate": 0.00019049792194555294, + "loss": 0.9651, + "step": 3823 + }, + { + "epoch": 0.37, + "grad_norm": 0.2610987856579739, + "learning_rate": 0.00019049119031960788, + "loss": 1.1635, + "step": 3824 + }, + { + "epoch": 0.37, + "grad_norm": 0.2432839279077218, + "learning_rate": 0.00019048445642906388, + "loss": 1.1106, + "step": 3825 + }, + { + "epoch": 0.37, + "grad_norm": 0.25411198588402173, + "learning_rate": 0.00019047772027408954, + "loss": 1.0766, + "step": 3826 + }, + { + "epoch": 0.37, + "grad_norm": 0.2611685254036077, + "learning_rate": 0.00019047098185485335, + "loss": 1.0616, + "step": 3827 + }, + { + "epoch": 0.37, + "grad_norm": 0.2486133581840728, + "learning_rate": 0.00019046424117152402, + "loss": 0.9964, + "step": 3828 + }, + { + "epoch": 0.37, + "grad_norm": 0.3298032342837613, + "learning_rate": 0.00019045749822427016, + "loss": 0.9625, + "step": 3829 + }, + { + "epoch": 0.37, + "grad_norm": 0.29292144210717325, + "learning_rate": 0.00019045075301326057, + "loss": 1.0938, + "step": 3830 + }, + { + "epoch": 0.37, + "grad_norm": 0.2906727228238908, + "learning_rate": 0.00019044400553866405, + "loss": 1.0653, + "step": 3831 + }, + { + "epoch": 0.37, + "grad_norm": 0.25691594404144225, + "learning_rate": 0.00019043725580064939, + "loss": 1.0817, + "step": 3832 + }, + { + "epoch": 0.37, + "grad_norm": 0.2787203095759583, + "learning_rate": 0.00019043050379938565, + "loss": 1.0643, + "step": 3833 + }, + { + "epoch": 0.37, + "grad_norm": 0.26242429857133936, + "learning_rate": 0.00019042374953504165, + "loss": 1.0383, + "step": 3834 + }, + { + "epoch": 0.37, + "grad_norm": 0.2750170989226645, + "learning_rate": 0.00019041699300778654, + "loss": 1.0113, + "step": 3835 + }, + { + "epoch": 0.37, + "grad_norm": 0.24689367063544376, + "learning_rate": 0.00019041023421778933, + "loss": 1.0589, + "step": 3836 + }, + { + "epoch": 0.37, + "grad_norm": 0.3101949979537771, + "learning_rate": 0.0001904034731652192, + "loss": 1.0844, + "step": 3837 + }, + { + "epoch": 0.37, + "grad_norm": 0.3026108279779279, + "learning_rate": 0.00019039670985024533, + "loss": 0.9748, + "step": 3838 + }, + { + "epoch": 0.37, + "grad_norm": 0.28436273819827806, + "learning_rate": 0.00019038994427303697, + "loss": 1.13, + "step": 3839 + }, + { + "epoch": 0.37, + "grad_norm": 0.26672874828160903, + "learning_rate": 0.00019038317643376346, + "loss": 0.9425, + "step": 3840 + }, + { + "epoch": 0.37, + "grad_norm": 0.2551937537263912, + "learning_rate": 0.00019037640633259417, + "loss": 1.1193, + "step": 3841 + }, + { + "epoch": 0.37, + "grad_norm": 0.2421616493910976, + "learning_rate": 0.00019036963396969848, + "loss": 1.1925, + "step": 3842 + }, + { + "epoch": 0.37, + "grad_norm": 0.27403481599431856, + "learning_rate": 0.00019036285934524594, + "loss": 0.9328, + "step": 3843 + }, + { + "epoch": 0.37, + "grad_norm": 0.2736343566787731, + "learning_rate": 0.00019035608245940603, + "loss": 1.1229, + "step": 3844 + }, + { + "epoch": 0.37, + "grad_norm": 0.2984644122142435, + "learning_rate": 0.00019034930331234836, + "loss": 1.0706, + "step": 3845 + }, + { + "epoch": 0.37, + "grad_norm": 0.25600173944542287, + "learning_rate": 0.00019034252190424264, + "loss": 1.1189, + "step": 3846 + }, + { + "epoch": 0.37, + "grad_norm": 0.27750211746463055, + "learning_rate": 0.0001903357382352585, + "loss": 1.2051, + "step": 3847 + }, + { + "epoch": 0.37, + "grad_norm": 0.308006852697485, + "learning_rate": 0.00019032895230556573, + "loss": 1.1935, + "step": 3848 + }, + { + "epoch": 0.37, + "grad_norm": 0.2606236441816338, + "learning_rate": 0.00019032216411533415, + "loss": 1.1713, + "step": 3849 + }, + { + "epoch": 0.37, + "grad_norm": 0.28436108514736663, + "learning_rate": 0.00019031537366473369, + "loss": 1.0714, + "step": 3850 + }, + { + "epoch": 0.37, + "grad_norm": 0.3140913304294229, + "learning_rate": 0.00019030858095393422, + "loss": 1.109, + "step": 3851 + }, + { + "epoch": 0.37, + "grad_norm": 0.27820186741488906, + "learning_rate": 0.00019030178598310573, + "loss": 1.1456, + "step": 3852 + }, + { + "epoch": 0.37, + "grad_norm": 0.25313171010488306, + "learning_rate": 0.00019029498875241832, + "loss": 1.0996, + "step": 3853 + }, + { + "epoch": 0.37, + "grad_norm": 0.26632647868200215, + "learning_rate": 0.00019028818926204207, + "loss": 1.0288, + "step": 3854 + }, + { + "epoch": 0.37, + "grad_norm": 0.2802151300023979, + "learning_rate": 0.00019028138751214714, + "loss": 1.1178, + "step": 3855 + }, + { + "epoch": 0.37, + "grad_norm": 0.28551319793457147, + "learning_rate": 0.00019027458350290375, + "loss": 1.1742, + "step": 3856 + }, + { + "epoch": 0.37, + "grad_norm": 0.29354399099059836, + "learning_rate": 0.00019026777723448214, + "loss": 1.1434, + "step": 3857 + }, + { + "epoch": 0.37, + "grad_norm": 0.28766227685882445, + "learning_rate": 0.00019026096870705274, + "loss": 1.1227, + "step": 3858 + }, + { + "epoch": 0.37, + "grad_norm": 0.25123810673486535, + "learning_rate": 0.0001902541579207858, + "loss": 1.0514, + "step": 3859 + }, + { + "epoch": 0.37, + "grad_norm": 0.3035800237589507, + "learning_rate": 0.00019024734487585186, + "loss": 1.0958, + "step": 3860 + }, + { + "epoch": 0.37, + "grad_norm": 0.24205745440797985, + "learning_rate": 0.0001902405295724214, + "loss": 1.1692, + "step": 3861 + }, + { + "epoch": 0.37, + "grad_norm": 0.27865447448135033, + "learning_rate": 0.00019023371201066497, + "loss": 1.0144, + "step": 3862 + }, + { + "epoch": 0.37, + "grad_norm": 0.2660377758295032, + "learning_rate": 0.0001902268921907532, + "loss": 0.9894, + "step": 3863 + }, + { + "epoch": 0.37, + "grad_norm": 0.2726012616002911, + "learning_rate": 0.00019022007011285674, + "loss": 1.0454, + "step": 3864 + }, + { + "epoch": 0.37, + "grad_norm": 0.2726846785840144, + "learning_rate": 0.0001902132457771463, + "loss": 1.0598, + "step": 3865 + }, + { + "epoch": 0.37, + "grad_norm": 0.25701844846319727, + "learning_rate": 0.0001902064191837927, + "loss": 1.1679, + "step": 3866 + }, + { + "epoch": 0.37, + "grad_norm": 0.2921405168078913, + "learning_rate": 0.00019019959033296678, + "loss": 1.0524, + "step": 3867 + }, + { + "epoch": 0.37, + "grad_norm": 0.27129522822886254, + "learning_rate": 0.00019019275922483943, + "loss": 1.105, + "step": 3868 + }, + { + "epoch": 0.37, + "grad_norm": 0.2718161044722209, + "learning_rate": 0.0001901859258595816, + "loss": 1.0613, + "step": 3869 + }, + { + "epoch": 0.37, + "grad_norm": 0.2536896991586868, + "learning_rate": 0.00019017909023736428, + "loss": 0.9996, + "step": 3870 + }, + { + "epoch": 0.37, + "grad_norm": 0.29175405932195664, + "learning_rate": 0.00019017225235835853, + "loss": 1.1386, + "step": 3871 + }, + { + "epoch": 0.37, + "grad_norm": 0.30144877670952286, + "learning_rate": 0.00019016541222273553, + "loss": 1.1468, + "step": 3872 + }, + { + "epoch": 0.37, + "grad_norm": 0.2931426576366692, + "learning_rate": 0.00019015856983066644, + "loss": 1.1721, + "step": 3873 + }, + { + "epoch": 0.37, + "grad_norm": 0.2916770600223048, + "learning_rate": 0.00019015172518232242, + "loss": 1.0945, + "step": 3874 + }, + { + "epoch": 0.37, + "grad_norm": 0.27166976489578, + "learning_rate": 0.00019014487827787483, + "loss": 1.0891, + "step": 3875 + }, + { + "epoch": 0.37, + "grad_norm": 0.2839705782430795, + "learning_rate": 0.00019013802911749505, + "loss": 1.1197, + "step": 3876 + }, + { + "epoch": 0.37, + "grad_norm": 0.298375581966575, + "learning_rate": 0.0001901311777013544, + "loss": 1.0689, + "step": 3877 + }, + { + "epoch": 0.37, + "grad_norm": 0.3013484850412901, + "learning_rate": 0.0001901243240296244, + "loss": 1.0352, + "step": 3878 + }, + { + "epoch": 0.37, + "grad_norm": 0.25624648167577585, + "learning_rate": 0.00019011746810247658, + "loss": 1.0749, + "step": 3879 + }, + { + "epoch": 0.37, + "grad_norm": 0.25490116870005675, + "learning_rate": 0.00019011060992008244, + "loss": 1.1551, + "step": 3880 + }, + { + "epoch": 0.37, + "grad_norm": 0.28069672688477176, + "learning_rate": 0.00019010374948261367, + "loss": 0.9975, + "step": 3881 + }, + { + "epoch": 0.37, + "grad_norm": 0.2651997155968314, + "learning_rate": 0.0001900968867902419, + "loss": 1.1731, + "step": 3882 + }, + { + "epoch": 0.37, + "grad_norm": 0.3115313345415178, + "learning_rate": 0.00019009002184313897, + "loss": 1.1077, + "step": 3883 + }, + { + "epoch": 0.37, + "grad_norm": 0.2775305592186655, + "learning_rate": 0.00019008315464147662, + "loss": 1.1736, + "step": 3884 + }, + { + "epoch": 0.37, + "grad_norm": 0.28786278339401167, + "learning_rate": 0.0001900762851854267, + "loss": 1.0521, + "step": 3885 + }, + { + "epoch": 0.37, + "grad_norm": 0.23797474326903245, + "learning_rate": 0.0001900694134751611, + "loss": 1.1405, + "step": 3886 + }, + { + "epoch": 0.37, + "grad_norm": 0.2605654562908968, + "learning_rate": 0.00019006253951085186, + "loss": 1.029, + "step": 3887 + }, + { + "epoch": 0.37, + "grad_norm": 0.25694650291010046, + "learning_rate": 0.00019005566329267096, + "loss": 1.0665, + "step": 3888 + }, + { + "epoch": 0.37, + "grad_norm": 0.26716631196903606, + "learning_rate": 0.0001900487848207905, + "loss": 1.1494, + "step": 3889 + }, + { + "epoch": 0.37, + "grad_norm": 0.3116260242272765, + "learning_rate": 0.00019004190409538255, + "loss": 1.117, + "step": 3890 + }, + { + "epoch": 0.37, + "grad_norm": 0.26774746733662874, + "learning_rate": 0.00019003502111661943, + "loss": 1.1987, + "step": 3891 + }, + { + "epoch": 0.37, + "grad_norm": 0.25363750120087714, + "learning_rate": 0.0001900281358846733, + "loss": 1.0831, + "step": 3892 + }, + { + "epoch": 0.37, + "grad_norm": 0.28339718118220925, + "learning_rate": 0.00019002124839971647, + "loss": 1.1161, + "step": 3893 + }, + { + "epoch": 0.37, + "grad_norm": 0.2541180184606548, + "learning_rate": 0.00019001435866192133, + "loss": 1.037, + "step": 3894 + }, + { + "epoch": 0.37, + "grad_norm": 0.2662101976131836, + "learning_rate": 0.0001900074666714603, + "loss": 1.0495, + "step": 3895 + }, + { + "epoch": 0.37, + "grad_norm": 0.24244962084116864, + "learning_rate": 0.00019000057242850584, + "loss": 1.143, + "step": 3896 + }, + { + "epoch": 0.37, + "grad_norm": 0.2815830104446293, + "learning_rate": 0.00018999367593323048, + "loss": 1.1087, + "step": 3897 + }, + { + "epoch": 0.37, + "grad_norm": 0.2661433168734344, + "learning_rate": 0.00018998677718580687, + "loss": 1.0699, + "step": 3898 + }, + { + "epoch": 0.37, + "grad_norm": 0.278452400141501, + "learning_rate": 0.00018997987618640756, + "loss": 0.9984, + "step": 3899 + }, + { + "epoch": 0.37, + "grad_norm": 0.2950352883266951, + "learning_rate": 0.00018997297293520533, + "loss": 1.0725, + "step": 3900 + }, + { + "epoch": 0.37, + "grad_norm": 0.24855309318205562, + "learning_rate": 0.00018996606743237288, + "loss": 1.1091, + "step": 3901 + }, + { + "epoch": 0.37, + "grad_norm": 0.26157079774866104, + "learning_rate": 0.00018995915967808305, + "loss": 1.137, + "step": 3902 + }, + { + "epoch": 0.37, + "grad_norm": 0.2817964268191354, + "learning_rate": 0.00018995224967250873, + "loss": 1.0887, + "step": 3903 + }, + { + "epoch": 0.37, + "grad_norm": 0.303055127418326, + "learning_rate": 0.00018994533741582283, + "loss": 1.0509, + "step": 3904 + }, + { + "epoch": 0.37, + "grad_norm": 0.29319713632683003, + "learning_rate": 0.00018993842290819833, + "loss": 1.1527, + "step": 3905 + }, + { + "epoch": 0.37, + "grad_norm": 0.28460911802279176, + "learning_rate": 0.00018993150614980824, + "loss": 1.1663, + "step": 3906 + }, + { + "epoch": 0.37, + "grad_norm": 0.24323458888587937, + "learning_rate": 0.00018992458714082574, + "loss": 1.107, + "step": 3907 + }, + { + "epoch": 0.37, + "grad_norm": 0.2507473651775092, + "learning_rate": 0.0001899176658814239, + "loss": 1.1291, + "step": 3908 + }, + { + "epoch": 0.37, + "grad_norm": 0.26851442265034603, + "learning_rate": 0.00018991074237177595, + "loss": 1.0296, + "step": 3909 + }, + { + "epoch": 0.37, + "grad_norm": 0.27910401854051703, + "learning_rate": 0.0001899038166120552, + "loss": 1.0243, + "step": 3910 + }, + { + "epoch": 0.37, + "grad_norm": 0.2643335688729103, + "learning_rate": 0.0001898968886024349, + "loss": 1.1176, + "step": 3911 + }, + { + "epoch": 0.37, + "grad_norm": 0.2845122162938141, + "learning_rate": 0.0001898899583430885, + "loss": 1.1254, + "step": 3912 + }, + { + "epoch": 0.37, + "grad_norm": 0.2631105012541486, + "learning_rate": 0.00018988302583418937, + "loss": 1.0436, + "step": 3913 + }, + { + "epoch": 0.37, + "grad_norm": 0.26527130872751753, + "learning_rate": 0.00018987609107591104, + "loss": 1.0063, + "step": 3914 + }, + { + "epoch": 0.37, + "grad_norm": 0.26612171965346043, + "learning_rate": 0.00018986915406842708, + "loss": 1.1018, + "step": 3915 + }, + { + "epoch": 0.37, + "grad_norm": 0.29895893771415827, + "learning_rate": 0.000189862214811911, + "loss": 1.0477, + "step": 3916 + }, + { + "epoch": 0.37, + "grad_norm": 0.298508391495162, + "learning_rate": 0.00018985527330653653, + "loss": 1.1202, + "step": 3917 + }, + { + "epoch": 0.37, + "grad_norm": 0.2862203670774759, + "learning_rate": 0.0001898483295524774, + "loss": 1.1676, + "step": 3918 + }, + { + "epoch": 0.37, + "grad_norm": 0.7409275636159137, + "learning_rate": 0.00018984138354990736, + "loss": 1.4645, + "step": 3919 + }, + { + "epoch": 0.38, + "grad_norm": 0.26519753642569927, + "learning_rate": 0.0001898344352990002, + "loss": 1.0743, + "step": 3920 + }, + { + "epoch": 0.38, + "grad_norm": 0.29786591251006633, + "learning_rate": 0.00018982748479992988, + "loss": 1.1278, + "step": 3921 + }, + { + "epoch": 0.38, + "grad_norm": 0.278707089601725, + "learning_rate": 0.00018982053205287024, + "loss": 1.0982, + "step": 3922 + }, + { + "epoch": 0.38, + "grad_norm": 0.233468164464521, + "learning_rate": 0.00018981357705799538, + "loss": 1.0338, + "step": 3923 + }, + { + "epoch": 0.38, + "grad_norm": 0.32242913731155076, + "learning_rate": 0.0001898066198154793, + "loss": 1.1282, + "step": 3924 + }, + { + "epoch": 0.38, + "grad_norm": 0.2712636448427821, + "learning_rate": 0.00018979966032549612, + "loss": 1.0868, + "step": 3925 + }, + { + "epoch": 0.38, + "grad_norm": 0.27812674663111897, + "learning_rate": 0.00018979269858822, + "loss": 0.9507, + "step": 3926 + }, + { + "epoch": 0.38, + "grad_norm": 0.31036023474014135, + "learning_rate": 0.00018978573460382516, + "loss": 1.066, + "step": 3927 + }, + { + "epoch": 0.38, + "grad_norm": 0.2741036928468329, + "learning_rate": 0.00018977876837248587, + "loss": 1.114, + "step": 3928 + }, + { + "epoch": 0.38, + "grad_norm": 0.32960721822124844, + "learning_rate": 0.0001897717998943765, + "loss": 0.9916, + "step": 3929 + }, + { + "epoch": 0.38, + "grad_norm": 0.25938355281972847, + "learning_rate": 0.0001897648291696714, + "loss": 1.0566, + "step": 3930 + }, + { + "epoch": 0.38, + "grad_norm": 0.27023081057739146, + "learning_rate": 0.00018975785619854504, + "loss": 1.1365, + "step": 3931 + }, + { + "epoch": 0.38, + "grad_norm": 0.30667049605255553, + "learning_rate": 0.00018975088098117194, + "loss": 1.0995, + "step": 3932 + }, + { + "epoch": 0.38, + "grad_norm": 0.24394315516483825, + "learning_rate": 0.00018974390351772665, + "loss": 1.0412, + "step": 3933 + }, + { + "epoch": 0.38, + "grad_norm": 0.2801905314214369, + "learning_rate": 0.00018973692380838371, + "loss": 1.0848, + "step": 3934 + }, + { + "epoch": 0.38, + "grad_norm": 0.2381280727680932, + "learning_rate": 0.00018972994185331788, + "loss": 1.0596, + "step": 3935 + }, + { + "epoch": 0.38, + "grad_norm": 0.23812773003536764, + "learning_rate": 0.00018972295765270388, + "loss": 1.0666, + "step": 3936 + }, + { + "epoch": 0.38, + "grad_norm": 0.32096602227564136, + "learning_rate": 0.00018971597120671647, + "loss": 1.0948, + "step": 3937 + }, + { + "epoch": 0.38, + "grad_norm": 0.26277005671959386, + "learning_rate": 0.0001897089825155305, + "loss": 0.992, + "step": 3938 + }, + { + "epoch": 0.38, + "grad_norm": 0.25895544694478345, + "learning_rate": 0.00018970199157932084, + "loss": 1.0121, + "step": 3939 + }, + { + "epoch": 0.38, + "grad_norm": 0.33276374338059417, + "learning_rate": 0.0001896949983982625, + "loss": 1.1811, + "step": 3940 + }, + { + "epoch": 0.38, + "grad_norm": 0.2899952697366727, + "learning_rate": 0.00018968800297253043, + "loss": 1.0459, + "step": 3941 + }, + { + "epoch": 0.38, + "grad_norm": 0.28332378815166603, + "learning_rate": 0.0001896810053022997, + "loss": 1.1406, + "step": 3942 + }, + { + "epoch": 0.38, + "grad_norm": 0.2563768370833164, + "learning_rate": 0.00018967400538774548, + "loss": 1.0918, + "step": 3943 + }, + { + "epoch": 0.38, + "grad_norm": 0.2527720587986432, + "learning_rate": 0.00018966700322904293, + "loss": 1.0922, + "step": 3944 + }, + { + "epoch": 0.38, + "grad_norm": 0.25592462221706, + "learning_rate": 0.00018965999882636725, + "loss": 1.0781, + "step": 3945 + }, + { + "epoch": 0.38, + "grad_norm": 0.3089279882540109, + "learning_rate": 0.00018965299217989375, + "loss": 1.1559, + "step": 3946 + }, + { + "epoch": 0.38, + "grad_norm": 0.25525835490586896, + "learning_rate": 0.00018964598328979776, + "loss": 1.1349, + "step": 3947 + }, + { + "epoch": 0.38, + "grad_norm": 0.2841608784955134, + "learning_rate": 0.00018963897215625472, + "loss": 0.984, + "step": 3948 + }, + { + "epoch": 0.38, + "grad_norm": 0.272016327051106, + "learning_rate": 0.00018963195877944007, + "loss": 1.058, + "step": 3949 + }, + { + "epoch": 0.38, + "grad_norm": 0.3037390334949688, + "learning_rate": 0.0001896249431595293, + "loss": 1.0846, + "step": 3950 + }, + { + "epoch": 0.38, + "grad_norm": 0.2497613489471199, + "learning_rate": 0.000189617925296698, + "loss": 1.0793, + "step": 3951 + }, + { + "epoch": 0.38, + "grad_norm": 0.27852691608884916, + "learning_rate": 0.00018961090519112182, + "loss": 1.1262, + "step": 3952 + }, + { + "epoch": 0.38, + "grad_norm": 0.2844330777819289, + "learning_rate": 0.0001896038828429764, + "loss": 1.0902, + "step": 3953 + }, + { + "epoch": 0.38, + "grad_norm": 0.30372667850264373, + "learning_rate": 0.0001895968582524375, + "loss": 0.9642, + "step": 3954 + }, + { + "epoch": 0.38, + "grad_norm": 0.24532996718177777, + "learning_rate": 0.00018958983141968095, + "loss": 0.9768, + "step": 3955 + }, + { + "epoch": 0.38, + "grad_norm": 0.27188695012581215, + "learning_rate": 0.0001895828023448825, + "loss": 1.0809, + "step": 3956 + }, + { + "epoch": 0.38, + "grad_norm": 0.28620857758847607, + "learning_rate": 0.00018957577102821817, + "loss": 1.1338, + "step": 3957 + }, + { + "epoch": 0.38, + "grad_norm": 0.2967269096451259, + "learning_rate": 0.00018956873746986386, + "loss": 1.0498, + "step": 3958 + }, + { + "epoch": 0.38, + "grad_norm": 0.28197346617437485, + "learning_rate": 0.00018956170166999558, + "loss": 1.1001, + "step": 3959 + }, + { + "epoch": 0.38, + "grad_norm": 0.32447316500456774, + "learning_rate": 0.00018955466362878943, + "loss": 1.1346, + "step": 3960 + }, + { + "epoch": 0.38, + "grad_norm": 0.29206790559618895, + "learning_rate": 0.00018954762334642158, + "loss": 1.1531, + "step": 3961 + }, + { + "epoch": 0.38, + "grad_norm": 0.31059482758918283, + "learning_rate": 0.00018954058082306817, + "loss": 1.0824, + "step": 3962 + }, + { + "epoch": 0.38, + "grad_norm": 0.2599356809697041, + "learning_rate": 0.0001895335360589054, + "loss": 1.0651, + "step": 3963 + }, + { + "epoch": 0.38, + "grad_norm": 0.28568670159243, + "learning_rate": 0.00018952648905410966, + "loss": 1.111, + "step": 3964 + }, + { + "epoch": 0.38, + "grad_norm": 0.2741580155840068, + "learning_rate": 0.0001895194398088573, + "loss": 1.0367, + "step": 3965 + }, + { + "epoch": 0.38, + "grad_norm": 0.2460551214193779, + "learning_rate": 0.00018951238832332464, + "loss": 1.1306, + "step": 3966 + }, + { + "epoch": 0.38, + "grad_norm": 0.28164939664440597, + "learning_rate": 0.00018950533459768823, + "loss": 1.0736, + "step": 3967 + }, + { + "epoch": 0.38, + "grad_norm": 0.2935359891153058, + "learning_rate": 0.00018949827863212456, + "loss": 1.1287, + "step": 3968 + }, + { + "epoch": 0.38, + "grad_norm": 0.27127546853360357, + "learning_rate": 0.00018949122042681023, + "loss": 0.9919, + "step": 3969 + }, + { + "epoch": 0.38, + "grad_norm": 0.3153198801953517, + "learning_rate": 0.00018948415998192182, + "loss": 1.1561, + "step": 3970 + }, + { + "epoch": 0.38, + "grad_norm": 0.2639733592331646, + "learning_rate": 0.0001894770972976361, + "loss": 1.0033, + "step": 3971 + }, + { + "epoch": 0.38, + "grad_norm": 0.26313646288953435, + "learning_rate": 0.0001894700323741298, + "loss": 1.1798, + "step": 3972 + }, + { + "epoch": 0.38, + "grad_norm": 0.2711515260554288, + "learning_rate": 0.0001894629652115797, + "loss": 1.1126, + "step": 3973 + }, + { + "epoch": 0.38, + "grad_norm": 0.272492854127571, + "learning_rate": 0.0001894558958101627, + "loss": 0.9806, + "step": 3974 + }, + { + "epoch": 0.38, + "grad_norm": 0.27502865859424286, + "learning_rate": 0.00018944882417005565, + "loss": 1.088, + "step": 3975 + }, + { + "epoch": 0.38, + "grad_norm": 0.3343809998581711, + "learning_rate": 0.00018944175029143558, + "loss": 1.1771, + "step": 3976 + }, + { + "epoch": 0.38, + "grad_norm": 0.2894367965617339, + "learning_rate": 0.0001894346741744795, + "loss": 1.1597, + "step": 3977 + }, + { + "epoch": 0.38, + "grad_norm": 0.25286298932737533, + "learning_rate": 0.00018942759581936446, + "loss": 1.0403, + "step": 3978 + }, + { + "epoch": 0.38, + "grad_norm": 0.27494211784286493, + "learning_rate": 0.00018942051522626764, + "loss": 1.1405, + "step": 3979 + }, + { + "epoch": 0.38, + "grad_norm": 0.28818353355011783, + "learning_rate": 0.00018941343239536624, + "loss": 1.0295, + "step": 3980 + }, + { + "epoch": 0.38, + "grad_norm": 0.25536310488024094, + "learning_rate": 0.0001894063473268375, + "loss": 1.0547, + "step": 3981 + }, + { + "epoch": 0.38, + "grad_norm": 0.32569914313859244, + "learning_rate": 0.00018939926002085872, + "loss": 1.2414, + "step": 3982 + }, + { + "epoch": 0.38, + "grad_norm": 0.2869364080234295, + "learning_rate": 0.0001893921704776073, + "loss": 0.9819, + "step": 3983 + }, + { + "epoch": 0.38, + "grad_norm": 0.2657350930953723, + "learning_rate": 0.0001893850786972606, + "loss": 1.0408, + "step": 3984 + }, + { + "epoch": 0.38, + "grad_norm": 0.29775662133242436, + "learning_rate": 0.0001893779846799961, + "loss": 1.1624, + "step": 3985 + }, + { + "epoch": 0.38, + "grad_norm": 0.27355863349278053, + "learning_rate": 0.00018937088842599142, + "loss": 1.2827, + "step": 3986 + }, + { + "epoch": 0.38, + "grad_norm": 0.2732303129004138, + "learning_rate": 0.00018936378993542408, + "loss": 1.0976, + "step": 3987 + }, + { + "epoch": 0.38, + "grad_norm": 0.3021647282609973, + "learning_rate": 0.00018935668920847171, + "loss": 1.0696, + "step": 3988 + }, + { + "epoch": 0.38, + "grad_norm": 0.272227355179039, + "learning_rate": 0.00018934958624531207, + "loss": 1.0169, + "step": 3989 + }, + { + "epoch": 0.38, + "grad_norm": 0.2958029148879499, + "learning_rate": 0.00018934248104612283, + "loss": 1.1677, + "step": 3990 + }, + { + "epoch": 0.38, + "grad_norm": 0.2961165738395803, + "learning_rate": 0.00018933537361108188, + "loss": 1.1413, + "step": 3991 + }, + { + "epoch": 0.38, + "grad_norm": 0.3276060272777321, + "learning_rate": 0.00018932826394036707, + "loss": 1.1431, + "step": 3992 + }, + { + "epoch": 0.38, + "grad_norm": 0.24414858635640996, + "learning_rate": 0.00018932115203415631, + "loss": 0.9642, + "step": 3993 + }, + { + "epoch": 0.38, + "grad_norm": 0.24286855215872527, + "learning_rate": 0.0001893140378926276, + "loss": 1.0458, + "step": 3994 + }, + { + "epoch": 0.38, + "grad_norm": 0.2846207625112956, + "learning_rate": 0.0001893069215159589, + "loss": 1.1316, + "step": 3995 + }, + { + "epoch": 0.38, + "grad_norm": 0.28248248879909615, + "learning_rate": 0.00018929980290432842, + "loss": 1.1581, + "step": 3996 + }, + { + "epoch": 0.38, + "grad_norm": 0.27218202374232026, + "learning_rate": 0.00018929268205791422, + "loss": 1.2227, + "step": 3997 + }, + { + "epoch": 0.38, + "grad_norm": 0.26852560065398273, + "learning_rate": 0.00018928555897689456, + "loss": 0.9302, + "step": 3998 + }, + { + "epoch": 0.38, + "grad_norm": 0.2776665044902241, + "learning_rate": 0.00018927843366144765, + "loss": 1.0248, + "step": 3999 + }, + { + "epoch": 0.38, + "grad_norm": 0.27797522607330477, + "learning_rate": 0.00018927130611175183, + "loss": 1.166, + "step": 4000 + }, + { + "epoch": 0.38, + "grad_norm": 0.23621425637102375, + "learning_rate": 0.00018926417632798547, + "loss": 1.1106, + "step": 4001 + }, + { + "epoch": 0.38, + "grad_norm": 0.2926545007076596, + "learning_rate": 0.000189257044310327, + "loss": 1.1281, + "step": 4002 + }, + { + "epoch": 0.38, + "grad_norm": 0.27712375321038807, + "learning_rate": 0.00018924991005895493, + "loss": 1.0626, + "step": 4003 + }, + { + "epoch": 0.38, + "grad_norm": 0.231003959100629, + "learning_rate": 0.0001892427735740477, + "loss": 0.9582, + "step": 4004 + }, + { + "epoch": 0.38, + "grad_norm": 0.29692646912091747, + "learning_rate": 0.00018923563485578405, + "loss": 1.0728, + "step": 4005 + }, + { + "epoch": 0.38, + "grad_norm": 0.26325491425794023, + "learning_rate": 0.0001892284939043425, + "loss": 1.1163, + "step": 4006 + }, + { + "epoch": 0.38, + "grad_norm": 0.27243932891523137, + "learning_rate": 0.00018922135071990185, + "loss": 1.0374, + "step": 4007 + }, + { + "epoch": 0.38, + "grad_norm": 0.28198690740674986, + "learning_rate": 0.0001892142053026408, + "loss": 1.1425, + "step": 4008 + }, + { + "epoch": 0.38, + "grad_norm": 0.28509269273338117, + "learning_rate": 0.00018920705765273818, + "loss": 1.3779, + "step": 4009 + }, + { + "epoch": 0.38, + "grad_norm": 0.28126037107058494, + "learning_rate": 0.0001891999077703729, + "loss": 1.1616, + "step": 4010 + }, + { + "epoch": 0.38, + "grad_norm": 0.3282196814898794, + "learning_rate": 0.00018919275565572387, + "loss": 1.1184, + "step": 4011 + }, + { + "epoch": 0.38, + "grad_norm": 0.2796592489259632, + "learning_rate": 0.00018918560130897006, + "loss": 1.1493, + "step": 4012 + }, + { + "epoch": 0.38, + "grad_norm": 0.35705040515515696, + "learning_rate": 0.00018917844473029054, + "loss": 1.035, + "step": 4013 + }, + { + "epoch": 0.38, + "grad_norm": 0.26954678750551575, + "learning_rate": 0.00018917128591986439, + "loss": 1.1217, + "step": 4014 + }, + { + "epoch": 0.38, + "grad_norm": 0.268134645202258, + "learning_rate": 0.00018916412487787076, + "loss": 1.1415, + "step": 4015 + }, + { + "epoch": 0.38, + "grad_norm": 0.2869059780412037, + "learning_rate": 0.0001891569616044889, + "loss": 1.1142, + "step": 4016 + }, + { + "epoch": 0.38, + "grad_norm": 0.2866248425276922, + "learning_rate": 0.000189149796099898, + "loss": 0.968, + "step": 4017 + }, + { + "epoch": 0.38, + "grad_norm": 0.2854847382998254, + "learning_rate": 0.00018914262836427744, + "loss": 1.1804, + "step": 4018 + }, + { + "epoch": 0.38, + "grad_norm": 0.28804702455957, + "learning_rate": 0.00018913545839780658, + "loss": 0.9394, + "step": 4019 + }, + { + "epoch": 0.38, + "grad_norm": 0.26449776920625534, + "learning_rate": 0.00018912828620066486, + "loss": 1.1066, + "step": 4020 + }, + { + "epoch": 0.38, + "grad_norm": 0.2946737668152781, + "learning_rate": 0.00018912111177303177, + "loss": 1.2112, + "step": 4021 + }, + { + "epoch": 0.38, + "grad_norm": 0.2901645389196644, + "learning_rate": 0.00018911393511508685, + "loss": 1.039, + "step": 4022 + }, + { + "epoch": 0.38, + "grad_norm": 0.27790893974884995, + "learning_rate": 0.00018910675622700967, + "loss": 1.058, + "step": 4023 + }, + { + "epoch": 0.38, + "grad_norm": 0.27176601010684964, + "learning_rate": 0.00018909957510897992, + "loss": 1.1212, + "step": 4024 + }, + { + "epoch": 0.39, + "grad_norm": 0.27542708140447214, + "learning_rate": 0.00018909239176117732, + "loss": 1.0686, + "step": 4025 + }, + { + "epoch": 0.39, + "grad_norm": 0.27230675704373336, + "learning_rate": 0.0001890852061837816, + "loss": 1.1652, + "step": 4026 + }, + { + "epoch": 0.39, + "grad_norm": 0.2557788458829725, + "learning_rate": 0.00018907801837697265, + "loss": 0.955, + "step": 4027 + }, + { + "epoch": 0.39, + "grad_norm": 0.2601254941936526, + "learning_rate": 0.00018907082834093028, + "loss": 1.0526, + "step": 4028 + }, + { + "epoch": 0.39, + "grad_norm": 0.266059316761016, + "learning_rate": 0.00018906363607583445, + "loss": 1.0664, + "step": 4029 + }, + { + "epoch": 0.39, + "grad_norm": 0.26037123171682397, + "learning_rate": 0.00018905644158186515, + "loss": 1.027, + "step": 4030 + }, + { + "epoch": 0.39, + "grad_norm": 0.2563328237402112, + "learning_rate": 0.00018904924485920247, + "loss": 1.1243, + "step": 4031 + }, + { + "epoch": 0.39, + "grad_norm": 0.28925250146928455, + "learning_rate": 0.0001890420459080264, + "loss": 1.1726, + "step": 4032 + }, + { + "epoch": 0.39, + "grad_norm": 0.28975657876816185, + "learning_rate": 0.0001890348447285172, + "loss": 1.0362, + "step": 4033 + }, + { + "epoch": 0.39, + "grad_norm": 0.266527827382827, + "learning_rate": 0.00018902764132085507, + "loss": 1.1282, + "step": 4034 + }, + { + "epoch": 0.39, + "grad_norm": 0.2819868839387067, + "learning_rate": 0.00018902043568522027, + "loss": 1.0481, + "step": 4035 + }, + { + "epoch": 0.39, + "grad_norm": 0.26260787323629486, + "learning_rate": 0.0001890132278217931, + "loss": 1.0494, + "step": 4036 + }, + { + "epoch": 0.39, + "grad_norm": 0.29231684987402756, + "learning_rate": 0.00018900601773075396, + "loss": 1.0668, + "step": 4037 + }, + { + "epoch": 0.39, + "grad_norm": 0.2689538216980018, + "learning_rate": 0.00018899880541228332, + "loss": 1.0549, + "step": 4038 + }, + { + "epoch": 0.39, + "grad_norm": 0.2748948697066311, + "learning_rate": 0.0001889915908665616, + "loss": 1.0691, + "step": 4039 + }, + { + "epoch": 0.39, + "grad_norm": 0.26070699821281274, + "learning_rate": 0.00018898437409376942, + "loss": 1.0906, + "step": 4040 + }, + { + "epoch": 0.39, + "grad_norm": 0.29273143034067534, + "learning_rate": 0.00018897715509408734, + "loss": 1.0254, + "step": 4041 + }, + { + "epoch": 0.39, + "grad_norm": 0.3032571667610144, + "learning_rate": 0.00018896993386769602, + "loss": 0.9515, + "step": 4042 + }, + { + "epoch": 0.39, + "grad_norm": 0.2567507749894945, + "learning_rate": 0.0001889627104147762, + "loss": 1.054, + "step": 4043 + }, + { + "epoch": 0.39, + "grad_norm": 0.3048904231170557, + "learning_rate": 0.00018895548473550866, + "loss": 1.0738, + "step": 4044 + }, + { + "epoch": 0.39, + "grad_norm": 0.27093062547894164, + "learning_rate": 0.00018894825683007417, + "loss": 1.1361, + "step": 4045 + }, + { + "epoch": 0.39, + "grad_norm": 0.2907476290882117, + "learning_rate": 0.00018894102669865368, + "loss": 1.2182, + "step": 4046 + }, + { + "epoch": 0.39, + "grad_norm": 0.27932158114529015, + "learning_rate": 0.0001889337943414281, + "loss": 1.0292, + "step": 4047 + }, + { + "epoch": 0.39, + "grad_norm": 0.303163185427342, + "learning_rate": 0.00018892655975857842, + "loss": 1.0809, + "step": 4048 + }, + { + "epoch": 0.39, + "grad_norm": 0.28389248942158896, + "learning_rate": 0.0001889193229502857, + "loss": 0.9802, + "step": 4049 + }, + { + "epoch": 0.39, + "grad_norm": 0.23743556853791362, + "learning_rate": 0.000188912083916731, + "loss": 1.0415, + "step": 4050 + }, + { + "epoch": 0.39, + "grad_norm": 0.2607618534899767, + "learning_rate": 0.00018890484265809558, + "loss": 1.1187, + "step": 4051 + }, + { + "epoch": 0.39, + "grad_norm": 0.282981189123625, + "learning_rate": 0.00018889759917456057, + "loss": 1.1592, + "step": 4052 + }, + { + "epoch": 0.39, + "grad_norm": 0.2875209522933082, + "learning_rate": 0.00018889035346630726, + "loss": 1.1722, + "step": 4053 + }, + { + "epoch": 0.39, + "grad_norm": 0.28469142917045703, + "learning_rate": 0.000188883105533517, + "loss": 1.0714, + "step": 4054 + }, + { + "epoch": 0.39, + "grad_norm": 0.27563667585065, + "learning_rate": 0.00018887585537637116, + "loss": 0.9172, + "step": 4055 + }, + { + "epoch": 0.39, + "grad_norm": 0.2735767551268766, + "learning_rate": 0.00018886860299505118, + "loss": 1.1189, + "step": 4056 + }, + { + "epoch": 0.39, + "grad_norm": 0.30194229419916335, + "learning_rate": 0.00018886134838973857, + "loss": 1.0886, + "step": 4057 + }, + { + "epoch": 0.39, + "grad_norm": 0.2932088348927251, + "learning_rate": 0.00018885409156061488, + "loss": 1.115, + "step": 4058 + }, + { + "epoch": 0.39, + "grad_norm": 0.27110810019888465, + "learning_rate": 0.00018884683250786167, + "loss": 1.0398, + "step": 4059 + }, + { + "epoch": 0.39, + "grad_norm": 0.33237664113893134, + "learning_rate": 0.00018883957123166066, + "loss": 1.1189, + "step": 4060 + }, + { + "epoch": 0.39, + "grad_norm": 0.2507604291402785, + "learning_rate": 0.00018883230773219354, + "loss": 1.0053, + "step": 4061 + }, + { + "epoch": 0.39, + "grad_norm": 0.2972413921137987, + "learning_rate": 0.00018882504200964207, + "loss": 1.1487, + "step": 4062 + }, + { + "epoch": 0.39, + "grad_norm": 0.26170579658272336, + "learning_rate": 0.00018881777406418816, + "loss": 1.1655, + "step": 4063 + }, + { + "epoch": 0.39, + "grad_norm": 0.30453124626017863, + "learning_rate": 0.00018881050389601357, + "loss": 0.9793, + "step": 4064 + }, + { + "epoch": 0.39, + "grad_norm": 0.27274778300480246, + "learning_rate": 0.00018880323150530034, + "loss": 1.019, + "step": 4065 + }, + { + "epoch": 0.39, + "grad_norm": 0.2597880316659366, + "learning_rate": 0.0001887959568922304, + "loss": 1.13, + "step": 4066 + }, + { + "epoch": 0.39, + "grad_norm": 0.30696703463261027, + "learning_rate": 0.00018878868005698586, + "loss": 1.068, + "step": 4067 + }, + { + "epoch": 0.39, + "grad_norm": 0.254482856569276, + "learning_rate": 0.0001887814009997488, + "loss": 1.156, + "step": 4068 + }, + { + "epoch": 0.39, + "grad_norm": 0.29100192996613405, + "learning_rate": 0.00018877411972070135, + "loss": 1.1195, + "step": 4069 + }, + { + "epoch": 0.39, + "grad_norm": 0.2592354961216204, + "learning_rate": 0.0001887668362200258, + "loss": 1.1087, + "step": 4070 + }, + { + "epoch": 0.39, + "grad_norm": 0.28232033919806615, + "learning_rate": 0.00018875955049790438, + "loss": 1.0374, + "step": 4071 + }, + { + "epoch": 0.39, + "grad_norm": 0.2710379764058582, + "learning_rate": 0.00018875226255451942, + "loss": 1.0692, + "step": 4072 + }, + { + "epoch": 0.39, + "grad_norm": 0.3211139224345676, + "learning_rate": 0.00018874497239005332, + "loss": 1.0422, + "step": 4073 + }, + { + "epoch": 0.39, + "grad_norm": 0.26011924754300747, + "learning_rate": 0.0001887376800046885, + "loss": 1.0197, + "step": 4074 + }, + { + "epoch": 0.39, + "grad_norm": 0.2596113402217534, + "learning_rate": 0.00018873038539860747, + "loss": 1.0965, + "step": 4075 + }, + { + "epoch": 0.39, + "grad_norm": 0.2666964093730143, + "learning_rate": 0.0001887230885719928, + "loss": 1.1003, + "step": 4076 + }, + { + "epoch": 0.39, + "grad_norm": 0.24755453906410413, + "learning_rate": 0.00018871578952502703, + "loss": 1.0753, + "step": 4077 + }, + { + "epoch": 0.39, + "grad_norm": 0.25330256715469024, + "learning_rate": 0.0001887084882578929, + "loss": 1.0962, + "step": 4078 + }, + { + "epoch": 0.39, + "grad_norm": 0.2664417858327983, + "learning_rate": 0.00018870118477077309, + "loss": 1.1026, + "step": 4079 + }, + { + "epoch": 0.39, + "grad_norm": 0.2887769686304296, + "learning_rate": 0.00018869387906385044, + "loss": 1.085, + "step": 4080 + }, + { + "epoch": 0.39, + "grad_norm": 0.2945622145717073, + "learning_rate": 0.00018868657113730764, + "loss": 1.0453, + "step": 4081 + }, + { + "epoch": 0.39, + "grad_norm": 0.2914962906932826, + "learning_rate": 0.0001886792609913277, + "loss": 1.1669, + "step": 4082 + }, + { + "epoch": 0.39, + "grad_norm": 0.2982094102526812, + "learning_rate": 0.00018867194862609354, + "loss": 1.1398, + "step": 4083 + }, + { + "epoch": 0.39, + "grad_norm": 0.2943744623045212, + "learning_rate": 0.0001886646340417881, + "loss": 1.1049, + "step": 4084 + }, + { + "epoch": 0.39, + "grad_norm": 0.2808334985745818, + "learning_rate": 0.0001886573172385945, + "loss": 1.0434, + "step": 4085 + }, + { + "epoch": 0.39, + "grad_norm": 0.28765243366041743, + "learning_rate": 0.0001886499982166958, + "loss": 1.1537, + "step": 4086 + }, + { + "epoch": 0.39, + "grad_norm": 0.2838538473305344, + "learning_rate": 0.0001886426769762752, + "loss": 1.1621, + "step": 4087 + }, + { + "epoch": 0.39, + "grad_norm": 0.2779123375968445, + "learning_rate": 0.00018863535351751586, + "loss": 1.0426, + "step": 4088 + }, + { + "epoch": 0.39, + "grad_norm": 0.26234781180349137, + "learning_rate": 0.00018862802784060115, + "loss": 1.1284, + "step": 4089 + }, + { + "epoch": 0.39, + "grad_norm": 0.27792379078088625, + "learning_rate": 0.00018862069994571428, + "loss": 0.9789, + "step": 4090 + }, + { + "epoch": 0.39, + "grad_norm": 0.3213818567797798, + "learning_rate": 0.00018861336983303875, + "loss": 1.0857, + "step": 4091 + }, + { + "epoch": 0.39, + "grad_norm": 0.2942679485146761, + "learning_rate": 0.0001886060375027579, + "loss": 1.0244, + "step": 4092 + }, + { + "epoch": 0.39, + "grad_norm": 0.2934982143646054, + "learning_rate": 0.0001885987029550553, + "loss": 1.0908, + "step": 4093 + }, + { + "epoch": 0.39, + "grad_norm": 0.2705085832069158, + "learning_rate": 0.00018859136619011447, + "loss": 1.0971, + "step": 4094 + }, + { + "epoch": 0.39, + "grad_norm": 0.2645736238106595, + "learning_rate": 0.00018858402720811905, + "loss": 1.0806, + "step": 4095 + }, + { + "epoch": 0.39, + "grad_norm": 0.25805002703777125, + "learning_rate": 0.00018857668600925264, + "loss": 1.0815, + "step": 4096 + }, + { + "epoch": 0.39, + "grad_norm": 0.2768106728894951, + "learning_rate": 0.00018856934259369902, + "loss": 1.0963, + "step": 4097 + }, + { + "epoch": 0.39, + "grad_norm": 0.2952517992902994, + "learning_rate": 0.00018856199696164194, + "loss": 1.0913, + "step": 4098 + }, + { + "epoch": 0.39, + "grad_norm": 0.25015646961672283, + "learning_rate": 0.0001885546491132652, + "loss": 1.0521, + "step": 4099 + }, + { + "epoch": 0.39, + "grad_norm": 0.2933146156325368, + "learning_rate": 0.00018854729904875273, + "loss": 1.1188, + "step": 4100 + }, + { + "epoch": 0.39, + "grad_norm": 0.2906550664804429, + "learning_rate": 0.00018853994676828846, + "loss": 1.1433, + "step": 4101 + }, + { + "epoch": 0.39, + "grad_norm": 0.26884740955424163, + "learning_rate": 0.00018853259227205634, + "loss": 1.1072, + "step": 4102 + }, + { + "epoch": 0.39, + "grad_norm": 0.3113311790033681, + "learning_rate": 0.0001885252355602405, + "loss": 1.1282, + "step": 4103 + }, + { + "epoch": 0.39, + "grad_norm": 0.28155446924580935, + "learning_rate": 0.00018851787663302498, + "loss": 1.083, + "step": 4104 + }, + { + "epoch": 0.39, + "grad_norm": 0.29809592194674794, + "learning_rate": 0.00018851051549059397, + "loss": 1.0569, + "step": 4105 + }, + { + "epoch": 0.39, + "grad_norm": 0.24384181868224059, + "learning_rate": 0.0001885031521331317, + "loss": 1.0561, + "step": 4106 + }, + { + "epoch": 0.39, + "grad_norm": 0.27108319846787104, + "learning_rate": 0.0001884957865608224, + "loss": 1.058, + "step": 4107 + }, + { + "epoch": 0.39, + "grad_norm": 0.2943767607152993, + "learning_rate": 0.00018848841877385045, + "loss": 1.1393, + "step": 4108 + }, + { + "epoch": 0.39, + "grad_norm": 0.29474696198802675, + "learning_rate": 0.00018848104877240015, + "loss": 1.1804, + "step": 4109 + }, + { + "epoch": 0.39, + "grad_norm": 0.2951250932743141, + "learning_rate": 0.00018847367655665606, + "loss": 1.2261, + "step": 4110 + }, + { + "epoch": 0.39, + "grad_norm": 0.2608593829479291, + "learning_rate": 0.0001884663021268026, + "loss": 1.0769, + "step": 4111 + }, + { + "epoch": 0.39, + "grad_norm": 0.29600065532000475, + "learning_rate": 0.0001884589254830243, + "loss": 1.0772, + "step": 4112 + }, + { + "epoch": 0.39, + "grad_norm": 0.2990313407932539, + "learning_rate": 0.0001884515466255058, + "loss": 1.1775, + "step": 4113 + }, + { + "epoch": 0.39, + "grad_norm": 0.2843669340987528, + "learning_rate": 0.00018844416555443178, + "loss": 1.1374, + "step": 4114 + }, + { + "epoch": 0.39, + "grad_norm": 0.2508179755628321, + "learning_rate": 0.00018843678226998693, + "loss": 1.1355, + "step": 4115 + }, + { + "epoch": 0.39, + "grad_norm": 0.2754853279028242, + "learning_rate": 0.000188429396772356, + "loss": 1.0528, + "step": 4116 + }, + { + "epoch": 0.39, + "grad_norm": 0.2692652041119292, + "learning_rate": 0.00018842200906172386, + "loss": 1.081, + "step": 4117 + }, + { + "epoch": 0.39, + "grad_norm": 0.2581020541470848, + "learning_rate": 0.00018841461913827537, + "loss": 1.1674, + "step": 4118 + }, + { + "epoch": 0.39, + "grad_norm": 0.30950720631732437, + "learning_rate": 0.0001884072270021955, + "loss": 1.1665, + "step": 4119 + }, + { + "epoch": 0.39, + "grad_norm": 0.2485313737562046, + "learning_rate": 0.00018839983265366917, + "loss": 1.1825, + "step": 4120 + }, + { + "epoch": 0.39, + "grad_norm": 0.2644284997612591, + "learning_rate": 0.0001883924360928815, + "loss": 1.1264, + "step": 4121 + }, + { + "epoch": 0.39, + "grad_norm": 0.29112561286000443, + "learning_rate": 0.0001883850373200175, + "loss": 1.1339, + "step": 4122 + }, + { + "epoch": 0.39, + "grad_norm": 0.27090825993266066, + "learning_rate": 0.00018837763633526247, + "loss": 1.1998, + "step": 4123 + }, + { + "epoch": 0.39, + "grad_norm": 0.3146596625378274, + "learning_rate": 0.0001883702331388015, + "loss": 1.1325, + "step": 4124 + }, + { + "epoch": 0.39, + "grad_norm": 0.31943559497066387, + "learning_rate": 0.00018836282773081992, + "loss": 1.0685, + "step": 4125 + }, + { + "epoch": 0.39, + "grad_norm": 0.3067968110428308, + "learning_rate": 0.00018835542011150303, + "loss": 1.1561, + "step": 4126 + }, + { + "epoch": 0.39, + "grad_norm": 0.27469103129370837, + "learning_rate": 0.00018834801028103627, + "loss": 1.0606, + "step": 4127 + }, + { + "epoch": 0.39, + "grad_norm": 0.2565388108324173, + "learning_rate": 0.00018834059823960497, + "loss": 1.1264, + "step": 4128 + }, + { + "epoch": 0.4, + "grad_norm": 0.26620235976121565, + "learning_rate": 0.0001883331839873947, + "loss": 0.9498, + "step": 4129 + }, + { + "epoch": 0.4, + "grad_norm": 0.2736769679639783, + "learning_rate": 0.00018832576752459099, + "loss": 1.1824, + "step": 4130 + }, + { + "epoch": 0.4, + "grad_norm": 0.28039373078224655, + "learning_rate": 0.00018831834885137943, + "loss": 1.0567, + "step": 4131 + }, + { + "epoch": 0.4, + "grad_norm": 0.28360004650576354, + "learning_rate": 0.00018831092796794572, + "loss": 1.1355, + "step": 4132 + }, + { + "epoch": 0.4, + "grad_norm": 0.3007266751162003, + "learning_rate": 0.0001883035048744755, + "loss": 1.1805, + "step": 4133 + }, + { + "epoch": 0.4, + "grad_norm": 0.27426609206190533, + "learning_rate": 0.00018829607957115458, + "loss": 1.1591, + "step": 4134 + }, + { + "epoch": 0.4, + "grad_norm": 0.2975062542878613, + "learning_rate": 0.00018828865205816877, + "loss": 1.1326, + "step": 4135 + }, + { + "epoch": 0.4, + "grad_norm": 0.24138546192014423, + "learning_rate": 0.00018828122233570396, + "loss": 1.0062, + "step": 4136 + }, + { + "epoch": 0.4, + "grad_norm": 0.3259428242836633, + "learning_rate": 0.00018827379040394607, + "loss": 1.0638, + "step": 4137 + }, + { + "epoch": 0.4, + "grad_norm": 0.2893519126284774, + "learning_rate": 0.00018826635626308113, + "loss": 1.0938, + "step": 4138 + }, + { + "epoch": 0.4, + "grad_norm": 0.2649524956937564, + "learning_rate": 0.00018825891991329513, + "loss": 1.0869, + "step": 4139 + }, + { + "epoch": 0.4, + "grad_norm": 0.28942290188391856, + "learning_rate": 0.00018825148135477417, + "loss": 0.9653, + "step": 4140 + }, + { + "epoch": 0.4, + "grad_norm": 0.303875931112067, + "learning_rate": 0.00018824404058770443, + "loss": 1.1875, + "step": 4141 + }, + { + "epoch": 0.4, + "grad_norm": 0.27790521276702573, + "learning_rate": 0.00018823659761227216, + "loss": 1.1442, + "step": 4142 + }, + { + "epoch": 0.4, + "grad_norm": 0.2499769206341798, + "learning_rate": 0.00018822915242866354, + "loss": 0.9754, + "step": 4143 + }, + { + "epoch": 0.4, + "grad_norm": 0.30156640795095413, + "learning_rate": 0.00018822170503706494, + "loss": 1.1384, + "step": 4144 + }, + { + "epoch": 0.4, + "grad_norm": 0.2551627106250161, + "learning_rate": 0.00018821425543766275, + "loss": 0.9835, + "step": 4145 + }, + { + "epoch": 0.4, + "grad_norm": 0.2699256367377609, + "learning_rate": 0.00018820680363064335, + "loss": 1.0811, + "step": 4146 + }, + { + "epoch": 0.4, + "grad_norm": 0.2685887519613943, + "learning_rate": 0.00018819934961619323, + "loss": 1.1884, + "step": 4147 + }, + { + "epoch": 0.4, + "grad_norm": 0.27522442156799526, + "learning_rate": 0.000188191893394499, + "loss": 1.1393, + "step": 4148 + }, + { + "epoch": 0.4, + "grad_norm": 0.25866933190885216, + "learning_rate": 0.0001881844349657472, + "loss": 1.0642, + "step": 4149 + }, + { + "epoch": 0.4, + "grad_norm": 0.2767798366498473, + "learning_rate": 0.00018817697433012447, + "loss": 1.1294, + "step": 4150 + }, + { + "epoch": 0.4, + "grad_norm": 0.25741272329454395, + "learning_rate": 0.00018816951148781756, + "loss": 1.2252, + "step": 4151 + }, + { + "epoch": 0.4, + "grad_norm": 0.2703985849781623, + "learning_rate": 0.00018816204643901322, + "loss": 1.0734, + "step": 4152 + }, + { + "epoch": 0.4, + "grad_norm": 0.3049033588778724, + "learning_rate": 0.00018815457918389822, + "loss": 1.053, + "step": 4153 + }, + { + "epoch": 0.4, + "grad_norm": 0.22456734765420605, + "learning_rate": 0.00018814710972265953, + "loss": 0.9969, + "step": 4154 + }, + { + "epoch": 0.4, + "grad_norm": 0.2773921790872893, + "learning_rate": 0.00018813963805548397, + "loss": 1.0376, + "step": 4155 + }, + { + "epoch": 0.4, + "grad_norm": 0.34700507029136146, + "learning_rate": 0.0001881321641825586, + "loss": 1.1532, + "step": 4156 + }, + { + "epoch": 0.4, + "grad_norm": 0.3113422889257847, + "learning_rate": 0.00018812468810407043, + "loss": 1.0595, + "step": 4157 + }, + { + "epoch": 0.4, + "grad_norm": 0.29393485580712975, + "learning_rate": 0.00018811720982020655, + "loss": 1.1448, + "step": 4158 + }, + { + "epoch": 0.4, + "grad_norm": 0.2958948874029899, + "learning_rate": 0.00018810972933115412, + "loss": 1.1131, + "step": 4159 + }, + { + "epoch": 0.4, + "grad_norm": 0.2465662069981653, + "learning_rate": 0.00018810224663710033, + "loss": 0.8469, + "step": 4160 + }, + { + "epoch": 0.4, + "grad_norm": 0.2876082268093369, + "learning_rate": 0.00018809476173823247, + "loss": 1.0811, + "step": 4161 + }, + { + "epoch": 0.4, + "grad_norm": 0.3097135244404772, + "learning_rate": 0.0001880872746347378, + "loss": 1.1378, + "step": 4162 + }, + { + "epoch": 0.4, + "grad_norm": 0.2793139934409337, + "learning_rate": 0.00018807978532680374, + "loss": 1.0819, + "step": 4163 + }, + { + "epoch": 0.4, + "grad_norm": 0.2657493887824555, + "learning_rate": 0.0001880722938146177, + "loss": 1.1998, + "step": 4164 + }, + { + "epoch": 0.4, + "grad_norm": 0.2504323188479509, + "learning_rate": 0.00018806480009836716, + "loss": 1.136, + "step": 4165 + }, + { + "epoch": 0.4, + "grad_norm": 0.2770886792479756, + "learning_rate": 0.00018805730417823964, + "loss": 1.0626, + "step": 4166 + }, + { + "epoch": 0.4, + "grad_norm": 0.28327442888376786, + "learning_rate": 0.00018804980605442273, + "loss": 1.2524, + "step": 4167 + }, + { + "epoch": 0.4, + "grad_norm": 0.2738803155422817, + "learning_rate": 0.00018804230572710411, + "loss": 1.2178, + "step": 4168 + }, + { + "epoch": 0.4, + "grad_norm": 0.2621464862724536, + "learning_rate": 0.00018803480319647145, + "loss": 1.1003, + "step": 4169 + }, + { + "epoch": 0.4, + "grad_norm": 0.2829875367760781, + "learning_rate": 0.0001880272984627125, + "loss": 1.1652, + "step": 4170 + }, + { + "epoch": 0.4, + "grad_norm": 0.25110962790800634, + "learning_rate": 0.00018801979152601508, + "loss": 1.0741, + "step": 4171 + }, + { + "epoch": 0.4, + "grad_norm": 0.3021846850869496, + "learning_rate": 0.0001880122823865671, + "loss": 1.1427, + "step": 4172 + }, + { + "epoch": 0.4, + "grad_norm": 0.25970989561092755, + "learning_rate": 0.00018800477104455638, + "loss": 1.1105, + "step": 4173 + }, + { + "epoch": 0.4, + "grad_norm": 0.3037823815880924, + "learning_rate": 0.00018799725750017098, + "loss": 1.04, + "step": 4174 + }, + { + "epoch": 0.4, + "grad_norm": 0.28340725697761254, + "learning_rate": 0.00018798974175359892, + "loss": 1.0287, + "step": 4175 + }, + { + "epoch": 0.4, + "grad_norm": 0.2907181694596812, + "learning_rate": 0.00018798222380502825, + "loss": 1.1459, + "step": 4176 + }, + { + "epoch": 0.4, + "grad_norm": 0.2798317614131016, + "learning_rate": 0.00018797470365464718, + "loss": 1.0269, + "step": 4177 + }, + { + "epoch": 0.4, + "grad_norm": 0.2451699568307834, + "learning_rate": 0.0001879671813026438, + "loss": 1.0953, + "step": 4178 + }, + { + "epoch": 0.4, + "grad_norm": 0.26115132881368164, + "learning_rate": 0.00018795965674920647, + "loss": 0.95, + "step": 4179 + }, + { + "epoch": 0.4, + "grad_norm": 0.28240121314882904, + "learning_rate": 0.00018795212999452344, + "loss": 1.0437, + "step": 4180 + }, + { + "epoch": 0.4, + "grad_norm": 0.30408179576379274, + "learning_rate": 0.00018794460103878306, + "loss": 1.1509, + "step": 4181 + }, + { + "epoch": 0.4, + "grad_norm": 0.2613648521070335, + "learning_rate": 0.00018793706988217378, + "loss": 1.0649, + "step": 4182 + }, + { + "epoch": 0.4, + "grad_norm": 0.2807710295201798, + "learning_rate": 0.00018792953652488405, + "loss": 1.2016, + "step": 4183 + }, + { + "epoch": 0.4, + "grad_norm": 0.27595401782163975, + "learning_rate": 0.0001879220009671024, + "loss": 1.12, + "step": 4184 + }, + { + "epoch": 0.4, + "grad_norm": 0.28286269456870644, + "learning_rate": 0.00018791446320901747, + "loss": 1.0609, + "step": 4185 + }, + { + "epoch": 0.4, + "grad_norm": 0.28563194802618863, + "learning_rate": 0.0001879069232508178, + "loss": 1.0778, + "step": 4186 + }, + { + "epoch": 0.4, + "grad_norm": 0.270002889739277, + "learning_rate": 0.00018789938109269215, + "loss": 1.0788, + "step": 4187 + }, + { + "epoch": 0.4, + "grad_norm": 0.25196325847643697, + "learning_rate": 0.00018789183673482924, + "loss": 1.1108, + "step": 4188 + }, + { + "epoch": 0.4, + "grad_norm": 0.25601666120815847, + "learning_rate": 0.00018788429017741785, + "loss": 1.043, + "step": 4189 + }, + { + "epoch": 0.4, + "grad_norm": 0.2743217941272287, + "learning_rate": 0.0001878767414206469, + "loss": 1.0379, + "step": 4190 + }, + { + "epoch": 0.4, + "grad_norm": 0.2971315421501434, + "learning_rate": 0.00018786919046470527, + "loss": 1.1175, + "step": 4191 + }, + { + "epoch": 0.4, + "grad_norm": 0.24701289964452056, + "learning_rate": 0.0001878616373097819, + "loss": 1.047, + "step": 4192 + }, + { + "epoch": 0.4, + "grad_norm": 0.28093581228292824, + "learning_rate": 0.00018785408195606587, + "loss": 1.1656, + "step": 4193 + }, + { + "epoch": 0.4, + "grad_norm": 0.2738956970254292, + "learning_rate": 0.0001878465244037462, + "loss": 1.0897, + "step": 4194 + }, + { + "epoch": 0.4, + "grad_norm": 0.3300931758890875, + "learning_rate": 0.00018783896465301205, + "loss": 1.1008, + "step": 4195 + }, + { + "epoch": 0.4, + "grad_norm": 0.2339082478589511, + "learning_rate": 0.0001878314027040526, + "loss": 1.1647, + "step": 4196 + }, + { + "epoch": 0.4, + "grad_norm": 0.2575164996133438, + "learning_rate": 0.0001878238385570571, + "loss": 1.1316, + "step": 4197 + }, + { + "epoch": 0.4, + "grad_norm": 0.28065885782328104, + "learning_rate": 0.00018781627221221484, + "loss": 1.1204, + "step": 4198 + }, + { + "epoch": 0.4, + "grad_norm": 0.27240538013841803, + "learning_rate": 0.0001878087036697152, + "loss": 1.1995, + "step": 4199 + }, + { + "epoch": 0.4, + "grad_norm": 0.28177346756683214, + "learning_rate": 0.00018780113292974756, + "loss": 1.0969, + "step": 4200 + }, + { + "epoch": 0.4, + "grad_norm": 0.2869344753327611, + "learning_rate": 0.00018779355999250135, + "loss": 1.02, + "step": 4201 + }, + { + "epoch": 0.4, + "grad_norm": 0.317258761790818, + "learning_rate": 0.00018778598485816618, + "loss": 1.0979, + "step": 4202 + }, + { + "epoch": 0.4, + "grad_norm": 0.28088629903560547, + "learning_rate": 0.00018777840752693152, + "loss": 1.0249, + "step": 4203 + }, + { + "epoch": 0.4, + "grad_norm": 0.2796415407726609, + "learning_rate": 0.00018777082799898705, + "loss": 1.2466, + "step": 4204 + }, + { + "epoch": 0.4, + "grad_norm": 0.28573228367442755, + "learning_rate": 0.00018776324627452247, + "loss": 1.1008, + "step": 4205 + }, + { + "epoch": 0.4, + "grad_norm": 0.36653558077222376, + "learning_rate": 0.0001877556623537275, + "loss": 1.1889, + "step": 4206 + }, + { + "epoch": 0.4, + "grad_norm": 0.257060176220037, + "learning_rate": 0.00018774807623679192, + "loss": 1.096, + "step": 4207 + }, + { + "epoch": 0.4, + "grad_norm": 0.26689142065335364, + "learning_rate": 0.00018774048792390559, + "loss": 1.1396, + "step": 4208 + }, + { + "epoch": 0.4, + "grad_norm": 0.27797803343333893, + "learning_rate": 0.0001877328974152584, + "loss": 1.0059, + "step": 4209 + }, + { + "epoch": 0.4, + "grad_norm": 0.26890403496189735, + "learning_rate": 0.00018772530471104028, + "loss": 1.1074, + "step": 4210 + }, + { + "epoch": 0.4, + "grad_norm": 0.29380901154741923, + "learning_rate": 0.00018771770981144132, + "loss": 1.1014, + "step": 4211 + }, + { + "epoch": 0.4, + "grad_norm": 0.2777299581369351, + "learning_rate": 0.00018771011271665153, + "loss": 1.0938, + "step": 4212 + }, + { + "epoch": 0.4, + "grad_norm": 0.26403372932978386, + "learning_rate": 0.00018770251342686104, + "loss": 1.1595, + "step": 4213 + }, + { + "epoch": 0.4, + "grad_norm": 0.28636002615157274, + "learning_rate": 0.00018769491194226006, + "loss": 1.0455, + "step": 4214 + }, + { + "epoch": 0.4, + "grad_norm": 0.2844974569194305, + "learning_rate": 0.00018768730826303876, + "loss": 1.0772, + "step": 4215 + }, + { + "epoch": 0.4, + "grad_norm": 0.25117753184917296, + "learning_rate": 0.0001876797023893875, + "loss": 1.1215, + "step": 4216 + }, + { + "epoch": 0.4, + "grad_norm": 0.25290828237696705, + "learning_rate": 0.00018767209432149652, + "loss": 1.0751, + "step": 4217 + }, + { + "epoch": 0.4, + "grad_norm": 0.2641125806261801, + "learning_rate": 0.0001876644840595563, + "loss": 1.1248, + "step": 4218 + }, + { + "epoch": 0.4, + "grad_norm": 0.26789458127024735, + "learning_rate": 0.00018765687160375732, + "loss": 1.058, + "step": 4219 + }, + { + "epoch": 0.4, + "grad_norm": 0.2656234103720025, + "learning_rate": 0.00018764925695428998, + "loss": 1.0255, + "step": 4220 + }, + { + "epoch": 0.4, + "grad_norm": 0.24975067381842872, + "learning_rate": 0.00018764164011134495, + "loss": 1.1212, + "step": 4221 + }, + { + "epoch": 0.4, + "grad_norm": 0.2395528550635482, + "learning_rate": 0.00018763402107511276, + "loss": 0.9989, + "step": 4222 + }, + { + "epoch": 0.4, + "grad_norm": 0.2532908262725297, + "learning_rate": 0.00018762639984578412, + "loss": 1.1901, + "step": 4223 + }, + { + "epoch": 0.4, + "grad_norm": 0.31719585665339967, + "learning_rate": 0.00018761877642354977, + "loss": 1.2269, + "step": 4224 + }, + { + "epoch": 0.4, + "grad_norm": 0.31874436060610234, + "learning_rate": 0.00018761115080860046, + "loss": 0.9891, + "step": 4225 + }, + { + "epoch": 0.4, + "grad_norm": 0.29050294955105876, + "learning_rate": 0.00018760352300112705, + "loss": 1.1358, + "step": 4226 + }, + { + "epoch": 0.4, + "grad_norm": 0.27399249257978747, + "learning_rate": 0.00018759589300132041, + "loss": 1.0868, + "step": 4227 + }, + { + "epoch": 0.4, + "grad_norm": 0.2696276778720872, + "learning_rate": 0.00018758826080937148, + "loss": 1.1893, + "step": 4228 + }, + { + "epoch": 0.4, + "grad_norm": 0.28473533529326145, + "learning_rate": 0.00018758062642547133, + "loss": 1.1131, + "step": 4229 + }, + { + "epoch": 0.4, + "grad_norm": 0.27137640986272094, + "learning_rate": 0.00018757298984981092, + "loss": 0.976, + "step": 4230 + }, + { + "epoch": 0.4, + "grad_norm": 0.3035287993677886, + "learning_rate": 0.0001875653510825814, + "loss": 1.2036, + "step": 4231 + }, + { + "epoch": 0.4, + "grad_norm": 0.2730310702863054, + "learning_rate": 0.00018755771012397393, + "loss": 1.1172, + "step": 4232 + }, + { + "epoch": 0.4, + "grad_norm": 0.2515152904223668, + "learning_rate": 0.00018755006697417976, + "loss": 1.0117, + "step": 4233 + }, + { + "epoch": 0.41, + "grad_norm": 0.2628436719293594, + "learning_rate": 0.00018754242163339014, + "loss": 1.0782, + "step": 4234 + }, + { + "epoch": 0.41, + "grad_norm": 0.26021326272006884, + "learning_rate": 0.0001875347741017964, + "loss": 1.1058, + "step": 4235 + }, + { + "epoch": 0.41, + "grad_norm": 0.2739556248198109, + "learning_rate": 0.0001875271243795899, + "loss": 0.9982, + "step": 4236 + }, + { + "epoch": 0.41, + "grad_norm": 0.30134796736592306, + "learning_rate": 0.00018751947246696212, + "loss": 1.0949, + "step": 4237 + }, + { + "epoch": 0.41, + "grad_norm": 0.271764675869869, + "learning_rate": 0.00018751181836410455, + "loss": 1.0182, + "step": 4238 + }, + { + "epoch": 0.41, + "grad_norm": 0.24250081027639273, + "learning_rate": 0.0001875041620712087, + "loss": 1.0884, + "step": 4239 + }, + { + "epoch": 0.41, + "grad_norm": 0.2869641568770455, + "learning_rate": 0.0001874965035884662, + "loss": 1.1567, + "step": 4240 + }, + { + "epoch": 0.41, + "grad_norm": 0.2969381271520231, + "learning_rate": 0.00018748884291606874, + "loss": 1.0765, + "step": 4241 + }, + { + "epoch": 0.41, + "grad_norm": 0.26178640355726157, + "learning_rate": 0.00018748118005420798, + "loss": 1.1142, + "step": 4242 + }, + { + "epoch": 0.41, + "grad_norm": 0.2800257202847653, + "learning_rate": 0.0001874735150030757, + "loss": 1.1046, + "step": 4243 + }, + { + "epoch": 0.41, + "grad_norm": 0.2944863333696688, + "learning_rate": 0.00018746584776286376, + "loss": 1.038, + "step": 4244 + }, + { + "epoch": 0.41, + "grad_norm": 0.30216624143418275, + "learning_rate": 0.00018745817833376398, + "loss": 1.2037, + "step": 4245 + }, + { + "epoch": 0.41, + "grad_norm": 0.2663185997548643, + "learning_rate": 0.00018745050671596834, + "loss": 1.1279, + "step": 4246 + }, + { + "epoch": 0.41, + "grad_norm": 0.2447130737177253, + "learning_rate": 0.00018744283290966882, + "loss": 1.0917, + "step": 4247 + }, + { + "epoch": 0.41, + "grad_norm": 0.290268659831609, + "learning_rate": 0.00018743515691505743, + "loss": 1.031, + "step": 4248 + }, + { + "epoch": 0.41, + "grad_norm": 0.28648517218701464, + "learning_rate": 0.0001874274787323263, + "loss": 1.0971, + "step": 4249 + }, + { + "epoch": 0.41, + "grad_norm": 0.2682660047938872, + "learning_rate": 0.00018741979836166755, + "loss": 1.1424, + "step": 4250 + }, + { + "epoch": 0.41, + "grad_norm": 0.3030211943382245, + "learning_rate": 0.00018741211580327344, + "loss": 1.254, + "step": 4251 + }, + { + "epoch": 0.41, + "grad_norm": 0.3098473572044177, + "learning_rate": 0.00018740443105733613, + "loss": 0.9803, + "step": 4252 + }, + { + "epoch": 0.41, + "grad_norm": 0.2797263417102469, + "learning_rate": 0.00018739674412404807, + "loss": 1.0723, + "step": 4253 + }, + { + "epoch": 0.41, + "grad_norm": 0.27611130556479674, + "learning_rate": 0.00018738905500360154, + "loss": 1.0926, + "step": 4254 + }, + { + "epoch": 0.41, + "grad_norm": 0.2787572653472917, + "learning_rate": 0.00018738136369618897, + "loss": 1.0504, + "step": 4255 + }, + { + "epoch": 0.41, + "grad_norm": 0.28676439879084453, + "learning_rate": 0.00018737367020200285, + "loss": 1.0677, + "step": 4256 + }, + { + "epoch": 0.41, + "grad_norm": 0.2898408979989211, + "learning_rate": 0.00018736597452123575, + "loss": 1.0629, + "step": 4257 + }, + { + "epoch": 0.41, + "grad_norm": 0.265340677178945, + "learning_rate": 0.00018735827665408022, + "loss": 1.0293, + "step": 4258 + }, + { + "epoch": 0.41, + "grad_norm": 0.2910287914202519, + "learning_rate": 0.0001873505766007289, + "loss": 0.9698, + "step": 4259 + }, + { + "epoch": 0.41, + "grad_norm": 0.2899174476618089, + "learning_rate": 0.00018734287436137452, + "loss": 1.1953, + "step": 4260 + }, + { + "epoch": 0.41, + "grad_norm": 0.2566780706363218, + "learning_rate": 0.0001873351699362098, + "loss": 1.0803, + "step": 4261 + }, + { + "epoch": 0.41, + "grad_norm": 0.25026487846526846, + "learning_rate": 0.00018732746332542758, + "loss": 1.0512, + "step": 4262 + }, + { + "epoch": 0.41, + "grad_norm": 0.2634865581228525, + "learning_rate": 0.00018731975452922073, + "loss": 1.0902, + "step": 4263 + }, + { + "epoch": 0.41, + "grad_norm": 0.2739235925675129, + "learning_rate": 0.0001873120435477821, + "loss": 1.2044, + "step": 4264 + }, + { + "epoch": 0.41, + "grad_norm": 0.3392678199356469, + "learning_rate": 0.00018730433038130473, + "loss": 1.1055, + "step": 4265 + }, + { + "epoch": 0.41, + "grad_norm": 0.2394900592357321, + "learning_rate": 0.0001872966150299816, + "loss": 1.0701, + "step": 4266 + }, + { + "epoch": 0.41, + "grad_norm": 0.2700640038637131, + "learning_rate": 0.00018728889749400584, + "loss": 1.0162, + "step": 4267 + }, + { + "epoch": 0.41, + "grad_norm": 0.2912889000502306, + "learning_rate": 0.00018728117777357055, + "loss": 1.1419, + "step": 4268 + }, + { + "epoch": 0.41, + "grad_norm": 0.2760619669010758, + "learning_rate": 0.00018727345586886892, + "loss": 1.0408, + "step": 4269 + }, + { + "epoch": 0.41, + "grad_norm": 0.2927376911193287, + "learning_rate": 0.00018726573178009422, + "loss": 1.0906, + "step": 4270 + }, + { + "epoch": 0.41, + "grad_norm": 0.25519561721394524, + "learning_rate": 0.00018725800550743976, + "loss": 1.0385, + "step": 4271 + }, + { + "epoch": 0.41, + "grad_norm": 0.2830277309204389, + "learning_rate": 0.00018725027705109886, + "loss": 1.0272, + "step": 4272 + }, + { + "epoch": 0.41, + "grad_norm": 0.2664819202376281, + "learning_rate": 0.00018724254641126493, + "loss": 1.1779, + "step": 4273 + }, + { + "epoch": 0.41, + "grad_norm": 0.2574469843738646, + "learning_rate": 0.00018723481358813145, + "loss": 1.0864, + "step": 4274 + }, + { + "epoch": 0.41, + "grad_norm": 0.2757193054824571, + "learning_rate": 0.00018722707858189193, + "loss": 1.0145, + "step": 4275 + }, + { + "epoch": 0.41, + "grad_norm": 0.29257610913040466, + "learning_rate": 0.00018721934139273998, + "loss": 0.9679, + "step": 4276 + }, + { + "epoch": 0.41, + "grad_norm": 0.289091197361481, + "learning_rate": 0.00018721160202086914, + "loss": 1.092, + "step": 4277 + }, + { + "epoch": 0.41, + "grad_norm": 0.24344559699327611, + "learning_rate": 0.0001872038604664732, + "loss": 1.1082, + "step": 4278 + }, + { + "epoch": 0.41, + "grad_norm": 0.2810770812752845, + "learning_rate": 0.0001871961167297458, + "loss": 1.0673, + "step": 4279 + }, + { + "epoch": 0.41, + "grad_norm": 0.2861371909971846, + "learning_rate": 0.0001871883708108808, + "loss": 1.0974, + "step": 4280 + }, + { + "epoch": 0.41, + "grad_norm": 0.31326751338971903, + "learning_rate": 0.00018718062271007204, + "loss": 1.1654, + "step": 4281 + }, + { + "epoch": 0.41, + "grad_norm": 0.2695673863745955, + "learning_rate": 0.00018717287242751341, + "loss": 1.045, + "step": 4282 + }, + { + "epoch": 0.41, + "grad_norm": 0.3078415478161515, + "learning_rate": 0.00018716511996339885, + "loss": 1.0805, + "step": 4283 + }, + { + "epoch": 0.41, + "grad_norm": 0.26900716775364764, + "learning_rate": 0.00018715736531792237, + "loss": 1.1007, + "step": 4284 + }, + { + "epoch": 0.41, + "grad_norm": 0.28023295676688076, + "learning_rate": 0.00018714960849127804, + "loss": 1.0815, + "step": 4285 + }, + { + "epoch": 0.41, + "grad_norm": 0.2898989628594484, + "learning_rate": 0.00018714184948366, + "loss": 1.0418, + "step": 4286 + }, + { + "epoch": 0.41, + "grad_norm": 0.26061571588826044, + "learning_rate": 0.00018713408829526242, + "loss": 1.1429, + "step": 4287 + }, + { + "epoch": 0.41, + "grad_norm": 0.2745641023863704, + "learning_rate": 0.0001871263249262795, + "loss": 1.1773, + "step": 4288 + }, + { + "epoch": 0.41, + "grad_norm": 0.26470950342417765, + "learning_rate": 0.00018711855937690556, + "loss": 1.1311, + "step": 4289 + }, + { + "epoch": 0.41, + "grad_norm": 0.29468498976148555, + "learning_rate": 0.00018711079164733491, + "loss": 1.0414, + "step": 4290 + }, + { + "epoch": 0.41, + "grad_norm": 0.26226927485345436, + "learning_rate": 0.00018710302173776194, + "loss": 1.0876, + "step": 4291 + }, + { + "epoch": 0.41, + "grad_norm": 0.24393119618376538, + "learning_rate": 0.00018709524964838115, + "loss": 1.0936, + "step": 4292 + }, + { + "epoch": 0.41, + "grad_norm": 0.2871335693752671, + "learning_rate": 0.00018708747537938696, + "loss": 1.0862, + "step": 4293 + }, + { + "epoch": 0.41, + "grad_norm": 0.25454898572621976, + "learning_rate": 0.00018707969893097399, + "loss": 0.9553, + "step": 4294 + }, + { + "epoch": 0.41, + "grad_norm": 0.2612510649191512, + "learning_rate": 0.0001870719203033368, + "loss": 1.015, + "step": 4295 + }, + { + "epoch": 0.41, + "grad_norm": 0.2735824234461878, + "learning_rate": 0.0001870641394966701, + "loss": 1.1366, + "step": 4296 + }, + { + "epoch": 0.41, + "grad_norm": 0.2561249258130775, + "learning_rate": 0.00018705635651116857, + "loss": 0.9771, + "step": 4297 + }, + { + "epoch": 0.41, + "grad_norm": 0.2724097621208985, + "learning_rate": 0.00018704857134702705, + "loss": 1.0373, + "step": 4298 + }, + { + "epoch": 0.41, + "grad_norm": 0.2624090546858682, + "learning_rate": 0.00018704078400444028, + "loss": 1.0986, + "step": 4299 + }, + { + "epoch": 0.41, + "grad_norm": 0.2933893818570987, + "learning_rate": 0.0001870329944836032, + "loss": 1.0659, + "step": 4300 + }, + { + "epoch": 0.41, + "grad_norm": 0.25403733960977537, + "learning_rate": 0.00018702520278471074, + "loss": 1.1726, + "step": 4301 + }, + { + "epoch": 0.41, + "grad_norm": 0.27542609220308545, + "learning_rate": 0.00018701740890795788, + "loss": 1.1237, + "step": 4302 + }, + { + "epoch": 0.41, + "grad_norm": 0.2536691041486995, + "learning_rate": 0.0001870096128535397, + "loss": 0.9981, + "step": 4303 + }, + { + "epoch": 0.41, + "grad_norm": 0.2732550236694586, + "learning_rate": 0.00018700181462165126, + "loss": 1.0504, + "step": 4304 + }, + { + "epoch": 0.41, + "grad_norm": 0.26940609611280475, + "learning_rate": 0.0001869940142124877, + "loss": 1.0977, + "step": 4305 + }, + { + "epoch": 0.41, + "grad_norm": 0.2912691081559738, + "learning_rate": 0.0001869862116262443, + "loss": 1.0703, + "step": 4306 + }, + { + "epoch": 0.41, + "grad_norm": 0.27335596729191, + "learning_rate": 0.00018697840686311628, + "loss": 1.1206, + "step": 4307 + }, + { + "epoch": 0.41, + "grad_norm": 0.2882836785046496, + "learning_rate": 0.00018697059992329895, + "loss": 1.0942, + "step": 4308 + }, + { + "epoch": 0.41, + "grad_norm": 0.2736335086651974, + "learning_rate": 0.0001869627908069877, + "loss": 1.09, + "step": 4309 + }, + { + "epoch": 0.41, + "grad_norm": 0.25546265700852766, + "learning_rate": 0.00018695497951437795, + "loss": 1.1653, + "step": 4310 + }, + { + "epoch": 0.41, + "grad_norm": 0.27099441480074316, + "learning_rate": 0.0001869471660456652, + "loss": 1.0797, + "step": 4311 + }, + { + "epoch": 0.41, + "grad_norm": 0.2734354890739324, + "learning_rate": 0.00018693935040104497, + "loss": 1.0753, + "step": 4312 + }, + { + "epoch": 0.41, + "grad_norm": 0.3009809400535224, + "learning_rate": 0.00018693153258071286, + "loss": 1.1689, + "step": 4313 + }, + { + "epoch": 0.41, + "grad_norm": 0.29754474015094556, + "learning_rate": 0.00018692371258486451, + "loss": 1.0379, + "step": 4314 + }, + { + "epoch": 0.41, + "grad_norm": 0.2996282403269194, + "learning_rate": 0.00018691589041369564, + "loss": 1.1081, + "step": 4315 + }, + { + "epoch": 0.41, + "grad_norm": 0.24314970149389548, + "learning_rate": 0.000186908066067402, + "loss": 1.0464, + "step": 4316 + }, + { + "epoch": 0.41, + "grad_norm": 0.24701089925472527, + "learning_rate": 0.00018690023954617932, + "loss": 1.1741, + "step": 4317 + }, + { + "epoch": 0.41, + "grad_norm": 0.27607286639486517, + "learning_rate": 0.0001868924108502236, + "loss": 1.0594, + "step": 4318 + }, + { + "epoch": 0.41, + "grad_norm": 0.3017023954760616, + "learning_rate": 0.00018688457997973065, + "loss": 1.0294, + "step": 4319 + }, + { + "epoch": 0.41, + "grad_norm": 0.24937940795309205, + "learning_rate": 0.00018687674693489647, + "loss": 1.0421, + "step": 4320 + }, + { + "epoch": 0.41, + "grad_norm": 0.23892217344372102, + "learning_rate": 0.00018686891171591712, + "loss": 1.0908, + "step": 4321 + }, + { + "epoch": 0.41, + "grad_norm": 0.28068063323508174, + "learning_rate": 0.00018686107432298868, + "loss": 1.1319, + "step": 4322 + }, + { + "epoch": 0.41, + "grad_norm": 0.26786810608469297, + "learning_rate": 0.00018685323475630723, + "loss": 1.0355, + "step": 4323 + }, + { + "epoch": 0.41, + "grad_norm": 0.2435421171517636, + "learning_rate": 0.000186845393016069, + "loss": 1.0141, + "step": 4324 + }, + { + "epoch": 0.41, + "grad_norm": 0.27884414310128647, + "learning_rate": 0.00018683754910247025, + "loss": 1.1473, + "step": 4325 + }, + { + "epoch": 0.41, + "grad_norm": 0.2890938734276742, + "learning_rate": 0.00018682970301570726, + "loss": 1.1203, + "step": 4326 + }, + { + "epoch": 0.41, + "grad_norm": 0.2769615698559789, + "learning_rate": 0.00018682185475597636, + "loss": 1.0541, + "step": 4327 + }, + { + "epoch": 0.41, + "grad_norm": 0.2785590604016616, + "learning_rate": 0.00018681400432347397, + "loss": 1.0544, + "step": 4328 + }, + { + "epoch": 0.41, + "grad_norm": 0.29418313155446657, + "learning_rate": 0.00018680615171839658, + "loss": 1.1175, + "step": 4329 + }, + { + "epoch": 0.41, + "grad_norm": 0.3025538424684769, + "learning_rate": 0.00018679829694094068, + "loss": 1.2113, + "step": 4330 + }, + { + "epoch": 0.41, + "grad_norm": 0.2910730421005133, + "learning_rate": 0.00018679043999130288, + "loss": 1.132, + "step": 4331 + }, + { + "epoch": 0.41, + "grad_norm": 0.29253393692155966, + "learning_rate": 0.00018678258086967975, + "loss": 1.095, + "step": 4332 + }, + { + "epoch": 0.41, + "grad_norm": 0.28305224842813903, + "learning_rate": 0.00018677471957626797, + "loss": 1.1026, + "step": 4333 + }, + { + "epoch": 0.41, + "grad_norm": 0.2924260897779223, + "learning_rate": 0.0001867668561112643, + "loss": 1.0386, + "step": 4334 + }, + { + "epoch": 0.41, + "grad_norm": 0.293768434817011, + "learning_rate": 0.00018675899047486557, + "loss": 1.0448, + "step": 4335 + }, + { + "epoch": 0.41, + "grad_norm": 0.2941288478482527, + "learning_rate": 0.00018675112266726854, + "loss": 1.0662, + "step": 4336 + }, + { + "epoch": 0.41, + "grad_norm": 0.2957044806998864, + "learning_rate": 0.00018674325268867016, + "loss": 0.9872, + "step": 4337 + }, + { + "epoch": 0.42, + "grad_norm": 0.3170426741111006, + "learning_rate": 0.00018673538053926735, + "loss": 1.0532, + "step": 4338 + }, + { + "epoch": 0.42, + "grad_norm": 0.3000921754928047, + "learning_rate": 0.00018672750621925714, + "loss": 1.1413, + "step": 4339 + }, + { + "epoch": 0.42, + "grad_norm": 0.2998372677155844, + "learning_rate": 0.00018671962972883658, + "loss": 1.1425, + "step": 4340 + }, + { + "epoch": 0.42, + "grad_norm": 0.2645728037896027, + "learning_rate": 0.00018671175106820277, + "loss": 1.0747, + "step": 4341 + }, + { + "epoch": 0.42, + "grad_norm": 0.28183953656999067, + "learning_rate": 0.00018670387023755295, + "loss": 0.9737, + "step": 4342 + }, + { + "epoch": 0.42, + "grad_norm": 0.256299564446901, + "learning_rate": 0.00018669598723708422, + "loss": 1.1139, + "step": 4343 + }, + { + "epoch": 0.42, + "grad_norm": 0.26935203909823185, + "learning_rate": 0.00018668810206699395, + "loss": 1.0717, + "step": 4344 + }, + { + "epoch": 0.42, + "grad_norm": 0.23352656548172643, + "learning_rate": 0.00018668021472747944, + "loss": 1.0427, + "step": 4345 + }, + { + "epoch": 0.42, + "grad_norm": 0.2778191273219274, + "learning_rate": 0.00018667232521873807, + "loss": 1.0442, + "step": 4346 + }, + { + "epoch": 0.42, + "grad_norm": 0.2708092397068009, + "learning_rate": 0.00018666443354096733, + "loss": 1.1642, + "step": 4347 + }, + { + "epoch": 0.42, + "grad_norm": 0.2628086430153175, + "learning_rate": 0.00018665653969436466, + "loss": 1.0567, + "step": 4348 + }, + { + "epoch": 0.42, + "grad_norm": 0.2689137058024899, + "learning_rate": 0.00018664864367912758, + "loss": 1.055, + "step": 4349 + }, + { + "epoch": 0.42, + "grad_norm": 0.27522982747891467, + "learning_rate": 0.00018664074549545377, + "loss": 0.9796, + "step": 4350 + }, + { + "epoch": 0.42, + "grad_norm": 0.2585842007123571, + "learning_rate": 0.00018663284514354084, + "loss": 1.0592, + "step": 4351 + }, + { + "epoch": 0.42, + "grad_norm": 0.2698306914125685, + "learning_rate": 0.0001866249426235865, + "loss": 1.084, + "step": 4352 + }, + { + "epoch": 0.42, + "grad_norm": 0.30565305866824105, + "learning_rate": 0.00018661703793578855, + "loss": 1.0316, + "step": 4353 + }, + { + "epoch": 0.42, + "grad_norm": 0.26114362713963013, + "learning_rate": 0.00018660913108034478, + "loss": 1.1677, + "step": 4354 + }, + { + "epoch": 0.42, + "grad_norm": 0.3016235809178839, + "learning_rate": 0.00018660122205745313, + "loss": 1.1933, + "step": 4355 + }, + { + "epoch": 0.42, + "grad_norm": 0.2863853851725708, + "learning_rate": 0.0001865933108673114, + "loss": 0.9809, + "step": 4356 + }, + { + "epoch": 0.42, + "grad_norm": 0.2912386308622707, + "learning_rate": 0.00018658539751011767, + "loss": 1.1289, + "step": 4357 + }, + { + "epoch": 0.42, + "grad_norm": 0.2776875205740512, + "learning_rate": 0.00018657748198606995, + "loss": 1.0565, + "step": 4358 + }, + { + "epoch": 0.42, + "grad_norm": 0.2748712764200445, + "learning_rate": 0.00018656956429536633, + "loss": 1.0782, + "step": 4359 + }, + { + "epoch": 0.42, + "grad_norm": 0.255166953661312, + "learning_rate": 0.00018656164443820494, + "loss": 1.0291, + "step": 4360 + }, + { + "epoch": 0.42, + "grad_norm": 0.2665620217061101, + "learning_rate": 0.00018655372241478403, + "loss": 1.114, + "step": 4361 + }, + { + "epoch": 0.42, + "grad_norm": 0.2544779490108305, + "learning_rate": 0.00018654579822530179, + "loss": 1.2265, + "step": 4362 + }, + { + "epoch": 0.42, + "grad_norm": 0.23854453003136591, + "learning_rate": 0.00018653787186995654, + "loss": 1.033, + "step": 4363 + }, + { + "epoch": 0.42, + "grad_norm": 0.2710547109595493, + "learning_rate": 0.00018652994334894668, + "loss": 1.0567, + "step": 4364 + }, + { + "epoch": 0.42, + "grad_norm": 0.27823107428416105, + "learning_rate": 0.00018652201266247063, + "loss": 1.0357, + "step": 4365 + }, + { + "epoch": 0.42, + "grad_norm": 0.3065920295229727, + "learning_rate": 0.0001865140798107268, + "loss": 0.9394, + "step": 4366 + }, + { + "epoch": 0.42, + "grad_norm": 0.29060026851055215, + "learning_rate": 0.00018650614479391378, + "loss": 1.0582, + "step": 4367 + }, + { + "epoch": 0.42, + "grad_norm": 0.2798363947842491, + "learning_rate": 0.00018649820761223012, + "loss": 1.1008, + "step": 4368 + }, + { + "epoch": 0.42, + "grad_norm": 0.2524281791152147, + "learning_rate": 0.00018649026826587442, + "loss": 1.0788, + "step": 4369 + }, + { + "epoch": 0.42, + "grad_norm": 0.3175783594422539, + "learning_rate": 0.00018648232675504543, + "loss": 1.0091, + "step": 4370 + }, + { + "epoch": 0.42, + "grad_norm": 0.2605770620877395, + "learning_rate": 0.00018647438307994185, + "loss": 1.0315, + "step": 4371 + }, + { + "epoch": 0.42, + "grad_norm": 0.3074184313253669, + "learning_rate": 0.0001864664372407625, + "loss": 1.11, + "step": 4372 + }, + { + "epoch": 0.42, + "grad_norm": 0.2804678028999845, + "learning_rate": 0.0001864584892377062, + "loss": 1.0521, + "step": 4373 + }, + { + "epoch": 0.42, + "grad_norm": 0.25285663187552027, + "learning_rate": 0.00018645053907097187, + "loss": 1.0407, + "step": 4374 + }, + { + "epoch": 0.42, + "grad_norm": 0.32876163956484417, + "learning_rate": 0.00018644258674075848, + "loss": 0.9872, + "step": 4375 + }, + { + "epoch": 0.42, + "grad_norm": 0.25777775379310247, + "learning_rate": 0.000186434632247265, + "loss": 1.0567, + "step": 4376 + }, + { + "epoch": 0.42, + "grad_norm": 0.29826590288671295, + "learning_rate": 0.00018642667559069055, + "loss": 1.0757, + "step": 4377 + }, + { + "epoch": 0.42, + "grad_norm": 0.27317079924980964, + "learning_rate": 0.0001864187167712342, + "loss": 1.1448, + "step": 4378 + }, + { + "epoch": 0.42, + "grad_norm": 0.3070229873978444, + "learning_rate": 0.00018641075578909518, + "loss": 1.0379, + "step": 4379 + }, + { + "epoch": 0.42, + "grad_norm": 0.2718896024874906, + "learning_rate": 0.0001864027926444727, + "loss": 1.1072, + "step": 4380 + }, + { + "epoch": 0.42, + "grad_norm": 0.2620818390598612, + "learning_rate": 0.00018639482733756601, + "loss": 1.0537, + "step": 4381 + }, + { + "epoch": 0.42, + "grad_norm": 0.2289284295029631, + "learning_rate": 0.00018638685986857448, + "loss": 1.0194, + "step": 4382 + }, + { + "epoch": 0.42, + "grad_norm": 0.29452954070001014, + "learning_rate": 0.00018637889023769748, + "loss": 1.1051, + "step": 4383 + }, + { + "epoch": 0.42, + "grad_norm": 0.24409495634240713, + "learning_rate": 0.00018637091844513445, + "loss": 1.089, + "step": 4384 + }, + { + "epoch": 0.42, + "grad_norm": 0.2444141149494506, + "learning_rate": 0.00018636294449108493, + "loss": 0.9955, + "step": 4385 + }, + { + "epoch": 0.42, + "grad_norm": 0.25886299860296796, + "learning_rate": 0.00018635496837574844, + "loss": 1.0442, + "step": 4386 + }, + { + "epoch": 0.42, + "grad_norm": 0.2788912752950406, + "learning_rate": 0.00018634699009932462, + "loss": 1.1965, + "step": 4387 + }, + { + "epoch": 0.42, + "grad_norm": 0.2671936525699648, + "learning_rate": 0.00018633900966201304, + "loss": 1.0077, + "step": 4388 + }, + { + "epoch": 0.42, + "grad_norm": 0.28071558942931124, + "learning_rate": 0.00018633102706401355, + "loss": 1.1462, + "step": 4389 + }, + { + "epoch": 0.42, + "grad_norm": 0.34829423214490346, + "learning_rate": 0.00018632304230552582, + "loss": 1.0623, + "step": 4390 + }, + { + "epoch": 0.42, + "grad_norm": 0.24191132707171756, + "learning_rate": 0.0001863150553867497, + "loss": 1.0765, + "step": 4391 + }, + { + "epoch": 0.42, + "grad_norm": 0.27536728708801206, + "learning_rate": 0.00018630706630788505, + "loss": 1.0193, + "step": 4392 + }, + { + "epoch": 0.42, + "grad_norm": 0.2697499088562797, + "learning_rate": 0.00018629907506913186, + "loss": 1.1339, + "step": 4393 + }, + { + "epoch": 0.42, + "grad_norm": 0.26546348358382443, + "learning_rate": 0.00018629108167069006, + "loss": 1.1509, + "step": 4394 + }, + { + "epoch": 0.42, + "grad_norm": 0.2627657556993223, + "learning_rate": 0.00018628308611275972, + "loss": 1.1698, + "step": 4395 + }, + { + "epoch": 0.42, + "grad_norm": 0.2681938025996225, + "learning_rate": 0.00018627508839554093, + "loss": 1.0299, + "step": 4396 + }, + { + "epoch": 0.42, + "grad_norm": 0.26016966609609554, + "learning_rate": 0.00018626708851923382, + "loss": 1.132, + "step": 4397 + }, + { + "epoch": 0.42, + "grad_norm": 0.28096390529787246, + "learning_rate": 0.0001862590864840386, + "loss": 1.1004, + "step": 4398 + }, + { + "epoch": 0.42, + "grad_norm": 0.2914134240329839, + "learning_rate": 0.00018625108229015555, + "loss": 1.0259, + "step": 4399 + }, + { + "epoch": 0.42, + "grad_norm": 0.27633727064382735, + "learning_rate": 0.00018624307593778495, + "loss": 1.0006, + "step": 4400 + }, + { + "epoch": 0.42, + "grad_norm": 0.26565720029010786, + "learning_rate": 0.00018623506742712715, + "loss": 1.0687, + "step": 4401 + }, + { + "epoch": 0.42, + "grad_norm": 0.24736946692607406, + "learning_rate": 0.00018622705675838263, + "loss": 1.1022, + "step": 4402 + }, + { + "epoch": 0.42, + "grad_norm": 0.23869319091908384, + "learning_rate": 0.0001862190439317518, + "loss": 1.1136, + "step": 4403 + }, + { + "epoch": 0.42, + "grad_norm": 0.26085938439573125, + "learning_rate": 0.0001862110289474352, + "loss": 1.0647, + "step": 4404 + }, + { + "epoch": 0.42, + "grad_norm": 0.2615746057557681, + "learning_rate": 0.00018620301180563342, + "loss": 1.026, + "step": 4405 + }, + { + "epoch": 0.42, + "grad_norm": 0.265385119223784, + "learning_rate": 0.0001861949925065471, + "loss": 1.0762, + "step": 4406 + }, + { + "epoch": 0.42, + "grad_norm": 0.2621600644598475, + "learning_rate": 0.00018618697105037693, + "loss": 1.0342, + "step": 4407 + }, + { + "epoch": 0.42, + "grad_norm": 0.28497896413460505, + "learning_rate": 0.00018617894743732361, + "loss": 1.0353, + "step": 4408 + }, + { + "epoch": 0.42, + "grad_norm": 0.2537528026008887, + "learning_rate": 0.00018617092166758802, + "loss": 0.9979, + "step": 4409 + }, + { + "epoch": 0.42, + "grad_norm": 0.273548636864498, + "learning_rate": 0.00018616289374137092, + "loss": 1.1967, + "step": 4410 + }, + { + "epoch": 0.42, + "grad_norm": 0.32894219686067755, + "learning_rate": 0.0001861548636588733, + "loss": 1.082, + "step": 4411 + }, + { + "epoch": 0.42, + "grad_norm": 0.26569438454996513, + "learning_rate": 0.00018614683142029602, + "loss": 0.8886, + "step": 4412 + }, + { + "epoch": 0.42, + "grad_norm": 0.25788494163824816, + "learning_rate": 0.00018613879702584013, + "loss": 1.0712, + "step": 4413 + }, + { + "epoch": 0.42, + "grad_norm": 0.272910401506832, + "learning_rate": 0.00018613076047570678, + "loss": 1.0169, + "step": 4414 + }, + { + "epoch": 0.42, + "grad_norm": 0.27161074974964666, + "learning_rate": 0.00018612272177009694, + "loss": 1.1233, + "step": 4415 + }, + { + "epoch": 0.42, + "grad_norm": 0.28453465725372484, + "learning_rate": 0.0001861146809092119, + "loss": 1.0644, + "step": 4416 + }, + { + "epoch": 0.42, + "grad_norm": 0.26695328078689184, + "learning_rate": 0.00018610663789325288, + "loss": 1.0859, + "step": 4417 + }, + { + "epoch": 0.42, + "grad_norm": 0.28644342117192206, + "learning_rate": 0.00018609859272242108, + "loss": 1.1014, + "step": 4418 + }, + { + "epoch": 0.42, + "grad_norm": 0.28232507692715614, + "learning_rate": 0.0001860905453969179, + "loss": 1.0183, + "step": 4419 + }, + { + "epoch": 0.42, + "grad_norm": 0.28914880496465856, + "learning_rate": 0.0001860824959169447, + "loss": 1.0779, + "step": 4420 + }, + { + "epoch": 0.42, + "grad_norm": 0.27341424599975833, + "learning_rate": 0.000186074444282703, + "loss": 0.9407, + "step": 4421 + }, + { + "epoch": 0.42, + "grad_norm": 0.31013888814670343, + "learning_rate": 0.00018606639049439415, + "loss": 1.1336, + "step": 4422 + }, + { + "epoch": 0.42, + "grad_norm": 0.2804627470142664, + "learning_rate": 0.00018605833455221984, + "loss": 1.1237, + "step": 4423 + }, + { + "epoch": 0.42, + "grad_norm": 0.27060408840367484, + "learning_rate": 0.00018605027645638163, + "loss": 1.0914, + "step": 4424 + }, + { + "epoch": 0.42, + "grad_norm": 0.2949572609779446, + "learning_rate": 0.00018604221620708113, + "loss": 1.0142, + "step": 4425 + }, + { + "epoch": 0.42, + "grad_norm": 0.2978775593613743, + "learning_rate": 0.00018603415380452013, + "loss": 1.0299, + "step": 4426 + }, + { + "epoch": 0.42, + "grad_norm": 0.27422717885209724, + "learning_rate": 0.00018602608924890034, + "loss": 0.9475, + "step": 4427 + }, + { + "epoch": 0.42, + "grad_norm": 0.24881221794442596, + "learning_rate": 0.0001860180225404236, + "loss": 1.0083, + "step": 4428 + }, + { + "epoch": 0.42, + "grad_norm": 0.28296329045097385, + "learning_rate": 0.00018600995367929182, + "loss": 1.1519, + "step": 4429 + }, + { + "epoch": 0.42, + "grad_norm": 0.3132917134168698, + "learning_rate": 0.00018600188266570687, + "loss": 1.1355, + "step": 4430 + }, + { + "epoch": 0.42, + "grad_norm": 0.2656930447471748, + "learning_rate": 0.00018599380949987072, + "loss": 1.1187, + "step": 4431 + }, + { + "epoch": 0.42, + "grad_norm": 0.29159128798152506, + "learning_rate": 0.0001859857341819855, + "loss": 1.0272, + "step": 4432 + }, + { + "epoch": 0.42, + "grad_norm": 0.26189338837080134, + "learning_rate": 0.00018597765671225322, + "loss": 1.0971, + "step": 4433 + }, + { + "epoch": 0.42, + "grad_norm": 0.28052538093117707, + "learning_rate": 0.00018596957709087603, + "loss": 1.0194, + "step": 4434 + }, + { + "epoch": 0.42, + "grad_norm": 0.2652637453080487, + "learning_rate": 0.0001859614953180562, + "loss": 1.0746, + "step": 4435 + }, + { + "epoch": 0.42, + "grad_norm": 0.2829916437751392, + "learning_rate": 0.00018595341139399584, + "loss": 1.1021, + "step": 4436 + }, + { + "epoch": 0.42, + "grad_norm": 0.25812189396799773, + "learning_rate": 0.0001859453253188974, + "loss": 1.0858, + "step": 4437 + }, + { + "epoch": 0.42, + "grad_norm": 0.26939995257598304, + "learning_rate": 0.00018593723709296316, + "loss": 1.0994, + "step": 4438 + }, + { + "epoch": 0.42, + "grad_norm": 0.2860192381686427, + "learning_rate": 0.00018592914671639553, + "loss": 1.0819, + "step": 4439 + }, + { + "epoch": 0.42, + "grad_norm": 0.2791379239990837, + "learning_rate": 0.00018592105418939705, + "loss": 1.0618, + "step": 4440 + }, + { + "epoch": 0.42, + "grad_norm": 0.2740072559598784, + "learning_rate": 0.00018591295951217015, + "loss": 1.1346, + "step": 4441 + }, + { + "epoch": 0.42, + "grad_norm": 0.2889247502983577, + "learning_rate": 0.00018590486268491748, + "loss": 1.0459, + "step": 4442 + }, + { + "epoch": 0.43, + "grad_norm": 0.2978227410800553, + "learning_rate": 0.0001858967637078416, + "loss": 1.0501, + "step": 4443 + }, + { + "epoch": 0.43, + "grad_norm": 0.3018285492782609, + "learning_rate": 0.00018588866258114524, + "loss": 1.0894, + "step": 4444 + }, + { + "epoch": 0.43, + "grad_norm": 0.28346415215862286, + "learning_rate": 0.0001858805593050311, + "loss": 0.9718, + "step": 4445 + }, + { + "epoch": 0.43, + "grad_norm": 0.26039184338131427, + "learning_rate": 0.000185872453879702, + "loss": 1.1089, + "step": 4446 + }, + { + "epoch": 0.43, + "grad_norm": 0.3206288442226837, + "learning_rate": 0.0001858643463053608, + "loss": 1.0651, + "step": 4447 + }, + { + "epoch": 0.43, + "grad_norm": 0.265449658618547, + "learning_rate": 0.00018585623658221034, + "loss": 1.0637, + "step": 4448 + }, + { + "epoch": 0.43, + "grad_norm": 0.29272693696344226, + "learning_rate": 0.0001858481247104536, + "loss": 1.0931, + "step": 4449 + }, + { + "epoch": 0.43, + "grad_norm": 0.29578928026152973, + "learning_rate": 0.0001858400106902936, + "loss": 1.1793, + "step": 4450 + }, + { + "epoch": 0.43, + "grad_norm": 0.27791665518502323, + "learning_rate": 0.00018583189452193338, + "loss": 1.0318, + "step": 4451 + }, + { + "epoch": 0.43, + "grad_norm": 0.28708172386603614, + "learning_rate": 0.00018582377620557602, + "loss": 1.0001, + "step": 4452 + }, + { + "epoch": 0.43, + "grad_norm": 0.2945398920234109, + "learning_rate": 0.0001858156557414248, + "loss": 1.0574, + "step": 4453 + }, + { + "epoch": 0.43, + "grad_norm": 0.23921340949830877, + "learning_rate": 0.0001858075331296828, + "loss": 1.0104, + "step": 4454 + }, + { + "epoch": 0.43, + "grad_norm": 0.3568337762102039, + "learning_rate": 0.00018579940837055338, + "loss": 0.9811, + "step": 4455 + }, + { + "epoch": 0.43, + "grad_norm": 0.3247650997857825, + "learning_rate": 0.00018579128146423984, + "loss": 1.1677, + "step": 4456 + }, + { + "epoch": 0.43, + "grad_norm": 0.3235988446494336, + "learning_rate": 0.00018578315241094554, + "loss": 1.1063, + "step": 4457 + }, + { + "epoch": 0.43, + "grad_norm": 0.2910859031590426, + "learning_rate": 0.00018577502121087396, + "loss": 1.1122, + "step": 4458 + }, + { + "epoch": 0.43, + "grad_norm": 0.2855175850221935, + "learning_rate": 0.00018576688786422856, + "loss": 1.0616, + "step": 4459 + }, + { + "epoch": 0.43, + "grad_norm": 0.24829230055660786, + "learning_rate": 0.0001857587523712129, + "loss": 1.0424, + "step": 4460 + }, + { + "epoch": 0.43, + "grad_norm": 0.2840641696870963, + "learning_rate": 0.00018575061473203054, + "loss": 1.2132, + "step": 4461 + }, + { + "epoch": 0.43, + "grad_norm": 0.27078441085947547, + "learning_rate": 0.0001857424749468852, + "loss": 1.0858, + "step": 4462 + }, + { + "epoch": 0.43, + "grad_norm": 0.285656535390425, + "learning_rate": 0.0001857343330159805, + "loss": 1.0003, + "step": 4463 + }, + { + "epoch": 0.43, + "grad_norm": 0.2687099265187277, + "learning_rate": 0.00018572618893952024, + "loss": 1.0664, + "step": 4464 + }, + { + "epoch": 0.43, + "grad_norm": 0.3181574270847497, + "learning_rate": 0.00018571804271770822, + "loss": 0.9845, + "step": 4465 + }, + { + "epoch": 0.43, + "grad_norm": 0.25226801914556535, + "learning_rate": 0.0001857098943507483, + "loss": 1.1452, + "step": 4466 + }, + { + "epoch": 0.43, + "grad_norm": 0.3055598706754259, + "learning_rate": 0.00018570174383884442, + "loss": 1.0501, + "step": 4467 + }, + { + "epoch": 0.43, + "grad_norm": 0.2382755864156548, + "learning_rate": 0.00018569359118220056, + "loss": 1.0189, + "step": 4468 + }, + { + "epoch": 0.43, + "grad_norm": 0.26719151556229226, + "learning_rate": 0.00018568543638102072, + "loss": 1.1856, + "step": 4469 + }, + { + "epoch": 0.43, + "grad_norm": 0.2671695573995491, + "learning_rate": 0.00018567727943550897, + "loss": 1.0382, + "step": 4470 + }, + { + "epoch": 0.43, + "grad_norm": 0.27765193324490084, + "learning_rate": 0.00018566912034586946, + "loss": 1.2756, + "step": 4471 + }, + { + "epoch": 0.43, + "grad_norm": 0.27186513834244824, + "learning_rate": 0.00018566095911230638, + "loss": 1.0309, + "step": 4472 + }, + { + "epoch": 0.43, + "grad_norm": 0.29391579590132144, + "learning_rate": 0.00018565279573502392, + "loss": 1.1029, + "step": 4473 + }, + { + "epoch": 0.43, + "grad_norm": 0.30436200911544314, + "learning_rate": 0.00018564463021422645, + "loss": 1.0607, + "step": 4474 + }, + { + "epoch": 0.43, + "grad_norm": 0.29885017590562324, + "learning_rate": 0.00018563646255011828, + "loss": 1.1022, + "step": 4475 + }, + { + "epoch": 0.43, + "grad_norm": 0.30524559756359343, + "learning_rate": 0.0001856282927429038, + "loss": 1.1297, + "step": 4476 + }, + { + "epoch": 0.43, + "grad_norm": 0.2862736844725348, + "learning_rate": 0.0001856201207927875, + "loss": 1.0444, + "step": 4477 + }, + { + "epoch": 0.43, + "grad_norm": 0.27357248947278107, + "learning_rate": 0.00018561194669997386, + "loss": 1.1338, + "step": 4478 + }, + { + "epoch": 0.43, + "grad_norm": 0.2642596700408733, + "learning_rate": 0.00018560377046466747, + "loss": 1.0775, + "step": 4479 + }, + { + "epoch": 0.43, + "grad_norm": 0.28175880695705074, + "learning_rate": 0.00018559559208707288, + "loss": 1.1622, + "step": 4480 + }, + { + "epoch": 0.43, + "grad_norm": 0.29569453857541267, + "learning_rate": 0.00018558741156739483, + "loss": 1.0761, + "step": 4481 + }, + { + "epoch": 0.43, + "grad_norm": 0.2751922546444621, + "learning_rate": 0.000185579228905838, + "loss": 1.0507, + "step": 4482 + }, + { + "epoch": 0.43, + "grad_norm": 0.27122561641213205, + "learning_rate": 0.00018557104410260722, + "loss": 1.174, + "step": 4483 + }, + { + "epoch": 0.43, + "grad_norm": 0.2912196997412804, + "learning_rate": 0.00018556285715790724, + "loss": 1.0137, + "step": 4484 + }, + { + "epoch": 0.43, + "grad_norm": 0.2696109196990019, + "learning_rate": 0.00018555466807194303, + "loss": 0.8727, + "step": 4485 + }, + { + "epoch": 0.43, + "grad_norm": 0.27947099836689787, + "learning_rate": 0.00018554647684491943, + "loss": 1.1609, + "step": 4486 + }, + { + "epoch": 0.43, + "grad_norm": 0.26675781929194203, + "learning_rate": 0.00018553828347704152, + "loss": 1.0248, + "step": 4487 + }, + { + "epoch": 0.43, + "grad_norm": 0.2588683196044053, + "learning_rate": 0.00018553008796851428, + "loss": 1.0563, + "step": 4488 + }, + { + "epoch": 0.43, + "grad_norm": 0.27999097742810836, + "learning_rate": 0.00018552189031954285, + "loss": 1.0313, + "step": 4489 + }, + { + "epoch": 0.43, + "grad_norm": 0.2895515425733008, + "learning_rate": 0.00018551369053033237, + "loss": 1.0168, + "step": 4490 + }, + { + "epoch": 0.43, + "grad_norm": 0.2545597330861161, + "learning_rate": 0.00018550548860108804, + "loss": 1.0956, + "step": 4491 + }, + { + "epoch": 0.43, + "grad_norm": 0.26512014338659784, + "learning_rate": 0.00018549728453201513, + "loss": 1.1138, + "step": 4492 + }, + { + "epoch": 0.43, + "grad_norm": 0.27434370289340704, + "learning_rate": 0.0001854890783233189, + "loss": 1.0605, + "step": 4493 + }, + { + "epoch": 0.43, + "grad_norm": 0.2936485774839461, + "learning_rate": 0.0001854808699752048, + "loss": 1.0747, + "step": 4494 + }, + { + "epoch": 0.43, + "grad_norm": 0.27403335709343807, + "learning_rate": 0.00018547265948787818, + "loss": 1.0193, + "step": 4495 + }, + { + "epoch": 0.43, + "grad_norm": 0.28508114808522955, + "learning_rate": 0.00018546444686154455, + "loss": 1.0224, + "step": 4496 + }, + { + "epoch": 0.43, + "grad_norm": 0.3342643589174626, + "learning_rate": 0.00018545623209640941, + "loss": 1.0092, + "step": 4497 + }, + { + "epoch": 0.43, + "grad_norm": 0.26332940264787036, + "learning_rate": 0.0001854480151926784, + "loss": 1.1283, + "step": 4498 + }, + { + "epoch": 0.43, + "grad_norm": 0.30979393657373167, + "learning_rate": 0.00018543979615055705, + "loss": 1.1229, + "step": 4499 + }, + { + "epoch": 0.43, + "grad_norm": 0.29782251228163314, + "learning_rate": 0.00018543157497025113, + "loss": 1.0053, + "step": 4500 + }, + { + "epoch": 0.43, + "grad_norm": 0.26274351011419134, + "learning_rate": 0.00018542335165196635, + "loss": 1.1258, + "step": 4501 + }, + { + "epoch": 0.43, + "grad_norm": 0.3085288258220842, + "learning_rate": 0.00018541512619590854, + "loss": 1.0965, + "step": 4502 + }, + { + "epoch": 0.43, + "grad_norm": 0.30190633074116374, + "learning_rate": 0.00018540689860228348, + "loss": 1.2104, + "step": 4503 + }, + { + "epoch": 0.43, + "grad_norm": 0.2754205944826528, + "learning_rate": 0.0001853986688712971, + "loss": 1.1199, + "step": 4504 + }, + { + "epoch": 0.43, + "grad_norm": 0.2590596310721408, + "learning_rate": 0.00018539043700315538, + "loss": 1.1025, + "step": 4505 + }, + { + "epoch": 0.43, + "grad_norm": 0.24485764570082005, + "learning_rate": 0.0001853822029980643, + "loss": 0.9729, + "step": 4506 + }, + { + "epoch": 0.43, + "grad_norm": 0.27774076617481674, + "learning_rate": 0.00018537396685622994, + "loss": 1.103, + "step": 4507 + }, + { + "epoch": 0.43, + "grad_norm": 0.2682689587887445, + "learning_rate": 0.00018536572857785842, + "loss": 1.105, + "step": 4508 + }, + { + "epoch": 0.43, + "grad_norm": 0.29122611914479773, + "learning_rate": 0.00018535748816315585, + "loss": 1.1096, + "step": 4509 + }, + { + "epoch": 0.43, + "grad_norm": 0.23291917772580553, + "learning_rate": 0.0001853492456123285, + "loss": 1.1044, + "step": 4510 + }, + { + "epoch": 0.43, + "grad_norm": 0.33929513760098673, + "learning_rate": 0.00018534100092558266, + "loss": 1.1069, + "step": 4511 + }, + { + "epoch": 0.43, + "grad_norm": 0.2541464302266371, + "learning_rate": 0.00018533275410312464, + "loss": 1.043, + "step": 4512 + }, + { + "epoch": 0.43, + "grad_norm": 0.2729757201162812, + "learning_rate": 0.0001853245051451608, + "loss": 1.0917, + "step": 4513 + }, + { + "epoch": 0.43, + "grad_norm": 0.2881511710111317, + "learning_rate": 0.00018531625405189761, + "loss": 1.0334, + "step": 4514 + }, + { + "epoch": 0.43, + "grad_norm": 0.3004052066232476, + "learning_rate": 0.00018530800082354153, + "loss": 1.2131, + "step": 4515 + }, + { + "epoch": 0.43, + "grad_norm": 0.296301949311051, + "learning_rate": 0.0001852997454602991, + "loss": 1.153, + "step": 4516 + }, + { + "epoch": 0.43, + "grad_norm": 0.26666605205862975, + "learning_rate": 0.00018529148796237696, + "loss": 1.0267, + "step": 4517 + }, + { + "epoch": 0.43, + "grad_norm": 0.27466066458804184, + "learning_rate": 0.00018528322832998172, + "loss": 1.0631, + "step": 4518 + }, + { + "epoch": 0.43, + "grad_norm": 0.2913602893847047, + "learning_rate": 0.0001852749665633201, + "loss": 1.1576, + "step": 4519 + }, + { + "epoch": 0.43, + "grad_norm": 0.2631891203792342, + "learning_rate": 0.00018526670266259885, + "loss": 1.0626, + "step": 4520 + }, + { + "epoch": 0.43, + "grad_norm": 0.33347156766562963, + "learning_rate": 0.00018525843662802477, + "loss": 1.0737, + "step": 4521 + }, + { + "epoch": 0.43, + "grad_norm": 0.2790606063839219, + "learning_rate": 0.00018525016845980473, + "loss": 1.0466, + "step": 4522 + }, + { + "epoch": 0.43, + "grad_norm": 0.2836733588666177, + "learning_rate": 0.00018524189815814565, + "loss": 1.0932, + "step": 4523 + }, + { + "epoch": 0.43, + "grad_norm": 0.3028905196190124, + "learning_rate": 0.0001852336257232545, + "loss": 1.0616, + "step": 4524 + }, + { + "epoch": 0.43, + "grad_norm": 0.2845873599717972, + "learning_rate": 0.00018522535115533828, + "loss": 1.0551, + "step": 4525 + }, + { + "epoch": 0.43, + "grad_norm": 0.24471061186570162, + "learning_rate": 0.0001852170744546041, + "loss": 1.0166, + "step": 4526 + }, + { + "epoch": 0.43, + "grad_norm": 0.2877270171258153, + "learning_rate": 0.00018520879562125905, + "loss": 1.0145, + "step": 4527 + }, + { + "epoch": 0.43, + "grad_norm": 0.2770560293900152, + "learning_rate": 0.00018520051465551038, + "loss": 1.1089, + "step": 4528 + }, + { + "epoch": 0.43, + "grad_norm": 0.27958349827422824, + "learning_rate": 0.00018519223155756526, + "loss": 1.1539, + "step": 4529 + }, + { + "epoch": 0.43, + "grad_norm": 0.24834333728465557, + "learning_rate": 0.000185183946327631, + "loss": 1.1034, + "step": 4530 + }, + { + "epoch": 0.43, + "grad_norm": 0.2749131019387281, + "learning_rate": 0.00018517565896591494, + "loss": 0.9705, + "step": 4531 + }, + { + "epoch": 0.43, + "grad_norm": 0.2717292532230968, + "learning_rate": 0.00018516736947262453, + "loss": 1.0913, + "step": 4532 + }, + { + "epoch": 0.43, + "grad_norm": 0.22308007763116086, + "learning_rate": 0.00018515907784796712, + "loss": 1.0885, + "step": 4533 + }, + { + "epoch": 0.43, + "grad_norm": 0.2723580969909008, + "learning_rate": 0.00018515078409215029, + "loss": 1.0822, + "step": 4534 + }, + { + "epoch": 0.43, + "grad_norm": 0.2723449059110442, + "learning_rate": 0.00018514248820538157, + "loss": 1.1214, + "step": 4535 + }, + { + "epoch": 0.43, + "grad_norm": 0.2985439043563554, + "learning_rate": 0.0001851341901878686, + "loss": 1.0749, + "step": 4536 + }, + { + "epoch": 0.43, + "grad_norm": 0.26995390366164945, + "learning_rate": 0.000185125890039819, + "loss": 1.1233, + "step": 4537 + }, + { + "epoch": 0.43, + "grad_norm": 0.2852650589577388, + "learning_rate": 0.00018511758776144048, + "loss": 1.135, + "step": 4538 + }, + { + "epoch": 0.43, + "grad_norm": 0.25495506751354075, + "learning_rate": 0.0001851092833529408, + "loss": 1.0329, + "step": 4539 + }, + { + "epoch": 0.43, + "grad_norm": 0.2788212963607499, + "learning_rate": 0.0001851009768145279, + "loss": 1.0452, + "step": 4540 + }, + { + "epoch": 0.43, + "grad_norm": 0.29286510945056726, + "learning_rate": 0.00018509266814640952, + "loss": 1.0701, + "step": 4541 + }, + { + "epoch": 0.43, + "grad_norm": 0.298753776585502, + "learning_rate": 0.00018508435734879367, + "loss": 1.0797, + "step": 4542 + }, + { + "epoch": 0.43, + "grad_norm": 0.2893400312544143, + "learning_rate": 0.00018507604442188826, + "loss": 1.1236, + "step": 4543 + }, + { + "epoch": 0.43, + "grad_norm": 0.297108222760269, + "learning_rate": 0.0001850677293659014, + "loss": 1.1958, + "step": 4544 + }, + { + "epoch": 0.43, + "grad_norm": 0.3026573746982335, + "learning_rate": 0.00018505941218104112, + "loss": 1.0817, + "step": 4545 + }, + { + "epoch": 0.43, + "grad_norm": 0.27275443083535883, + "learning_rate": 0.00018505109286751564, + "loss": 1.0752, + "step": 4546 + }, + { + "epoch": 0.44, + "grad_norm": 0.27885346389765003, + "learning_rate": 0.00018504277142553308, + "loss": 1.0219, + "step": 4547 + }, + { + "epoch": 0.44, + "grad_norm": 0.3180339146414116, + "learning_rate": 0.00018503444785530172, + "loss": 1.0987, + "step": 4548 + }, + { + "epoch": 0.44, + "grad_norm": 0.2808155483149112, + "learning_rate": 0.00018502612215702988, + "loss": 1.073, + "step": 4549 + }, + { + "epoch": 0.44, + "grad_norm": 0.25391611727242125, + "learning_rate": 0.00018501779433092587, + "loss": 1.0876, + "step": 4550 + }, + { + "epoch": 0.44, + "grad_norm": 0.2261609383943027, + "learning_rate": 0.00018500946437719813, + "loss": 1.0297, + "step": 4551 + }, + { + "epoch": 0.44, + "grad_norm": 0.2754443425644491, + "learning_rate": 0.00018500113229605512, + "loss": 1.082, + "step": 4552 + }, + { + "epoch": 0.44, + "grad_norm": 0.27586769729030647, + "learning_rate": 0.00018499279808770536, + "loss": 1.137, + "step": 4553 + }, + { + "epoch": 0.44, + "grad_norm": 0.31905598519632444, + "learning_rate": 0.0001849844617523574, + "loss": 1.1845, + "step": 4554 + }, + { + "epoch": 0.44, + "grad_norm": 0.23729193980262855, + "learning_rate": 0.00018497612329021988, + "loss": 0.9061, + "step": 4555 + }, + { + "epoch": 0.44, + "grad_norm": 0.2782155585428631, + "learning_rate": 0.00018496778270150145, + "loss": 1.1527, + "step": 4556 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710366956832406, + "learning_rate": 0.0001849594399864109, + "loss": 1.0705, + "step": 4557 + }, + { + "epoch": 0.44, + "grad_norm": 0.22796189873292866, + "learning_rate": 0.00018495109514515693, + "loss": 1.0695, + "step": 4558 + }, + { + "epoch": 0.44, + "grad_norm": 0.2831241401573468, + "learning_rate": 0.00018494274817794842, + "loss": 0.9582, + "step": 4559 + }, + { + "epoch": 0.44, + "grad_norm": 0.2824780217557077, + "learning_rate": 0.0001849343990849943, + "loss": 1.0315, + "step": 4560 + }, + { + "epoch": 0.44, + "grad_norm": 0.270556872472652, + "learning_rate": 0.0001849260478665034, + "loss": 1.0019, + "step": 4561 + }, + { + "epoch": 0.44, + "grad_norm": 0.25499089437909833, + "learning_rate": 0.00018491769452268482, + "loss": 1.0728, + "step": 4562 + }, + { + "epoch": 0.44, + "grad_norm": 0.2908571742460526, + "learning_rate": 0.00018490933905374754, + "loss": 1.0077, + "step": 4563 + }, + { + "epoch": 0.44, + "grad_norm": 0.28487382701620256, + "learning_rate": 0.0001849009814599007, + "loss": 1.0629, + "step": 4564 + }, + { + "epoch": 0.44, + "grad_norm": 0.26175206299618, + "learning_rate": 0.00018489262174135345, + "loss": 1.1802, + "step": 4565 + }, + { + "epoch": 0.44, + "grad_norm": 0.2881582041583295, + "learning_rate": 0.00018488425989831496, + "loss": 0.9577, + "step": 4566 + }, + { + "epoch": 0.44, + "grad_norm": 0.2833504131778664, + "learning_rate": 0.00018487589593099455, + "loss": 1.2034, + "step": 4567 + }, + { + "epoch": 0.44, + "grad_norm": 0.2623734320038995, + "learning_rate": 0.00018486752983960146, + "loss": 1.1153, + "step": 4568 + }, + { + "epoch": 0.44, + "grad_norm": 0.2522517398867482, + "learning_rate": 0.00018485916162434515, + "loss": 1.0618, + "step": 4569 + }, + { + "epoch": 0.44, + "grad_norm": 0.24153711019667667, + "learning_rate": 0.00018485079128543496, + "loss": 1.0822, + "step": 4570 + }, + { + "epoch": 0.44, + "grad_norm": 0.2882476151134348, + "learning_rate": 0.0001848424188230804, + "loss": 1.1657, + "step": 4571 + }, + { + "epoch": 0.44, + "grad_norm": 0.2964951820277572, + "learning_rate": 0.00018483404423749096, + "loss": 1.1222, + "step": 4572 + }, + { + "epoch": 0.44, + "grad_norm": 0.27784406217795626, + "learning_rate": 0.00018482566752887628, + "loss": 1.0545, + "step": 4573 + }, + { + "epoch": 0.44, + "grad_norm": 0.26707431862694997, + "learning_rate": 0.00018481728869744596, + "loss": 1.0567, + "step": 4574 + }, + { + "epoch": 0.44, + "grad_norm": 0.3017539840667631, + "learning_rate": 0.00018480890774340964, + "loss": 1.1777, + "step": 4575 + }, + { + "epoch": 0.44, + "grad_norm": 0.3113524234554564, + "learning_rate": 0.00018480052466697715, + "loss": 1.1013, + "step": 4576 + }, + { + "epoch": 0.44, + "grad_norm": 0.308306701323019, + "learning_rate": 0.00018479213946835822, + "loss": 1.1211, + "step": 4577 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710413287763368, + "learning_rate": 0.00018478375214776272, + "loss": 1.1882, + "step": 4578 + }, + { + "epoch": 0.44, + "grad_norm": 0.31364976820788937, + "learning_rate": 0.00018477536270540052, + "loss": 1.0807, + "step": 4579 + }, + { + "epoch": 0.44, + "grad_norm": 0.25048911528630063, + "learning_rate": 0.00018476697114148158, + "loss": 1.0959, + "step": 4580 + }, + { + "epoch": 0.44, + "grad_norm": 0.2663984503966, + "learning_rate": 0.00018475857745621594, + "loss": 1.0236, + "step": 4581 + }, + { + "epoch": 0.44, + "grad_norm": 0.26614213923311075, + "learning_rate": 0.00018475018164981362, + "loss": 1.0751, + "step": 4582 + }, + { + "epoch": 0.44, + "grad_norm": 0.2814084782252928, + "learning_rate": 0.00018474178372248474, + "loss": 0.9935, + "step": 4583 + }, + { + "epoch": 0.44, + "grad_norm": 0.25798031430362306, + "learning_rate": 0.00018473338367443946, + "loss": 0.9938, + "step": 4584 + }, + { + "epoch": 0.44, + "grad_norm": 0.2865428384395744, + "learning_rate": 0.00018472498150588803, + "loss": 1.0878, + "step": 4585 + }, + { + "epoch": 0.44, + "grad_norm": 0.3077652321307214, + "learning_rate": 0.00018471657721704066, + "loss": 1.1567, + "step": 4586 + }, + { + "epoch": 0.44, + "grad_norm": 0.31452880319912974, + "learning_rate": 0.0001847081708081077, + "loss": 1.1304, + "step": 4587 + }, + { + "epoch": 0.44, + "grad_norm": 0.2763324776356194, + "learning_rate": 0.00018469976227929955, + "loss": 1.0634, + "step": 4588 + }, + { + "epoch": 0.44, + "grad_norm": 0.29691042342244073, + "learning_rate": 0.0001846913516308266, + "loss": 1.1236, + "step": 4589 + }, + { + "epoch": 0.44, + "grad_norm": 0.2846334629692399, + "learning_rate": 0.00018468293886289935, + "loss": 0.9717, + "step": 4590 + }, + { + "epoch": 0.44, + "grad_norm": 0.2668935421303119, + "learning_rate": 0.00018467452397572833, + "loss": 1.082, + "step": 4591 + }, + { + "epoch": 0.44, + "grad_norm": 0.28376985073992644, + "learning_rate": 0.00018466610696952416, + "loss": 1.088, + "step": 4592 + }, + { + "epoch": 0.44, + "grad_norm": 0.3014664421717131, + "learning_rate": 0.00018465768784449742, + "loss": 0.9671, + "step": 4593 + }, + { + "epoch": 0.44, + "grad_norm": 0.27756706076476795, + "learning_rate": 0.00018464926660085885, + "loss": 1.0486, + "step": 4594 + }, + { + "epoch": 0.44, + "grad_norm": 0.31276907084202593, + "learning_rate": 0.00018464084323881918, + "loss": 1.1944, + "step": 4595 + }, + { + "epoch": 0.44, + "grad_norm": 0.2710547392306981, + "learning_rate": 0.00018463241775858923, + "loss": 1.0873, + "step": 4596 + }, + { + "epoch": 0.44, + "grad_norm": 0.2726412664274147, + "learning_rate": 0.00018462399016037982, + "loss": 0.975, + "step": 4597 + }, + { + "epoch": 0.44, + "grad_norm": 0.28160638881053474, + "learning_rate": 0.00018461556044440186, + "loss": 1.092, + "step": 4598 + }, + { + "epoch": 0.44, + "grad_norm": 0.2725964852327896, + "learning_rate": 0.00018460712861086633, + "loss": 1.1128, + "step": 4599 + }, + { + "epoch": 0.44, + "grad_norm": 0.27984134666173177, + "learning_rate": 0.00018459869465998425, + "loss": 1.1321, + "step": 4600 + }, + { + "epoch": 0.44, + "grad_norm": 0.2863490711350883, + "learning_rate": 0.00018459025859196663, + "loss": 1.1366, + "step": 4601 + }, + { + "epoch": 0.44, + "grad_norm": 0.2879506583130578, + "learning_rate": 0.00018458182040702466, + "loss": 1.1197, + "step": 4602 + }, + { + "epoch": 0.44, + "grad_norm": 0.26719713518673577, + "learning_rate": 0.00018457338010536946, + "loss": 0.9944, + "step": 4603 + }, + { + "epoch": 0.44, + "grad_norm": 0.3121012835925408, + "learning_rate": 0.0001845649376872123, + "loss": 1.198, + "step": 4604 + }, + { + "epoch": 0.44, + "grad_norm": 0.30448939179247264, + "learning_rate": 0.0001845564931527644, + "loss": 0.9822, + "step": 4605 + }, + { + "epoch": 0.44, + "grad_norm": 0.2735559381750156, + "learning_rate": 0.00018454804650223713, + "loss": 1.0978, + "step": 4606 + }, + { + "epoch": 0.44, + "grad_norm": 0.2555127503868371, + "learning_rate": 0.0001845395977358418, + "loss": 1.0871, + "step": 4607 + }, + { + "epoch": 0.44, + "grad_norm": 0.2784512056488346, + "learning_rate": 0.00018453114685379, + "loss": 1.1117, + "step": 4608 + }, + { + "epoch": 0.44, + "grad_norm": 0.27412619492911333, + "learning_rate": 0.0001845226938562931, + "loss": 0.9858, + "step": 4609 + }, + { + "epoch": 0.44, + "grad_norm": 0.3198996291332153, + "learning_rate": 0.00018451423874356261, + "loss": 1.0908, + "step": 4610 + }, + { + "epoch": 0.44, + "grad_norm": 0.2893727277713405, + "learning_rate": 0.00018450578151581022, + "loss": 1.1565, + "step": 4611 + }, + { + "epoch": 0.44, + "grad_norm": 0.3012087634762126, + "learning_rate": 0.00018449732217324754, + "loss": 1.1766, + "step": 4612 + }, + { + "epoch": 0.44, + "grad_norm": 0.29522825483614856, + "learning_rate": 0.00018448886071608625, + "loss": 1.2191, + "step": 4613 + }, + { + "epoch": 0.44, + "grad_norm": 0.27761023884520314, + "learning_rate": 0.00018448039714453814, + "loss": 1.2029, + "step": 4614 + }, + { + "epoch": 0.44, + "grad_norm": 0.22548736584501833, + "learning_rate": 0.000184471931458815, + "loss": 1.0772, + "step": 4615 + }, + { + "epoch": 0.44, + "grad_norm": 0.28551218282274554, + "learning_rate": 0.00018446346365912867, + "loss": 1.1001, + "step": 4616 + }, + { + "epoch": 0.44, + "grad_norm": 0.29640261684492736, + "learning_rate": 0.0001844549937456911, + "loss": 1.0567, + "step": 4617 + }, + { + "epoch": 0.44, + "grad_norm": 0.26167728449746586, + "learning_rate": 0.0001844465217187142, + "loss": 1.0648, + "step": 4618 + }, + { + "epoch": 0.44, + "grad_norm": 0.27822656705075727, + "learning_rate": 0.00018443804757841003, + "loss": 1.1127, + "step": 4619 + }, + { + "epoch": 0.44, + "grad_norm": 0.2904755993031546, + "learning_rate": 0.00018442957132499069, + "loss": 1.0909, + "step": 4620 + }, + { + "epoch": 0.44, + "grad_norm": 0.28052003011517346, + "learning_rate": 0.00018442109295866823, + "loss": 1.0709, + "step": 4621 + }, + { + "epoch": 0.44, + "grad_norm": 0.25770690828327825, + "learning_rate": 0.00018441261247965487, + "loss": 1.1385, + "step": 4622 + }, + { + "epoch": 0.44, + "grad_norm": 0.256212600739427, + "learning_rate": 0.00018440412988816283, + "loss": 1.0721, + "step": 4623 + }, + { + "epoch": 0.44, + "grad_norm": 0.2749581272974963, + "learning_rate": 0.0001843956451844044, + "loss": 1.1453, + "step": 4624 + }, + { + "epoch": 0.44, + "grad_norm": 0.3046942593317576, + "learning_rate": 0.0001843871583685919, + "loss": 1.189, + "step": 4625 + }, + { + "epoch": 0.44, + "grad_norm": 0.28752913128850266, + "learning_rate": 0.00018437866944093773, + "loss": 1.2043, + "step": 4626 + }, + { + "epoch": 0.44, + "grad_norm": 0.2900180388718663, + "learning_rate": 0.00018437017840165434, + "loss": 1.1533, + "step": 4627 + }, + { + "epoch": 0.44, + "grad_norm": 0.26073951801493656, + "learning_rate": 0.0001843616852509542, + "loss": 1.0558, + "step": 4628 + }, + { + "epoch": 0.44, + "grad_norm": 0.3077527251913004, + "learning_rate": 0.00018435318998904986, + "loss": 1.0387, + "step": 4629 + }, + { + "epoch": 0.44, + "grad_norm": 0.29483737359579054, + "learning_rate": 0.00018434469261615393, + "loss": 1.0913, + "step": 4630 + }, + { + "epoch": 0.44, + "grad_norm": 0.2933924970818525, + "learning_rate": 0.00018433619313247906, + "loss": 1.0347, + "step": 4631 + }, + { + "epoch": 0.44, + "grad_norm": 0.26212171413860036, + "learning_rate": 0.00018432769153823797, + "loss": 1.0702, + "step": 4632 + }, + { + "epoch": 0.44, + "grad_norm": 0.2546684009326573, + "learning_rate": 0.00018431918783364337, + "loss": 1.028, + "step": 4633 + }, + { + "epoch": 0.44, + "grad_norm": 0.24838393365899913, + "learning_rate": 0.00018431068201890812, + "loss": 1.0165, + "step": 4634 + }, + { + "epoch": 0.44, + "grad_norm": 0.3099036311014584, + "learning_rate": 0.00018430217409424505, + "loss": 0.9548, + "step": 4635 + }, + { + "epoch": 0.44, + "grad_norm": 0.2759430315688091, + "learning_rate": 0.00018429366405986713, + "loss": 1.149, + "step": 4636 + }, + { + "epoch": 0.44, + "grad_norm": 0.2953749611748448, + "learning_rate": 0.00018428515191598726, + "loss": 1.1464, + "step": 4637 + }, + { + "epoch": 0.44, + "grad_norm": 0.2654249301676195, + "learning_rate": 0.0001842766376628185, + "loss": 1.0327, + "step": 4638 + }, + { + "epoch": 0.44, + "grad_norm": 0.2802886640388649, + "learning_rate": 0.0001842681213005739, + "loss": 1.1143, + "step": 4639 + }, + { + "epoch": 0.44, + "grad_norm": 0.29564078608147715, + "learning_rate": 0.00018425960282946661, + "loss": 0.8881, + "step": 4640 + }, + { + "epoch": 0.44, + "grad_norm": 0.2925251530239038, + "learning_rate": 0.00018425108224970983, + "loss": 1.1731, + "step": 4641 + }, + { + "epoch": 0.44, + "grad_norm": 0.2900983438301286, + "learning_rate": 0.00018424255956151674, + "loss": 1.177, + "step": 4642 + }, + { + "epoch": 0.44, + "grad_norm": 0.31019044957642683, + "learning_rate": 0.00018423403476510065, + "loss": 1.1355, + "step": 4643 + }, + { + "epoch": 0.44, + "grad_norm": 0.3213591456521516, + "learning_rate": 0.00018422550786067492, + "loss": 1.077, + "step": 4644 + }, + { + "epoch": 0.44, + "grad_norm": 0.2677075004163226, + "learning_rate": 0.0001842169788484529, + "loss": 1.0312, + "step": 4645 + }, + { + "epoch": 0.44, + "grad_norm": 0.25726221862210036, + "learning_rate": 0.0001842084477286481, + "loss": 0.9965, + "step": 4646 + }, + { + "epoch": 0.44, + "grad_norm": 0.29706048227507303, + "learning_rate": 0.00018419991450147394, + "loss": 1.0823, + "step": 4647 + }, + { + "epoch": 0.44, + "grad_norm": 0.2567670140513427, + "learning_rate": 0.000184191379167144, + "loss": 1.0802, + "step": 4648 + }, + { + "epoch": 0.44, + "grad_norm": 0.2730948766094166, + "learning_rate": 0.00018418284172587188, + "loss": 1.0743, + "step": 4649 + }, + { + "epoch": 0.44, + "grad_norm": 0.2576764085593571, + "learning_rate": 0.00018417430217787124, + "loss": 1.0818, + "step": 4650 + }, + { + "epoch": 0.44, + "grad_norm": 0.28479973480576726, + "learning_rate": 0.00018416576052335582, + "loss": 1.0513, + "step": 4651 + }, + { + "epoch": 0.45, + "grad_norm": 0.28752546745083385, + "learning_rate": 0.0001841572167625393, + "loss": 1.1685, + "step": 4652 + }, + { + "epoch": 0.45, + "grad_norm": 0.2846736795661952, + "learning_rate": 0.00018414867089563557, + "loss": 1.2087, + "step": 4653 + }, + { + "epoch": 0.45, + "grad_norm": 0.27455162051832954, + "learning_rate": 0.00018414012292285845, + "loss": 1.0672, + "step": 4654 + }, + { + "epoch": 0.45, + "grad_norm": 0.26145375283055017, + "learning_rate": 0.00018413157284442186, + "loss": 1.0861, + "step": 4655 + }, + { + "epoch": 0.45, + "grad_norm": 0.2871748977574908, + "learning_rate": 0.0001841230206605398, + "loss": 1.168, + "step": 4656 + }, + { + "epoch": 0.45, + "grad_norm": 0.26086270240508314, + "learning_rate": 0.00018411446637142632, + "loss": 1.1131, + "step": 4657 + }, + { + "epoch": 0.45, + "grad_norm": 0.26968127971681205, + "learning_rate": 0.0001841059099772954, + "loss": 1.0536, + "step": 4658 + }, + { + "epoch": 0.45, + "grad_norm": 0.2665866276637478, + "learning_rate": 0.00018409735147836124, + "loss": 1.2219, + "step": 4659 + }, + { + "epoch": 0.45, + "grad_norm": 0.2788104688731091, + "learning_rate": 0.000184088790874838, + "loss": 1.0498, + "step": 4660 + }, + { + "epoch": 0.45, + "grad_norm": 0.3042257990864148, + "learning_rate": 0.00018408022816693994, + "loss": 1.1032, + "step": 4661 + }, + { + "epoch": 0.45, + "grad_norm": 0.23891458103453578, + "learning_rate": 0.0001840716633548813, + "loss": 1.1667, + "step": 4662 + }, + { + "epoch": 0.45, + "grad_norm": 0.2668685305184029, + "learning_rate": 0.00018406309643887649, + "loss": 1.0313, + "step": 4663 + }, + { + "epoch": 0.45, + "grad_norm": 0.26923096074860575, + "learning_rate": 0.0001840545274191398, + "loss": 1.096, + "step": 4664 + }, + { + "epoch": 0.45, + "grad_norm": 0.26120691585456396, + "learning_rate": 0.0001840459562958858, + "loss": 1.0147, + "step": 4665 + }, + { + "epoch": 0.45, + "grad_norm": 0.279287076136616, + "learning_rate": 0.0001840373830693289, + "loss": 1.12, + "step": 4666 + }, + { + "epoch": 0.45, + "grad_norm": 0.2517878873784173, + "learning_rate": 0.00018402880773968363, + "loss": 1.0207, + "step": 4667 + }, + { + "epoch": 0.45, + "grad_norm": 0.2554752277219163, + "learning_rate": 0.00018402023030716469, + "loss": 1.1272, + "step": 4668 + }, + { + "epoch": 0.45, + "grad_norm": 0.24985971217131164, + "learning_rate": 0.00018401165077198666, + "loss": 1.0905, + "step": 4669 + }, + { + "epoch": 0.45, + "grad_norm": 0.2515290610443281, + "learning_rate": 0.0001840030691343643, + "loss": 1.1553, + "step": 4670 + }, + { + "epoch": 0.45, + "grad_norm": 0.2632716739172169, + "learning_rate": 0.00018399448539451228, + "loss": 1.0571, + "step": 4671 + }, + { + "epoch": 0.45, + "grad_norm": 0.29388108923128464, + "learning_rate": 0.00018398589955264552, + "loss": 1.0905, + "step": 4672 + }, + { + "epoch": 0.45, + "grad_norm": 0.34065302184228613, + "learning_rate": 0.00018397731160897882, + "loss": 1.118, + "step": 4673 + }, + { + "epoch": 0.45, + "grad_norm": 0.26474232885623084, + "learning_rate": 0.00018396872156372713, + "loss": 1.1804, + "step": 4674 + }, + { + "epoch": 0.45, + "grad_norm": 0.23096264177639642, + "learning_rate": 0.00018396012941710542, + "loss": 1.0124, + "step": 4675 + }, + { + "epoch": 0.45, + "grad_norm": 0.26143495936449157, + "learning_rate": 0.00018395153516932868, + "loss": 1.0354, + "step": 4676 + }, + { + "epoch": 0.45, + "grad_norm": 0.2909688243868478, + "learning_rate": 0.00018394293882061203, + "loss": 1.0682, + "step": 4677 + }, + { + "epoch": 0.45, + "grad_norm": 0.2786275126719699, + "learning_rate": 0.00018393434037117056, + "loss": 1.1167, + "step": 4678 + }, + { + "epoch": 0.45, + "grad_norm": 0.31697489334219714, + "learning_rate": 0.0001839257398212195, + "loss": 1.0991, + "step": 4679 + }, + { + "epoch": 0.45, + "grad_norm": 0.27077228765384886, + "learning_rate": 0.00018391713717097404, + "loss": 0.9453, + "step": 4680 + }, + { + "epoch": 0.45, + "grad_norm": 0.2762723786687761, + "learning_rate": 0.0001839085324206495, + "loss": 1.1394, + "step": 4681 + }, + { + "epoch": 0.45, + "grad_norm": 0.2904037438324207, + "learning_rate": 0.00018389992557046116, + "loss": 0.9806, + "step": 4682 + }, + { + "epoch": 0.45, + "grad_norm": 0.33416394497118107, + "learning_rate": 0.00018389131662062449, + "loss": 1.0519, + "step": 4683 + }, + { + "epoch": 0.45, + "grad_norm": 0.280572824730302, + "learning_rate": 0.00018388270557135488, + "loss": 1.0679, + "step": 4684 + }, + { + "epoch": 0.45, + "grad_norm": 0.318148944370186, + "learning_rate": 0.00018387409242286786, + "loss": 1.0326, + "step": 4685 + }, + { + "epoch": 0.45, + "grad_norm": 0.2592469384169005, + "learning_rate": 0.00018386547717537895, + "loss": 1.154, + "step": 4686 + }, + { + "epoch": 0.45, + "grad_norm": 0.24639073939502068, + "learning_rate": 0.00018385685982910376, + "loss": 1.0404, + "step": 4687 + }, + { + "epoch": 0.45, + "grad_norm": 0.305988694893456, + "learning_rate": 0.00018384824038425796, + "loss": 1.0658, + "step": 4688 + }, + { + "epoch": 0.45, + "grad_norm": 0.29161415294102544, + "learning_rate": 0.00018383961884105724, + "loss": 1.0198, + "step": 4689 + }, + { + "epoch": 0.45, + "grad_norm": 0.2679552986606862, + "learning_rate": 0.00018383099519971737, + "loss": 1.081, + "step": 4690 + }, + { + "epoch": 0.45, + "grad_norm": 0.27720828972286177, + "learning_rate": 0.00018382236946045416, + "loss": 1.1043, + "step": 4691 + }, + { + "epoch": 0.45, + "grad_norm": 0.26516410427337717, + "learning_rate": 0.0001838137416234835, + "loss": 0.9742, + "step": 4692 + }, + { + "epoch": 0.45, + "grad_norm": 0.27803791295610497, + "learning_rate": 0.00018380511168902128, + "loss": 1.0465, + "step": 4693 + }, + { + "epoch": 0.45, + "grad_norm": 0.3744328239378393, + "learning_rate": 0.00018379647965728344, + "loss": 1.1089, + "step": 4694 + }, + { + "epoch": 0.45, + "grad_norm": 0.2973942314834355, + "learning_rate": 0.00018378784552848605, + "loss": 1.0153, + "step": 4695 + }, + { + "epoch": 0.45, + "grad_norm": 0.26659388968085823, + "learning_rate": 0.00018377920930284515, + "loss": 1.1312, + "step": 4696 + }, + { + "epoch": 0.45, + "grad_norm": 0.25981005331538953, + "learning_rate": 0.0001837705709805769, + "loss": 1.0054, + "step": 4697 + }, + { + "epoch": 0.45, + "grad_norm": 0.22213075334205962, + "learning_rate": 0.00018376193056189745, + "loss": 1.015, + "step": 4698 + }, + { + "epoch": 0.45, + "grad_norm": 0.275135826725204, + "learning_rate": 0.00018375328804702304, + "loss": 0.965, + "step": 4699 + }, + { + "epoch": 0.45, + "grad_norm": 0.29035632968889363, + "learning_rate": 0.00018374464343617, + "loss": 1.0289, + "step": 4700 + }, + { + "epoch": 0.45, + "grad_norm": 0.27764015636676737, + "learning_rate": 0.0001837359967295546, + "loss": 1.0597, + "step": 4701 + }, + { + "epoch": 0.45, + "grad_norm": 0.25677252798632294, + "learning_rate": 0.00018372734792739323, + "loss": 1.0364, + "step": 4702 + }, + { + "epoch": 0.45, + "grad_norm": 0.3040774439028437, + "learning_rate": 0.0001837186970299024, + "loss": 1.157, + "step": 4703 + }, + { + "epoch": 0.45, + "grad_norm": 0.2693075199539607, + "learning_rate": 0.00018371004403729853, + "loss": 1.0576, + "step": 4704 + }, + { + "epoch": 0.45, + "grad_norm": 0.29196540371409324, + "learning_rate": 0.0001837013889497982, + "loss": 1.1876, + "step": 4705 + }, + { + "epoch": 0.45, + "grad_norm": 0.2926321409398997, + "learning_rate": 0.00018369273176761802, + "loss": 1.1449, + "step": 4706 + }, + { + "epoch": 0.45, + "grad_norm": 0.26895329592482214, + "learning_rate": 0.00018368407249097466, + "loss": 1.0901, + "step": 4707 + }, + { + "epoch": 0.45, + "grad_norm": 0.2606964589821753, + "learning_rate": 0.00018367541112008476, + "loss": 1.1665, + "step": 4708 + }, + { + "epoch": 0.45, + "grad_norm": 0.3222647861912937, + "learning_rate": 0.0001836667476551651, + "loss": 1.0423, + "step": 4709 + }, + { + "epoch": 0.45, + "grad_norm": 0.28474742820692894, + "learning_rate": 0.00018365808209643253, + "loss": 1.0235, + "step": 4710 + }, + { + "epoch": 0.45, + "grad_norm": 0.29857635596649795, + "learning_rate": 0.00018364941444410385, + "loss": 1.1574, + "step": 4711 + }, + { + "epoch": 0.45, + "grad_norm": 0.3222102111839974, + "learning_rate": 0.00018364074469839602, + "loss": 1.1667, + "step": 4712 + }, + { + "epoch": 0.45, + "grad_norm": 0.31112582358071006, + "learning_rate": 0.00018363207285952595, + "loss": 1.152, + "step": 4713 + }, + { + "epoch": 0.45, + "grad_norm": 0.26706361932201095, + "learning_rate": 0.00018362339892771072, + "loss": 1.1072, + "step": 4714 + }, + { + "epoch": 0.45, + "grad_norm": 0.3008566236074501, + "learning_rate": 0.00018361472290316736, + "loss": 1.0795, + "step": 4715 + }, + { + "epoch": 0.45, + "grad_norm": 0.2802364960499926, + "learning_rate": 0.00018360604478611303, + "loss": 1.062, + "step": 4716 + }, + { + "epoch": 0.45, + "grad_norm": 0.25442275739177317, + "learning_rate": 0.00018359736457676488, + "loss": 1.0775, + "step": 4717 + }, + { + "epoch": 0.45, + "grad_norm": 0.28931573899470037, + "learning_rate": 0.00018358868227534014, + "loss": 1.1024, + "step": 4718 + }, + { + "epoch": 0.45, + "grad_norm": 0.256028110974589, + "learning_rate": 0.0001835799978820561, + "loss": 1.1924, + "step": 4719 + }, + { + "epoch": 0.45, + "grad_norm": 0.2645246955746137, + "learning_rate": 0.00018357131139713008, + "loss": 1.1301, + "step": 4720 + }, + { + "epoch": 0.45, + "grad_norm": 0.3130330679624715, + "learning_rate": 0.0001835626228207795, + "loss": 1.0055, + "step": 4721 + }, + { + "epoch": 0.45, + "grad_norm": 0.28521400562617844, + "learning_rate": 0.00018355393215322173, + "loss": 1.1705, + "step": 4722 + }, + { + "epoch": 0.45, + "grad_norm": 0.23137236678760192, + "learning_rate": 0.0001835452393946743, + "loss": 1.0553, + "step": 4723 + }, + { + "epoch": 0.45, + "grad_norm": 0.2716401502501313, + "learning_rate": 0.00018353654454535473, + "loss": 0.9819, + "step": 4724 + }, + { + "epoch": 0.45, + "grad_norm": 0.25530206196101923, + "learning_rate": 0.00018352784760548066, + "loss": 1.0459, + "step": 4725 + }, + { + "epoch": 0.45, + "grad_norm": 0.3028518361428745, + "learning_rate": 0.0001835191485752697, + "loss": 1.1051, + "step": 4726 + }, + { + "epoch": 0.45, + "grad_norm": 0.266666477668348, + "learning_rate": 0.00018351044745493957, + "loss": 1.0546, + "step": 4727 + }, + { + "epoch": 0.45, + "grad_norm": 0.270858102859276, + "learning_rate": 0.000183501744244708, + "loss": 1.1381, + "step": 4728 + }, + { + "epoch": 0.45, + "grad_norm": 0.2626211922077772, + "learning_rate": 0.0001834930389447928, + "loss": 1.0571, + "step": 4729 + }, + { + "epoch": 0.45, + "grad_norm": 0.25526998893008135, + "learning_rate": 0.00018348433155541182, + "loss": 1.0052, + "step": 4730 + }, + { + "epoch": 0.45, + "grad_norm": 0.2578903383304173, + "learning_rate": 0.000183475622076783, + "loss": 0.9924, + "step": 4731 + }, + { + "epoch": 0.45, + "grad_norm": 0.2413218945569206, + "learning_rate": 0.00018346691050912423, + "loss": 1.1513, + "step": 4732 + }, + { + "epoch": 0.45, + "grad_norm": 0.2446885085320821, + "learning_rate": 0.0001834581968526536, + "loss": 1.0376, + "step": 4733 + }, + { + "epoch": 0.45, + "grad_norm": 0.3680136627218759, + "learning_rate": 0.00018344948110758912, + "loss": 0.9561, + "step": 4734 + }, + { + "epoch": 0.45, + "grad_norm": 0.2723843485796278, + "learning_rate": 0.00018344076327414896, + "loss": 1.0291, + "step": 4735 + }, + { + "epoch": 0.45, + "grad_norm": 0.3027706624868208, + "learning_rate": 0.00018343204335255123, + "loss": 1.1075, + "step": 4736 + }, + { + "epoch": 0.45, + "grad_norm": 0.27483257758567703, + "learning_rate": 0.00018342332134301418, + "loss": 1.0461, + "step": 4737 + }, + { + "epoch": 0.45, + "grad_norm": 0.30639774434247813, + "learning_rate": 0.00018341459724575612, + "loss": 1.1396, + "step": 4738 + }, + { + "epoch": 0.45, + "grad_norm": 0.30130628173745716, + "learning_rate": 0.00018340587106099532, + "loss": 1.0496, + "step": 4739 + }, + { + "epoch": 0.45, + "grad_norm": 0.2951096190186332, + "learning_rate": 0.00018339714278895017, + "loss": 1.0468, + "step": 4740 + }, + { + "epoch": 0.45, + "grad_norm": 0.255641750420738, + "learning_rate": 0.0001833884124298391, + "loss": 0.9368, + "step": 4741 + }, + { + "epoch": 0.45, + "grad_norm": 0.28182596587401804, + "learning_rate": 0.00018337967998388062, + "loss": 1.1615, + "step": 4742 + }, + { + "epoch": 0.45, + "grad_norm": 0.2654967829673649, + "learning_rate": 0.00018337094545129327, + "loss": 1.1393, + "step": 4743 + }, + { + "epoch": 0.45, + "grad_norm": 0.2809627165424522, + "learning_rate": 0.00018336220883229557, + "loss": 1.0288, + "step": 4744 + }, + { + "epoch": 0.45, + "grad_norm": 0.3063917931895973, + "learning_rate": 0.0001833534701271062, + "loss": 1.1096, + "step": 4745 + }, + { + "epoch": 0.45, + "grad_norm": 0.2978525520912866, + "learning_rate": 0.00018334472933594388, + "loss": 1.1158, + "step": 4746 + }, + { + "epoch": 0.45, + "grad_norm": 0.2957907023562577, + "learning_rate": 0.00018333598645902733, + "loss": 1.0369, + "step": 4747 + }, + { + "epoch": 0.45, + "grad_norm": 0.23971767785327205, + "learning_rate": 0.00018332724149657534, + "loss": 1.0324, + "step": 4748 + }, + { + "epoch": 0.45, + "grad_norm": 0.25958963718826317, + "learning_rate": 0.00018331849444880676, + "loss": 1.0323, + "step": 4749 + }, + { + "epoch": 0.45, + "grad_norm": 0.2702035936636161, + "learning_rate": 0.00018330974531594046, + "loss": 1.146, + "step": 4750 + }, + { + "epoch": 0.45, + "grad_norm": 0.30569382745175333, + "learning_rate": 0.00018330099409819548, + "loss": 1.1357, + "step": 4751 + }, + { + "epoch": 0.45, + "grad_norm": 0.2820062684104576, + "learning_rate": 0.00018329224079579072, + "loss": 1.1124, + "step": 4752 + }, + { + "epoch": 0.45, + "grad_norm": 0.2829441998287804, + "learning_rate": 0.0001832834854089453, + "loss": 1.0883, + "step": 4753 + }, + { + "epoch": 0.45, + "grad_norm": 0.30058225164600477, + "learning_rate": 0.00018327472793787833, + "loss": 1.1226, + "step": 4754 + }, + { + "epoch": 0.45, + "grad_norm": 0.2806946975067004, + "learning_rate": 0.00018326596838280897, + "loss": 1.0858, + "step": 4755 + }, + { + "epoch": 0.46, + "grad_norm": 0.26835184504189624, + "learning_rate": 0.0001832572067439564, + "loss": 1.0372, + "step": 4756 + }, + { + "epoch": 0.46, + "grad_norm": 0.2284738744280186, + "learning_rate": 0.00018324844302153992, + "loss": 0.9814, + "step": 4757 + }, + { + "epoch": 0.46, + "grad_norm": 0.2678772918055495, + "learning_rate": 0.00018323967721577881, + "loss": 1.1667, + "step": 4758 + }, + { + "epoch": 0.46, + "grad_norm": 0.2842258914883526, + "learning_rate": 0.00018323090932689248, + "loss": 1.0497, + "step": 4759 + }, + { + "epoch": 0.46, + "grad_norm": 0.2909027276625095, + "learning_rate": 0.00018322213935510035, + "loss": 1.083, + "step": 4760 + }, + { + "epoch": 0.46, + "grad_norm": 0.2861834538414527, + "learning_rate": 0.00018321336730062185, + "loss": 1.2472, + "step": 4761 + }, + { + "epoch": 0.46, + "grad_norm": 0.3061550280271623, + "learning_rate": 0.00018320459316367656, + "loss": 1.0457, + "step": 4762 + }, + { + "epoch": 0.46, + "grad_norm": 0.29646221655536087, + "learning_rate": 0.00018319581694448402, + "loss": 1.2093, + "step": 4763 + }, + { + "epoch": 0.46, + "grad_norm": 0.24948311138239332, + "learning_rate": 0.00018318703864326387, + "loss": 1.0602, + "step": 4764 + }, + { + "epoch": 0.46, + "grad_norm": 0.2972189282962595, + "learning_rate": 0.0001831782582602358, + "loss": 1.0545, + "step": 4765 + }, + { + "epoch": 0.46, + "grad_norm": 0.2766121044623654, + "learning_rate": 0.00018316947579561955, + "loss": 1.1347, + "step": 4766 + }, + { + "epoch": 0.46, + "grad_norm": 0.24848006867021413, + "learning_rate": 0.0001831606912496349, + "loss": 0.9744, + "step": 4767 + }, + { + "epoch": 0.46, + "grad_norm": 0.31237542451843275, + "learning_rate": 0.00018315190462250166, + "loss": 1.1843, + "step": 4768 + }, + { + "epoch": 0.46, + "grad_norm": 0.29004056616059715, + "learning_rate": 0.00018314311591443978, + "loss": 1.1249, + "step": 4769 + }, + { + "epoch": 0.46, + "grad_norm": 0.26116055277908656, + "learning_rate": 0.00018313432512566914, + "loss": 1.1072, + "step": 4770 + }, + { + "epoch": 0.46, + "grad_norm": 0.28883119567906423, + "learning_rate": 0.0001831255322564098, + "loss": 1.0457, + "step": 4771 + }, + { + "epoch": 0.46, + "grad_norm": 0.303739333682482, + "learning_rate": 0.00018311673730688174, + "loss": 1.0541, + "step": 4772 + }, + { + "epoch": 0.46, + "grad_norm": 0.29152935043740263, + "learning_rate": 0.00018310794027730513, + "loss": 0.9989, + "step": 4773 + }, + { + "epoch": 0.46, + "grad_norm": 0.2642591264939651, + "learning_rate": 0.00018309914116790006, + "loss": 1.0887, + "step": 4774 + }, + { + "epoch": 0.46, + "grad_norm": 0.2737816497235381, + "learning_rate": 0.00018309033997888677, + "loss": 0.9973, + "step": 4775 + }, + { + "epoch": 0.46, + "grad_norm": 0.27257528194967345, + "learning_rate": 0.0001830815367104855, + "loss": 1.0495, + "step": 4776 + }, + { + "epoch": 0.46, + "grad_norm": 0.28425805359861567, + "learning_rate": 0.00018307273136291654, + "loss": 1.0373, + "step": 4777 + }, + { + "epoch": 0.46, + "grad_norm": 0.29026318888377095, + "learning_rate": 0.00018306392393640025, + "loss": 1.0727, + "step": 4778 + }, + { + "epoch": 0.46, + "grad_norm": 0.27166227285923494, + "learning_rate": 0.0001830551144311571, + "loss": 1.0168, + "step": 4779 + }, + { + "epoch": 0.46, + "grad_norm": 0.2755929576909545, + "learning_rate": 0.00018304630284740752, + "loss": 1.1526, + "step": 4780 + }, + { + "epoch": 0.46, + "grad_norm": 0.2899920317630215, + "learning_rate": 0.00018303748918537197, + "loss": 1.1746, + "step": 4781 + }, + { + "epoch": 0.46, + "grad_norm": 0.2986410508568978, + "learning_rate": 0.00018302867344527113, + "loss": 0.9996, + "step": 4782 + }, + { + "epoch": 0.46, + "grad_norm": 0.265948409051619, + "learning_rate": 0.00018301985562732548, + "loss": 1.0769, + "step": 4783 + }, + { + "epoch": 0.46, + "grad_norm": 0.28694971143700526, + "learning_rate": 0.0001830110357317558, + "loss": 1.1081, + "step": 4784 + }, + { + "epoch": 0.46, + "grad_norm": 0.27742537509438936, + "learning_rate": 0.00018300221375878282, + "loss": 1.0131, + "step": 4785 + }, + { + "epoch": 0.46, + "grad_norm": 0.2625193602157892, + "learning_rate": 0.00018299338970862724, + "loss": 1.1648, + "step": 4786 + }, + { + "epoch": 0.46, + "grad_norm": 0.2738231281645282, + "learning_rate": 0.00018298456358150996, + "loss": 1.0869, + "step": 4787 + }, + { + "epoch": 0.46, + "grad_norm": 0.28533535787671566, + "learning_rate": 0.00018297573537765175, + "loss": 1.1657, + "step": 4788 + }, + { + "epoch": 0.46, + "grad_norm": 0.24149533195194994, + "learning_rate": 0.00018296690509727367, + "loss": 1.1252, + "step": 4789 + }, + { + "epoch": 0.46, + "grad_norm": 0.2513397282973885, + "learning_rate": 0.00018295807274059663, + "loss": 1.1539, + "step": 4790 + }, + { + "epoch": 0.46, + "grad_norm": 0.26483605282680023, + "learning_rate": 0.00018294923830784168, + "loss": 1.1032, + "step": 4791 + }, + { + "epoch": 0.46, + "grad_norm": 0.24791217448830763, + "learning_rate": 0.0001829404017992299, + "loss": 1.019, + "step": 4792 + }, + { + "epoch": 0.46, + "grad_norm": 0.2666231822052287, + "learning_rate": 0.00018293156321498247, + "loss": 1.0347, + "step": 4793 + }, + { + "epoch": 0.46, + "grad_norm": 0.29874993409578854, + "learning_rate": 0.0001829227225553205, + "loss": 1.178, + "step": 4794 + }, + { + "epoch": 0.46, + "grad_norm": 0.2738977464194366, + "learning_rate": 0.00018291387982046536, + "loss": 1.1841, + "step": 4795 + }, + { + "epoch": 0.46, + "grad_norm": 0.26913204839505855, + "learning_rate": 0.00018290503501063819, + "loss": 1.1249, + "step": 4796 + }, + { + "epoch": 0.46, + "grad_norm": 0.2642140627592377, + "learning_rate": 0.00018289618812606046, + "loss": 1.1228, + "step": 4797 + }, + { + "epoch": 0.46, + "grad_norm": 0.2923403068856712, + "learning_rate": 0.00018288733916695351, + "loss": 1.0207, + "step": 4798 + }, + { + "epoch": 0.46, + "grad_norm": 0.2574187448061279, + "learning_rate": 0.0001828784881335388, + "loss": 1.022, + "step": 4799 + }, + { + "epoch": 0.46, + "grad_norm": 0.2705462102994065, + "learning_rate": 0.00018286963502603786, + "loss": 1.1524, + "step": 4800 + }, + { + "epoch": 0.46, + "grad_norm": 0.2570338839265344, + "learning_rate": 0.0001828607798446722, + "loss": 1.0328, + "step": 4801 + }, + { + "epoch": 0.46, + "grad_norm": 0.3210236176915547, + "learning_rate": 0.00018285192258966343, + "loss": 1.1115, + "step": 4802 + }, + { + "epoch": 0.46, + "grad_norm": 0.28608741562007683, + "learning_rate": 0.00018284306326123327, + "loss": 1.1011, + "step": 4803 + }, + { + "epoch": 0.46, + "grad_norm": 0.2945292323022467, + "learning_rate": 0.00018283420185960338, + "loss": 1.0293, + "step": 4804 + }, + { + "epoch": 0.46, + "grad_norm": 0.2921369853346895, + "learning_rate": 0.00018282533838499552, + "loss": 1.0956, + "step": 4805 + }, + { + "epoch": 0.46, + "grad_norm": 0.2508835050525997, + "learning_rate": 0.0001828164728376315, + "loss": 0.9944, + "step": 4806 + }, + { + "epoch": 0.46, + "grad_norm": 0.31162618222530697, + "learning_rate": 0.00018280760521773322, + "loss": 1.1071, + "step": 4807 + }, + { + "epoch": 0.46, + "grad_norm": 0.26840278702905573, + "learning_rate": 0.00018279873552552256, + "loss": 1.0196, + "step": 4808 + }, + { + "epoch": 0.46, + "grad_norm": 0.28949640843261154, + "learning_rate": 0.0001827898637612215, + "loss": 1.0671, + "step": 4809 + }, + { + "epoch": 0.46, + "grad_norm": 0.26451664824875853, + "learning_rate": 0.00018278098992505207, + "loss": 1.037, + "step": 4810 + }, + { + "epoch": 0.46, + "grad_norm": 0.3163920468320238, + "learning_rate": 0.00018277211401723634, + "loss": 1.0914, + "step": 4811 + }, + { + "epoch": 0.46, + "grad_norm": 0.293156846011273, + "learning_rate": 0.00018276323603799645, + "loss": 1.0247, + "step": 4812 + }, + { + "epoch": 0.46, + "grad_norm": 0.3167908615898505, + "learning_rate": 0.00018275435598755457, + "loss": 1.0896, + "step": 4813 + }, + { + "epoch": 0.46, + "grad_norm": 0.29195900263136115, + "learning_rate": 0.0001827454738661329, + "loss": 1.106, + "step": 4814 + }, + { + "epoch": 0.46, + "grad_norm": 0.24880439045988673, + "learning_rate": 0.00018273658967395378, + "loss": 1.0249, + "step": 4815 + }, + { + "epoch": 0.46, + "grad_norm": 0.27945847139446783, + "learning_rate": 0.00018272770341123948, + "loss": 1.1502, + "step": 4816 + }, + { + "epoch": 0.46, + "grad_norm": 0.2826003957306108, + "learning_rate": 0.0001827188150782124, + "loss": 1.0295, + "step": 4817 + }, + { + "epoch": 0.46, + "grad_norm": 0.2733756686159319, + "learning_rate": 0.000182709924675095, + "loss": 1.1557, + "step": 4818 + }, + { + "epoch": 0.46, + "grad_norm": 0.2719146164107582, + "learning_rate": 0.00018270103220210975, + "loss": 1.1701, + "step": 4819 + }, + { + "epoch": 0.46, + "grad_norm": 0.27143925540828834, + "learning_rate": 0.0001826921376594792, + "loss": 1.094, + "step": 4820 + }, + { + "epoch": 0.46, + "grad_norm": 0.3064338525945154, + "learning_rate": 0.00018268324104742592, + "loss": 1.1517, + "step": 4821 + }, + { + "epoch": 0.46, + "grad_norm": 0.31652916261666003, + "learning_rate": 0.00018267434236617257, + "loss": 1.1124, + "step": 4822 + }, + { + "epoch": 0.46, + "grad_norm": 0.28456338644193074, + "learning_rate": 0.00018266544161594185, + "loss": 1.1934, + "step": 4823 + }, + { + "epoch": 0.46, + "grad_norm": 0.24199134880492237, + "learning_rate": 0.0001826565387969565, + "loss": 0.9981, + "step": 4824 + }, + { + "epoch": 0.46, + "grad_norm": 0.26907228489033597, + "learning_rate": 0.00018264763390943932, + "loss": 0.9906, + "step": 4825 + }, + { + "epoch": 0.46, + "grad_norm": 0.2885973158602934, + "learning_rate": 0.00018263872695361316, + "loss": 1.0922, + "step": 4826 + }, + { + "epoch": 0.46, + "grad_norm": 0.2618403285446365, + "learning_rate": 0.00018262981792970093, + "loss": 1.0619, + "step": 4827 + }, + { + "epoch": 0.46, + "grad_norm": 0.2508955108209627, + "learning_rate": 0.00018262090683792556, + "loss": 1.0296, + "step": 4828 + }, + { + "epoch": 0.46, + "grad_norm": 0.2805927803935353, + "learning_rate": 0.00018261199367851008, + "loss": 1.0554, + "step": 4829 + }, + { + "epoch": 0.46, + "grad_norm": 0.2703573079658423, + "learning_rate": 0.00018260307845167754, + "loss": 1.0561, + "step": 4830 + }, + { + "epoch": 0.46, + "grad_norm": 0.2674244714124627, + "learning_rate": 0.00018259416115765103, + "loss": 1.1403, + "step": 4831 + }, + { + "epoch": 0.46, + "grad_norm": 0.28142267132464976, + "learning_rate": 0.00018258524179665377, + "loss": 1.142, + "step": 4832 + }, + { + "epoch": 0.46, + "grad_norm": 0.28299880159536434, + "learning_rate": 0.00018257632036890891, + "loss": 1.0733, + "step": 4833 + }, + { + "epoch": 0.46, + "grad_norm": 0.26287599628896613, + "learning_rate": 0.0001825673968746397, + "loss": 1.036, + "step": 4834 + }, + { + "epoch": 0.46, + "grad_norm": 0.27196439255395516, + "learning_rate": 0.00018255847131406954, + "loss": 1.0408, + "step": 4835 + }, + { + "epoch": 0.46, + "grad_norm": 0.27846018633896047, + "learning_rate": 0.00018254954368742172, + "loss": 1.0871, + "step": 4836 + }, + { + "epoch": 0.46, + "grad_norm": 0.29707264756040697, + "learning_rate": 0.00018254061399491968, + "loss": 1.0648, + "step": 4837 + }, + { + "epoch": 0.46, + "grad_norm": 0.26505419324604984, + "learning_rate": 0.00018253168223678694, + "loss": 1.1028, + "step": 4838 + }, + { + "epoch": 0.46, + "grad_norm": 0.255410178872581, + "learning_rate": 0.00018252274841324697, + "loss": 1.0632, + "step": 4839 + }, + { + "epoch": 0.46, + "grad_norm": 0.31740617878546407, + "learning_rate": 0.00018251381252452334, + "loss": 1.0536, + "step": 4840 + }, + { + "epoch": 0.46, + "grad_norm": 0.283151238470317, + "learning_rate": 0.0001825048745708397, + "loss": 1.0617, + "step": 4841 + }, + { + "epoch": 0.46, + "grad_norm": 0.2754257741606846, + "learning_rate": 0.0001824959345524197, + "loss": 1.1151, + "step": 4842 + }, + { + "epoch": 0.46, + "grad_norm": 0.27843031061325174, + "learning_rate": 0.00018248699246948714, + "loss": 1.0461, + "step": 4843 + }, + { + "epoch": 0.46, + "grad_norm": 0.28725916243319277, + "learning_rate": 0.00018247804832226573, + "loss": 1.1349, + "step": 4844 + }, + { + "epoch": 0.46, + "grad_norm": 0.2656780830493282, + "learning_rate": 0.00018246910211097933, + "loss": 1.0609, + "step": 4845 + }, + { + "epoch": 0.46, + "grad_norm": 0.2735044242141262, + "learning_rate": 0.0001824601538358518, + "loss": 1.1276, + "step": 4846 + }, + { + "epoch": 0.46, + "grad_norm": 0.2758621380929105, + "learning_rate": 0.00018245120349710708, + "loss": 0.9056, + "step": 4847 + }, + { + "epoch": 0.46, + "grad_norm": 0.2399901693338529, + "learning_rate": 0.00018244225109496922, + "loss": 1.0778, + "step": 4848 + }, + { + "epoch": 0.46, + "grad_norm": 0.24569048440162722, + "learning_rate": 0.0001824332966296622, + "loss": 1.1481, + "step": 4849 + }, + { + "epoch": 0.46, + "grad_norm": 0.24697964457020974, + "learning_rate": 0.00018242434010141013, + "loss": 1.0456, + "step": 4850 + }, + { + "epoch": 0.46, + "grad_norm": 0.26864292763639014, + "learning_rate": 0.0001824153815104371, + "loss": 1.0611, + "step": 4851 + }, + { + "epoch": 0.46, + "grad_norm": 0.32764299735013946, + "learning_rate": 0.0001824064208569674, + "loss": 1.1193, + "step": 4852 + }, + { + "epoch": 0.46, + "grad_norm": 0.2841433629068385, + "learning_rate": 0.00018239745814122523, + "loss": 1.1592, + "step": 4853 + }, + { + "epoch": 0.46, + "grad_norm": 0.2803323057557425, + "learning_rate": 0.00018238849336343487, + "loss": 1.103, + "step": 4854 + }, + { + "epoch": 0.46, + "grad_norm": 0.2403783130731807, + "learning_rate": 0.00018237952652382067, + "loss": 1.1279, + "step": 4855 + }, + { + "epoch": 0.46, + "grad_norm": 0.2644107034178308, + "learning_rate": 0.00018237055762260708, + "loss": 1.0965, + "step": 4856 + }, + { + "epoch": 0.46, + "grad_norm": 0.30697640163453677, + "learning_rate": 0.0001823615866600185, + "loss": 1.2236, + "step": 4857 + }, + { + "epoch": 0.46, + "grad_norm": 0.2735457493491036, + "learning_rate": 0.00018235261363627945, + "loss": 1.0618, + "step": 4858 + }, + { + "epoch": 0.46, + "grad_norm": 0.2406737920100741, + "learning_rate": 0.00018234363855161448, + "loss": 1.004, + "step": 4859 + }, + { + "epoch": 0.46, + "grad_norm": 0.2484692248494346, + "learning_rate": 0.00018233466140624822, + "loss": 0.9887, + "step": 4860 + }, + { + "epoch": 0.47, + "grad_norm": 0.2896830439213678, + "learning_rate": 0.00018232568220040532, + "loss": 1.1294, + "step": 4861 + }, + { + "epoch": 0.47, + "grad_norm": 0.26324509988169137, + "learning_rate": 0.00018231670093431042, + "loss": 1.1409, + "step": 4862 + }, + { + "epoch": 0.47, + "grad_norm": 0.2545994960965595, + "learning_rate": 0.00018230771760818844, + "loss": 1.0028, + "step": 4863 + }, + { + "epoch": 0.47, + "grad_norm": 0.2708183217546789, + "learning_rate": 0.000182298732222264, + "loss": 1.0793, + "step": 4864 + }, + { + "epoch": 0.47, + "grad_norm": 0.2917108244983868, + "learning_rate": 0.00018228974477676216, + "loss": 1.0369, + "step": 4865 + }, + { + "epoch": 0.47, + "grad_norm": 0.261745568565937, + "learning_rate": 0.0001822807552719077, + "loss": 0.962, + "step": 4866 + }, + { + "epoch": 0.47, + "grad_norm": 0.2743799469305386, + "learning_rate": 0.0001822717637079256, + "loss": 1.2131, + "step": 4867 + }, + { + "epoch": 0.47, + "grad_norm": 0.2979549118848966, + "learning_rate": 0.0001822627700850409, + "loss": 1.0951, + "step": 4868 + }, + { + "epoch": 0.47, + "grad_norm": 0.2589301792916495, + "learning_rate": 0.00018225377440347874, + "loss": 1.1224, + "step": 4869 + }, + { + "epoch": 0.47, + "grad_norm": 0.27151217602723077, + "learning_rate": 0.00018224477666346414, + "loss": 1.2002, + "step": 4870 + }, + { + "epoch": 0.47, + "grad_norm": 0.2678272434705896, + "learning_rate": 0.00018223577686522232, + "loss": 1.0903, + "step": 4871 + }, + { + "epoch": 0.47, + "grad_norm": 0.28664523382221585, + "learning_rate": 0.0001822267750089785, + "loss": 1.1004, + "step": 4872 + }, + { + "epoch": 0.47, + "grad_norm": 0.2947186333319964, + "learning_rate": 0.00018221777109495797, + "loss": 1.0248, + "step": 4873 + }, + { + "epoch": 0.47, + "grad_norm": 0.2911818485318527, + "learning_rate": 0.00018220876512338604, + "loss": 1.1243, + "step": 4874 + }, + { + "epoch": 0.47, + "grad_norm": 0.2389316378471969, + "learning_rate": 0.0001821997570944881, + "loss": 1.0098, + "step": 4875 + }, + { + "epoch": 0.47, + "grad_norm": 0.29112691824378567, + "learning_rate": 0.00018219074700848956, + "loss": 1.1284, + "step": 4876 + }, + { + "epoch": 0.47, + "grad_norm": 0.32234639877925986, + "learning_rate": 0.00018218173486561593, + "loss": 1.1974, + "step": 4877 + }, + { + "epoch": 0.47, + "grad_norm": 0.28693422182466444, + "learning_rate": 0.00018217272066609275, + "loss": 1.07, + "step": 4878 + }, + { + "epoch": 0.47, + "grad_norm": 0.24948119387159542, + "learning_rate": 0.00018216370441014558, + "loss": 1.0697, + "step": 4879 + }, + { + "epoch": 0.47, + "grad_norm": 0.2766198420685631, + "learning_rate": 0.00018215468609800007, + "loss": 1.1055, + "step": 4880 + }, + { + "epoch": 0.47, + "grad_norm": 0.2913825743782603, + "learning_rate": 0.0001821456657298819, + "loss": 1.0749, + "step": 4881 + }, + { + "epoch": 0.47, + "grad_norm": 0.2688271558729764, + "learning_rate": 0.00018213664330601683, + "loss": 0.9326, + "step": 4882 + }, + { + "epoch": 0.47, + "grad_norm": 0.30551493928355505, + "learning_rate": 0.00018212761882663062, + "loss": 1.1667, + "step": 4883 + }, + { + "epoch": 0.47, + "grad_norm": 0.3272843823903593, + "learning_rate": 0.00018211859229194918, + "loss": 1.0908, + "step": 4884 + }, + { + "epoch": 0.47, + "grad_norm": 0.2683326547425987, + "learning_rate": 0.00018210956370219832, + "loss": 1.1501, + "step": 4885 + }, + { + "epoch": 0.47, + "grad_norm": 0.2508457951874583, + "learning_rate": 0.00018210053305760403, + "loss": 1.0294, + "step": 4886 + }, + { + "epoch": 0.47, + "grad_norm": 0.3066517541478313, + "learning_rate": 0.0001820915003583923, + "loss": 1.1289, + "step": 4887 + }, + { + "epoch": 0.47, + "grad_norm": 0.2979558635215368, + "learning_rate": 0.0001820824656047892, + "loss": 1.0369, + "step": 4888 + }, + { + "epoch": 0.47, + "grad_norm": 0.24297953137434605, + "learning_rate": 0.0001820734287970208, + "loss": 1.0952, + "step": 4889 + }, + { + "epoch": 0.47, + "grad_norm": 0.2642818716032307, + "learning_rate": 0.00018206438993531324, + "loss": 1.1495, + "step": 4890 + }, + { + "epoch": 0.47, + "grad_norm": 0.28250099631091213, + "learning_rate": 0.0001820553490198928, + "loss": 1.2023, + "step": 4891 + }, + { + "epoch": 0.47, + "grad_norm": 0.24244241524853985, + "learning_rate": 0.00018204630605098563, + "loss": 1.1123, + "step": 4892 + }, + { + "epoch": 0.47, + "grad_norm": 0.28117721265077, + "learning_rate": 0.00018203726102881807, + "loss": 1.1124, + "step": 4893 + }, + { + "epoch": 0.47, + "grad_norm": 0.24950954002584746, + "learning_rate": 0.00018202821395361656, + "loss": 1.0812, + "step": 4894 + }, + { + "epoch": 0.47, + "grad_norm": 0.29484743456459533, + "learning_rate": 0.0001820191648256074, + "loss": 1.019, + "step": 4895 + }, + { + "epoch": 0.47, + "grad_norm": 0.26478910448375825, + "learning_rate": 0.00018201011364501712, + "loss": 1.061, + "step": 4896 + }, + { + "epoch": 0.47, + "grad_norm": 0.2802200200065187, + "learning_rate": 0.00018200106041207218, + "loss": 1.2153, + "step": 4897 + }, + { + "epoch": 0.47, + "grad_norm": 0.2931006305889275, + "learning_rate": 0.00018199200512699918, + "loss": 1.1586, + "step": 4898 + }, + { + "epoch": 0.47, + "grad_norm": 0.28693848326326754, + "learning_rate": 0.00018198294779002473, + "loss": 0.8363, + "step": 4899 + }, + { + "epoch": 0.47, + "grad_norm": 0.2488053532878251, + "learning_rate": 0.00018197388840137548, + "loss": 1.0084, + "step": 4900 + }, + { + "epoch": 0.47, + "grad_norm": 0.30279834945613493, + "learning_rate": 0.00018196482696127814, + "loss": 1.0889, + "step": 4901 + }, + { + "epoch": 0.47, + "grad_norm": 0.27315676586595977, + "learning_rate": 0.0001819557634699595, + "loss": 1.194, + "step": 4902 + }, + { + "epoch": 0.47, + "grad_norm": 0.27870113697280036, + "learning_rate": 0.0001819466979276464, + "loss": 1.0237, + "step": 4903 + }, + { + "epoch": 0.47, + "grad_norm": 0.26068511407455874, + "learning_rate": 0.00018193763033456565, + "loss": 1.141, + "step": 4904 + }, + { + "epoch": 0.47, + "grad_norm": 0.25403699988609485, + "learning_rate": 0.0001819285606909442, + "loss": 0.985, + "step": 4905 + }, + { + "epoch": 0.47, + "grad_norm": 0.25520183475888464, + "learning_rate": 0.00018191948899700904, + "loss": 0.9452, + "step": 4906 + }, + { + "epoch": 0.47, + "grad_norm": 0.2890128875837949, + "learning_rate": 0.00018191041525298719, + "loss": 0.9948, + "step": 4907 + }, + { + "epoch": 0.47, + "grad_norm": 0.2826684783407896, + "learning_rate": 0.00018190133945910573, + "loss": 1.1965, + "step": 4908 + }, + { + "epoch": 0.47, + "grad_norm": 0.27927424785621435, + "learning_rate": 0.00018189226161559175, + "loss": 1.0749, + "step": 4909 + }, + { + "epoch": 0.47, + "grad_norm": 0.28389430568060914, + "learning_rate": 0.00018188318172267245, + "loss": 1.1575, + "step": 4910 + }, + { + "epoch": 0.47, + "grad_norm": 0.21938938517365983, + "learning_rate": 0.0001818740997805751, + "loss": 1.0206, + "step": 4911 + }, + { + "epoch": 0.47, + "grad_norm": 0.29366370308740425, + "learning_rate": 0.00018186501578952693, + "loss": 1.0559, + "step": 4912 + }, + { + "epoch": 0.47, + "grad_norm": 0.29917686177952224, + "learning_rate": 0.0001818559297497553, + "loss": 1.164, + "step": 4913 + }, + { + "epoch": 0.47, + "grad_norm": 0.24934981554280172, + "learning_rate": 0.00018184684166148754, + "loss": 0.978, + "step": 4914 + }, + { + "epoch": 0.47, + "grad_norm": 0.25613501770670105, + "learning_rate": 0.00018183775152495117, + "loss": 1.1179, + "step": 4915 + }, + { + "epoch": 0.47, + "grad_norm": 0.28150065228278026, + "learning_rate": 0.00018182865934037362, + "loss": 1.0, + "step": 4916 + }, + { + "epoch": 0.47, + "grad_norm": 0.2736305655565639, + "learning_rate": 0.00018181956510798246, + "loss": 0.9436, + "step": 4917 + }, + { + "epoch": 0.47, + "grad_norm": 0.23963815077384734, + "learning_rate": 0.00018181046882800525, + "loss": 1.1872, + "step": 4918 + }, + { + "epoch": 0.47, + "grad_norm": 0.26272710687943224, + "learning_rate": 0.00018180137050066963, + "loss": 1.1077, + "step": 4919 + }, + { + "epoch": 0.47, + "grad_norm": 0.2491631914332162, + "learning_rate": 0.00018179227012620332, + "loss": 1.0311, + "step": 4920 + }, + { + "epoch": 0.47, + "grad_norm": 0.2850474055751791, + "learning_rate": 0.00018178316770483405, + "loss": 1.072, + "step": 4921 + }, + { + "epoch": 0.47, + "grad_norm": 0.2563447984596202, + "learning_rate": 0.0001817740632367896, + "loss": 1.0942, + "step": 4922 + }, + { + "epoch": 0.47, + "grad_norm": 0.28963605095483796, + "learning_rate": 0.00018176495672229782, + "loss": 1.1151, + "step": 4923 + }, + { + "epoch": 0.47, + "grad_norm": 0.29799210873069965, + "learning_rate": 0.0001817558481615866, + "loss": 1.0651, + "step": 4924 + }, + { + "epoch": 0.47, + "grad_norm": 0.2900355369937844, + "learning_rate": 0.0001817467375548839, + "loss": 1.0395, + "step": 4925 + }, + { + "epoch": 0.47, + "grad_norm": 0.26790765666005045, + "learning_rate": 0.00018173762490241777, + "loss": 1.0195, + "step": 4926 + }, + { + "epoch": 0.47, + "grad_norm": 0.27029423974434824, + "learning_rate": 0.00018172851020441616, + "loss": 1.0376, + "step": 4927 + }, + { + "epoch": 0.47, + "grad_norm": 0.29781631086031485, + "learning_rate": 0.00018171939346110723, + "loss": 1.1626, + "step": 4928 + }, + { + "epoch": 0.47, + "grad_norm": 0.24153000189250098, + "learning_rate": 0.0001817102746727191, + "loss": 1.1416, + "step": 4929 + }, + { + "epoch": 0.47, + "grad_norm": 0.2832270079995993, + "learning_rate": 0.00018170115383948001, + "loss": 1.0366, + "step": 4930 + }, + { + "epoch": 0.47, + "grad_norm": 0.28980962720099673, + "learning_rate": 0.0001816920309616182, + "loss": 1.1199, + "step": 4931 + }, + { + "epoch": 0.47, + "grad_norm": 0.2885127084279082, + "learning_rate": 0.00018168290603936198, + "loss": 0.9624, + "step": 4932 + }, + { + "epoch": 0.47, + "grad_norm": 0.264979114595309, + "learning_rate": 0.00018167377907293966, + "loss": 1.1314, + "step": 4933 + }, + { + "epoch": 0.47, + "grad_norm": 0.29190107567170176, + "learning_rate": 0.00018166465006257972, + "loss": 1.1603, + "step": 4934 + }, + { + "epoch": 0.47, + "grad_norm": 0.29218193365264744, + "learning_rate": 0.0001816555190085106, + "loss": 1.1024, + "step": 4935 + }, + { + "epoch": 0.47, + "grad_norm": 0.290286097531124, + "learning_rate": 0.00018164638591096078, + "loss": 1.0179, + "step": 4936 + }, + { + "epoch": 0.47, + "grad_norm": 0.3045929894370889, + "learning_rate": 0.00018163725077015883, + "loss": 1.1458, + "step": 4937 + }, + { + "epoch": 0.47, + "grad_norm": 0.3096182938272493, + "learning_rate": 0.0001816281135863334, + "loss": 1.0445, + "step": 4938 + }, + { + "epoch": 0.47, + "grad_norm": 0.2989704049787465, + "learning_rate": 0.00018161897435971312, + "loss": 1.0597, + "step": 4939 + }, + { + "epoch": 0.47, + "grad_norm": 0.33521100047691355, + "learning_rate": 0.00018160983309052671, + "loss": 1.0351, + "step": 4940 + }, + { + "epoch": 0.47, + "grad_norm": 0.27937510049751796, + "learning_rate": 0.00018160068977900293, + "loss": 1.0309, + "step": 4941 + }, + { + "epoch": 0.47, + "grad_norm": 0.30607280379755825, + "learning_rate": 0.00018159154442537058, + "loss": 1.0614, + "step": 4942 + }, + { + "epoch": 0.47, + "grad_norm": 0.2942443003492716, + "learning_rate": 0.0001815823970298586, + "loss": 1.0296, + "step": 4943 + }, + { + "epoch": 0.47, + "grad_norm": 0.27168124667388854, + "learning_rate": 0.00018157324759269583, + "loss": 1.2025, + "step": 4944 + }, + { + "epoch": 0.47, + "grad_norm": 0.3040911262976451, + "learning_rate": 0.00018156409611411127, + "loss": 1.1002, + "step": 4945 + }, + { + "epoch": 0.47, + "grad_norm": 0.2932905809708139, + "learning_rate": 0.00018155494259433397, + "loss": 1.1174, + "step": 4946 + }, + { + "epoch": 0.47, + "grad_norm": 0.27057969270876014, + "learning_rate": 0.00018154578703359294, + "loss": 1.1873, + "step": 4947 + }, + { + "epoch": 0.47, + "grad_norm": 0.23291125191630732, + "learning_rate": 0.00018153662943211737, + "loss": 1.0286, + "step": 4948 + }, + { + "epoch": 0.47, + "grad_norm": 0.2935414983637233, + "learning_rate": 0.00018152746979013638, + "loss": 1.0808, + "step": 4949 + }, + { + "epoch": 0.47, + "grad_norm": 0.28416378913000195, + "learning_rate": 0.00018151830810787925, + "loss": 1.1802, + "step": 4950 + }, + { + "epoch": 0.47, + "grad_norm": 0.2804898105323499, + "learning_rate": 0.00018150914438557522, + "loss": 1.0825, + "step": 4951 + }, + { + "epoch": 0.47, + "grad_norm": 0.2908103000886573, + "learning_rate": 0.0001814999786234536, + "loss": 1.1133, + "step": 4952 + }, + { + "epoch": 0.47, + "grad_norm": 0.25695695427126014, + "learning_rate": 0.0001814908108217438, + "loss": 1.1093, + "step": 4953 + }, + { + "epoch": 0.47, + "grad_norm": 0.27219006337780743, + "learning_rate": 0.0001814816409806753, + "loss": 1.1098, + "step": 4954 + }, + { + "epoch": 0.47, + "grad_norm": 0.3013175495467479, + "learning_rate": 0.00018147246910047747, + "loss": 1.0026, + "step": 4955 + }, + { + "epoch": 0.47, + "grad_norm": 0.24847902288191978, + "learning_rate": 0.0001814632951813799, + "loss": 0.999, + "step": 4956 + }, + { + "epoch": 0.47, + "grad_norm": 0.2691757208602004, + "learning_rate": 0.00018145411922361219, + "loss": 1.0259, + "step": 4957 + }, + { + "epoch": 0.47, + "grad_norm": 0.2601243300778845, + "learning_rate": 0.00018144494122740394, + "loss": 0.9758, + "step": 4958 + }, + { + "epoch": 0.47, + "grad_norm": 0.27967276767781657, + "learning_rate": 0.00018143576119298484, + "loss": 1.0271, + "step": 4959 + }, + { + "epoch": 0.47, + "grad_norm": 0.3093700281906271, + "learning_rate": 0.00018142657912058465, + "loss": 0.9627, + "step": 4960 + }, + { + "epoch": 0.47, + "grad_norm": 0.2951333334350976, + "learning_rate": 0.00018141739501043315, + "loss": 1.0788, + "step": 4961 + }, + { + "epoch": 0.47, + "grad_norm": 0.2754995868240653, + "learning_rate": 0.00018140820886276018, + "loss": 0.987, + "step": 4962 + }, + { + "epoch": 0.47, + "grad_norm": 0.2650597795296595, + "learning_rate": 0.0001813990206777956, + "loss": 1.108, + "step": 4963 + }, + { + "epoch": 0.47, + "grad_norm": 0.2680629620506122, + "learning_rate": 0.00018138983045576937, + "loss": 1.0824, + "step": 4964 + }, + { + "epoch": 0.48, + "grad_norm": 0.2745197902744774, + "learning_rate": 0.00018138063819691147, + "loss": 1.087, + "step": 4965 + }, + { + "epoch": 0.48, + "grad_norm": 0.2951908263527352, + "learning_rate": 0.00018137144390145194, + "loss": 1.0986, + "step": 4966 + }, + { + "epoch": 0.48, + "grad_norm": 0.3006255204787449, + "learning_rate": 0.00018136224756962093, + "loss": 1.0538, + "step": 4967 + }, + { + "epoch": 0.48, + "grad_norm": 0.28348543906022167, + "learning_rate": 0.00018135304920164854, + "loss": 1.0905, + "step": 4968 + }, + { + "epoch": 0.48, + "grad_norm": 0.2572498672154097, + "learning_rate": 0.00018134384879776497, + "loss": 1.1133, + "step": 4969 + }, + { + "epoch": 0.48, + "grad_norm": 0.30267258785481194, + "learning_rate": 0.00018133464635820042, + "loss": 1.1097, + "step": 4970 + }, + { + "epoch": 0.48, + "grad_norm": 0.2645184466065096, + "learning_rate": 0.00018132544188318526, + "loss": 1.111, + "step": 4971 + }, + { + "epoch": 0.48, + "grad_norm": 0.2981463331584167, + "learning_rate": 0.0001813162353729498, + "loss": 1.0594, + "step": 4972 + }, + { + "epoch": 0.48, + "grad_norm": 0.3028092831683072, + "learning_rate": 0.0001813070268277244, + "loss": 1.227, + "step": 4973 + }, + { + "epoch": 0.48, + "grad_norm": 0.276916640360599, + "learning_rate": 0.00018129781624773961, + "loss": 1.1563, + "step": 4974 + }, + { + "epoch": 0.48, + "grad_norm": 0.26497515296266566, + "learning_rate": 0.00018128860363322586, + "loss": 1.0489, + "step": 4975 + }, + { + "epoch": 0.48, + "grad_norm": 0.2870089379191648, + "learning_rate": 0.00018127938898441373, + "loss": 1.1085, + "step": 4976 + }, + { + "epoch": 0.48, + "grad_norm": 0.2804072923871364, + "learning_rate": 0.00018127017230153378, + "loss": 1.1697, + "step": 4977 + }, + { + "epoch": 0.48, + "grad_norm": 0.26600268228720364, + "learning_rate": 0.0001812609535848167, + "loss": 1.0651, + "step": 4978 + }, + { + "epoch": 0.48, + "grad_norm": 0.3017636575581846, + "learning_rate": 0.0001812517328344932, + "loss": 1.0479, + "step": 4979 + }, + { + "epoch": 0.48, + "grad_norm": 0.2848522851547357, + "learning_rate": 0.000181242510050794, + "loss": 1.0961, + "step": 4980 + }, + { + "epoch": 0.48, + "grad_norm": 0.29706092141646745, + "learning_rate": 0.00018123328523394992, + "loss": 1.0572, + "step": 4981 + }, + { + "epoch": 0.48, + "grad_norm": 0.24623214065631238, + "learning_rate": 0.00018122405838419186, + "loss": 1.0209, + "step": 4982 + }, + { + "epoch": 0.48, + "grad_norm": 0.27148089077495985, + "learning_rate": 0.00018121482950175067, + "loss": 1.0425, + "step": 4983 + }, + { + "epoch": 0.48, + "grad_norm": 0.27374692095758846, + "learning_rate": 0.00018120559858685734, + "loss": 1.0984, + "step": 4984 + }, + { + "epoch": 0.48, + "grad_norm": 0.28314322190570707, + "learning_rate": 0.00018119636563974285, + "loss": 1.0245, + "step": 4985 + }, + { + "epoch": 0.48, + "grad_norm": 0.27988607306147967, + "learning_rate": 0.0001811871306606383, + "loss": 1.0786, + "step": 4986 + }, + { + "epoch": 0.48, + "grad_norm": 0.28485376383594274, + "learning_rate": 0.0001811778936497748, + "loss": 1.0639, + "step": 4987 + }, + { + "epoch": 0.48, + "grad_norm": 0.23607692697159477, + "learning_rate": 0.00018116865460738343, + "loss": 1.0506, + "step": 4988 + }, + { + "epoch": 0.48, + "grad_norm": 0.29135188062241957, + "learning_rate": 0.0001811594135336955, + "loss": 1.0689, + "step": 4989 + }, + { + "epoch": 0.48, + "grad_norm": 0.26704486335325794, + "learning_rate": 0.00018115017042894227, + "loss": 1.1443, + "step": 4990 + }, + { + "epoch": 0.48, + "grad_norm": 0.26331319852301815, + "learning_rate": 0.00018114092529335497, + "loss": 1.0934, + "step": 4991 + }, + { + "epoch": 0.48, + "grad_norm": 0.26377752564879786, + "learning_rate": 0.00018113167812716506, + "loss": 0.9711, + "step": 4992 + }, + { + "epoch": 0.48, + "grad_norm": 0.26534776082769385, + "learning_rate": 0.0001811224289306039, + "loss": 1.0002, + "step": 4993 + }, + { + "epoch": 0.48, + "grad_norm": 0.2711195707498488, + "learning_rate": 0.00018111317770390297, + "loss": 1.2213, + "step": 4994 + }, + { + "epoch": 0.48, + "grad_norm": 0.24845192655050088, + "learning_rate": 0.0001811039244472938, + "loss": 1.1603, + "step": 4995 + }, + { + "epoch": 0.48, + "grad_norm": 0.2774236359347863, + "learning_rate": 0.00018109466916100793, + "loss": 1.0153, + "step": 4996 + }, + { + "epoch": 0.48, + "grad_norm": 0.291748829551597, + "learning_rate": 0.000181085411845277, + "loss": 1.139, + "step": 4997 + }, + { + "epoch": 0.48, + "grad_norm": 0.2707487926064819, + "learning_rate": 0.0001810761525003327, + "loss": 0.9485, + "step": 4998 + }, + { + "epoch": 0.48, + "grad_norm": 0.31763719053401285, + "learning_rate": 0.0001810668911264067, + "loss": 1.0691, + "step": 4999 + }, + { + "epoch": 0.48, + "grad_norm": 0.271398772008147, + "learning_rate": 0.00018105762772373086, + "loss": 1.0501, + "step": 5000 + }, + { + "epoch": 0.48, + "grad_norm": 0.2723475184642883, + "learning_rate": 0.00018104836229253688, + "loss": 1.0046, + "step": 5001 + }, + { + "epoch": 0.48, + "grad_norm": 0.23738205025152834, + "learning_rate": 0.00018103909483305672, + "loss": 1.1247, + "step": 5002 + }, + { + "epoch": 0.48, + "grad_norm": 0.28212585900181886, + "learning_rate": 0.00018102982534552226, + "loss": 0.966, + "step": 5003 + }, + { + "epoch": 0.48, + "grad_norm": 0.2923303986208393, + "learning_rate": 0.00018102055383016554, + "loss": 1.177, + "step": 5004 + }, + { + "epoch": 0.48, + "grad_norm": 0.261063011725201, + "learning_rate": 0.0001810112802872185, + "loss": 0.9956, + "step": 5005 + }, + { + "epoch": 0.48, + "grad_norm": 0.28452157947524576, + "learning_rate": 0.0001810020047169133, + "loss": 1.1531, + "step": 5006 + }, + { + "epoch": 0.48, + "grad_norm": 0.30435514362331084, + "learning_rate": 0.00018099272711948197, + "loss": 1.101, + "step": 5007 + }, + { + "epoch": 0.48, + "grad_norm": 0.3171881336122615, + "learning_rate": 0.0001809834474951568, + "loss": 1.1477, + "step": 5008 + }, + { + "epoch": 0.48, + "grad_norm": 0.3021432074002858, + "learning_rate": 0.00018097416584416992, + "loss": 1.1089, + "step": 5009 + }, + { + "epoch": 0.48, + "grad_norm": 0.2584355712285805, + "learning_rate": 0.00018096488216675364, + "loss": 1.0171, + "step": 5010 + }, + { + "epoch": 0.48, + "grad_norm": 0.2784662384579421, + "learning_rate": 0.00018095559646314033, + "loss": 1.1563, + "step": 5011 + }, + { + "epoch": 0.48, + "grad_norm": 0.2590883277277557, + "learning_rate": 0.00018094630873356234, + "loss": 0.9934, + "step": 5012 + }, + { + "epoch": 0.48, + "grad_norm": 0.31013602501461507, + "learning_rate": 0.0001809370189782521, + "loss": 1.0347, + "step": 5013 + }, + { + "epoch": 0.48, + "grad_norm": 0.2908179871344749, + "learning_rate": 0.00018092772719744207, + "loss": 1.1276, + "step": 5014 + }, + { + "epoch": 0.48, + "grad_norm": 0.27894866086973424, + "learning_rate": 0.0001809184333913648, + "loss": 1.1603, + "step": 5015 + }, + { + "epoch": 0.48, + "grad_norm": 0.30799725337128564, + "learning_rate": 0.0001809091375602529, + "loss": 0.9902, + "step": 5016 + }, + { + "epoch": 0.48, + "grad_norm": 0.26185924608898803, + "learning_rate": 0.00018089983970433896, + "loss": 1.0665, + "step": 5017 + }, + { + "epoch": 0.48, + "grad_norm": 0.23371144120926612, + "learning_rate": 0.0001808905398238557, + "loss": 1.0455, + "step": 5018 + }, + { + "epoch": 0.48, + "grad_norm": 0.27647347653440757, + "learning_rate": 0.00018088123791903588, + "loss": 1.1008, + "step": 5019 + }, + { + "epoch": 0.48, + "grad_norm": 0.2619795224095397, + "learning_rate": 0.0001808719339901122, + "loss": 1.111, + "step": 5020 + }, + { + "epoch": 0.48, + "grad_norm": 0.28812383140009523, + "learning_rate": 0.00018086262803731758, + "loss": 1.1625, + "step": 5021 + }, + { + "epoch": 0.48, + "grad_norm": 0.29790943849759494, + "learning_rate": 0.00018085332006088486, + "loss": 1.0231, + "step": 5022 + }, + { + "epoch": 0.48, + "grad_norm": 0.2584586663721738, + "learning_rate": 0.00018084401006104699, + "loss": 0.9985, + "step": 5023 + }, + { + "epoch": 0.48, + "grad_norm": 0.3204107870274487, + "learning_rate": 0.00018083469803803696, + "loss": 1.0371, + "step": 5024 + }, + { + "epoch": 0.48, + "grad_norm": 0.32851014719630056, + "learning_rate": 0.0001808253839920878, + "loss": 1.0865, + "step": 5025 + }, + { + "epoch": 0.48, + "grad_norm": 0.28813789079186414, + "learning_rate": 0.00018081606792343262, + "loss": 1.0732, + "step": 5026 + }, + { + "epoch": 0.48, + "grad_norm": 0.2778577229940317, + "learning_rate": 0.00018080674983230455, + "loss": 0.9632, + "step": 5027 + }, + { + "epoch": 0.48, + "grad_norm": 0.25371536354442387, + "learning_rate": 0.00018079742971893677, + "loss": 1.0637, + "step": 5028 + }, + { + "epoch": 0.48, + "grad_norm": 0.2608878940566144, + "learning_rate": 0.00018078810758356256, + "loss": 1.1195, + "step": 5029 + }, + { + "epoch": 0.48, + "grad_norm": 0.2880007088671419, + "learning_rate": 0.00018077878342641514, + "loss": 1.0541, + "step": 5030 + }, + { + "epoch": 0.48, + "grad_norm": 0.27719099372774664, + "learning_rate": 0.0001807694572477279, + "loss": 1.14, + "step": 5031 + }, + { + "epoch": 0.48, + "grad_norm": 0.26236080400560896, + "learning_rate": 0.00018076012904773427, + "loss": 1.0992, + "step": 5032 + }, + { + "epoch": 0.48, + "grad_norm": 0.28662749205806315, + "learning_rate": 0.00018075079882666763, + "loss": 1.1681, + "step": 5033 + }, + { + "epoch": 0.48, + "grad_norm": 0.24357430650710507, + "learning_rate": 0.0001807414665847615, + "loss": 1.0774, + "step": 5034 + }, + { + "epoch": 0.48, + "grad_norm": 0.22511154512758563, + "learning_rate": 0.00018073213232224945, + "loss": 0.9704, + "step": 5035 + }, + { + "epoch": 0.48, + "grad_norm": 0.25551533972771157, + "learning_rate": 0.000180722796039365, + "loss": 1.1366, + "step": 5036 + }, + { + "epoch": 0.48, + "grad_norm": 0.27262724569941255, + "learning_rate": 0.0001807134577363419, + "loss": 0.9719, + "step": 5037 + }, + { + "epoch": 0.48, + "grad_norm": 0.2640695671336433, + "learning_rate": 0.00018070411741341377, + "loss": 1.0389, + "step": 5038 + }, + { + "epoch": 0.48, + "grad_norm": 0.25865882131840123, + "learning_rate": 0.00018069477507081438, + "loss": 1.0127, + "step": 5039 + }, + { + "epoch": 0.48, + "grad_norm": 0.3041466915560575, + "learning_rate": 0.00018068543070877752, + "loss": 1.1345, + "step": 5040 + }, + { + "epoch": 0.48, + "grad_norm": 0.280532311208517, + "learning_rate": 0.00018067608432753706, + "loss": 0.98, + "step": 5041 + }, + { + "epoch": 0.48, + "grad_norm": 0.3067608848042604, + "learning_rate": 0.0001806667359273269, + "loss": 1.1467, + "step": 5042 + }, + { + "epoch": 0.48, + "grad_norm": 0.28794591958027166, + "learning_rate": 0.00018065738550838094, + "loss": 1.1362, + "step": 5043 + }, + { + "epoch": 0.48, + "grad_norm": 0.25975883602188704, + "learning_rate": 0.00018064803307093325, + "loss": 1.0846, + "step": 5044 + }, + { + "epoch": 0.48, + "grad_norm": 0.28326599642955586, + "learning_rate": 0.00018063867861521784, + "loss": 1.0675, + "step": 5045 + }, + { + "epoch": 0.48, + "grad_norm": 0.2558529563820642, + "learning_rate": 0.00018062932214146882, + "loss": 1.1281, + "step": 5046 + }, + { + "epoch": 0.48, + "grad_norm": 0.28221647075585127, + "learning_rate": 0.0001806199636499203, + "loss": 1.0499, + "step": 5047 + }, + { + "epoch": 0.48, + "grad_norm": 0.25047524197955967, + "learning_rate": 0.00018061060314080658, + "loss": 1.1112, + "step": 5048 + }, + { + "epoch": 0.48, + "grad_norm": 0.25460102742828555, + "learning_rate": 0.00018060124061436184, + "loss": 1.139, + "step": 5049 + }, + { + "epoch": 0.48, + "grad_norm": 0.2684967059066573, + "learning_rate": 0.00018059187607082037, + "loss": 1.0121, + "step": 5050 + }, + { + "epoch": 0.48, + "grad_norm": 0.2740879178050141, + "learning_rate": 0.00018058250951041656, + "loss": 1.2213, + "step": 5051 + }, + { + "epoch": 0.48, + "grad_norm": 0.23814111073074376, + "learning_rate": 0.0001805731409333848, + "loss": 0.946, + "step": 5052 + }, + { + "epoch": 0.48, + "grad_norm": 0.2846550890405717, + "learning_rate": 0.00018056377033995959, + "loss": 1.0843, + "step": 5053 + }, + { + "epoch": 0.48, + "grad_norm": 0.26517227533600335, + "learning_rate": 0.00018055439773037536, + "loss": 1.0066, + "step": 5054 + }, + { + "epoch": 0.48, + "grad_norm": 0.27053031387998794, + "learning_rate": 0.0001805450231048667, + "loss": 0.9459, + "step": 5055 + }, + { + "epoch": 0.48, + "grad_norm": 0.3019675939625031, + "learning_rate": 0.00018053564646366822, + "loss": 1.2236, + "step": 5056 + }, + { + "epoch": 0.48, + "grad_norm": 0.25798882391808686, + "learning_rate": 0.00018052626780701457, + "loss": 1.1254, + "step": 5057 + }, + { + "epoch": 0.48, + "grad_norm": 0.27311813604505014, + "learning_rate": 0.00018051688713514047, + "loss": 1.0087, + "step": 5058 + }, + { + "epoch": 0.48, + "grad_norm": 0.24271338025668723, + "learning_rate": 0.00018050750444828067, + "loss": 1.0985, + "step": 5059 + }, + { + "epoch": 0.48, + "grad_norm": 0.28513204364807454, + "learning_rate": 0.00018049811974666996, + "loss": 0.968, + "step": 5060 + }, + { + "epoch": 0.48, + "grad_norm": 0.2684639300431927, + "learning_rate": 0.00018048873303054324, + "loss": 1.0187, + "step": 5061 + }, + { + "epoch": 0.48, + "grad_norm": 0.28668337168219155, + "learning_rate": 0.00018047934430013535, + "loss": 1.1447, + "step": 5062 + }, + { + "epoch": 0.48, + "grad_norm": 0.2526134197715635, + "learning_rate": 0.0001804699535556813, + "loss": 1.1222, + "step": 5063 + }, + { + "epoch": 0.48, + "grad_norm": 0.2734155691633195, + "learning_rate": 0.0001804605607974161, + "loss": 1.0668, + "step": 5064 + }, + { + "epoch": 0.48, + "grad_norm": 0.29009495050980677, + "learning_rate": 0.0001804511660255748, + "loss": 1.143, + "step": 5065 + }, + { + "epoch": 0.48, + "grad_norm": 0.24381730481982744, + "learning_rate": 0.0001804417692403925, + "loss": 1.2, + "step": 5066 + }, + { + "epoch": 0.48, + "grad_norm": 0.28667698702842986, + "learning_rate": 0.00018043237044210438, + "loss": 0.9782, + "step": 5067 + }, + { + "epoch": 0.48, + "grad_norm": 0.2290214469722657, + "learning_rate": 0.00018042296963094562, + "loss": 1.0481, + "step": 5068 + }, + { + "epoch": 0.48, + "grad_norm": 0.25712013816908574, + "learning_rate": 0.00018041356680715152, + "loss": 1.0808, + "step": 5069 + }, + { + "epoch": 0.49, + "grad_norm": 0.27191198243103304, + "learning_rate": 0.00018040416197095737, + "loss": 1.0839, + "step": 5070 + }, + { + "epoch": 0.49, + "grad_norm": 0.2896020255383838, + "learning_rate": 0.00018039475512259855, + "loss": 1.0719, + "step": 5071 + }, + { + "epoch": 0.49, + "grad_norm": 0.2722755649971534, + "learning_rate": 0.0001803853462623104, + "loss": 1.111, + "step": 5072 + }, + { + "epoch": 0.49, + "grad_norm": 0.296096353606665, + "learning_rate": 0.0001803759353903285, + "loss": 1.1522, + "step": 5073 + }, + { + "epoch": 0.49, + "grad_norm": 0.2516486412186821, + "learning_rate": 0.0001803665225068883, + "loss": 1.0666, + "step": 5074 + }, + { + "epoch": 0.49, + "grad_norm": 0.28162660247716065, + "learning_rate": 0.00018035710761222533, + "loss": 1.0581, + "step": 5075 + }, + { + "epoch": 0.49, + "grad_norm": 0.2573369100267432, + "learning_rate": 0.00018034769070657524, + "loss": 1.0313, + "step": 5076 + }, + { + "epoch": 0.49, + "grad_norm": 0.25964129622911775, + "learning_rate": 0.00018033827179017372, + "loss": 1.0417, + "step": 5077 + }, + { + "epoch": 0.49, + "grad_norm": 0.2584869469852085, + "learning_rate": 0.00018032885086325645, + "loss": 1.0705, + "step": 5078 + }, + { + "epoch": 0.49, + "grad_norm": 0.2803573003244443, + "learning_rate": 0.0001803194279260592, + "loss": 1.1134, + "step": 5079 + }, + { + "epoch": 0.49, + "grad_norm": 0.25948244844278057, + "learning_rate": 0.00018031000297881778, + "loss": 1.1251, + "step": 5080 + }, + { + "epoch": 0.49, + "grad_norm": 0.27477615836677105, + "learning_rate": 0.00018030057602176806, + "loss": 1.0987, + "step": 5081 + }, + { + "epoch": 0.49, + "grad_norm": 0.2949596444499225, + "learning_rate": 0.00018029114705514596, + "loss": 1.1267, + "step": 5082 + }, + { + "epoch": 0.49, + "grad_norm": 0.3126152488004548, + "learning_rate": 0.00018028171607918747, + "loss": 1.1043, + "step": 5083 + }, + { + "epoch": 0.49, + "grad_norm": 0.3158944201979104, + "learning_rate": 0.00018027228309412853, + "loss": 1.1333, + "step": 5084 + }, + { + "epoch": 0.49, + "grad_norm": 0.23721139976987007, + "learning_rate": 0.00018026284810020532, + "loss": 1.0958, + "step": 5085 + }, + { + "epoch": 0.49, + "grad_norm": 0.2620591461537738, + "learning_rate": 0.00018025341109765384, + "loss": 1.0396, + "step": 5086 + }, + { + "epoch": 0.49, + "grad_norm": 0.29187944723111464, + "learning_rate": 0.00018024397208671035, + "loss": 1.1689, + "step": 5087 + }, + { + "epoch": 0.49, + "grad_norm": 0.2740567842867544, + "learning_rate": 0.000180234531067611, + "loss": 1.0742, + "step": 5088 + }, + { + "epoch": 0.49, + "grad_norm": 0.28997111975839956, + "learning_rate": 0.00018022508804059207, + "loss": 1.1359, + "step": 5089 + }, + { + "epoch": 0.49, + "grad_norm": 0.264169160568814, + "learning_rate": 0.00018021564300588994, + "loss": 0.9856, + "step": 5090 + }, + { + "epoch": 0.49, + "grad_norm": 0.25799277848982, + "learning_rate": 0.0001802061959637409, + "loss": 1.0802, + "step": 5091 + }, + { + "epoch": 0.49, + "grad_norm": 0.27789264617161874, + "learning_rate": 0.0001801967469143814, + "loss": 1.1495, + "step": 5092 + }, + { + "epoch": 0.49, + "grad_norm": 0.284905774011098, + "learning_rate": 0.0001801872958580479, + "loss": 1.1043, + "step": 5093 + }, + { + "epoch": 0.49, + "grad_norm": 0.23840852278323477, + "learning_rate": 0.00018017784279497693, + "loss": 1.0545, + "step": 5094 + }, + { + "epoch": 0.49, + "grad_norm": 0.29365200369529876, + "learning_rate": 0.00018016838772540506, + "loss": 1.0454, + "step": 5095 + }, + { + "epoch": 0.49, + "grad_norm": 0.2905391396194257, + "learning_rate": 0.0001801589306495689, + "loss": 1.0989, + "step": 5096 + }, + { + "epoch": 0.49, + "grad_norm": 0.30717926924064015, + "learning_rate": 0.00018014947156770513, + "loss": 1.0989, + "step": 5097 + }, + { + "epoch": 0.49, + "grad_norm": 0.24633770830565768, + "learning_rate": 0.00018014001048005044, + "loss": 0.9554, + "step": 5098 + }, + { + "epoch": 0.49, + "grad_norm": 0.269877103345373, + "learning_rate": 0.00018013054738684166, + "loss": 1.1133, + "step": 5099 + }, + { + "epoch": 0.49, + "grad_norm": 0.2995678521690841, + "learning_rate": 0.00018012108228831556, + "loss": 1.1077, + "step": 5100 + }, + { + "epoch": 0.49, + "grad_norm": 0.24440039782838777, + "learning_rate": 0.000180111615184709, + "loss": 1.0012, + "step": 5101 + }, + { + "epoch": 0.49, + "grad_norm": 0.27251554211773565, + "learning_rate": 0.00018010214607625894, + "loss": 1.1004, + "step": 5102 + }, + { + "epoch": 0.49, + "grad_norm": 0.2810534155604437, + "learning_rate": 0.0001800926749632023, + "loss": 1.0193, + "step": 5103 + }, + { + "epoch": 0.49, + "grad_norm": 0.27100726353528887, + "learning_rate": 0.0001800832018457762, + "loss": 1.0602, + "step": 5104 + }, + { + "epoch": 0.49, + "grad_norm": 0.26564758624041535, + "learning_rate": 0.00018007372672421756, + "loss": 1.079, + "step": 5105 + }, + { + "epoch": 0.49, + "grad_norm": 0.2974717566929695, + "learning_rate": 0.00018006424959876363, + "loss": 1.1326, + "step": 5106 + }, + { + "epoch": 0.49, + "grad_norm": 0.30488571850565943, + "learning_rate": 0.00018005477046965153, + "loss": 1.0309, + "step": 5107 + }, + { + "epoch": 0.49, + "grad_norm": 0.2145246313949886, + "learning_rate": 0.0001800452893371185, + "loss": 1.0762, + "step": 5108 + }, + { + "epoch": 0.49, + "grad_norm": 0.2636249075639135, + "learning_rate": 0.00018003580620140177, + "loss": 1.153, + "step": 5109 + }, + { + "epoch": 0.49, + "grad_norm": 0.27782368506119703, + "learning_rate": 0.0001800263210627387, + "loss": 1.117, + "step": 5110 + }, + { + "epoch": 0.49, + "grad_norm": 0.3007327930211106, + "learning_rate": 0.00018001683392136666, + "loss": 1.1193, + "step": 5111 + }, + { + "epoch": 0.49, + "grad_norm": 0.28929437068723235, + "learning_rate": 0.00018000734477752306, + "loss": 1.0561, + "step": 5112 + }, + { + "epoch": 0.49, + "grad_norm": 0.28549761815962377, + "learning_rate": 0.00017999785363144536, + "loss": 1.1016, + "step": 5113 + }, + { + "epoch": 0.49, + "grad_norm": 0.2863390229819487, + "learning_rate": 0.0001799883604833711, + "loss": 1.1143, + "step": 5114 + }, + { + "epoch": 0.49, + "grad_norm": 0.2758612911370492, + "learning_rate": 0.00017997886533353786, + "loss": 1.1496, + "step": 5115 + }, + { + "epoch": 0.49, + "grad_norm": 0.26382558236784287, + "learning_rate": 0.00017996936818218324, + "loss": 1.0343, + "step": 5116 + }, + { + "epoch": 0.49, + "grad_norm": 0.2695740841257572, + "learning_rate": 0.00017995986902954493, + "loss": 1.1589, + "step": 5117 + }, + { + "epoch": 0.49, + "grad_norm": 0.315794471443836, + "learning_rate": 0.00017995036787586064, + "loss": 0.9799, + "step": 5118 + }, + { + "epoch": 0.49, + "grad_norm": 0.2707540246275962, + "learning_rate": 0.00017994086472136815, + "loss": 1.089, + "step": 5119 + }, + { + "epoch": 0.49, + "grad_norm": 0.3337912375773768, + "learning_rate": 0.0001799313595663053, + "loss": 1.0462, + "step": 5120 + }, + { + "epoch": 0.49, + "grad_norm": 0.30849482259896643, + "learning_rate": 0.0001799218524109099, + "loss": 0.9913, + "step": 5121 + }, + { + "epoch": 0.49, + "grad_norm": 0.2794905681957685, + "learning_rate": 0.00017991234325541995, + "loss": 1.0798, + "step": 5122 + }, + { + "epoch": 0.49, + "grad_norm": 0.26906629376110763, + "learning_rate": 0.00017990283210007335, + "loss": 0.9918, + "step": 5123 + }, + { + "epoch": 0.49, + "grad_norm": 0.2640810736886688, + "learning_rate": 0.00017989331894510818, + "loss": 1.0668, + "step": 5124 + }, + { + "epoch": 0.49, + "grad_norm": 0.3403000076240741, + "learning_rate": 0.0001798838037907625, + "loss": 1.1871, + "step": 5125 + }, + { + "epoch": 0.49, + "grad_norm": 0.29390271181136557, + "learning_rate": 0.00017987428663727441, + "loss": 1.0834, + "step": 5126 + }, + { + "epoch": 0.49, + "grad_norm": 0.29242522687517936, + "learning_rate": 0.00017986476748488214, + "loss": 0.9395, + "step": 5127 + }, + { + "epoch": 0.49, + "grad_norm": 0.30030389910750893, + "learning_rate": 0.00017985524633382381, + "loss": 1.0064, + "step": 5128 + }, + { + "epoch": 0.49, + "grad_norm": 0.26767260775313606, + "learning_rate": 0.00017984572318433778, + "loss": 1.2037, + "step": 5129 + }, + { + "epoch": 0.49, + "grad_norm": 0.2883783786958614, + "learning_rate": 0.00017983619803666235, + "loss": 1.0901, + "step": 5130 + }, + { + "epoch": 0.49, + "grad_norm": 0.2615336186142248, + "learning_rate": 0.00017982667089103588, + "loss": 1.1084, + "step": 5131 + }, + { + "epoch": 0.49, + "grad_norm": 0.2806392247590465, + "learning_rate": 0.0001798171417476968, + "loss": 1.0532, + "step": 5132 + }, + { + "epoch": 0.49, + "grad_norm": 0.2563049389592253, + "learning_rate": 0.0001798076106068836, + "loss": 1.0789, + "step": 5133 + }, + { + "epoch": 0.49, + "grad_norm": 0.32787128373934044, + "learning_rate": 0.0001797980774688348, + "loss": 1.0862, + "step": 5134 + }, + { + "epoch": 0.49, + "grad_norm": 0.26980316995262976, + "learning_rate": 0.00017978854233378891, + "loss": 1.2006, + "step": 5135 + }, + { + "epoch": 0.49, + "grad_norm": 0.2359678414179539, + "learning_rate": 0.00017977900520198465, + "loss": 0.9747, + "step": 5136 + }, + { + "epoch": 0.49, + "grad_norm": 0.28300881314834403, + "learning_rate": 0.00017976946607366063, + "loss": 1.0696, + "step": 5137 + }, + { + "epoch": 0.49, + "grad_norm": 0.27366802597990486, + "learning_rate": 0.0001797599249490556, + "loss": 0.9662, + "step": 5138 + }, + { + "epoch": 0.49, + "grad_norm": 0.2714308944537401, + "learning_rate": 0.00017975038182840828, + "loss": 1.083, + "step": 5139 + }, + { + "epoch": 0.49, + "grad_norm": 0.2644809976461488, + "learning_rate": 0.00017974083671195757, + "loss": 1.081, + "step": 5140 + }, + { + "epoch": 0.49, + "grad_norm": 0.2813538147596539, + "learning_rate": 0.0001797312895999423, + "loss": 1.1335, + "step": 5141 + }, + { + "epoch": 0.49, + "grad_norm": 0.2910923380431433, + "learning_rate": 0.0001797217404926014, + "loss": 1.0232, + "step": 5142 + }, + { + "epoch": 0.49, + "grad_norm": 0.26419502096815084, + "learning_rate": 0.00017971218939017382, + "loss": 1.106, + "step": 5143 + }, + { + "epoch": 0.49, + "grad_norm": 0.30598831197367454, + "learning_rate": 0.00017970263629289864, + "loss": 1.1303, + "step": 5144 + }, + { + "epoch": 0.49, + "grad_norm": 0.24615417333770134, + "learning_rate": 0.00017969308120101488, + "loss": 0.969, + "step": 5145 + }, + { + "epoch": 0.49, + "grad_norm": 0.2592721166455555, + "learning_rate": 0.00017968352411476166, + "loss": 1.0971, + "step": 5146 + }, + { + "epoch": 0.49, + "grad_norm": 0.2868843022304741, + "learning_rate": 0.00017967396503437816, + "loss": 0.986, + "step": 5147 + }, + { + "epoch": 0.49, + "grad_norm": 0.2936385306501026, + "learning_rate": 0.00017966440396010366, + "loss": 1.0833, + "step": 5148 + }, + { + "epoch": 0.49, + "grad_norm": 0.2947617623317851, + "learning_rate": 0.00017965484089217735, + "loss": 1.0711, + "step": 5149 + }, + { + "epoch": 0.49, + "grad_norm": 0.2628043083883782, + "learning_rate": 0.0001796452758308386, + "loss": 1.1146, + "step": 5150 + }, + { + "epoch": 0.49, + "grad_norm": 0.31452969252198026, + "learning_rate": 0.00017963570877632676, + "loss": 1.1144, + "step": 5151 + }, + { + "epoch": 0.49, + "grad_norm": 0.2596821467826261, + "learning_rate": 0.00017962613972888125, + "loss": 1.0938, + "step": 5152 + }, + { + "epoch": 0.49, + "grad_norm": 0.27739044259675993, + "learning_rate": 0.00017961656868874156, + "loss": 1.0211, + "step": 5153 + }, + { + "epoch": 0.49, + "grad_norm": 0.2937922180821596, + "learning_rate": 0.0001796069956561472, + "loss": 1.0647, + "step": 5154 + }, + { + "epoch": 0.49, + "grad_norm": 0.27965187717460716, + "learning_rate": 0.00017959742063133774, + "loss": 1.0117, + "step": 5155 + }, + { + "epoch": 0.49, + "grad_norm": 0.2840025373723606, + "learning_rate": 0.00017958784361455282, + "loss": 0.9706, + "step": 5156 + }, + { + "epoch": 0.49, + "grad_norm": 0.3178568601522771, + "learning_rate": 0.00017957826460603205, + "loss": 1.018, + "step": 5157 + }, + { + "epoch": 0.49, + "grad_norm": 0.2860032058314914, + "learning_rate": 0.00017956868360601526, + "loss": 1.1566, + "step": 5158 + }, + { + "epoch": 0.49, + "grad_norm": 0.28443581551498176, + "learning_rate": 0.00017955910061474213, + "loss": 1.0321, + "step": 5159 + }, + { + "epoch": 0.49, + "grad_norm": 0.27187438960276544, + "learning_rate": 0.0001795495156324525, + "loss": 1.0505, + "step": 5160 + }, + { + "epoch": 0.49, + "grad_norm": 0.29288869429900377, + "learning_rate": 0.00017953992865938622, + "loss": 1.0511, + "step": 5161 + }, + { + "epoch": 0.49, + "grad_norm": 0.30056027600724244, + "learning_rate": 0.00017953033969578326, + "loss": 1.1062, + "step": 5162 + }, + { + "epoch": 0.49, + "grad_norm": 0.26821431519549427, + "learning_rate": 0.00017952074874188356, + "loss": 1.092, + "step": 5163 + }, + { + "epoch": 0.49, + "grad_norm": 0.24906638557874644, + "learning_rate": 0.00017951115579792717, + "loss": 1.1022, + "step": 5164 + }, + { + "epoch": 0.49, + "grad_norm": 0.26309218013010355, + "learning_rate": 0.0001795015608641541, + "loss": 1.0265, + "step": 5165 + }, + { + "epoch": 0.49, + "grad_norm": 0.26268394260057937, + "learning_rate": 0.00017949196394080453, + "loss": 1.0653, + "step": 5166 + }, + { + "epoch": 0.49, + "grad_norm": 0.2896832205130746, + "learning_rate": 0.00017948236502811859, + "loss": 1.0854, + "step": 5167 + }, + { + "epoch": 0.49, + "grad_norm": 0.2625237394571958, + "learning_rate": 0.00017947276412633652, + "loss": 0.9619, + "step": 5168 + }, + { + "epoch": 0.49, + "grad_norm": 0.27986610289816, + "learning_rate": 0.0001794631612356986, + "loss": 1.1716, + "step": 5169 + }, + { + "epoch": 0.49, + "grad_norm": 0.28211340979754457, + "learning_rate": 0.0001794535563564451, + "loss": 1.1021, + "step": 5170 + }, + { + "epoch": 0.49, + "grad_norm": 0.24926141918189934, + "learning_rate": 0.00017944394948881642, + "loss": 1.056, + "step": 5171 + }, + { + "epoch": 0.49, + "grad_norm": 0.2869730653075155, + "learning_rate": 0.00017943434063305298, + "loss": 0.9492, + "step": 5172 + }, + { + "epoch": 0.49, + "grad_norm": 0.29187379432960536, + "learning_rate": 0.00017942472978939525, + "loss": 1.1833, + "step": 5173 + }, + { + "epoch": 0.5, + "grad_norm": 0.28973741108012707, + "learning_rate": 0.00017941511695808372, + "loss": 1.0489, + "step": 5174 + }, + { + "epoch": 0.5, + "grad_norm": 0.27689121010931406, + "learning_rate": 0.000179405502139359, + "loss": 1.1769, + "step": 5175 + }, + { + "epoch": 0.5, + "grad_norm": 0.3067616467203378, + "learning_rate": 0.00017939588533346168, + "loss": 1.1473, + "step": 5176 + }, + { + "epoch": 0.5, + "grad_norm": 0.2757139090006968, + "learning_rate": 0.0001793862665406324, + "loss": 1.1183, + "step": 5177 + }, + { + "epoch": 0.5, + "grad_norm": 0.27386836919563157, + "learning_rate": 0.00017937664576111198, + "loss": 1.0154, + "step": 5178 + }, + { + "epoch": 0.5, + "grad_norm": 0.2918051909413474, + "learning_rate": 0.00017936702299514105, + "loss": 1.1863, + "step": 5179 + }, + { + "epoch": 0.5, + "grad_norm": 0.2964159774568591, + "learning_rate": 0.00017935739824296052, + "loss": 1.0043, + "step": 5180 + }, + { + "epoch": 0.5, + "grad_norm": 0.31639186034892197, + "learning_rate": 0.0001793477715048112, + "loss": 1.1946, + "step": 5181 + }, + { + "epoch": 0.5, + "grad_norm": 0.3060832768680089, + "learning_rate": 0.00017933814278093407, + "loss": 0.9377, + "step": 5182 + }, + { + "epoch": 0.5, + "grad_norm": 0.3453844813102695, + "learning_rate": 0.00017932851207157002, + "loss": 1.0465, + "step": 5183 + }, + { + "epoch": 0.5, + "grad_norm": 0.25905466276473943, + "learning_rate": 0.0001793188793769601, + "loss": 1.1672, + "step": 5184 + }, + { + "epoch": 0.5, + "grad_norm": 0.29227551232364024, + "learning_rate": 0.00017930924469734537, + "loss": 1.0898, + "step": 5185 + }, + { + "epoch": 0.5, + "grad_norm": 0.2513616979563531, + "learning_rate": 0.00017929960803296697, + "loss": 1.0656, + "step": 5186 + }, + { + "epoch": 0.5, + "grad_norm": 0.275133905403024, + "learning_rate": 0.00017928996938406603, + "loss": 1.002, + "step": 5187 + }, + { + "epoch": 0.5, + "grad_norm": 0.28951906430321944, + "learning_rate": 0.00017928032875088375, + "loss": 1.1197, + "step": 5188 + }, + { + "epoch": 0.5, + "grad_norm": 0.26163177751335326, + "learning_rate": 0.00017927068613366145, + "loss": 1.1226, + "step": 5189 + }, + { + "epoch": 0.5, + "grad_norm": 0.308746805614135, + "learning_rate": 0.00017926104153264042, + "loss": 1.0741, + "step": 5190 + }, + { + "epoch": 0.5, + "grad_norm": 0.29075350653382204, + "learning_rate": 0.00017925139494806198, + "loss": 1.1404, + "step": 5191 + }, + { + "epoch": 0.5, + "grad_norm": 0.2760956753624024, + "learning_rate": 0.0001792417463801676, + "loss": 1.132, + "step": 5192 + }, + { + "epoch": 0.5, + "grad_norm": 0.30467872599141255, + "learning_rate": 0.0001792320958291987, + "loss": 1.1227, + "step": 5193 + }, + { + "epoch": 0.5, + "grad_norm": 0.269599432070437, + "learning_rate": 0.0001792224432953968, + "loss": 1.1915, + "step": 5194 + }, + { + "epoch": 0.5, + "grad_norm": 0.2540692313113988, + "learning_rate": 0.00017921278877900348, + "loss": 1.1215, + "step": 5195 + }, + { + "epoch": 0.5, + "grad_norm": 0.25608501554671015, + "learning_rate": 0.0001792031322802603, + "loss": 1.0529, + "step": 5196 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646934249716702, + "learning_rate": 0.00017919347379940904, + "loss": 1.1269, + "step": 5197 + }, + { + "epoch": 0.5, + "grad_norm": 0.2458734180042231, + "learning_rate": 0.00017918381333669126, + "loss": 1.0294, + "step": 5198 + }, + { + "epoch": 0.5, + "grad_norm": 0.2629321130779039, + "learning_rate": 0.0001791741508923488, + "loss": 1.0545, + "step": 5199 + }, + { + "epoch": 0.5, + "grad_norm": 0.29932796860263766, + "learning_rate": 0.00017916448646662346, + "loss": 1.1029, + "step": 5200 + }, + { + "epoch": 0.5, + "grad_norm": 0.2794636942689881, + "learning_rate": 0.00017915482005975708, + "loss": 0.9605, + "step": 5201 + }, + { + "epoch": 0.5, + "grad_norm": 0.2805023432544276, + "learning_rate": 0.00017914515167199158, + "loss": 1.0897, + "step": 5202 + }, + { + "epoch": 0.5, + "grad_norm": 0.25784189464219454, + "learning_rate": 0.00017913548130356894, + "loss": 1.013, + "step": 5203 + }, + { + "epoch": 0.5, + "grad_norm": 0.31553870897854386, + "learning_rate": 0.00017912580895473114, + "loss": 1.1689, + "step": 5204 + }, + { + "epoch": 0.5, + "grad_norm": 0.31827443288264134, + "learning_rate": 0.00017911613462572024, + "loss": 1.0521, + "step": 5205 + }, + { + "epoch": 0.5, + "grad_norm": 0.3025879844146086, + "learning_rate": 0.00017910645831677836, + "loss": 0.997, + "step": 5206 + }, + { + "epoch": 0.5, + "grad_norm": 0.26057117438515826, + "learning_rate": 0.0001790967800281476, + "loss": 1.1074, + "step": 5207 + }, + { + "epoch": 0.5, + "grad_norm": 0.2611725357612145, + "learning_rate": 0.00017908709976007024, + "loss": 1.0784, + "step": 5208 + }, + { + "epoch": 0.5, + "grad_norm": 0.2344376304389919, + "learning_rate": 0.0001790774175127885, + "loss": 1.0707, + "step": 5209 + }, + { + "epoch": 0.5, + "grad_norm": 0.29122733484693347, + "learning_rate": 0.00017906773328654472, + "loss": 1.1213, + "step": 5210 + }, + { + "epoch": 0.5, + "grad_norm": 0.2759187534926237, + "learning_rate": 0.00017905804708158118, + "loss": 1.0693, + "step": 5211 + }, + { + "epoch": 0.5, + "grad_norm": 0.2777254558995927, + "learning_rate": 0.00017904835889814033, + "loss": 1.0366, + "step": 5212 + }, + { + "epoch": 0.5, + "grad_norm": 0.2714206402525035, + "learning_rate": 0.00017903866873646463, + "loss": 1.1107, + "step": 5213 + }, + { + "epoch": 0.5, + "grad_norm": 0.3012099609994761, + "learning_rate": 0.0001790289765967966, + "loss": 1.0848, + "step": 5214 + }, + { + "epoch": 0.5, + "grad_norm": 0.3032845339156945, + "learning_rate": 0.00017901928247937872, + "loss": 1.1453, + "step": 5215 + }, + { + "epoch": 0.5, + "grad_norm": 0.26396959858562596, + "learning_rate": 0.00017900958638445365, + "loss": 1.1375, + "step": 5216 + }, + { + "epoch": 0.5, + "grad_norm": 0.2727644440686032, + "learning_rate": 0.00017899988831226402, + "loss": 1.0937, + "step": 5217 + }, + { + "epoch": 0.5, + "grad_norm": 0.3053107974870675, + "learning_rate": 0.00017899018826305252, + "loss": 1.1792, + "step": 5218 + }, + { + "epoch": 0.5, + "grad_norm": 0.26793488874015303, + "learning_rate": 0.00017898048623706195, + "loss": 1.1571, + "step": 5219 + }, + { + "epoch": 0.5, + "grad_norm": 0.2597411208953679, + "learning_rate": 0.00017897078223453504, + "loss": 1.052, + "step": 5220 + }, + { + "epoch": 0.5, + "grad_norm": 0.31195341891857165, + "learning_rate": 0.0001789610762557147, + "loss": 1.0902, + "step": 5221 + }, + { + "epoch": 0.5, + "grad_norm": 0.2744676529459866, + "learning_rate": 0.0001789513683008438, + "loss": 1.0829, + "step": 5222 + }, + { + "epoch": 0.5, + "grad_norm": 0.26093271474594537, + "learning_rate": 0.00017894165837016528, + "loss": 0.9847, + "step": 5223 + }, + { + "epoch": 0.5, + "grad_norm": 0.290962254361299, + "learning_rate": 0.00017893194646392214, + "loss": 1.0686, + "step": 5224 + }, + { + "epoch": 0.5, + "grad_norm": 0.26375436627341603, + "learning_rate": 0.00017892223258235746, + "loss": 1.1454, + "step": 5225 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646502633063915, + "learning_rate": 0.00017891251672571428, + "loss": 1.0035, + "step": 5226 + }, + { + "epoch": 0.5, + "eval_loss": 1.130812406539917, + "eval_runtime": 4229.0908, + "eval_samples_per_second": 19.772, + "eval_steps_per_second": 2.472, + "step": 5226 + }, + { + "epoch": 0.5, + "grad_norm": 0.27086856649700053, + "learning_rate": 0.00017890279889423577, + "loss": 1.2004, + "step": 5227 + }, + { + "epoch": 0.5, + "grad_norm": 0.26907032517761026, + "learning_rate": 0.00017889307908816514, + "loss": 1.1269, + "step": 5228 + }, + { + "epoch": 0.5, + "grad_norm": 0.2559621504798168, + "learning_rate": 0.00017888335730774563, + "loss": 1.0099, + "step": 5229 + }, + { + "epoch": 0.5, + "grad_norm": 0.2886629743550709, + "learning_rate": 0.00017887363355322054, + "loss": 1.0698, + "step": 5230 + }, + { + "epoch": 0.5, + "grad_norm": 0.3107523248743581, + "learning_rate": 0.00017886390782483318, + "loss": 1.086, + "step": 5231 + }, + { + "epoch": 0.5, + "grad_norm": 0.2731347287540774, + "learning_rate": 0.00017885418012282696, + "loss": 1.1274, + "step": 5232 + }, + { + "epoch": 0.5, + "grad_norm": 0.3032030305104551, + "learning_rate": 0.00017884445044744532, + "loss": 1.0157, + "step": 5233 + }, + { + "epoch": 0.5, + "grad_norm": 0.29204175835395896, + "learning_rate": 0.00017883471879893176, + "loss": 1.0925, + "step": 5234 + }, + { + "epoch": 0.5, + "grad_norm": 0.27875381267269567, + "learning_rate": 0.00017882498517752984, + "loss": 1.0, + "step": 5235 + }, + { + "epoch": 0.5, + "grad_norm": 0.2744649218236222, + "learning_rate": 0.00017881524958348311, + "loss": 1.1309, + "step": 5236 + }, + { + "epoch": 0.5, + "grad_norm": 0.2638597475413694, + "learning_rate": 0.00017880551201703522, + "loss": 1.1368, + "step": 5237 + }, + { + "epoch": 0.5, + "grad_norm": 0.29740497162072044, + "learning_rate": 0.00017879577247842984, + "loss": 1.163, + "step": 5238 + }, + { + "epoch": 0.5, + "grad_norm": 0.2792113343183714, + "learning_rate": 0.00017878603096791078, + "loss": 0.9453, + "step": 5239 + }, + { + "epoch": 0.5, + "grad_norm": 0.3252881204864866, + "learning_rate": 0.00017877628748572176, + "loss": 0.9687, + "step": 5240 + }, + { + "epoch": 0.5, + "grad_norm": 0.2993197408644714, + "learning_rate": 0.00017876654203210666, + "loss": 1.1889, + "step": 5241 + }, + { + "epoch": 0.5, + "grad_norm": 0.25588679975602296, + "learning_rate": 0.0001787567946073093, + "loss": 1.0073, + "step": 5242 + }, + { + "epoch": 0.5, + "grad_norm": 0.25844292628782845, + "learning_rate": 0.00017874704521157368, + "loss": 1.1023, + "step": 5243 + }, + { + "epoch": 0.5, + "grad_norm": 0.2701484124438951, + "learning_rate": 0.00017873729384514374, + "loss": 1.1446, + "step": 5244 + }, + { + "epoch": 0.5, + "grad_norm": 0.28163616631177285, + "learning_rate": 0.00017872754050826358, + "loss": 1.1451, + "step": 5245 + }, + { + "epoch": 0.5, + "grad_norm": 0.24357842637206142, + "learning_rate": 0.00017871778520117722, + "loss": 1.0879, + "step": 5246 + }, + { + "epoch": 0.5, + "grad_norm": 0.2854538423239079, + "learning_rate": 0.0001787080279241288, + "loss": 1.1346, + "step": 5247 + }, + { + "epoch": 0.5, + "grad_norm": 0.28966646255088585, + "learning_rate": 0.00017869826867736253, + "loss": 1.1342, + "step": 5248 + }, + { + "epoch": 0.5, + "grad_norm": 0.269137927932761, + "learning_rate": 0.0001786885074611226, + "loss": 1.2118, + "step": 5249 + }, + { + "epoch": 0.5, + "grad_norm": 0.24965315368924026, + "learning_rate": 0.00017867874427565336, + "loss": 1.0445, + "step": 5250 + }, + { + "epoch": 0.5, + "grad_norm": 0.3268586937562193, + "learning_rate": 0.00017866897912119907, + "loss": 1.0391, + "step": 5251 + }, + { + "epoch": 0.5, + "grad_norm": 0.2628697700403388, + "learning_rate": 0.00017865921199800415, + "loss": 1.0788, + "step": 5252 + }, + { + "epoch": 0.5, + "grad_norm": 0.25466346365515374, + "learning_rate": 0.00017864944290631301, + "loss": 1.1108, + "step": 5253 + }, + { + "epoch": 0.5, + "grad_norm": 0.2785430342878654, + "learning_rate": 0.00017863967184637014, + "loss": 0.9847, + "step": 5254 + }, + { + "epoch": 0.5, + "grad_norm": 0.2821653592665947, + "learning_rate": 0.00017862989881842003, + "loss": 1.1659, + "step": 5255 + }, + { + "epoch": 0.5, + "grad_norm": 0.28381789349374786, + "learning_rate": 0.0001786201238227073, + "loss": 1.0368, + "step": 5256 + }, + { + "epoch": 0.5, + "grad_norm": 0.28526194687906037, + "learning_rate": 0.00017861034685947658, + "loss": 1.0789, + "step": 5257 + }, + { + "epoch": 0.5, + "grad_norm": 0.2646173880017078, + "learning_rate": 0.0001786005679289725, + "loss": 1.0564, + "step": 5258 + }, + { + "epoch": 0.5, + "grad_norm": 0.23710824870546326, + "learning_rate": 0.0001785907870314398, + "loss": 0.9625, + "step": 5259 + }, + { + "epoch": 0.5, + "grad_norm": 0.26259679119084095, + "learning_rate": 0.0001785810041671233, + "loss": 1.1021, + "step": 5260 + }, + { + "epoch": 0.5, + "grad_norm": 0.2709420502635169, + "learning_rate": 0.00017857121933626777, + "loss": 1.0062, + "step": 5261 + }, + { + "epoch": 0.5, + "grad_norm": 0.2613609827266697, + "learning_rate": 0.0001785614325391181, + "loss": 1.0678, + "step": 5262 + }, + { + "epoch": 0.5, + "grad_norm": 0.28813131561794453, + "learning_rate": 0.00017855164377591918, + "loss": 0.9172, + "step": 5263 + }, + { + "epoch": 0.5, + "grad_norm": 0.27234699286481917, + "learning_rate": 0.000178541853046916, + "loss": 1.1471, + "step": 5264 + }, + { + "epoch": 0.5, + "grad_norm": 0.2506386238760721, + "learning_rate": 0.0001785320603523536, + "loss": 1.0259, + "step": 5265 + }, + { + "epoch": 0.5, + "grad_norm": 0.2540092486293724, + "learning_rate": 0.00017852226569247708, + "loss": 1.0877, + "step": 5266 + }, + { + "epoch": 0.5, + "grad_norm": 0.29602747120892153, + "learning_rate": 0.00017851246906753145, + "loss": 1.109, + "step": 5267 + }, + { + "epoch": 0.5, + "grad_norm": 0.2992977885795131, + "learning_rate": 0.00017850267047776197, + "loss": 1.0069, + "step": 5268 + }, + { + "epoch": 0.5, + "grad_norm": 0.28770619817428295, + "learning_rate": 0.0001784928699234138, + "loss": 1.0456, + "step": 5269 + }, + { + "epoch": 0.5, + "grad_norm": 0.2621648485300015, + "learning_rate": 0.00017848306740473227, + "loss": 1.0159, + "step": 5270 + }, + { + "epoch": 0.5, + "grad_norm": 0.27475331658348695, + "learning_rate": 0.00017847326292196261, + "loss": 1.026, + "step": 5271 + }, + { + "epoch": 0.5, + "grad_norm": 0.2883171040107314, + "learning_rate": 0.00017846345647535026, + "loss": 1.0636, + "step": 5272 + }, + { + "epoch": 0.5, + "grad_norm": 0.2679836068828528, + "learning_rate": 0.0001784536480651406, + "loss": 1.1411, + "step": 5273 + }, + { + "epoch": 0.5, + "grad_norm": 0.2718315189359781, + "learning_rate": 0.00017844383769157905, + "loss": 1.0714, + "step": 5274 + }, + { + "epoch": 0.5, + "grad_norm": 0.2679104057163504, + "learning_rate": 0.0001784340253549112, + "loss": 1.0169, + "step": 5275 + }, + { + "epoch": 0.5, + "grad_norm": 0.28568786629626025, + "learning_rate": 0.00017842421105538256, + "loss": 1.0771, + "step": 5276 + }, + { + "epoch": 0.5, + "grad_norm": 0.2548173055279772, + "learning_rate": 0.00017841439479323877, + "loss": 1.1359, + "step": 5277 + }, + { + "epoch": 0.5, + "grad_norm": 0.24999339748919044, + "learning_rate": 0.00017840457656872544, + "loss": 1.0132, + "step": 5278 + }, + { + "epoch": 0.51, + "grad_norm": 0.3003717665696886, + "learning_rate": 0.00017839475638208832, + "loss": 1.0838, + "step": 5279 + }, + { + "epoch": 0.51, + "grad_norm": 0.31887733553865744, + "learning_rate": 0.00017838493423357314, + "loss": 1.0885, + "step": 5280 + }, + { + "epoch": 0.51, + "grad_norm": 0.2625967483202807, + "learning_rate": 0.00017837511012342572, + "loss": 1.1534, + "step": 5281 + }, + { + "epoch": 0.51, + "grad_norm": 0.31614913625651914, + "learning_rate": 0.0001783652840518919, + "loss": 1.0403, + "step": 5282 + }, + { + "epoch": 0.51, + "grad_norm": 0.26033042065792134, + "learning_rate": 0.00017835545601921764, + "loss": 1.0581, + "step": 5283 + }, + { + "epoch": 0.51, + "grad_norm": 0.2856070132728536, + "learning_rate": 0.00017834562602564883, + "loss": 1.0107, + "step": 5284 + }, + { + "epoch": 0.51, + "grad_norm": 0.2536339900536725, + "learning_rate": 0.00017833579407143147, + "loss": 0.9525, + "step": 5285 + }, + { + "epoch": 0.51, + "grad_norm": 0.26624532509069715, + "learning_rate": 0.00017832596015681165, + "loss": 1.0309, + "step": 5286 + }, + { + "epoch": 0.51, + "grad_norm": 0.26639398423968824, + "learning_rate": 0.00017831612428203543, + "loss": 1.103, + "step": 5287 + }, + { + "epoch": 0.51, + "grad_norm": 0.2854143682160096, + "learning_rate": 0.00017830628644734898, + "loss": 1.1898, + "step": 5288 + }, + { + "epoch": 0.51, + "grad_norm": 0.2647981085190368, + "learning_rate": 0.0001782964466529985, + "loss": 1.0559, + "step": 5289 + }, + { + "epoch": 0.51, + "grad_norm": 0.2692642057535311, + "learning_rate": 0.00017828660489923025, + "loss": 1.1121, + "step": 5290 + }, + { + "epoch": 0.51, + "grad_norm": 0.2790653912792434, + "learning_rate": 0.00017827676118629054, + "loss": 1.119, + "step": 5291 + }, + { + "epoch": 0.51, + "grad_norm": 0.2731285711487075, + "learning_rate": 0.00017826691551442564, + "loss": 1.1326, + "step": 5292 + }, + { + "epoch": 0.51, + "grad_norm": 0.27848520893646905, + "learning_rate": 0.000178257067883882, + "loss": 1.0876, + "step": 5293 + }, + { + "epoch": 0.51, + "grad_norm": 0.27766691943147104, + "learning_rate": 0.00017824721829490608, + "loss": 1.0075, + "step": 5294 + }, + { + "epoch": 0.51, + "grad_norm": 0.24812476086316548, + "learning_rate": 0.00017823736674774432, + "loss": 0.9859, + "step": 5295 + }, + { + "epoch": 0.51, + "grad_norm": 0.24136690327253105, + "learning_rate": 0.00017822751324264328, + "loss": 0.9295, + "step": 5296 + }, + { + "epoch": 0.51, + "grad_norm": 0.22887560800126114, + "learning_rate": 0.00017821765777984957, + "loss": 1.1376, + "step": 5297 + }, + { + "epoch": 0.51, + "grad_norm": 0.2746004567986452, + "learning_rate": 0.0001782078003596098, + "loss": 1.0057, + "step": 5298 + }, + { + "epoch": 0.51, + "grad_norm": 0.2425791039008265, + "learning_rate": 0.0001781979409821707, + "loss": 1.1405, + "step": 5299 + }, + { + "epoch": 0.51, + "grad_norm": 0.2718956254441623, + "learning_rate": 0.00017818807964777898, + "loss": 1.0884, + "step": 5300 + }, + { + "epoch": 0.51, + "grad_norm": 0.2616875209648741, + "learning_rate": 0.0001781782163566814, + "loss": 1.0947, + "step": 5301 + }, + { + "epoch": 0.51, + "grad_norm": 0.24842391055912055, + "learning_rate": 0.00017816835110912485, + "loss": 0.9207, + "step": 5302 + }, + { + "epoch": 0.51, + "grad_norm": 0.2747180515251053, + "learning_rate": 0.00017815848390535617, + "loss": 0.9877, + "step": 5303 + }, + { + "epoch": 0.51, + "grad_norm": 0.2875077951068339, + "learning_rate": 0.00017814861474562232, + "loss": 1.0289, + "step": 5304 + }, + { + "epoch": 0.51, + "grad_norm": 0.2763116859597204, + "learning_rate": 0.00017813874363017027, + "loss": 1.0357, + "step": 5305 + }, + { + "epoch": 0.51, + "grad_norm": 0.28069395829776694, + "learning_rate": 0.00017812887055924703, + "loss": 1.1048, + "step": 5306 + }, + { + "epoch": 0.51, + "grad_norm": 0.30047513289045635, + "learning_rate": 0.00017811899553309975, + "loss": 1.0333, + "step": 5307 + }, + { + "epoch": 0.51, + "grad_norm": 0.2650398736977853, + "learning_rate": 0.00017810911855197547, + "loss": 1.0565, + "step": 5308 + }, + { + "epoch": 0.51, + "grad_norm": 0.24640154980602844, + "learning_rate": 0.0001780992396161214, + "loss": 1.1352, + "step": 5309 + }, + { + "epoch": 0.51, + "grad_norm": 0.28475244026032503, + "learning_rate": 0.00017808935872578482, + "loss": 1.0479, + "step": 5310 + }, + { + "epoch": 0.51, + "grad_norm": 0.2756255403166274, + "learning_rate": 0.00017807947588121295, + "loss": 1.0636, + "step": 5311 + }, + { + "epoch": 0.51, + "grad_norm": 0.3137323153824364, + "learning_rate": 0.00017806959108265308, + "loss": 1.2068, + "step": 5312 + }, + { + "epoch": 0.51, + "grad_norm": 0.2842470117681127, + "learning_rate": 0.00017805970433035266, + "loss": 1.229, + "step": 5313 + }, + { + "epoch": 0.51, + "grad_norm": 0.2981813888336136, + "learning_rate": 0.00017804981562455908, + "loss": 1.1881, + "step": 5314 + }, + { + "epoch": 0.51, + "grad_norm": 0.28279186295473696, + "learning_rate": 0.00017803992496551982, + "loss": 1.0685, + "step": 5315 + }, + { + "epoch": 0.51, + "grad_norm": 0.2702456382826126, + "learning_rate": 0.0001780300323534824, + "loss": 0.9998, + "step": 5316 + }, + { + "epoch": 0.51, + "grad_norm": 0.276914892943855, + "learning_rate": 0.00017802013778869436, + "loss": 1.0531, + "step": 5317 + }, + { + "epoch": 0.51, + "grad_norm": 0.27960841961549454, + "learning_rate": 0.0001780102412714033, + "loss": 1.1359, + "step": 5318 + }, + { + "epoch": 0.51, + "grad_norm": 0.25797705830410184, + "learning_rate": 0.00017800034280185699, + "loss": 1.0185, + "step": 5319 + }, + { + "epoch": 0.51, + "grad_norm": 0.2857057108150563, + "learning_rate": 0.00017799044238030307, + "loss": 1.0631, + "step": 5320 + }, + { + "epoch": 0.51, + "grad_norm": 0.25584721058778875, + "learning_rate": 0.0001779805400069893, + "loss": 0.987, + "step": 5321 + }, + { + "epoch": 0.51, + "grad_norm": 0.2325654159439427, + "learning_rate": 0.0001779706356821635, + "loss": 1.0651, + "step": 5322 + }, + { + "epoch": 0.51, + "grad_norm": 0.2970569353859898, + "learning_rate": 0.00017796072940607353, + "loss": 1.13, + "step": 5323 + }, + { + "epoch": 0.51, + "grad_norm": 0.2929022741608403, + "learning_rate": 0.00017795082117896734, + "loss": 1.0719, + "step": 5324 + }, + { + "epoch": 0.51, + "grad_norm": 0.2609795745084068, + "learning_rate": 0.00017794091100109283, + "loss": 1.0067, + "step": 5325 + }, + { + "epoch": 0.51, + "grad_norm": 0.30762587611696396, + "learning_rate": 0.0001779309988726981, + "loss": 1.147, + "step": 5326 + }, + { + "epoch": 0.51, + "grad_norm": 0.2661981298944252, + "learning_rate": 0.00017792108479403106, + "loss": 1.0734, + "step": 5327 + }, + { + "epoch": 0.51, + "grad_norm": 0.2764342632822514, + "learning_rate": 0.00017791116876533994, + "loss": 1.1101, + "step": 5328 + }, + { + "epoch": 0.51, + "grad_norm": 0.24938502770088783, + "learning_rate": 0.00017790125078687288, + "loss": 1.0666, + "step": 5329 + }, + { + "epoch": 0.51, + "grad_norm": 0.3336408049693267, + "learning_rate": 0.000177891330858878, + "loss": 1.0902, + "step": 5330 + }, + { + "epoch": 0.51, + "grad_norm": 0.3218184473435925, + "learning_rate": 0.00017788140898160367, + "loss": 1.0904, + "step": 5331 + }, + { + "epoch": 0.51, + "grad_norm": 0.2789740899659255, + "learning_rate": 0.0001778714851552981, + "loss": 1.0243, + "step": 5332 + }, + { + "epoch": 0.51, + "grad_norm": 0.266597050786924, + "learning_rate": 0.00017786155938020968, + "loss": 1.0069, + "step": 5333 + }, + { + "epoch": 0.51, + "grad_norm": 0.28223064418935384, + "learning_rate": 0.00017785163165658685, + "loss": 1.1401, + "step": 5334 + }, + { + "epoch": 0.51, + "grad_norm": 0.272562171370601, + "learning_rate": 0.00017784170198467797, + "loss": 1.0984, + "step": 5335 + }, + { + "epoch": 0.51, + "grad_norm": 0.23364083669723149, + "learning_rate": 0.00017783177036473155, + "loss": 1.0585, + "step": 5336 + }, + { + "epoch": 0.51, + "grad_norm": 0.2423799752683352, + "learning_rate": 0.0001778218367969962, + "loss": 0.9586, + "step": 5337 + }, + { + "epoch": 0.51, + "grad_norm": 0.28056034525742624, + "learning_rate": 0.00017781190128172045, + "loss": 1.0374, + "step": 5338 + }, + { + "epoch": 0.51, + "grad_norm": 0.31575138951832793, + "learning_rate": 0.000177801963819153, + "loss": 1.1025, + "step": 5339 + }, + { + "epoch": 0.51, + "grad_norm": 0.2756564243357627, + "learning_rate": 0.00017779202440954247, + "loss": 1.073, + "step": 5340 + }, + { + "epoch": 0.51, + "grad_norm": 0.2855725418706688, + "learning_rate": 0.00017778208305313766, + "loss": 0.9841, + "step": 5341 + }, + { + "epoch": 0.51, + "grad_norm": 0.2622842600538124, + "learning_rate": 0.00017777213975018734, + "loss": 0.9923, + "step": 5342 + }, + { + "epoch": 0.51, + "grad_norm": 0.27868325397356525, + "learning_rate": 0.00017776219450094032, + "loss": 1.0104, + "step": 5343 + }, + { + "epoch": 0.51, + "grad_norm": 0.2910306279110282, + "learning_rate": 0.00017775224730564554, + "loss": 0.9543, + "step": 5344 + }, + { + "epoch": 0.51, + "grad_norm": 0.30979629764973976, + "learning_rate": 0.0001777422981645519, + "loss": 1.1145, + "step": 5345 + }, + { + "epoch": 0.51, + "grad_norm": 0.30299227741795537, + "learning_rate": 0.00017773234707790838, + "loss": 1.0723, + "step": 5346 + }, + { + "epoch": 0.51, + "grad_norm": 0.2925383103599133, + "learning_rate": 0.00017772239404596402, + "loss": 1.1494, + "step": 5347 + }, + { + "epoch": 0.51, + "grad_norm": 0.25939612879354135, + "learning_rate": 0.00017771243906896793, + "loss": 1.0362, + "step": 5348 + }, + { + "epoch": 0.51, + "grad_norm": 0.292894121185352, + "learning_rate": 0.00017770248214716918, + "loss": 1.0974, + "step": 5349 + }, + { + "epoch": 0.51, + "grad_norm": 0.2934306234314348, + "learning_rate": 0.000177692523280817, + "loss": 1.0471, + "step": 5350 + }, + { + "epoch": 0.51, + "grad_norm": 0.2620038040723942, + "learning_rate": 0.0001776825624701606, + "loss": 1.05, + "step": 5351 + }, + { + "epoch": 0.51, + "grad_norm": 0.258906223183488, + "learning_rate": 0.00017767259971544923, + "loss": 1.0016, + "step": 5352 + }, + { + "epoch": 0.51, + "grad_norm": 0.2693727580254897, + "learning_rate": 0.00017766263501693222, + "loss": 1.1004, + "step": 5353 + }, + { + "epoch": 0.51, + "grad_norm": 0.2656780441277641, + "learning_rate": 0.000177652668374859, + "loss": 1.1314, + "step": 5354 + }, + { + "epoch": 0.51, + "grad_norm": 0.26233985901713786, + "learning_rate": 0.00017764269978947893, + "loss": 1.0327, + "step": 5355 + }, + { + "epoch": 0.51, + "grad_norm": 0.2816892908241542, + "learning_rate": 0.00017763272926104152, + "loss": 1.1384, + "step": 5356 + }, + { + "epoch": 0.51, + "grad_norm": 0.29263486374255826, + "learning_rate": 0.00017762275678979625, + "loss": 1.1649, + "step": 5357 + }, + { + "epoch": 0.51, + "grad_norm": 0.25054193247048423, + "learning_rate": 0.00017761278237599272, + "loss": 0.9422, + "step": 5358 + }, + { + "epoch": 0.51, + "grad_norm": 0.2728514545749119, + "learning_rate": 0.00017760280601988052, + "loss": 1.1264, + "step": 5359 + }, + { + "epoch": 0.51, + "grad_norm": 0.2787897298192036, + "learning_rate": 0.00017759282772170933, + "loss": 1.069, + "step": 5360 + }, + { + "epoch": 0.51, + "grad_norm": 0.28051149945852427, + "learning_rate": 0.00017758284748172889, + "loss": 1.0434, + "step": 5361 + }, + { + "epoch": 0.51, + "grad_norm": 0.2434867215928191, + "learning_rate": 0.0001775728653001889, + "loss": 1.094, + "step": 5362 + }, + { + "epoch": 0.51, + "grad_norm": 0.298894712000934, + "learning_rate": 0.00017756288117733922, + "loss": 1.0858, + "step": 5363 + }, + { + "epoch": 0.51, + "grad_norm": 0.2989152889611858, + "learning_rate": 0.00017755289511342968, + "loss": 1.0862, + "step": 5364 + }, + { + "epoch": 0.51, + "grad_norm": 0.26061563505956564, + "learning_rate": 0.0001775429071087102, + "loss": 1.1229, + "step": 5365 + }, + { + "epoch": 0.51, + "grad_norm": 0.3078317769181313, + "learning_rate": 0.00017753291716343075, + "loss": 1.0443, + "step": 5366 + }, + { + "epoch": 0.51, + "grad_norm": 0.271922412205937, + "learning_rate": 0.00017752292527784132, + "loss": 1.0569, + "step": 5367 + }, + { + "epoch": 0.51, + "grad_norm": 0.2981868268132513, + "learning_rate": 0.00017751293145219194, + "loss": 1.1245, + "step": 5368 + }, + { + "epoch": 0.51, + "grad_norm": 0.28791587214094305, + "learning_rate": 0.00017750293568673275, + "loss": 1.1032, + "step": 5369 + }, + { + "epoch": 0.51, + "grad_norm": 0.2770400531641322, + "learning_rate": 0.00017749293798171388, + "loss": 1.1548, + "step": 5370 + }, + { + "epoch": 0.51, + "grad_norm": 0.31035234976544707, + "learning_rate": 0.00017748293833738554, + "loss": 1.0248, + "step": 5371 + }, + { + "epoch": 0.51, + "grad_norm": 0.32563997547553536, + "learning_rate": 0.000177472936753998, + "loss": 1.0142, + "step": 5372 + }, + { + "epoch": 0.51, + "grad_norm": 0.2965449735119415, + "learning_rate": 0.0001774629332318015, + "loss": 1.074, + "step": 5373 + }, + { + "epoch": 0.51, + "grad_norm": 0.24827461718041824, + "learning_rate": 0.00017745292777104638, + "loss": 1.0601, + "step": 5374 + }, + { + "epoch": 0.51, + "grad_norm": 0.28164100400780073, + "learning_rate": 0.00017744292037198312, + "loss": 1.1411, + "step": 5375 + }, + { + "epoch": 0.51, + "grad_norm": 0.255726016665569, + "learning_rate": 0.00017743291103486207, + "loss": 1.0318, + "step": 5376 + }, + { + "epoch": 0.51, + "grad_norm": 0.25998427104390703, + "learning_rate": 0.0001774228997599338, + "loss": 1.0986, + "step": 5377 + }, + { + "epoch": 0.51, + "grad_norm": 0.28575851348545084, + "learning_rate": 0.00017741288654744874, + "loss": 1.0325, + "step": 5378 + }, + { + "epoch": 0.51, + "grad_norm": 0.31525644009615983, + "learning_rate": 0.0001774028713976576, + "loss": 1.0881, + "step": 5379 + }, + { + "epoch": 0.51, + "grad_norm": 0.27772393103651905, + "learning_rate": 0.00017739285431081093, + "loss": 1.0819, + "step": 5380 + }, + { + "epoch": 0.51, + "grad_norm": 0.28458960847371934, + "learning_rate": 0.00017738283528715944, + "loss": 1.053, + "step": 5381 + }, + { + "epoch": 0.51, + "grad_norm": 0.2579176716986825, + "learning_rate": 0.00017737281432695387, + "loss": 1.0221, + "step": 5382 + }, + { + "epoch": 0.51, + "grad_norm": 0.2682071919326544, + "learning_rate": 0.000177362791430445, + "loss": 1.0632, + "step": 5383 + }, + { + "epoch": 0.52, + "grad_norm": 0.2817647505413598, + "learning_rate": 0.00017735276659788365, + "loss": 1.0892, + "step": 5384 + }, + { + "epoch": 0.52, + "grad_norm": 0.27370042339496314, + "learning_rate": 0.0001773427398295207, + "loss": 1.1051, + "step": 5385 + }, + { + "epoch": 0.52, + "grad_norm": 0.29120157753379705, + "learning_rate": 0.00017733271112560707, + "loss": 1.057, + "step": 5386 + }, + { + "epoch": 0.52, + "grad_norm": 0.274235384389006, + "learning_rate": 0.00017732268048639376, + "loss": 1.1645, + "step": 5387 + }, + { + "epoch": 0.52, + "grad_norm": 0.2554060673031894, + "learning_rate": 0.00017731264791213177, + "loss": 1.1206, + "step": 5388 + }, + { + "epoch": 0.52, + "grad_norm": 0.29038444135208097, + "learning_rate": 0.00017730261340307216, + "loss": 1.0847, + "step": 5389 + }, + { + "epoch": 0.52, + "grad_norm": 0.2927716873164211, + "learning_rate": 0.00017729257695946608, + "loss": 1.1842, + "step": 5390 + }, + { + "epoch": 0.52, + "grad_norm": 0.28575627634335404, + "learning_rate": 0.00017728253858156467, + "loss": 0.9855, + "step": 5391 + }, + { + "epoch": 0.52, + "grad_norm": 0.2741064932408415, + "learning_rate": 0.0001772724982696192, + "loss": 1.0681, + "step": 5392 + }, + { + "epoch": 0.52, + "grad_norm": 0.3221555396964685, + "learning_rate": 0.00017726245602388087, + "loss": 1.1304, + "step": 5393 + }, + { + "epoch": 0.52, + "grad_norm": 0.29940916808744256, + "learning_rate": 0.00017725241184460101, + "loss": 1.1972, + "step": 5394 + }, + { + "epoch": 0.52, + "grad_norm": 0.2555325051925566, + "learning_rate": 0.000177242365732031, + "loss": 1.0663, + "step": 5395 + }, + { + "epoch": 0.52, + "grad_norm": 0.29478956996412253, + "learning_rate": 0.00017723231768642227, + "loss": 1.1792, + "step": 5396 + }, + { + "epoch": 0.52, + "grad_norm": 0.2691856309392673, + "learning_rate": 0.0001772222677080262, + "loss": 1.0935, + "step": 5397 + }, + { + "epoch": 0.52, + "grad_norm": 0.26554481426737436, + "learning_rate": 0.00017721221579709438, + "loss": 1.1013, + "step": 5398 + }, + { + "epoch": 0.52, + "grad_norm": 0.2974199816061973, + "learning_rate": 0.00017720216195387834, + "loss": 1.1026, + "step": 5399 + }, + { + "epoch": 0.52, + "grad_norm": 0.25988903587070844, + "learning_rate": 0.00017719210617862967, + "loss": 0.9989, + "step": 5400 + }, + { + "epoch": 0.52, + "grad_norm": 0.2644737230686996, + "learning_rate": 0.00017718204847160004, + "loss": 1.1928, + "step": 5401 + }, + { + "epoch": 0.52, + "grad_norm": 0.321621116578329, + "learning_rate": 0.0001771719888330411, + "loss": 1.0699, + "step": 5402 + }, + { + "epoch": 0.52, + "grad_norm": 0.3117270874799254, + "learning_rate": 0.00017716192726320468, + "loss": 1.0265, + "step": 5403 + }, + { + "epoch": 0.52, + "grad_norm": 0.26397074260532366, + "learning_rate": 0.0001771518637623425, + "loss": 1.0595, + "step": 5404 + }, + { + "epoch": 0.52, + "grad_norm": 0.26070999892086566, + "learning_rate": 0.00017714179833070646, + "loss": 0.9732, + "step": 5405 + }, + { + "epoch": 0.52, + "grad_norm": 0.29516458483721847, + "learning_rate": 0.00017713173096854846, + "loss": 1.244, + "step": 5406 + }, + { + "epoch": 0.52, + "grad_norm": 0.28006511868535894, + "learning_rate": 0.0001771216616761204, + "loss": 1.0229, + "step": 5407 + }, + { + "epoch": 0.52, + "grad_norm": 0.2785160995661278, + "learning_rate": 0.0001771115904536743, + "loss": 1.0974, + "step": 5408 + }, + { + "epoch": 0.52, + "grad_norm": 0.3039976285784444, + "learning_rate": 0.00017710151730146215, + "loss": 1.1096, + "step": 5409 + }, + { + "epoch": 0.52, + "grad_norm": 0.2725502855772708, + "learning_rate": 0.0001770914422197361, + "loss": 1.0443, + "step": 5410 + }, + { + "epoch": 0.52, + "grad_norm": 0.2968618423089058, + "learning_rate": 0.00017708136520874822, + "loss": 1.0383, + "step": 5411 + }, + { + "epoch": 0.52, + "grad_norm": 0.2787402009547091, + "learning_rate": 0.00017707128626875078, + "loss": 1.1659, + "step": 5412 + }, + { + "epoch": 0.52, + "grad_norm": 0.2873480043549193, + "learning_rate": 0.00017706120539999595, + "loss": 0.9287, + "step": 5413 + }, + { + "epoch": 0.52, + "grad_norm": 0.27057828580034904, + "learning_rate": 0.00017705112260273602, + "loss": 0.9655, + "step": 5414 + }, + { + "epoch": 0.52, + "grad_norm": 0.2705987981572107, + "learning_rate": 0.00017704103787722332, + "loss": 1.1033, + "step": 5415 + }, + { + "epoch": 0.52, + "grad_norm": 0.2883167160040527, + "learning_rate": 0.00017703095122371024, + "loss": 1.1945, + "step": 5416 + }, + { + "epoch": 0.52, + "grad_norm": 0.28040572260172153, + "learning_rate": 0.00017702086264244918, + "loss": 1.1136, + "step": 5417 + }, + { + "epoch": 0.52, + "grad_norm": 0.23953916000277492, + "learning_rate": 0.0001770107721336926, + "loss": 1.127, + "step": 5418 + }, + { + "epoch": 0.52, + "grad_norm": 0.2666697798476311, + "learning_rate": 0.0001770006796976931, + "loss": 1.1178, + "step": 5419 + }, + { + "epoch": 0.52, + "grad_norm": 0.26595583947693385, + "learning_rate": 0.00017699058533470318, + "loss": 1.1537, + "step": 5420 + }, + { + "epoch": 0.52, + "grad_norm": 0.3018520026520265, + "learning_rate": 0.00017698048904497547, + "loss": 1.0672, + "step": 5421 + }, + { + "epoch": 0.52, + "grad_norm": 0.2585809987651865, + "learning_rate": 0.00017697039082876264, + "loss": 0.9036, + "step": 5422 + }, + { + "epoch": 0.52, + "grad_norm": 0.30520095394033386, + "learning_rate": 0.0001769602906863174, + "loss": 1.1266, + "step": 5423 + }, + { + "epoch": 0.52, + "grad_norm": 0.26703193999809494, + "learning_rate": 0.00017695018861789254, + "loss": 1.0113, + "step": 5424 + }, + { + "epoch": 0.52, + "grad_norm": 0.2665069045940571, + "learning_rate": 0.00017694008462374082, + "loss": 1.1435, + "step": 5425 + }, + { + "epoch": 0.52, + "grad_norm": 0.3066957091926963, + "learning_rate": 0.00017692997870411513, + "loss": 1.1477, + "step": 5426 + }, + { + "epoch": 0.52, + "grad_norm": 0.3060426948221189, + "learning_rate": 0.0001769198708592684, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 0.52, + "grad_norm": 0.31035289985606923, + "learning_rate": 0.00017690976108945353, + "loss": 1.1531, + "step": 5428 + }, + { + "epoch": 0.52, + "grad_norm": 0.27105532382356223, + "learning_rate": 0.00017689964939492358, + "loss": 1.1738, + "step": 5429 + }, + { + "epoch": 0.52, + "grad_norm": 0.30441191653566924, + "learning_rate": 0.00017688953577593158, + "loss": 1.272, + "step": 5430 + }, + { + "epoch": 0.52, + "grad_norm": 0.30963973002090833, + "learning_rate": 0.0001768794202327306, + "loss": 1.2062, + "step": 5431 + }, + { + "epoch": 0.52, + "grad_norm": 0.27923640796457394, + "learning_rate": 0.0001768693027655738, + "loss": 1.1393, + "step": 5432 + }, + { + "epoch": 0.52, + "grad_norm": 0.2714745780737401, + "learning_rate": 0.00017685918337471442, + "loss": 1.0229, + "step": 5433 + }, + { + "epoch": 0.52, + "grad_norm": 0.2746763330279501, + "learning_rate": 0.00017684906206040567, + "loss": 1.025, + "step": 5434 + }, + { + "epoch": 0.52, + "grad_norm": 0.25420475121865566, + "learning_rate": 0.0001768389388229008, + "loss": 1.0173, + "step": 5435 + }, + { + "epoch": 0.52, + "grad_norm": 0.23858794911194617, + "learning_rate": 0.00017682881366245322, + "loss": 1.0893, + "step": 5436 + }, + { + "epoch": 0.52, + "grad_norm": 0.30224482212537046, + "learning_rate": 0.0001768186865793163, + "loss": 1.1222, + "step": 5437 + }, + { + "epoch": 0.52, + "grad_norm": 0.23278958691465643, + "learning_rate": 0.00017680855757374345, + "loss": 1.0458, + "step": 5438 + }, + { + "epoch": 0.52, + "grad_norm": 0.26428873295787486, + "learning_rate": 0.0001767984266459882, + "loss": 1.0792, + "step": 5439 + }, + { + "epoch": 0.52, + "grad_norm": 0.2594486178953975, + "learning_rate": 0.00017678829379630406, + "loss": 1.0737, + "step": 5440 + }, + { + "epoch": 0.52, + "grad_norm": 0.2743596761285101, + "learning_rate": 0.0001767781590249446, + "loss": 1.0216, + "step": 5441 + }, + { + "epoch": 0.52, + "grad_norm": 0.3014933928943263, + "learning_rate": 0.00017676802233216346, + "loss": 1.0686, + "step": 5442 + }, + { + "epoch": 0.52, + "grad_norm": 0.30975481238566765, + "learning_rate": 0.00017675788371821432, + "loss": 1.1826, + "step": 5443 + }, + { + "epoch": 0.52, + "grad_norm": 0.24533869531288546, + "learning_rate": 0.00017674774318335085, + "loss": 1.162, + "step": 5444 + }, + { + "epoch": 0.52, + "grad_norm": 0.27791112618813957, + "learning_rate": 0.0001767376007278269, + "loss": 0.9609, + "step": 5445 + }, + { + "epoch": 0.52, + "grad_norm": 0.31278080093016536, + "learning_rate": 0.00017672745635189633, + "loss": 1.1661, + "step": 5446 + }, + { + "epoch": 0.52, + "grad_norm": 0.29874080218471216, + "learning_rate": 0.00017671731005581287, + "loss": 1.1068, + "step": 5447 + }, + { + "epoch": 0.52, + "grad_norm": 0.2569503493201101, + "learning_rate": 0.0001767071618398305, + "loss": 1.0538, + "step": 5448 + }, + { + "epoch": 0.52, + "grad_norm": 0.29826323414906175, + "learning_rate": 0.00017669701170420322, + "loss": 1.1264, + "step": 5449 + }, + { + "epoch": 0.52, + "grad_norm": 0.2812826097473197, + "learning_rate": 0.00017668685964918504, + "loss": 1.0982, + "step": 5450 + }, + { + "epoch": 0.52, + "grad_norm": 0.2973251253473092, + "learning_rate": 0.00017667670567502998, + "loss": 1.0728, + "step": 5451 + }, + { + "epoch": 0.52, + "grad_norm": 0.24818251204984365, + "learning_rate": 0.0001766665497819922, + "loss": 1.0567, + "step": 5452 + }, + { + "epoch": 0.52, + "grad_norm": 0.2884278666490467, + "learning_rate": 0.00017665639197032582, + "loss": 1.0685, + "step": 5453 + }, + { + "epoch": 0.52, + "grad_norm": 0.262999153831761, + "learning_rate": 0.00017664623224028503, + "loss": 0.9473, + "step": 5454 + }, + { + "epoch": 0.52, + "grad_norm": 0.3152269030127414, + "learning_rate": 0.0001766360705921241, + "loss": 1.169, + "step": 5455 + }, + { + "epoch": 0.52, + "grad_norm": 0.25251227559062966, + "learning_rate": 0.00017662590702609737, + "loss": 1.0352, + "step": 5456 + }, + { + "epoch": 0.52, + "grad_norm": 0.32106633074490737, + "learning_rate": 0.00017661574154245914, + "loss": 1.0894, + "step": 5457 + }, + { + "epoch": 0.52, + "grad_norm": 0.2693010524825926, + "learning_rate": 0.00017660557414146384, + "loss": 1.0703, + "step": 5458 + }, + { + "epoch": 0.52, + "grad_norm": 0.33501583797096984, + "learning_rate": 0.0001765954048233659, + "loss": 1.2, + "step": 5459 + }, + { + "epoch": 0.52, + "grad_norm": 0.2766667108631253, + "learning_rate": 0.0001765852335884198, + "loss": 1.1096, + "step": 5460 + }, + { + "epoch": 0.52, + "grad_norm": 0.28642621560137427, + "learning_rate": 0.0001765750604368801, + "loss": 1.0589, + "step": 5461 + }, + { + "epoch": 0.52, + "grad_norm": 0.29446092888521896, + "learning_rate": 0.0001765648853690014, + "loss": 1.0758, + "step": 5462 + }, + { + "epoch": 0.52, + "grad_norm": 0.26574617011082774, + "learning_rate": 0.00017655470838503834, + "loss": 1.0517, + "step": 5463 + }, + { + "epoch": 0.52, + "grad_norm": 0.29380273713342564, + "learning_rate": 0.00017654452948524555, + "loss": 1.1355, + "step": 5464 + }, + { + "epoch": 0.52, + "grad_norm": 0.26610600805445106, + "learning_rate": 0.00017653434866987783, + "loss": 1.156, + "step": 5465 + }, + { + "epoch": 0.52, + "grad_norm": 0.23226425198871892, + "learning_rate": 0.00017652416593918994, + "loss": 1.1446, + "step": 5466 + }, + { + "epoch": 0.52, + "grad_norm": 0.3136944620672886, + "learning_rate": 0.00017651398129343667, + "loss": 1.1359, + "step": 5467 + }, + { + "epoch": 0.52, + "grad_norm": 0.29970255604775625, + "learning_rate": 0.00017650379473287296, + "loss": 1.0718, + "step": 5468 + }, + { + "epoch": 0.52, + "grad_norm": 0.2782035372010567, + "learning_rate": 0.0001764936062577537, + "loss": 1.0908, + "step": 5469 + }, + { + "epoch": 0.52, + "grad_norm": 0.2616477257641661, + "learning_rate": 0.00017648341586833387, + "loss": 0.9107, + "step": 5470 + }, + { + "epoch": 0.52, + "grad_norm": 0.2663523876553537, + "learning_rate": 0.00017647322356486848, + "loss": 0.9981, + "step": 5471 + }, + { + "epoch": 0.52, + "grad_norm": 0.26092658264988006, + "learning_rate": 0.0001764630293476126, + "loss": 1.1115, + "step": 5472 + }, + { + "epoch": 0.52, + "grad_norm": 0.27133782980711774, + "learning_rate": 0.0001764528332168214, + "loss": 1.1506, + "step": 5473 + }, + { + "epoch": 0.52, + "grad_norm": 0.29494184409148894, + "learning_rate": 0.00017644263517274997, + "loss": 1.0381, + "step": 5474 + }, + { + "epoch": 0.52, + "grad_norm": 0.293344464237028, + "learning_rate": 0.00017643243521565355, + "loss": 1.0955, + "step": 5475 + }, + { + "epoch": 0.52, + "grad_norm": 0.3113070820581294, + "learning_rate": 0.0001764222333457874, + "loss": 0.9733, + "step": 5476 + }, + { + "epoch": 0.52, + "grad_norm": 0.2686929294263848, + "learning_rate": 0.00017641202956340685, + "loss": 1.141, + "step": 5477 + }, + { + "epoch": 0.52, + "grad_norm": 0.2963778388036239, + "learning_rate": 0.0001764018238687672, + "loss": 1.049, + "step": 5478 + }, + { + "epoch": 0.52, + "grad_norm": 0.27712005782523735, + "learning_rate": 0.00017639161626212393, + "loss": 0.9666, + "step": 5479 + }, + { + "epoch": 0.52, + "grad_norm": 0.25074497020772385, + "learning_rate": 0.00017638140674373245, + "loss": 1.091, + "step": 5480 + }, + { + "epoch": 0.52, + "grad_norm": 0.27481911625527383, + "learning_rate": 0.00017637119531384822, + "loss": 0.9804, + "step": 5481 + }, + { + "epoch": 0.52, + "grad_norm": 0.30326990661840897, + "learning_rate": 0.00017636098197272687, + "loss": 1.0196, + "step": 5482 + }, + { + "epoch": 0.52, + "grad_norm": 0.2536248159519646, + "learning_rate": 0.00017635076672062395, + "loss": 1.0655, + "step": 5483 + }, + { + "epoch": 0.52, + "grad_norm": 0.24871071887818122, + "learning_rate": 0.0001763405495577951, + "loss": 1.0337, + "step": 5484 + }, + { + "epoch": 0.52, + "grad_norm": 0.27076559378833587, + "learning_rate": 0.00017633033048449607, + "loss": 1.0868, + "step": 5485 + }, + { + "epoch": 0.52, + "grad_norm": 0.26447743099546034, + "learning_rate": 0.00017632010950098247, + "loss": 1.1067, + "step": 5486 + }, + { + "epoch": 0.52, + "grad_norm": 0.2354573802397408, + "learning_rate": 0.00017630988660751018, + "loss": 0.9972, + "step": 5487 + }, + { + "epoch": 0.53, + "grad_norm": 0.31657477736419243, + "learning_rate": 0.00017629966180433503, + "loss": 1.1436, + "step": 5488 + }, + { + "epoch": 0.53, + "grad_norm": 0.31018143648114505, + "learning_rate": 0.0001762894350917129, + "loss": 1.1186, + "step": 5489 + }, + { + "epoch": 0.53, + "grad_norm": 0.2928001553899392, + "learning_rate": 0.00017627920646989971, + "loss": 1.1422, + "step": 5490 + }, + { + "epoch": 0.53, + "grad_norm": 0.30032975451874694, + "learning_rate": 0.00017626897593915142, + "loss": 1.0623, + "step": 5491 + }, + { + "epoch": 0.53, + "grad_norm": 0.3019231833502392, + "learning_rate": 0.0001762587434997241, + "loss": 1.0632, + "step": 5492 + }, + { + "epoch": 0.53, + "grad_norm": 0.2647986946052283, + "learning_rate": 0.0001762485091518738, + "loss": 1.0093, + "step": 5493 + }, + { + "epoch": 0.53, + "grad_norm": 0.2490169986270707, + "learning_rate": 0.0001762382728958566, + "loss": 1.1866, + "step": 5494 + }, + { + "epoch": 0.53, + "grad_norm": 0.27623003094440113, + "learning_rate": 0.00017622803473192874, + "loss": 1.0377, + "step": 5495 + }, + { + "epoch": 0.53, + "grad_norm": 0.29379957830634423, + "learning_rate": 0.0001762177946603464, + "loss": 1.1129, + "step": 5496 + }, + { + "epoch": 0.53, + "grad_norm": 0.26006713944566145, + "learning_rate": 0.00017620755268136584, + "loss": 1.0786, + "step": 5497 + }, + { + "epoch": 0.53, + "grad_norm": 0.2984517115544308, + "learning_rate": 0.00017619730879524337, + "loss": 1.145, + "step": 5498 + }, + { + "epoch": 0.53, + "grad_norm": 0.2634404683614875, + "learning_rate": 0.00017618706300223536, + "loss": 1.0225, + "step": 5499 + }, + { + "epoch": 0.53, + "grad_norm": 0.2770749610740461, + "learning_rate": 0.00017617681530259822, + "loss": 1.0321, + "step": 5500 + }, + { + "epoch": 0.53, + "grad_norm": 0.29909952213865876, + "learning_rate": 0.00017616656569658843, + "loss": 0.9445, + "step": 5501 + }, + { + "epoch": 0.53, + "grad_norm": 0.28919886120746485, + "learning_rate": 0.00017615631418446242, + "loss": 1.0648, + "step": 5502 + }, + { + "epoch": 0.53, + "grad_norm": 0.28680714124802115, + "learning_rate": 0.00017614606076647683, + "loss": 1.1729, + "step": 5503 + }, + { + "epoch": 0.53, + "grad_norm": 0.3018876386617487, + "learning_rate": 0.00017613580544288817, + "loss": 0.9817, + "step": 5504 + }, + { + "epoch": 0.53, + "grad_norm": 0.28707084465144506, + "learning_rate": 0.00017612554821395314, + "loss": 1.1636, + "step": 5505 + }, + { + "epoch": 0.53, + "grad_norm": 0.31719224150490105, + "learning_rate": 0.00017611528907992844, + "loss": 1.0808, + "step": 5506 + }, + { + "epoch": 0.53, + "grad_norm": 0.2625161917732417, + "learning_rate": 0.00017610502804107082, + "loss": 1.1023, + "step": 5507 + }, + { + "epoch": 0.53, + "grad_norm": 0.2872356902170121, + "learning_rate": 0.00017609476509763698, + "loss": 1.2038, + "step": 5508 + }, + { + "epoch": 0.53, + "grad_norm": 0.25936712334845474, + "learning_rate": 0.00017608450024988382, + "loss": 0.9567, + "step": 5509 + }, + { + "epoch": 0.53, + "grad_norm": 0.2791045831669703, + "learning_rate": 0.0001760742334980683, + "loss": 1.1003, + "step": 5510 + }, + { + "epoch": 0.53, + "grad_norm": 0.2647336003050076, + "learning_rate": 0.00017606396484244721, + "loss": 1.1102, + "step": 5511 + }, + { + "epoch": 0.53, + "grad_norm": 0.27574681080006713, + "learning_rate": 0.00017605369428327761, + "loss": 1.048, + "step": 5512 + }, + { + "epoch": 0.53, + "grad_norm": 0.27994961262509854, + "learning_rate": 0.00017604342182081653, + "loss": 0.9866, + "step": 5513 + }, + { + "epoch": 0.53, + "grad_norm": 0.261516937266442, + "learning_rate": 0.000176033147455321, + "loss": 1.0791, + "step": 5514 + }, + { + "epoch": 0.53, + "grad_norm": 0.2995061891654347, + "learning_rate": 0.0001760228711870482, + "loss": 1.1309, + "step": 5515 + }, + { + "epoch": 0.53, + "grad_norm": 0.2820497610571214, + "learning_rate": 0.00017601259301625524, + "loss": 1.0, + "step": 5516 + }, + { + "epoch": 0.53, + "grad_norm": 0.24255160815288854, + "learning_rate": 0.0001760023129431994, + "loss": 1.0015, + "step": 5517 + }, + { + "epoch": 0.53, + "grad_norm": 0.25094239441876454, + "learning_rate": 0.0001759920309681379, + "loss": 1.0133, + "step": 5518 + }, + { + "epoch": 0.53, + "grad_norm": 0.283208208913356, + "learning_rate": 0.00017598174709132803, + "loss": 1.1402, + "step": 5519 + }, + { + "epoch": 0.53, + "grad_norm": 0.2826249410012862, + "learning_rate": 0.00017597146131302722, + "loss": 1.0725, + "step": 5520 + }, + { + "epoch": 0.53, + "grad_norm": 0.28680753301102535, + "learning_rate": 0.00017596117363349282, + "loss": 1.1058, + "step": 5521 + }, + { + "epoch": 0.53, + "grad_norm": 0.2915298750178331, + "learning_rate": 0.00017595088405298234, + "loss": 1.0234, + "step": 5522 + }, + { + "epoch": 0.53, + "grad_norm": 0.29543680871161176, + "learning_rate": 0.00017594059257175325, + "loss": 1.1357, + "step": 5523 + }, + { + "epoch": 0.53, + "grad_norm": 0.2734660068763002, + "learning_rate": 0.0001759302991900631, + "loss": 0.9803, + "step": 5524 + }, + { + "epoch": 0.53, + "grad_norm": 0.29698197284064065, + "learning_rate": 0.0001759200039081695, + "loss": 1.0425, + "step": 5525 + }, + { + "epoch": 0.53, + "grad_norm": 0.28542474538774965, + "learning_rate": 0.00017590970672633007, + "loss": 1.0379, + "step": 5526 + }, + { + "epoch": 0.53, + "grad_norm": 0.2891844662811193, + "learning_rate": 0.00017589940764480252, + "loss": 1.0073, + "step": 5527 + }, + { + "epoch": 0.53, + "grad_norm": 0.28262400994939874, + "learning_rate": 0.00017588910666384462, + "loss": 1.0782, + "step": 5528 + }, + { + "epoch": 0.53, + "grad_norm": 0.2780628600213389, + "learning_rate": 0.00017587880378371412, + "loss": 1.0688, + "step": 5529 + }, + { + "epoch": 0.53, + "grad_norm": 0.2884519639543894, + "learning_rate": 0.00017586849900466883, + "loss": 1.0724, + "step": 5530 + }, + { + "epoch": 0.53, + "grad_norm": 0.2815574093742316, + "learning_rate": 0.00017585819232696675, + "loss": 1.0518, + "step": 5531 + }, + { + "epoch": 0.53, + "grad_norm": 0.28038754616537154, + "learning_rate": 0.00017584788375086565, + "loss": 1.0102, + "step": 5532 + }, + { + "epoch": 0.53, + "grad_norm": 0.2525746581044452, + "learning_rate": 0.00017583757327662363, + "loss": 1.0554, + "step": 5533 + }, + { + "epoch": 0.53, + "grad_norm": 0.2922109980484516, + "learning_rate": 0.00017582726090449867, + "loss": 1.1027, + "step": 5534 + }, + { + "epoch": 0.53, + "grad_norm": 0.2751484254089623, + "learning_rate": 0.00017581694663474886, + "loss": 1.0159, + "step": 5535 + }, + { + "epoch": 0.53, + "grad_norm": 0.26863647079539626, + "learning_rate": 0.00017580663046763231, + "loss": 1.0621, + "step": 5536 + }, + { + "epoch": 0.53, + "grad_norm": 0.2985191792690513, + "learning_rate": 0.00017579631240340716, + "loss": 1.1758, + "step": 5537 + }, + { + "epoch": 0.53, + "grad_norm": 0.27614555889342746, + "learning_rate": 0.00017578599244233168, + "loss": 1.061, + "step": 5538 + }, + { + "epoch": 0.53, + "grad_norm": 0.25905510867782044, + "learning_rate": 0.00017577567058466414, + "loss": 1.1777, + "step": 5539 + }, + { + "epoch": 0.53, + "grad_norm": 0.3278506380359264, + "learning_rate": 0.00017576534683066278, + "loss": 1.1552, + "step": 5540 + }, + { + "epoch": 0.53, + "grad_norm": 0.273031516195577, + "learning_rate": 0.000175755021180586, + "loss": 1.1249, + "step": 5541 + }, + { + "epoch": 0.53, + "grad_norm": 0.29918764475128873, + "learning_rate": 0.00017574469363469222, + "loss": 1.0937, + "step": 5542 + }, + { + "epoch": 0.53, + "grad_norm": 0.2753312938615492, + "learning_rate": 0.00017573436419323986, + "loss": 1.1123, + "step": 5543 + }, + { + "epoch": 0.53, + "grad_norm": 0.3076985822417243, + "learning_rate": 0.00017572403285648743, + "loss": 1.1392, + "step": 5544 + }, + { + "epoch": 0.53, + "grad_norm": 0.2858977995760822, + "learning_rate": 0.00017571369962469352, + "loss": 1.0683, + "step": 5545 + }, + { + "epoch": 0.53, + "grad_norm": 0.2826635982664086, + "learning_rate": 0.00017570336449811667, + "loss": 1.01, + "step": 5546 + }, + { + "epoch": 0.53, + "grad_norm": 0.25331688616473097, + "learning_rate": 0.00017569302747701558, + "loss": 1.0202, + "step": 5547 + }, + { + "epoch": 0.53, + "grad_norm": 0.25299991837835445, + "learning_rate": 0.00017568268856164886, + "loss": 0.9968, + "step": 5548 + }, + { + "epoch": 0.53, + "grad_norm": 0.26907253785105184, + "learning_rate": 0.00017567234775227533, + "loss": 1.0216, + "step": 5549 + }, + { + "epoch": 0.53, + "grad_norm": 0.2638154646751891, + "learning_rate": 0.0001756620050491537, + "loss": 1.169, + "step": 5550 + }, + { + "epoch": 0.53, + "grad_norm": 0.2674914565002557, + "learning_rate": 0.0001756516604525429, + "loss": 1.0112, + "step": 5551 + }, + { + "epoch": 0.53, + "grad_norm": 0.27204031327263634, + "learning_rate": 0.00017564131396270168, + "loss": 1.0392, + "step": 5552 + }, + { + "epoch": 0.53, + "grad_norm": 0.2808788802646591, + "learning_rate": 0.0001756309655798891, + "loss": 1.1142, + "step": 5553 + }, + { + "epoch": 0.53, + "grad_norm": 0.2737824930333972, + "learning_rate": 0.00017562061530436405, + "loss": 1.1227, + "step": 5554 + }, + { + "epoch": 0.53, + "grad_norm": 0.28525600932665507, + "learning_rate": 0.00017561026313638557, + "loss": 1.0276, + "step": 5555 + }, + { + "epoch": 0.53, + "grad_norm": 0.26176420613090035, + "learning_rate": 0.00017559990907621274, + "loss": 1.1253, + "step": 5556 + }, + { + "epoch": 0.53, + "grad_norm": 0.25918345855645814, + "learning_rate": 0.00017558955312410468, + "loss": 1.1035, + "step": 5557 + }, + { + "epoch": 0.53, + "grad_norm": 0.29529919222022755, + "learning_rate": 0.00017557919528032054, + "loss": 1.1018, + "step": 5558 + }, + { + "epoch": 0.53, + "grad_norm": 0.28939151113442757, + "learning_rate": 0.00017556883554511953, + "loss": 1.0248, + "step": 5559 + }, + { + "epoch": 0.53, + "grad_norm": 0.3092935541500706, + "learning_rate": 0.00017555847391876093, + "loss": 1.133, + "step": 5560 + }, + { + "epoch": 0.53, + "grad_norm": 0.268156083959369, + "learning_rate": 0.00017554811040150403, + "loss": 1.1213, + "step": 5561 + }, + { + "epoch": 0.53, + "grad_norm": 0.2621845882914305, + "learning_rate": 0.0001755377449936082, + "loss": 1.0878, + "step": 5562 + }, + { + "epoch": 0.53, + "grad_norm": 0.2661667850236094, + "learning_rate": 0.0001755273776953328, + "loss": 1.0683, + "step": 5563 + }, + { + "epoch": 0.53, + "grad_norm": 0.25877418531921775, + "learning_rate": 0.00017551700850693732, + "loss": 1.1574, + "step": 5564 + }, + { + "epoch": 0.53, + "grad_norm": 0.28288655943020663, + "learning_rate": 0.00017550663742868126, + "loss": 1.1328, + "step": 5565 + }, + { + "epoch": 0.53, + "grad_norm": 0.26185751105253613, + "learning_rate": 0.00017549626446082412, + "loss": 0.973, + "step": 5566 + }, + { + "epoch": 0.53, + "grad_norm": 0.295683213989208, + "learning_rate": 0.0001754858896036255, + "loss": 0.9697, + "step": 5567 + }, + { + "epoch": 0.53, + "grad_norm": 0.3264307309216949, + "learning_rate": 0.0001754755128573451, + "loss": 1.0159, + "step": 5568 + }, + { + "epoch": 0.53, + "grad_norm": 0.2753013510908348, + "learning_rate": 0.00017546513422224253, + "loss": 1.0423, + "step": 5569 + }, + { + "epoch": 0.53, + "grad_norm": 0.25968975647863907, + "learning_rate": 0.00017545475369857755, + "loss": 1.1648, + "step": 5570 + }, + { + "epoch": 0.53, + "grad_norm": 0.2804203410262089, + "learning_rate": 0.00017544437128660993, + "loss": 1.0268, + "step": 5571 + }, + { + "epoch": 0.53, + "grad_norm": 0.31239158563885117, + "learning_rate": 0.0001754339869865995, + "loss": 1.1495, + "step": 5572 + }, + { + "epoch": 0.53, + "grad_norm": 0.2733500389923679, + "learning_rate": 0.00017542360079880615, + "loss": 1.1128, + "step": 5573 + }, + { + "epoch": 0.53, + "grad_norm": 0.24320614149769154, + "learning_rate": 0.00017541321272348978, + "loss": 1.0674, + "step": 5574 + }, + { + "epoch": 0.53, + "grad_norm": 0.3181218743470043, + "learning_rate": 0.00017540282276091039, + "loss": 1.0091, + "step": 5575 + }, + { + "epoch": 0.53, + "grad_norm": 0.3031447060986254, + "learning_rate": 0.00017539243091132793, + "loss": 1.1715, + "step": 5576 + }, + { + "epoch": 0.53, + "grad_norm": 0.34380612538960487, + "learning_rate": 0.00017538203717500252, + "loss": 0.9964, + "step": 5577 + }, + { + "epoch": 0.53, + "grad_norm": 0.2661056584426639, + "learning_rate": 0.00017537164155219428, + "loss": 1.0928, + "step": 5578 + }, + { + "epoch": 0.53, + "grad_norm": 0.26522368104967226, + "learning_rate": 0.0001753612440431633, + "loss": 1.0348, + "step": 5579 + }, + { + "epoch": 0.53, + "grad_norm": 0.28303690003405435, + "learning_rate": 0.00017535084464816985, + "loss": 1.0077, + "step": 5580 + }, + { + "epoch": 0.53, + "grad_norm": 0.29888676081156723, + "learning_rate": 0.00017534044336747418, + "loss": 0.9651, + "step": 5581 + }, + { + "epoch": 0.53, + "grad_norm": 0.23610813701308117, + "learning_rate": 0.00017533004020133653, + "loss": 1.0869, + "step": 5582 + }, + { + "epoch": 0.53, + "grad_norm": 0.2473777150573363, + "learning_rate": 0.00017531963515001725, + "loss": 1.0581, + "step": 5583 + }, + { + "epoch": 0.53, + "grad_norm": 0.2878283161585639, + "learning_rate": 0.00017530922821377683, + "loss": 1.1182, + "step": 5584 + }, + { + "epoch": 0.53, + "grad_norm": 0.2895723267837339, + "learning_rate": 0.0001752988193928756, + "loss": 1.0193, + "step": 5585 + }, + { + "epoch": 0.53, + "grad_norm": 0.25924626472676504, + "learning_rate": 0.00017528840868757413, + "loss": 1.0764, + "step": 5586 + }, + { + "epoch": 0.53, + "grad_norm": 0.2697046584787591, + "learning_rate": 0.00017527799609813287, + "loss": 1.2123, + "step": 5587 + }, + { + "epoch": 0.53, + "grad_norm": 0.2779570564976789, + "learning_rate": 0.00017526758162481247, + "loss": 1.0463, + "step": 5588 + }, + { + "epoch": 0.53, + "grad_norm": 0.2659291393898864, + "learning_rate": 0.00017525716526787353, + "loss": 0.8657, + "step": 5589 + }, + { + "epoch": 0.53, + "grad_norm": 0.24940778956498255, + "learning_rate": 0.00017524674702757676, + "loss": 0.9349, + "step": 5590 + }, + { + "epoch": 0.53, + "grad_norm": 0.3512612443921498, + "learning_rate": 0.00017523632690418281, + "loss": 1.0309, + "step": 5591 + }, + { + "epoch": 0.53, + "grad_norm": 0.27949443291870935, + "learning_rate": 0.0001752259048979525, + "loss": 1.0867, + "step": 5592 + }, + { + "epoch": 0.54, + "grad_norm": 0.25241117168172916, + "learning_rate": 0.00017521548100914668, + "loss": 0.9595, + "step": 5593 + }, + { + "epoch": 0.54, + "grad_norm": 0.31177379926003695, + "learning_rate": 0.00017520505523802615, + "loss": 1.1147, + "step": 5594 + }, + { + "epoch": 0.54, + "grad_norm": 0.3035376452103113, + "learning_rate": 0.00017519462758485186, + "loss": 1.1091, + "step": 5595 + }, + { + "epoch": 0.54, + "grad_norm": 0.28334338339825155, + "learning_rate": 0.00017518419804988473, + "loss": 1.0027, + "step": 5596 + }, + { + "epoch": 0.54, + "grad_norm": 0.27635425273795633, + "learning_rate": 0.00017517376663338583, + "loss": 1.0832, + "step": 5597 + }, + { + "epoch": 0.54, + "grad_norm": 0.27479092014996065, + "learning_rate": 0.00017516333333561615, + "loss": 1.0254, + "step": 5598 + }, + { + "epoch": 0.54, + "grad_norm": 0.2923804540268794, + "learning_rate": 0.00017515289815683683, + "loss": 1.0939, + "step": 5599 + }, + { + "epoch": 0.54, + "grad_norm": 0.2689789333872261, + "learning_rate": 0.000175142461097309, + "loss": 1.0924, + "step": 5600 + }, + { + "epoch": 0.54, + "grad_norm": 0.2712229566662846, + "learning_rate": 0.00017513202215729384, + "loss": 1.1212, + "step": 5601 + }, + { + "epoch": 0.54, + "grad_norm": 0.28499521398979466, + "learning_rate": 0.0001751215813370526, + "loss": 1.1271, + "step": 5602 + }, + { + "epoch": 0.54, + "grad_norm": 0.272591489298431, + "learning_rate": 0.00017511113863684662, + "loss": 1.1602, + "step": 5603 + }, + { + "epoch": 0.54, + "grad_norm": 0.3008741005719719, + "learning_rate": 0.00017510069405693714, + "loss": 1.115, + "step": 5604 + }, + { + "epoch": 0.54, + "grad_norm": 0.2531166951692093, + "learning_rate": 0.00017509024759758561, + "loss": 1.0802, + "step": 5605 + }, + { + "epoch": 0.54, + "grad_norm": 0.24148412845313758, + "learning_rate": 0.00017507979925905347, + "loss": 0.953, + "step": 5606 + }, + { + "epoch": 0.54, + "grad_norm": 0.2534357412942829, + "learning_rate": 0.00017506934904160213, + "loss": 1.0784, + "step": 5607 + }, + { + "epoch": 0.54, + "grad_norm": 0.270455450479634, + "learning_rate": 0.00017505889694549316, + "loss": 1.0495, + "step": 5608 + }, + { + "epoch": 0.54, + "grad_norm": 0.2718743775512695, + "learning_rate": 0.00017504844297098812, + "loss": 1.1477, + "step": 5609 + }, + { + "epoch": 0.54, + "grad_norm": 0.2627508977482193, + "learning_rate": 0.00017503798711834863, + "loss": 0.9724, + "step": 5610 + }, + { + "epoch": 0.54, + "grad_norm": 0.28688718615510195, + "learning_rate": 0.00017502752938783637, + "loss": 1.0391, + "step": 5611 + }, + { + "epoch": 0.54, + "grad_norm": 0.3088622269070572, + "learning_rate": 0.000175017069779713, + "loss": 1.07, + "step": 5612 + }, + { + "epoch": 0.54, + "grad_norm": 0.2519490699405952, + "learning_rate": 0.00017500660829424035, + "loss": 0.9973, + "step": 5613 + }, + { + "epoch": 0.54, + "grad_norm": 0.2593258598031295, + "learning_rate": 0.00017499614493168017, + "loss": 1.1488, + "step": 5614 + }, + { + "epoch": 0.54, + "grad_norm": 0.28609091063527664, + "learning_rate": 0.00017498567969229432, + "loss": 1.1316, + "step": 5615 + }, + { + "epoch": 0.54, + "grad_norm": 0.2876241038098421, + "learning_rate": 0.00017497521257634472, + "loss": 1.142, + "step": 5616 + }, + { + "epoch": 0.54, + "grad_norm": 0.2699264999096545, + "learning_rate": 0.0001749647435840933, + "loss": 1.1133, + "step": 5617 + }, + { + "epoch": 0.54, + "grad_norm": 0.23212212244543703, + "learning_rate": 0.00017495427271580207, + "loss": 1.0177, + "step": 5618 + }, + { + "epoch": 0.54, + "grad_norm": 0.3191035435173965, + "learning_rate": 0.00017494379997173306, + "loss": 1.0753, + "step": 5619 + }, + { + "epoch": 0.54, + "grad_norm": 0.28382503691739164, + "learning_rate": 0.00017493332535214835, + "loss": 1.0277, + "step": 5620 + }, + { + "epoch": 0.54, + "grad_norm": 0.2620982386743405, + "learning_rate": 0.00017492284885731006, + "loss": 1.1087, + "step": 5621 + }, + { + "epoch": 0.54, + "grad_norm": 0.24048768135309437, + "learning_rate": 0.00017491237048748042, + "loss": 1.0179, + "step": 5622 + }, + { + "epoch": 0.54, + "grad_norm": 0.29297621702120474, + "learning_rate": 0.00017490189024292157, + "loss": 1.031, + "step": 5623 + }, + { + "epoch": 0.54, + "grad_norm": 0.24169868093525804, + "learning_rate": 0.00017489140812389591, + "loss": 1.1275, + "step": 5624 + }, + { + "epoch": 0.54, + "grad_norm": 0.3265517682655214, + "learning_rate": 0.00017488092413066566, + "loss": 1.09, + "step": 5625 + }, + { + "epoch": 0.54, + "grad_norm": 0.2889070937315453, + "learning_rate": 0.00017487043826349324, + "loss": 1.0827, + "step": 5626 + }, + { + "epoch": 0.54, + "grad_norm": 0.2616726142921512, + "learning_rate": 0.00017485995052264107, + "loss": 1.1226, + "step": 5627 + }, + { + "epoch": 0.54, + "grad_norm": 0.2823144964930083, + "learning_rate": 0.00017484946090837153, + "loss": 0.9189, + "step": 5628 + }, + { + "epoch": 0.54, + "grad_norm": 0.26247896162782614, + "learning_rate": 0.0001748389694209472, + "loss": 1.0863, + "step": 5629 + }, + { + "epoch": 0.54, + "grad_norm": 0.29529068834973227, + "learning_rate": 0.0001748284760606307, + "loss": 1.1572, + "step": 5630 + }, + { + "epoch": 0.54, + "grad_norm": 0.2725314072487987, + "learning_rate": 0.00017481798082768447, + "loss": 1.0527, + "step": 5631 + }, + { + "epoch": 0.54, + "grad_norm": 0.3268266589015598, + "learning_rate": 0.0001748074837223713, + "loss": 1.1146, + "step": 5632 + }, + { + "epoch": 0.54, + "grad_norm": 0.31412407097134504, + "learning_rate": 0.0001747969847449538, + "loss": 1.0571, + "step": 5633 + }, + { + "epoch": 0.54, + "grad_norm": 0.3129044080305532, + "learning_rate": 0.0001747864838956948, + "loss": 1.1354, + "step": 5634 + }, + { + "epoch": 0.54, + "grad_norm": 0.26484366612789245, + "learning_rate": 0.00017477598117485697, + "loss": 1.0219, + "step": 5635 + }, + { + "epoch": 0.54, + "grad_norm": 0.2881609277060793, + "learning_rate": 0.00017476547658270327, + "loss": 1.0661, + "step": 5636 + }, + { + "epoch": 0.54, + "grad_norm": 0.2785791649184835, + "learning_rate": 0.0001747549701194965, + "loss": 1.009, + "step": 5637 + }, + { + "epoch": 0.54, + "grad_norm": 0.2781943063291394, + "learning_rate": 0.00017474446178549963, + "loss": 1.1618, + "step": 5638 + }, + { + "epoch": 0.54, + "grad_norm": 0.286700570393826, + "learning_rate": 0.00017473395158097566, + "loss": 1.1184, + "step": 5639 + }, + { + "epoch": 0.54, + "grad_norm": 0.332805537003659, + "learning_rate": 0.00017472343950618755, + "loss": 1.0486, + "step": 5640 + }, + { + "epoch": 0.54, + "grad_norm": 0.27971090508186786, + "learning_rate": 0.0001747129255613984, + "loss": 1.1044, + "step": 5641 + }, + { + "epoch": 0.54, + "grad_norm": 0.28836371050426973, + "learning_rate": 0.00017470240974687133, + "loss": 1.0402, + "step": 5642 + }, + { + "epoch": 0.54, + "grad_norm": 0.30716094757551443, + "learning_rate": 0.00017469189206286952, + "loss": 1.113, + "step": 5643 + }, + { + "epoch": 0.54, + "grad_norm": 0.2743312373131173, + "learning_rate": 0.00017468137250965617, + "loss": 1.0299, + "step": 5644 + }, + { + "epoch": 0.54, + "grad_norm": 0.2609975564796487, + "learning_rate": 0.00017467085108749454, + "loss": 1.104, + "step": 5645 + }, + { + "epoch": 0.54, + "grad_norm": 0.29364822283678577, + "learning_rate": 0.0001746603277966479, + "loss": 1.0865, + "step": 5646 + }, + { + "epoch": 0.54, + "grad_norm": 0.2745578477787385, + "learning_rate": 0.00017464980263737968, + "loss": 1.1527, + "step": 5647 + }, + { + "epoch": 0.54, + "grad_norm": 0.26626720698396383, + "learning_rate": 0.00017463927560995321, + "loss": 1.0189, + "step": 5648 + }, + { + "epoch": 0.54, + "grad_norm": 0.27709851029020705, + "learning_rate": 0.000174628746714632, + "loss": 1.1241, + "step": 5649 + }, + { + "epoch": 0.54, + "grad_norm": 0.306265199776124, + "learning_rate": 0.00017461821595167945, + "loss": 1.1816, + "step": 5650 + }, + { + "epoch": 0.54, + "grad_norm": 0.23094636122975018, + "learning_rate": 0.00017460768332135918, + "loss": 1.0002, + "step": 5651 + }, + { + "epoch": 0.54, + "grad_norm": 0.25739454457437366, + "learning_rate": 0.00017459714882393473, + "loss": 1.0748, + "step": 5652 + }, + { + "epoch": 0.54, + "grad_norm": 0.24839067246041194, + "learning_rate": 0.00017458661245966974, + "loss": 1.1297, + "step": 5653 + }, + { + "epoch": 0.54, + "grad_norm": 0.2592947317889122, + "learning_rate": 0.0001745760742288279, + "loss": 1.1097, + "step": 5654 + }, + { + "epoch": 0.54, + "grad_norm": 0.29910161986761424, + "learning_rate": 0.00017456553413167293, + "loss": 1.0156, + "step": 5655 + }, + { + "epoch": 0.54, + "grad_norm": 0.2740550204065586, + "learning_rate": 0.00017455499216846864, + "loss": 1.1618, + "step": 5656 + }, + { + "epoch": 0.54, + "grad_norm": 0.2773348179265606, + "learning_rate": 0.00017454444833947877, + "loss": 1.0988, + "step": 5657 + }, + { + "epoch": 0.54, + "grad_norm": 0.2813065691260808, + "learning_rate": 0.00017453390264496728, + "loss": 0.9826, + "step": 5658 + }, + { + "epoch": 0.54, + "grad_norm": 0.2775051391562762, + "learning_rate": 0.000174523355085198, + "loss": 1.0356, + "step": 5659 + }, + { + "epoch": 0.54, + "grad_norm": 0.28015989674393643, + "learning_rate": 0.00017451280566043492, + "loss": 1.1507, + "step": 5660 + }, + { + "epoch": 0.54, + "grad_norm": 0.3011891640480555, + "learning_rate": 0.00017450225437094208, + "loss": 1.16, + "step": 5661 + }, + { + "epoch": 0.54, + "grad_norm": 0.38100561539213623, + "learning_rate": 0.00017449170121698347, + "loss": 1.0236, + "step": 5662 + }, + { + "epoch": 0.54, + "grad_norm": 0.27806057374613613, + "learning_rate": 0.00017448114619882321, + "loss": 1.0843, + "step": 5663 + }, + { + "epoch": 0.54, + "grad_norm": 0.2851575165624332, + "learning_rate": 0.0001744705893167255, + "loss": 1.1458, + "step": 5664 + }, + { + "epoch": 0.54, + "grad_norm": 0.2913077381839934, + "learning_rate": 0.00017446003057095447, + "loss": 1.1039, + "step": 5665 + }, + { + "epoch": 0.54, + "grad_norm": 0.28664285006948, + "learning_rate": 0.00017444946996177433, + "loss": 1.1214, + "step": 5666 + }, + { + "epoch": 0.54, + "grad_norm": 0.2838055565232777, + "learning_rate": 0.00017443890748944946, + "loss": 1.0184, + "step": 5667 + }, + { + "epoch": 0.54, + "grad_norm": 0.2511465662279637, + "learning_rate": 0.00017442834315424416, + "loss": 1.0698, + "step": 5668 + }, + { + "epoch": 0.54, + "grad_norm": 0.27586967822568237, + "learning_rate": 0.0001744177769564228, + "loss": 1.0234, + "step": 5669 + }, + { + "epoch": 0.54, + "grad_norm": 0.23851868493807254, + "learning_rate": 0.00017440720889624978, + "loss": 1.109, + "step": 5670 + }, + { + "epoch": 0.54, + "grad_norm": 0.29280665470416317, + "learning_rate": 0.00017439663897398958, + "loss": 1.1473, + "step": 5671 + }, + { + "epoch": 0.54, + "grad_norm": 0.27293284903881204, + "learning_rate": 0.00017438606718990675, + "loss": 1.0855, + "step": 5672 + }, + { + "epoch": 0.54, + "grad_norm": 0.3102331601705705, + "learning_rate": 0.00017437549354426586, + "loss": 0.979, + "step": 5673 + }, + { + "epoch": 0.54, + "grad_norm": 0.2904424568361712, + "learning_rate": 0.00017436491803733147, + "loss": 1.0758, + "step": 5674 + }, + { + "epoch": 0.54, + "grad_norm": 0.3038807854764857, + "learning_rate": 0.00017435434066936828, + "loss": 1.0813, + "step": 5675 + }, + { + "epoch": 0.54, + "grad_norm": 0.2982170129842694, + "learning_rate": 0.00017434376144064096, + "loss": 0.9269, + "step": 5676 + }, + { + "epoch": 0.54, + "grad_norm": 0.29066232686572985, + "learning_rate": 0.00017433318035141432, + "loss": 1.0761, + "step": 5677 + }, + { + "epoch": 0.54, + "grad_norm": 0.36384380111246045, + "learning_rate": 0.0001743225974019531, + "loss": 1.1277, + "step": 5678 + }, + { + "epoch": 0.54, + "grad_norm": 0.28539850304066966, + "learning_rate": 0.00017431201259252222, + "loss": 1.1838, + "step": 5679 + }, + { + "epoch": 0.54, + "grad_norm": 0.3010319695376003, + "learning_rate": 0.00017430142592338648, + "loss": 0.9309, + "step": 5680 + }, + { + "epoch": 0.54, + "grad_norm": 0.2639994319325743, + "learning_rate": 0.00017429083739481087, + "loss": 1.1049, + "step": 5681 + }, + { + "epoch": 0.54, + "grad_norm": 0.2979159287960645, + "learning_rate": 0.00017428024700706036, + "loss": 1.1221, + "step": 5682 + }, + { + "epoch": 0.54, + "grad_norm": 0.2819099394302753, + "learning_rate": 0.0001742696547604, + "loss": 1.0916, + "step": 5683 + }, + { + "epoch": 0.54, + "grad_norm": 0.260104363656028, + "learning_rate": 0.00017425906065509484, + "loss": 1.0034, + "step": 5684 + }, + { + "epoch": 0.54, + "grad_norm": 0.2970820007503617, + "learning_rate": 0.00017424846469141, + "loss": 1.1024, + "step": 5685 + }, + { + "epoch": 0.54, + "grad_norm": 0.2620883115817067, + "learning_rate": 0.0001742378668696107, + "loss": 1.0562, + "step": 5686 + }, + { + "epoch": 0.54, + "grad_norm": 0.2491078043946645, + "learning_rate": 0.0001742272671899621, + "loss": 1.0625, + "step": 5687 + }, + { + "epoch": 0.54, + "grad_norm": 0.30629114639440114, + "learning_rate": 0.00017421666565272948, + "loss": 1.202, + "step": 5688 + }, + { + "epoch": 0.54, + "grad_norm": 0.30830018039821566, + "learning_rate": 0.0001742060622581782, + "loss": 1.0535, + "step": 5689 + }, + { + "epoch": 0.54, + "grad_norm": 0.2724222943145667, + "learning_rate": 0.00017419545700657354, + "loss": 1.1325, + "step": 5690 + }, + { + "epoch": 0.54, + "grad_norm": 0.23708317781034122, + "learning_rate": 0.00017418484989818096, + "loss": 1.0638, + "step": 5691 + }, + { + "epoch": 0.54, + "grad_norm": 0.26229220094670336, + "learning_rate": 0.00017417424093326588, + "loss": 0.9439, + "step": 5692 + }, + { + "epoch": 0.54, + "grad_norm": 0.25017286913990483, + "learning_rate": 0.0001741636301120938, + "loss": 1.079, + "step": 5693 + }, + { + "epoch": 0.54, + "grad_norm": 0.2860341282853943, + "learning_rate": 0.00017415301743493026, + "loss": 1.109, + "step": 5694 + }, + { + "epoch": 0.54, + "grad_norm": 0.27000375628865514, + "learning_rate": 0.00017414240290204087, + "loss": 1.0366, + "step": 5695 + }, + { + "epoch": 0.54, + "grad_norm": 0.2944772274521784, + "learning_rate": 0.00017413178651369123, + "loss": 1.106, + "step": 5696 + }, + { + "epoch": 0.55, + "grad_norm": 0.29923112387635076, + "learning_rate": 0.00017412116827014707, + "loss": 1.118, + "step": 5697 + }, + { + "epoch": 0.55, + "grad_norm": 0.27744625628639163, + "learning_rate": 0.00017411054817167407, + "loss": 1.0582, + "step": 5698 + }, + { + "epoch": 0.55, + "grad_norm": 0.2872887689318743, + "learning_rate": 0.00017409992621853803, + "loss": 1.135, + "step": 5699 + }, + { + "epoch": 0.55, + "grad_norm": 0.2868590588865866, + "learning_rate": 0.00017408930241100476, + "loss": 1.1218, + "step": 5700 + }, + { + "epoch": 0.55, + "grad_norm": 0.28039756959308726, + "learning_rate": 0.00017407867674934014, + "loss": 1.0089, + "step": 5701 + }, + { + "epoch": 0.55, + "grad_norm": 0.26834033841073884, + "learning_rate": 0.00017406804923381008, + "loss": 1.0931, + "step": 5702 + }, + { + "epoch": 0.55, + "grad_norm": 0.2679055295777244, + "learning_rate": 0.00017405741986468054, + "loss": 1.0362, + "step": 5703 + }, + { + "epoch": 0.55, + "grad_norm": 0.31948114555469886, + "learning_rate": 0.00017404678864221752, + "loss": 1.1499, + "step": 5704 + }, + { + "epoch": 0.55, + "grad_norm": 0.254735913314033, + "learning_rate": 0.00017403615556668708, + "loss": 1.0521, + "step": 5705 + }, + { + "epoch": 0.55, + "grad_norm": 0.2758568173458296, + "learning_rate": 0.00017402552063835533, + "loss": 1.0276, + "step": 5706 + }, + { + "epoch": 0.55, + "grad_norm": 0.2881056253518769, + "learning_rate": 0.0001740148838574884, + "loss": 0.9892, + "step": 5707 + }, + { + "epoch": 0.55, + "grad_norm": 0.28754810292102423, + "learning_rate": 0.00017400424522435247, + "loss": 1.0138, + "step": 5708 + }, + { + "epoch": 0.55, + "grad_norm": 0.26622897762881104, + "learning_rate": 0.0001739936047392138, + "loss": 1.0121, + "step": 5709 + }, + { + "epoch": 0.55, + "grad_norm": 0.3257799896467255, + "learning_rate": 0.00017398296240233866, + "loss": 1.0488, + "step": 5710 + }, + { + "epoch": 0.55, + "grad_norm": 0.300245783416075, + "learning_rate": 0.0001739723182139934, + "loss": 1.199, + "step": 5711 + }, + { + "epoch": 0.55, + "grad_norm": 0.29793504189936215, + "learning_rate": 0.00017396167217444437, + "loss": 1.0326, + "step": 5712 + }, + { + "epoch": 0.55, + "grad_norm": 0.2895788092153529, + "learning_rate": 0.00017395102428395803, + "loss": 1.0282, + "step": 5713 + }, + { + "epoch": 0.55, + "grad_norm": 0.2759129672263007, + "learning_rate": 0.0001739403745428008, + "loss": 1.1353, + "step": 5714 + }, + { + "epoch": 0.55, + "grad_norm": 0.2736823382715656, + "learning_rate": 0.0001739297229512393, + "loss": 1.0918, + "step": 5715 + }, + { + "epoch": 0.55, + "grad_norm": 0.30836802392427065, + "learning_rate": 0.00017391906950953994, + "loss": 1.1101, + "step": 5716 + }, + { + "epoch": 0.55, + "grad_norm": 0.2923956322766849, + "learning_rate": 0.00017390841421796943, + "loss": 1.1005, + "step": 5717 + }, + { + "epoch": 0.55, + "grad_norm": 0.2582193114884755, + "learning_rate": 0.00017389775707679444, + "loss": 1.0708, + "step": 5718 + }, + { + "epoch": 0.55, + "grad_norm": 0.24558151292411884, + "learning_rate": 0.0001738870980862816, + "loss": 1.1303, + "step": 5719 + }, + { + "epoch": 0.55, + "grad_norm": 0.29322536081033934, + "learning_rate": 0.0001738764372466977, + "loss": 1.0927, + "step": 5720 + }, + { + "epoch": 0.55, + "grad_norm": 0.2659755712941869, + "learning_rate": 0.00017386577455830952, + "loss": 1.0032, + "step": 5721 + }, + { + "epoch": 0.55, + "grad_norm": 0.2635454724857395, + "learning_rate": 0.00017385511002138393, + "loss": 1.2808, + "step": 5722 + }, + { + "epoch": 0.55, + "grad_norm": 0.26786955193926654, + "learning_rate": 0.0001738444436361878, + "loss": 1.0475, + "step": 5723 + }, + { + "epoch": 0.55, + "grad_norm": 0.28425119186729847, + "learning_rate": 0.00017383377540298805, + "loss": 1.0817, + "step": 5724 + }, + { + "epoch": 0.55, + "grad_norm": 0.28386565821296494, + "learning_rate": 0.00017382310532205165, + "loss": 0.9009, + "step": 5725 + }, + { + "epoch": 0.55, + "grad_norm": 0.2455076247487801, + "learning_rate": 0.00017381243339364565, + "loss": 0.9972, + "step": 5726 + }, + { + "epoch": 0.55, + "grad_norm": 0.2949244070026697, + "learning_rate": 0.00017380175961803713, + "loss": 1.0042, + "step": 5727 + }, + { + "epoch": 0.55, + "grad_norm": 0.2806527868238312, + "learning_rate": 0.00017379108399549317, + "loss": 1.1932, + "step": 5728 + }, + { + "epoch": 0.55, + "grad_norm": 0.27785708544616866, + "learning_rate": 0.000173780406526281, + "loss": 1.0914, + "step": 5729 + }, + { + "epoch": 0.55, + "grad_norm": 0.28683282908042806, + "learning_rate": 0.00017376972721066776, + "loss": 1.1145, + "step": 5730 + }, + { + "epoch": 0.55, + "grad_norm": 0.2838770112878104, + "learning_rate": 0.00017375904604892073, + "loss": 1.0765, + "step": 5731 + }, + { + "epoch": 0.55, + "grad_norm": 0.2934314888524707, + "learning_rate": 0.0001737483630413072, + "loss": 1.0595, + "step": 5732 + }, + { + "epoch": 0.55, + "grad_norm": 0.25830945410699585, + "learning_rate": 0.00017373767818809456, + "loss": 0.9669, + "step": 5733 + }, + { + "epoch": 0.55, + "grad_norm": 0.2665953859888515, + "learning_rate": 0.00017372699148955018, + "loss": 1.0714, + "step": 5734 + }, + { + "epoch": 0.55, + "grad_norm": 0.2731137251463795, + "learning_rate": 0.0001737163029459415, + "loss": 1.0746, + "step": 5735 + }, + { + "epoch": 0.55, + "grad_norm": 0.25825981988483343, + "learning_rate": 0.00017370561255753602, + "loss": 0.9534, + "step": 5736 + }, + { + "epoch": 0.55, + "grad_norm": 0.30383684481721607, + "learning_rate": 0.00017369492032460123, + "loss": 1.1384, + "step": 5737 + }, + { + "epoch": 0.55, + "grad_norm": 0.31417342120569464, + "learning_rate": 0.00017368422624740478, + "loss": 1.0576, + "step": 5738 + }, + { + "epoch": 0.55, + "grad_norm": 0.2593381194717112, + "learning_rate": 0.00017367353032621426, + "loss": 1.0573, + "step": 5739 + }, + { + "epoch": 0.55, + "grad_norm": 0.2778893532357316, + "learning_rate": 0.00017366283256129732, + "loss": 0.9766, + "step": 5740 + }, + { + "epoch": 0.55, + "grad_norm": 0.3072153615495613, + "learning_rate": 0.0001736521329529217, + "loss": 1.1247, + "step": 5741 + }, + { + "epoch": 0.55, + "grad_norm": 0.28745857193672664, + "learning_rate": 0.00017364143150135517, + "loss": 1.1141, + "step": 5742 + }, + { + "epoch": 0.55, + "grad_norm": 0.2651127640673508, + "learning_rate": 0.00017363072820686552, + "loss": 1.0829, + "step": 5743 + }, + { + "epoch": 0.55, + "grad_norm": 0.2902298639371502, + "learning_rate": 0.00017362002306972065, + "loss": 0.987, + "step": 5744 + }, + { + "epoch": 0.55, + "grad_norm": 0.27655920423756686, + "learning_rate": 0.00017360931609018842, + "loss": 1.0076, + "step": 5745 + }, + { + "epoch": 0.55, + "grad_norm": 0.2567390152989073, + "learning_rate": 0.0001735986072685368, + "loss": 1.0375, + "step": 5746 + }, + { + "epoch": 0.55, + "grad_norm": 0.24515803399806585, + "learning_rate": 0.00017358789660503377, + "loss": 1.0745, + "step": 5747 + }, + { + "epoch": 0.55, + "grad_norm": 0.2575517841295073, + "learning_rate": 0.0001735771840999474, + "loss": 1.0153, + "step": 5748 + }, + { + "epoch": 0.55, + "grad_norm": 0.2764368481508909, + "learning_rate": 0.0001735664697535457, + "loss": 1.1584, + "step": 5749 + }, + { + "epoch": 0.55, + "grad_norm": 0.27351695694822586, + "learning_rate": 0.0001735557535660969, + "loss": 1.0281, + "step": 5750 + }, + { + "epoch": 0.55, + "grad_norm": 0.2667030418236546, + "learning_rate": 0.00017354503553786916, + "loss": 1.1321, + "step": 5751 + }, + { + "epoch": 0.55, + "grad_norm": 0.29279846320754066, + "learning_rate": 0.00017353431566913066, + "loss": 0.9984, + "step": 5752 + }, + { + "epoch": 0.55, + "grad_norm": 0.2415120154785472, + "learning_rate": 0.0001735235939601497, + "loss": 1.0611, + "step": 5753 + }, + { + "epoch": 0.55, + "grad_norm": 0.25955357361068576, + "learning_rate": 0.00017351287041119458, + "loss": 1.1294, + "step": 5754 + }, + { + "epoch": 0.55, + "grad_norm": 0.294206732380647, + "learning_rate": 0.0001735021450225337, + "loss": 1.0812, + "step": 5755 + }, + { + "epoch": 0.55, + "grad_norm": 0.29031247550246536, + "learning_rate": 0.00017349141779443542, + "loss": 1.0576, + "step": 5756 + }, + { + "epoch": 0.55, + "grad_norm": 0.2672750801707316, + "learning_rate": 0.00017348068872716823, + "loss": 1.0851, + "step": 5757 + }, + { + "epoch": 0.55, + "grad_norm": 0.26957335518481657, + "learning_rate": 0.00017346995782100062, + "loss": 1.0858, + "step": 5758 + }, + { + "epoch": 0.55, + "grad_norm": 0.32734612669273955, + "learning_rate": 0.00017345922507620116, + "loss": 1.0656, + "step": 5759 + }, + { + "epoch": 0.55, + "grad_norm": 0.24154451886063427, + "learning_rate": 0.00017344849049303842, + "loss": 0.9896, + "step": 5760 + }, + { + "epoch": 0.55, + "grad_norm": 0.27709567176869415, + "learning_rate": 0.00017343775407178104, + "loss": 1.0805, + "step": 5761 + }, + { + "epoch": 0.55, + "grad_norm": 0.272244078427738, + "learning_rate": 0.0001734270158126977, + "loss": 1.0433, + "step": 5762 + }, + { + "epoch": 0.55, + "grad_norm": 0.2673872164348252, + "learning_rate": 0.00017341627571605716, + "loss": 1.1464, + "step": 5763 + }, + { + "epoch": 0.55, + "grad_norm": 0.27315997652853985, + "learning_rate": 0.00017340553378212816, + "loss": 1.1567, + "step": 5764 + }, + { + "epoch": 0.55, + "grad_norm": 0.23474379585480734, + "learning_rate": 0.00017339479001117955, + "loss": 1.0089, + "step": 5765 + }, + { + "epoch": 0.55, + "grad_norm": 0.27652850547755903, + "learning_rate": 0.00017338404440348022, + "loss": 1.0461, + "step": 5766 + }, + { + "epoch": 0.55, + "grad_norm": 0.31896644764441623, + "learning_rate": 0.00017337329695929902, + "loss": 1.0359, + "step": 5767 + }, + { + "epoch": 0.55, + "grad_norm": 0.274574680140215, + "learning_rate": 0.00017336254767890498, + "loss": 1.0201, + "step": 5768 + }, + { + "epoch": 0.55, + "grad_norm": 0.2630831469195627, + "learning_rate": 0.00017335179656256705, + "loss": 1.0975, + "step": 5769 + }, + { + "epoch": 0.55, + "grad_norm": 0.2793728024955909, + "learning_rate": 0.00017334104361055436, + "loss": 1.1332, + "step": 5770 + }, + { + "epoch": 0.55, + "grad_norm": 0.27408839587852935, + "learning_rate": 0.0001733302888231359, + "loss": 0.939, + "step": 5771 + }, + { + "epoch": 0.55, + "grad_norm": 0.27297436472505615, + "learning_rate": 0.0001733195322005809, + "loss": 1.0168, + "step": 5772 + }, + { + "epoch": 0.55, + "grad_norm": 0.30630256010686685, + "learning_rate": 0.00017330877374315855, + "loss": 1.1169, + "step": 5773 + }, + { + "epoch": 0.55, + "grad_norm": 0.2767879583144832, + "learning_rate": 0.00017329801345113802, + "loss": 1.0233, + "step": 5774 + }, + { + "epoch": 0.55, + "grad_norm": 0.29684472098320214, + "learning_rate": 0.0001732872513247887, + "loss": 1.0463, + "step": 5775 + }, + { + "epoch": 0.55, + "grad_norm": 0.2785233560114013, + "learning_rate": 0.00017327648736437977, + "loss": 1.142, + "step": 5776 + }, + { + "epoch": 0.55, + "grad_norm": 0.3208276011124166, + "learning_rate": 0.00017326572157018078, + "loss": 1.1932, + "step": 5777 + }, + { + "epoch": 0.55, + "grad_norm": 0.22684754778328245, + "learning_rate": 0.000173254953942461, + "loss": 1.0656, + "step": 5778 + }, + { + "epoch": 0.55, + "grad_norm": 0.3374242730404179, + "learning_rate": 0.00017324418448148998, + "loss": 1.049, + "step": 5779 + }, + { + "epoch": 0.55, + "grad_norm": 0.3428578883405557, + "learning_rate": 0.0001732334131875372, + "loss": 1.1952, + "step": 5780 + }, + { + "epoch": 0.55, + "grad_norm": 0.27472370192399703, + "learning_rate": 0.00017322264006087225, + "loss": 1.2441, + "step": 5781 + }, + { + "epoch": 0.55, + "grad_norm": 0.27491429642165643, + "learning_rate": 0.0001732118651017647, + "loss": 1.0994, + "step": 5782 + }, + { + "epoch": 0.55, + "grad_norm": 0.27459654102558345, + "learning_rate": 0.00017320108831048422, + "loss": 1.0496, + "step": 5783 + }, + { + "epoch": 0.55, + "grad_norm": 0.2699347475330186, + "learning_rate": 0.0001731903096873005, + "loss": 1.1214, + "step": 5784 + }, + { + "epoch": 0.55, + "grad_norm": 0.296175698180158, + "learning_rate": 0.00017317952923248328, + "loss": 1.0308, + "step": 5785 + }, + { + "epoch": 0.55, + "grad_norm": 0.29058979499652565, + "learning_rate": 0.00017316874694630236, + "loss": 1.0828, + "step": 5786 + }, + { + "epoch": 0.55, + "grad_norm": 0.2997585209236349, + "learning_rate": 0.00017315796282902753, + "loss": 1.1366, + "step": 5787 + }, + { + "epoch": 0.55, + "grad_norm": 0.27673867013826936, + "learning_rate": 0.00017314717688092873, + "loss": 0.945, + "step": 5788 + }, + { + "epoch": 0.55, + "grad_norm": 0.23598291014298442, + "learning_rate": 0.00017313638910227585, + "loss": 0.9402, + "step": 5789 + }, + { + "epoch": 0.55, + "grad_norm": 0.26553154287144975, + "learning_rate": 0.00017312559949333886, + "loss": 1.1785, + "step": 5790 + }, + { + "epoch": 0.55, + "grad_norm": 0.28452803788674824, + "learning_rate": 0.0001731148080543878, + "loss": 1.124, + "step": 5791 + }, + { + "epoch": 0.55, + "grad_norm": 0.2705626232253567, + "learning_rate": 0.00017310401478569273, + "loss": 1.0191, + "step": 5792 + }, + { + "epoch": 0.55, + "grad_norm": 0.24817380433889646, + "learning_rate": 0.0001730932196875237, + "loss": 1.0482, + "step": 5793 + }, + { + "epoch": 0.55, + "grad_norm": 0.26652308262279306, + "learning_rate": 0.00017308242276015094, + "loss": 1.0738, + "step": 5794 + }, + { + "epoch": 0.55, + "grad_norm": 0.24818290945260654, + "learning_rate": 0.00017307162400384462, + "loss": 1.0175, + "step": 5795 + }, + { + "epoch": 0.55, + "grad_norm": 0.26331611360600954, + "learning_rate": 0.000173060823418875, + "loss": 1.0788, + "step": 5796 + }, + { + "epoch": 0.55, + "grad_norm": 0.25070740236687494, + "learning_rate": 0.00017305002100551233, + "loss": 0.9522, + "step": 5797 + }, + { + "epoch": 0.55, + "grad_norm": 0.27552761246559027, + "learning_rate": 0.000173039216764027, + "loss": 1.1854, + "step": 5798 + }, + { + "epoch": 0.55, + "grad_norm": 0.24607203901766253, + "learning_rate": 0.00017302841069468934, + "loss": 1.0574, + "step": 5799 + }, + { + "epoch": 0.55, + "grad_norm": 0.24789972386779166, + "learning_rate": 0.00017301760279776982, + "loss": 1.0585, + "step": 5800 + }, + { + "epoch": 0.55, + "grad_norm": 0.22933484912929925, + "learning_rate": 0.00017300679307353888, + "loss": 1.0806, + "step": 5801 + }, + { + "epoch": 0.56, + "grad_norm": 0.27649671163848355, + "learning_rate": 0.0001729959815222671, + "loss": 1.1766, + "step": 5802 + }, + { + "epoch": 0.56, + "grad_norm": 0.24226545584694414, + "learning_rate": 0.00017298516814422498, + "loss": 0.9117, + "step": 5803 + }, + { + "epoch": 0.56, + "grad_norm": 0.26242930789541363, + "learning_rate": 0.00017297435293968315, + "loss": 1.1165, + "step": 5804 + }, + { + "epoch": 0.56, + "grad_norm": 0.3120420916073829, + "learning_rate": 0.0001729635359089123, + "loss": 1.0574, + "step": 5805 + }, + { + "epoch": 0.56, + "grad_norm": 0.2701256919492001, + "learning_rate": 0.00017295271705218307, + "loss": 1.1047, + "step": 5806 + }, + { + "epoch": 0.56, + "grad_norm": 0.29645770053433096, + "learning_rate": 0.0001729418963697663, + "loss": 1.1329, + "step": 5807 + }, + { + "epoch": 0.56, + "grad_norm": 0.24632237776935018, + "learning_rate": 0.0001729310738619327, + "loss": 1.1359, + "step": 5808 + }, + { + "epoch": 0.56, + "grad_norm": 0.27163467992709467, + "learning_rate": 0.00017292024952895313, + "loss": 1.1006, + "step": 5809 + }, + { + "epoch": 0.56, + "grad_norm": 0.26959200346069406, + "learning_rate": 0.0001729094233710985, + "loss": 1.0042, + "step": 5810 + }, + { + "epoch": 0.56, + "grad_norm": 0.289959983695105, + "learning_rate": 0.00017289859538863973, + "loss": 1.1085, + "step": 5811 + }, + { + "epoch": 0.56, + "grad_norm": 0.3213875350320641, + "learning_rate": 0.0001728877655818478, + "loss": 1.1062, + "step": 5812 + }, + { + "epoch": 0.56, + "grad_norm": 0.3270439141750809, + "learning_rate": 0.0001728769339509937, + "loss": 1.0258, + "step": 5813 + }, + { + "epoch": 0.56, + "grad_norm": 0.3409072299007263, + "learning_rate": 0.00017286610049634856, + "loss": 0.9644, + "step": 5814 + }, + { + "epoch": 0.56, + "grad_norm": 0.24496972615719473, + "learning_rate": 0.00017285526521818346, + "loss": 1.1259, + "step": 5815 + }, + { + "epoch": 0.56, + "grad_norm": 0.3139089621120804, + "learning_rate": 0.00017284442811676953, + "loss": 1.0404, + "step": 5816 + }, + { + "epoch": 0.56, + "grad_norm": 0.26970603580675145, + "learning_rate": 0.00017283358919237802, + "loss": 1.0835, + "step": 5817 + }, + { + "epoch": 0.56, + "grad_norm": 0.2733732615441018, + "learning_rate": 0.0001728227484452802, + "loss": 1.0744, + "step": 5818 + }, + { + "epoch": 0.56, + "grad_norm": 0.28036462116083893, + "learning_rate": 0.00017281190587574728, + "loss": 1.1692, + "step": 5819 + }, + { + "epoch": 0.56, + "grad_norm": 0.2871660582969163, + "learning_rate": 0.0001728010614840507, + "loss": 1.0495, + "step": 5820 + }, + { + "epoch": 0.56, + "grad_norm": 0.23884476735212962, + "learning_rate": 0.00017279021527046178, + "loss": 1.0443, + "step": 5821 + }, + { + "epoch": 0.56, + "grad_norm": 0.2873207540071599, + "learning_rate": 0.00017277936723525197, + "loss": 0.993, + "step": 5822 + }, + { + "epoch": 0.56, + "grad_norm": 0.30547644132922835, + "learning_rate": 0.00017276851737869274, + "loss": 1.0843, + "step": 5823 + }, + { + "epoch": 0.56, + "grad_norm": 0.3057414154299913, + "learning_rate": 0.00017275766570105567, + "loss": 1.0655, + "step": 5824 + }, + { + "epoch": 0.56, + "grad_norm": 0.2632962822442975, + "learning_rate": 0.00017274681220261226, + "loss": 1.0939, + "step": 5825 + }, + { + "epoch": 0.56, + "grad_norm": 0.30877886648124775, + "learning_rate": 0.00017273595688363416, + "loss": 1.1277, + "step": 5826 + }, + { + "epoch": 0.56, + "grad_norm": 0.26307234053148654, + "learning_rate": 0.00017272509974439304, + "loss": 1.0079, + "step": 5827 + }, + { + "epoch": 0.56, + "grad_norm": 0.2841767029229268, + "learning_rate": 0.00017271424078516055, + "loss": 1.1768, + "step": 5828 + }, + { + "epoch": 0.56, + "grad_norm": 0.2603214619363828, + "learning_rate": 0.00017270338000620856, + "loss": 1.0806, + "step": 5829 + }, + { + "epoch": 0.56, + "grad_norm": 0.2545067452234432, + "learning_rate": 0.00017269251740780874, + "loss": 1.0412, + "step": 5830 + }, + { + "epoch": 0.56, + "grad_norm": 0.27855295154779947, + "learning_rate": 0.000172681652990233, + "loss": 1.2206, + "step": 5831 + }, + { + "epoch": 0.56, + "grad_norm": 0.28261805590758377, + "learning_rate": 0.00017267078675375322, + "loss": 1.0033, + "step": 5832 + }, + { + "epoch": 0.56, + "grad_norm": 0.27667495248962876, + "learning_rate": 0.00017265991869864128, + "loss": 1.0831, + "step": 5833 + }, + { + "epoch": 0.56, + "grad_norm": 0.2714857639489485, + "learning_rate": 0.00017264904882516928, + "loss": 1.109, + "step": 5834 + }, + { + "epoch": 0.56, + "grad_norm": 0.27229604294119086, + "learning_rate": 0.00017263817713360915, + "loss": 1.0874, + "step": 5835 + }, + { + "epoch": 0.56, + "grad_norm": 0.293920707567098, + "learning_rate": 0.00017262730362423297, + "loss": 1.0823, + "step": 5836 + }, + { + "epoch": 0.56, + "grad_norm": 0.2613726158263339, + "learning_rate": 0.00017261642829731287, + "loss": 1.0599, + "step": 5837 + }, + { + "epoch": 0.56, + "grad_norm": 0.2889492784232317, + "learning_rate": 0.00017260555115312104, + "loss": 1.1224, + "step": 5838 + }, + { + "epoch": 0.56, + "grad_norm": 0.25970536207303385, + "learning_rate": 0.00017259467219192968, + "loss": 1.1015, + "step": 5839 + }, + { + "epoch": 0.56, + "grad_norm": 0.28971995596673233, + "learning_rate": 0.00017258379141401098, + "loss": 1.0696, + "step": 5840 + }, + { + "epoch": 0.56, + "grad_norm": 0.30430404198918504, + "learning_rate": 0.00017257290881963732, + "loss": 1.0823, + "step": 5841 + }, + { + "epoch": 0.56, + "grad_norm": 0.2713987221161718, + "learning_rate": 0.00017256202440908095, + "loss": 1.0604, + "step": 5842 + }, + { + "epoch": 0.56, + "grad_norm": 0.3001859725452708, + "learning_rate": 0.00017255113818261437, + "loss": 0.8537, + "step": 5843 + }, + { + "epoch": 0.56, + "grad_norm": 0.24037659758164331, + "learning_rate": 0.00017254025014050995, + "loss": 1.0035, + "step": 5844 + }, + { + "epoch": 0.56, + "grad_norm": 0.29289291269851125, + "learning_rate": 0.00017252936028304015, + "loss": 0.9527, + "step": 5845 + }, + { + "epoch": 0.56, + "grad_norm": 0.28320339512377446, + "learning_rate": 0.00017251846861047755, + "loss": 1.0616, + "step": 5846 + }, + { + "epoch": 0.56, + "grad_norm": 0.2844954523450815, + "learning_rate": 0.0001725075751230947, + "loss": 1.1248, + "step": 5847 + }, + { + "epoch": 0.56, + "grad_norm": 0.25465891657756395, + "learning_rate": 0.0001724966798211642, + "loss": 1.2268, + "step": 5848 + }, + { + "epoch": 0.56, + "grad_norm": 0.318243854381763, + "learning_rate": 0.00017248578270495873, + "loss": 1.1983, + "step": 5849 + }, + { + "epoch": 0.56, + "grad_norm": 0.24392495708256506, + "learning_rate": 0.00017247488377475102, + "loss": 1.1131, + "step": 5850 + }, + { + "epoch": 0.56, + "grad_norm": 0.339924875849168, + "learning_rate": 0.00017246398303081377, + "loss": 1.1255, + "step": 5851 + }, + { + "epoch": 0.56, + "grad_norm": 0.2824682986273372, + "learning_rate": 0.00017245308047341977, + "loss": 1.0928, + "step": 5852 + }, + { + "epoch": 0.56, + "grad_norm": 0.294553844921709, + "learning_rate": 0.00017244217610284194, + "loss": 1.1941, + "step": 5853 + }, + { + "epoch": 0.56, + "grad_norm": 0.3175139266139292, + "learning_rate": 0.0001724312699193531, + "loss": 1.0488, + "step": 5854 + }, + { + "epoch": 0.56, + "grad_norm": 0.26336543686584807, + "learning_rate": 0.0001724203619232262, + "loss": 1.0406, + "step": 5855 + }, + { + "epoch": 0.56, + "grad_norm": 0.28698720470502975, + "learning_rate": 0.00017240945211473426, + "loss": 1.1673, + "step": 5856 + }, + { + "epoch": 0.56, + "grad_norm": 0.2705696042656757, + "learning_rate": 0.0001723985404941503, + "loss": 1.0654, + "step": 5857 + }, + { + "epoch": 0.56, + "grad_norm": 0.2609774023108237, + "learning_rate": 0.0001723876270617473, + "loss": 1.0166, + "step": 5858 + }, + { + "epoch": 0.56, + "grad_norm": 0.2879084621496255, + "learning_rate": 0.0001723767118177985, + "loss": 1.1731, + "step": 5859 + }, + { + "epoch": 0.56, + "grad_norm": 0.25995995206356726, + "learning_rate": 0.00017236579476257694, + "loss": 1.1141, + "step": 5860 + }, + { + "epoch": 0.56, + "grad_norm": 0.2832052274386346, + "learning_rate": 0.00017235487589635593, + "loss": 1.0356, + "step": 5861 + }, + { + "epoch": 0.56, + "grad_norm": 0.3009633175427442, + "learning_rate": 0.00017234395521940866, + "loss": 1.0634, + "step": 5862 + }, + { + "epoch": 0.56, + "grad_norm": 0.28724524890382835, + "learning_rate": 0.00017233303273200842, + "loss": 1.0978, + "step": 5863 + }, + { + "epoch": 0.56, + "grad_norm": 0.30150303153924973, + "learning_rate": 0.0001723221084344286, + "loss": 1.0883, + "step": 5864 + }, + { + "epoch": 0.56, + "grad_norm": 0.2602169228739236, + "learning_rate": 0.00017231118232694255, + "loss": 0.8809, + "step": 5865 + }, + { + "epoch": 0.56, + "grad_norm": 0.2597086205288419, + "learning_rate": 0.00017230025440982373, + "loss": 1.0913, + "step": 5866 + }, + { + "epoch": 0.56, + "grad_norm": 0.2627390421998769, + "learning_rate": 0.0001722893246833456, + "loss": 0.9786, + "step": 5867 + }, + { + "epoch": 0.56, + "grad_norm": 0.29086276852649495, + "learning_rate": 0.0001722783931477817, + "loss": 1.1312, + "step": 5868 + }, + { + "epoch": 0.56, + "grad_norm": 0.2631914398523322, + "learning_rate": 0.00017226745980340556, + "loss": 1.1383, + "step": 5869 + }, + { + "epoch": 0.56, + "grad_norm": 0.293675216189509, + "learning_rate": 0.00017225652465049086, + "loss": 1.037, + "step": 5870 + }, + { + "epoch": 0.56, + "grad_norm": 0.27269400481933986, + "learning_rate": 0.0001722455876893112, + "loss": 1.0594, + "step": 5871 + }, + { + "epoch": 0.56, + "grad_norm": 0.2913003315526418, + "learning_rate": 0.00017223464892014028, + "loss": 1.0343, + "step": 5872 + }, + { + "epoch": 0.56, + "grad_norm": 0.30749338727180847, + "learning_rate": 0.0001722237083432519, + "loss": 1.0609, + "step": 5873 + }, + { + "epoch": 0.56, + "grad_norm": 0.2780051476505012, + "learning_rate": 0.00017221276595891984, + "loss": 1.0403, + "step": 5874 + }, + { + "epoch": 0.56, + "grad_norm": 0.28351257017042697, + "learning_rate": 0.0001722018217674179, + "loss": 1.1222, + "step": 5875 + }, + { + "epoch": 0.56, + "grad_norm": 0.30556461707747595, + "learning_rate": 0.00017219087576902, + "loss": 1.1095, + "step": 5876 + }, + { + "epoch": 0.56, + "grad_norm": 0.2747330702712088, + "learning_rate": 0.00017217992796400005, + "loss": 1.0258, + "step": 5877 + }, + { + "epoch": 0.56, + "grad_norm": 0.3102546058358946, + "learning_rate": 0.00017216897835263209, + "loss": 0.9957, + "step": 5878 + }, + { + "epoch": 0.56, + "grad_norm": 0.3232197983937078, + "learning_rate": 0.00017215802693519003, + "loss": 1.0768, + "step": 5879 + }, + { + "epoch": 0.56, + "grad_norm": 0.269670235437008, + "learning_rate": 0.00017214707371194802, + "loss": 1.1033, + "step": 5880 + }, + { + "epoch": 0.56, + "grad_norm": 0.2633897286188578, + "learning_rate": 0.00017213611868318015, + "loss": 1.0035, + "step": 5881 + }, + { + "epoch": 0.56, + "grad_norm": 0.3042144256612858, + "learning_rate": 0.00017212516184916056, + "loss": 1.0715, + "step": 5882 + }, + { + "epoch": 0.56, + "grad_norm": 0.266078239126639, + "learning_rate": 0.0001721142032101635, + "loss": 1.1544, + "step": 5883 + }, + { + "epoch": 0.56, + "grad_norm": 0.30458594940724576, + "learning_rate": 0.00017210324276646316, + "loss": 1.1251, + "step": 5884 + }, + { + "epoch": 0.56, + "grad_norm": 0.28395232279028726, + "learning_rate": 0.00017209228051833387, + "loss": 1.0925, + "step": 5885 + }, + { + "epoch": 0.56, + "grad_norm": 0.29349557855153724, + "learning_rate": 0.00017208131646604993, + "loss": 1.1322, + "step": 5886 + }, + { + "epoch": 0.56, + "grad_norm": 0.2729717369394539, + "learning_rate": 0.00017207035060988574, + "loss": 1.0009, + "step": 5887 + }, + { + "epoch": 0.56, + "grad_norm": 0.278418729052514, + "learning_rate": 0.00017205938295011575, + "loss": 1.0637, + "step": 5888 + }, + { + "epoch": 0.56, + "grad_norm": 0.2689002291046586, + "learning_rate": 0.00017204841348701438, + "loss": 1.1421, + "step": 5889 + }, + { + "epoch": 0.56, + "grad_norm": 0.2936863871934149, + "learning_rate": 0.00017203744222085623, + "loss": 1.078, + "step": 5890 + }, + { + "epoch": 0.56, + "grad_norm": 0.291267791636599, + "learning_rate": 0.00017202646915191578, + "loss": 1.2519, + "step": 5891 + }, + { + "epoch": 0.56, + "grad_norm": 0.2717676026532944, + "learning_rate": 0.0001720154942804677, + "loss": 1.0844, + "step": 5892 + }, + { + "epoch": 0.56, + "grad_norm": 0.2774147698904939, + "learning_rate": 0.0001720045176067866, + "loss": 1.0487, + "step": 5893 + }, + { + "epoch": 0.56, + "grad_norm": 0.27887254178612914, + "learning_rate": 0.00017199353913114717, + "loss": 1.1334, + "step": 5894 + }, + { + "epoch": 0.56, + "grad_norm": 0.28978059963913083, + "learning_rate": 0.00017198255885382421, + "loss": 1.0868, + "step": 5895 + }, + { + "epoch": 0.56, + "grad_norm": 0.2766397079264888, + "learning_rate": 0.00017197157677509246, + "loss": 1.1068, + "step": 5896 + }, + { + "epoch": 0.56, + "grad_norm": 0.2851293309828098, + "learning_rate": 0.00017196059289522678, + "loss": 1.0717, + "step": 5897 + }, + { + "epoch": 0.56, + "grad_norm": 0.2549062649316722, + "learning_rate": 0.00017194960721450206, + "loss": 1.0199, + "step": 5898 + }, + { + "epoch": 0.56, + "grad_norm": 0.2461949098073852, + "learning_rate": 0.00017193861973319316, + "loss": 1.0461, + "step": 5899 + }, + { + "epoch": 0.56, + "grad_norm": 0.27203801042895437, + "learning_rate": 0.0001719276304515751, + "loss": 1.1691, + "step": 5900 + }, + { + "epoch": 0.56, + "grad_norm": 0.27460790915022437, + "learning_rate": 0.00017191663936992288, + "loss": 1.105, + "step": 5901 + }, + { + "epoch": 0.56, + "grad_norm": 0.29915013883951985, + "learning_rate": 0.0001719056464885116, + "loss": 0.9848, + "step": 5902 + }, + { + "epoch": 0.56, + "grad_norm": 0.29579312716715256, + "learning_rate": 0.00017189465180761628, + "loss": 1.1115, + "step": 5903 + }, + { + "epoch": 0.56, + "grad_norm": 0.30761711337398795, + "learning_rate": 0.00017188365532751213, + "loss": 1.164, + "step": 5904 + }, + { + "epoch": 0.56, + "grad_norm": 0.32410192655697073, + "learning_rate": 0.00017187265704847433, + "loss": 1.089, + "step": 5905 + }, + { + "epoch": 0.57, + "grad_norm": 0.2629249103887719, + "learning_rate": 0.00017186165697077809, + "loss": 0.9588, + "step": 5906 + }, + { + "epoch": 0.57, + "grad_norm": 0.28511065582542094, + "learning_rate": 0.00017185065509469876, + "loss": 1.1679, + "step": 5907 + }, + { + "epoch": 0.57, + "grad_norm": 0.2813371419019583, + "learning_rate": 0.00017183965142051163, + "loss": 1.0888, + "step": 5908 + }, + { + "epoch": 0.57, + "grad_norm": 0.27646089721769807, + "learning_rate": 0.00017182864594849205, + "loss": 1.0513, + "step": 5909 + }, + { + "epoch": 0.57, + "grad_norm": 0.30831778021556755, + "learning_rate": 0.00017181763867891547, + "loss": 1.1152, + "step": 5910 + }, + { + "epoch": 0.57, + "grad_norm": 0.31669929806320846, + "learning_rate": 0.00017180662961205733, + "loss": 1.0554, + "step": 5911 + }, + { + "epoch": 0.57, + "grad_norm": 0.2881244715767111, + "learning_rate": 0.00017179561874819318, + "loss": 1.0702, + "step": 5912 + }, + { + "epoch": 0.57, + "grad_norm": 0.28072579932667696, + "learning_rate": 0.00017178460608759853, + "loss": 0.9758, + "step": 5913 + }, + { + "epoch": 0.57, + "grad_norm": 0.2943649793795249, + "learning_rate": 0.00017177359163054903, + "loss": 1.0715, + "step": 5914 + }, + { + "epoch": 0.57, + "grad_norm": 0.24478814096705148, + "learning_rate": 0.00017176257537732025, + "loss": 1.0606, + "step": 5915 + }, + { + "epoch": 0.57, + "grad_norm": 0.27922088346647145, + "learning_rate": 0.00017175155732818796, + "loss": 1.1677, + "step": 5916 + }, + { + "epoch": 0.57, + "grad_norm": 0.25994214062479126, + "learning_rate": 0.00017174053748342783, + "loss": 0.9742, + "step": 5917 + }, + { + "epoch": 0.57, + "grad_norm": 0.282588772331292, + "learning_rate": 0.00017172951584331565, + "loss": 0.9751, + "step": 5918 + }, + { + "epoch": 0.57, + "grad_norm": 0.2878741945844007, + "learning_rate": 0.0001717184924081273, + "loss": 1.0041, + "step": 5919 + }, + { + "epoch": 0.57, + "grad_norm": 0.3029923936191996, + "learning_rate": 0.00017170746717813854, + "loss": 1.1071, + "step": 5920 + }, + { + "epoch": 0.57, + "grad_norm": 0.27290844629846106, + "learning_rate": 0.0001716964401536254, + "loss": 0.8786, + "step": 5921 + }, + { + "epoch": 0.57, + "grad_norm": 0.31966494850943117, + "learning_rate": 0.00017168541133486377, + "loss": 1.1118, + "step": 5922 + }, + { + "epoch": 0.57, + "grad_norm": 0.2689850435328001, + "learning_rate": 0.00017167438072212968, + "loss": 1.1024, + "step": 5923 + }, + { + "epoch": 0.57, + "grad_norm": 0.27928731290969944, + "learning_rate": 0.00017166334831569916, + "loss": 1.1399, + "step": 5924 + }, + { + "epoch": 0.57, + "grad_norm": 0.29591002661910865, + "learning_rate": 0.00017165231411584827, + "loss": 1.1074, + "step": 5925 + }, + { + "epoch": 0.57, + "grad_norm": 0.3104516805825287, + "learning_rate": 0.00017164127812285324, + "loss": 1.0186, + "step": 5926 + }, + { + "epoch": 0.57, + "grad_norm": 0.3329888234877537, + "learning_rate": 0.00017163024033699017, + "loss": 1.1095, + "step": 5927 + }, + { + "epoch": 0.57, + "grad_norm": 0.27420800800981127, + "learning_rate": 0.00017161920075853534, + "loss": 1.1286, + "step": 5928 + }, + { + "epoch": 0.57, + "grad_norm": 0.2687683336478087, + "learning_rate": 0.000171608159387765, + "loss": 1.1067, + "step": 5929 + }, + { + "epoch": 0.57, + "grad_norm": 0.28848253158286347, + "learning_rate": 0.00017159711622495544, + "loss": 1.1874, + "step": 5930 + }, + { + "epoch": 0.57, + "grad_norm": 0.28239325672690274, + "learning_rate": 0.0001715860712703831, + "loss": 1.1561, + "step": 5931 + }, + { + "epoch": 0.57, + "grad_norm": 0.2918884091102849, + "learning_rate": 0.00017157502452432429, + "loss": 1.0429, + "step": 5932 + }, + { + "epoch": 0.57, + "grad_norm": 0.24790386180695678, + "learning_rate": 0.00017156397598705548, + "loss": 0.9923, + "step": 5933 + }, + { + "epoch": 0.57, + "grad_norm": 0.24570072442193652, + "learning_rate": 0.00017155292565885328, + "loss": 1.0945, + "step": 5934 + }, + { + "epoch": 0.57, + "grad_norm": 0.23125439331824077, + "learning_rate": 0.0001715418735399941, + "loss": 0.9957, + "step": 5935 + }, + { + "epoch": 0.57, + "grad_norm": 0.2554465249970479, + "learning_rate": 0.00017153081963075458, + "loss": 1.1284, + "step": 5936 + }, + { + "epoch": 0.57, + "grad_norm": 0.2794800525821183, + "learning_rate": 0.00017151976393141132, + "loss": 0.9941, + "step": 5937 + }, + { + "epoch": 0.57, + "grad_norm": 0.30723518305938063, + "learning_rate": 0.000171508706442241, + "loss": 1.1475, + "step": 5938 + }, + { + "epoch": 0.57, + "grad_norm": 0.30203130938607553, + "learning_rate": 0.00017149764716352045, + "loss": 1.0853, + "step": 5939 + }, + { + "epoch": 0.57, + "grad_norm": 0.260328670983559, + "learning_rate": 0.00017148658609552627, + "loss": 1.1423, + "step": 5940 + }, + { + "epoch": 0.57, + "grad_norm": 0.293768400385368, + "learning_rate": 0.00017147552323853538, + "loss": 1.0503, + "step": 5941 + }, + { + "epoch": 0.57, + "grad_norm": 0.26640544240980046, + "learning_rate": 0.00017146445859282457, + "loss": 1.0018, + "step": 5942 + }, + { + "epoch": 0.57, + "grad_norm": 0.3159499546692293, + "learning_rate": 0.00017145339215867078, + "loss": 1.0303, + "step": 5943 + }, + { + "epoch": 0.57, + "grad_norm": 0.25980174549043206, + "learning_rate": 0.00017144232393635094, + "loss": 1.1472, + "step": 5944 + }, + { + "epoch": 0.57, + "grad_norm": 0.2727359561908714, + "learning_rate": 0.00017143125392614207, + "loss": 1.1425, + "step": 5945 + }, + { + "epoch": 0.57, + "grad_norm": 0.28270017496530925, + "learning_rate": 0.0001714201821283212, + "loss": 1.1023, + "step": 5946 + }, + { + "epoch": 0.57, + "grad_norm": 0.299277933501389, + "learning_rate": 0.0001714091085431653, + "loss": 1.1593, + "step": 5947 + }, + { + "epoch": 0.57, + "grad_norm": 0.2762476973789334, + "learning_rate": 0.00017139803317095165, + "loss": 1.0942, + "step": 5948 + }, + { + "epoch": 0.57, + "grad_norm": 0.3120883708584473, + "learning_rate": 0.00017138695601195733, + "loss": 1.0574, + "step": 5949 + }, + { + "epoch": 0.57, + "grad_norm": 0.27024241469527543, + "learning_rate": 0.00017137587706645956, + "loss": 1.12, + "step": 5950 + }, + { + "epoch": 0.57, + "grad_norm": 0.2584604551495038, + "learning_rate": 0.00017136479633473562, + "loss": 1.0682, + "step": 5951 + }, + { + "epoch": 0.57, + "grad_norm": 0.2525657024101026, + "learning_rate": 0.0001713537138170628, + "loss": 1.1037, + "step": 5952 + }, + { + "epoch": 0.57, + "grad_norm": 0.28274675021766404, + "learning_rate": 0.00017134262951371842, + "loss": 1.0135, + "step": 5953 + }, + { + "epoch": 0.57, + "grad_norm": 0.2774062534984412, + "learning_rate": 0.00017133154342497995, + "loss": 0.9824, + "step": 5954 + }, + { + "epoch": 0.57, + "grad_norm": 0.2832539855963225, + "learning_rate": 0.00017132045555112474, + "loss": 0.9708, + "step": 5955 + }, + { + "epoch": 0.57, + "grad_norm": 0.24641803171377838, + "learning_rate": 0.0001713093658924303, + "loss": 0.9826, + "step": 5956 + }, + { + "epoch": 0.57, + "grad_norm": 0.29062056115142054, + "learning_rate": 0.0001712982744491742, + "loss": 0.9739, + "step": 5957 + }, + { + "epoch": 0.57, + "grad_norm": 0.2922884188711339, + "learning_rate": 0.00017128718122163395, + "loss": 0.9959, + "step": 5958 + }, + { + "epoch": 0.57, + "grad_norm": 0.26793724543335196, + "learning_rate": 0.00017127608621008718, + "loss": 1.0553, + "step": 5959 + }, + { + "epoch": 0.57, + "grad_norm": 0.276065664300244, + "learning_rate": 0.00017126498941481155, + "loss": 1.1004, + "step": 5960 + }, + { + "epoch": 0.57, + "grad_norm": 0.26445107517178895, + "learning_rate": 0.00017125389083608479, + "loss": 1.0136, + "step": 5961 + }, + { + "epoch": 0.57, + "grad_norm": 0.2724968692637002, + "learning_rate": 0.00017124279047418464, + "loss": 0.9554, + "step": 5962 + }, + { + "epoch": 0.57, + "grad_norm": 0.2638896257757664, + "learning_rate": 0.00017123168832938886, + "loss": 1.0249, + "step": 5963 + }, + { + "epoch": 0.57, + "grad_norm": 0.25113449381499126, + "learning_rate": 0.00017122058440197533, + "loss": 1.0309, + "step": 5964 + }, + { + "epoch": 0.57, + "grad_norm": 0.29069944989919566, + "learning_rate": 0.0001712094786922219, + "loss": 1.0919, + "step": 5965 + }, + { + "epoch": 0.57, + "grad_norm": 0.28312314556833795, + "learning_rate": 0.00017119837120040652, + "loss": 1.0403, + "step": 5966 + }, + { + "epoch": 0.57, + "grad_norm": 0.26755188643974503, + "learning_rate": 0.00017118726192680717, + "loss": 1.0553, + "step": 5967 + }, + { + "epoch": 0.57, + "grad_norm": 0.26486022885030275, + "learning_rate": 0.0001711761508717018, + "loss": 1.0489, + "step": 5968 + }, + { + "epoch": 0.57, + "grad_norm": 0.3020328536278636, + "learning_rate": 0.00017116503803536856, + "loss": 1.1584, + "step": 5969 + }, + { + "epoch": 0.57, + "grad_norm": 0.34335564008695746, + "learning_rate": 0.00017115392341808555, + "loss": 1.1107, + "step": 5970 + }, + { + "epoch": 0.57, + "grad_norm": 0.30126089791181193, + "learning_rate": 0.00017114280702013084, + "loss": 1.0434, + "step": 5971 + }, + { + "epoch": 0.57, + "grad_norm": 0.28087942592247517, + "learning_rate": 0.00017113168884178267, + "loss": 1.0781, + "step": 5972 + }, + { + "epoch": 0.57, + "grad_norm": 0.2728680461126566, + "learning_rate": 0.0001711205688833193, + "loss": 1.0257, + "step": 5973 + }, + { + "epoch": 0.57, + "grad_norm": 0.2985709605948755, + "learning_rate": 0.000171109447145019, + "loss": 1.1728, + "step": 5974 + }, + { + "epoch": 0.57, + "grad_norm": 0.3082359218373612, + "learning_rate": 0.0001710983236271601, + "loss": 1.0554, + "step": 5975 + }, + { + "epoch": 0.57, + "grad_norm": 0.2795284645154645, + "learning_rate": 0.00017108719833002094, + "loss": 1.0948, + "step": 5976 + }, + { + "epoch": 0.57, + "grad_norm": 0.28120362222029427, + "learning_rate": 0.00017107607125387998, + "loss": 0.9671, + "step": 5977 + }, + { + "epoch": 0.57, + "grad_norm": 0.2823788055491324, + "learning_rate": 0.00017106494239901566, + "loss": 1.0073, + "step": 5978 + }, + { + "epoch": 0.57, + "grad_norm": 0.31065181033843386, + "learning_rate": 0.00017105381176570652, + "loss": 1.1014, + "step": 5979 + }, + { + "epoch": 0.57, + "grad_norm": 0.30900428726246143, + "learning_rate": 0.00017104267935423107, + "loss": 1.0998, + "step": 5980 + }, + { + "epoch": 0.57, + "grad_norm": 0.30988350895081024, + "learning_rate": 0.0001710315451648679, + "loss": 1.0667, + "step": 5981 + }, + { + "epoch": 0.57, + "grad_norm": 0.2981798835140071, + "learning_rate": 0.0001710204091978957, + "loss": 1.065, + "step": 5982 + }, + { + "epoch": 0.57, + "grad_norm": 0.2732778988161561, + "learning_rate": 0.0001710092714535931, + "loss": 1.184, + "step": 5983 + }, + { + "epoch": 0.57, + "grad_norm": 0.27164434920682934, + "learning_rate": 0.00017099813193223887, + "loss": 1.0524, + "step": 5984 + }, + { + "epoch": 0.57, + "grad_norm": 0.29410498736707635, + "learning_rate": 0.00017098699063411178, + "loss": 1.0777, + "step": 5985 + }, + { + "epoch": 0.57, + "grad_norm": 0.28163842201092487, + "learning_rate": 0.0001709758475594906, + "loss": 1.0853, + "step": 5986 + }, + { + "epoch": 0.57, + "grad_norm": 0.27782202027455466, + "learning_rate": 0.00017096470270865427, + "loss": 1.0146, + "step": 5987 + }, + { + "epoch": 0.57, + "grad_norm": 0.2768790776720804, + "learning_rate": 0.0001709535560818816, + "loss": 1.1312, + "step": 5988 + }, + { + "epoch": 0.57, + "grad_norm": 0.2824780756717249, + "learning_rate": 0.00017094240767945166, + "loss": 1.059, + "step": 5989 + }, + { + "epoch": 0.57, + "grad_norm": 0.27033728590320183, + "learning_rate": 0.00017093125750164333, + "loss": 1.197, + "step": 5990 + }, + { + "epoch": 0.57, + "grad_norm": 0.3100948675967715, + "learning_rate": 0.00017092010554873574, + "loss": 1.1042, + "step": 5991 + }, + { + "epoch": 0.57, + "grad_norm": 0.28820788597927405, + "learning_rate": 0.0001709089518210079, + "loss": 1.1635, + "step": 5992 + }, + { + "epoch": 0.57, + "grad_norm": 0.28201539629588646, + "learning_rate": 0.00017089779631873904, + "loss": 0.9284, + "step": 5993 + }, + { + "epoch": 0.57, + "grad_norm": 0.27679231851615277, + "learning_rate": 0.0001708866390422082, + "loss": 1.0167, + "step": 5994 + }, + { + "epoch": 0.57, + "grad_norm": 0.27944884514951135, + "learning_rate": 0.0001708754799916947, + "loss": 1.139, + "step": 5995 + }, + { + "epoch": 0.57, + "grad_norm": 0.3002084383540107, + "learning_rate": 0.00017086431916747778, + "loss": 1.0454, + "step": 5996 + }, + { + "epoch": 0.57, + "grad_norm": 0.26816351694504426, + "learning_rate": 0.0001708531565698367, + "loss": 1.0773, + "step": 5997 + }, + { + "epoch": 0.57, + "grad_norm": 0.30990517015210073, + "learning_rate": 0.0001708419921990509, + "loss": 1.1122, + "step": 5998 + }, + { + "epoch": 0.57, + "grad_norm": 0.30574794780484754, + "learning_rate": 0.0001708308260553997, + "loss": 1.1214, + "step": 5999 + }, + { + "epoch": 0.57, + "grad_norm": 0.2690373353619774, + "learning_rate": 0.00017081965813916253, + "loss": 1.062, + "step": 6000 + }, + { + "epoch": 0.57, + "grad_norm": 0.27472519148333513, + "learning_rate": 0.00017080848845061892, + "loss": 1.0179, + "step": 6001 + }, + { + "epoch": 0.57, + "grad_norm": 0.27470990890182256, + "learning_rate": 0.0001707973169900484, + "loss": 1.0747, + "step": 6002 + }, + { + "epoch": 0.57, + "grad_norm": 0.29061144311513604, + "learning_rate": 0.00017078614375773052, + "loss": 1.1307, + "step": 6003 + }, + { + "epoch": 0.57, + "grad_norm": 0.2713055223809409, + "learning_rate": 0.00017077496875394493, + "loss": 1.0008, + "step": 6004 + }, + { + "epoch": 0.57, + "grad_norm": 0.2925514548076972, + "learning_rate": 0.00017076379197897122, + "loss": 1.1317, + "step": 6005 + }, + { + "epoch": 0.57, + "grad_norm": 0.31101769602212137, + "learning_rate": 0.00017075261343308916, + "loss": 1.1037, + "step": 6006 + }, + { + "epoch": 0.57, + "grad_norm": 0.2647833297981797, + "learning_rate": 0.00017074143311657852, + "loss": 1.0076, + "step": 6007 + }, + { + "epoch": 0.57, + "grad_norm": 0.3013454403605899, + "learning_rate": 0.00017073025102971903, + "loss": 1.0792, + "step": 6008 + }, + { + "epoch": 0.57, + "grad_norm": 0.263491300792746, + "learning_rate": 0.00017071906717279053, + "loss": 0.9797, + "step": 6009 + }, + { + "epoch": 0.57, + "grad_norm": 0.2724539339009908, + "learning_rate": 0.00017070788154607293, + "loss": 0.9805, + "step": 6010 + }, + { + "epoch": 0.58, + "grad_norm": 0.31840312141141264, + "learning_rate": 0.00017069669414984618, + "loss": 1.142, + "step": 6011 + }, + { + "epoch": 0.58, + "grad_norm": 0.2983653164774024, + "learning_rate": 0.00017068550498439025, + "loss": 1.103, + "step": 6012 + }, + { + "epoch": 0.58, + "grad_norm": 0.282150969685538, + "learning_rate": 0.00017067431404998507, + "loss": 1.0682, + "step": 6013 + }, + { + "epoch": 0.58, + "grad_norm": 0.2771901829735762, + "learning_rate": 0.00017066312134691083, + "loss": 1.009, + "step": 6014 + }, + { + "epoch": 0.58, + "grad_norm": 0.2933084300533384, + "learning_rate": 0.00017065192687544753, + "loss": 1.102, + "step": 6015 + }, + { + "epoch": 0.58, + "grad_norm": 0.27679497108587947, + "learning_rate": 0.00017064073063587535, + "loss": 1.1393, + "step": 6016 + }, + { + "epoch": 0.58, + "grad_norm": 0.28895991711416996, + "learning_rate": 0.00017062953262847455, + "loss": 1.0694, + "step": 6017 + }, + { + "epoch": 0.58, + "grad_norm": 0.27823308988900375, + "learning_rate": 0.00017061833285352527, + "loss": 1.0905, + "step": 6018 + }, + { + "epoch": 0.58, + "grad_norm": 0.2748970724583933, + "learning_rate": 0.00017060713131130778, + "loss": 1.1278, + "step": 6019 + }, + { + "epoch": 0.58, + "grad_norm": 0.2730226539296915, + "learning_rate": 0.00017059592800210252, + "loss": 1.0858, + "step": 6020 + }, + { + "epoch": 0.58, + "grad_norm": 0.271328807355675, + "learning_rate": 0.00017058472292618977, + "loss": 0.9972, + "step": 6021 + }, + { + "epoch": 0.58, + "grad_norm": 0.27544958968420746, + "learning_rate": 0.00017057351608384995, + "loss": 1.1456, + "step": 6022 + }, + { + "epoch": 0.58, + "grad_norm": 0.2833122301378086, + "learning_rate": 0.00017056230747536355, + "loss": 1.1727, + "step": 6023 + }, + { + "epoch": 0.58, + "grad_norm": 0.2894626470697644, + "learning_rate": 0.00017055109710101108, + "loss": 1.2121, + "step": 6024 + }, + { + "epoch": 0.58, + "grad_norm": 0.3306470002946546, + "learning_rate": 0.00017053988496107305, + "loss": 1.0776, + "step": 6025 + }, + { + "epoch": 0.58, + "grad_norm": 0.27472508074504254, + "learning_rate": 0.00017052867105583005, + "loss": 0.973, + "step": 6026 + }, + { + "epoch": 0.58, + "grad_norm": 0.3021983840922597, + "learning_rate": 0.00017051745538556278, + "loss": 1.0972, + "step": 6027 + }, + { + "epoch": 0.58, + "grad_norm": 0.2970721247856769, + "learning_rate": 0.0001705062379505518, + "loss": 1.1562, + "step": 6028 + }, + { + "epoch": 0.58, + "grad_norm": 0.2830455403468363, + "learning_rate": 0.00017049501875107795, + "loss": 0.9997, + "step": 6029 + }, + { + "epoch": 0.58, + "grad_norm": 0.28710721053351523, + "learning_rate": 0.00017048379778742193, + "loss": 1.1453, + "step": 6030 + }, + { + "epoch": 0.58, + "grad_norm": 0.27478245168556914, + "learning_rate": 0.00017047257505986457, + "loss": 1.0092, + "step": 6031 + }, + { + "epoch": 0.58, + "grad_norm": 0.26896422201399867, + "learning_rate": 0.00017046135056868677, + "loss": 1.0855, + "step": 6032 + }, + { + "epoch": 0.58, + "grad_norm": 0.24707636757742915, + "learning_rate": 0.00017045012431416936, + "loss": 1.0971, + "step": 6033 + }, + { + "epoch": 0.58, + "grad_norm": 0.2493183820340796, + "learning_rate": 0.0001704388962965933, + "loss": 0.9823, + "step": 6034 + }, + { + "epoch": 0.58, + "grad_norm": 0.2805800589880677, + "learning_rate": 0.00017042766651623962, + "loss": 1.0523, + "step": 6035 + }, + { + "epoch": 0.58, + "grad_norm": 0.3038587968507171, + "learning_rate": 0.00017041643497338931, + "loss": 1.0162, + "step": 6036 + }, + { + "epoch": 0.58, + "grad_norm": 0.30638016511222943, + "learning_rate": 0.00017040520166832344, + "loss": 1.0421, + "step": 6037 + }, + { + "epoch": 0.58, + "grad_norm": 0.26320036877135733, + "learning_rate": 0.00017039396660132317, + "loss": 0.946, + "step": 6038 + }, + { + "epoch": 0.58, + "grad_norm": 0.2754763891016706, + "learning_rate": 0.00017038272977266966, + "loss": 1.0652, + "step": 6039 + }, + { + "epoch": 0.58, + "grad_norm": 0.26134384808972805, + "learning_rate": 0.0001703714911826441, + "loss": 1.1064, + "step": 6040 + }, + { + "epoch": 0.58, + "grad_norm": 0.2651686121986517, + "learning_rate": 0.0001703602508315277, + "loss": 1.1166, + "step": 6041 + }, + { + "epoch": 0.58, + "grad_norm": 0.25447003550809477, + "learning_rate": 0.00017034900871960184, + "loss": 1.0199, + "step": 6042 + }, + { + "epoch": 0.58, + "grad_norm": 0.28285744813232555, + "learning_rate": 0.0001703377648471478, + "loss": 1.0086, + "step": 6043 + }, + { + "epoch": 0.58, + "grad_norm": 0.2651737161168531, + "learning_rate": 0.000170326519214447, + "loss": 1.0729, + "step": 6044 + }, + { + "epoch": 0.58, + "grad_norm": 0.2855727067410992, + "learning_rate": 0.00017031527182178092, + "loss": 1.0919, + "step": 6045 + }, + { + "epoch": 0.58, + "grad_norm": 0.26177841865296425, + "learning_rate": 0.0001703040226694309, + "loss": 0.9408, + "step": 6046 + }, + { + "epoch": 0.58, + "grad_norm": 0.23691819299378247, + "learning_rate": 0.00017029277175767854, + "loss": 1.0849, + "step": 6047 + }, + { + "epoch": 0.58, + "grad_norm": 0.2511952002294269, + "learning_rate": 0.0001702815190868054, + "loss": 1.0779, + "step": 6048 + }, + { + "epoch": 0.58, + "grad_norm": 0.29110924767645496, + "learning_rate": 0.00017027026465709307, + "loss": 0.9933, + "step": 6049 + }, + { + "epoch": 0.58, + "grad_norm": 0.2830264573548182, + "learning_rate": 0.00017025900846882321, + "loss": 1.0192, + "step": 6050 + }, + { + "epoch": 0.58, + "grad_norm": 0.28455741758111286, + "learning_rate": 0.00017024775052227752, + "loss": 1.0588, + "step": 6051 + }, + { + "epoch": 0.58, + "grad_norm": 0.28932106749302483, + "learning_rate": 0.0001702364908177377, + "loss": 1.1211, + "step": 6052 + }, + { + "epoch": 0.58, + "grad_norm": 0.3307206565435221, + "learning_rate": 0.00017022522935548554, + "loss": 1.0975, + "step": 6053 + }, + { + "epoch": 0.58, + "grad_norm": 0.25352957645066343, + "learning_rate": 0.0001702139661358029, + "loss": 1.0298, + "step": 6054 + }, + { + "epoch": 0.58, + "grad_norm": 0.2572310440026188, + "learning_rate": 0.00017020270115897164, + "loss": 1.0728, + "step": 6055 + }, + { + "epoch": 0.58, + "grad_norm": 0.3172532579427463, + "learning_rate": 0.00017019143442527365, + "loss": 1.1023, + "step": 6056 + }, + { + "epoch": 0.58, + "grad_norm": 0.282565292417829, + "learning_rate": 0.0001701801659349909, + "loss": 1.0488, + "step": 6057 + }, + { + "epoch": 0.58, + "grad_norm": 0.275135543466168, + "learning_rate": 0.00017016889568840542, + "loss": 1.1721, + "step": 6058 + }, + { + "epoch": 0.58, + "grad_norm": 0.26182321188521385, + "learning_rate": 0.00017015762368579918, + "loss": 1.0598, + "step": 6059 + }, + { + "epoch": 0.58, + "grad_norm": 0.26125752561442744, + "learning_rate": 0.00017014634992745434, + "loss": 1.1684, + "step": 6060 + }, + { + "epoch": 0.58, + "grad_norm": 0.27290128374782624, + "learning_rate": 0.000170135074413653, + "loss": 0.9901, + "step": 6061 + }, + { + "epoch": 0.58, + "grad_norm": 0.2653240038158296, + "learning_rate": 0.00017012379714467736, + "loss": 1.1086, + "step": 6062 + }, + { + "epoch": 0.58, + "grad_norm": 0.3293682851680674, + "learning_rate": 0.0001701125181208096, + "loss": 1.117, + "step": 6063 + }, + { + "epoch": 0.58, + "grad_norm": 0.2329956724588023, + "learning_rate": 0.00017010123734233204, + "loss": 1.0925, + "step": 6064 + }, + { + "epoch": 0.58, + "grad_norm": 0.2755426850023404, + "learning_rate": 0.00017008995480952694, + "loss": 1.1292, + "step": 6065 + }, + { + "epoch": 0.58, + "grad_norm": 0.29650880006845154, + "learning_rate": 0.00017007867052267666, + "loss": 1.0334, + "step": 6066 + }, + { + "epoch": 0.58, + "grad_norm": 0.2803458685525201, + "learning_rate": 0.00017006738448206363, + "loss": 1.0168, + "step": 6067 + }, + { + "epoch": 0.58, + "grad_norm": 0.2916272901141261, + "learning_rate": 0.00017005609668797024, + "loss": 1.0039, + "step": 6068 + }, + { + "epoch": 0.58, + "grad_norm": 0.2691648334575106, + "learning_rate": 0.00017004480714067903, + "loss": 1.0727, + "step": 6069 + }, + { + "epoch": 0.58, + "grad_norm": 0.3270372768288688, + "learning_rate": 0.00017003351584047249, + "loss": 1.1879, + "step": 6070 + }, + { + "epoch": 0.58, + "grad_norm": 0.2782777828019017, + "learning_rate": 0.0001700222227876332, + "loss": 1.0006, + "step": 6071 + }, + { + "epoch": 0.58, + "grad_norm": 0.2661246430524472, + "learning_rate": 0.00017001092798244377, + "loss": 1.0393, + "step": 6072 + }, + { + "epoch": 0.58, + "grad_norm": 0.281715885539005, + "learning_rate": 0.00016999963142518687, + "loss": 1.1322, + "step": 6073 + }, + { + "epoch": 0.58, + "grad_norm": 0.2986783823949029, + "learning_rate": 0.0001699883331161452, + "loss": 0.9984, + "step": 6074 + }, + { + "epoch": 0.58, + "grad_norm": 0.28065012498362923, + "learning_rate": 0.00016997703305560153, + "loss": 1.06, + "step": 6075 + }, + { + "epoch": 0.58, + "grad_norm": 0.3292647162053623, + "learning_rate": 0.00016996573124383862, + "loss": 1.075, + "step": 6076 + }, + { + "epoch": 0.58, + "grad_norm": 0.3073957042018125, + "learning_rate": 0.0001699544276811393, + "loss": 1.1219, + "step": 6077 + }, + { + "epoch": 0.58, + "grad_norm": 0.3076254806022189, + "learning_rate": 0.00016994312236778646, + "loss": 1.1214, + "step": 6078 + }, + { + "epoch": 0.58, + "grad_norm": 0.24563100316558975, + "learning_rate": 0.00016993181530406304, + "loss": 1.0868, + "step": 6079 + }, + { + "epoch": 0.58, + "grad_norm": 0.32781029130768446, + "learning_rate": 0.00016992050649025197, + "loss": 1.0481, + "step": 6080 + }, + { + "epoch": 0.58, + "grad_norm": 0.27854015122577225, + "learning_rate": 0.0001699091959266363, + "loss": 1.0749, + "step": 6081 + }, + { + "epoch": 0.58, + "grad_norm": 0.2844546319680473, + "learning_rate": 0.00016989788361349906, + "loss": 1.0692, + "step": 6082 + }, + { + "epoch": 0.58, + "grad_norm": 0.26692237610439323, + "learning_rate": 0.00016988656955112337, + "loss": 1.1151, + "step": 6083 + }, + { + "epoch": 0.58, + "grad_norm": 0.26269204364410026, + "learning_rate": 0.00016987525373979233, + "loss": 0.9348, + "step": 6084 + }, + { + "epoch": 0.58, + "grad_norm": 0.27306822357867006, + "learning_rate": 0.00016986393617978918, + "loss": 1.2159, + "step": 6085 + }, + { + "epoch": 0.58, + "grad_norm": 0.2827921168255787, + "learning_rate": 0.0001698526168713971, + "loss": 0.9376, + "step": 6086 + }, + { + "epoch": 0.58, + "grad_norm": 0.23947335215371623, + "learning_rate": 0.00016984129581489935, + "loss": 1.0817, + "step": 6087 + }, + { + "epoch": 0.58, + "grad_norm": 0.3343810073022071, + "learning_rate": 0.0001698299730105793, + "loss": 1.1555, + "step": 6088 + }, + { + "epoch": 0.58, + "grad_norm": 0.26515752282559285, + "learning_rate": 0.00016981864845872033, + "loss": 1.1123, + "step": 6089 + }, + { + "epoch": 0.58, + "grad_norm": 0.27681937200219026, + "learning_rate": 0.00016980732215960575, + "loss": 1.0367, + "step": 6090 + }, + { + "epoch": 0.58, + "grad_norm": 0.2519010649566014, + "learning_rate": 0.0001697959941135191, + "loss": 1.0861, + "step": 6091 + }, + { + "epoch": 0.58, + "grad_norm": 0.2704443985177932, + "learning_rate": 0.00016978466432074381, + "loss": 1.0398, + "step": 6092 + }, + { + "epoch": 0.58, + "grad_norm": 0.28884051221369256, + "learning_rate": 0.00016977333278156347, + "loss": 1.0501, + "step": 6093 + }, + { + "epoch": 0.58, + "grad_norm": 0.2749924386950333, + "learning_rate": 0.0001697619994962616, + "loss": 1.0743, + "step": 6094 + }, + { + "epoch": 0.58, + "grad_norm": 0.279690239046547, + "learning_rate": 0.00016975066446512185, + "loss": 1.1504, + "step": 6095 + }, + { + "epoch": 0.58, + "grad_norm": 0.24616630543118756, + "learning_rate": 0.00016973932768842787, + "loss": 1.1121, + "step": 6096 + }, + { + "epoch": 0.58, + "grad_norm": 0.25087990953142, + "learning_rate": 0.00016972798916646336, + "loss": 1.0454, + "step": 6097 + }, + { + "epoch": 0.58, + "grad_norm": 0.2766189199572125, + "learning_rate": 0.00016971664889951215, + "loss": 1.0582, + "step": 6098 + }, + { + "epoch": 0.58, + "grad_norm": 0.28980752883287725, + "learning_rate": 0.00016970530688785798, + "loss": 1.1664, + "step": 6099 + }, + { + "epoch": 0.58, + "grad_norm": 0.334844857943314, + "learning_rate": 0.0001696939631317847, + "loss": 1.1833, + "step": 6100 + }, + { + "epoch": 0.58, + "grad_norm": 0.2923718680275688, + "learning_rate": 0.0001696826176315762, + "loss": 1.063, + "step": 6101 + }, + { + "epoch": 0.58, + "grad_norm": 0.2927999981743822, + "learning_rate": 0.00016967127038751637, + "loss": 1.0568, + "step": 6102 + }, + { + "epoch": 0.58, + "grad_norm": 0.30725008382724595, + "learning_rate": 0.0001696599213998892, + "loss": 1.113, + "step": 6103 + }, + { + "epoch": 0.58, + "grad_norm": 0.2864392247154817, + "learning_rate": 0.00016964857066897876, + "loss": 1.0959, + "step": 6104 + }, + { + "epoch": 0.58, + "grad_norm": 0.29730179190891975, + "learning_rate": 0.00016963721819506904, + "loss": 1.0681, + "step": 6105 + }, + { + "epoch": 0.58, + "grad_norm": 0.2338409227151746, + "learning_rate": 0.00016962586397844417, + "loss": 0.9203, + "step": 6106 + }, + { + "epoch": 0.58, + "grad_norm": 0.30140296809951767, + "learning_rate": 0.0001696145080193883, + "loss": 1.1501, + "step": 6107 + }, + { + "epoch": 0.58, + "grad_norm": 0.29957908772250824, + "learning_rate": 0.00016960315031818563, + "loss": 0.9797, + "step": 6108 + }, + { + "epoch": 0.58, + "grad_norm": 0.2703387299476374, + "learning_rate": 0.00016959179087512038, + "loss": 1.1028, + "step": 6109 + }, + { + "epoch": 0.58, + "grad_norm": 0.24345511227668115, + "learning_rate": 0.0001695804296904768, + "loss": 1.1393, + "step": 6110 + }, + { + "epoch": 0.58, + "grad_norm": 0.28466641760021855, + "learning_rate": 0.00016956906676453927, + "loss": 1.0691, + "step": 6111 + }, + { + "epoch": 0.58, + "grad_norm": 0.26942292513037813, + "learning_rate": 0.00016955770209759206, + "loss": 1.1351, + "step": 6112 + }, + { + "epoch": 0.58, + "grad_norm": 0.2432370590626464, + "learning_rate": 0.0001695463356899197, + "loss": 1.111, + "step": 6113 + }, + { + "epoch": 0.58, + "grad_norm": 0.2768776116799371, + "learning_rate": 0.00016953496754180657, + "loss": 1.0402, + "step": 6114 + }, + { + "epoch": 0.59, + "grad_norm": 0.2782542666733751, + "learning_rate": 0.00016952359765353716, + "loss": 1.016, + "step": 6115 + }, + { + "epoch": 0.59, + "grad_norm": 0.2606113670203369, + "learning_rate": 0.00016951222602539604, + "loss": 1.1145, + "step": 6116 + }, + { + "epoch": 0.59, + "grad_norm": 0.28152618216858916, + "learning_rate": 0.00016950085265766775, + "loss": 1.0144, + "step": 6117 + }, + { + "epoch": 0.59, + "grad_norm": 0.2762557373966403, + "learning_rate": 0.000169489477550637, + "loss": 1.0188, + "step": 6118 + }, + { + "epoch": 0.59, + "grad_norm": 0.27464396847770955, + "learning_rate": 0.00016947810070458836, + "loss": 1.0246, + "step": 6119 + }, + { + "epoch": 0.59, + "grad_norm": 0.2553714383261195, + "learning_rate": 0.00016946672211980656, + "loss": 1.0676, + "step": 6120 + }, + { + "epoch": 0.59, + "grad_norm": 0.2663697635305504, + "learning_rate": 0.00016945534179657642, + "loss": 1.0607, + "step": 6121 + }, + { + "epoch": 0.59, + "grad_norm": 0.2724085363177121, + "learning_rate": 0.00016944395973518273, + "loss": 1.0022, + "step": 6122 + }, + { + "epoch": 0.59, + "grad_norm": 0.2830680768080686, + "learning_rate": 0.00016943257593591025, + "loss": 1.1051, + "step": 6123 + }, + { + "epoch": 0.59, + "grad_norm": 0.28224872460306244, + "learning_rate": 0.00016942119039904392, + "loss": 1.066, + "step": 6124 + }, + { + "epoch": 0.59, + "grad_norm": 0.26377009917280125, + "learning_rate": 0.0001694098031248687, + "loss": 1.0324, + "step": 6125 + }, + { + "epoch": 0.59, + "grad_norm": 0.2718917219979864, + "learning_rate": 0.0001693984141136695, + "loss": 1.0499, + "step": 6126 + }, + { + "epoch": 0.59, + "grad_norm": 0.259809210248362, + "learning_rate": 0.0001693870233657314, + "loss": 1.0218, + "step": 6127 + }, + { + "epoch": 0.59, + "grad_norm": 0.2757737836543942, + "learning_rate": 0.00016937563088133942, + "loss": 1.0728, + "step": 6128 + }, + { + "epoch": 0.59, + "grad_norm": 0.3068744212914902, + "learning_rate": 0.0001693642366607787, + "loss": 1.0689, + "step": 6129 + }, + { + "epoch": 0.59, + "grad_norm": 0.2559474725346445, + "learning_rate": 0.00016935284070433436, + "loss": 0.9259, + "step": 6130 + }, + { + "epoch": 0.59, + "grad_norm": 0.2889397480153887, + "learning_rate": 0.00016934144301229155, + "loss": 1.1747, + "step": 6131 + }, + { + "epoch": 0.59, + "grad_norm": 0.3088905871334589, + "learning_rate": 0.0001693300435849356, + "loss": 1.172, + "step": 6132 + }, + { + "epoch": 0.59, + "grad_norm": 0.30471384991519385, + "learning_rate": 0.00016931864242255171, + "loss": 1.0735, + "step": 6133 + }, + { + "epoch": 0.59, + "grad_norm": 0.5872459177211959, + "learning_rate": 0.00016930723952542523, + "loss": 0.9714, + "step": 6134 + }, + { + "epoch": 0.59, + "grad_norm": 0.26880571869976677, + "learning_rate": 0.0001692958348938415, + "loss": 1.0637, + "step": 6135 + }, + { + "epoch": 0.59, + "grad_norm": 0.2842892019706417, + "learning_rate": 0.000169284428528086, + "loss": 1.0986, + "step": 6136 + }, + { + "epoch": 0.59, + "grad_norm": 0.3139992368614553, + "learning_rate": 0.0001692730204284441, + "loss": 1.0542, + "step": 6137 + }, + { + "epoch": 0.59, + "grad_norm": 0.2939973190730885, + "learning_rate": 0.00016926161059520133, + "loss": 1.018, + "step": 6138 + }, + { + "epoch": 0.59, + "grad_norm": 0.2745235711764705, + "learning_rate": 0.00016925019902864325, + "loss": 1.0936, + "step": 6139 + }, + { + "epoch": 0.59, + "grad_norm": 0.295538764046495, + "learning_rate": 0.0001692387857290554, + "loss": 1.238, + "step": 6140 + }, + { + "epoch": 0.59, + "grad_norm": 0.2918293376304929, + "learning_rate": 0.00016922737069672344, + "loss": 1.0324, + "step": 6141 + }, + { + "epoch": 0.59, + "grad_norm": 0.2661937879474406, + "learning_rate": 0.000169215953931933, + "loss": 1.1461, + "step": 6142 + }, + { + "epoch": 0.59, + "grad_norm": 0.29698439715428343, + "learning_rate": 0.0001692045354349698, + "loss": 1.0072, + "step": 6143 + }, + { + "epoch": 0.59, + "grad_norm": 0.2841385747639344, + "learning_rate": 0.0001691931152061196, + "loss": 1.0169, + "step": 6144 + }, + { + "epoch": 0.59, + "grad_norm": 0.2525826472239577, + "learning_rate": 0.00016918169324566825, + "loss": 1.0821, + "step": 6145 + }, + { + "epoch": 0.59, + "grad_norm": 0.30426659332063666, + "learning_rate": 0.0001691702695539015, + "loss": 1.0161, + "step": 6146 + }, + { + "epoch": 0.59, + "grad_norm": 0.3247256246142582, + "learning_rate": 0.00016915884413110536, + "loss": 1.0768, + "step": 6147 + }, + { + "epoch": 0.59, + "grad_norm": 0.28154171353151247, + "learning_rate": 0.0001691474169775656, + "loss": 1.0462, + "step": 6148 + }, + { + "epoch": 0.59, + "grad_norm": 0.32206989451215157, + "learning_rate": 0.0001691359880935683, + "loss": 1.0335, + "step": 6149 + }, + { + "epoch": 0.59, + "grad_norm": 0.29975094798814433, + "learning_rate": 0.00016912455747939946, + "loss": 1.08, + "step": 6150 + }, + { + "epoch": 0.59, + "grad_norm": 0.2940671992246637, + "learning_rate": 0.0001691131251353451, + "loss": 1.0279, + "step": 6151 + }, + { + "epoch": 0.59, + "grad_norm": 0.30973218098094907, + "learning_rate": 0.00016910169106169138, + "loss": 1.1113, + "step": 6152 + }, + { + "epoch": 0.59, + "grad_norm": 0.2985525692876313, + "learning_rate": 0.0001690902552587244, + "loss": 1.0637, + "step": 6153 + }, + { + "epoch": 0.59, + "grad_norm": 0.2539451576750145, + "learning_rate": 0.00016907881772673032, + "loss": 1.0424, + "step": 6154 + }, + { + "epoch": 0.59, + "grad_norm": 0.2687758329509323, + "learning_rate": 0.00016906737846599548, + "loss": 1.0618, + "step": 6155 + }, + { + "epoch": 0.59, + "grad_norm": 0.27971131955326073, + "learning_rate": 0.00016905593747680602, + "loss": 1.0893, + "step": 6156 + }, + { + "epoch": 0.59, + "grad_norm": 0.2543125383692707, + "learning_rate": 0.00016904449475944837, + "loss": 1.1021, + "step": 6157 + }, + { + "epoch": 0.59, + "grad_norm": 0.2789446404233202, + "learning_rate": 0.00016903305031420885, + "loss": 1.1103, + "step": 6158 + }, + { + "epoch": 0.59, + "grad_norm": 0.28334066222200344, + "learning_rate": 0.00016902160414137383, + "loss": 1.065, + "step": 6159 + }, + { + "epoch": 0.59, + "grad_norm": 0.2358577232682755, + "learning_rate": 0.0001690101562412298, + "loss": 1.021, + "step": 6160 + }, + { + "epoch": 0.59, + "grad_norm": 0.27559840264421875, + "learning_rate": 0.00016899870661406325, + "loss": 1.1527, + "step": 6161 + }, + { + "epoch": 0.59, + "grad_norm": 0.30571401380369834, + "learning_rate": 0.0001689872552601607, + "loss": 1.1118, + "step": 6162 + }, + { + "epoch": 0.59, + "grad_norm": 0.29488378034589946, + "learning_rate": 0.00016897580217980872, + "loss": 1.1922, + "step": 6163 + }, + { + "epoch": 0.59, + "grad_norm": 0.31253654893817673, + "learning_rate": 0.00016896434737329394, + "loss": 0.9932, + "step": 6164 + }, + { + "epoch": 0.59, + "grad_norm": 0.271362147934014, + "learning_rate": 0.00016895289084090304, + "loss": 1.1036, + "step": 6165 + }, + { + "epoch": 0.59, + "grad_norm": 0.2703212529224629, + "learning_rate": 0.0001689414325829227, + "loss": 1.1416, + "step": 6166 + }, + { + "epoch": 0.59, + "grad_norm": 0.2824957796659181, + "learning_rate": 0.0001689299725996397, + "loss": 1.0145, + "step": 6167 + }, + { + "epoch": 0.59, + "grad_norm": 0.2976711651462754, + "learning_rate": 0.00016891851089134079, + "loss": 1.0386, + "step": 6168 + }, + { + "epoch": 0.59, + "grad_norm": 0.26975195101683597, + "learning_rate": 0.00016890704745831282, + "loss": 1.0879, + "step": 6169 + }, + { + "epoch": 0.59, + "grad_norm": 0.2643048862785792, + "learning_rate": 0.00016889558230084273, + "loss": 1.0786, + "step": 6170 + }, + { + "epoch": 0.59, + "grad_norm": 0.2717495195222049, + "learning_rate": 0.00016888411541921735, + "loss": 1.1159, + "step": 6171 + }, + { + "epoch": 0.59, + "grad_norm": 0.2833620530227191, + "learning_rate": 0.0001688726468137237, + "loss": 1.097, + "step": 6172 + }, + { + "epoch": 0.59, + "grad_norm": 0.25257033358758985, + "learning_rate": 0.00016886117648464878, + "loss": 1.0967, + "step": 6173 + }, + { + "epoch": 0.59, + "grad_norm": 0.27530661316906846, + "learning_rate": 0.00016884970443227965, + "loss": 1.0395, + "step": 6174 + }, + { + "epoch": 0.59, + "grad_norm": 0.2527678120212698, + "learning_rate": 0.00016883823065690337, + "loss": 1.0305, + "step": 6175 + }, + { + "epoch": 0.59, + "grad_norm": 0.2574184683768036, + "learning_rate": 0.00016882675515880714, + "loss": 1.0878, + "step": 6176 + }, + { + "epoch": 0.59, + "grad_norm": 0.25761118308699854, + "learning_rate": 0.00016881527793827808, + "loss": 0.9574, + "step": 6177 + }, + { + "epoch": 0.59, + "grad_norm": 0.27107127416113136, + "learning_rate": 0.00016880379899560346, + "loss": 1.1092, + "step": 6178 + }, + { + "epoch": 0.59, + "grad_norm": 0.2842811642803488, + "learning_rate": 0.0001687923183310705, + "loss": 1.1637, + "step": 6179 + }, + { + "epoch": 0.59, + "grad_norm": 0.2728440826452907, + "learning_rate": 0.00016878083594496656, + "loss": 0.9771, + "step": 6180 + }, + { + "epoch": 0.59, + "grad_norm": 0.2886790568351847, + "learning_rate": 0.000168769351837579, + "loss": 0.9462, + "step": 6181 + }, + { + "epoch": 0.59, + "grad_norm": 0.25378061719229456, + "learning_rate": 0.00016875786600919514, + "loss": 1.0195, + "step": 6182 + }, + { + "epoch": 0.59, + "grad_norm": 0.2942815181656749, + "learning_rate": 0.00016874637846010248, + "loss": 1.1533, + "step": 6183 + }, + { + "epoch": 0.59, + "grad_norm": 0.26947359910559543, + "learning_rate": 0.00016873488919058854, + "loss": 0.9764, + "step": 6184 + }, + { + "epoch": 0.59, + "grad_norm": 0.2997412184794112, + "learning_rate": 0.00016872339820094074, + "loss": 1.0704, + "step": 6185 + }, + { + "epoch": 0.59, + "grad_norm": 0.241116336668137, + "learning_rate": 0.00016871190549144673, + "loss": 1.0213, + "step": 6186 + }, + { + "epoch": 0.59, + "grad_norm": 0.29798499768309145, + "learning_rate": 0.00016870041106239412, + "loss": 1.0051, + "step": 6187 + }, + { + "epoch": 0.59, + "grad_norm": 0.28286155546938635, + "learning_rate": 0.00016868891491407054, + "loss": 1.1061, + "step": 6188 + }, + { + "epoch": 0.59, + "grad_norm": 0.24398969837876033, + "learning_rate": 0.00016867741704676368, + "loss": 1.0401, + "step": 6189 + }, + { + "epoch": 0.59, + "grad_norm": 0.28740259957474623, + "learning_rate": 0.0001686659174607613, + "loss": 1.1156, + "step": 6190 + }, + { + "epoch": 0.59, + "grad_norm": 0.2892058554716867, + "learning_rate": 0.0001686544161563512, + "loss": 1.0922, + "step": 6191 + }, + { + "epoch": 0.59, + "grad_norm": 0.31955410366097675, + "learning_rate": 0.00016864291313382115, + "loss": 1.0641, + "step": 6192 + }, + { + "epoch": 0.59, + "grad_norm": 0.2794578951295337, + "learning_rate": 0.00016863140839345908, + "loss": 1.0945, + "step": 6193 + }, + { + "epoch": 0.59, + "grad_norm": 0.30665149624277827, + "learning_rate": 0.00016861990193555292, + "loss": 1.0357, + "step": 6194 + }, + { + "epoch": 0.59, + "grad_norm": 0.27440411091642547, + "learning_rate": 0.00016860839376039053, + "loss": 1.2011, + "step": 6195 + }, + { + "epoch": 0.59, + "grad_norm": 0.2817725535852179, + "learning_rate": 0.00016859688386826, + "loss": 1.0482, + "step": 6196 + }, + { + "epoch": 0.59, + "grad_norm": 0.3011982191792621, + "learning_rate": 0.0001685853722594493, + "loss": 1.0853, + "step": 6197 + }, + { + "epoch": 0.59, + "grad_norm": 0.29959655201744034, + "learning_rate": 0.00016857385893424658, + "loss": 1.0111, + "step": 6198 + }, + { + "epoch": 0.59, + "grad_norm": 0.2802735304530325, + "learning_rate": 0.00016856234389293995, + "loss": 1.1118, + "step": 6199 + }, + { + "epoch": 0.59, + "grad_norm": 0.30399590914216984, + "learning_rate": 0.00016855082713581758, + "loss": 1.0576, + "step": 6200 + }, + { + "epoch": 0.59, + "grad_norm": 0.28089634625081406, + "learning_rate": 0.00016853930866316766, + "loss": 1.1154, + "step": 6201 + }, + { + "epoch": 0.59, + "grad_norm": 0.33064903413133756, + "learning_rate": 0.00016852778847527847, + "loss": 1.0004, + "step": 6202 + }, + { + "epoch": 0.59, + "grad_norm": 0.28719188673845863, + "learning_rate": 0.00016851626657243834, + "loss": 1.0599, + "step": 6203 + }, + { + "epoch": 0.59, + "grad_norm": 0.2562638224677485, + "learning_rate": 0.0001685047429549355, + "loss": 1.0726, + "step": 6204 + }, + { + "epoch": 0.59, + "grad_norm": 0.27769483083836394, + "learning_rate": 0.00016849321762305846, + "loss": 1.0452, + "step": 6205 + }, + { + "epoch": 0.59, + "grad_norm": 0.311376243880295, + "learning_rate": 0.00016848169057709562, + "loss": 1.1139, + "step": 6206 + }, + { + "epoch": 0.59, + "grad_norm": 0.33186169126683296, + "learning_rate": 0.00016847016181733543, + "loss": 1.1708, + "step": 6207 + }, + { + "epoch": 0.59, + "grad_norm": 0.31561407579512624, + "learning_rate": 0.0001684586313440664, + "loss": 1.0678, + "step": 6208 + }, + { + "epoch": 0.59, + "grad_norm": 0.2907024924078061, + "learning_rate": 0.00016844709915757707, + "loss": 1.0885, + "step": 6209 + }, + { + "epoch": 0.59, + "grad_norm": 0.29061404968369564, + "learning_rate": 0.0001684355652581561, + "loss": 1.0829, + "step": 6210 + }, + { + "epoch": 0.59, + "grad_norm": 0.2379708775530534, + "learning_rate": 0.00016842402964609209, + "loss": 1.0688, + "step": 6211 + }, + { + "epoch": 0.59, + "grad_norm": 0.278600564728756, + "learning_rate": 0.00016841249232167372, + "loss": 0.975, + "step": 6212 + }, + { + "epoch": 0.59, + "grad_norm": 0.2995894305662367, + "learning_rate": 0.00016840095328518975, + "loss": 1.0657, + "step": 6213 + }, + { + "epoch": 0.59, + "grad_norm": 0.28752497877593164, + "learning_rate": 0.00016838941253692895, + "loss": 1.1441, + "step": 6214 + }, + { + "epoch": 0.59, + "grad_norm": 0.24322263909794603, + "learning_rate": 0.00016837787007718008, + "loss": 0.9605, + "step": 6215 + }, + { + "epoch": 0.59, + "grad_norm": 0.2706035311294752, + "learning_rate": 0.00016836632590623208, + "loss": 1.0975, + "step": 6216 + }, + { + "epoch": 0.59, + "grad_norm": 0.32174695455051877, + "learning_rate": 0.0001683547800243738, + "loss": 1.0976, + "step": 6217 + }, + { + "epoch": 0.59, + "grad_norm": 0.2886586498798772, + "learning_rate": 0.00016834323243189415, + "loss": 1.1534, + "step": 6218 + }, + { + "epoch": 0.59, + "grad_norm": 0.2855976937236902, + "learning_rate": 0.00016833168312908222, + "loss": 1.1865, + "step": 6219 + }, + { + "epoch": 0.6, + "grad_norm": 0.2667954790754751, + "learning_rate": 0.00016832013211622694, + "loss": 1.2281, + "step": 6220 + }, + { + "epoch": 0.6, + "grad_norm": 0.24933114672059709, + "learning_rate": 0.00016830857939361738, + "loss": 0.9391, + "step": 6221 + }, + { + "epoch": 0.6, + "grad_norm": 0.27072321221050405, + "learning_rate": 0.0001682970249615427, + "loss": 1.0534, + "step": 6222 + }, + { + "epoch": 0.6, + "grad_norm": 0.28347062120739175, + "learning_rate": 0.0001682854688202921, + "loss": 1.0556, + "step": 6223 + }, + { + "epoch": 0.6, + "grad_norm": 0.31128711878834986, + "learning_rate": 0.00016827391097015473, + "loss": 1.0196, + "step": 6224 + }, + { + "epoch": 0.6, + "grad_norm": 0.29466538688649235, + "learning_rate": 0.00016826235141141976, + "loss": 1.1365, + "step": 6225 + }, + { + "epoch": 0.6, + "grad_norm": 0.25144131922524754, + "learning_rate": 0.0001682507901443766, + "loss": 1.01, + "step": 6226 + }, + { + "epoch": 0.6, + "grad_norm": 0.2613185058399168, + "learning_rate": 0.00016823922716931451, + "loss": 1.0745, + "step": 6227 + }, + { + "epoch": 0.6, + "grad_norm": 0.27346829355051, + "learning_rate": 0.00016822766248652288, + "loss": 1.0676, + "step": 6228 + }, + { + "epoch": 0.6, + "grad_norm": 0.28753894574718736, + "learning_rate": 0.0001682160960962911, + "loss": 1.1616, + "step": 6229 + }, + { + "epoch": 0.6, + "grad_norm": 0.27774064585558034, + "learning_rate": 0.00016820452799890865, + "loss": 1.0945, + "step": 6230 + }, + { + "epoch": 0.6, + "grad_norm": 0.29844620149160067, + "learning_rate": 0.00016819295819466503, + "loss": 1.1999, + "step": 6231 + }, + { + "epoch": 0.6, + "grad_norm": 0.23278252939346147, + "learning_rate": 0.00016818138668384976, + "loss": 1.1572, + "step": 6232 + }, + { + "epoch": 0.6, + "grad_norm": 0.2910036401832354, + "learning_rate": 0.00016816981346675242, + "loss": 1.0418, + "step": 6233 + }, + { + "epoch": 0.6, + "grad_norm": 0.2679309567861235, + "learning_rate": 0.00016815823854366267, + "loss": 1.1591, + "step": 6234 + }, + { + "epoch": 0.6, + "grad_norm": 0.28959484717417816, + "learning_rate": 0.0001681466619148702, + "loss": 1.0724, + "step": 6235 + }, + { + "epoch": 0.6, + "grad_norm": 0.24969815430510078, + "learning_rate": 0.00016813508358066466, + "loss": 1.0834, + "step": 6236 + }, + { + "epoch": 0.6, + "grad_norm": 0.2817944152182526, + "learning_rate": 0.00016812350354133583, + "loss": 1.0752, + "step": 6237 + }, + { + "epoch": 0.6, + "grad_norm": 0.3095644924541878, + "learning_rate": 0.00016811192179717353, + "loss": 1.0517, + "step": 6238 + }, + { + "epoch": 0.6, + "grad_norm": 0.2826064816728548, + "learning_rate": 0.00016810033834846754, + "loss": 0.9918, + "step": 6239 + }, + { + "epoch": 0.6, + "grad_norm": 0.2670350115993338, + "learning_rate": 0.0001680887531955078, + "loss": 1.0787, + "step": 6240 + }, + { + "epoch": 0.6, + "grad_norm": 0.2786656526566983, + "learning_rate": 0.00016807716633858425, + "loss": 1.1872, + "step": 6241 + }, + { + "epoch": 0.6, + "grad_norm": 0.24633827043340034, + "learning_rate": 0.00016806557777798676, + "loss": 1.1754, + "step": 6242 + }, + { + "epoch": 0.6, + "grad_norm": 0.2510571689731006, + "learning_rate": 0.00016805398751400548, + "loss": 1.0483, + "step": 6243 + }, + { + "epoch": 0.6, + "grad_norm": 0.25726341692210536, + "learning_rate": 0.00016804239554693036, + "loss": 1.0626, + "step": 6244 + }, + { + "epoch": 0.6, + "grad_norm": 0.2973987113917484, + "learning_rate": 0.00016803080187705152, + "loss": 1.1194, + "step": 6245 + }, + { + "epoch": 0.6, + "grad_norm": 0.24985679408993092, + "learning_rate": 0.00016801920650465912, + "loss": 0.9716, + "step": 6246 + }, + { + "epoch": 0.6, + "grad_norm": 0.2373094805089608, + "learning_rate": 0.00016800760943004334, + "loss": 0.9422, + "step": 6247 + }, + { + "epoch": 0.6, + "grad_norm": 0.30043718466849206, + "learning_rate": 0.0001679960106534944, + "loss": 1.0484, + "step": 6248 + }, + { + "epoch": 0.6, + "grad_norm": 0.286122704125855, + "learning_rate": 0.0001679844101753025, + "loss": 1.1008, + "step": 6249 + }, + { + "epoch": 0.6, + "grad_norm": 0.262959271095953, + "learning_rate": 0.00016797280799575804, + "loss": 1.1192, + "step": 6250 + }, + { + "epoch": 0.6, + "grad_norm": 0.2715165644061917, + "learning_rate": 0.00016796120411515138, + "loss": 1.0921, + "step": 6251 + }, + { + "epoch": 0.6, + "grad_norm": 0.28638836988275007, + "learning_rate": 0.00016794959853377284, + "loss": 1.1773, + "step": 6252 + }, + { + "epoch": 0.6, + "grad_norm": 0.2776266911958128, + "learning_rate": 0.00016793799125191288, + "loss": 1.0659, + "step": 6253 + }, + { + "epoch": 0.6, + "grad_norm": 0.28766018953162986, + "learning_rate": 0.00016792638226986202, + "loss": 1.0234, + "step": 6254 + }, + { + "epoch": 0.6, + "grad_norm": 0.24147276451994776, + "learning_rate": 0.00016791477158791077, + "loss": 1.0404, + "step": 6255 + }, + { + "epoch": 0.6, + "grad_norm": 0.28431529367278485, + "learning_rate": 0.00016790315920634964, + "loss": 1.1169, + "step": 6256 + }, + { + "epoch": 0.6, + "grad_norm": 0.297855580080018, + "learning_rate": 0.00016789154512546927, + "loss": 1.1081, + "step": 6257 + }, + { + "epoch": 0.6, + "grad_norm": 0.26389657019849716, + "learning_rate": 0.00016787992934556032, + "loss": 1.0314, + "step": 6258 + }, + { + "epoch": 0.6, + "grad_norm": 0.2928163841306302, + "learning_rate": 0.0001678683118669135, + "loss": 1.2123, + "step": 6259 + }, + { + "epoch": 0.6, + "grad_norm": 0.2975129278676374, + "learning_rate": 0.00016785669268981949, + "loss": 1.1446, + "step": 6260 + }, + { + "epoch": 0.6, + "grad_norm": 0.2853505641305865, + "learning_rate": 0.00016784507181456912, + "loss": 0.9765, + "step": 6261 + }, + { + "epoch": 0.6, + "grad_norm": 0.29085919322220416, + "learning_rate": 0.0001678334492414532, + "loss": 1.0566, + "step": 6262 + }, + { + "epoch": 0.6, + "grad_norm": 0.2951008224142174, + "learning_rate": 0.00016782182497076257, + "loss": 1.0795, + "step": 6263 + }, + { + "epoch": 0.6, + "grad_norm": 0.2866385453022363, + "learning_rate": 0.00016781019900278813, + "loss": 1.043, + "step": 6264 + }, + { + "epoch": 0.6, + "grad_norm": 0.28440677949059723, + "learning_rate": 0.00016779857133782087, + "loss": 1.0659, + "step": 6265 + }, + { + "epoch": 0.6, + "grad_norm": 0.2513177187771401, + "learning_rate": 0.0001677869419761517, + "loss": 1.0256, + "step": 6266 + }, + { + "epoch": 0.6, + "grad_norm": 0.28595770700712264, + "learning_rate": 0.00016777531091807175, + "loss": 1.1266, + "step": 6267 + }, + { + "epoch": 0.6, + "grad_norm": 0.2514182299644808, + "learning_rate": 0.00016776367816387206, + "loss": 1.118, + "step": 6268 + }, + { + "epoch": 0.6, + "grad_norm": 0.2766744455914722, + "learning_rate": 0.00016775204371384373, + "loss": 1.0138, + "step": 6269 + }, + { + "epoch": 0.6, + "grad_norm": 0.30206625962707667, + "learning_rate": 0.0001677404075682779, + "loss": 1.194, + "step": 6270 + }, + { + "epoch": 0.6, + "grad_norm": 0.2631889376384602, + "learning_rate": 0.0001677287697274658, + "loss": 0.9856, + "step": 6271 + }, + { + "epoch": 0.6, + "grad_norm": 0.25406693794632235, + "learning_rate": 0.0001677171301916987, + "loss": 1.0063, + "step": 6272 + }, + { + "epoch": 0.6, + "grad_norm": 0.28950694169287894, + "learning_rate": 0.00016770548896126783, + "loss": 1.0609, + "step": 6273 + }, + { + "epoch": 0.6, + "grad_norm": 0.2761692174626653, + "learning_rate": 0.00016769384603646455, + "loss": 1.0876, + "step": 6274 + }, + { + "epoch": 0.6, + "grad_norm": 0.27628317931169183, + "learning_rate": 0.00016768220141758023, + "loss": 1.0243, + "step": 6275 + }, + { + "epoch": 0.6, + "grad_norm": 0.2660866756756718, + "learning_rate": 0.0001676705551049063, + "loss": 1.0615, + "step": 6276 + }, + { + "epoch": 0.6, + "grad_norm": 0.24048261615675012, + "learning_rate": 0.0001676589070987342, + "loss": 1.152, + "step": 6277 + }, + { + "epoch": 0.6, + "grad_norm": 0.30207736888758835, + "learning_rate": 0.0001676472573993554, + "loss": 1.1271, + "step": 6278 + }, + { + "epoch": 0.6, + "grad_norm": 0.28419649601292135, + "learning_rate": 0.0001676356060070615, + "loss": 1.0366, + "step": 6279 + }, + { + "epoch": 0.6, + "grad_norm": 0.25939328583829085, + "learning_rate": 0.00016762395292214404, + "loss": 1.1239, + "step": 6280 + }, + { + "epoch": 0.6, + "grad_norm": 0.24732667648488688, + "learning_rate": 0.00016761229814489466, + "loss": 1.0374, + "step": 6281 + }, + { + "epoch": 0.6, + "grad_norm": 0.28033363094981284, + "learning_rate": 0.00016760064167560502, + "loss": 1.1033, + "step": 6282 + }, + { + "epoch": 0.6, + "grad_norm": 0.2837131506670996, + "learning_rate": 0.00016758898351456683, + "loss": 1.1314, + "step": 6283 + }, + { + "epoch": 0.6, + "grad_norm": 0.3341961682280646, + "learning_rate": 0.00016757732366207186, + "loss": 1.1564, + "step": 6284 + }, + { + "epoch": 0.6, + "grad_norm": 0.2789710186742297, + "learning_rate": 0.00016756566211841188, + "loss": 1.0238, + "step": 6285 + }, + { + "epoch": 0.6, + "grad_norm": 0.3018369450049017, + "learning_rate": 0.00016755399888387874, + "loss": 1.0962, + "step": 6286 + }, + { + "epoch": 0.6, + "grad_norm": 0.265390590494802, + "learning_rate": 0.00016754233395876439, + "loss": 1.0181, + "step": 6287 + }, + { + "epoch": 0.6, + "grad_norm": 0.2849229079192597, + "learning_rate": 0.0001675306673433606, + "loss": 1.0362, + "step": 6288 + }, + { + "epoch": 0.6, + "grad_norm": 0.279551424881973, + "learning_rate": 0.00016751899903795947, + "loss": 1.0863, + "step": 6289 + }, + { + "epoch": 0.6, + "grad_norm": 0.30798802423123717, + "learning_rate": 0.00016750732904285292, + "loss": 1.0984, + "step": 6290 + }, + { + "epoch": 0.6, + "grad_norm": 0.28386059483142134, + "learning_rate": 0.00016749565735833306, + "loss": 1.0513, + "step": 6291 + }, + { + "epoch": 0.6, + "grad_norm": 0.2971213923065991, + "learning_rate": 0.0001674839839846919, + "loss": 1.0989, + "step": 6292 + }, + { + "epoch": 0.6, + "grad_norm": 0.279028774197727, + "learning_rate": 0.0001674723089222217, + "loss": 0.9628, + "step": 6293 + }, + { + "epoch": 0.6, + "grad_norm": 0.3005236299374139, + "learning_rate": 0.00016746063217121452, + "loss": 0.9983, + "step": 6294 + }, + { + "epoch": 0.6, + "grad_norm": 0.2556549845006676, + "learning_rate": 0.00016744895373196265, + "loss": 1.0838, + "step": 6295 + }, + { + "epoch": 0.6, + "grad_norm": 0.2754893169400004, + "learning_rate": 0.00016743727360475833, + "loss": 1.0528, + "step": 6296 + }, + { + "epoch": 0.6, + "grad_norm": 0.2525035036290481, + "learning_rate": 0.00016742559178989383, + "loss": 1.1438, + "step": 6297 + }, + { + "epoch": 0.6, + "grad_norm": 0.2994082116871912, + "learning_rate": 0.00016741390828766152, + "loss": 1.0641, + "step": 6298 + }, + { + "epoch": 0.6, + "grad_norm": 0.2915966725527121, + "learning_rate": 0.00016740222309835382, + "loss": 1.1256, + "step": 6299 + }, + { + "epoch": 0.6, + "grad_norm": 0.2976305302047498, + "learning_rate": 0.00016739053622226305, + "loss": 1.0256, + "step": 6300 + }, + { + "epoch": 0.6, + "grad_norm": 0.30003197401374204, + "learning_rate": 0.00016737884765968185, + "loss": 1.1071, + "step": 6301 + }, + { + "epoch": 0.6, + "grad_norm": 0.25781935696696723, + "learning_rate": 0.0001673671574109026, + "loss": 1.0172, + "step": 6302 + }, + { + "epoch": 0.6, + "grad_norm": 0.2854845428390189, + "learning_rate": 0.00016735546547621787, + "loss": 1.1589, + "step": 6303 + }, + { + "epoch": 0.6, + "grad_norm": 0.30166239723579663, + "learning_rate": 0.00016734377185592032, + "loss": 1.1384, + "step": 6304 + }, + { + "epoch": 0.6, + "grad_norm": 0.28301402595792946, + "learning_rate": 0.00016733207655030254, + "loss": 1.0882, + "step": 6305 + }, + { + "epoch": 0.6, + "grad_norm": 0.25169386672382876, + "learning_rate": 0.00016732037955965724, + "loss": 1.1254, + "step": 6306 + }, + { + "epoch": 0.6, + "grad_norm": 0.28028422765627997, + "learning_rate": 0.00016730868088427712, + "loss": 1.0061, + "step": 6307 + }, + { + "epoch": 0.6, + "grad_norm": 0.29537665905438676, + "learning_rate": 0.00016729698052445497, + "loss": 1.0539, + "step": 6308 + }, + { + "epoch": 0.6, + "grad_norm": 0.28211755903503166, + "learning_rate": 0.0001672852784804836, + "loss": 1.0818, + "step": 6309 + }, + { + "epoch": 0.6, + "grad_norm": 0.2685003161595143, + "learning_rate": 0.00016727357475265582, + "loss": 0.9727, + "step": 6310 + }, + { + "epoch": 0.6, + "grad_norm": 0.271237791328004, + "learning_rate": 0.00016726186934126457, + "loss": 1.0693, + "step": 6311 + }, + { + "epoch": 0.6, + "grad_norm": 0.29234966306787097, + "learning_rate": 0.00016725016224660274, + "loss": 1.0985, + "step": 6312 + }, + { + "epoch": 0.6, + "grad_norm": 0.2415128764029434, + "learning_rate": 0.00016723845346896336, + "loss": 1.0949, + "step": 6313 + }, + { + "epoch": 0.6, + "grad_norm": 0.28487373231440377, + "learning_rate": 0.00016722674300863942, + "loss": 1.0031, + "step": 6314 + }, + { + "epoch": 0.6, + "grad_norm": 0.24380319070580195, + "learning_rate": 0.00016721503086592398, + "loss": 0.9791, + "step": 6315 + }, + { + "epoch": 0.6, + "grad_norm": 0.2854973044582919, + "learning_rate": 0.00016720331704111015, + "loss": 1.0771, + "step": 6316 + }, + { + "epoch": 0.6, + "grad_norm": 0.2926702434021319, + "learning_rate": 0.0001671916015344911, + "loss": 1.0796, + "step": 6317 + }, + { + "epoch": 0.6, + "grad_norm": 0.2625594361259241, + "learning_rate": 0.00016717988434636, + "loss": 1.0848, + "step": 6318 + }, + { + "epoch": 0.6, + "grad_norm": 0.272123940034111, + "learning_rate": 0.00016716816547701003, + "loss": 1.0751, + "step": 6319 + }, + { + "epoch": 0.6, + "grad_norm": 0.25026070785323234, + "learning_rate": 0.00016715644492673452, + "loss": 1.0925, + "step": 6320 + }, + { + "epoch": 0.6, + "grad_norm": 0.28111670498694746, + "learning_rate": 0.00016714472269582678, + "loss": 1.1395, + "step": 6321 + }, + { + "epoch": 0.6, + "grad_norm": 0.30210515943962146, + "learning_rate": 0.00016713299878458012, + "loss": 1.117, + "step": 6322 + }, + { + "epoch": 0.6, + "grad_norm": 0.2717616248041951, + "learning_rate": 0.00016712127319328803, + "loss": 1.0637, + "step": 6323 + }, + { + "epoch": 0.61, + "grad_norm": 0.24801900424048062, + "learning_rate": 0.00016710954592224386, + "loss": 1.1057, + "step": 6324 + }, + { + "epoch": 0.61, + "grad_norm": 0.2898349287109403, + "learning_rate": 0.00016709781697174113, + "loss": 1.0904, + "step": 6325 + }, + { + "epoch": 0.61, + "grad_norm": 0.2615926237179176, + "learning_rate": 0.00016708608634207338, + "loss": 1.013, + "step": 6326 + }, + { + "epoch": 0.61, + "grad_norm": 0.25404922506608696, + "learning_rate": 0.00016707435403353412, + "loss": 1.0322, + "step": 6327 + }, + { + "epoch": 0.61, + "grad_norm": 0.2993573106392043, + "learning_rate": 0.000167062620046417, + "loss": 1.1574, + "step": 6328 + }, + { + "epoch": 0.61, + "grad_norm": 0.27922342474327827, + "learning_rate": 0.0001670508843810157, + "loss": 1.0711, + "step": 6329 + }, + { + "epoch": 0.61, + "grad_norm": 0.3011854686091883, + "learning_rate": 0.00016703914703762387, + "loss": 1.054, + "step": 6330 + }, + { + "epoch": 0.61, + "grad_norm": 0.25394776998572083, + "learning_rate": 0.00016702740801653523, + "loss": 1.0363, + "step": 6331 + }, + { + "epoch": 0.61, + "grad_norm": 0.32790248402799294, + "learning_rate": 0.00016701566731804358, + "loss": 1.1621, + "step": 6332 + }, + { + "epoch": 0.61, + "grad_norm": 0.2825346749177906, + "learning_rate": 0.00016700392494244277, + "loss": 0.9924, + "step": 6333 + }, + { + "epoch": 0.61, + "grad_norm": 0.2615222834120971, + "learning_rate": 0.0001669921808900266, + "loss": 1.1129, + "step": 6334 + }, + { + "epoch": 0.61, + "grad_norm": 0.2685877115229659, + "learning_rate": 0.000166980435161089, + "loss": 1.0624, + "step": 6335 + }, + { + "epoch": 0.61, + "grad_norm": 0.2782305195102996, + "learning_rate": 0.00016696868775592394, + "loss": 1.1125, + "step": 6336 + }, + { + "epoch": 0.61, + "grad_norm": 0.26429721749726304, + "learning_rate": 0.00016695693867482535, + "loss": 1.0898, + "step": 6337 + }, + { + "epoch": 0.61, + "grad_norm": 0.3006719307055734, + "learning_rate": 0.0001669451879180873, + "loss": 1.1202, + "step": 6338 + }, + { + "epoch": 0.61, + "grad_norm": 0.2546442290098055, + "learning_rate": 0.00016693343548600386, + "loss": 1.0205, + "step": 6339 + }, + { + "epoch": 0.61, + "grad_norm": 0.3163145075653169, + "learning_rate": 0.00016692168137886912, + "loss": 1.0082, + "step": 6340 + }, + { + "epoch": 0.61, + "grad_norm": 0.26767591471900337, + "learning_rate": 0.00016690992559697726, + "loss": 1.0872, + "step": 6341 + }, + { + "epoch": 0.61, + "grad_norm": 0.3092574347023085, + "learning_rate": 0.00016689816814062245, + "loss": 1.0457, + "step": 6342 + }, + { + "epoch": 0.61, + "grad_norm": 0.2637784366965473, + "learning_rate": 0.00016688640901009894, + "loss": 1.0477, + "step": 6343 + }, + { + "epoch": 0.61, + "grad_norm": 0.30076256747451274, + "learning_rate": 0.000166874648205701, + "loss": 1.1079, + "step": 6344 + }, + { + "epoch": 0.61, + "grad_norm": 0.2923545840530909, + "learning_rate": 0.00016686288572772295, + "loss": 1.2222, + "step": 6345 + }, + { + "epoch": 0.61, + "grad_norm": 0.3153084026273102, + "learning_rate": 0.00016685112157645916, + "loss": 0.9666, + "step": 6346 + }, + { + "epoch": 0.61, + "grad_norm": 0.2963713357046184, + "learning_rate": 0.00016683935575220407, + "loss": 1.0839, + "step": 6347 + }, + { + "epoch": 0.61, + "grad_norm": 0.2737775027067931, + "learning_rate": 0.00016682758825525208, + "loss": 1.1275, + "step": 6348 + }, + { + "epoch": 0.61, + "grad_norm": 0.2800942528032241, + "learning_rate": 0.00016681581908589772, + "loss": 0.9692, + "step": 6349 + }, + { + "epoch": 0.61, + "grad_norm": 0.3109943800000079, + "learning_rate": 0.00016680404824443546, + "loss": 1.0421, + "step": 6350 + }, + { + "epoch": 0.61, + "grad_norm": 0.27462352106623983, + "learning_rate": 0.0001667922757311599, + "loss": 1.1845, + "step": 6351 + }, + { + "epoch": 0.61, + "grad_norm": 0.32025726456616344, + "learning_rate": 0.0001667805015463657, + "loss": 1.0136, + "step": 6352 + }, + { + "epoch": 0.61, + "grad_norm": 0.301660874521527, + "learning_rate": 0.00016676872569034744, + "loss": 1.1865, + "step": 6353 + }, + { + "epoch": 0.61, + "grad_norm": 0.29842518248408173, + "learning_rate": 0.00016675694816339987, + "loss": 1.2503, + "step": 6354 + }, + { + "epoch": 0.61, + "grad_norm": 0.26028217660743824, + "learning_rate": 0.00016674516896581773, + "loss": 0.9257, + "step": 6355 + }, + { + "epoch": 0.61, + "grad_norm": 0.290277459931021, + "learning_rate": 0.00016673338809789577, + "loss": 1.0985, + "step": 6356 + }, + { + "epoch": 0.61, + "grad_norm": 0.27210815585379544, + "learning_rate": 0.00016672160555992885, + "loss": 1.1111, + "step": 6357 + }, + { + "epoch": 0.61, + "grad_norm": 0.27064221588352777, + "learning_rate": 0.0001667098213522118, + "loss": 0.9628, + "step": 6358 + }, + { + "epoch": 0.61, + "grad_norm": 0.31149953692608723, + "learning_rate": 0.00016669803547503958, + "loss": 1.0404, + "step": 6359 + }, + { + "epoch": 0.61, + "grad_norm": 0.2859751349658579, + "learning_rate": 0.0001666862479287071, + "loss": 0.9679, + "step": 6360 + }, + { + "epoch": 0.61, + "grad_norm": 0.29734676588988057, + "learning_rate": 0.0001666744587135093, + "loss": 1.1466, + "step": 6361 + }, + { + "epoch": 0.61, + "grad_norm": 0.2695100397906395, + "learning_rate": 0.00016666266782974133, + "loss": 1.0106, + "step": 6362 + }, + { + "epoch": 0.61, + "grad_norm": 0.33059150738359744, + "learning_rate": 0.00016665087527769815, + "loss": 1.1697, + "step": 6363 + }, + { + "epoch": 0.61, + "grad_norm": 0.2783320081837106, + "learning_rate": 0.00016663908105767495, + "loss": 1.0822, + "step": 6364 + }, + { + "epoch": 0.61, + "grad_norm": 0.31565170424334077, + "learning_rate": 0.00016662728516996688, + "loss": 1.1806, + "step": 6365 + }, + { + "epoch": 0.61, + "grad_norm": 0.2709155225707643, + "learning_rate": 0.0001666154876148691, + "loss": 1.0718, + "step": 6366 + }, + { + "epoch": 0.61, + "grad_norm": 0.2743956028640576, + "learning_rate": 0.00016660368839267693, + "loss": 1.1101, + "step": 6367 + }, + { + "epoch": 0.61, + "grad_norm": 0.3083906678441462, + "learning_rate": 0.00016659188750368554, + "loss": 1.1813, + "step": 6368 + }, + { + "epoch": 0.61, + "grad_norm": 0.27005425077077816, + "learning_rate": 0.00016658008494819032, + "loss": 1.01, + "step": 6369 + }, + { + "epoch": 0.61, + "grad_norm": 0.3190770015107477, + "learning_rate": 0.00016656828072648665, + "loss": 1.0918, + "step": 6370 + }, + { + "epoch": 0.61, + "grad_norm": 0.2786920758144284, + "learning_rate": 0.00016655647483886988, + "loss": 0.9536, + "step": 6371 + }, + { + "epoch": 0.61, + "grad_norm": 0.28799801927513924, + "learning_rate": 0.00016654466728563557, + "loss": 1.1653, + "step": 6372 + }, + { + "epoch": 0.61, + "grad_norm": 0.2740543474851488, + "learning_rate": 0.00016653285806707908, + "loss": 1.0493, + "step": 6373 + }, + { + "epoch": 0.61, + "grad_norm": 0.283012370411115, + "learning_rate": 0.000166521047183496, + "loss": 1.0719, + "step": 6374 + }, + { + "epoch": 0.61, + "grad_norm": 0.2866968650229277, + "learning_rate": 0.00016650923463518196, + "loss": 0.9907, + "step": 6375 + }, + { + "epoch": 0.61, + "grad_norm": 0.2802944367538119, + "learning_rate": 0.00016649742042243248, + "loss": 1.0706, + "step": 6376 + }, + { + "epoch": 0.61, + "grad_norm": 0.27154248970374756, + "learning_rate": 0.00016648560454554328, + "loss": 1.1223, + "step": 6377 + }, + { + "epoch": 0.61, + "grad_norm": 0.2838775500172997, + "learning_rate": 0.00016647378700481005, + "loss": 1.0581, + "step": 6378 + }, + { + "epoch": 0.61, + "grad_norm": 0.2920529073936615, + "learning_rate": 0.00016646196780052848, + "loss": 1.0481, + "step": 6379 + }, + { + "epoch": 0.61, + "grad_norm": 0.2670726827629129, + "learning_rate": 0.00016645014693299442, + "loss": 1.0974, + "step": 6380 + }, + { + "epoch": 0.61, + "grad_norm": 0.24730959097979163, + "learning_rate": 0.00016643832440250367, + "loss": 1.1209, + "step": 6381 + }, + { + "epoch": 0.61, + "grad_norm": 0.2947114263680111, + "learning_rate": 0.00016642650020935214, + "loss": 0.9936, + "step": 6382 + }, + { + "epoch": 0.61, + "grad_norm": 0.3233060078932617, + "learning_rate": 0.00016641467435383564, + "loss": 1.1597, + "step": 6383 + }, + { + "epoch": 0.61, + "grad_norm": 0.26002181675498415, + "learning_rate": 0.00016640284683625017, + "loss": 0.9699, + "step": 6384 + }, + { + "epoch": 0.61, + "grad_norm": 0.26273440181552443, + "learning_rate": 0.0001663910176568918, + "loss": 1.1475, + "step": 6385 + }, + { + "epoch": 0.61, + "grad_norm": 0.24674523967677028, + "learning_rate": 0.00016637918681605639, + "loss": 1.0134, + "step": 6386 + }, + { + "epoch": 0.61, + "grad_norm": 0.2618701917902359, + "learning_rate": 0.00016636735431404019, + "loss": 1.1473, + "step": 6387 + }, + { + "epoch": 0.61, + "grad_norm": 0.28468586915692684, + "learning_rate": 0.00016635552015113918, + "loss": 1.0526, + "step": 6388 + }, + { + "epoch": 0.61, + "grad_norm": 0.3028992395965799, + "learning_rate": 0.0001663436843276496, + "loss": 1.0536, + "step": 6389 + }, + { + "epoch": 0.61, + "grad_norm": 0.2789358949175681, + "learning_rate": 0.00016633184684386763, + "loss": 1.1303, + "step": 6390 + }, + { + "epoch": 0.61, + "grad_norm": 0.29126579487824283, + "learning_rate": 0.00016632000770008947, + "loss": 1.0763, + "step": 6391 + }, + { + "epoch": 0.61, + "grad_norm": 0.27542318323495574, + "learning_rate": 0.0001663081668966115, + "loss": 1.1802, + "step": 6392 + }, + { + "epoch": 0.61, + "grad_norm": 0.30182241732147397, + "learning_rate": 0.00016629632443372993, + "loss": 1.054, + "step": 6393 + }, + { + "epoch": 0.61, + "grad_norm": 0.2819317111614903, + "learning_rate": 0.0001662844803117412, + "loss": 1.0247, + "step": 6394 + }, + { + "epoch": 0.61, + "grad_norm": 0.2813661907175226, + "learning_rate": 0.00016627263453094168, + "loss": 1.153, + "step": 6395 + }, + { + "epoch": 0.61, + "grad_norm": 0.3156855165232155, + "learning_rate": 0.00016626078709162782, + "loss": 1.1216, + "step": 6396 + }, + { + "epoch": 0.61, + "grad_norm": 0.2762036811217373, + "learning_rate": 0.00016624893799409613, + "loss": 1.0578, + "step": 6397 + }, + { + "epoch": 0.61, + "grad_norm": 0.25813174874980066, + "learning_rate": 0.00016623708723864314, + "loss": 1.0028, + "step": 6398 + }, + { + "epoch": 0.61, + "grad_norm": 0.2547111721851036, + "learning_rate": 0.0001662252348255654, + "loss": 1.0322, + "step": 6399 + }, + { + "epoch": 0.61, + "grad_norm": 0.2670666010121307, + "learning_rate": 0.00016621338075515954, + "loss": 1.0951, + "step": 6400 + }, + { + "epoch": 0.61, + "grad_norm": 0.25941920255451006, + "learning_rate": 0.00016620152502772224, + "loss": 1.009, + "step": 6401 + }, + { + "epoch": 0.61, + "grad_norm": 0.32953249149853675, + "learning_rate": 0.00016618966764355016, + "loss": 1.1824, + "step": 6402 + }, + { + "epoch": 0.61, + "grad_norm": 0.3400097911397109, + "learning_rate": 0.00016617780860294002, + "loss": 1.0785, + "step": 6403 + }, + { + "epoch": 0.61, + "grad_norm": 0.2606374235637558, + "learning_rate": 0.00016616594790618865, + "loss": 1.0066, + "step": 6404 + }, + { + "epoch": 0.61, + "grad_norm": 0.28689829208173906, + "learning_rate": 0.00016615408555359284, + "loss": 1.1033, + "step": 6405 + }, + { + "epoch": 0.61, + "grad_norm": 0.275071242493895, + "learning_rate": 0.00016614222154544948, + "loss": 1.0145, + "step": 6406 + }, + { + "epoch": 0.61, + "grad_norm": 0.3036143858185991, + "learning_rate": 0.00016613035588205542, + "loss": 0.976, + "step": 6407 + }, + { + "epoch": 0.61, + "grad_norm": 0.2739250339275323, + "learning_rate": 0.00016611848856370768, + "loss": 0.9842, + "step": 6408 + }, + { + "epoch": 0.61, + "grad_norm": 0.269718907859172, + "learning_rate": 0.0001661066195907032, + "loss": 1.0573, + "step": 6409 + }, + { + "epoch": 0.61, + "grad_norm": 0.27815112294798416, + "learning_rate": 0.000166094748963339, + "loss": 1.1291, + "step": 6410 + }, + { + "epoch": 0.61, + "grad_norm": 0.2810699578192083, + "learning_rate": 0.0001660828766819122, + "loss": 0.9918, + "step": 6411 + }, + { + "epoch": 0.61, + "grad_norm": 0.25506946682445086, + "learning_rate": 0.00016607100274671982, + "loss": 1.1449, + "step": 6412 + }, + { + "epoch": 0.61, + "grad_norm": 0.2890524163750954, + "learning_rate": 0.00016605912715805915, + "loss": 1.124, + "step": 6413 + }, + { + "epoch": 0.61, + "grad_norm": 0.29021337271256664, + "learning_rate": 0.00016604724991622726, + "loss": 1.0194, + "step": 6414 + }, + { + "epoch": 0.61, + "grad_norm": 0.27539899144845825, + "learning_rate": 0.00016603537102152145, + "loss": 1.077, + "step": 6415 + }, + { + "epoch": 0.61, + "grad_norm": 0.29031381020521185, + "learning_rate": 0.00016602349047423895, + "loss": 1.068, + "step": 6416 + }, + { + "epoch": 0.61, + "grad_norm": 0.259955978828572, + "learning_rate": 0.00016601160827467713, + "loss": 0.9664, + "step": 6417 + }, + { + "epoch": 0.61, + "grad_norm": 0.29036829398239994, + "learning_rate": 0.00016599972442313333, + "loss": 1.0306, + "step": 6418 + }, + { + "epoch": 0.61, + "grad_norm": 0.2771649514160956, + "learning_rate": 0.00016598783891990496, + "loss": 1.0726, + "step": 6419 + }, + { + "epoch": 0.61, + "grad_norm": 0.2597924047332126, + "learning_rate": 0.00016597595176528942, + "loss": 1.0015, + "step": 6420 + }, + { + "epoch": 0.61, + "grad_norm": 0.291533841923736, + "learning_rate": 0.00016596406295958421, + "loss": 1.0385, + "step": 6421 + }, + { + "epoch": 0.61, + "grad_norm": 0.27196336627994283, + "learning_rate": 0.0001659521725030869, + "loss": 1.0915, + "step": 6422 + }, + { + "epoch": 0.61, + "grad_norm": 0.3091516771574349, + "learning_rate": 0.00016594028039609504, + "loss": 1.1498, + "step": 6423 + }, + { + "epoch": 0.61, + "grad_norm": 0.269385742381281, + "learning_rate": 0.00016592838663890617, + "loss": 1.1685, + "step": 6424 + }, + { + "epoch": 0.61, + "grad_norm": 0.31387068153422903, + "learning_rate": 0.00016591649123181803, + "loss": 1.0578, + "step": 6425 + }, + { + "epoch": 0.61, + "grad_norm": 0.2715541694948404, + "learning_rate": 0.00016590459417512824, + "loss": 1.0411, + "step": 6426 + }, + { + "epoch": 0.61, + "grad_norm": 0.31668645833456616, + "learning_rate": 0.00016589269546913457, + "loss": 1.045, + "step": 6427 + }, + { + "epoch": 0.61, + "grad_norm": 0.26537674929552535, + "learning_rate": 0.0001658807951141348, + "loss": 0.9741, + "step": 6428 + }, + { + "epoch": 0.62, + "grad_norm": 0.30617980217717927, + "learning_rate": 0.00016586889311042674, + "loss": 1.0323, + "step": 6429 + }, + { + "epoch": 0.62, + "grad_norm": 0.2893643055407271, + "learning_rate": 0.00016585698945830818, + "loss": 0.9841, + "step": 6430 + }, + { + "epoch": 0.62, + "grad_norm": 0.26428036894894746, + "learning_rate": 0.00016584508415807712, + "loss": 1.0233, + "step": 6431 + }, + { + "epoch": 0.62, + "grad_norm": 0.25987613015099514, + "learning_rate": 0.00016583317721003142, + "loss": 0.9113, + "step": 6432 + }, + { + "epoch": 0.62, + "grad_norm": 0.2676269631291612, + "learning_rate": 0.0001658212686144691, + "loss": 1.0322, + "step": 6433 + }, + { + "epoch": 0.62, + "grad_norm": 0.27792570272364814, + "learning_rate": 0.00016580935837168817, + "loss": 1.1305, + "step": 6434 + }, + { + "epoch": 0.62, + "grad_norm": 0.3054138054002042, + "learning_rate": 0.00016579744648198666, + "loss": 0.9909, + "step": 6435 + }, + { + "epoch": 0.62, + "grad_norm": 0.28755696638880873, + "learning_rate": 0.0001657855329456627, + "loss": 1.1987, + "step": 6436 + }, + { + "epoch": 0.62, + "grad_norm": 0.2937260358743029, + "learning_rate": 0.0001657736177630145, + "loss": 1.175, + "step": 6437 + }, + { + "epoch": 0.62, + "grad_norm": 0.25543914091501174, + "learning_rate": 0.00016576170093434008, + "loss": 0.9054, + "step": 6438 + }, + { + "epoch": 0.62, + "grad_norm": 0.2358502828421239, + "learning_rate": 0.00016574978245993783, + "loss": 1.1184, + "step": 6439 + }, + { + "epoch": 0.62, + "grad_norm": 0.32159479122906715, + "learning_rate": 0.00016573786234010593, + "loss": 1.0697, + "step": 6440 + }, + { + "epoch": 0.62, + "grad_norm": 0.29597420637835464, + "learning_rate": 0.0001657259405751427, + "loss": 1.0147, + "step": 6441 + }, + { + "epoch": 0.62, + "grad_norm": 0.27502750185112845, + "learning_rate": 0.0001657140171653465, + "loss": 1.1519, + "step": 6442 + }, + { + "epoch": 0.62, + "grad_norm": 0.28880378692907466, + "learning_rate": 0.00016570209211101578, + "loss": 1.0251, + "step": 6443 + }, + { + "epoch": 0.62, + "grad_norm": 0.29201502579906263, + "learning_rate": 0.00016569016541244884, + "loss": 1.1337, + "step": 6444 + }, + { + "epoch": 0.62, + "grad_norm": 0.2399540511362018, + "learning_rate": 0.00016567823706994426, + "loss": 1.0998, + "step": 6445 + }, + { + "epoch": 0.62, + "grad_norm": 0.3268134405014831, + "learning_rate": 0.00016566630708380052, + "loss": 0.9888, + "step": 6446 + }, + { + "epoch": 0.62, + "grad_norm": 0.3103957574422285, + "learning_rate": 0.00016565437545431618, + "loss": 1.1474, + "step": 6447 + }, + { + "epoch": 0.62, + "grad_norm": 0.2718259809863342, + "learning_rate": 0.0001656424421817898, + "loss": 0.9704, + "step": 6448 + }, + { + "epoch": 0.62, + "grad_norm": 0.27353989936463624, + "learning_rate": 0.00016563050726652007, + "loss": 1.0992, + "step": 6449 + }, + { + "epoch": 0.62, + "grad_norm": 0.2856249195760054, + "learning_rate": 0.00016561857070880565, + "loss": 1.1137, + "step": 6450 + }, + { + "epoch": 0.62, + "grad_norm": 0.273725036502166, + "learning_rate": 0.00016560663250894526, + "loss": 0.9277, + "step": 6451 + }, + { + "epoch": 0.62, + "grad_norm": 0.2763856735936746, + "learning_rate": 0.00016559469266723767, + "loss": 1.0204, + "step": 6452 + }, + { + "epoch": 0.62, + "grad_norm": 0.28085668794591295, + "learning_rate": 0.00016558275118398164, + "loss": 0.9921, + "step": 6453 + }, + { + "epoch": 0.62, + "grad_norm": 0.2735774703202355, + "learning_rate": 0.00016557080805947605, + "loss": 0.9692, + "step": 6454 + }, + { + "epoch": 0.62, + "grad_norm": 0.26616410789129574, + "learning_rate": 0.0001655588632940198, + "loss": 0.9541, + "step": 6455 + }, + { + "epoch": 0.62, + "grad_norm": 0.2890311486765243, + "learning_rate": 0.0001655469168879118, + "loss": 1.0578, + "step": 6456 + }, + { + "epoch": 0.62, + "grad_norm": 0.29609104767351785, + "learning_rate": 0.00016553496884145097, + "loss": 1.0883, + "step": 6457 + }, + { + "epoch": 0.62, + "grad_norm": 0.2608637836134015, + "learning_rate": 0.0001655230191549364, + "loss": 1.0304, + "step": 6458 + }, + { + "epoch": 0.62, + "grad_norm": 0.27959926336448077, + "learning_rate": 0.00016551106782866705, + "loss": 1.0061, + "step": 6459 + }, + { + "epoch": 0.62, + "grad_norm": 0.2771253997843709, + "learning_rate": 0.0001654991148629421, + "loss": 1.1194, + "step": 6460 + }, + { + "epoch": 0.62, + "grad_norm": 0.28449814950864316, + "learning_rate": 0.00016548716025806062, + "loss": 1.088, + "step": 6461 + }, + { + "epoch": 0.62, + "grad_norm": 0.2691654812600947, + "learning_rate": 0.0001654752040143218, + "loss": 1.0968, + "step": 6462 + }, + { + "epoch": 0.62, + "grad_norm": 0.31954023097003986, + "learning_rate": 0.00016546324613202483, + "loss": 0.9794, + "step": 6463 + }, + { + "epoch": 0.62, + "grad_norm": 0.2649303795932421, + "learning_rate": 0.000165451286611469, + "loss": 1.0693, + "step": 6464 + }, + { + "epoch": 0.62, + "grad_norm": 0.2630512812762983, + "learning_rate": 0.0001654393254529536, + "loss": 1.0332, + "step": 6465 + }, + { + "epoch": 0.62, + "grad_norm": 0.2938590102565347, + "learning_rate": 0.00016542736265677795, + "loss": 1.1377, + "step": 6466 + }, + { + "epoch": 0.62, + "grad_norm": 0.29423857954744015, + "learning_rate": 0.0001654153982232414, + "loss": 1.0742, + "step": 6467 + }, + { + "epoch": 0.62, + "grad_norm": 0.2770519185899091, + "learning_rate": 0.00016540343215264342, + "loss": 0.9763, + "step": 6468 + }, + { + "epoch": 0.62, + "grad_norm": 0.27149015096394086, + "learning_rate": 0.00016539146444528345, + "loss": 1.2458, + "step": 6469 + }, + { + "epoch": 0.62, + "grad_norm": 0.2723835734113916, + "learning_rate": 0.00016537949510146097, + "loss": 1.0588, + "step": 6470 + }, + { + "epoch": 0.62, + "grad_norm": 0.2860060696336331, + "learning_rate": 0.00016536752412147555, + "loss": 0.9471, + "step": 6471 + }, + { + "epoch": 0.62, + "grad_norm": 0.256259821327159, + "learning_rate": 0.0001653555515056268, + "loss": 1.0662, + "step": 6472 + }, + { + "epoch": 0.62, + "grad_norm": 0.26590917435909617, + "learning_rate": 0.00016534357725421422, + "loss": 0.9938, + "step": 6473 + }, + { + "epoch": 0.62, + "grad_norm": 0.30036253247255307, + "learning_rate": 0.0001653316013675376, + "loss": 0.9861, + "step": 6474 + }, + { + "epoch": 0.62, + "grad_norm": 0.24447569581162235, + "learning_rate": 0.00016531962384589655, + "loss": 1.0813, + "step": 6475 + }, + { + "epoch": 0.62, + "grad_norm": 0.31129076026873814, + "learning_rate": 0.0001653076446895909, + "loss": 1.0898, + "step": 6476 + }, + { + "epoch": 0.62, + "grad_norm": 0.2755133042122456, + "learning_rate": 0.00016529566389892039, + "loss": 1.0924, + "step": 6477 + }, + { + "epoch": 0.62, + "grad_norm": 0.24784520601298288, + "learning_rate": 0.00016528368147418485, + "loss": 1.2024, + "step": 6478 + }, + { + "epoch": 0.62, + "grad_norm": 0.2869637235166629, + "learning_rate": 0.00016527169741568416, + "loss": 1.0971, + "step": 6479 + }, + { + "epoch": 0.62, + "grad_norm": 0.25871213333736603, + "learning_rate": 0.00016525971172371822, + "loss": 0.993, + "step": 6480 + }, + { + "epoch": 0.62, + "grad_norm": 0.2604087934206153, + "learning_rate": 0.00016524772439858694, + "loss": 1.1067, + "step": 6481 + }, + { + "epoch": 0.62, + "grad_norm": 0.2816319137305283, + "learning_rate": 0.0001652357354405904, + "loss": 1.0467, + "step": 6482 + }, + { + "epoch": 0.62, + "grad_norm": 0.27702751178720125, + "learning_rate": 0.0001652237448500286, + "loss": 1.0449, + "step": 6483 + }, + { + "epoch": 0.62, + "grad_norm": 0.27403004840211326, + "learning_rate": 0.00016521175262720154, + "loss": 1.1382, + "step": 6484 + }, + { + "epoch": 0.62, + "grad_norm": 0.27012660723215115, + "learning_rate": 0.00016519975877240942, + "loss": 1.1069, + "step": 6485 + }, + { + "epoch": 0.62, + "grad_norm": 0.262756289576385, + "learning_rate": 0.00016518776328595234, + "loss": 1.0257, + "step": 6486 + }, + { + "epoch": 0.62, + "grad_norm": 0.2898435765096172, + "learning_rate": 0.0001651757661681305, + "loss": 1.2361, + "step": 6487 + }, + { + "epoch": 0.62, + "grad_norm": 0.2778059074062683, + "learning_rate": 0.0001651637674192442, + "loss": 1.116, + "step": 6488 + }, + { + "epoch": 0.62, + "grad_norm": 0.29747157960661386, + "learning_rate": 0.00016515176703959364, + "loss": 1.0449, + "step": 6489 + }, + { + "epoch": 0.62, + "grad_norm": 0.2758337700493432, + "learning_rate": 0.00016513976502947913, + "loss": 0.9708, + "step": 6490 + }, + { + "epoch": 0.62, + "grad_norm": 0.29558292743334763, + "learning_rate": 0.00016512776138920108, + "loss": 1.0372, + "step": 6491 + }, + { + "epoch": 0.62, + "grad_norm": 0.26433384453275566, + "learning_rate": 0.0001651157561190599, + "loss": 1.1413, + "step": 6492 + }, + { + "epoch": 0.62, + "grad_norm": 0.3164936697696587, + "learning_rate": 0.00016510374921935598, + "loss": 1.1305, + "step": 6493 + }, + { + "epoch": 0.62, + "grad_norm": 0.2680448222970725, + "learning_rate": 0.00016509174069038985, + "loss": 1.2003, + "step": 6494 + }, + { + "epoch": 0.62, + "grad_norm": 0.26701918502526756, + "learning_rate": 0.00016507973053246197, + "loss": 0.9809, + "step": 6495 + }, + { + "epoch": 0.62, + "grad_norm": 0.28698630137463055, + "learning_rate": 0.00016506771874587296, + "loss": 1.1306, + "step": 6496 + }, + { + "epoch": 0.62, + "grad_norm": 0.2861081221048606, + "learning_rate": 0.00016505570533092333, + "loss": 1.2112, + "step": 6497 + }, + { + "epoch": 0.62, + "grad_norm": 0.27462328764365, + "learning_rate": 0.00016504369028791382, + "loss": 1.0667, + "step": 6498 + }, + { + "epoch": 0.62, + "grad_norm": 0.2542616461897349, + "learning_rate": 0.0001650316736171451, + "loss": 1.1647, + "step": 6499 + }, + { + "epoch": 0.62, + "grad_norm": 0.268648077124173, + "learning_rate": 0.00016501965531891786, + "loss": 1.023, + "step": 6500 + }, + { + "epoch": 0.62, + "grad_norm": 0.30932903309172555, + "learning_rate": 0.0001650076353935329, + "loss": 1.0771, + "step": 6501 + }, + { + "epoch": 0.62, + "grad_norm": 0.2711465953776904, + "learning_rate": 0.000164995613841291, + "loss": 1.023, + "step": 6502 + }, + { + "epoch": 0.62, + "grad_norm": 0.2488690322666522, + "learning_rate": 0.000164983590662493, + "loss": 1.1685, + "step": 6503 + }, + { + "epoch": 0.62, + "grad_norm": 0.2447593866202204, + "learning_rate": 0.00016497156585743982, + "loss": 1.0553, + "step": 6504 + }, + { + "epoch": 0.62, + "grad_norm": 0.28996537913588744, + "learning_rate": 0.00016495953942643237, + "loss": 1.0453, + "step": 6505 + }, + { + "epoch": 0.62, + "grad_norm": 0.3100091579089353, + "learning_rate": 0.00016494751136977165, + "loss": 1.0363, + "step": 6506 + }, + { + "epoch": 0.62, + "grad_norm": 0.26438432513334514, + "learning_rate": 0.0001649354816877586, + "loss": 1.0239, + "step": 6507 + }, + { + "epoch": 0.62, + "grad_norm": 0.3136841961736258, + "learning_rate": 0.0001649234503806943, + "loss": 1.1726, + "step": 6508 + }, + { + "epoch": 0.62, + "grad_norm": 0.2793775958515165, + "learning_rate": 0.0001649114174488799, + "loss": 1.0953, + "step": 6509 + }, + { + "epoch": 0.62, + "grad_norm": 0.2991111217534248, + "learning_rate": 0.0001648993828926164, + "loss": 0.9635, + "step": 6510 + }, + { + "epoch": 0.62, + "grad_norm": 0.33123211260741653, + "learning_rate": 0.00016488734671220512, + "loss": 1.09, + "step": 6511 + }, + { + "epoch": 0.62, + "grad_norm": 0.3071684112575216, + "learning_rate": 0.0001648753089079472, + "loss": 1.112, + "step": 6512 + }, + { + "epoch": 0.62, + "grad_norm": 0.2919753257761195, + "learning_rate": 0.0001648632694801439, + "loss": 1.2142, + "step": 6513 + }, + { + "epoch": 0.62, + "grad_norm": 0.3047462341213951, + "learning_rate": 0.00016485122842909653, + "loss": 1.0747, + "step": 6514 + }, + { + "epoch": 0.62, + "grad_norm": 0.2561697275636995, + "learning_rate": 0.00016483918575510638, + "loss": 1.0719, + "step": 6515 + }, + { + "epoch": 0.62, + "grad_norm": 0.29441141071756, + "learning_rate": 0.00016482714145847488, + "loss": 1.1003, + "step": 6516 + }, + { + "epoch": 0.62, + "grad_norm": 0.2507495981732362, + "learning_rate": 0.0001648150955395034, + "loss": 1.0107, + "step": 6517 + }, + { + "epoch": 0.62, + "grad_norm": 0.2745896765591778, + "learning_rate": 0.00016480304799849343, + "loss": 1.0221, + "step": 6518 + }, + { + "epoch": 0.62, + "grad_norm": 0.2956074131649605, + "learning_rate": 0.00016479099883574648, + "loss": 1.0595, + "step": 6519 + }, + { + "epoch": 0.62, + "grad_norm": 0.2755124978473152, + "learning_rate": 0.00016477894805156404, + "loss": 0.9841, + "step": 6520 + }, + { + "epoch": 0.62, + "grad_norm": 0.2653332774638079, + "learning_rate": 0.00016476689564624773, + "loss": 1.1523, + "step": 6521 + }, + { + "epoch": 0.62, + "grad_norm": 0.28336035642857826, + "learning_rate": 0.00016475484162009913, + "loss": 1.0458, + "step": 6522 + }, + { + "epoch": 0.62, + "grad_norm": 0.27133324752026045, + "learning_rate": 0.00016474278597341995, + "loss": 1.0068, + "step": 6523 + }, + { + "epoch": 0.62, + "grad_norm": 0.254789334192467, + "learning_rate": 0.00016473072870651183, + "loss": 1.0911, + "step": 6524 + }, + { + "epoch": 0.62, + "grad_norm": 0.2645683646629639, + "learning_rate": 0.00016471866981967654, + "loss": 1.0735, + "step": 6525 + }, + { + "epoch": 0.62, + "grad_norm": 0.29514733454194775, + "learning_rate": 0.0001647066093132159, + "loss": 1.1258, + "step": 6526 + }, + { + "epoch": 0.62, + "grad_norm": 0.30773726775104837, + "learning_rate": 0.00016469454718743166, + "loss": 1.2085, + "step": 6527 + }, + { + "epoch": 0.62, + "grad_norm": 0.28753719268726, + "learning_rate": 0.00016468248344262575, + "loss": 1.1034, + "step": 6528 + }, + { + "epoch": 0.62, + "grad_norm": 0.28935088461393205, + "learning_rate": 0.00016467041807910002, + "loss": 0.9435, + "step": 6529 + }, + { + "epoch": 0.62, + "grad_norm": 0.2910181115331406, + "learning_rate": 0.00016465835109715643, + "loss": 0.8991, + "step": 6530 + }, + { + "epoch": 0.62, + "grad_norm": 0.26222425373411046, + "learning_rate": 0.00016464628249709699, + "loss": 1.1253, + "step": 6531 + }, + { + "epoch": 0.62, + "grad_norm": 0.24656232690052832, + "learning_rate": 0.00016463421227922367, + "loss": 0.979, + "step": 6532 + }, + { + "epoch": 0.63, + "grad_norm": 0.2997042750535666, + "learning_rate": 0.0001646221404438386, + "loss": 1.0947, + "step": 6533 + }, + { + "epoch": 0.63, + "grad_norm": 0.26285239590686826, + "learning_rate": 0.0001646100669912438, + "loss": 0.9987, + "step": 6534 + }, + { + "epoch": 0.63, + "grad_norm": 0.2810386117270015, + "learning_rate": 0.00016459799192174152, + "loss": 1.08, + "step": 6535 + }, + { + "epoch": 0.63, + "grad_norm": 0.2899490777892802, + "learning_rate": 0.0001645859152356339, + "loss": 1.0636, + "step": 6536 + }, + { + "epoch": 0.63, + "grad_norm": 0.32728289531293414, + "learning_rate": 0.00016457383693322314, + "loss": 1.1029, + "step": 6537 + }, + { + "epoch": 0.63, + "grad_norm": 0.2873444747610267, + "learning_rate": 0.0001645617570148115, + "loss": 1.1222, + "step": 6538 + }, + { + "epoch": 0.63, + "grad_norm": 0.26266152625659334, + "learning_rate": 0.00016454967548070135, + "loss": 1.1339, + "step": 6539 + }, + { + "epoch": 0.63, + "grad_norm": 0.2803672145152364, + "learning_rate": 0.00016453759233119503, + "loss": 1.094, + "step": 6540 + }, + { + "epoch": 0.63, + "grad_norm": 0.27084575822983264, + "learning_rate": 0.00016452550756659482, + "loss": 0.937, + "step": 6541 + }, + { + "epoch": 0.63, + "grad_norm": 0.2701911111109992, + "learning_rate": 0.00016451342118720328, + "loss": 1.029, + "step": 6542 + }, + { + "epoch": 0.63, + "grad_norm": 0.2716711794064477, + "learning_rate": 0.00016450133319332282, + "loss": 0.8668, + "step": 6543 + }, + { + "epoch": 0.63, + "grad_norm": 0.2794432093629184, + "learning_rate": 0.00016448924358525595, + "loss": 1.0109, + "step": 6544 + }, + { + "epoch": 0.63, + "grad_norm": 0.28134829606945316, + "learning_rate": 0.00016447715236330524, + "loss": 1.191, + "step": 6545 + }, + { + "epoch": 0.63, + "grad_norm": 0.22932974730919156, + "learning_rate": 0.0001644650595277733, + "loss": 1.0067, + "step": 6546 + }, + { + "epoch": 0.63, + "grad_norm": 0.2627137036762324, + "learning_rate": 0.00016445296507896267, + "loss": 1.0458, + "step": 6547 + }, + { + "epoch": 0.63, + "grad_norm": 0.31351571153357294, + "learning_rate": 0.00016444086901717614, + "loss": 1.0699, + "step": 6548 + }, + { + "epoch": 0.63, + "grad_norm": 0.3113263872681726, + "learning_rate": 0.00016442877134271633, + "loss": 1.1383, + "step": 6549 + }, + { + "epoch": 0.63, + "grad_norm": 0.24271597078080498, + "learning_rate": 0.00016441667205588603, + "loss": 0.9184, + "step": 6550 + }, + { + "epoch": 0.63, + "grad_norm": 0.27790009137065697, + "learning_rate": 0.00016440457115698802, + "loss": 1.2613, + "step": 6551 + }, + { + "epoch": 0.63, + "grad_norm": 0.27765509036577246, + "learning_rate": 0.0001643924686463252, + "loss": 1.1003, + "step": 6552 + }, + { + "epoch": 0.63, + "grad_norm": 0.29484287527906927, + "learning_rate": 0.00016438036452420032, + "loss": 1.0934, + "step": 6553 + }, + { + "epoch": 0.63, + "grad_norm": 0.29018553943805164, + "learning_rate": 0.00016436825879091635, + "loss": 1.0676, + "step": 6554 + }, + { + "epoch": 0.63, + "grad_norm": 0.28130026896984306, + "learning_rate": 0.00016435615144677629, + "loss": 1.1587, + "step": 6555 + }, + { + "epoch": 0.63, + "grad_norm": 0.2878221364125203, + "learning_rate": 0.00016434404249208306, + "loss": 1.0381, + "step": 6556 + }, + { + "epoch": 0.63, + "grad_norm": 0.2777652669244412, + "learning_rate": 0.00016433193192713974, + "loss": 1.0429, + "step": 6557 + }, + { + "epoch": 0.63, + "grad_norm": 0.26453569833758483, + "learning_rate": 0.00016431981975224938, + "loss": 1.1111, + "step": 6558 + }, + { + "epoch": 0.63, + "grad_norm": 0.28622435501200705, + "learning_rate": 0.00016430770596771512, + "loss": 1.0154, + "step": 6559 + }, + { + "epoch": 0.63, + "grad_norm": 0.2847988417285998, + "learning_rate": 0.00016429559057384011, + "loss": 0.9996, + "step": 6560 + }, + { + "epoch": 0.63, + "grad_norm": 0.26384397601020537, + "learning_rate": 0.00016428347357092755, + "loss": 0.9437, + "step": 6561 + }, + { + "epoch": 0.63, + "grad_norm": 0.2775557302168176, + "learning_rate": 0.00016427135495928062, + "loss": 1.1182, + "step": 6562 + }, + { + "epoch": 0.63, + "grad_norm": 0.30234120956824934, + "learning_rate": 0.00016425923473920267, + "loss": 1.0162, + "step": 6563 + }, + { + "epoch": 0.63, + "grad_norm": 0.2667444994608358, + "learning_rate": 0.000164247112910997, + "loss": 0.9534, + "step": 6564 + }, + { + "epoch": 0.63, + "grad_norm": 0.295387882239055, + "learning_rate": 0.0001642349894749669, + "loss": 1.1292, + "step": 6565 + }, + { + "epoch": 0.63, + "grad_norm": 0.25364552624174114, + "learning_rate": 0.00016422286443141585, + "loss": 1.0281, + "step": 6566 + }, + { + "epoch": 0.63, + "grad_norm": 0.30870163058659145, + "learning_rate": 0.00016421073778064726, + "loss": 0.9925, + "step": 6567 + }, + { + "epoch": 0.63, + "grad_norm": 0.31418016446029906, + "learning_rate": 0.0001641986095229646, + "loss": 1.0183, + "step": 6568 + }, + { + "epoch": 0.63, + "grad_norm": 0.30568890062391635, + "learning_rate": 0.0001641864796586714, + "loss": 1.1813, + "step": 6569 + }, + { + "epoch": 0.63, + "grad_norm": 0.28403262150412845, + "learning_rate": 0.00016417434818807118, + "loss": 1.1134, + "step": 6570 + }, + { + "epoch": 0.63, + "grad_norm": 0.26397330303772965, + "learning_rate": 0.00016416221511146757, + "loss": 1.113, + "step": 6571 + }, + { + "epoch": 0.63, + "grad_norm": 0.2863559947419068, + "learning_rate": 0.0001641500804291642, + "loss": 1.1201, + "step": 6572 + }, + { + "epoch": 0.63, + "grad_norm": 0.24406593687365424, + "learning_rate": 0.00016413794414146476, + "loss": 1.0542, + "step": 6573 + }, + { + "epoch": 0.63, + "grad_norm": 0.27139228105504454, + "learning_rate": 0.00016412580624867299, + "loss": 1.012, + "step": 6574 + }, + { + "epoch": 0.63, + "grad_norm": 0.27171095999953015, + "learning_rate": 0.00016411366675109256, + "loss": 0.9942, + "step": 6575 + }, + { + "epoch": 0.63, + "grad_norm": 0.2692825756718052, + "learning_rate": 0.00016410152564902734, + "loss": 1.0619, + "step": 6576 + }, + { + "epoch": 0.63, + "grad_norm": 0.2694897408553471, + "learning_rate": 0.00016408938294278118, + "loss": 1.1153, + "step": 6577 + }, + { + "epoch": 0.63, + "grad_norm": 0.2946068069084391, + "learning_rate": 0.0001640772386326579, + "loss": 0.9959, + "step": 6578 + }, + { + "epoch": 0.63, + "grad_norm": 0.2732041052214766, + "learning_rate": 0.0001640650927189615, + "loss": 1.1325, + "step": 6579 + }, + { + "epoch": 0.63, + "grad_norm": 0.25828674961643106, + "learning_rate": 0.00016405294520199586, + "loss": 1.0311, + "step": 6580 + }, + { + "epoch": 0.63, + "grad_norm": 0.2528935519865577, + "learning_rate": 0.000164040796082065, + "loss": 1.0198, + "step": 6581 + }, + { + "epoch": 0.63, + "grad_norm": 0.24788820972140796, + "learning_rate": 0.00016402864535947298, + "loss": 1.0777, + "step": 6582 + }, + { + "epoch": 0.63, + "grad_norm": 0.27027895661109375, + "learning_rate": 0.00016401649303452386, + "loss": 0.9911, + "step": 6583 + }, + { + "epoch": 0.63, + "grad_norm": 0.29776619082303163, + "learning_rate": 0.0001640043391075218, + "loss": 0.995, + "step": 6584 + }, + { + "epoch": 0.63, + "grad_norm": 0.25164405759102015, + "learning_rate": 0.0001639921835787709, + "loss": 1.0057, + "step": 6585 + }, + { + "epoch": 0.63, + "grad_norm": 0.28442402581138315, + "learning_rate": 0.00016398002644857538, + "loss": 1.0983, + "step": 6586 + }, + { + "epoch": 0.63, + "grad_norm": 0.27597236003640674, + "learning_rate": 0.00016396786771723953, + "loss": 1.0816, + "step": 6587 + }, + { + "epoch": 0.63, + "grad_norm": 0.3036495741618543, + "learning_rate": 0.00016395570738506754, + "loss": 1.0313, + "step": 6588 + }, + { + "epoch": 0.63, + "grad_norm": 0.2983086346865117, + "learning_rate": 0.0001639435454523638, + "loss": 1.1382, + "step": 6589 + }, + { + "epoch": 0.63, + "grad_norm": 0.24415345911957764, + "learning_rate": 0.00016393138191943266, + "loss": 0.9764, + "step": 6590 + }, + { + "epoch": 0.63, + "grad_norm": 0.26401568236315054, + "learning_rate": 0.0001639192167865785, + "loss": 1.0753, + "step": 6591 + }, + { + "epoch": 0.63, + "grad_norm": 0.30616078316140916, + "learning_rate": 0.00016390705005410577, + "loss": 1.1654, + "step": 6592 + }, + { + "epoch": 0.63, + "grad_norm": 0.268584602967286, + "learning_rate": 0.00016389488172231895, + "loss": 1.1281, + "step": 6593 + }, + { + "epoch": 0.63, + "grad_norm": 0.3042044038945393, + "learning_rate": 0.00016388271179152255, + "loss": 1.1451, + "step": 6594 + }, + { + "epoch": 0.63, + "grad_norm": 0.30865017852012816, + "learning_rate": 0.00016387054026202114, + "loss": 1.0497, + "step": 6595 + }, + { + "epoch": 0.63, + "grad_norm": 0.2981587654841934, + "learning_rate": 0.00016385836713411932, + "loss": 1.0117, + "step": 6596 + }, + { + "epoch": 0.63, + "grad_norm": 0.30634610666151335, + "learning_rate": 0.00016384619240812173, + "loss": 1.0624, + "step": 6597 + }, + { + "epoch": 0.63, + "grad_norm": 0.31330552508402865, + "learning_rate": 0.00016383401608433305, + "loss": 1.0021, + "step": 6598 + }, + { + "epoch": 0.63, + "grad_norm": 0.2746478862766659, + "learning_rate": 0.00016382183816305798, + "loss": 1.092, + "step": 6599 + }, + { + "epoch": 0.63, + "grad_norm": 0.31666878049663144, + "learning_rate": 0.00016380965864460135, + "loss": 1.1224, + "step": 6600 + }, + { + "epoch": 0.63, + "grad_norm": 0.2805806328462776, + "learning_rate": 0.00016379747752926787, + "loss": 1.0326, + "step": 6601 + }, + { + "epoch": 0.63, + "grad_norm": 0.27859538132326334, + "learning_rate": 0.00016378529481736242, + "loss": 1.0901, + "step": 6602 + }, + { + "epoch": 0.63, + "grad_norm": 0.27971190411521635, + "learning_rate": 0.00016377311050918989, + "loss": 1.0349, + "step": 6603 + }, + { + "epoch": 0.63, + "grad_norm": 0.35928083318694953, + "learning_rate": 0.0001637609246050552, + "loss": 1.0325, + "step": 6604 + }, + { + "epoch": 0.63, + "grad_norm": 0.2791809789319585, + "learning_rate": 0.00016374873710526327, + "loss": 1.101, + "step": 6605 + }, + { + "epoch": 0.63, + "grad_norm": 0.25414723944354833, + "learning_rate": 0.00016373654801011913, + "loss": 0.9845, + "step": 6606 + }, + { + "epoch": 0.63, + "grad_norm": 0.29501863717394433, + "learning_rate": 0.00016372435731992784, + "loss": 1.0297, + "step": 6607 + }, + { + "epoch": 0.63, + "grad_norm": 0.27739025744682017, + "learning_rate": 0.00016371216503499443, + "loss": 1.1649, + "step": 6608 + }, + { + "epoch": 0.63, + "grad_norm": 0.27992773939480853, + "learning_rate": 0.0001636999711556241, + "loss": 1.1623, + "step": 6609 + }, + { + "epoch": 0.63, + "grad_norm": 0.256055609384181, + "learning_rate": 0.00016368777568212192, + "loss": 1.0349, + "step": 6610 + }, + { + "epoch": 0.63, + "grad_norm": 0.27352283442582775, + "learning_rate": 0.00016367557861479316, + "loss": 1.081, + "step": 6611 + }, + { + "epoch": 0.63, + "grad_norm": 0.2878898287968566, + "learning_rate": 0.00016366337995394296, + "loss": 1.0915, + "step": 6612 + }, + { + "epoch": 0.63, + "grad_norm": 0.29458815084779255, + "learning_rate": 0.0001636511796998767, + "loss": 1.0093, + "step": 6613 + }, + { + "epoch": 0.63, + "grad_norm": 0.2833302474424508, + "learning_rate": 0.0001636389778528997, + "loss": 1.1375, + "step": 6614 + }, + { + "epoch": 0.63, + "grad_norm": 0.2827351352279611, + "learning_rate": 0.00016362677441331727, + "loss": 1.133, + "step": 6615 + }, + { + "epoch": 0.63, + "grad_norm": 0.2776267400325992, + "learning_rate": 0.0001636145693814348, + "loss": 1.082, + "step": 6616 + }, + { + "epoch": 0.63, + "grad_norm": 0.29210767304951707, + "learning_rate": 0.00016360236275755777, + "loss": 1.0961, + "step": 6617 + }, + { + "epoch": 0.63, + "grad_norm": 0.3111465892344082, + "learning_rate": 0.00016359015454199161, + "loss": 1.0517, + "step": 6618 + }, + { + "epoch": 0.63, + "grad_norm": 0.30230280360893014, + "learning_rate": 0.0001635779447350419, + "loss": 1.0257, + "step": 6619 + }, + { + "epoch": 0.63, + "grad_norm": 0.2640103618422073, + "learning_rate": 0.00016356573333701414, + "loss": 0.9745, + "step": 6620 + }, + { + "epoch": 0.63, + "grad_norm": 0.27301899027846943, + "learning_rate": 0.00016355352034821396, + "loss": 0.974, + "step": 6621 + }, + { + "epoch": 0.63, + "grad_norm": 0.2738570363795273, + "learning_rate": 0.00016354130576894698, + "loss": 0.995, + "step": 6622 + }, + { + "epoch": 0.63, + "grad_norm": 0.29413026629702177, + "learning_rate": 0.00016352908959951892, + "loss": 1.16, + "step": 6623 + }, + { + "epoch": 0.63, + "grad_norm": 0.30276135434523793, + "learning_rate": 0.00016351687184023547, + "loss": 1.0801, + "step": 6624 + }, + { + "epoch": 0.63, + "grad_norm": 0.26212091458025155, + "learning_rate": 0.00016350465249140235, + "loss": 1.1354, + "step": 6625 + }, + { + "epoch": 0.63, + "grad_norm": 0.295200448114549, + "learning_rate": 0.0001634924315533254, + "loss": 1.1294, + "step": 6626 + }, + { + "epoch": 0.63, + "grad_norm": 0.2719331912386029, + "learning_rate": 0.00016348020902631047, + "loss": 1.0, + "step": 6627 + }, + { + "epoch": 0.63, + "grad_norm": 0.2801548078187952, + "learning_rate": 0.0001634679849106634, + "loss": 1.0472, + "step": 6628 + }, + { + "epoch": 0.63, + "grad_norm": 0.28890264685867756, + "learning_rate": 0.0001634557592066901, + "loss": 1.159, + "step": 6629 + }, + { + "epoch": 0.63, + "grad_norm": 0.2972148378840613, + "learning_rate": 0.00016344353191469657, + "loss": 1.0649, + "step": 6630 + }, + { + "epoch": 0.63, + "grad_norm": 0.30972438758134824, + "learning_rate": 0.00016343130303498877, + "loss": 1.1313, + "step": 6631 + }, + { + "epoch": 0.63, + "grad_norm": 0.2787430037836057, + "learning_rate": 0.00016341907256787273, + "loss": 1.0461, + "step": 6632 + }, + { + "epoch": 0.63, + "grad_norm": 0.2509205560032734, + "learning_rate": 0.00016340684051365458, + "loss": 1.1465, + "step": 6633 + }, + { + "epoch": 0.63, + "grad_norm": 0.3371469012042158, + "learning_rate": 0.00016339460687264039, + "loss": 0.9917, + "step": 6634 + }, + { + "epoch": 0.63, + "grad_norm": 0.28031780574967835, + "learning_rate": 0.0001633823716451363, + "loss": 1.0859, + "step": 6635 + }, + { + "epoch": 0.63, + "grad_norm": 0.32014706377626473, + "learning_rate": 0.00016337013483144853, + "loss": 1.0125, + "step": 6636 + }, + { + "epoch": 0.63, + "grad_norm": 0.28048230557063836, + "learning_rate": 0.00016335789643188333, + "loss": 1.119, + "step": 6637 + }, + { + "epoch": 0.64, + "grad_norm": 0.27484347493922334, + "learning_rate": 0.0001633456564467469, + "loss": 1.001, + "step": 6638 + }, + { + "epoch": 0.64, + "grad_norm": 0.26589912392685283, + "learning_rate": 0.00016333341487634567, + "loss": 1.2356, + "step": 6639 + }, + { + "epoch": 0.64, + "grad_norm": 0.3261860350977622, + "learning_rate": 0.0001633211717209859, + "loss": 1.1768, + "step": 6640 + }, + { + "epoch": 0.64, + "grad_norm": 0.30705986319442513, + "learning_rate": 0.000163308926980974, + "loss": 1.072, + "step": 6641 + }, + { + "epoch": 0.64, + "grad_norm": 0.3098762264842706, + "learning_rate": 0.00016329668065661644, + "loss": 1.088, + "step": 6642 + }, + { + "epoch": 0.64, + "grad_norm": 0.24610152808075564, + "learning_rate": 0.00016328443274821964, + "loss": 1.0679, + "step": 6643 + }, + { + "epoch": 0.64, + "grad_norm": 0.293387945611742, + "learning_rate": 0.00016327218325609018, + "loss": 1.0764, + "step": 6644 + }, + { + "epoch": 0.64, + "grad_norm": 0.24880390773011782, + "learning_rate": 0.0001632599321805345, + "loss": 1.1687, + "step": 6645 + }, + { + "epoch": 0.64, + "grad_norm": 0.2871610507138743, + "learning_rate": 0.00016324767952185932, + "loss": 1.0554, + "step": 6646 + }, + { + "epoch": 0.64, + "grad_norm": 0.33306363001968703, + "learning_rate": 0.00016323542528037116, + "loss": 1.066, + "step": 6647 + }, + { + "epoch": 0.64, + "grad_norm": 0.2601315042428321, + "learning_rate": 0.0001632231694563768, + "loss": 1.062, + "step": 6648 + }, + { + "epoch": 0.64, + "grad_norm": 0.3122750901555906, + "learning_rate": 0.00016321091205018283, + "loss": 0.988, + "step": 6649 + }, + { + "epoch": 0.64, + "grad_norm": 0.2530794057553161, + "learning_rate": 0.0001631986530620961, + "loss": 0.9536, + "step": 6650 + }, + { + "epoch": 0.64, + "grad_norm": 0.25248623075991256, + "learning_rate": 0.00016318639249242336, + "loss": 1.0528, + "step": 6651 + }, + { + "epoch": 0.64, + "grad_norm": 0.25241303157165856, + "learning_rate": 0.00016317413034147143, + "loss": 0.9887, + "step": 6652 + }, + { + "epoch": 0.64, + "grad_norm": 0.25151161516493253, + "learning_rate": 0.00016316186660954716, + "loss": 1.0605, + "step": 6653 + }, + { + "epoch": 0.64, + "grad_norm": 0.2537064242578609, + "learning_rate": 0.0001631496012969575, + "loss": 1.0014, + "step": 6654 + }, + { + "epoch": 0.64, + "grad_norm": 0.26974830206999884, + "learning_rate": 0.00016313733440400941, + "loss": 1.1359, + "step": 6655 + }, + { + "epoch": 0.64, + "grad_norm": 0.28423801348421174, + "learning_rate": 0.0001631250659310098, + "loss": 1.0911, + "step": 6656 + }, + { + "epoch": 0.64, + "grad_norm": 0.3067572434414906, + "learning_rate": 0.00016311279587826575, + "loss": 1.1511, + "step": 6657 + }, + { + "epoch": 0.64, + "grad_norm": 0.30349651329317273, + "learning_rate": 0.00016310052424608435, + "loss": 1.0717, + "step": 6658 + }, + { + "epoch": 0.64, + "grad_norm": 0.31106706237441717, + "learning_rate": 0.00016308825103477262, + "loss": 1.0138, + "step": 6659 + }, + { + "epoch": 0.64, + "grad_norm": 0.30512529577510444, + "learning_rate": 0.0001630759762446378, + "loss": 1.1175, + "step": 6660 + }, + { + "epoch": 0.64, + "grad_norm": 0.3156400341146579, + "learning_rate": 0.00016306369987598705, + "loss": 1.0594, + "step": 6661 + }, + { + "epoch": 0.64, + "grad_norm": 0.3024624555405739, + "learning_rate": 0.00016305142192912754, + "loss": 1.1203, + "step": 6662 + }, + { + "epoch": 0.64, + "grad_norm": 0.24676397866245015, + "learning_rate": 0.00016303914240436656, + "loss": 1.0936, + "step": 6663 + }, + { + "epoch": 0.64, + "grad_norm": 0.2585771707617533, + "learning_rate": 0.00016302686130201144, + "loss": 1.1232, + "step": 6664 + }, + { + "epoch": 0.64, + "grad_norm": 0.2726696313152824, + "learning_rate": 0.00016301457862236954, + "loss": 1.0913, + "step": 6665 + }, + { + "epoch": 0.64, + "grad_norm": 0.2760746274104927, + "learning_rate": 0.00016300229436574815, + "loss": 0.9746, + "step": 6666 + }, + { + "epoch": 0.64, + "grad_norm": 0.2755395263384695, + "learning_rate": 0.00016299000853245475, + "loss": 1.0183, + "step": 6667 + }, + { + "epoch": 0.64, + "grad_norm": 0.27852069729003387, + "learning_rate": 0.00016297772112279683, + "loss": 1.0574, + "step": 6668 + }, + { + "epoch": 0.64, + "grad_norm": 0.27979974439621436, + "learning_rate": 0.00016296543213708184, + "loss": 0.9499, + "step": 6669 + }, + { + "epoch": 0.64, + "grad_norm": 0.2960060531441859, + "learning_rate": 0.00016295314157561736, + "loss": 1.0549, + "step": 6670 + }, + { + "epoch": 0.64, + "grad_norm": 0.2660000695711491, + "learning_rate": 0.00016294084943871092, + "loss": 1.1072, + "step": 6671 + }, + { + "epoch": 0.64, + "grad_norm": 0.2564269859230904, + "learning_rate": 0.0001629285557266702, + "loss": 1.1333, + "step": 6672 + }, + { + "epoch": 0.64, + "grad_norm": 0.27504533401306863, + "learning_rate": 0.00016291626043980282, + "loss": 1.0034, + "step": 6673 + }, + { + "epoch": 0.64, + "grad_norm": 0.3099324729512832, + "learning_rate": 0.00016290396357841646, + "loss": 1.0459, + "step": 6674 + }, + { + "epoch": 0.64, + "grad_norm": 0.2844820883790574, + "learning_rate": 0.00016289166514281888, + "loss": 0.9184, + "step": 6675 + }, + { + "epoch": 0.64, + "grad_norm": 0.27114882440422344, + "learning_rate": 0.00016287936513331787, + "loss": 1.1727, + "step": 6676 + }, + { + "epoch": 0.64, + "grad_norm": 0.2720012225716546, + "learning_rate": 0.00016286706355022118, + "loss": 1.0654, + "step": 6677 + }, + { + "epoch": 0.64, + "grad_norm": 0.2916372120440236, + "learning_rate": 0.00016285476039383675, + "loss": 1.0744, + "step": 6678 + }, + { + "epoch": 0.64, + "grad_norm": 0.27052138249686447, + "learning_rate": 0.00016284245566447245, + "loss": 1.1014, + "step": 6679 + }, + { + "epoch": 0.64, + "grad_norm": 0.2652236414824706, + "learning_rate": 0.0001628301493624362, + "loss": 1.1672, + "step": 6680 + }, + { + "epoch": 0.64, + "grad_norm": 0.27950873661043685, + "learning_rate": 0.00016281784148803596, + "loss": 1.0014, + "step": 6681 + }, + { + "epoch": 0.64, + "grad_norm": 0.2827281504818738, + "learning_rate": 0.0001628055320415798, + "loss": 0.9676, + "step": 6682 + }, + { + "epoch": 0.64, + "grad_norm": 0.2609854109436044, + "learning_rate": 0.00016279322102337565, + "loss": 0.8978, + "step": 6683 + }, + { + "epoch": 0.64, + "grad_norm": 0.24726688155868715, + "learning_rate": 0.00016278090843373173, + "loss": 1.0843, + "step": 6684 + }, + { + "epoch": 0.64, + "grad_norm": 0.3193189601858733, + "learning_rate": 0.00016276859427295613, + "loss": 1.028, + "step": 6685 + }, + { + "epoch": 0.64, + "grad_norm": 0.2845484123594936, + "learning_rate": 0.00016275627854135698, + "loss": 0.9256, + "step": 6686 + }, + { + "epoch": 0.64, + "grad_norm": 0.29028230142099054, + "learning_rate": 0.00016274396123924252, + "loss": 1.0812, + "step": 6687 + }, + { + "epoch": 0.64, + "grad_norm": 0.32773446539660916, + "learning_rate": 0.000162731642366921, + "loss": 1.0608, + "step": 6688 + }, + { + "epoch": 0.64, + "grad_norm": 0.29790785005575593, + "learning_rate": 0.00016271932192470074, + "loss": 1.0306, + "step": 6689 + }, + { + "epoch": 0.64, + "grad_norm": 0.27883966783277836, + "learning_rate": 0.00016270699991289, + "loss": 1.0423, + "step": 6690 + }, + { + "epoch": 0.64, + "grad_norm": 0.28349188815486853, + "learning_rate": 0.0001626946763317972, + "loss": 1.0897, + "step": 6691 + }, + { + "epoch": 0.64, + "grad_norm": 0.3054195899080486, + "learning_rate": 0.00016268235118173068, + "loss": 1.0126, + "step": 6692 + }, + { + "epoch": 0.64, + "grad_norm": 0.297957213586038, + "learning_rate": 0.00016267002446299891, + "loss": 1.0832, + "step": 6693 + }, + { + "epoch": 0.64, + "grad_norm": 0.2786420902687923, + "learning_rate": 0.00016265769617591046, + "loss": 1.1095, + "step": 6694 + }, + { + "epoch": 0.64, + "grad_norm": 0.28426733069435445, + "learning_rate": 0.00016264536632077376, + "loss": 1.0612, + "step": 6695 + }, + { + "epoch": 0.64, + "grad_norm": 0.27433860519495945, + "learning_rate": 0.0001626330348978974, + "loss": 1.0349, + "step": 6696 + }, + { + "epoch": 0.64, + "grad_norm": 0.317775496735832, + "learning_rate": 0.00016262070190758995, + "loss": 1.1712, + "step": 6697 + }, + { + "epoch": 0.64, + "grad_norm": 0.2568267188069798, + "learning_rate": 0.00016260836735016012, + "loss": 0.9826, + "step": 6698 + }, + { + "epoch": 0.64, + "grad_norm": 0.2852886758776049, + "learning_rate": 0.00016259603122591653, + "loss": 1.0588, + "step": 6699 + }, + { + "epoch": 0.64, + "grad_norm": 0.28264369508139864, + "learning_rate": 0.0001625836935351679, + "loss": 1.139, + "step": 6700 + }, + { + "epoch": 0.64, + "grad_norm": 0.3102222867034462, + "learning_rate": 0.00016257135427822302, + "loss": 1.0697, + "step": 6701 + }, + { + "epoch": 0.64, + "grad_norm": 0.268465641442644, + "learning_rate": 0.00016255901345539072, + "loss": 1.0887, + "step": 6702 + }, + { + "epoch": 0.64, + "grad_norm": 0.30026142767107994, + "learning_rate": 0.00016254667106697972, + "loss": 1.1509, + "step": 6703 + }, + { + "epoch": 0.64, + "grad_norm": 0.2706314630840685, + "learning_rate": 0.000162534327113299, + "loss": 1.0657, + "step": 6704 + }, + { + "epoch": 0.64, + "grad_norm": 0.26773703311582037, + "learning_rate": 0.00016252198159465744, + "loss": 1.1287, + "step": 6705 + }, + { + "epoch": 0.64, + "grad_norm": 0.2957605324588231, + "learning_rate": 0.000162509634511364, + "loss": 1.2115, + "step": 6706 + }, + { + "epoch": 0.64, + "grad_norm": 0.29017697603422365, + "learning_rate": 0.00016249728586372765, + "loss": 1.0255, + "step": 6707 + }, + { + "epoch": 0.64, + "grad_norm": 0.27496495480231525, + "learning_rate": 0.0001624849356520575, + "loss": 1.0261, + "step": 6708 + }, + { + "epoch": 0.64, + "grad_norm": 0.26386368126415394, + "learning_rate": 0.0001624725838766625, + "loss": 1.1855, + "step": 6709 + }, + { + "epoch": 0.64, + "grad_norm": 0.27191856479446785, + "learning_rate": 0.00016246023053785184, + "loss": 1.033, + "step": 6710 + }, + { + "epoch": 0.64, + "grad_norm": 0.26761649181678265, + "learning_rate": 0.0001624478756359347, + "loss": 1.137, + "step": 6711 + }, + { + "epoch": 0.64, + "grad_norm": 0.26810717439455456, + "learning_rate": 0.00016243551917122017, + "loss": 1.169, + "step": 6712 + }, + { + "epoch": 0.64, + "grad_norm": 0.23716095057095146, + "learning_rate": 0.00016242316114401754, + "loss": 0.9461, + "step": 6713 + }, + { + "epoch": 0.64, + "grad_norm": 0.2820129032334972, + "learning_rate": 0.00016241080155463613, + "loss": 1.028, + "step": 6714 + }, + { + "epoch": 0.64, + "grad_norm": 0.29187354158124584, + "learning_rate": 0.00016239844040338513, + "loss": 1.1075, + "step": 6715 + }, + { + "epoch": 0.64, + "grad_norm": 0.2707882691645488, + "learning_rate": 0.00016238607769057396, + "loss": 1.1043, + "step": 6716 + }, + { + "epoch": 0.64, + "grad_norm": 0.27825828988627205, + "learning_rate": 0.00016237371341651198, + "loss": 0.9317, + "step": 6717 + }, + { + "epoch": 0.64, + "grad_norm": 0.24456292837088042, + "learning_rate": 0.00016236134758150863, + "loss": 1.115, + "step": 6718 + }, + { + "epoch": 0.64, + "grad_norm": 0.30642644995469454, + "learning_rate": 0.00016234898018587337, + "loss": 1.0855, + "step": 6719 + }, + { + "epoch": 0.64, + "grad_norm": 0.2536012594797928, + "learning_rate": 0.00016233661122991568, + "loss": 1.0347, + "step": 6720 + }, + { + "epoch": 0.64, + "grad_norm": 0.28201291743191076, + "learning_rate": 0.00016232424071394513, + "loss": 0.923, + "step": 6721 + }, + { + "epoch": 0.64, + "grad_norm": 0.26500412097642506, + "learning_rate": 0.00016231186863827128, + "loss": 1.1197, + "step": 6722 + }, + { + "epoch": 0.64, + "grad_norm": 0.28336108524806514, + "learning_rate": 0.00016229949500320376, + "loss": 0.9663, + "step": 6723 + }, + { + "epoch": 0.64, + "grad_norm": 0.30535153183981684, + "learning_rate": 0.00016228711980905222, + "loss": 1.0546, + "step": 6724 + }, + { + "epoch": 0.64, + "grad_norm": 0.2738980528128915, + "learning_rate": 0.00016227474305612635, + "loss": 1.1652, + "step": 6725 + }, + { + "epoch": 0.64, + "grad_norm": 0.29696172073189514, + "learning_rate": 0.00016226236474473592, + "loss": 1.1307, + "step": 6726 + }, + { + "epoch": 0.64, + "grad_norm": 0.2575129900484908, + "learning_rate": 0.00016224998487519065, + "loss": 1.0553, + "step": 6727 + }, + { + "epoch": 0.64, + "grad_norm": 0.25508951971135796, + "learning_rate": 0.0001622376034478004, + "loss": 0.9931, + "step": 6728 + }, + { + "epoch": 0.64, + "grad_norm": 0.3026293198793625, + "learning_rate": 0.00016222522046287506, + "loss": 1.0979, + "step": 6729 + }, + { + "epoch": 0.64, + "grad_norm": 0.2598437183575833, + "learning_rate": 0.00016221283592072442, + "loss": 1.0128, + "step": 6730 + }, + { + "epoch": 0.64, + "grad_norm": 0.3163844838276975, + "learning_rate": 0.00016220044982165845, + "loss": 1.1538, + "step": 6731 + }, + { + "epoch": 0.64, + "grad_norm": 0.2362891477357362, + "learning_rate": 0.00016218806216598713, + "loss": 1.0506, + "step": 6732 + }, + { + "epoch": 0.64, + "grad_norm": 0.26662591447108, + "learning_rate": 0.00016217567295402052, + "loss": 0.962, + "step": 6733 + }, + { + "epoch": 0.64, + "grad_norm": 0.29725496528185136, + "learning_rate": 0.00016216328218606856, + "loss": 1.0977, + "step": 6734 + }, + { + "epoch": 0.64, + "grad_norm": 0.2571941755837014, + "learning_rate": 0.00016215088986244145, + "loss": 1.0301, + "step": 6735 + }, + { + "epoch": 0.64, + "grad_norm": 0.24442116258690824, + "learning_rate": 0.00016213849598344923, + "loss": 1.0068, + "step": 6736 + }, + { + "epoch": 0.64, + "grad_norm": 0.2747700753479199, + "learning_rate": 0.0001621261005494021, + "loss": 0.984, + "step": 6737 + }, + { + "epoch": 0.64, + "grad_norm": 0.2947494451070546, + "learning_rate": 0.00016211370356061024, + "loss": 0.9723, + "step": 6738 + }, + { + "epoch": 0.64, + "grad_norm": 0.28377559231615324, + "learning_rate": 0.00016210130501738393, + "loss": 1.1093, + "step": 6739 + }, + { + "epoch": 0.64, + "grad_norm": 0.281938530095994, + "learning_rate": 0.00016208890492003345, + "loss": 1.0411, + "step": 6740 + }, + { + "epoch": 0.64, + "grad_norm": 0.25782869247051643, + "learning_rate": 0.00016207650326886908, + "loss": 1.0448, + "step": 6741 + }, + { + "epoch": 0.65, + "grad_norm": 0.27863659241136324, + "learning_rate": 0.0001620641000642012, + "loss": 1.0766, + "step": 6742 + }, + { + "epoch": 0.65, + "grad_norm": 0.3034063935548644, + "learning_rate": 0.00016205169530634022, + "loss": 1.0668, + "step": 6743 + }, + { + "epoch": 0.65, + "grad_norm": 0.29522301302575077, + "learning_rate": 0.00016203928899559655, + "loss": 1.0437, + "step": 6744 + }, + { + "epoch": 0.65, + "grad_norm": 0.2407728356207695, + "learning_rate": 0.0001620268811322807, + "loss": 1.1312, + "step": 6745 + }, + { + "epoch": 0.65, + "grad_norm": 0.26324055091027326, + "learning_rate": 0.0001620144717167032, + "loss": 1.0093, + "step": 6746 + }, + { + "epoch": 0.65, + "grad_norm": 0.2804316768014678, + "learning_rate": 0.0001620020607491745, + "loss": 1.0229, + "step": 6747 + }, + { + "epoch": 0.65, + "grad_norm": 0.2753277098551558, + "learning_rate": 0.00016198964823000531, + "loss": 1.0634, + "step": 6748 + }, + { + "epoch": 0.65, + "grad_norm": 0.2800366285769209, + "learning_rate": 0.00016197723415950618, + "loss": 1.0365, + "step": 6749 + }, + { + "epoch": 0.65, + "grad_norm": 0.302146197809392, + "learning_rate": 0.00016196481853798783, + "loss": 0.9998, + "step": 6750 + }, + { + "epoch": 0.65, + "grad_norm": 0.3568635660133519, + "learning_rate": 0.00016195240136576098, + "loss": 1.1658, + "step": 6751 + }, + { + "epoch": 0.65, + "grad_norm": 0.2716917931898554, + "learning_rate": 0.00016193998264313632, + "loss": 1.0812, + "step": 6752 + }, + { + "epoch": 0.65, + "grad_norm": 0.27290621018790484, + "learning_rate": 0.0001619275623704247, + "loss": 1.094, + "step": 6753 + }, + { + "epoch": 0.65, + "grad_norm": 0.2682620960621536, + "learning_rate": 0.00016191514054793687, + "loss": 1.0381, + "step": 6754 + }, + { + "epoch": 0.65, + "grad_norm": 0.2845594233710903, + "learning_rate": 0.00016190271717598376, + "loss": 1.1278, + "step": 6755 + }, + { + "epoch": 0.65, + "grad_norm": 0.27355020118683776, + "learning_rate": 0.0001618902922548762, + "loss": 0.9993, + "step": 6756 + }, + { + "epoch": 0.65, + "grad_norm": 0.287166856164538, + "learning_rate": 0.00016187786578492527, + "loss": 1.1079, + "step": 6757 + }, + { + "epoch": 0.65, + "grad_norm": 0.31367583134610894, + "learning_rate": 0.00016186543776644177, + "loss": 1.0392, + "step": 6758 + }, + { + "epoch": 0.65, + "grad_norm": 0.26476803361212475, + "learning_rate": 0.00016185300819973687, + "loss": 1.0339, + "step": 6759 + }, + { + "epoch": 0.65, + "grad_norm": 0.2869935686558382, + "learning_rate": 0.00016184057708512156, + "loss": 1.0353, + "step": 6760 + }, + { + "epoch": 0.65, + "grad_norm": 0.2607335209317506, + "learning_rate": 0.0001618281444229069, + "loss": 1.0328, + "step": 6761 + }, + { + "epoch": 0.65, + "grad_norm": 0.2819258447169869, + "learning_rate": 0.0001618157102134041, + "loss": 1.0694, + "step": 6762 + }, + { + "epoch": 0.65, + "grad_norm": 0.2486602423602287, + "learning_rate": 0.0001618032744569243, + "loss": 1.0398, + "step": 6763 + }, + { + "epoch": 0.65, + "grad_norm": 0.2964468238045624, + "learning_rate": 0.0001617908371537787, + "loss": 1.1674, + "step": 6764 + }, + { + "epoch": 0.65, + "grad_norm": 0.28968276628283657, + "learning_rate": 0.00016177839830427862, + "loss": 1.0923, + "step": 6765 + }, + { + "epoch": 0.65, + "grad_norm": 0.30259410481968824, + "learning_rate": 0.00016176595790873526, + "loss": 1.1047, + "step": 6766 + }, + { + "epoch": 0.65, + "grad_norm": 0.299883854383291, + "learning_rate": 0.00016175351596745997, + "loss": 1.0987, + "step": 6767 + }, + { + "epoch": 0.65, + "grad_norm": 0.28742268599750015, + "learning_rate": 0.00016174107248076414, + "loss": 1.1337, + "step": 6768 + }, + { + "epoch": 0.65, + "grad_norm": 0.27078457130344585, + "learning_rate": 0.00016172862744895917, + "loss": 1.1273, + "step": 6769 + }, + { + "epoch": 0.65, + "grad_norm": 0.28329813420365574, + "learning_rate": 0.00016171618087235652, + "loss": 1.042, + "step": 6770 + }, + { + "epoch": 0.65, + "grad_norm": 0.2835411240856413, + "learning_rate": 0.00016170373275126761, + "loss": 1.1055, + "step": 6771 + }, + { + "epoch": 0.65, + "grad_norm": 0.2697682905187557, + "learning_rate": 0.00016169128308600404, + "loss": 1.1592, + "step": 6772 + }, + { + "epoch": 0.65, + "grad_norm": 0.2606901412284456, + "learning_rate": 0.00016167883187687737, + "loss": 1.0796, + "step": 6773 + }, + { + "epoch": 0.65, + "grad_norm": 0.2718079210220933, + "learning_rate": 0.0001616663791241991, + "loss": 0.9178, + "step": 6774 + }, + { + "epoch": 0.65, + "grad_norm": 0.2992698573782557, + "learning_rate": 0.00016165392482828098, + "loss": 0.9155, + "step": 6775 + }, + { + "epoch": 0.65, + "grad_norm": 0.2820273589437464, + "learning_rate": 0.00016164146898943463, + "loss": 1.1096, + "step": 6776 + }, + { + "epoch": 0.65, + "grad_norm": 0.27964729949513284, + "learning_rate": 0.00016162901160797182, + "loss": 1.0301, + "step": 6777 + }, + { + "epoch": 0.65, + "grad_norm": 0.30035527749172686, + "learning_rate": 0.0001616165526842042, + "loss": 1.0112, + "step": 6778 + }, + { + "epoch": 0.65, + "grad_norm": 0.26367756968433076, + "learning_rate": 0.0001616040922184437, + "loss": 1.062, + "step": 6779 + }, + { + "epoch": 0.65, + "grad_norm": 0.30454178782431157, + "learning_rate": 0.000161591630211002, + "loss": 1.0908, + "step": 6780 + }, + { + "epoch": 0.65, + "grad_norm": 0.2736709005007848, + "learning_rate": 0.0001615791666621911, + "loss": 1.0642, + "step": 6781 + }, + { + "epoch": 0.65, + "grad_norm": 0.28955239699459784, + "learning_rate": 0.00016156670157232278, + "loss": 1.038, + "step": 6782 + }, + { + "epoch": 0.65, + "grad_norm": 0.3055984799372455, + "learning_rate": 0.00016155423494170913, + "loss": 1.083, + "step": 6783 + }, + { + "epoch": 0.65, + "grad_norm": 0.28914856789930005, + "learning_rate": 0.00016154176677066204, + "loss": 1.0251, + "step": 6784 + }, + { + "epoch": 0.65, + "grad_norm": 0.2937920207039406, + "learning_rate": 0.00016152929705949356, + "loss": 1.0395, + "step": 6785 + }, + { + "epoch": 0.65, + "grad_norm": 0.26681380130500015, + "learning_rate": 0.00016151682580851576, + "loss": 1.1342, + "step": 6786 + }, + { + "epoch": 0.65, + "grad_norm": 0.31513605238395026, + "learning_rate": 0.00016150435301804072, + "loss": 1.1459, + "step": 6787 + }, + { + "epoch": 0.65, + "grad_norm": 0.2962395662354257, + "learning_rate": 0.0001614918786883806, + "loss": 0.9866, + "step": 6788 + }, + { + "epoch": 0.65, + "grad_norm": 0.27715689822063977, + "learning_rate": 0.00016147940281984754, + "loss": 1.0869, + "step": 6789 + }, + { + "epoch": 0.65, + "grad_norm": 0.261319217896763, + "learning_rate": 0.00016146692541275383, + "loss": 0.9879, + "step": 6790 + }, + { + "epoch": 0.65, + "grad_norm": 0.29020817288316764, + "learning_rate": 0.00016145444646741166, + "loss": 1.1018, + "step": 6791 + }, + { + "epoch": 0.65, + "grad_norm": 0.2564090714223762, + "learning_rate": 0.00016144196598413336, + "loss": 1.0812, + "step": 6792 + }, + { + "epoch": 0.65, + "grad_norm": 0.252997662859385, + "learning_rate": 0.00016142948396323124, + "loss": 1.0051, + "step": 6793 + }, + { + "epoch": 0.65, + "grad_norm": 0.27979442491585477, + "learning_rate": 0.00016141700040501767, + "loss": 1.0166, + "step": 6794 + }, + { + "epoch": 0.65, + "grad_norm": 0.2723199327047408, + "learning_rate": 0.00016140451530980503, + "loss": 1.0498, + "step": 6795 + }, + { + "epoch": 0.65, + "grad_norm": 0.24997068906233713, + "learning_rate": 0.00016139202867790586, + "loss": 1.0254, + "step": 6796 + }, + { + "epoch": 0.65, + "grad_norm": 0.25606449185707403, + "learning_rate": 0.00016137954050963256, + "loss": 1.0375, + "step": 6797 + }, + { + "epoch": 0.65, + "grad_norm": 0.27072931875871203, + "learning_rate": 0.0001613670508052977, + "loss": 1.1278, + "step": 6798 + }, + { + "epoch": 0.65, + "grad_norm": 0.2861760154233243, + "learning_rate": 0.00016135455956521383, + "loss": 1.0276, + "step": 6799 + }, + { + "epoch": 0.65, + "grad_norm": 0.281890088050873, + "learning_rate": 0.00016134206678969351, + "loss": 1.151, + "step": 6800 + }, + { + "epoch": 0.65, + "grad_norm": 0.2561578106187753, + "learning_rate": 0.00016132957247904948, + "loss": 1.0579, + "step": 6801 + }, + { + "epoch": 0.65, + "grad_norm": 0.296476813043644, + "learning_rate": 0.0001613170766335943, + "loss": 1.0815, + "step": 6802 + }, + { + "epoch": 0.65, + "grad_norm": 0.24047219584398974, + "learning_rate": 0.00016130457925364074, + "loss": 1.0674, + "step": 6803 + }, + { + "epoch": 0.65, + "grad_norm": 0.28537999657913954, + "learning_rate": 0.00016129208033950157, + "loss": 1.1703, + "step": 6804 + }, + { + "epoch": 0.65, + "grad_norm": 0.3071076656903495, + "learning_rate": 0.00016127957989148958, + "loss": 1.1097, + "step": 6805 + }, + { + "epoch": 0.65, + "grad_norm": 0.2912482911905167, + "learning_rate": 0.00016126707790991757, + "loss": 1.0033, + "step": 6806 + }, + { + "epoch": 0.65, + "grad_norm": 0.2873587122615141, + "learning_rate": 0.00016125457439509843, + "loss": 1.1402, + "step": 6807 + }, + { + "epoch": 0.65, + "grad_norm": 0.2822651267978888, + "learning_rate": 0.00016124206934734509, + "loss": 1.0684, + "step": 6808 + }, + { + "epoch": 0.65, + "grad_norm": 0.30569024152993257, + "learning_rate": 0.0001612295627669705, + "loss": 1.0356, + "step": 6809 + }, + { + "epoch": 0.65, + "grad_norm": 0.2552189424981213, + "learning_rate": 0.00016121705465428756, + "loss": 1.1015, + "step": 6810 + }, + { + "epoch": 0.65, + "grad_norm": 0.2780062595426105, + "learning_rate": 0.0001612045450096094, + "loss": 1.0252, + "step": 6811 + }, + { + "epoch": 0.65, + "grad_norm": 0.2958626948496943, + "learning_rate": 0.000161192033833249, + "loss": 1.047, + "step": 6812 + }, + { + "epoch": 0.65, + "grad_norm": 0.2547715653655437, + "learning_rate": 0.0001611795211255195, + "loss": 0.9724, + "step": 6813 + }, + { + "epoch": 0.65, + "grad_norm": 0.26184844172876026, + "learning_rate": 0.00016116700688673406, + "loss": 0.9989, + "step": 6814 + }, + { + "epoch": 0.65, + "grad_norm": 0.2903838433558316, + "learning_rate": 0.0001611544911172058, + "loss": 1.0365, + "step": 6815 + }, + { + "epoch": 0.65, + "grad_norm": 0.2787578244449706, + "learning_rate": 0.00016114197381724798, + "loss": 1.1122, + "step": 6816 + }, + { + "epoch": 0.65, + "grad_norm": 0.2890262361588652, + "learning_rate": 0.00016112945498717384, + "loss": 0.9743, + "step": 6817 + }, + { + "epoch": 0.65, + "grad_norm": 0.25957088901852887, + "learning_rate": 0.00016111693462729666, + "loss": 1.0411, + "step": 6818 + }, + { + "epoch": 0.65, + "grad_norm": 0.29141337706844284, + "learning_rate": 0.0001611044127379298, + "loss": 1.0738, + "step": 6819 + }, + { + "epoch": 0.65, + "grad_norm": 0.27868532305453414, + "learning_rate": 0.00016109188931938658, + "loss": 1.0309, + "step": 6820 + }, + { + "epoch": 0.65, + "grad_norm": 0.2865127928357969, + "learning_rate": 0.00016107936437198048, + "loss": 1.0432, + "step": 6821 + }, + { + "epoch": 0.65, + "grad_norm": 0.28901415788858237, + "learning_rate": 0.00016106683789602485, + "loss": 1.0913, + "step": 6822 + }, + { + "epoch": 0.65, + "grad_norm": 0.31087668233226007, + "learning_rate": 0.00016105430989183324, + "loss": 1.0044, + "step": 6823 + }, + { + "epoch": 0.65, + "grad_norm": 0.2907385138727252, + "learning_rate": 0.0001610417803597192, + "loss": 0.9812, + "step": 6824 + }, + { + "epoch": 0.65, + "grad_norm": 0.2955311073850929, + "learning_rate": 0.00016102924929999618, + "loss": 0.9718, + "step": 6825 + }, + { + "epoch": 0.65, + "grad_norm": 0.25938433521785725, + "learning_rate": 0.00016101671671297786, + "loss": 1.0653, + "step": 6826 + }, + { + "epoch": 0.65, + "grad_norm": 0.29835875083842295, + "learning_rate": 0.00016100418259897787, + "loss": 1.1136, + "step": 6827 + }, + { + "epoch": 0.65, + "grad_norm": 0.28475427911325607, + "learning_rate": 0.00016099164695830987, + "loss": 1.0817, + "step": 6828 + }, + { + "epoch": 0.65, + "grad_norm": 0.29712133149339015, + "learning_rate": 0.00016097910979128756, + "loss": 1.2519, + "step": 6829 + }, + { + "epoch": 0.65, + "grad_norm": 0.30942764161840036, + "learning_rate": 0.00016096657109822472, + "loss": 1.153, + "step": 6830 + }, + { + "epoch": 0.65, + "grad_norm": 0.24877924990251216, + "learning_rate": 0.0001609540308794351, + "loss": 1.0787, + "step": 6831 + }, + { + "epoch": 0.65, + "grad_norm": 0.2491230221886881, + "learning_rate": 0.00016094148913523254, + "loss": 1.0693, + "step": 6832 + }, + { + "epoch": 0.65, + "grad_norm": 0.33875565455907103, + "learning_rate": 0.00016092894586593098, + "loss": 1.0437, + "step": 6833 + }, + { + "epoch": 0.65, + "grad_norm": 0.25683830659317614, + "learning_rate": 0.00016091640107184418, + "loss": 1.0761, + "step": 6834 + }, + { + "epoch": 0.65, + "grad_norm": 0.28029893510677445, + "learning_rate": 0.00016090385475328616, + "loss": 1.0066, + "step": 6835 + }, + { + "epoch": 0.65, + "grad_norm": 0.2638298088456665, + "learning_rate": 0.00016089130691057096, + "loss": 1.0488, + "step": 6836 + }, + { + "epoch": 0.65, + "grad_norm": 0.2732308385667981, + "learning_rate": 0.0001608787575440125, + "loss": 1.0224, + "step": 6837 + }, + { + "epoch": 0.65, + "grad_norm": 0.2652051822786465, + "learning_rate": 0.0001608662066539249, + "loss": 1.1503, + "step": 6838 + }, + { + "epoch": 0.65, + "grad_norm": 0.3030626041580578, + "learning_rate": 0.00016085365424062218, + "loss": 1.0871, + "step": 6839 + }, + { + "epoch": 0.65, + "grad_norm": 0.30008149751746244, + "learning_rate": 0.00016084110030441853, + "loss": 1.0668, + "step": 6840 + }, + { + "epoch": 0.65, + "grad_norm": 0.2888340632480941, + "learning_rate": 0.00016082854484562813, + "loss": 1.0374, + "step": 6841 + }, + { + "epoch": 0.65, + "grad_norm": 0.2862699272392478, + "learning_rate": 0.00016081598786456516, + "loss": 1.1416, + "step": 6842 + }, + { + "epoch": 0.65, + "grad_norm": 0.2717722410816119, + "learning_rate": 0.00016080342936154388, + "loss": 1.0422, + "step": 6843 + }, + { + "epoch": 0.65, + "grad_norm": 0.2726242149316908, + "learning_rate": 0.00016079086933687854, + "loss": 1.0457, + "step": 6844 + }, + { + "epoch": 0.65, + "grad_norm": 0.2531509697020992, + "learning_rate": 0.0001607783077908835, + "loss": 1.0804, + "step": 6845 + }, + { + "epoch": 0.65, + "grad_norm": 0.2574041172829116, + "learning_rate": 0.0001607657447238731, + "loss": 1.0583, + "step": 6846 + }, + { + "epoch": 0.66, + "grad_norm": 0.28649411980651573, + "learning_rate": 0.00016075318013616174, + "loss": 1.0341, + "step": 6847 + }, + { + "epoch": 0.66, + "grad_norm": 0.32154960312770714, + "learning_rate": 0.0001607406140280639, + "loss": 1.0304, + "step": 6848 + }, + { + "epoch": 0.66, + "grad_norm": 0.27521017806337644, + "learning_rate": 0.000160728046399894, + "loss": 1.0798, + "step": 6849 + }, + { + "epoch": 0.66, + "grad_norm": 0.27096865352974614, + "learning_rate": 0.00016071547725196657, + "loss": 1.1019, + "step": 6850 + }, + { + "epoch": 0.66, + "grad_norm": 0.2827906410490978, + "learning_rate": 0.0001607029065845962, + "loss": 1.0481, + "step": 6851 + }, + { + "epoch": 0.66, + "grad_norm": 0.28755989114991476, + "learning_rate": 0.00016069033439809738, + "loss": 1.221, + "step": 6852 + }, + { + "epoch": 0.66, + "grad_norm": 0.2849156830204197, + "learning_rate": 0.00016067776069278485, + "loss": 1.0805, + "step": 6853 + }, + { + "epoch": 0.66, + "grad_norm": 0.2738501723030775, + "learning_rate": 0.0001606651854689732, + "loss": 1.0744, + "step": 6854 + }, + { + "epoch": 0.66, + "grad_norm": 0.30408775836991736, + "learning_rate": 0.00016065260872697717, + "loss": 1.0593, + "step": 6855 + }, + { + "epoch": 0.66, + "grad_norm": 0.2742033230168456, + "learning_rate": 0.00016064003046711148, + "loss": 1.1384, + "step": 6856 + }, + { + "epoch": 0.66, + "grad_norm": 0.2623388811156543, + "learning_rate": 0.00016062745068969088, + "loss": 1.0076, + "step": 6857 + }, + { + "epoch": 0.66, + "grad_norm": 0.2830978029989236, + "learning_rate": 0.00016061486939503028, + "loss": 1.0956, + "step": 6858 + }, + { + "epoch": 0.66, + "grad_norm": 0.2825549299962127, + "learning_rate": 0.00016060228658344445, + "loss": 0.9938, + "step": 6859 + }, + { + "epoch": 0.66, + "grad_norm": 0.2787473530770543, + "learning_rate": 0.00016058970225524833, + "loss": 1.108, + "step": 6860 + }, + { + "epoch": 0.66, + "grad_norm": 0.2766509594000378, + "learning_rate": 0.00016057711641075684, + "loss": 1.0104, + "step": 6861 + }, + { + "epoch": 0.66, + "grad_norm": 0.2835424799524376, + "learning_rate": 0.00016056452905028492, + "loss": 1.0538, + "step": 6862 + }, + { + "epoch": 0.66, + "grad_norm": 0.2718902303042608, + "learning_rate": 0.0001605519401741476, + "loss": 1.1156, + "step": 6863 + }, + { + "epoch": 0.66, + "grad_norm": 0.31396026069305516, + "learning_rate": 0.0001605393497826599, + "loss": 1.0302, + "step": 6864 + }, + { + "epoch": 0.66, + "grad_norm": 0.3105439907019263, + "learning_rate": 0.00016052675787613696, + "loss": 1.0684, + "step": 6865 + }, + { + "epoch": 0.66, + "grad_norm": 0.2728613957688195, + "learning_rate": 0.00016051416445489385, + "loss": 1.0522, + "step": 6866 + }, + { + "epoch": 0.66, + "grad_norm": 0.26407320682849367, + "learning_rate": 0.00016050156951924574, + "loss": 0.9828, + "step": 6867 + }, + { + "epoch": 0.66, + "grad_norm": 0.286389653868074, + "learning_rate": 0.00016048897306950784, + "loss": 1.0011, + "step": 6868 + }, + { + "epoch": 0.66, + "grad_norm": 0.20462171644432642, + "learning_rate": 0.00016047637510599534, + "loss": 0.9521, + "step": 6869 + }, + { + "epoch": 0.66, + "grad_norm": 0.28123560210233106, + "learning_rate": 0.00016046377562902356, + "loss": 0.9788, + "step": 6870 + }, + { + "epoch": 0.66, + "grad_norm": 0.30001191282665973, + "learning_rate": 0.0001604511746389078, + "loss": 1.0776, + "step": 6871 + }, + { + "epoch": 0.66, + "grad_norm": 0.24622530091185105, + "learning_rate": 0.00016043857213596344, + "loss": 1.0529, + "step": 6872 + }, + { + "epoch": 0.66, + "grad_norm": 0.27492099325556013, + "learning_rate": 0.00016042596812050576, + "loss": 1.0378, + "step": 6873 + }, + { + "epoch": 0.66, + "grad_norm": 0.2918899294834224, + "learning_rate": 0.00016041336259285031, + "loss": 1.0596, + "step": 6874 + }, + { + "epoch": 0.66, + "grad_norm": 0.2623677600554193, + "learning_rate": 0.00016040075555331246, + "loss": 1.1632, + "step": 6875 + }, + { + "epoch": 0.66, + "grad_norm": 0.26087552461867863, + "learning_rate": 0.00016038814700220777, + "loss": 1.0231, + "step": 6876 + }, + { + "epoch": 0.66, + "grad_norm": 0.27308513799182704, + "learning_rate": 0.00016037553693985172, + "loss": 1.0093, + "step": 6877 + }, + { + "epoch": 0.66, + "grad_norm": 0.3112434726283997, + "learning_rate": 0.00016036292536655993, + "loss": 1.1635, + "step": 6878 + }, + { + "epoch": 0.66, + "grad_norm": 0.2884405944631099, + "learning_rate": 0.00016035031228264798, + "loss": 1.1504, + "step": 6879 + }, + { + "epoch": 0.66, + "grad_norm": 0.25052783602254847, + "learning_rate": 0.00016033769768843153, + "loss": 1.0535, + "step": 6880 + }, + { + "epoch": 0.66, + "grad_norm": 0.2952430681334669, + "learning_rate": 0.00016032508158422633, + "loss": 1.1563, + "step": 6881 + }, + { + "epoch": 0.66, + "grad_norm": 0.2775844498309543, + "learning_rate": 0.00016031246397034797, + "loss": 1.063, + "step": 6882 + }, + { + "epoch": 0.66, + "grad_norm": 0.2641737504846404, + "learning_rate": 0.00016029984484711233, + "loss": 0.9881, + "step": 6883 + }, + { + "epoch": 0.66, + "grad_norm": 0.25954131255382207, + "learning_rate": 0.0001602872242148352, + "loss": 0.9942, + "step": 6884 + }, + { + "epoch": 0.66, + "grad_norm": 0.235184004978054, + "learning_rate": 0.00016027460207383238, + "loss": 1.0592, + "step": 6885 + }, + { + "epoch": 0.66, + "grad_norm": 0.3208319103526141, + "learning_rate": 0.00016026197842441975, + "loss": 0.9071, + "step": 6886 + }, + { + "epoch": 0.66, + "grad_norm": 0.38310981570001407, + "learning_rate": 0.00016024935326691323, + "loss": 1.0084, + "step": 6887 + }, + { + "epoch": 0.66, + "grad_norm": 0.2736848488804497, + "learning_rate": 0.00016023672660162881, + "loss": 1.1093, + "step": 6888 + }, + { + "epoch": 0.66, + "grad_norm": 0.30255083039667124, + "learning_rate": 0.00016022409842888244, + "loss": 1.0453, + "step": 6889 + }, + { + "epoch": 0.66, + "grad_norm": 0.29616770728129344, + "learning_rate": 0.00016021146874899015, + "loss": 1.1753, + "step": 6890 + }, + { + "epoch": 0.66, + "grad_norm": 0.26891944823488517, + "learning_rate": 0.000160198837562268, + "loss": 0.998, + "step": 6891 + }, + { + "epoch": 0.66, + "grad_norm": 0.3069851887194386, + "learning_rate": 0.00016018620486903213, + "loss": 0.9517, + "step": 6892 + }, + { + "epoch": 0.66, + "grad_norm": 0.2878707069041881, + "learning_rate": 0.00016017357066959863, + "loss": 1.0137, + "step": 6893 + }, + { + "epoch": 0.66, + "grad_norm": 0.2804636505710241, + "learning_rate": 0.0001601609349642837, + "loss": 1.0739, + "step": 6894 + }, + { + "epoch": 0.66, + "grad_norm": 0.272015497269986, + "learning_rate": 0.00016014829775340362, + "loss": 1.1176, + "step": 6895 + }, + { + "epoch": 0.66, + "grad_norm": 0.24507472885852935, + "learning_rate": 0.00016013565903727454, + "loss": 1.1288, + "step": 6896 + }, + { + "epoch": 0.66, + "grad_norm": 0.30186275659636463, + "learning_rate": 0.00016012301881621283, + "loss": 1.0384, + "step": 6897 + }, + { + "epoch": 0.66, + "grad_norm": 0.3163308512937789, + "learning_rate": 0.00016011037709053478, + "loss": 1.0186, + "step": 6898 + }, + { + "epoch": 0.66, + "grad_norm": 0.30810804171123124, + "learning_rate": 0.00016009773386055676, + "loss": 1.1093, + "step": 6899 + }, + { + "epoch": 0.66, + "grad_norm": 0.2816814074951005, + "learning_rate": 0.00016008508912659518, + "loss": 1.0513, + "step": 6900 + }, + { + "epoch": 0.66, + "grad_norm": 0.30201609837419807, + "learning_rate": 0.00016007244288896645, + "loss": 0.9661, + "step": 6901 + }, + { + "epoch": 0.66, + "grad_norm": 0.28829302388205935, + "learning_rate": 0.00016005979514798713, + "loss": 1.0704, + "step": 6902 + }, + { + "epoch": 0.66, + "grad_norm": 0.26515326393964955, + "learning_rate": 0.00016004714590397366, + "loss": 1.1436, + "step": 6903 + }, + { + "epoch": 0.66, + "grad_norm": 0.2971122785850026, + "learning_rate": 0.00016003449515724263, + "loss": 1.1088, + "step": 6904 + }, + { + "epoch": 0.66, + "grad_norm": 0.3097599416745419, + "learning_rate": 0.00016002184290811065, + "loss": 1.0972, + "step": 6905 + }, + { + "epoch": 0.66, + "grad_norm": 0.2522652174175764, + "learning_rate": 0.00016000918915689432, + "loss": 1.0747, + "step": 6906 + }, + { + "epoch": 0.66, + "grad_norm": 0.3182119441827633, + "learning_rate": 0.0001599965339039103, + "loss": 1.027, + "step": 6907 + }, + { + "epoch": 0.66, + "grad_norm": 0.2783524829429487, + "learning_rate": 0.0001599838771494753, + "loss": 1.0525, + "step": 6908 + }, + { + "epoch": 0.66, + "grad_norm": 0.26729480846780146, + "learning_rate": 0.0001599712188939061, + "loss": 1.02, + "step": 6909 + }, + { + "epoch": 0.66, + "grad_norm": 0.27005772910227627, + "learning_rate": 0.00015995855913751946, + "loss": 1.1131, + "step": 6910 + }, + { + "epoch": 0.66, + "grad_norm": 0.23819686657027173, + "learning_rate": 0.00015994589788063222, + "loss": 1.0546, + "step": 6911 + }, + { + "epoch": 0.66, + "grad_norm": 0.2937203478865386, + "learning_rate": 0.00015993323512356118, + "loss": 1.1432, + "step": 6912 + }, + { + "epoch": 0.66, + "grad_norm": 0.2788834474966195, + "learning_rate": 0.00015992057086662323, + "loss": 1.2207, + "step": 6913 + }, + { + "epoch": 0.66, + "grad_norm": 0.26680578605709754, + "learning_rate": 0.0001599079051101354, + "loss": 1.0555, + "step": 6914 + }, + { + "epoch": 0.66, + "grad_norm": 0.2829489557203681, + "learning_rate": 0.00015989523785441456, + "loss": 0.9449, + "step": 6915 + }, + { + "epoch": 0.66, + "grad_norm": 0.29363875973797865, + "learning_rate": 0.00015988256909977777, + "loss": 0.9878, + "step": 6916 + }, + { + "epoch": 0.66, + "grad_norm": 0.2543061502185059, + "learning_rate": 0.00015986989884654202, + "loss": 1.1064, + "step": 6917 + }, + { + "epoch": 0.66, + "grad_norm": 0.26512820084084004, + "learning_rate": 0.00015985722709502444, + "loss": 1.036, + "step": 6918 + }, + { + "epoch": 0.66, + "grad_norm": 0.28652859367622907, + "learning_rate": 0.00015984455384554215, + "loss": 1.1668, + "step": 6919 + }, + { + "epoch": 0.66, + "grad_norm": 0.31723374235235535, + "learning_rate": 0.00015983187909841226, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 0.66, + "grad_norm": 0.29129953499102984, + "learning_rate": 0.00015981920285395202, + "loss": 1.058, + "step": 6921 + }, + { + "epoch": 0.66, + "grad_norm": 0.29993610209681537, + "learning_rate": 0.0001598065251124786, + "loss": 1.04, + "step": 6922 + }, + { + "epoch": 0.66, + "grad_norm": 0.2947422459230847, + "learning_rate": 0.00015979384587430935, + "loss": 1.0852, + "step": 6923 + }, + { + "epoch": 0.66, + "grad_norm": 0.2714231587486264, + "learning_rate": 0.00015978116513976152, + "loss": 0.9586, + "step": 6924 + }, + { + "epoch": 0.66, + "grad_norm": 0.31441338998821106, + "learning_rate": 0.00015976848290915244, + "loss": 1.0261, + "step": 6925 + }, + { + "epoch": 0.66, + "grad_norm": 0.3020176096145915, + "learning_rate": 0.0001597557991827995, + "loss": 1.0744, + "step": 6926 + }, + { + "epoch": 0.66, + "grad_norm": 0.2941393454482125, + "learning_rate": 0.00015974311396102015, + "loss": 1.0624, + "step": 6927 + }, + { + "epoch": 0.66, + "grad_norm": 0.29719508910446507, + "learning_rate": 0.00015973042724413183, + "loss": 1.18, + "step": 6928 + }, + { + "epoch": 0.66, + "grad_norm": 0.28110919244257476, + "learning_rate": 0.00015971773903245202, + "loss": 0.9661, + "step": 6929 + }, + { + "epoch": 0.66, + "grad_norm": 0.2803124705765725, + "learning_rate": 0.00015970504932629823, + "loss": 1.0396, + "step": 6930 + }, + { + "epoch": 0.66, + "grad_norm": 0.29208189180807126, + "learning_rate": 0.0001596923581259881, + "loss": 1.1863, + "step": 6931 + }, + { + "epoch": 0.66, + "grad_norm": 0.29689595292251414, + "learning_rate": 0.0001596796654318392, + "loss": 1.0811, + "step": 6932 + }, + { + "epoch": 0.66, + "grad_norm": 0.3077207175340659, + "learning_rate": 0.00015966697124416914, + "loss": 1.0538, + "step": 6933 + }, + { + "epoch": 0.66, + "grad_norm": 0.30686176921995806, + "learning_rate": 0.0001596542755632956, + "loss": 1.0504, + "step": 6934 + }, + { + "epoch": 0.66, + "grad_norm": 0.24712629310974943, + "learning_rate": 0.00015964157838953638, + "loss": 1.0084, + "step": 6935 + }, + { + "epoch": 0.66, + "grad_norm": 0.26002780535214853, + "learning_rate": 0.00015962887972320914, + "loss": 1.0972, + "step": 6936 + }, + { + "epoch": 0.66, + "grad_norm": 0.29963814827197655, + "learning_rate": 0.00015961617956463173, + "loss": 1.1617, + "step": 6937 + }, + { + "epoch": 0.66, + "grad_norm": 0.29022102343364803, + "learning_rate": 0.00015960347791412196, + "loss": 0.9665, + "step": 6938 + }, + { + "epoch": 0.66, + "grad_norm": 0.30836910286066316, + "learning_rate": 0.00015959077477199765, + "loss": 1.0885, + "step": 6939 + }, + { + "epoch": 0.66, + "grad_norm": 0.2836035210572582, + "learning_rate": 0.0001595780701385768, + "loss": 1.0539, + "step": 6940 + }, + { + "epoch": 0.66, + "grad_norm": 0.30435356184547685, + "learning_rate": 0.0001595653640141773, + "loss": 0.976, + "step": 6941 + }, + { + "epoch": 0.66, + "grad_norm": 0.26960506796384626, + "learning_rate": 0.00015955265639911711, + "loss": 1.1245, + "step": 6942 + }, + { + "epoch": 0.66, + "grad_norm": 0.36290217128036273, + "learning_rate": 0.00015953994729371427, + "loss": 1.0396, + "step": 6943 + }, + { + "epoch": 0.66, + "grad_norm": 0.32401610966187167, + "learning_rate": 0.00015952723669828683, + "loss": 1.0519, + "step": 6944 + }, + { + "epoch": 0.66, + "grad_norm": 0.27984623740696507, + "learning_rate": 0.00015951452461315292, + "loss": 1.0498, + "step": 6945 + }, + { + "epoch": 0.66, + "grad_norm": 0.24996113180352433, + "learning_rate": 0.00015950181103863056, + "loss": 1.0642, + "step": 6946 + }, + { + "epoch": 0.66, + "grad_norm": 0.2587685668722977, + "learning_rate": 0.000159489095975038, + "loss": 1.1062, + "step": 6947 + }, + { + "epoch": 0.66, + "grad_norm": 0.32763762154630816, + "learning_rate": 0.00015947637942269343, + "loss": 1.1287, + "step": 6948 + }, + { + "epoch": 0.66, + "grad_norm": 0.2585215704752277, + "learning_rate": 0.0001594636613819151, + "loss": 1.0892, + "step": 6949 + }, + { + "epoch": 0.66, + "grad_norm": 0.2885043831452831, + "learning_rate": 0.0001594509418530213, + "loss": 1.1353, + "step": 6950 + }, + { + "epoch": 0.67, + "grad_norm": 0.26515145631234566, + "learning_rate": 0.00015943822083633026, + "loss": 0.9613, + "step": 6951 + }, + { + "epoch": 0.67, + "grad_norm": 0.2832070363192226, + "learning_rate": 0.00015942549833216043, + "loss": 1.047, + "step": 6952 + }, + { + "epoch": 0.67, + "grad_norm": 0.2851606867131872, + "learning_rate": 0.00015941277434083014, + "loss": 1.1776, + "step": 6953 + }, + { + "epoch": 0.67, + "grad_norm": 0.26970800321487326, + "learning_rate": 0.00015940004886265781, + "loss": 1.0471, + "step": 6954 + }, + { + "epoch": 0.67, + "grad_norm": 0.30024373875561594, + "learning_rate": 0.00015938732189796196, + "loss": 1.1125, + "step": 6955 + }, + { + "epoch": 0.67, + "grad_norm": 0.29870880904188896, + "learning_rate": 0.00015937459344706105, + "loss": 1.106, + "step": 6956 + }, + { + "epoch": 0.67, + "grad_norm": 0.2990216369851441, + "learning_rate": 0.0001593618635102736, + "loss": 1.0811, + "step": 6957 + }, + { + "epoch": 0.67, + "grad_norm": 0.2848511215046961, + "learning_rate": 0.00015934913208791825, + "loss": 1.0252, + "step": 6958 + }, + { + "epoch": 0.67, + "grad_norm": 0.26962263498375855, + "learning_rate": 0.00015933639918031353, + "loss": 1.1282, + "step": 6959 + }, + { + "epoch": 0.67, + "grad_norm": 0.3226088976764872, + "learning_rate": 0.00015932366478777816, + "loss": 1.1156, + "step": 6960 + }, + { + "epoch": 0.67, + "grad_norm": 0.2775279024141733, + "learning_rate": 0.00015931092891063078, + "loss": 1.0649, + "step": 6961 + }, + { + "epoch": 0.67, + "grad_norm": 0.29817813856951386, + "learning_rate": 0.0001592981915491901, + "loss": 1.0693, + "step": 6962 + }, + { + "epoch": 0.67, + "grad_norm": 0.2685645767583313, + "learning_rate": 0.00015928545270377494, + "loss": 1.019, + "step": 6963 + }, + { + "epoch": 0.67, + "grad_norm": 0.2950719704962928, + "learning_rate": 0.00015927271237470408, + "loss": 1.0845, + "step": 6964 + }, + { + "epoch": 0.67, + "grad_norm": 0.2834261349957536, + "learning_rate": 0.0001592599705622963, + "loss": 1.153, + "step": 6965 + }, + { + "epoch": 0.67, + "grad_norm": 0.2703350081406132, + "learning_rate": 0.00015924722726687058, + "loss": 1.1079, + "step": 6966 + }, + { + "epoch": 0.67, + "grad_norm": 0.2573244560178056, + "learning_rate": 0.0001592344824887457, + "loss": 1.0111, + "step": 6967 + }, + { + "epoch": 0.67, + "grad_norm": 0.24815316601701018, + "learning_rate": 0.0001592217362282407, + "loss": 1.0481, + "step": 6968 + }, + { + "epoch": 0.67, + "grad_norm": 0.30891957392030694, + "learning_rate": 0.0001592089884856745, + "loss": 1.0926, + "step": 6969 + }, + { + "epoch": 0.67, + "grad_norm": 0.30606523617707265, + "learning_rate": 0.00015919623926136618, + "loss": 1.099, + "step": 6970 + }, + { + "epoch": 0.67, + "grad_norm": 0.28728857946398434, + "learning_rate": 0.00015918348855563477, + "loss": 1.1097, + "step": 6971 + }, + { + "epoch": 0.67, + "grad_norm": 0.26905395320578945, + "learning_rate": 0.00015917073636879936, + "loss": 1.0316, + "step": 6972 + }, + { + "epoch": 0.67, + "grad_norm": 0.2791301663055386, + "learning_rate": 0.00015915798270117905, + "loss": 1.078, + "step": 6973 + }, + { + "epoch": 0.67, + "grad_norm": 0.27771895204228286, + "learning_rate": 0.0001591452275530931, + "loss": 0.9038, + "step": 6974 + }, + { + "epoch": 0.67, + "grad_norm": 0.27933418779286445, + "learning_rate": 0.0001591324709248606, + "loss": 1.0519, + "step": 6975 + }, + { + "epoch": 0.67, + "grad_norm": 0.2816750044645727, + "learning_rate": 0.00015911971281680088, + "loss": 0.9544, + "step": 6976 + }, + { + "epoch": 0.67, + "grad_norm": 0.285922508579133, + "learning_rate": 0.0001591069532292332, + "loss": 1.0725, + "step": 6977 + }, + { + "epoch": 0.67, + "grad_norm": 0.33215001802712657, + "learning_rate": 0.00015909419216247688, + "loss": 1.0101, + "step": 6978 + }, + { + "epoch": 0.67, + "grad_norm": 0.23393732531173045, + "learning_rate": 0.00015908142961685125, + "loss": 1.0705, + "step": 6979 + }, + { + "epoch": 0.67, + "grad_norm": 0.257596551482316, + "learning_rate": 0.0001590686655926757, + "loss": 1.1013, + "step": 6980 + }, + { + "epoch": 0.67, + "grad_norm": 0.3135249829190419, + "learning_rate": 0.00015905590009026967, + "loss": 1.0304, + "step": 6981 + }, + { + "epoch": 0.67, + "grad_norm": 0.2856466968173445, + "learning_rate": 0.00015904313310995263, + "loss": 0.9811, + "step": 6982 + }, + { + "epoch": 0.67, + "grad_norm": 0.2829987935572402, + "learning_rate": 0.00015903036465204407, + "loss": 1.1061, + "step": 6983 + }, + { + "epoch": 0.67, + "grad_norm": 0.2780452591589329, + "learning_rate": 0.00015901759471686358, + "loss": 1.1667, + "step": 6984 + }, + { + "epoch": 0.67, + "grad_norm": 0.27814002322278586, + "learning_rate": 0.00015900482330473062, + "loss": 1.1414, + "step": 6985 + }, + { + "epoch": 0.67, + "grad_norm": 0.2860266253901477, + "learning_rate": 0.0001589920504159649, + "loss": 0.9647, + "step": 6986 + }, + { + "epoch": 0.67, + "grad_norm": 0.2760940301311592, + "learning_rate": 0.0001589792760508861, + "loss": 1.0185, + "step": 6987 + }, + { + "epoch": 0.67, + "grad_norm": 0.2706769903514577, + "learning_rate": 0.00015896650020981378, + "loss": 1.2201, + "step": 6988 + }, + { + "epoch": 0.67, + "grad_norm": 0.2622304954412599, + "learning_rate": 0.00015895372289306776, + "loss": 1.0844, + "step": 6989 + }, + { + "epoch": 0.67, + "grad_norm": 0.30180292321521496, + "learning_rate": 0.00015894094410096775, + "loss": 0.9252, + "step": 6990 + }, + { + "epoch": 0.67, + "grad_norm": 0.280964051345383, + "learning_rate": 0.0001589281638338336, + "loss": 1.0206, + "step": 6991 + }, + { + "epoch": 0.67, + "grad_norm": 0.2722800997770079, + "learning_rate": 0.0001589153820919851, + "loss": 0.9916, + "step": 6992 + }, + { + "epoch": 0.67, + "grad_norm": 0.27884975208948837, + "learning_rate": 0.00015890259887574215, + "loss": 0.9978, + "step": 6993 + }, + { + "epoch": 0.67, + "grad_norm": 0.24598496860742308, + "learning_rate": 0.00015888981418542462, + "loss": 1.0904, + "step": 6994 + }, + { + "epoch": 0.67, + "grad_norm": 0.2961053801063102, + "learning_rate": 0.00015887702802135252, + "loss": 0.9764, + "step": 6995 + }, + { + "epoch": 0.67, + "grad_norm": 0.26389220997829704, + "learning_rate": 0.00015886424038384577, + "loss": 1.0735, + "step": 6996 + }, + { + "epoch": 0.67, + "grad_norm": 0.3058819655409103, + "learning_rate": 0.00015885145127322438, + "loss": 1.1001, + "step": 6997 + }, + { + "epoch": 0.67, + "grad_norm": 0.279716453620408, + "learning_rate": 0.00015883866068980846, + "loss": 1.1435, + "step": 6998 + }, + { + "epoch": 0.67, + "grad_norm": 0.2938126610887859, + "learning_rate": 0.00015882586863391807, + "loss": 1.0411, + "step": 6999 + }, + { + "epoch": 0.67, + "grad_norm": 0.24676335418681838, + "learning_rate": 0.00015881307510587337, + "loss": 0.9959, + "step": 7000 + }, + { + "epoch": 0.67, + "grad_norm": 0.2653940896900372, + "learning_rate": 0.0001588002801059945, + "loss": 1.0835, + "step": 7001 + }, + { + "epoch": 0.67, + "grad_norm": 0.3109595788932631, + "learning_rate": 0.00015878748363460163, + "loss": 1.0282, + "step": 7002 + }, + { + "epoch": 0.67, + "grad_norm": 0.30081836243424015, + "learning_rate": 0.00015877468569201506, + "loss": 0.9415, + "step": 7003 + }, + { + "epoch": 0.67, + "grad_norm": 0.31448590635210893, + "learning_rate": 0.00015876188627855507, + "loss": 1.074, + "step": 7004 + }, + { + "epoch": 0.67, + "grad_norm": 0.3017891643305516, + "learning_rate": 0.00015874908539454188, + "loss": 1.0724, + "step": 7005 + }, + { + "epoch": 0.67, + "grad_norm": 0.3277818013492165, + "learning_rate": 0.00015873628304029596, + "loss": 1.1018, + "step": 7006 + }, + { + "epoch": 0.67, + "grad_norm": 0.2802892779964618, + "learning_rate": 0.00015872347921613763, + "loss": 1.0991, + "step": 7007 + }, + { + "epoch": 0.67, + "grad_norm": 0.28604449999798276, + "learning_rate": 0.0001587106739223873, + "loss": 0.9591, + "step": 7008 + }, + { + "epoch": 0.67, + "grad_norm": 0.2974499942407421, + "learning_rate": 0.0001586978671593655, + "loss": 1.0818, + "step": 7009 + }, + { + "epoch": 0.67, + "grad_norm": 0.26346530008044805, + "learning_rate": 0.00015868505892739266, + "loss": 1.0774, + "step": 7010 + }, + { + "epoch": 0.67, + "grad_norm": 0.30282010918915797, + "learning_rate": 0.00015867224922678933, + "loss": 1.0961, + "step": 7011 + }, + { + "epoch": 0.67, + "grad_norm": 0.31016719288564637, + "learning_rate": 0.0001586594380578761, + "loss": 1.1236, + "step": 7012 + }, + { + "epoch": 0.67, + "grad_norm": 0.28052033504795476, + "learning_rate": 0.00015864662542097358, + "loss": 1.0947, + "step": 7013 + }, + { + "epoch": 0.67, + "grad_norm": 0.23638403426307897, + "learning_rate": 0.00015863381131640236, + "loss": 1.0231, + "step": 7014 + }, + { + "epoch": 0.67, + "grad_norm": 0.291740553126471, + "learning_rate": 0.00015862099574448317, + "loss": 0.9802, + "step": 7015 + }, + { + "epoch": 0.67, + "grad_norm": 0.3029857623248768, + "learning_rate": 0.00015860817870553677, + "loss": 1.12, + "step": 7016 + }, + { + "epoch": 0.67, + "grad_norm": 0.2630855945498341, + "learning_rate": 0.00015859536019988384, + "loss": 0.9507, + "step": 7017 + }, + { + "epoch": 0.67, + "grad_norm": 0.2758349794123403, + "learning_rate": 0.00015858254022784515, + "loss": 1.1074, + "step": 7018 + }, + { + "epoch": 0.67, + "grad_norm": 0.27661938873325614, + "learning_rate": 0.00015856971878974163, + "loss": 1.0374, + "step": 7019 + }, + { + "epoch": 0.67, + "grad_norm": 0.2714412044951736, + "learning_rate": 0.00015855689588589405, + "loss": 1.052, + "step": 7020 + }, + { + "epoch": 0.67, + "grad_norm": 0.2677737703288316, + "learning_rate": 0.00015854407151662337, + "loss": 1.0147, + "step": 7021 + }, + { + "epoch": 0.67, + "grad_norm": 0.2927994644038343, + "learning_rate": 0.0001585312456822505, + "loss": 1.0889, + "step": 7022 + }, + { + "epoch": 0.67, + "grad_norm": 0.2790949416690381, + "learning_rate": 0.0001585184183830964, + "loss": 1.1484, + "step": 7023 + }, + { + "epoch": 0.67, + "grad_norm": 0.28034621779286634, + "learning_rate": 0.00015850558961948217, + "loss": 1.0267, + "step": 7024 + }, + { + "epoch": 0.67, + "grad_norm": 0.2745504280573277, + "learning_rate": 0.00015849275939172874, + "loss": 1.1027, + "step": 7025 + }, + { + "epoch": 0.67, + "grad_norm": 0.26978925236279233, + "learning_rate": 0.00015847992770015725, + "loss": 0.9182, + "step": 7026 + }, + { + "epoch": 0.67, + "grad_norm": 0.29839939566941587, + "learning_rate": 0.00015846709454508883, + "loss": 1.0897, + "step": 7027 + }, + { + "epoch": 0.67, + "grad_norm": 0.26196842338250126, + "learning_rate": 0.0001584542599268446, + "loss": 1.0644, + "step": 7028 + }, + { + "epoch": 0.67, + "grad_norm": 0.28373187399028676, + "learning_rate": 0.0001584414238457458, + "loss": 1.0633, + "step": 7029 + }, + { + "epoch": 0.67, + "grad_norm": 0.2537841362135921, + "learning_rate": 0.0001584285863021136, + "loss": 1.0892, + "step": 7030 + }, + { + "epoch": 0.67, + "grad_norm": 0.27473559874173337, + "learning_rate": 0.00015841574729626935, + "loss": 0.9814, + "step": 7031 + }, + { + "epoch": 0.67, + "grad_norm": 0.30313556481972587, + "learning_rate": 0.00015840290682853428, + "loss": 1.016, + "step": 7032 + }, + { + "epoch": 0.67, + "grad_norm": 0.28512884288704177, + "learning_rate": 0.0001583900648992298, + "loss": 0.9909, + "step": 7033 + }, + { + "epoch": 0.67, + "grad_norm": 0.2551073361295876, + "learning_rate": 0.00015837722150867722, + "loss": 0.9779, + "step": 7034 + }, + { + "epoch": 0.67, + "grad_norm": 0.2939908816315301, + "learning_rate": 0.00015836437665719798, + "loss": 1.1285, + "step": 7035 + }, + { + "epoch": 0.67, + "grad_norm": 0.2725649426346149, + "learning_rate": 0.00015835153034511357, + "loss": 1.1472, + "step": 7036 + }, + { + "epoch": 0.67, + "grad_norm": 0.2718133362148875, + "learning_rate": 0.0001583386825727454, + "loss": 1.0443, + "step": 7037 + }, + { + "epoch": 0.67, + "grad_norm": 0.27419111607826285, + "learning_rate": 0.00015832583334041505, + "loss": 1.0012, + "step": 7038 + }, + { + "epoch": 0.67, + "grad_norm": 0.2849089224699415, + "learning_rate": 0.00015831298264844406, + "loss": 1.0507, + "step": 7039 + }, + { + "epoch": 0.67, + "grad_norm": 0.3068910572836713, + "learning_rate": 0.00015830013049715404, + "loss": 1.0838, + "step": 7040 + }, + { + "epoch": 0.67, + "grad_norm": 0.2779576724478555, + "learning_rate": 0.0001582872768868666, + "loss": 1.0766, + "step": 7041 + }, + { + "epoch": 0.67, + "grad_norm": 0.3302291612230447, + "learning_rate": 0.00015827442181790344, + "loss": 0.9843, + "step": 7042 + }, + { + "epoch": 0.67, + "grad_norm": 0.25384666865589006, + "learning_rate": 0.00015826156529058624, + "loss": 1.0086, + "step": 7043 + }, + { + "epoch": 0.67, + "grad_norm": 0.27280371588140856, + "learning_rate": 0.00015824870730523675, + "loss": 1.0686, + "step": 7044 + }, + { + "epoch": 0.67, + "grad_norm": 0.3030164504363727, + "learning_rate": 0.00015823584786217677, + "loss": 1.0226, + "step": 7045 + }, + { + "epoch": 0.67, + "grad_norm": 0.26909739332574156, + "learning_rate": 0.00015822298696172805, + "loss": 0.99, + "step": 7046 + }, + { + "epoch": 0.67, + "grad_norm": 0.2877874749400455, + "learning_rate": 0.00015821012460421255, + "loss": 0.9923, + "step": 7047 + }, + { + "epoch": 0.67, + "grad_norm": 0.2877538039040546, + "learning_rate": 0.00015819726078995208, + "loss": 1.0115, + "step": 7048 + }, + { + "epoch": 0.67, + "grad_norm": 0.30848201813052933, + "learning_rate": 0.00015818439551926856, + "loss": 1.0591, + "step": 7049 + }, + { + "epoch": 0.67, + "grad_norm": 0.29140546183062116, + "learning_rate": 0.000158171528792484, + "loss": 1.0429, + "step": 7050 + }, + { + "epoch": 0.67, + "grad_norm": 0.26994340789160964, + "learning_rate": 0.00015815866060992035, + "loss": 1.011, + "step": 7051 + }, + { + "epoch": 0.67, + "grad_norm": 0.2792969580723623, + "learning_rate": 0.00015814579097189966, + "loss": 1.0891, + "step": 7052 + }, + { + "epoch": 0.67, + "grad_norm": 0.28250949172509243, + "learning_rate": 0.00015813291987874407, + "loss": 1.1065, + "step": 7053 + }, + { + "epoch": 0.67, + "grad_norm": 0.2978217599721609, + "learning_rate": 0.00015812004733077554, + "loss": 0.9388, + "step": 7054 + }, + { + "epoch": 0.67, + "grad_norm": 0.2908978622536936, + "learning_rate": 0.00015810717332831635, + "loss": 1.0988, + "step": 7055 + }, + { + "epoch": 0.68, + "grad_norm": 0.2799056103902689, + "learning_rate": 0.0001580942978716886, + "loss": 1.1076, + "step": 7056 + }, + { + "epoch": 0.68, + "grad_norm": 0.30269500535419785, + "learning_rate": 0.00015808142096121456, + "loss": 1.1737, + "step": 7057 + }, + { + "epoch": 0.68, + "grad_norm": 0.28238103258320846, + "learning_rate": 0.00015806854259721646, + "loss": 1.0724, + "step": 7058 + }, + { + "epoch": 0.68, + "grad_norm": 0.2985043169531798, + "learning_rate": 0.00015805566278001657, + "loss": 0.9103, + "step": 7059 + }, + { + "epoch": 0.68, + "grad_norm": 0.293719688850402, + "learning_rate": 0.00015804278150993722, + "loss": 1.0877, + "step": 7060 + }, + { + "epoch": 0.68, + "grad_norm": 0.27699549920215416, + "learning_rate": 0.00015802989878730084, + "loss": 1.1226, + "step": 7061 + }, + { + "epoch": 0.68, + "grad_norm": 0.26487298015420774, + "learning_rate": 0.00015801701461242974, + "loss": 1.1144, + "step": 7062 + }, + { + "epoch": 0.68, + "grad_norm": 0.26324552701590875, + "learning_rate": 0.00015800412898564636, + "loss": 1.173, + "step": 7063 + }, + { + "epoch": 0.68, + "grad_norm": 0.27384596081191975, + "learning_rate": 0.00015799124190727322, + "loss": 0.9868, + "step": 7064 + }, + { + "epoch": 0.68, + "grad_norm": 0.25963681695290847, + "learning_rate": 0.00015797835337763282, + "loss": 0.9288, + "step": 7065 + }, + { + "epoch": 0.68, + "grad_norm": 0.29013003407961624, + "learning_rate": 0.00015796546339704766, + "loss": 1.0005, + "step": 7066 + }, + { + "epoch": 0.68, + "grad_norm": 0.3020156469079804, + "learning_rate": 0.00015795257196584038, + "loss": 0.9843, + "step": 7067 + }, + { + "epoch": 0.68, + "grad_norm": 0.2895277107493972, + "learning_rate": 0.00015793967908433353, + "loss": 1.105, + "step": 7068 + }, + { + "epoch": 0.68, + "grad_norm": 0.2755240824809856, + "learning_rate": 0.0001579267847528498, + "loss": 1.082, + "step": 7069 + }, + { + "epoch": 0.68, + "grad_norm": 0.30036604934270644, + "learning_rate": 0.00015791388897171192, + "loss": 1.2018, + "step": 7070 + }, + { + "epoch": 0.68, + "grad_norm": 0.30614155747981436, + "learning_rate": 0.00015790099174124253, + "loss": 1.0759, + "step": 7071 + }, + { + "epoch": 0.68, + "grad_norm": 0.2941340821458282, + "learning_rate": 0.00015788809306176447, + "loss": 1.0597, + "step": 7072 + }, + { + "epoch": 0.68, + "grad_norm": 0.30856171230251395, + "learning_rate": 0.00015787519293360044, + "loss": 1.1318, + "step": 7073 + }, + { + "epoch": 0.68, + "grad_norm": 0.29580057657767084, + "learning_rate": 0.00015786229135707338, + "loss": 1.0522, + "step": 7074 + }, + { + "epoch": 0.68, + "grad_norm": 0.274616244018645, + "learning_rate": 0.0001578493883325061, + "loss": 1.0299, + "step": 7075 + }, + { + "epoch": 0.68, + "grad_norm": 0.3114772800268081, + "learning_rate": 0.00015783648386022151, + "loss": 1.161, + "step": 7076 + }, + { + "epoch": 0.68, + "grad_norm": 0.2802238017688198, + "learning_rate": 0.0001578235779405426, + "loss": 1.0197, + "step": 7077 + }, + { + "epoch": 0.68, + "grad_norm": 0.2944810135727739, + "learning_rate": 0.00015781067057379228, + "loss": 1.1013, + "step": 7078 + }, + { + "epoch": 0.68, + "grad_norm": 0.24998357909349614, + "learning_rate": 0.00015779776176029356, + "loss": 1.0692, + "step": 7079 + }, + { + "epoch": 0.68, + "grad_norm": 0.2945811953423457, + "learning_rate": 0.0001577848515003696, + "loss": 1.068, + "step": 7080 + }, + { + "epoch": 0.68, + "grad_norm": 0.25806411996635104, + "learning_rate": 0.00015777193979434338, + "loss": 1.0476, + "step": 7081 + }, + { + "epoch": 0.68, + "grad_norm": 0.2598704969021484, + "learning_rate": 0.00015775902664253808, + "loss": 1.0567, + "step": 7082 + }, + { + "epoch": 0.68, + "grad_norm": 0.28645413042100387, + "learning_rate": 0.00015774611204527681, + "loss": 1.0646, + "step": 7083 + }, + { + "epoch": 0.68, + "grad_norm": 0.25410792181961595, + "learning_rate": 0.0001577331960028828, + "loss": 0.9907, + "step": 7084 + }, + { + "epoch": 0.68, + "grad_norm": 0.26092421697879675, + "learning_rate": 0.00015772027851567927, + "loss": 0.9608, + "step": 7085 + }, + { + "epoch": 0.68, + "grad_norm": 0.28943346794498837, + "learning_rate": 0.00015770735958398952, + "loss": 1.0073, + "step": 7086 + }, + { + "epoch": 0.68, + "grad_norm": 0.26566974144868033, + "learning_rate": 0.0001576944392081368, + "loss": 1.109, + "step": 7087 + }, + { + "epoch": 0.68, + "grad_norm": 0.28616557026410305, + "learning_rate": 0.0001576815173884445, + "loss": 1.1788, + "step": 7088 + }, + { + "epoch": 0.68, + "grad_norm": 0.3140389757089799, + "learning_rate": 0.00015766859412523596, + "loss": 1.1028, + "step": 7089 + }, + { + "epoch": 0.68, + "grad_norm": 0.2903534504256985, + "learning_rate": 0.0001576556694188346, + "loss": 1.0848, + "step": 7090 + }, + { + "epoch": 0.68, + "grad_norm": 0.2739660674757886, + "learning_rate": 0.00015764274326956392, + "loss": 1.088, + "step": 7091 + }, + { + "epoch": 0.68, + "grad_norm": 0.26183889146766914, + "learning_rate": 0.00015762981567774733, + "loss": 1.045, + "step": 7092 + }, + { + "epoch": 0.68, + "grad_norm": 0.2773165562228698, + "learning_rate": 0.0001576168866437084, + "loss": 1.0625, + "step": 7093 + }, + { + "epoch": 0.68, + "grad_norm": 0.28250207452060827, + "learning_rate": 0.00015760395616777064, + "loss": 1.0738, + "step": 7094 + }, + { + "epoch": 0.68, + "grad_norm": 0.2664810800693953, + "learning_rate": 0.0001575910242502577, + "loss": 1.1199, + "step": 7095 + }, + { + "epoch": 0.68, + "grad_norm": 0.29450171685022813, + "learning_rate": 0.00015757809089149319, + "loss": 1.0634, + "step": 7096 + }, + { + "epoch": 0.68, + "grad_norm": 0.28549635512987454, + "learning_rate": 0.00015756515609180073, + "loss": 1.0337, + "step": 7097 + }, + { + "epoch": 0.68, + "grad_norm": 0.28429692726911365, + "learning_rate": 0.00015755221985150412, + "loss": 1.1475, + "step": 7098 + }, + { + "epoch": 0.68, + "grad_norm": 0.30253389968364547, + "learning_rate": 0.000157539282170927, + "loss": 1.047, + "step": 7099 + }, + { + "epoch": 0.68, + "grad_norm": 0.3098958322121182, + "learning_rate": 0.00015752634305039317, + "loss": 0.965, + "step": 7100 + }, + { + "epoch": 0.68, + "grad_norm": 0.25143769789194087, + "learning_rate": 0.0001575134024902265, + "loss": 0.9581, + "step": 7101 + }, + { + "epoch": 0.68, + "grad_norm": 0.3016367636629631, + "learning_rate": 0.00015750046049075076, + "loss": 1.0633, + "step": 7102 + }, + { + "epoch": 0.68, + "grad_norm": 0.3271227303031079, + "learning_rate": 0.00015748751705228984, + "loss": 0.9946, + "step": 7103 + }, + { + "epoch": 0.68, + "grad_norm": 0.25230564082743323, + "learning_rate": 0.00015747457217516768, + "loss": 0.998, + "step": 7104 + }, + { + "epoch": 0.68, + "grad_norm": 0.29985887178796994, + "learning_rate": 0.00015746162585970826, + "loss": 1.0747, + "step": 7105 + }, + { + "epoch": 0.68, + "grad_norm": 0.3268684025188319, + "learning_rate": 0.00015744867810623553, + "loss": 1.2323, + "step": 7106 + }, + { + "epoch": 0.68, + "grad_norm": 0.24907462724647375, + "learning_rate": 0.0001574357289150735, + "loss": 1.0212, + "step": 7107 + }, + { + "epoch": 0.68, + "grad_norm": 0.2670820715697481, + "learning_rate": 0.0001574227782865463, + "loss": 0.9549, + "step": 7108 + }, + { + "epoch": 0.68, + "grad_norm": 0.2872956326475909, + "learning_rate": 0.00015740982622097793, + "loss": 1.0685, + "step": 7109 + }, + { + "epoch": 0.68, + "grad_norm": 0.3131384318834513, + "learning_rate": 0.00015739687271869258, + "loss": 1.0596, + "step": 7110 + }, + { + "epoch": 0.68, + "grad_norm": 0.2658074059936171, + "learning_rate": 0.00015738391778001446, + "loss": 1.0484, + "step": 7111 + }, + { + "epoch": 0.68, + "grad_norm": 0.2628890032193891, + "learning_rate": 0.00015737096140526773, + "loss": 0.9861, + "step": 7112 + }, + { + "epoch": 0.68, + "grad_norm": 0.3111781620355991, + "learning_rate": 0.0001573580035947766, + "loss": 1.0708, + "step": 7113 + }, + { + "epoch": 0.68, + "grad_norm": 0.28054369956222347, + "learning_rate": 0.0001573450443488654, + "loss": 1.0379, + "step": 7114 + }, + { + "epoch": 0.68, + "grad_norm": 0.2972730553838039, + "learning_rate": 0.00015733208366785847, + "loss": 1.1547, + "step": 7115 + }, + { + "epoch": 0.68, + "grad_norm": 0.2731099842689578, + "learning_rate": 0.00015731912155208004, + "loss": 1.0216, + "step": 7116 + }, + { + "epoch": 0.68, + "grad_norm": 0.25449973903583906, + "learning_rate": 0.0001573061580018546, + "loss": 1.0567, + "step": 7117 + }, + { + "epoch": 0.68, + "grad_norm": 0.3130389985554061, + "learning_rate": 0.00015729319301750655, + "loss": 1.0886, + "step": 7118 + }, + { + "epoch": 0.68, + "grad_norm": 0.2764607660175635, + "learning_rate": 0.00015728022659936033, + "loss": 0.9724, + "step": 7119 + }, + { + "epoch": 0.68, + "grad_norm": 0.24734793912425435, + "learning_rate": 0.0001572672587477404, + "loss": 1.0651, + "step": 7120 + }, + { + "epoch": 0.68, + "grad_norm": 0.28747093554729464, + "learning_rate": 0.00015725428946297137, + "loss": 1.1001, + "step": 7121 + }, + { + "epoch": 0.68, + "grad_norm": 0.270729761427449, + "learning_rate": 0.0001572413187453778, + "loss": 1.0384, + "step": 7122 + }, + { + "epoch": 0.68, + "grad_norm": 0.2928842971220595, + "learning_rate": 0.00015722834659528422, + "loss": 1.1656, + "step": 7123 + }, + { + "epoch": 0.68, + "grad_norm": 0.2793916142108935, + "learning_rate": 0.00015721537301301527, + "loss": 0.9688, + "step": 7124 + }, + { + "epoch": 0.68, + "grad_norm": 0.29909818823600665, + "learning_rate": 0.0001572023979988957, + "loss": 0.9318, + "step": 7125 + }, + { + "epoch": 0.68, + "grad_norm": 0.2696167713839793, + "learning_rate": 0.00015718942155325011, + "loss": 1.1889, + "step": 7126 + }, + { + "epoch": 0.68, + "grad_norm": 0.29698260893246986, + "learning_rate": 0.00015717644367640334, + "loss": 1.1648, + "step": 7127 + }, + { + "epoch": 0.68, + "grad_norm": 0.27565722448673946, + "learning_rate": 0.00015716346436868016, + "loss": 1.0083, + "step": 7128 + }, + { + "epoch": 0.68, + "grad_norm": 0.27200588461690844, + "learning_rate": 0.00015715048363040533, + "loss": 1.0808, + "step": 7129 + }, + { + "epoch": 0.68, + "grad_norm": 0.274483688022852, + "learning_rate": 0.00015713750146190372, + "loss": 1.0244, + "step": 7130 + }, + { + "epoch": 0.68, + "grad_norm": 0.2603147750459217, + "learning_rate": 0.00015712451786350023, + "loss": 1.0541, + "step": 7131 + }, + { + "epoch": 0.68, + "grad_norm": 0.2611050046236202, + "learning_rate": 0.0001571115328355198, + "loss": 0.9513, + "step": 7132 + }, + { + "epoch": 0.68, + "grad_norm": 0.2611354213480838, + "learning_rate": 0.00015709854637828733, + "loss": 1.0073, + "step": 7133 + }, + { + "epoch": 0.68, + "grad_norm": 0.29780693358025156, + "learning_rate": 0.0001570855584921279, + "loss": 1.0521, + "step": 7134 + }, + { + "epoch": 0.68, + "grad_norm": 0.298538656430755, + "learning_rate": 0.00015707256917736647, + "loss": 1.2377, + "step": 7135 + }, + { + "epoch": 0.68, + "grad_norm": 0.2748841653644941, + "learning_rate": 0.0001570595784343281, + "loss": 0.9403, + "step": 7136 + }, + { + "epoch": 0.68, + "grad_norm": 0.2746937829833261, + "learning_rate": 0.00015704658626333794, + "loss": 1.1224, + "step": 7137 + }, + { + "epoch": 0.68, + "grad_norm": 0.2842293587504884, + "learning_rate": 0.00015703359266472112, + "loss": 1.1599, + "step": 7138 + }, + { + "epoch": 0.68, + "grad_norm": 0.2670841998105975, + "learning_rate": 0.0001570205976388028, + "loss": 1.138, + "step": 7139 + }, + { + "epoch": 0.68, + "grad_norm": 0.27756874671553816, + "learning_rate": 0.00015700760118590815, + "loss": 0.9721, + "step": 7140 + }, + { + "epoch": 0.68, + "grad_norm": 0.28756494584263737, + "learning_rate": 0.00015699460330636248, + "loss": 1.1181, + "step": 7141 + }, + { + "epoch": 0.68, + "grad_norm": 0.32898360339581917, + "learning_rate": 0.00015698160400049105, + "loss": 1.219, + "step": 7142 + }, + { + "epoch": 0.68, + "grad_norm": 0.2803241831359587, + "learning_rate": 0.00015696860326861917, + "loss": 1.1124, + "step": 7143 + }, + { + "epoch": 0.68, + "grad_norm": 0.27535423833378253, + "learning_rate": 0.00015695560111107218, + "loss": 1.1046, + "step": 7144 + }, + { + "epoch": 0.68, + "grad_norm": 0.2606122473280347, + "learning_rate": 0.0001569425975281755, + "loss": 1.158, + "step": 7145 + }, + { + "epoch": 0.68, + "grad_norm": 0.26610041445452237, + "learning_rate": 0.00015692959252025447, + "loss": 1.1423, + "step": 7146 + }, + { + "epoch": 0.68, + "grad_norm": 0.2953405796549938, + "learning_rate": 0.00015691658608763467, + "loss": 1.1065, + "step": 7147 + }, + { + "epoch": 0.68, + "grad_norm": 0.2809208524967904, + "learning_rate": 0.00015690357823064147, + "loss": 1.0091, + "step": 7148 + }, + { + "epoch": 0.68, + "grad_norm": 0.298379105314993, + "learning_rate": 0.00015689056894960054, + "loss": 0.9916, + "step": 7149 + }, + { + "epoch": 0.68, + "grad_norm": 0.29683307271349985, + "learning_rate": 0.00015687755824483733, + "loss": 1.1125, + "step": 7150 + }, + { + "epoch": 0.68, + "grad_norm": 0.29899393435333316, + "learning_rate": 0.00015686454611667745, + "loss": 1.0796, + "step": 7151 + }, + { + "epoch": 0.68, + "grad_norm": 0.31478792374668824, + "learning_rate": 0.00015685153256544658, + "loss": 1.0593, + "step": 7152 + }, + { + "epoch": 0.68, + "grad_norm": 0.2930746130040074, + "learning_rate": 0.0001568385175914704, + "loss": 1.0662, + "step": 7153 + }, + { + "epoch": 0.68, + "grad_norm": 0.2882046106628799, + "learning_rate": 0.00015682550119507457, + "loss": 1.0464, + "step": 7154 + }, + { + "epoch": 0.68, + "grad_norm": 0.24774454066142032, + "learning_rate": 0.0001568124833765849, + "loss": 1.0727, + "step": 7155 + }, + { + "epoch": 0.68, + "grad_norm": 0.28400220165986323, + "learning_rate": 0.0001567994641363271, + "loss": 1.2001, + "step": 7156 + }, + { + "epoch": 0.68, + "grad_norm": 0.2847089193485156, + "learning_rate": 0.000156786443474627, + "loss": 1.0203, + "step": 7157 + }, + { + "epoch": 0.68, + "grad_norm": 0.25521289971018285, + "learning_rate": 0.0001567734213918105, + "loss": 1.0875, + "step": 7158 + }, + { + "epoch": 0.68, + "grad_norm": 0.26777210499225285, + "learning_rate": 0.0001567603978882034, + "loss": 1.0413, + "step": 7159 + }, + { + "epoch": 0.69, + "grad_norm": 0.29958229941874526, + "learning_rate": 0.00015674737296413171, + "loss": 1.1779, + "step": 7160 + }, + { + "epoch": 0.69, + "grad_norm": 0.2973923709791303, + "learning_rate": 0.00015673434661992133, + "loss": 1.0605, + "step": 7161 + }, + { + "epoch": 0.69, + "grad_norm": 0.29890629646161765, + "learning_rate": 0.00015672131885589827, + "loss": 1.1378, + "step": 7162 + }, + { + "epoch": 0.69, + "grad_norm": 0.27465315968911586, + "learning_rate": 0.00015670828967238857, + "loss": 1.0176, + "step": 7163 + }, + { + "epoch": 0.69, + "grad_norm": 0.27330309587431983, + "learning_rate": 0.00015669525906971825, + "loss": 1.099, + "step": 7164 + }, + { + "epoch": 0.69, + "grad_norm": 0.2830955541275837, + "learning_rate": 0.00015668222704821346, + "loss": 1.0174, + "step": 7165 + }, + { + "epoch": 0.69, + "grad_norm": 0.2918518106052513, + "learning_rate": 0.00015666919360820034, + "loss": 1.0828, + "step": 7166 + }, + { + "epoch": 0.69, + "grad_norm": 0.2938118596159364, + "learning_rate": 0.000156656158750005, + "loss": 1.1368, + "step": 7167 + }, + { + "epoch": 0.69, + "grad_norm": 0.33564117245367897, + "learning_rate": 0.0001566431224739537, + "loss": 1.1594, + "step": 7168 + }, + { + "epoch": 0.69, + "grad_norm": 0.2791063027357139, + "learning_rate": 0.00015663008478037263, + "loss": 1.0643, + "step": 7169 + }, + { + "epoch": 0.69, + "grad_norm": 0.2836589857893266, + "learning_rate": 0.00015661704566958816, + "loss": 1.0865, + "step": 7170 + }, + { + "epoch": 0.69, + "grad_norm": 0.3268578411481459, + "learning_rate": 0.00015660400514192648, + "loss": 1.1146, + "step": 7171 + }, + { + "epoch": 0.69, + "grad_norm": 0.26432781847496883, + "learning_rate": 0.00015659096319771401, + "loss": 1.2087, + "step": 7172 + }, + { + "epoch": 0.69, + "grad_norm": 0.2947543226709198, + "learning_rate": 0.00015657791983727715, + "loss": 1.0841, + "step": 7173 + }, + { + "epoch": 0.69, + "grad_norm": 0.3064445919955032, + "learning_rate": 0.00015656487506094226, + "loss": 1.1338, + "step": 7174 + }, + { + "epoch": 0.69, + "grad_norm": 0.2768287567420236, + "learning_rate": 0.00015655182886903582, + "loss": 1.1217, + "step": 7175 + }, + { + "epoch": 0.69, + "grad_norm": 0.3115480635288813, + "learning_rate": 0.00015653878126188433, + "loss": 1.1119, + "step": 7176 + }, + { + "epoch": 0.69, + "grad_norm": 0.26658700110300837, + "learning_rate": 0.00015652573223981432, + "loss": 1.1532, + "step": 7177 + }, + { + "epoch": 0.69, + "grad_norm": 0.30729315346172287, + "learning_rate": 0.0001565126818031523, + "loss": 1.0824, + "step": 7178 + }, + { + "epoch": 0.69, + "grad_norm": 0.27545018818322364, + "learning_rate": 0.00015649962995222493, + "loss": 1.0008, + "step": 7179 + }, + { + "epoch": 0.69, + "grad_norm": 0.2840294196959938, + "learning_rate": 0.0001564865766873588, + "loss": 0.9888, + "step": 7180 + }, + { + "epoch": 0.69, + "grad_norm": 0.2704386331778757, + "learning_rate": 0.00015647352200888056, + "loss": 1.0845, + "step": 7181 + }, + { + "epoch": 0.69, + "grad_norm": 0.309378454139959, + "learning_rate": 0.00015646046591711698, + "loss": 1.1094, + "step": 7182 + }, + { + "epoch": 0.69, + "grad_norm": 0.33236253311844427, + "learning_rate": 0.00015644740841239477, + "loss": 1.0607, + "step": 7183 + }, + { + "epoch": 0.69, + "grad_norm": 0.2654938165964374, + "learning_rate": 0.00015643434949504066, + "loss": 1.0977, + "step": 7184 + }, + { + "epoch": 0.69, + "grad_norm": 0.25891560294027866, + "learning_rate": 0.00015642128916538148, + "loss": 1.0862, + "step": 7185 + }, + { + "epoch": 0.69, + "grad_norm": 0.30191735715591245, + "learning_rate": 0.00015640822742374411, + "loss": 1.0986, + "step": 7186 + }, + { + "epoch": 0.69, + "grad_norm": 0.3016522569398602, + "learning_rate": 0.00015639516427045538, + "loss": 1.1071, + "step": 7187 + }, + { + "epoch": 0.69, + "grad_norm": 0.3747262023726705, + "learning_rate": 0.00015638209970584218, + "loss": 0.9347, + "step": 7188 + }, + { + "epoch": 0.69, + "grad_norm": 0.26969894694899865, + "learning_rate": 0.0001563690337302316, + "loss": 0.9472, + "step": 7189 + }, + { + "epoch": 0.69, + "grad_norm": 0.27707566766666775, + "learning_rate": 0.00015635596634395045, + "loss": 1.0062, + "step": 7190 + }, + { + "epoch": 0.69, + "grad_norm": 0.299605953934922, + "learning_rate": 0.00015634289754732584, + "loss": 1.0789, + "step": 7191 + }, + { + "epoch": 0.69, + "grad_norm": 0.2756403847856143, + "learning_rate": 0.00015632982734068479, + "loss": 1.0478, + "step": 7192 + }, + { + "epoch": 0.69, + "grad_norm": 0.28164477492401824, + "learning_rate": 0.00015631675572435442, + "loss": 1.0234, + "step": 7193 + }, + { + "epoch": 0.69, + "grad_norm": 0.3139943484530815, + "learning_rate": 0.00015630368269866187, + "loss": 1.0499, + "step": 7194 + }, + { + "epoch": 0.69, + "grad_norm": 0.30405542664327845, + "learning_rate": 0.0001562906082639342, + "loss": 1.023, + "step": 7195 + }, + { + "epoch": 0.69, + "grad_norm": 0.2874490807251814, + "learning_rate": 0.00015627753242049877, + "loss": 0.9521, + "step": 7196 + }, + { + "epoch": 0.69, + "grad_norm": 0.2643259838229851, + "learning_rate": 0.0001562644551686827, + "loss": 1.0435, + "step": 7197 + }, + { + "epoch": 0.69, + "grad_norm": 0.2865014648758085, + "learning_rate": 0.00015625137650881324, + "loss": 1.0175, + "step": 7198 + }, + { + "epoch": 0.69, + "grad_norm": 0.30454373875693874, + "learning_rate": 0.00015623829644121777, + "loss": 1.134, + "step": 7199 + }, + { + "epoch": 0.69, + "grad_norm": 0.31859855773654133, + "learning_rate": 0.00015622521496622355, + "loss": 1.0659, + "step": 7200 + }, + { + "epoch": 0.69, + "grad_norm": 0.2858177196834928, + "learning_rate": 0.00015621213208415804, + "loss": 0.9598, + "step": 7201 + }, + { + "epoch": 0.69, + "grad_norm": 0.26598227866462754, + "learning_rate": 0.00015619904779534856, + "loss": 0.9907, + "step": 7202 + }, + { + "epoch": 0.69, + "grad_norm": 0.2951400583438705, + "learning_rate": 0.00015618596210012256, + "loss": 1.0765, + "step": 7203 + }, + { + "epoch": 0.69, + "grad_norm": 0.3114702561846883, + "learning_rate": 0.00015617287499880762, + "loss": 1.1041, + "step": 7204 + }, + { + "epoch": 0.69, + "grad_norm": 0.278499185049389, + "learning_rate": 0.00015615978649173112, + "loss": 0.9533, + "step": 7205 + }, + { + "epoch": 0.69, + "grad_norm": 0.28555299537963824, + "learning_rate": 0.0001561466965792207, + "loss": 1.0967, + "step": 7206 + }, + { + "epoch": 0.69, + "grad_norm": 0.2636715267212951, + "learning_rate": 0.00015613360526160392, + "loss": 1.086, + "step": 7207 + }, + { + "epoch": 0.69, + "grad_norm": 0.29854739591999657, + "learning_rate": 0.00015612051253920836, + "loss": 1.1259, + "step": 7208 + }, + { + "epoch": 0.69, + "grad_norm": 0.29584977312779975, + "learning_rate": 0.00015610741841236173, + "loss": 1.1717, + "step": 7209 + }, + { + "epoch": 0.69, + "grad_norm": 0.2853540873126679, + "learning_rate": 0.00015609432288139167, + "loss": 1.0673, + "step": 7210 + }, + { + "epoch": 0.69, + "grad_norm": 0.26104277026079287, + "learning_rate": 0.00015608122594662596, + "loss": 1.0772, + "step": 7211 + }, + { + "epoch": 0.69, + "grad_norm": 0.3189693042004204, + "learning_rate": 0.00015606812760839226, + "loss": 1.0342, + "step": 7212 + }, + { + "epoch": 0.69, + "grad_norm": 0.27816239495102424, + "learning_rate": 0.00015605502786701848, + "loss": 1.0825, + "step": 7213 + }, + { + "epoch": 0.69, + "grad_norm": 0.2635774551759029, + "learning_rate": 0.0001560419267228324, + "loss": 1.0783, + "step": 7214 + }, + { + "epoch": 0.69, + "grad_norm": 0.29195022738920845, + "learning_rate": 0.00015602882417616184, + "loss": 1.1758, + "step": 7215 + }, + { + "epoch": 0.69, + "grad_norm": 0.3144273867404004, + "learning_rate": 0.0001560157202273348, + "loss": 1.0973, + "step": 7216 + }, + { + "epoch": 0.69, + "grad_norm": 0.3065507511047062, + "learning_rate": 0.00015600261487667912, + "loss": 1.1111, + "step": 7217 + }, + { + "epoch": 0.69, + "grad_norm": 0.3131853451412242, + "learning_rate": 0.0001559895081245228, + "loss": 1.2187, + "step": 7218 + }, + { + "epoch": 0.69, + "grad_norm": 0.26515742013497673, + "learning_rate": 0.00015597639997119389, + "loss": 1.0605, + "step": 7219 + }, + { + "epoch": 0.69, + "grad_norm": 0.3129582152206669, + "learning_rate": 0.00015596329041702036, + "loss": 1.0686, + "step": 7220 + }, + { + "epoch": 0.69, + "grad_norm": 0.34295960089795735, + "learning_rate": 0.00015595017946233033, + "loss": 1.0771, + "step": 7221 + }, + { + "epoch": 0.69, + "grad_norm": 0.2721307338351633, + "learning_rate": 0.00015593706710745187, + "loss": 1.0284, + "step": 7222 + }, + { + "epoch": 0.69, + "grad_norm": 0.304506700589439, + "learning_rate": 0.00015592395335271316, + "loss": 1.055, + "step": 7223 + }, + { + "epoch": 0.69, + "grad_norm": 0.31053881654022203, + "learning_rate": 0.0001559108381984424, + "loss": 1.1429, + "step": 7224 + }, + { + "epoch": 0.69, + "grad_norm": 0.30691636576138964, + "learning_rate": 0.00015589772164496774, + "loss": 1.1403, + "step": 7225 + }, + { + "epoch": 0.69, + "grad_norm": 0.2833916505818309, + "learning_rate": 0.00015588460369261748, + "loss": 0.9685, + "step": 7226 + }, + { + "epoch": 0.69, + "grad_norm": 0.28649595640599806, + "learning_rate": 0.0001558714843417199, + "loss": 0.9536, + "step": 7227 + }, + { + "epoch": 0.69, + "grad_norm": 0.27284801442293516, + "learning_rate": 0.0001558583635926033, + "loss": 1.1062, + "step": 7228 + }, + { + "epoch": 0.69, + "grad_norm": 0.27007347326614567, + "learning_rate": 0.00015584524144559604, + "loss": 1.1615, + "step": 7229 + }, + { + "epoch": 0.69, + "grad_norm": 0.3039328987168462, + "learning_rate": 0.00015583211790102652, + "loss": 1.1376, + "step": 7230 + }, + { + "epoch": 0.69, + "grad_norm": 0.26912448480042633, + "learning_rate": 0.00015581899295922318, + "loss": 1.0096, + "step": 7231 + }, + { + "epoch": 0.69, + "grad_norm": 0.2653339147614556, + "learning_rate": 0.00015580586662051444, + "loss": 1.0514, + "step": 7232 + }, + { + "epoch": 0.69, + "grad_norm": 0.26697292715319065, + "learning_rate": 0.0001557927388852288, + "loss": 0.9873, + "step": 7233 + }, + { + "epoch": 0.69, + "grad_norm": 0.2937309164974668, + "learning_rate": 0.00015577960975369484, + "loss": 1.1235, + "step": 7234 + }, + { + "epoch": 0.69, + "grad_norm": 0.2781751496250474, + "learning_rate": 0.00015576647922624105, + "loss": 1.0094, + "step": 7235 + }, + { + "epoch": 0.69, + "grad_norm": 0.28016398794516684, + "learning_rate": 0.00015575334730319611, + "loss": 1.1144, + "step": 7236 + }, + { + "epoch": 0.69, + "grad_norm": 0.2756048290515205, + "learning_rate": 0.00015574021398488862, + "loss": 1.0359, + "step": 7237 + }, + { + "epoch": 0.69, + "grad_norm": 0.24131864496426217, + "learning_rate": 0.0001557270792716472, + "loss": 1.0668, + "step": 7238 + }, + { + "epoch": 0.69, + "grad_norm": 0.2840317470056723, + "learning_rate": 0.00015571394316380062, + "loss": 1.1596, + "step": 7239 + }, + { + "epoch": 0.69, + "grad_norm": 0.27376264865951977, + "learning_rate": 0.0001557008056616776, + "loss": 0.9976, + "step": 7240 + }, + { + "epoch": 0.69, + "grad_norm": 0.3019067450221086, + "learning_rate": 0.0001556876667656069, + "loss": 1.0813, + "step": 7241 + }, + { + "epoch": 0.69, + "grad_norm": 0.3162835708665788, + "learning_rate": 0.00015567452647591732, + "loss": 1.1084, + "step": 7242 + }, + { + "epoch": 0.69, + "grad_norm": 0.27896797749608704, + "learning_rate": 0.00015566138479293775, + "loss": 1.0595, + "step": 7243 + }, + { + "epoch": 0.69, + "grad_norm": 0.2992448181482971, + "learning_rate": 0.00015564824171699707, + "loss": 0.9487, + "step": 7244 + }, + { + "epoch": 0.69, + "grad_norm": 0.27553640224406994, + "learning_rate": 0.00015563509724842413, + "loss": 1.0404, + "step": 7245 + }, + { + "epoch": 0.69, + "grad_norm": 0.27552371209875953, + "learning_rate": 0.00015562195138754792, + "loss": 1.1528, + "step": 7246 + }, + { + "epoch": 0.69, + "grad_norm": 0.29426928783634326, + "learning_rate": 0.00015560880413469742, + "loss": 1.0984, + "step": 7247 + }, + { + "epoch": 0.69, + "grad_norm": 0.27853895487179603, + "learning_rate": 0.00015559565549020169, + "loss": 1.1804, + "step": 7248 + }, + { + "epoch": 0.69, + "grad_norm": 0.3136870635528011, + "learning_rate": 0.00015558250545438972, + "loss": 1.0698, + "step": 7249 + }, + { + "epoch": 0.69, + "grad_norm": 0.2624215588147093, + "learning_rate": 0.0001555693540275906, + "loss": 1.1105, + "step": 7250 + }, + { + "epoch": 0.69, + "grad_norm": 0.23782920442038563, + "learning_rate": 0.0001555562012101335, + "loss": 1.1211, + "step": 7251 + }, + { + "epoch": 0.69, + "grad_norm": 0.2695440813802512, + "learning_rate": 0.00015554304700234747, + "loss": 1.0031, + "step": 7252 + }, + { + "epoch": 0.69, + "grad_norm": 0.30147711904941943, + "learning_rate": 0.00015552989140456185, + "loss": 1.1955, + "step": 7253 + }, + { + "epoch": 0.69, + "grad_norm": 0.2430503125575205, + "learning_rate": 0.0001555167344171058, + "loss": 0.983, + "step": 7254 + }, + { + "epoch": 0.69, + "grad_norm": 0.29554732708566633, + "learning_rate": 0.00015550357604030856, + "loss": 1.0728, + "step": 7255 + }, + { + "epoch": 0.69, + "grad_norm": 0.33084597515765934, + "learning_rate": 0.00015549041627449945, + "loss": 1.0754, + "step": 7256 + }, + { + "epoch": 0.69, + "grad_norm": 0.35533675482762406, + "learning_rate": 0.0001554772551200078, + "loss": 1.1181, + "step": 7257 + }, + { + "epoch": 0.69, + "grad_norm": 0.27809709182399867, + "learning_rate": 0.00015546409257716296, + "loss": 1.1706, + "step": 7258 + }, + { + "epoch": 0.69, + "grad_norm": 0.2644018070633185, + "learning_rate": 0.00015545092864629437, + "loss": 1.1147, + "step": 7259 + }, + { + "epoch": 0.69, + "grad_norm": 0.2749285252651721, + "learning_rate": 0.00015543776332773142, + "loss": 1.1475, + "step": 7260 + }, + { + "epoch": 0.69, + "grad_norm": 0.26759109696003713, + "learning_rate": 0.00015542459662180362, + "loss": 1.1167, + "step": 7261 + }, + { + "epoch": 0.69, + "grad_norm": 0.2744664795704848, + "learning_rate": 0.00015541142852884044, + "loss": 0.9624, + "step": 7262 + }, + { + "epoch": 0.69, + "grad_norm": 0.2656603725314074, + "learning_rate": 0.00015539825904917147, + "loss": 1.0158, + "step": 7263 + }, + { + "epoch": 0.69, + "grad_norm": 0.27736864447841364, + "learning_rate": 0.0001553850881831262, + "loss": 1.0009, + "step": 7264 + }, + { + "epoch": 0.7, + "grad_norm": 0.23778024632317593, + "learning_rate": 0.00015537191593103432, + "loss": 1.1653, + "step": 7265 + }, + { + "epoch": 0.7, + "grad_norm": 0.30730004897313506, + "learning_rate": 0.00015535874229322545, + "loss": 1.0293, + "step": 7266 + }, + { + "epoch": 0.7, + "grad_norm": 0.26576264865967103, + "learning_rate": 0.00015534556727002925, + "loss": 1.1149, + "step": 7267 + }, + { + "epoch": 0.7, + "grad_norm": 0.2910068427246134, + "learning_rate": 0.00015533239086177548, + "loss": 1.0587, + "step": 7268 + }, + { + "epoch": 0.7, + "grad_norm": 0.2719761437844997, + "learning_rate": 0.0001553192130687938, + "loss": 1.0378, + "step": 7269 + }, + { + "epoch": 0.7, + "grad_norm": 0.2578029237737743, + "learning_rate": 0.00015530603389141408, + "loss": 1.0591, + "step": 7270 + }, + { + "epoch": 0.7, + "grad_norm": 0.2799149293630919, + "learning_rate": 0.0001552928533299661, + "loss": 1.0374, + "step": 7271 + }, + { + "epoch": 0.7, + "grad_norm": 0.260847576690139, + "learning_rate": 0.00015527967138477967, + "loss": 0.9975, + "step": 7272 + }, + { + "epoch": 0.7, + "grad_norm": 0.2907795091020474, + "learning_rate": 0.00015526648805618478, + "loss": 1.1593, + "step": 7273 + }, + { + "epoch": 0.7, + "grad_norm": 0.22310418172043128, + "learning_rate": 0.00015525330334451127, + "loss": 0.9877, + "step": 7274 + }, + { + "epoch": 0.7, + "grad_norm": 0.29425332767908924, + "learning_rate": 0.00015524011725008912, + "loss": 1.0992, + "step": 7275 + }, + { + "epoch": 0.7, + "grad_norm": 0.3105156950201626, + "learning_rate": 0.0001552269297732483, + "loss": 1.0496, + "step": 7276 + }, + { + "epoch": 0.7, + "grad_norm": 0.2558739957093287, + "learning_rate": 0.00015521374091431888, + "loss": 1.0964, + "step": 7277 + }, + { + "epoch": 0.7, + "grad_norm": 0.28594690539220524, + "learning_rate": 0.00015520055067363089, + "loss": 1.0916, + "step": 7278 + }, + { + "epoch": 0.7, + "grad_norm": 0.3061482044460731, + "learning_rate": 0.00015518735905151442, + "loss": 1.0903, + "step": 7279 + }, + { + "epoch": 0.7, + "grad_norm": 0.3191580668312093, + "learning_rate": 0.00015517416604829962, + "loss": 0.9281, + "step": 7280 + }, + { + "epoch": 0.7, + "grad_norm": 0.26708235106774547, + "learning_rate": 0.00015516097166431663, + "loss": 1.1284, + "step": 7281 + }, + { + "epoch": 0.7, + "grad_norm": 0.2741503886756114, + "learning_rate": 0.00015514777589989564, + "loss": 0.9834, + "step": 7282 + }, + { + "epoch": 0.7, + "grad_norm": 0.2721226495347994, + "learning_rate": 0.00015513457875536692, + "loss": 1.0924, + "step": 7283 + }, + { + "epoch": 0.7, + "grad_norm": 0.2903941706772716, + "learning_rate": 0.0001551213802310607, + "loss": 1.1032, + "step": 7284 + }, + { + "epoch": 0.7, + "grad_norm": 0.26719034153148796, + "learning_rate": 0.0001551081803273073, + "loss": 1.0721, + "step": 7285 + }, + { + "epoch": 0.7, + "grad_norm": 0.2410538608083278, + "learning_rate": 0.00015509497904443706, + "loss": 1.0446, + "step": 7286 + }, + { + "epoch": 0.7, + "grad_norm": 0.288052793290722, + "learning_rate": 0.00015508177638278036, + "loss": 1.0968, + "step": 7287 + }, + { + "epoch": 0.7, + "grad_norm": 0.28114598468656504, + "learning_rate": 0.00015506857234266755, + "loss": 1.2161, + "step": 7288 + }, + { + "epoch": 0.7, + "grad_norm": 0.30363151390349236, + "learning_rate": 0.00015505536692442915, + "loss": 1.1299, + "step": 7289 + }, + { + "epoch": 0.7, + "grad_norm": 0.3430101339177829, + "learning_rate": 0.00015504216012839555, + "loss": 1.057, + "step": 7290 + }, + { + "epoch": 0.7, + "grad_norm": 0.31124798109609725, + "learning_rate": 0.00015502895195489735, + "loss": 1.0329, + "step": 7291 + }, + { + "epoch": 0.7, + "grad_norm": 0.31977891525097923, + "learning_rate": 0.000155015742404265, + "loss": 1.0195, + "step": 7292 + }, + { + "epoch": 0.7, + "grad_norm": 0.2605016731210418, + "learning_rate": 0.00015500253147682913, + "loss": 1.1187, + "step": 7293 + }, + { + "epoch": 0.7, + "grad_norm": 0.25377938560950036, + "learning_rate": 0.00015498931917292037, + "loss": 0.9918, + "step": 7294 + }, + { + "epoch": 0.7, + "grad_norm": 0.3046468563219072, + "learning_rate": 0.0001549761054928693, + "loss": 1.112, + "step": 7295 + }, + { + "epoch": 0.7, + "grad_norm": 0.34132422940513657, + "learning_rate": 0.00015496289043700665, + "loss": 1.0857, + "step": 7296 + }, + { + "epoch": 0.7, + "grad_norm": 0.25593975051636847, + "learning_rate": 0.00015494967400566311, + "loss": 1.0094, + "step": 7297 + }, + { + "epoch": 0.7, + "grad_norm": 0.30660036228331966, + "learning_rate": 0.00015493645619916947, + "loss": 1.0384, + "step": 7298 + }, + { + "epoch": 0.7, + "grad_norm": 0.28934490627014536, + "learning_rate": 0.0001549232370178565, + "loss": 1.1638, + "step": 7299 + }, + { + "epoch": 0.7, + "grad_norm": 0.3082067568322258, + "learning_rate": 0.00015491001646205496, + "loss": 1.1484, + "step": 7300 + }, + { + "epoch": 0.7, + "grad_norm": 0.24608283441694623, + "learning_rate": 0.00015489679453209578, + "loss": 1.0935, + "step": 7301 + }, + { + "epoch": 0.7, + "grad_norm": 0.2878652096204575, + "learning_rate": 0.0001548835712283098, + "loss": 1.0195, + "step": 7302 + }, + { + "epoch": 0.7, + "grad_norm": 0.2445700870996487, + "learning_rate": 0.00015487034655102796, + "loss": 1.0676, + "step": 7303 + }, + { + "epoch": 0.7, + "grad_norm": 0.2792976608554407, + "learning_rate": 0.00015485712050058125, + "loss": 1.0282, + "step": 7304 + }, + { + "epoch": 0.7, + "grad_norm": 0.2792852923073424, + "learning_rate": 0.00015484389307730056, + "loss": 1.0393, + "step": 7305 + }, + { + "epoch": 0.7, + "grad_norm": 0.2921531944021332, + "learning_rate": 0.00015483066428151703, + "loss": 1.1671, + "step": 7306 + }, + { + "epoch": 0.7, + "grad_norm": 0.2840087290455877, + "learning_rate": 0.00015481743411356163, + "loss": 1.0849, + "step": 7307 + }, + { + "epoch": 0.7, + "grad_norm": 0.3040563023087563, + "learning_rate": 0.0001548042025737655, + "loss": 1.1268, + "step": 7308 + }, + { + "epoch": 0.7, + "grad_norm": 0.29355520065474283, + "learning_rate": 0.00015479096966245978, + "loss": 1.03, + "step": 7309 + }, + { + "epoch": 0.7, + "grad_norm": 0.2760436996975324, + "learning_rate": 0.00015477773537997557, + "loss": 1.1871, + "step": 7310 + }, + { + "epoch": 0.7, + "grad_norm": 0.27687371076005574, + "learning_rate": 0.00015476449972664412, + "loss": 1.1466, + "step": 7311 + }, + { + "epoch": 0.7, + "grad_norm": 0.2845099189305587, + "learning_rate": 0.00015475126270279667, + "loss": 1.0636, + "step": 7312 + }, + { + "epoch": 0.7, + "grad_norm": 0.2505523226102105, + "learning_rate": 0.00015473802430876444, + "loss": 1.0668, + "step": 7313 + }, + { + "epoch": 0.7, + "grad_norm": 0.32837255120480047, + "learning_rate": 0.00015472478454487876, + "loss": 1.1224, + "step": 7314 + }, + { + "epoch": 0.7, + "grad_norm": 0.26481048965460596, + "learning_rate": 0.00015471154341147094, + "loss": 1.0428, + "step": 7315 + }, + { + "epoch": 0.7, + "grad_norm": 0.2690353020587212, + "learning_rate": 0.00015469830090887235, + "loss": 1.0432, + "step": 7316 + }, + { + "epoch": 0.7, + "grad_norm": 0.27626600703824794, + "learning_rate": 0.00015468505703741442, + "loss": 1.0638, + "step": 7317 + }, + { + "epoch": 0.7, + "grad_norm": 0.3065923845948702, + "learning_rate": 0.00015467181179742857, + "loss": 0.914, + "step": 7318 + }, + { + "epoch": 0.7, + "grad_norm": 0.2829656174177215, + "learning_rate": 0.0001546585651892463, + "loss": 1.0586, + "step": 7319 + }, + { + "epoch": 0.7, + "grad_norm": 0.277648311025837, + "learning_rate": 0.00015464531721319903, + "loss": 0.9368, + "step": 7320 + }, + { + "epoch": 0.7, + "grad_norm": 0.26414058914433497, + "learning_rate": 0.00015463206786961838, + "loss": 0.9696, + "step": 7321 + }, + { + "epoch": 0.7, + "grad_norm": 0.3161051076868171, + "learning_rate": 0.0001546188171588359, + "loss": 1.0229, + "step": 7322 + }, + { + "epoch": 0.7, + "grad_norm": 0.2799019895774647, + "learning_rate": 0.0001546055650811832, + "loss": 1.1345, + "step": 7323 + }, + { + "epoch": 0.7, + "grad_norm": 0.305865812690483, + "learning_rate": 0.0001545923116369919, + "loss": 1.0023, + "step": 7324 + }, + { + "epoch": 0.7, + "grad_norm": 0.30382443041697793, + "learning_rate": 0.00015457905682659368, + "loss": 1.0488, + "step": 7325 + }, + { + "epoch": 0.7, + "grad_norm": 0.31386464628323874, + "learning_rate": 0.0001545658006503203, + "loss": 1.0361, + "step": 7326 + }, + { + "epoch": 0.7, + "grad_norm": 0.27019240058841637, + "learning_rate": 0.00015455254310850345, + "loss": 1.0565, + "step": 7327 + }, + { + "epoch": 0.7, + "grad_norm": 0.266426547020745, + "learning_rate": 0.0001545392842014749, + "loss": 0.9752, + "step": 7328 + }, + { + "epoch": 0.7, + "grad_norm": 0.2995784354581383, + "learning_rate": 0.0001545260239295665, + "loss": 1.0005, + "step": 7329 + }, + { + "epoch": 0.7, + "grad_norm": 0.317973794140023, + "learning_rate": 0.0001545127622931101, + "loss": 1.0538, + "step": 7330 + }, + { + "epoch": 0.7, + "grad_norm": 0.2947298100738568, + "learning_rate": 0.00015449949929243755, + "loss": 1.0116, + "step": 7331 + }, + { + "epoch": 0.7, + "grad_norm": 0.3348856644428043, + "learning_rate": 0.00015448623492788076, + "loss": 0.9802, + "step": 7332 + }, + { + "epoch": 0.7, + "grad_norm": 0.2989455629929353, + "learning_rate": 0.00015447296919977172, + "loss": 1.0376, + "step": 7333 + }, + { + "epoch": 0.7, + "grad_norm": 0.3011061423556309, + "learning_rate": 0.0001544597021084424, + "loss": 1.0908, + "step": 7334 + }, + { + "epoch": 0.7, + "grad_norm": 0.3049819395201172, + "learning_rate": 0.00015444643365422478, + "loss": 1.0768, + "step": 7335 + }, + { + "epoch": 0.7, + "grad_norm": 0.26542548997246496, + "learning_rate": 0.00015443316383745095, + "loss": 1.024, + "step": 7336 + }, + { + "epoch": 0.7, + "grad_norm": 0.2797529721086807, + "learning_rate": 0.00015441989265845297, + "loss": 1.072, + "step": 7337 + }, + { + "epoch": 0.7, + "grad_norm": 0.2538780155773843, + "learning_rate": 0.000154406620117563, + "loss": 1.038, + "step": 7338 + }, + { + "epoch": 0.7, + "grad_norm": 0.2831186717367567, + "learning_rate": 0.00015439334621511318, + "loss": 1.0638, + "step": 7339 + }, + { + "epoch": 0.7, + "grad_norm": 0.28787268851064585, + "learning_rate": 0.00015438007095143567, + "loss": 1.0426, + "step": 7340 + }, + { + "epoch": 0.7, + "grad_norm": 0.2948120261226757, + "learning_rate": 0.0001543667943268627, + "loss": 1.1221, + "step": 7341 + }, + { + "epoch": 0.7, + "grad_norm": 0.32242230412435835, + "learning_rate": 0.00015435351634172654, + "loss": 1.0958, + "step": 7342 + }, + { + "epoch": 0.7, + "grad_norm": 0.2654273360662479, + "learning_rate": 0.00015434023699635948, + "loss": 1.0071, + "step": 7343 + }, + { + "epoch": 0.7, + "grad_norm": 0.27327197799382885, + "learning_rate": 0.00015432695629109385, + "loss": 1.0576, + "step": 7344 + }, + { + "epoch": 0.7, + "grad_norm": 0.2754942416375848, + "learning_rate": 0.00015431367422626195, + "loss": 1.1498, + "step": 7345 + }, + { + "epoch": 0.7, + "grad_norm": 0.27801354524717076, + "learning_rate": 0.00015430039080219625, + "loss": 1.1084, + "step": 7346 + }, + { + "epoch": 0.7, + "grad_norm": 0.2914103943454639, + "learning_rate": 0.00015428710601922914, + "loss": 1.0507, + "step": 7347 + }, + { + "epoch": 0.7, + "grad_norm": 0.2638150945585758, + "learning_rate": 0.00015427381987769307, + "loss": 1.0324, + "step": 7348 + }, + { + "epoch": 0.7, + "grad_norm": 0.2526962446018732, + "learning_rate": 0.0001542605323779206, + "loss": 0.9352, + "step": 7349 + }, + { + "epoch": 0.7, + "grad_norm": 0.2595273949493344, + "learning_rate": 0.00015424724352024413, + "loss": 1.0355, + "step": 7350 + }, + { + "epoch": 0.7, + "grad_norm": 0.28782174706723024, + "learning_rate": 0.00015423395330499632, + "loss": 1.1083, + "step": 7351 + }, + { + "epoch": 0.7, + "grad_norm": 0.3115649702709986, + "learning_rate": 0.00015422066173250974, + "loss": 1.0422, + "step": 7352 + }, + { + "epoch": 0.7, + "grad_norm": 0.29111738038165, + "learning_rate": 0.00015420736880311707, + "loss": 1.0315, + "step": 7353 + }, + { + "epoch": 0.7, + "grad_norm": 0.28032086546612484, + "learning_rate": 0.00015419407451715088, + "loss": 1.1888, + "step": 7354 + }, + { + "epoch": 0.7, + "grad_norm": 0.2716417113526186, + "learning_rate": 0.00015418077887494394, + "loss": 1.0331, + "step": 7355 + }, + { + "epoch": 0.7, + "grad_norm": 0.28116455658959255, + "learning_rate": 0.00015416748187682897, + "loss": 1.0433, + "step": 7356 + }, + { + "epoch": 0.7, + "grad_norm": 0.3165740582098866, + "learning_rate": 0.00015415418352313868, + "loss": 1.0488, + "step": 7357 + }, + { + "epoch": 0.7, + "grad_norm": 0.24167939460825494, + "learning_rate": 0.00015414088381420594, + "loss": 1.1047, + "step": 7358 + }, + { + "epoch": 0.7, + "grad_norm": 0.2362202925878594, + "learning_rate": 0.00015412758275036356, + "loss": 1.111, + "step": 7359 + }, + { + "epoch": 0.7, + "grad_norm": 0.30927705441472164, + "learning_rate": 0.0001541142803319444, + "loss": 0.9948, + "step": 7360 + }, + { + "epoch": 0.7, + "grad_norm": 0.26694430612905373, + "learning_rate": 0.00015410097655928136, + "loss": 1.2141, + "step": 7361 + }, + { + "epoch": 0.7, + "grad_norm": 0.28735202961238854, + "learning_rate": 0.00015408767143270738, + "loss": 1.0482, + "step": 7362 + }, + { + "epoch": 0.7, + "grad_norm": 0.3028944111263906, + "learning_rate": 0.00015407436495255543, + "loss": 0.9829, + "step": 7363 + }, + { + "epoch": 0.7, + "grad_norm": 0.3036916238157443, + "learning_rate": 0.0001540610571191585, + "loss": 1.1743, + "step": 7364 + }, + { + "epoch": 0.7, + "grad_norm": 0.2897890002077795, + "learning_rate": 0.00015404774793284967, + "loss": 1.0033, + "step": 7365 + }, + { + "epoch": 0.7, + "grad_norm": 0.29386071816379244, + "learning_rate": 0.00015403443739396195, + "loss": 1.1666, + "step": 7366 + }, + { + "epoch": 0.7, + "grad_norm": 0.32062513986248775, + "learning_rate": 0.0001540211255028285, + "loss": 1.141, + "step": 7367 + }, + { + "epoch": 0.7, + "grad_norm": 0.3180897552174907, + "learning_rate": 0.00015400781225978242, + "loss": 1.0144, + "step": 7368 + }, + { + "epoch": 0.7, + "grad_norm": 0.27245479542574785, + "learning_rate": 0.00015399449766515688, + "loss": 1.0604, + "step": 7369 + }, + { + "epoch": 0.71, + "grad_norm": 0.2915213084795467, + "learning_rate": 0.00015398118171928516, + "loss": 1.1338, + "step": 7370 + }, + { + "epoch": 0.71, + "grad_norm": 0.23168432157865712, + "learning_rate": 0.00015396786442250035, + "loss": 1.0496, + "step": 7371 + }, + { + "epoch": 0.71, + "grad_norm": 0.2782432262911835, + "learning_rate": 0.0001539545457751359, + "loss": 1.0979, + "step": 7372 + }, + { + "epoch": 0.71, + "grad_norm": 0.27977300422557033, + "learning_rate": 0.000153941225777525, + "loss": 1.0744, + "step": 7373 + }, + { + "epoch": 0.71, + "grad_norm": 0.2545144680006588, + "learning_rate": 0.000153927904430001, + "loss": 1.0237, + "step": 7374 + }, + { + "epoch": 0.71, + "grad_norm": 0.23952050585666343, + "learning_rate": 0.00015391458173289734, + "loss": 1.1329, + "step": 7375 + }, + { + "epoch": 0.71, + "grad_norm": 0.27650814084254066, + "learning_rate": 0.00015390125768654738, + "loss": 1.128, + "step": 7376 + }, + { + "epoch": 0.71, + "grad_norm": 0.2957362698116198, + "learning_rate": 0.00015388793229128455, + "loss": 1.1255, + "step": 7377 + }, + { + "epoch": 0.71, + "grad_norm": 0.24485467093779958, + "learning_rate": 0.00015387460554744235, + "loss": 1.0197, + "step": 7378 + }, + { + "epoch": 0.71, + "grad_norm": 0.25214439017148976, + "learning_rate": 0.0001538612774553543, + "loss": 1.0601, + "step": 7379 + }, + { + "epoch": 0.71, + "grad_norm": 0.282880344840166, + "learning_rate": 0.00015384794801535394, + "loss": 1.1426, + "step": 7380 + }, + { + "epoch": 0.71, + "grad_norm": 0.3152140814752052, + "learning_rate": 0.00015383461722777482, + "loss": 1.0964, + "step": 7381 + }, + { + "epoch": 0.71, + "grad_norm": 0.24688971723773373, + "learning_rate": 0.00015382128509295058, + "loss": 1.0926, + "step": 7382 + }, + { + "epoch": 0.71, + "grad_norm": 0.2959651478445274, + "learning_rate": 0.00015380795161121485, + "loss": 1.0349, + "step": 7383 + }, + { + "epoch": 0.71, + "grad_norm": 0.29305053218157, + "learning_rate": 0.0001537946167829013, + "loss": 1.1347, + "step": 7384 + }, + { + "epoch": 0.71, + "grad_norm": 0.2717846083673245, + "learning_rate": 0.00015378128060834366, + "loss": 1.0096, + "step": 7385 + }, + { + "epoch": 0.71, + "grad_norm": 0.2620308073344996, + "learning_rate": 0.0001537679430878757, + "loss": 1.0911, + "step": 7386 + }, + { + "epoch": 0.71, + "grad_norm": 0.2531071813077481, + "learning_rate": 0.00015375460422183116, + "loss": 0.9925, + "step": 7387 + }, + { + "epoch": 0.71, + "grad_norm": 0.2797391994034549, + "learning_rate": 0.00015374126401054383, + "loss": 1.1076, + "step": 7388 + }, + { + "epoch": 0.71, + "grad_norm": 0.2727192530520494, + "learning_rate": 0.00015372792245434765, + "loss": 1.0334, + "step": 7389 + }, + { + "epoch": 0.71, + "grad_norm": 0.29997950673640783, + "learning_rate": 0.00015371457955357643, + "loss": 0.9707, + "step": 7390 + }, + { + "epoch": 0.71, + "grad_norm": 0.2816972910580794, + "learning_rate": 0.00015370123530856407, + "loss": 1.1254, + "step": 7391 + }, + { + "epoch": 0.71, + "grad_norm": 0.2959827743331956, + "learning_rate": 0.00015368788971964454, + "loss": 1.0022, + "step": 7392 + }, + { + "epoch": 0.71, + "grad_norm": 0.3027989628861647, + "learning_rate": 0.0001536745427871519, + "loss": 1.0555, + "step": 7393 + }, + { + "epoch": 0.71, + "grad_norm": 0.27628323794583914, + "learning_rate": 0.00015366119451142002, + "loss": 0.985, + "step": 7394 + }, + { + "epoch": 0.71, + "grad_norm": 0.3098642640517758, + "learning_rate": 0.00015364784489278304, + "loss": 1.0772, + "step": 7395 + }, + { + "epoch": 0.71, + "grad_norm": 0.3045327679601191, + "learning_rate": 0.00015363449393157504, + "loss": 1.1383, + "step": 7396 + }, + { + "epoch": 0.71, + "grad_norm": 0.27647845171066426, + "learning_rate": 0.00015362114162813012, + "loss": 1.0514, + "step": 7397 + }, + { + "epoch": 0.71, + "grad_norm": 0.27708090315989137, + "learning_rate": 0.00015360778798278243, + "loss": 0.9376, + "step": 7398 + }, + { + "epoch": 0.71, + "grad_norm": 0.2903239285477044, + "learning_rate": 0.00015359443299586614, + "loss": 1.1016, + "step": 7399 + }, + { + "epoch": 0.71, + "grad_norm": 0.3010839768456833, + "learning_rate": 0.0001535810766677155, + "loss": 1.0842, + "step": 7400 + }, + { + "epoch": 0.71, + "grad_norm": 0.276746949232895, + "learning_rate": 0.0001535677189986647, + "loss": 1.2018, + "step": 7401 + }, + { + "epoch": 0.71, + "grad_norm": 0.29483934180185845, + "learning_rate": 0.0001535543599890481, + "loss": 1.0932, + "step": 7402 + }, + { + "epoch": 0.71, + "grad_norm": 0.2944209969280942, + "learning_rate": 0.0001535409996392, + "loss": 1.1482, + "step": 7403 + }, + { + "epoch": 0.71, + "grad_norm": 0.31126699379727446, + "learning_rate": 0.0001535276379494547, + "loss": 1.1388, + "step": 7404 + }, + { + "epoch": 0.71, + "grad_norm": 0.2843244483748962, + "learning_rate": 0.00015351427492014662, + "loss": 1.122, + "step": 7405 + }, + { + "epoch": 0.71, + "grad_norm": 0.2940291993707008, + "learning_rate": 0.00015350091055161023, + "loss": 0.8989, + "step": 7406 + }, + { + "epoch": 0.71, + "grad_norm": 0.3351477452765215, + "learning_rate": 0.0001534875448441799, + "loss": 1.1108, + "step": 7407 + }, + { + "epoch": 0.71, + "grad_norm": 0.2906887381085749, + "learning_rate": 0.0001534741777981901, + "loss": 0.9912, + "step": 7408 + }, + { + "epoch": 0.71, + "grad_norm": 0.3122468085486279, + "learning_rate": 0.00015346080941397544, + "loss": 1.113, + "step": 7409 + }, + { + "epoch": 0.71, + "grad_norm": 0.28278667430577964, + "learning_rate": 0.00015344743969187042, + "loss": 1.0403, + "step": 7410 + }, + { + "epoch": 0.71, + "grad_norm": 0.27569491315509187, + "learning_rate": 0.00015343406863220962, + "loss": 1.121, + "step": 7411 + }, + { + "epoch": 0.71, + "grad_norm": 0.2804750598763221, + "learning_rate": 0.0001534206962353277, + "loss": 1.0365, + "step": 7412 + }, + { + "epoch": 0.71, + "grad_norm": 0.2524239751697441, + "learning_rate": 0.00015340732250155927, + "loss": 0.9917, + "step": 7413 + }, + { + "epoch": 0.71, + "grad_norm": 0.29005569119447944, + "learning_rate": 0.000153393947431239, + "loss": 1.1511, + "step": 7414 + }, + { + "epoch": 0.71, + "grad_norm": 0.2749251352441248, + "learning_rate": 0.00015338057102470164, + "loss": 1.0658, + "step": 7415 + }, + { + "epoch": 0.71, + "grad_norm": 0.2838189917053858, + "learning_rate": 0.000153367193282282, + "loss": 1.0391, + "step": 7416 + }, + { + "epoch": 0.71, + "grad_norm": 0.3244226146833391, + "learning_rate": 0.00015335381420431476, + "loss": 1.1094, + "step": 7417 + }, + { + "epoch": 0.71, + "grad_norm": 0.2574324902880777, + "learning_rate": 0.0001533404337911348, + "loss": 1.1391, + "step": 7418 + }, + { + "epoch": 0.71, + "grad_norm": 0.30913323555851724, + "learning_rate": 0.00015332705204307696, + "loss": 1.0444, + "step": 7419 + }, + { + "epoch": 0.71, + "grad_norm": 0.2902578699002574, + "learning_rate": 0.00015331366896047613, + "loss": 0.9731, + "step": 7420 + }, + { + "epoch": 0.71, + "grad_norm": 0.2736910311823876, + "learning_rate": 0.00015330028454366723, + "loss": 1.096, + "step": 7421 + }, + { + "epoch": 0.71, + "grad_norm": 0.2595210721180384, + "learning_rate": 0.0001532868987929852, + "loss": 1.0444, + "step": 7422 + }, + { + "epoch": 0.71, + "grad_norm": 0.2691043276914828, + "learning_rate": 0.00015327351170876504, + "loss": 0.9763, + "step": 7423 + }, + { + "epoch": 0.71, + "grad_norm": 0.2625617634998602, + "learning_rate": 0.00015326012329134177, + "loss": 1.0655, + "step": 7424 + }, + { + "epoch": 0.71, + "grad_norm": 0.3040586098635329, + "learning_rate": 0.00015324673354105044, + "loss": 1.0892, + "step": 7425 + }, + { + "epoch": 0.71, + "grad_norm": 0.2688279407074845, + "learning_rate": 0.00015323334245822613, + "loss": 1.0881, + "step": 7426 + }, + { + "epoch": 0.71, + "grad_norm": 0.2772663570425649, + "learning_rate": 0.00015321995004320398, + "loss": 1.0775, + "step": 7427 + }, + { + "epoch": 0.71, + "grad_norm": 0.28558771196591126, + "learning_rate": 0.00015320655629631915, + "loss": 1.1682, + "step": 7428 + }, + { + "epoch": 0.71, + "grad_norm": 0.3068876838609169, + "learning_rate": 0.00015319316121790676, + "loss": 1.1325, + "step": 7429 + }, + { + "epoch": 0.71, + "grad_norm": 0.25376782398833597, + "learning_rate": 0.00015317976480830214, + "loss": 1.051, + "step": 7430 + }, + { + "epoch": 0.71, + "grad_norm": 0.26689013628589564, + "learning_rate": 0.00015316636706784047, + "loss": 1.0535, + "step": 7431 + }, + { + "epoch": 0.71, + "grad_norm": 0.27913817719785716, + "learning_rate": 0.00015315296799685703, + "loss": 1.0799, + "step": 7432 + }, + { + "epoch": 0.71, + "grad_norm": 0.27105117018470604, + "learning_rate": 0.00015313956759568717, + "loss": 0.9799, + "step": 7433 + }, + { + "epoch": 0.71, + "grad_norm": 0.2650201668901937, + "learning_rate": 0.00015312616586466625, + "loss": 1.0213, + "step": 7434 + }, + { + "epoch": 0.71, + "grad_norm": 0.2597414213724003, + "learning_rate": 0.0001531127628041296, + "loss": 1.0696, + "step": 7435 + }, + { + "epoch": 0.71, + "grad_norm": 0.2720574465290946, + "learning_rate": 0.0001530993584144127, + "loss": 1.0064, + "step": 7436 + }, + { + "epoch": 0.71, + "grad_norm": 0.2926790787938213, + "learning_rate": 0.000153085952695851, + "loss": 1.0582, + "step": 7437 + }, + { + "epoch": 0.71, + "grad_norm": 0.30451030733173934, + "learning_rate": 0.00015307254564877996, + "loss": 1.0364, + "step": 7438 + }, + { + "epoch": 0.71, + "grad_norm": 0.2969242563944365, + "learning_rate": 0.00015305913727353508, + "loss": 0.9929, + "step": 7439 + }, + { + "epoch": 0.71, + "grad_norm": 0.2843406055001775, + "learning_rate": 0.000153045727570452, + "loss": 1.0706, + "step": 7440 + }, + { + "epoch": 0.71, + "grad_norm": 0.27079473763903633, + "learning_rate": 0.0001530323165398662, + "loss": 1.13, + "step": 7441 + }, + { + "epoch": 0.71, + "grad_norm": 0.31673668159080653, + "learning_rate": 0.00015301890418211337, + "loss": 1.033, + "step": 7442 + }, + { + "epoch": 0.71, + "grad_norm": 0.25852536375912855, + "learning_rate": 0.00015300549049752915, + "loss": 0.9558, + "step": 7443 + }, + { + "epoch": 0.71, + "grad_norm": 0.2610535794849494, + "learning_rate": 0.00015299207548644922, + "loss": 0.9075, + "step": 7444 + }, + { + "epoch": 0.71, + "grad_norm": 0.3144932740991477, + "learning_rate": 0.0001529786591492093, + "loss": 1.088, + "step": 7445 + }, + { + "epoch": 0.71, + "grad_norm": 0.3002199992017584, + "learning_rate": 0.0001529652414861451, + "loss": 1.0292, + "step": 7446 + }, + { + "epoch": 0.71, + "grad_norm": 0.29137989165257083, + "learning_rate": 0.00015295182249759246, + "loss": 1.1373, + "step": 7447 + }, + { + "epoch": 0.71, + "grad_norm": 0.3000740319743772, + "learning_rate": 0.0001529384021838872, + "loss": 1.0349, + "step": 7448 + }, + { + "epoch": 0.71, + "grad_norm": 0.37952107317617517, + "learning_rate": 0.00015292498054536515, + "loss": 1.027, + "step": 7449 + }, + { + "epoch": 0.71, + "grad_norm": 0.2658313512672852, + "learning_rate": 0.00015291155758236219, + "loss": 1.0365, + "step": 7450 + }, + { + "epoch": 0.71, + "grad_norm": 0.30711739440319363, + "learning_rate": 0.00015289813329521427, + "loss": 1.0623, + "step": 7451 + }, + { + "epoch": 0.71, + "grad_norm": 0.26947346016409285, + "learning_rate": 0.0001528847076842573, + "loss": 1.0008, + "step": 7452 + }, + { + "epoch": 0.71, + "grad_norm": 0.294244616915276, + "learning_rate": 0.00015287128074982728, + "loss": 1.1527, + "step": 7453 + }, + { + "epoch": 0.71, + "grad_norm": 0.26434224087661645, + "learning_rate": 0.00015285785249226025, + "loss": 1.0076, + "step": 7454 + }, + { + "epoch": 0.71, + "grad_norm": 0.2638878195945205, + "learning_rate": 0.00015284442291189224, + "loss": 1.0129, + "step": 7455 + }, + { + "epoch": 0.71, + "grad_norm": 0.291822498691914, + "learning_rate": 0.0001528309920090593, + "loss": 1.0994, + "step": 7456 + }, + { + "epoch": 0.71, + "grad_norm": 0.24079989611181216, + "learning_rate": 0.00015281755978409763, + "loss": 0.9589, + "step": 7457 + }, + { + "epoch": 0.71, + "grad_norm": 0.26439658524128423, + "learning_rate": 0.00015280412623734331, + "loss": 1.0177, + "step": 7458 + }, + { + "epoch": 0.71, + "grad_norm": 0.28359751564704094, + "learning_rate": 0.00015279069136913252, + "loss": 1.0652, + "step": 7459 + }, + { + "epoch": 0.71, + "grad_norm": 0.27983466260304535, + "learning_rate": 0.00015277725517980152, + "loss": 1.0485, + "step": 7460 + }, + { + "epoch": 0.71, + "grad_norm": 0.29708736109513595, + "learning_rate": 0.00015276381766968656, + "loss": 1.074, + "step": 7461 + }, + { + "epoch": 0.71, + "grad_norm": 0.29289233835975614, + "learning_rate": 0.0001527503788391239, + "loss": 1.1547, + "step": 7462 + }, + { + "epoch": 0.71, + "grad_norm": 0.2505395055508232, + "learning_rate": 0.00015273693868844983, + "loss": 0.9737, + "step": 7463 + }, + { + "epoch": 0.71, + "grad_norm": 0.32901669963923147, + "learning_rate": 0.00015272349721800075, + "loss": 1.0536, + "step": 7464 + }, + { + "epoch": 0.71, + "grad_norm": 0.256290518603839, + "learning_rate": 0.000152710054428113, + "loss": 1.1606, + "step": 7465 + }, + { + "epoch": 0.71, + "grad_norm": 0.26769516189713743, + "learning_rate": 0.000152696610319123, + "loss": 0.9284, + "step": 7466 + }, + { + "epoch": 0.71, + "grad_norm": 0.2644771395655536, + "learning_rate": 0.00015268316489136722, + "loss": 1.1177, + "step": 7467 + }, + { + "epoch": 0.71, + "grad_norm": 0.2806512107610077, + "learning_rate": 0.00015266971814518213, + "loss": 0.9686, + "step": 7468 + }, + { + "epoch": 0.71, + "grad_norm": 0.2845865397349829, + "learning_rate": 0.00015265627008090424, + "loss": 1.0744, + "step": 7469 + }, + { + "epoch": 0.71, + "grad_norm": 0.26504898270571114, + "learning_rate": 0.00015264282069887012, + "loss": 1.1322, + "step": 7470 + }, + { + "epoch": 0.71, + "grad_norm": 0.2916082135751575, + "learning_rate": 0.0001526293699994163, + "loss": 1.103, + "step": 7471 + }, + { + "epoch": 0.71, + "grad_norm": 0.31144997066611424, + "learning_rate": 0.00015261591798287945, + "loss": 1.0014, + "step": 7472 + }, + { + "epoch": 0.71, + "grad_norm": 0.2574195705247452, + "learning_rate": 0.00015260246464959614, + "loss": 1.1275, + "step": 7473 + }, + { + "epoch": 0.72, + "grad_norm": 0.31598031433107676, + "learning_rate": 0.00015258900999990313, + "loss": 0.9769, + "step": 7474 + }, + { + "epoch": 0.72, + "grad_norm": 0.2733626438470682, + "learning_rate": 0.00015257555403413707, + "loss": 1.1188, + "step": 7475 + }, + { + "epoch": 0.72, + "grad_norm": 0.27826283583830214, + "learning_rate": 0.00015256209675263473, + "loss": 0.9995, + "step": 7476 + }, + { + "epoch": 0.72, + "grad_norm": 0.3089275459108109, + "learning_rate": 0.0001525486381557329, + "loss": 1.0316, + "step": 7477 + }, + { + "epoch": 0.72, + "grad_norm": 0.2928951184842569, + "learning_rate": 0.00015253517824376838, + "loss": 1.0514, + "step": 7478 + }, + { + "epoch": 0.72, + "grad_norm": 0.26365816516744844, + "learning_rate": 0.00015252171701707798, + "loss": 1.0454, + "step": 7479 + }, + { + "epoch": 0.72, + "grad_norm": 0.28372734957336343, + "learning_rate": 0.00015250825447599863, + "loss": 1.0397, + "step": 7480 + }, + { + "epoch": 0.72, + "grad_norm": 0.24476440536696728, + "learning_rate": 0.0001524947906208672, + "loss": 1.1185, + "step": 7481 + }, + { + "epoch": 0.72, + "grad_norm": 0.29553606635449614, + "learning_rate": 0.00015248132545202066, + "loss": 1.0536, + "step": 7482 + }, + { + "epoch": 0.72, + "grad_norm": 0.28192161411982414, + "learning_rate": 0.00015246785896979592, + "loss": 1.0789, + "step": 7483 + }, + { + "epoch": 0.72, + "grad_norm": 0.28836399016991415, + "learning_rate": 0.00015245439117453005, + "loss": 1.1376, + "step": 7484 + }, + { + "epoch": 0.72, + "grad_norm": 0.285252586401288, + "learning_rate": 0.00015244092206656012, + "loss": 1.0715, + "step": 7485 + }, + { + "epoch": 0.72, + "grad_norm": 0.34180059296617976, + "learning_rate": 0.0001524274516462231, + "loss": 1.0974, + "step": 7486 + }, + { + "epoch": 0.72, + "grad_norm": 0.28411030591653186, + "learning_rate": 0.0001524139799138562, + "loss": 1.1046, + "step": 7487 + }, + { + "epoch": 0.72, + "grad_norm": 0.2688829534104327, + "learning_rate": 0.00015240050686979648, + "loss": 1.0534, + "step": 7488 + }, + { + "epoch": 0.72, + "grad_norm": 0.3171051572047796, + "learning_rate": 0.00015238703251438116, + "loss": 0.9673, + "step": 7489 + }, + { + "epoch": 0.72, + "grad_norm": 0.2715311978491321, + "learning_rate": 0.00015237355684794742, + "loss": 1.0009, + "step": 7490 + }, + { + "epoch": 0.72, + "grad_norm": 0.3107037826927023, + "learning_rate": 0.0001523600798708325, + "loss": 1.1782, + "step": 7491 + }, + { + "epoch": 0.72, + "grad_norm": 0.3004283097955692, + "learning_rate": 0.00015234660158337367, + "loss": 1.1222, + "step": 7492 + }, + { + "epoch": 0.72, + "grad_norm": 0.34256065220175824, + "learning_rate": 0.00015233312198590824, + "loss": 1.1572, + "step": 7493 + }, + { + "epoch": 0.72, + "grad_norm": 0.30264641686373545, + "learning_rate": 0.00015231964107877355, + "loss": 0.9944, + "step": 7494 + }, + { + "epoch": 0.72, + "grad_norm": 0.29115852098449335, + "learning_rate": 0.00015230615886230696, + "loss": 1.0331, + "step": 7495 + }, + { + "epoch": 0.72, + "grad_norm": 0.2663234013844971, + "learning_rate": 0.00015229267533684588, + "loss": 0.9686, + "step": 7496 + }, + { + "epoch": 0.72, + "grad_norm": 0.2706122042940165, + "learning_rate": 0.00015227919050272775, + "loss": 1.029, + "step": 7497 + }, + { + "epoch": 0.72, + "grad_norm": 0.28192878540642374, + "learning_rate": 0.00015226570436028996, + "loss": 1.0831, + "step": 7498 + }, + { + "epoch": 0.72, + "grad_norm": 0.3136140113978119, + "learning_rate": 0.00015225221690987013, + "loss": 1.1534, + "step": 7499 + }, + { + "epoch": 0.72, + "grad_norm": 0.29044795123500067, + "learning_rate": 0.0001522387281518057, + "loss": 1.0986, + "step": 7500 + }, + { + "epoch": 0.72, + "grad_norm": 0.25709365283504254, + "learning_rate": 0.00015222523808643428, + "loss": 1.1544, + "step": 7501 + }, + { + "epoch": 0.72, + "grad_norm": 0.27642043583466885, + "learning_rate": 0.00015221174671409347, + "loss": 1.0419, + "step": 7502 + }, + { + "epoch": 0.72, + "grad_norm": 0.28235849443833444, + "learning_rate": 0.00015219825403512086, + "loss": 1.1054, + "step": 7503 + }, + { + "epoch": 0.72, + "grad_norm": 0.3347988942349422, + "learning_rate": 0.00015218476004985414, + "loss": 1.1444, + "step": 7504 + }, + { + "epoch": 0.72, + "grad_norm": 0.28011799381802555, + "learning_rate": 0.000152171264758631, + "loss": 1.0586, + "step": 7505 + }, + { + "epoch": 0.72, + "grad_norm": 0.29631516225770976, + "learning_rate": 0.00015215776816178918, + "loss": 1.066, + "step": 7506 + }, + { + "epoch": 0.72, + "grad_norm": 0.312587038609111, + "learning_rate": 0.00015214427025966642, + "loss": 1.06, + "step": 7507 + }, + { + "epoch": 0.72, + "grad_norm": 0.26267971626273046, + "learning_rate": 0.00015213077105260053, + "loss": 0.99, + "step": 7508 + }, + { + "epoch": 0.72, + "grad_norm": 0.29362317699509005, + "learning_rate": 0.00015211727054092932, + "loss": 1.0374, + "step": 7509 + }, + { + "epoch": 0.72, + "grad_norm": 0.2743031690741454, + "learning_rate": 0.00015210376872499068, + "loss": 0.9676, + "step": 7510 + }, + { + "epoch": 0.72, + "grad_norm": 0.3008169494678444, + "learning_rate": 0.0001520902656051225, + "loss": 1.0962, + "step": 7511 + }, + { + "epoch": 0.72, + "grad_norm": 0.2510179331269226, + "learning_rate": 0.00015207676118166266, + "loss": 1.0657, + "step": 7512 + }, + { + "epoch": 0.72, + "grad_norm": 0.3006570781756079, + "learning_rate": 0.00015206325545494913, + "loss": 1.1252, + "step": 7513 + }, + { + "epoch": 0.72, + "grad_norm": 0.3074556798522242, + "learning_rate": 0.00015204974842531995, + "loss": 1.1561, + "step": 7514 + }, + { + "epoch": 0.72, + "grad_norm": 0.3251122148768213, + "learning_rate": 0.00015203624009311307, + "loss": 1.1283, + "step": 7515 + }, + { + "epoch": 0.72, + "grad_norm": 0.2595576229638888, + "learning_rate": 0.0001520227304586666, + "loss": 0.9711, + "step": 7516 + }, + { + "epoch": 0.72, + "grad_norm": 0.29928452954008067, + "learning_rate": 0.00015200921952231858, + "loss": 1.0824, + "step": 7517 + }, + { + "epoch": 0.72, + "grad_norm": 0.3139867616135075, + "learning_rate": 0.0001519957072844072, + "loss": 1.1125, + "step": 7518 + }, + { + "epoch": 0.72, + "grad_norm": 0.29374530619599887, + "learning_rate": 0.00015198219374527053, + "loss": 1.1183, + "step": 7519 + }, + { + "epoch": 0.72, + "grad_norm": 0.2952280509086445, + "learning_rate": 0.00015196867890524676, + "loss": 1.0937, + "step": 7520 + }, + { + "epoch": 0.72, + "grad_norm": 0.2826852773745996, + "learning_rate": 0.00015195516276467422, + "loss": 1.1857, + "step": 7521 + }, + { + "epoch": 0.72, + "grad_norm": 0.2803858395978367, + "learning_rate": 0.000151941645323891, + "loss": 0.9876, + "step": 7522 + }, + { + "epoch": 0.72, + "grad_norm": 0.27577711685224354, + "learning_rate": 0.00015192812658323552, + "loss": 1.0477, + "step": 7523 + }, + { + "epoch": 0.72, + "grad_norm": 0.29937814744106744, + "learning_rate": 0.00015191460654304602, + "loss": 1.0781, + "step": 7524 + }, + { + "epoch": 0.72, + "grad_norm": 0.344475738203272, + "learning_rate": 0.00015190108520366085, + "loss": 1.0677, + "step": 7525 + }, + { + "epoch": 0.72, + "grad_norm": 0.269619028961468, + "learning_rate": 0.00015188756256541842, + "loss": 1.1207, + "step": 7526 + }, + { + "epoch": 0.72, + "grad_norm": 0.2856659171795633, + "learning_rate": 0.0001518740386286571, + "loss": 0.9971, + "step": 7527 + }, + { + "epoch": 0.72, + "grad_norm": 0.2690448461790446, + "learning_rate": 0.0001518605133937154, + "loss": 1.0434, + "step": 7528 + }, + { + "epoch": 0.72, + "grad_norm": 0.3153664787777255, + "learning_rate": 0.00015184698686093173, + "loss": 1.1609, + "step": 7529 + }, + { + "epoch": 0.72, + "grad_norm": 0.31002810501031186, + "learning_rate": 0.00015183345903064467, + "loss": 1.0097, + "step": 7530 + }, + { + "epoch": 0.72, + "grad_norm": 0.2764530793093042, + "learning_rate": 0.00015181992990319265, + "loss": 1.0459, + "step": 7531 + }, + { + "epoch": 0.72, + "grad_norm": 0.2581950187662785, + "learning_rate": 0.00015180639947891437, + "loss": 1.0612, + "step": 7532 + }, + { + "epoch": 0.72, + "grad_norm": 0.3228690135774238, + "learning_rate": 0.0001517928677581484, + "loss": 1.1375, + "step": 7533 + }, + { + "epoch": 0.72, + "grad_norm": 0.2830701920026826, + "learning_rate": 0.0001517793347412333, + "loss": 0.9702, + "step": 7534 + }, + { + "epoch": 0.72, + "grad_norm": 0.30970817365242975, + "learning_rate": 0.00015176580042850787, + "loss": 1.0011, + "step": 7535 + }, + { + "epoch": 0.72, + "grad_norm": 0.2879799004430592, + "learning_rate": 0.00015175226482031073, + "loss": 1.0469, + "step": 7536 + }, + { + "epoch": 0.72, + "grad_norm": 0.2721982692551152, + "learning_rate": 0.0001517387279169806, + "loss": 1.0661, + "step": 7537 + }, + { + "epoch": 0.72, + "grad_norm": 0.30110301529460076, + "learning_rate": 0.00015172518971885634, + "loss": 1.0512, + "step": 7538 + }, + { + "epoch": 0.72, + "grad_norm": 0.26509470901065213, + "learning_rate": 0.00015171165022627667, + "loss": 1.0675, + "step": 7539 + }, + { + "epoch": 0.72, + "grad_norm": 0.29294036548567454, + "learning_rate": 0.00015169810943958044, + "loss": 1.12, + "step": 7540 + }, + { + "epoch": 0.72, + "grad_norm": 0.2756337326490096, + "learning_rate": 0.00015168456735910657, + "loss": 1.0387, + "step": 7541 + }, + { + "epoch": 0.72, + "grad_norm": 0.2974971175746808, + "learning_rate": 0.0001516710239851939, + "loss": 1.1836, + "step": 7542 + }, + { + "epoch": 0.72, + "grad_norm": 0.31089357056437184, + "learning_rate": 0.0001516574793181814, + "loss": 1.1023, + "step": 7543 + }, + { + "epoch": 0.72, + "grad_norm": 0.30077365644294696, + "learning_rate": 0.00015164393335840798, + "loss": 0.9271, + "step": 7544 + }, + { + "epoch": 0.72, + "grad_norm": 0.2565124797415133, + "learning_rate": 0.00015163038610621269, + "loss": 1.0294, + "step": 7545 + }, + { + "epoch": 0.72, + "grad_norm": 0.2826401399220724, + "learning_rate": 0.00015161683756193456, + "loss": 1.0563, + "step": 7546 + }, + { + "epoch": 0.72, + "grad_norm": 0.30362864947281726, + "learning_rate": 0.00015160328772591256, + "loss": 1.1748, + "step": 7547 + }, + { + "epoch": 0.72, + "grad_norm": 0.29370524398637515, + "learning_rate": 0.00015158973659848592, + "loss": 1.0426, + "step": 7548 + }, + { + "epoch": 0.72, + "grad_norm": 0.2684686193459749, + "learning_rate": 0.00015157618417999366, + "loss": 1.0464, + "step": 7549 + }, + { + "epoch": 0.72, + "grad_norm": 0.2761867607331919, + "learning_rate": 0.000151562630470775, + "loss": 1.0953, + "step": 7550 + }, + { + "epoch": 0.72, + "grad_norm": 0.25846363686227103, + "learning_rate": 0.0001515490754711691, + "loss": 1.061, + "step": 7551 + }, + { + "epoch": 0.72, + "grad_norm": 0.2642419825902769, + "learning_rate": 0.0001515355191815152, + "loss": 1.1347, + "step": 7552 + }, + { + "epoch": 0.72, + "grad_norm": 0.2559409653328977, + "learning_rate": 0.00015152196160215253, + "loss": 1.0638, + "step": 7553 + }, + { + "epoch": 0.72, + "grad_norm": 0.28318731458349305, + "learning_rate": 0.00015150840273342038, + "loss": 1.0592, + "step": 7554 + }, + { + "epoch": 0.72, + "grad_norm": 0.2850162725399112, + "learning_rate": 0.00015149484257565813, + "loss": 1.1141, + "step": 7555 + }, + { + "epoch": 0.72, + "grad_norm": 0.26118830684653604, + "learning_rate": 0.000151481281129205, + "loss": 1.0173, + "step": 7556 + }, + { + "epoch": 0.72, + "grad_norm": 0.27293247877399707, + "learning_rate": 0.0001514677183944005, + "loss": 1.0846, + "step": 7557 + }, + { + "epoch": 0.72, + "grad_norm": 0.304675314149257, + "learning_rate": 0.00015145415437158401, + "loss": 1.1428, + "step": 7558 + }, + { + "epoch": 0.72, + "grad_norm": 0.2972819357482946, + "learning_rate": 0.00015144058906109496, + "loss": 1.0655, + "step": 7559 + }, + { + "epoch": 0.72, + "grad_norm": 0.28299905657676133, + "learning_rate": 0.00015142702246327287, + "loss": 1.0705, + "step": 7560 + }, + { + "epoch": 0.72, + "grad_norm": 0.29542732633039054, + "learning_rate": 0.00015141345457845716, + "loss": 1.0193, + "step": 7561 + }, + { + "epoch": 0.72, + "grad_norm": 0.2663372553374203, + "learning_rate": 0.00015139988540698748, + "loss": 0.9831, + "step": 7562 + }, + { + "epoch": 0.72, + "grad_norm": 0.25393991962427676, + "learning_rate": 0.00015138631494920337, + "loss": 1.1387, + "step": 7563 + }, + { + "epoch": 0.72, + "grad_norm": 0.27744832277420794, + "learning_rate": 0.00015137274320544438, + "loss": 1.1088, + "step": 7564 + }, + { + "epoch": 0.72, + "grad_norm": 0.27311375703469265, + "learning_rate": 0.00015135917017605025, + "loss": 1.092, + "step": 7565 + }, + { + "epoch": 0.72, + "grad_norm": 0.3264586447066704, + "learning_rate": 0.0001513455958613606, + "loss": 1.1544, + "step": 7566 + }, + { + "epoch": 0.72, + "grad_norm": 0.29926443610547104, + "learning_rate": 0.00015133202026171514, + "loss": 1.0584, + "step": 7567 + }, + { + "epoch": 0.72, + "grad_norm": 0.2945733521922124, + "learning_rate": 0.00015131844337745362, + "loss": 0.9397, + "step": 7568 + }, + { + "epoch": 0.72, + "grad_norm": 0.2730547592785977, + "learning_rate": 0.00015130486520891582, + "loss": 0.9991, + "step": 7569 + }, + { + "epoch": 0.72, + "grad_norm": 0.30119301432734397, + "learning_rate": 0.00015129128575644147, + "loss": 1.0843, + "step": 7570 + }, + { + "epoch": 0.72, + "grad_norm": 0.30645684857055866, + "learning_rate": 0.00015127770502037052, + "loss": 1.0142, + "step": 7571 + }, + { + "epoch": 0.72, + "grad_norm": 0.2821703954502372, + "learning_rate": 0.00015126412300104272, + "loss": 0.982, + "step": 7572 + }, + { + "epoch": 0.72, + "grad_norm": 0.27920328503608616, + "learning_rate": 0.00015125053969879807, + "loss": 1.0294, + "step": 7573 + }, + { + "epoch": 0.72, + "grad_norm": 0.2385258107376846, + "learning_rate": 0.0001512369551139764, + "loss": 1.0406, + "step": 7574 + }, + { + "epoch": 0.72, + "grad_norm": 0.2972641315163676, + "learning_rate": 0.0001512233692469178, + "loss": 1.1835, + "step": 7575 + }, + { + "epoch": 0.72, + "grad_norm": 0.2602247995915003, + "learning_rate": 0.00015120978209796213, + "loss": 1.0921, + "step": 7576 + }, + { + "epoch": 0.72, + "grad_norm": 0.27839769616677684, + "learning_rate": 0.00015119619366744952, + "loss": 1.099, + "step": 7577 + }, + { + "epoch": 0.72, + "grad_norm": 0.316376234085856, + "learning_rate": 0.00015118260395571994, + "loss": 1.0387, + "step": 7578 + }, + { + "epoch": 0.73, + "grad_norm": 0.31660368584142895, + "learning_rate": 0.00015116901296311356, + "loss": 1.1306, + "step": 7579 + }, + { + "epoch": 0.73, + "grad_norm": 0.260900438740011, + "learning_rate": 0.00015115542068997047, + "loss": 0.9071, + "step": 7580 + }, + { + "epoch": 0.73, + "grad_norm": 0.2624881211285536, + "learning_rate": 0.00015114182713663082, + "loss": 1.1411, + "step": 7581 + }, + { + "epoch": 0.73, + "grad_norm": 0.257486571814327, + "learning_rate": 0.00015112823230343484, + "loss": 1.0307, + "step": 7582 + }, + { + "epoch": 0.73, + "grad_norm": 0.2823321215311315, + "learning_rate": 0.00015111463619072265, + "loss": 1.1471, + "step": 7583 + }, + { + "epoch": 0.73, + "grad_norm": 0.30117635588768243, + "learning_rate": 0.00015110103879883462, + "loss": 1.1452, + "step": 7584 + }, + { + "epoch": 0.73, + "grad_norm": 0.3022579959192287, + "learning_rate": 0.00015108744012811096, + "loss": 1.0584, + "step": 7585 + }, + { + "epoch": 0.73, + "grad_norm": 0.29740329090227624, + "learning_rate": 0.000151073840178892, + "loss": 1.0415, + "step": 7586 + }, + { + "epoch": 0.73, + "grad_norm": 0.29372205328515927, + "learning_rate": 0.0001510602389515181, + "loss": 1.1383, + "step": 7587 + }, + { + "epoch": 0.73, + "grad_norm": 0.2920566881167881, + "learning_rate": 0.00015104663644632962, + "loss": 1.0683, + "step": 7588 + }, + { + "epoch": 0.73, + "grad_norm": 0.22058431176486398, + "learning_rate": 0.000151033032663667, + "loss": 1.0229, + "step": 7589 + }, + { + "epoch": 0.73, + "grad_norm": 0.25213010351059206, + "learning_rate": 0.00015101942760387065, + "loss": 1.1396, + "step": 7590 + }, + { + "epoch": 0.73, + "grad_norm": 0.25918222437239363, + "learning_rate": 0.00015100582126728105, + "loss": 1.0704, + "step": 7591 + }, + { + "epoch": 0.73, + "grad_norm": 0.2961632170304152, + "learning_rate": 0.00015099221365423872, + "loss": 1.1117, + "step": 7592 + }, + { + "epoch": 0.73, + "grad_norm": 0.28187090698557027, + "learning_rate": 0.0001509786047650842, + "loss": 0.9727, + "step": 7593 + }, + { + "epoch": 0.73, + "grad_norm": 0.2892636190812079, + "learning_rate": 0.00015096499460015805, + "loss": 1.013, + "step": 7594 + }, + { + "epoch": 0.73, + "grad_norm": 0.28236820071622526, + "learning_rate": 0.0001509513831598009, + "loss": 0.9727, + "step": 7595 + }, + { + "epoch": 0.73, + "grad_norm": 0.2722476913875187, + "learning_rate": 0.00015093777044435333, + "loss": 1.0157, + "step": 7596 + }, + { + "epoch": 0.73, + "grad_norm": 0.2652646040859842, + "learning_rate": 0.00015092415645415606, + "loss": 1.0091, + "step": 7597 + }, + { + "epoch": 0.73, + "grad_norm": 0.28143692367142764, + "learning_rate": 0.00015091054118954978, + "loss": 1.0371, + "step": 7598 + }, + { + "epoch": 0.73, + "grad_norm": 0.2767178043247685, + "learning_rate": 0.0001508969246508752, + "loss": 1.0008, + "step": 7599 + }, + { + "epoch": 0.73, + "grad_norm": 0.25719222763437416, + "learning_rate": 0.0001508833068384731, + "loss": 1.0056, + "step": 7600 + }, + { + "epoch": 0.73, + "grad_norm": 0.30194394764309046, + "learning_rate": 0.00015086968775268427, + "loss": 1.0588, + "step": 7601 + }, + { + "epoch": 0.73, + "grad_norm": 0.2775361029431778, + "learning_rate": 0.00015085606739384953, + "loss": 1.1844, + "step": 7602 + }, + { + "epoch": 0.73, + "grad_norm": 0.2529941931845389, + "learning_rate": 0.00015084244576230976, + "loss": 0.9901, + "step": 7603 + }, + { + "epoch": 0.73, + "grad_norm": 0.3165846646756356, + "learning_rate": 0.00015082882285840578, + "loss": 0.997, + "step": 7604 + }, + { + "epoch": 0.73, + "grad_norm": 0.2782657643449699, + "learning_rate": 0.0001508151986824786, + "loss": 1.0908, + "step": 7605 + }, + { + "epoch": 0.73, + "grad_norm": 0.2781877735281842, + "learning_rate": 0.00015080157323486915, + "loss": 1.1314, + "step": 7606 + }, + { + "epoch": 0.73, + "grad_norm": 0.281042713599222, + "learning_rate": 0.00015078794651591837, + "loss": 1.0458, + "step": 7607 + }, + { + "epoch": 0.73, + "grad_norm": 0.27849574824675116, + "learning_rate": 0.0001507743185259673, + "loss": 1.1229, + "step": 7608 + }, + { + "epoch": 0.73, + "grad_norm": 0.2759377287295315, + "learning_rate": 0.00015076068926535706, + "loss": 1.044, + "step": 7609 + }, + { + "epoch": 0.73, + "grad_norm": 0.2821493820751014, + "learning_rate": 0.00015074705873442863, + "loss": 1.0843, + "step": 7610 + }, + { + "epoch": 0.73, + "grad_norm": 0.2607548413248963, + "learning_rate": 0.00015073342693352316, + "loss": 1.0291, + "step": 7611 + }, + { + "epoch": 0.73, + "grad_norm": 0.3107914480290541, + "learning_rate": 0.0001507197938629818, + "loss": 1.1632, + "step": 7612 + }, + { + "epoch": 0.73, + "grad_norm": 0.2644427769663581, + "learning_rate": 0.0001507061595231457, + "loss": 1.112, + "step": 7613 + }, + { + "epoch": 0.73, + "grad_norm": 0.29569519438471115, + "learning_rate": 0.00015069252391435614, + "loss": 0.8825, + "step": 7614 + }, + { + "epoch": 0.73, + "grad_norm": 0.2725562040988613, + "learning_rate": 0.00015067888703695426, + "loss": 0.975, + "step": 7615 + }, + { + "epoch": 0.73, + "grad_norm": 0.27141691407540564, + "learning_rate": 0.00015066524889128139, + "loss": 1.1129, + "step": 7616 + }, + { + "epoch": 0.73, + "grad_norm": 0.286170558294994, + "learning_rate": 0.00015065160947767887, + "loss": 1.1549, + "step": 7617 + }, + { + "epoch": 0.73, + "grad_norm": 0.28827191966294335, + "learning_rate": 0.00015063796879648793, + "loss": 1.0636, + "step": 7618 + }, + { + "epoch": 0.73, + "grad_norm": 0.34763534936988866, + "learning_rate": 0.00015062432684805, + "loss": 1.0821, + "step": 7619 + }, + { + "epoch": 0.73, + "grad_norm": 0.2928541057535552, + "learning_rate": 0.00015061068363270654, + "loss": 0.9746, + "step": 7620 + }, + { + "epoch": 0.73, + "grad_norm": 0.22676940791019204, + "learning_rate": 0.00015059703915079888, + "loss": 0.9351, + "step": 7621 + }, + { + "epoch": 0.73, + "grad_norm": 0.2932641803272752, + "learning_rate": 0.0001505833934026685, + "loss": 1.169, + "step": 7622 + }, + { + "epoch": 0.73, + "grad_norm": 0.2545708248714656, + "learning_rate": 0.0001505697463886569, + "loss": 1.0129, + "step": 7623 + }, + { + "epoch": 0.73, + "grad_norm": 0.31554332098638554, + "learning_rate": 0.00015055609810910565, + "loss": 1.0664, + "step": 7624 + }, + { + "epoch": 0.73, + "grad_norm": 0.3159256467723614, + "learning_rate": 0.00015054244856435624, + "loss": 1.1755, + "step": 7625 + }, + { + "epoch": 0.73, + "grad_norm": 0.25618170107349153, + "learning_rate": 0.0001505287977547503, + "loss": 1.0721, + "step": 7626 + }, + { + "epoch": 0.73, + "grad_norm": 0.3162579358667636, + "learning_rate": 0.00015051514568062947, + "loss": 0.958, + "step": 7627 + }, + { + "epoch": 0.73, + "grad_norm": 0.25894438378797824, + "learning_rate": 0.00015050149234233532, + "loss": 1.0019, + "step": 7628 + }, + { + "epoch": 0.73, + "grad_norm": 0.2852760760049263, + "learning_rate": 0.00015048783774020962, + "loss": 1.1271, + "step": 7629 + }, + { + "epoch": 0.73, + "grad_norm": 0.3912607456759231, + "learning_rate": 0.00015047418187459405, + "loss": 1.1038, + "step": 7630 + }, + { + "epoch": 0.73, + "grad_norm": 0.322140496874572, + "learning_rate": 0.00015046052474583033, + "loss": 1.0758, + "step": 7631 + }, + { + "epoch": 0.73, + "grad_norm": 0.3229270016075276, + "learning_rate": 0.0001504468663542603, + "loss": 1.0946, + "step": 7632 + }, + { + "epoch": 0.73, + "grad_norm": 0.3081581987259254, + "learning_rate": 0.0001504332067002257, + "loss": 1.0036, + "step": 7633 + }, + { + "epoch": 0.73, + "grad_norm": 0.29552024618919837, + "learning_rate": 0.00015041954578406844, + "loss": 1.0949, + "step": 7634 + }, + { + "epoch": 0.73, + "grad_norm": 0.2963238428604876, + "learning_rate": 0.00015040588360613034, + "loss": 1.0493, + "step": 7635 + }, + { + "epoch": 0.73, + "grad_norm": 0.29301025689881727, + "learning_rate": 0.00015039222016675332, + "loss": 1.0601, + "step": 7636 + }, + { + "epoch": 0.73, + "grad_norm": 0.31243960952554717, + "learning_rate": 0.0001503785554662793, + "loss": 1.0141, + "step": 7637 + }, + { + "epoch": 0.73, + "grad_norm": 0.2810933699464373, + "learning_rate": 0.00015036488950505032, + "loss": 0.9789, + "step": 7638 + }, + { + "epoch": 0.73, + "grad_norm": 0.2800782903010916, + "learning_rate": 0.0001503512222834083, + "loss": 1.0996, + "step": 7639 + }, + { + "epoch": 0.73, + "grad_norm": 0.2737931904759353, + "learning_rate": 0.0001503375538016953, + "loss": 1.0249, + "step": 7640 + }, + { + "epoch": 0.73, + "grad_norm": 0.28069227541654734, + "learning_rate": 0.0001503238840602534, + "loss": 0.9117, + "step": 7641 + }, + { + "epoch": 0.73, + "grad_norm": 0.26949283739174246, + "learning_rate": 0.00015031021305942464, + "loss": 1.0084, + "step": 7642 + }, + { + "epoch": 0.73, + "grad_norm": 0.31475584607752194, + "learning_rate": 0.00015029654079955118, + "loss": 1.0414, + "step": 7643 + }, + { + "epoch": 0.73, + "grad_norm": 0.28698683483189275, + "learning_rate": 0.0001502828672809752, + "loss": 1.0434, + "step": 7644 + }, + { + "epoch": 0.73, + "grad_norm": 0.25256982279011375, + "learning_rate": 0.00015026919250403883, + "loss": 1.0292, + "step": 7645 + }, + { + "epoch": 0.73, + "grad_norm": 0.29577865157695005, + "learning_rate": 0.00015025551646908437, + "loss": 1.1659, + "step": 7646 + }, + { + "epoch": 0.73, + "grad_norm": 0.2728622137260916, + "learning_rate": 0.000150241839176454, + "loss": 1.0527, + "step": 7647 + }, + { + "epoch": 0.73, + "grad_norm": 0.30338258907392546, + "learning_rate": 0.00015022816062649005, + "loss": 1.1148, + "step": 7648 + }, + { + "epoch": 0.73, + "grad_norm": 0.26931970248244586, + "learning_rate": 0.0001502144808195348, + "loss": 1.1357, + "step": 7649 + }, + { + "epoch": 0.73, + "grad_norm": 0.2742596946281973, + "learning_rate": 0.0001502007997559306, + "loss": 1.0646, + "step": 7650 + }, + { + "epoch": 0.73, + "grad_norm": 0.3011532902374543, + "learning_rate": 0.00015018711743601984, + "loss": 1.1351, + "step": 7651 + }, + { + "epoch": 0.73, + "grad_norm": 0.2821557355247745, + "learning_rate": 0.00015017343386014494, + "loss": 1.0087, + "step": 7652 + }, + { + "epoch": 0.73, + "grad_norm": 0.31425399583262464, + "learning_rate": 0.0001501597490286483, + "loss": 1.0384, + "step": 7653 + }, + { + "epoch": 0.73, + "grad_norm": 0.2931669718113375, + "learning_rate": 0.0001501460629418724, + "loss": 1.0466, + "step": 7654 + }, + { + "epoch": 0.73, + "grad_norm": 0.2875430271869409, + "learning_rate": 0.0001501323756001598, + "loss": 1.0878, + "step": 7655 + }, + { + "epoch": 0.73, + "grad_norm": 0.2597750556495404, + "learning_rate": 0.00015011868700385298, + "loss": 1.0548, + "step": 7656 + }, + { + "epoch": 0.73, + "grad_norm": 0.27705706713516576, + "learning_rate": 0.00015010499715329453, + "loss": 0.984, + "step": 7657 + }, + { + "epoch": 0.73, + "grad_norm": 0.26538202477264977, + "learning_rate": 0.00015009130604882702, + "loss": 1.0417, + "step": 7658 + }, + { + "epoch": 0.73, + "grad_norm": 0.28174584601024333, + "learning_rate": 0.0001500776136907931, + "loss": 1.0257, + "step": 7659 + }, + { + "epoch": 0.73, + "grad_norm": 0.25036859344556583, + "learning_rate": 0.00015006392007953543, + "loss": 1.1059, + "step": 7660 + }, + { + "epoch": 0.73, + "grad_norm": 0.27900752760462394, + "learning_rate": 0.00015005022521539672, + "loss": 1.1089, + "step": 7661 + }, + { + "epoch": 0.73, + "grad_norm": 0.31107819751826155, + "learning_rate": 0.0001500365290987196, + "loss": 1.0433, + "step": 7662 + }, + { + "epoch": 0.73, + "grad_norm": 0.2843202029978909, + "learning_rate": 0.00015002283172984695, + "loss": 1.0845, + "step": 7663 + }, + { + "epoch": 0.73, + "grad_norm": 0.2589541502663174, + "learning_rate": 0.00015000913310912148, + "loss": 0.9485, + "step": 7664 + }, + { + "epoch": 0.73, + "grad_norm": 0.300671018081089, + "learning_rate": 0.00014999543323688603, + "loss": 1.0407, + "step": 7665 + }, + { + "epoch": 0.73, + "grad_norm": 0.27922119836694304, + "learning_rate": 0.00014998173211348343, + "loss": 0.9969, + "step": 7666 + }, + { + "epoch": 0.73, + "grad_norm": 0.29425343441337976, + "learning_rate": 0.0001499680297392566, + "loss": 0.9274, + "step": 7667 + }, + { + "epoch": 0.73, + "grad_norm": 0.2941232322325698, + "learning_rate": 0.0001499543261145484, + "loss": 1.0088, + "step": 7668 + }, + { + "epoch": 0.73, + "grad_norm": 0.2914706729504272, + "learning_rate": 0.0001499406212397018, + "loss": 1.0058, + "step": 7669 + }, + { + "epoch": 0.73, + "grad_norm": 0.2953576562328182, + "learning_rate": 0.00014992691511505975, + "loss": 1.0051, + "step": 7670 + }, + { + "epoch": 0.73, + "grad_norm": 0.34306316772779066, + "learning_rate": 0.0001499132077409653, + "loss": 1.153, + "step": 7671 + }, + { + "epoch": 0.73, + "grad_norm": 0.2679983439498134, + "learning_rate": 0.0001498994991177614, + "loss": 1.0104, + "step": 7672 + }, + { + "epoch": 0.73, + "grad_norm": 0.2594539874622118, + "learning_rate": 0.00014988578924579122, + "loss": 1.1645, + "step": 7673 + }, + { + "epoch": 0.73, + "grad_norm": 0.3048213376270825, + "learning_rate": 0.0001498720781253978, + "loss": 1.098, + "step": 7674 + }, + { + "epoch": 0.73, + "grad_norm": 0.3077629403566935, + "learning_rate": 0.0001498583657569243, + "loss": 1.1484, + "step": 7675 + }, + { + "epoch": 0.73, + "grad_norm": 0.31902620346830557, + "learning_rate": 0.00014984465214071385, + "loss": 1.009, + "step": 7676 + }, + { + "epoch": 0.73, + "grad_norm": 0.29403015567312174, + "learning_rate": 0.00014983093727710965, + "loss": 0.9773, + "step": 7677 + }, + { + "epoch": 0.73, + "grad_norm": 0.2548359871414429, + "learning_rate": 0.00014981722116645495, + "loss": 1.0751, + "step": 7678 + }, + { + "epoch": 0.73, + "grad_norm": 0.2787506686083023, + "learning_rate": 0.00014980350380909294, + "loss": 1.1147, + "step": 7679 + }, + { + "epoch": 0.73, + "grad_norm": 0.2992261122500776, + "learning_rate": 0.00014978978520536698, + "loss": 1.0858, + "step": 7680 + }, + { + "epoch": 0.73, + "grad_norm": 0.3011621462376174, + "learning_rate": 0.00014977606535562034, + "loss": 1.046, + "step": 7681 + }, + { + "epoch": 0.73, + "grad_norm": 0.2986573455838004, + "learning_rate": 0.0001497623442601964, + "loss": 1.0208, + "step": 7682 + }, + { + "epoch": 0.74, + "grad_norm": 0.28656498019974624, + "learning_rate": 0.00014974862191943848, + "loss": 1.1759, + "step": 7683 + }, + { + "epoch": 0.74, + "grad_norm": 0.2762485785593129, + "learning_rate": 0.00014973489833369004, + "loss": 1.1052, + "step": 7684 + }, + { + "epoch": 0.74, + "grad_norm": 0.3083477893434321, + "learning_rate": 0.00014972117350329456, + "loss": 0.9617, + "step": 7685 + }, + { + "epoch": 0.74, + "grad_norm": 0.3003280251753417, + "learning_rate": 0.0001497074474285954, + "loss": 1.029, + "step": 7686 + }, + { + "epoch": 0.74, + "grad_norm": 0.2572025859628947, + "learning_rate": 0.00014969372010993618, + "loss": 1.2292, + "step": 7687 + }, + { + "epoch": 0.74, + "grad_norm": 0.24791565205884394, + "learning_rate": 0.00014967999154766036, + "loss": 1.1202, + "step": 7688 + }, + { + "epoch": 0.74, + "grad_norm": 0.2834724391112066, + "learning_rate": 0.00014966626174211153, + "loss": 1.1773, + "step": 7689 + }, + { + "epoch": 0.74, + "grad_norm": 0.2793645427994029, + "learning_rate": 0.0001496525306936333, + "loss": 1.0002, + "step": 7690 + }, + { + "epoch": 0.74, + "grad_norm": 0.2742995002805119, + "learning_rate": 0.00014963879840256927, + "loss": 1.1219, + "step": 7691 + }, + { + "epoch": 0.74, + "grad_norm": 0.31255435123133335, + "learning_rate": 0.0001496250648692631, + "loss": 1.0062, + "step": 7692 + }, + { + "epoch": 0.74, + "grad_norm": 0.3095875184013366, + "learning_rate": 0.00014961133009405852, + "loss": 1.0721, + "step": 7693 + }, + { + "epoch": 0.74, + "grad_norm": 0.2804828390216687, + "learning_rate": 0.00014959759407729922, + "loss": 0.9511, + "step": 7694 + }, + { + "epoch": 0.74, + "grad_norm": 0.28459275473311924, + "learning_rate": 0.00014958385681932893, + "loss": 0.8945, + "step": 7695 + }, + { + "epoch": 0.74, + "grad_norm": 0.30693979635734253, + "learning_rate": 0.00014957011832049147, + "loss": 1.0574, + "step": 7696 + }, + { + "epoch": 0.74, + "grad_norm": 0.2661866180961575, + "learning_rate": 0.00014955637858113065, + "loss": 1.1286, + "step": 7697 + }, + { + "epoch": 0.74, + "grad_norm": 0.2748632910585887, + "learning_rate": 0.00014954263760159033, + "loss": 1.0264, + "step": 7698 + }, + { + "epoch": 0.74, + "grad_norm": 0.2639660707256238, + "learning_rate": 0.00014952889538221434, + "loss": 0.9609, + "step": 7699 + }, + { + "epoch": 0.74, + "grad_norm": 0.28950568503410234, + "learning_rate": 0.00014951515192334665, + "loss": 1.0243, + "step": 7700 + }, + { + "epoch": 0.74, + "grad_norm": 0.2741181044596097, + "learning_rate": 0.00014950140722533114, + "loss": 1.1703, + "step": 7701 + }, + { + "epoch": 0.74, + "grad_norm": 0.2939600698983923, + "learning_rate": 0.0001494876612885118, + "loss": 1.0576, + "step": 7702 + }, + { + "epoch": 0.74, + "grad_norm": 0.27769959394242444, + "learning_rate": 0.00014947391411323263, + "loss": 1.032, + "step": 7703 + }, + { + "epoch": 0.74, + "grad_norm": 0.27126944244428325, + "learning_rate": 0.0001494601656998377, + "loss": 0.9935, + "step": 7704 + }, + { + "epoch": 0.74, + "grad_norm": 0.2842820549012656, + "learning_rate": 0.000149446416048671, + "loss": 1.1503, + "step": 7705 + }, + { + "epoch": 0.74, + "grad_norm": 0.2669012700932571, + "learning_rate": 0.0001494326651600767, + "loss": 1.0303, + "step": 7706 + }, + { + "epoch": 0.74, + "grad_norm": 0.30368700424850303, + "learning_rate": 0.00014941891303439886, + "loss": 1.0606, + "step": 7707 + }, + { + "epoch": 0.74, + "grad_norm": 0.30976284634068807, + "learning_rate": 0.0001494051596719817, + "loss": 1.0383, + "step": 7708 + }, + { + "epoch": 0.74, + "grad_norm": 0.3052315022073979, + "learning_rate": 0.00014939140507316934, + "loss": 1.0763, + "step": 7709 + }, + { + "epoch": 0.74, + "grad_norm": 0.28582065805745044, + "learning_rate": 0.00014937764923830598, + "loss": 1.2036, + "step": 7710 + }, + { + "epoch": 0.74, + "grad_norm": 0.2783946932750405, + "learning_rate": 0.000149363892167736, + "loss": 1.0478, + "step": 7711 + }, + { + "epoch": 0.74, + "grad_norm": 0.2863324153725781, + "learning_rate": 0.00014935013386180353, + "loss": 1.0988, + "step": 7712 + }, + { + "epoch": 0.74, + "grad_norm": 0.2785613479438333, + "learning_rate": 0.00014933637432085295, + "loss": 1.0189, + "step": 7713 + }, + { + "epoch": 0.74, + "grad_norm": 0.28499263197797736, + "learning_rate": 0.00014932261354522864, + "loss": 1.1063, + "step": 7714 + }, + { + "epoch": 0.74, + "grad_norm": 0.30255835930843594, + "learning_rate": 0.00014930885153527492, + "loss": 1.0056, + "step": 7715 + }, + { + "epoch": 0.74, + "grad_norm": 0.23887877046224554, + "learning_rate": 0.00014929508829133616, + "loss": 1.0982, + "step": 7716 + }, + { + "epoch": 0.74, + "grad_norm": 0.25941492372004377, + "learning_rate": 0.0001492813238137569, + "loss": 1.0775, + "step": 7717 + }, + { + "epoch": 0.74, + "grad_norm": 0.2840455310981083, + "learning_rate": 0.00014926755810288147, + "loss": 1.0016, + "step": 7718 + }, + { + "epoch": 0.74, + "grad_norm": 0.3159294197735588, + "learning_rate": 0.00014925379115905444, + "loss": 1.1289, + "step": 7719 + }, + { + "epoch": 0.74, + "grad_norm": 0.3001807062258611, + "learning_rate": 0.00014924002298262034, + "loss": 0.9867, + "step": 7720 + }, + { + "epoch": 0.74, + "grad_norm": 0.2783216610430766, + "learning_rate": 0.00014922625357392376, + "loss": 1.0793, + "step": 7721 + }, + { + "epoch": 0.74, + "grad_norm": 0.28020041552297037, + "learning_rate": 0.00014921248293330922, + "loss": 1.0532, + "step": 7722 + }, + { + "epoch": 0.74, + "grad_norm": 0.3424716991146364, + "learning_rate": 0.00014919871106112135, + "loss": 1.0768, + "step": 7723 + }, + { + "epoch": 0.74, + "grad_norm": 0.2779795066958149, + "learning_rate": 0.00014918493795770482, + "loss": 0.9554, + "step": 7724 + }, + { + "epoch": 0.74, + "grad_norm": 0.3181269730793016, + "learning_rate": 0.00014917116362340435, + "loss": 1.0211, + "step": 7725 + }, + { + "epoch": 0.74, + "grad_norm": 0.28616822041259976, + "learning_rate": 0.00014915738805856458, + "loss": 1.1136, + "step": 7726 + }, + { + "epoch": 0.74, + "grad_norm": 0.25638910226945644, + "learning_rate": 0.00014914361126353026, + "loss": 0.9941, + "step": 7727 + }, + { + "epoch": 0.74, + "grad_norm": 0.25181470798369615, + "learning_rate": 0.0001491298332386462, + "loss": 1.0635, + "step": 7728 + }, + { + "epoch": 0.74, + "grad_norm": 0.279163837257328, + "learning_rate": 0.0001491160539842572, + "loss": 1.0245, + "step": 7729 + }, + { + "epoch": 0.74, + "grad_norm": 0.28555043885157716, + "learning_rate": 0.00014910227350070805, + "loss": 1.1297, + "step": 7730 + }, + { + "epoch": 0.74, + "grad_norm": 0.28507597908988613, + "learning_rate": 0.00014908849178834366, + "loss": 1.1125, + "step": 7731 + }, + { + "epoch": 0.74, + "grad_norm": 0.28046175174288296, + "learning_rate": 0.00014907470884750892, + "loss": 1.1712, + "step": 7732 + }, + { + "epoch": 0.74, + "grad_norm": 0.2841570962434119, + "learning_rate": 0.00014906092467854875, + "loss": 1.061, + "step": 7733 + }, + { + "epoch": 0.74, + "grad_norm": 0.25589378852673406, + "learning_rate": 0.00014904713928180806, + "loss": 1.0623, + "step": 7734 + }, + { + "epoch": 0.74, + "grad_norm": 0.29195274168710955, + "learning_rate": 0.00014903335265763193, + "loss": 1.008, + "step": 7735 + }, + { + "epoch": 0.74, + "grad_norm": 0.2742605176860971, + "learning_rate": 0.00014901956480636535, + "loss": 1.153, + "step": 7736 + }, + { + "epoch": 0.74, + "grad_norm": 0.28128186302274166, + "learning_rate": 0.0001490057757283533, + "loss": 1.0106, + "step": 7737 + }, + { + "epoch": 0.74, + "grad_norm": 0.2829056481616711, + "learning_rate": 0.00014899198542394094, + "loss": 1.1351, + "step": 7738 + }, + { + "epoch": 0.74, + "grad_norm": 0.2715236434676326, + "learning_rate": 0.00014897819389347335, + "loss": 1.0404, + "step": 7739 + }, + { + "epoch": 0.74, + "grad_norm": 0.2911015197712587, + "learning_rate": 0.00014896440113729568, + "loss": 1.2098, + "step": 7740 + }, + { + "epoch": 0.74, + "grad_norm": 0.2752066845807117, + "learning_rate": 0.0001489506071557531, + "loss": 1.0378, + "step": 7741 + }, + { + "epoch": 0.74, + "grad_norm": 0.33207199501826795, + "learning_rate": 0.00014893681194919084, + "loss": 0.967, + "step": 7742 + }, + { + "epoch": 0.74, + "grad_norm": 0.2838659301448611, + "learning_rate": 0.00014892301551795408, + "loss": 1.0683, + "step": 7743 + }, + { + "epoch": 0.74, + "grad_norm": 0.29348859811094435, + "learning_rate": 0.0001489092178623881, + "loss": 1.1144, + "step": 7744 + }, + { + "epoch": 0.74, + "grad_norm": 0.2682746013538995, + "learning_rate": 0.00014889541898283821, + "loss": 1.1194, + "step": 7745 + }, + { + "epoch": 0.74, + "grad_norm": 0.30171726380766734, + "learning_rate": 0.00014888161887964974, + "loss": 1.023, + "step": 7746 + }, + { + "epoch": 0.74, + "grad_norm": 0.25296517645030825, + "learning_rate": 0.000148867817553168, + "loss": 1.1727, + "step": 7747 + }, + { + "epoch": 0.74, + "grad_norm": 0.31768749485918707, + "learning_rate": 0.00014885401500373845, + "loss": 1.0391, + "step": 7748 + }, + { + "epoch": 0.74, + "grad_norm": 0.2438411570540729, + "learning_rate": 0.0001488402112317065, + "loss": 1.0145, + "step": 7749 + }, + { + "epoch": 0.74, + "grad_norm": 0.3034976664176374, + "learning_rate": 0.0001488264062374175, + "loss": 1.1215, + "step": 7750 + }, + { + "epoch": 0.74, + "grad_norm": 0.265851976673612, + "learning_rate": 0.00014881260002121705, + "loss": 0.9724, + "step": 7751 + }, + { + "epoch": 0.74, + "grad_norm": 0.29764121681730704, + "learning_rate": 0.00014879879258345057, + "loss": 1.0749, + "step": 7752 + }, + { + "epoch": 0.74, + "grad_norm": 0.2797392427717897, + "learning_rate": 0.00014878498392446366, + "loss": 0.8735, + "step": 7753 + }, + { + "epoch": 0.74, + "grad_norm": 0.24366220050786846, + "learning_rate": 0.00014877117404460185, + "loss": 1.0055, + "step": 7754 + }, + { + "epoch": 0.74, + "grad_norm": 0.28588196020457907, + "learning_rate": 0.00014875736294421078, + "loss": 1.16, + "step": 7755 + }, + { + "epoch": 0.74, + "grad_norm": 0.30525823027606785, + "learning_rate": 0.00014874355062363605, + "loss": 1.1435, + "step": 7756 + }, + { + "epoch": 0.74, + "grad_norm": 0.248479759938403, + "learning_rate": 0.00014872973708322332, + "loss": 1.0148, + "step": 7757 + }, + { + "epoch": 0.74, + "grad_norm": 0.2705904252639747, + "learning_rate": 0.00014871592232331833, + "loss": 1.1533, + "step": 7758 + }, + { + "epoch": 0.74, + "grad_norm": 0.2624337205195208, + "learning_rate": 0.0001487021063442667, + "loss": 0.9806, + "step": 7759 + }, + { + "epoch": 0.74, + "grad_norm": 0.2526960220148543, + "learning_rate": 0.00014868828914641431, + "loss": 0.9377, + "step": 7760 + }, + { + "epoch": 0.74, + "grad_norm": 0.2974913829902583, + "learning_rate": 0.00014867447073010686, + "loss": 1.0434, + "step": 7761 + }, + { + "epoch": 0.74, + "grad_norm": 0.22591683924408296, + "learning_rate": 0.0001486606510956902, + "loss": 1.145, + "step": 7762 + }, + { + "epoch": 0.74, + "grad_norm": 0.26106572494311425, + "learning_rate": 0.00014864683024351017, + "loss": 1.0687, + "step": 7763 + }, + { + "epoch": 0.74, + "grad_norm": 0.2540501633396988, + "learning_rate": 0.00014863300817391262, + "loss": 1.0217, + "step": 7764 + }, + { + "epoch": 0.74, + "grad_norm": 0.2801665117949245, + "learning_rate": 0.0001486191848872435, + "loss": 1.1218, + "step": 7765 + }, + { + "epoch": 0.74, + "grad_norm": 0.3187353776069642, + "learning_rate": 0.0001486053603838487, + "loss": 1.0898, + "step": 7766 + }, + { + "epoch": 0.74, + "grad_norm": 0.25763576552152534, + "learning_rate": 0.0001485915346640742, + "loss": 1.0585, + "step": 7767 + }, + { + "epoch": 0.74, + "grad_norm": 0.2876319448148584, + "learning_rate": 0.00014857770772826602, + "loss": 1.0169, + "step": 7768 + }, + { + "epoch": 0.74, + "grad_norm": 0.25997267525577705, + "learning_rate": 0.0001485638795767702, + "loss": 1.0577, + "step": 7769 + }, + { + "epoch": 0.74, + "grad_norm": 0.2870532110656952, + "learning_rate": 0.00014855005020993276, + "loss": 1.1589, + "step": 7770 + }, + { + "epoch": 0.74, + "grad_norm": 0.28744735564229734, + "learning_rate": 0.00014853621962809975, + "loss": 1.0786, + "step": 7771 + }, + { + "epoch": 0.74, + "grad_norm": 0.26983452322850415, + "learning_rate": 0.0001485223878316174, + "loss": 1.2123, + "step": 7772 + }, + { + "epoch": 0.74, + "grad_norm": 0.33765627577778073, + "learning_rate": 0.00014850855482083177, + "loss": 1.1044, + "step": 7773 + }, + { + "epoch": 0.74, + "grad_norm": 0.272206284878232, + "learning_rate": 0.00014849472059608906, + "loss": 1.0212, + "step": 7774 + }, + { + "epoch": 0.74, + "grad_norm": 0.29277937755205086, + "learning_rate": 0.00014848088515773553, + "loss": 1.1695, + "step": 7775 + }, + { + "epoch": 0.74, + "grad_norm": 0.25356023491550367, + "learning_rate": 0.00014846704850611736, + "loss": 1.1737, + "step": 7776 + }, + { + "epoch": 0.74, + "grad_norm": 0.3022516410233581, + "learning_rate": 0.00014845321064158083, + "loss": 1.0756, + "step": 7777 + }, + { + "epoch": 0.74, + "grad_norm": 0.2530511912100064, + "learning_rate": 0.00014843937156447227, + "loss": 1.0036, + "step": 7778 + }, + { + "epoch": 0.74, + "grad_norm": 0.3022412340923616, + "learning_rate": 0.000148425531275138, + "loss": 0.9716, + "step": 7779 + }, + { + "epoch": 0.74, + "grad_norm": 0.2678799188134652, + "learning_rate": 0.00014841168977392432, + "loss": 1.1076, + "step": 7780 + }, + { + "epoch": 0.74, + "grad_norm": 0.2963039241793785, + "learning_rate": 0.00014839784706117775, + "loss": 1.0797, + "step": 7781 + }, + { + "epoch": 0.74, + "grad_norm": 0.27673978607030375, + "learning_rate": 0.00014838400313724458, + "loss": 1.1354, + "step": 7782 + }, + { + "epoch": 0.74, + "grad_norm": 0.2599969777043114, + "learning_rate": 0.00014837015800247137, + "loss": 1.0195, + "step": 7783 + }, + { + "epoch": 0.74, + "grad_norm": 0.28872142479078317, + "learning_rate": 0.0001483563116572045, + "loss": 0.9883, + "step": 7784 + }, + { + "epoch": 0.74, + "grad_norm": 0.2728653682651342, + "learning_rate": 0.0001483424641017906, + "loss": 1.1095, + "step": 7785 + }, + { + "epoch": 0.74, + "grad_norm": 0.3069290092163076, + "learning_rate": 0.00014832861533657613, + "loss": 1.093, + "step": 7786 + }, + { + "epoch": 0.74, + "grad_norm": 0.25616829490396015, + "learning_rate": 0.0001483147653619077, + "loss": 1.1398, + "step": 7787 + }, + { + "epoch": 0.75, + "grad_norm": 0.33842983259832143, + "learning_rate": 0.00014830091417813188, + "loss": 1.0526, + "step": 7788 + }, + { + "epoch": 0.75, + "grad_norm": 0.29499869905887005, + "learning_rate": 0.00014828706178559534, + "loss": 1.1995, + "step": 7789 + }, + { + "epoch": 0.75, + "grad_norm": 0.2867836759486996, + "learning_rate": 0.00014827320818464474, + "loss": 1.1038, + "step": 7790 + }, + { + "epoch": 0.75, + "grad_norm": 0.27597630873723183, + "learning_rate": 0.00014825935337562673, + "loss": 1.1934, + "step": 7791 + }, + { + "epoch": 0.75, + "grad_norm": 0.2566617689778122, + "learning_rate": 0.0001482454973588881, + "loss": 1.0383, + "step": 7792 + }, + { + "epoch": 0.75, + "grad_norm": 0.28408572266180726, + "learning_rate": 0.0001482316401347756, + "loss": 0.9326, + "step": 7793 + }, + { + "epoch": 0.75, + "grad_norm": 0.2745042716972083, + "learning_rate": 0.00014821778170363595, + "loss": 1.0336, + "step": 7794 + }, + { + "epoch": 0.75, + "grad_norm": 0.3032701457073397, + "learning_rate": 0.00014820392206581602, + "loss": 1.0234, + "step": 7795 + }, + { + "epoch": 0.75, + "grad_norm": 0.2751280009313665, + "learning_rate": 0.00014819006122166267, + "loss": 1.0432, + "step": 7796 + }, + { + "epoch": 0.75, + "grad_norm": 0.3125193076347104, + "learning_rate": 0.00014817619917152275, + "loss": 1.1172, + "step": 7797 + }, + { + "epoch": 0.75, + "grad_norm": 0.26116307629794683, + "learning_rate": 0.00014816233591574313, + "loss": 1.0669, + "step": 7798 + }, + { + "epoch": 0.75, + "grad_norm": 0.28349617158355267, + "learning_rate": 0.0001481484714546708, + "loss": 1.1477, + "step": 7799 + }, + { + "epoch": 0.75, + "grad_norm": 0.34282406819344863, + "learning_rate": 0.00014813460578865274, + "loss": 1.009, + "step": 7800 + }, + { + "epoch": 0.75, + "grad_norm": 0.28578123221740864, + "learning_rate": 0.00014812073891803587, + "loss": 1.156, + "step": 7801 + }, + { + "epoch": 0.75, + "grad_norm": 0.2980354742362817, + "learning_rate": 0.0001481068708431673, + "loss": 1.0768, + "step": 7802 + }, + { + "epoch": 0.75, + "grad_norm": 0.3220759599695297, + "learning_rate": 0.00014809300156439406, + "loss": 1.0764, + "step": 7803 + }, + { + "epoch": 0.75, + "grad_norm": 0.2640835765034414, + "learning_rate": 0.00014807913108206322, + "loss": 1.0768, + "step": 7804 + }, + { + "epoch": 0.75, + "grad_norm": 0.27220178093073455, + "learning_rate": 0.00014806525939652188, + "loss": 1.1449, + "step": 7805 + }, + { + "epoch": 0.75, + "grad_norm": 0.27363838878553143, + "learning_rate": 0.00014805138650811724, + "loss": 1.0526, + "step": 7806 + }, + { + "epoch": 0.75, + "grad_norm": 0.2883264760303332, + "learning_rate": 0.0001480375124171965, + "loss": 1.0072, + "step": 7807 + }, + { + "epoch": 0.75, + "grad_norm": 0.28230583793684294, + "learning_rate": 0.00014802363712410673, + "loss": 1.043, + "step": 7808 + }, + { + "epoch": 0.75, + "grad_norm": 0.2615628239719908, + "learning_rate": 0.00014800976062919532, + "loss": 1.0759, + "step": 7809 + }, + { + "epoch": 0.75, + "grad_norm": 0.29229275325827664, + "learning_rate": 0.00014799588293280946, + "loss": 1.0116, + "step": 7810 + }, + { + "epoch": 0.75, + "grad_norm": 0.2738661601965768, + "learning_rate": 0.00014798200403529646, + "loss": 1.1214, + "step": 7811 + }, + { + "epoch": 0.75, + "grad_norm": 0.2743653194914421, + "learning_rate": 0.00014796812393700368, + "loss": 1.0277, + "step": 7812 + }, + { + "epoch": 0.75, + "grad_norm": 0.27052008019631224, + "learning_rate": 0.00014795424263827842, + "loss": 1.0564, + "step": 7813 + }, + { + "epoch": 0.75, + "grad_norm": 0.30231813442957706, + "learning_rate": 0.00014794036013946813, + "loss": 1.1308, + "step": 7814 + }, + { + "epoch": 0.75, + "grad_norm": 0.29045069706562976, + "learning_rate": 0.00014792647644092016, + "loss": 1.0502, + "step": 7815 + }, + { + "epoch": 0.75, + "grad_norm": 0.2663099880383512, + "learning_rate": 0.000147912591542982, + "loss": 1.1342, + "step": 7816 + }, + { + "epoch": 0.75, + "grad_norm": 0.2671935344264389, + "learning_rate": 0.00014789870544600116, + "loss": 1.0337, + "step": 7817 + }, + { + "epoch": 0.75, + "grad_norm": 0.2376028787105682, + "learning_rate": 0.00014788481815032509, + "loss": 1.0951, + "step": 7818 + }, + { + "epoch": 0.75, + "grad_norm": 0.2873919541494794, + "learning_rate": 0.00014787092965630135, + "loss": 1.0885, + "step": 7819 + }, + { + "epoch": 0.75, + "grad_norm": 0.28326748757574954, + "learning_rate": 0.00014785703996427754, + "loss": 0.9765, + "step": 7820 + }, + { + "epoch": 0.75, + "grad_norm": 0.26333898401252714, + "learning_rate": 0.0001478431490746012, + "loss": 0.9915, + "step": 7821 + }, + { + "epoch": 0.75, + "grad_norm": 0.2638238903309792, + "learning_rate": 0.00014782925698761997, + "loss": 1.0908, + "step": 7822 + }, + { + "epoch": 0.75, + "grad_norm": 0.30274071491597526, + "learning_rate": 0.00014781536370368157, + "loss": 1.0742, + "step": 7823 + }, + { + "epoch": 0.75, + "grad_norm": 0.2610374416233597, + "learning_rate": 0.0001478014692231336, + "loss": 1.1589, + "step": 7824 + }, + { + "epoch": 0.75, + "grad_norm": 0.312177588979833, + "learning_rate": 0.00014778757354632382, + "loss": 1.023, + "step": 7825 + }, + { + "epoch": 0.75, + "grad_norm": 0.2817902834771327, + "learning_rate": 0.00014777367667360002, + "loss": 1.0925, + "step": 7826 + }, + { + "epoch": 0.75, + "grad_norm": 0.24540218148082688, + "learning_rate": 0.00014775977860530988, + "loss": 0.9525, + "step": 7827 + }, + { + "epoch": 0.75, + "grad_norm": 0.292375111038645, + "learning_rate": 0.0001477458793418013, + "loss": 1.0295, + "step": 7828 + }, + { + "epoch": 0.75, + "grad_norm": 0.25821078004181164, + "learning_rate": 0.0001477319788834221, + "loss": 1.0538, + "step": 7829 + }, + { + "epoch": 0.75, + "grad_norm": 0.26371249547752684, + "learning_rate": 0.00014771807723052013, + "loss": 1.0396, + "step": 7830 + }, + { + "epoch": 0.75, + "grad_norm": 0.24548178512781826, + "learning_rate": 0.00014770417438344325, + "loss": 1.1311, + "step": 7831 + }, + { + "epoch": 0.75, + "grad_norm": 0.28303419109838207, + "learning_rate": 0.00014769027034253944, + "loss": 1.0892, + "step": 7832 + }, + { + "epoch": 0.75, + "grad_norm": 0.27687609304875627, + "learning_rate": 0.00014767636510815667, + "loss": 1.0869, + "step": 7833 + }, + { + "epoch": 0.75, + "grad_norm": 0.28746479888055076, + "learning_rate": 0.00014766245868064285, + "loss": 1.1869, + "step": 7834 + }, + { + "epoch": 0.75, + "grad_norm": 0.2879143036391422, + "learning_rate": 0.00014764855106034607, + "loss": 1.0875, + "step": 7835 + }, + { + "epoch": 0.75, + "grad_norm": 0.2653922351709626, + "learning_rate": 0.00014763464224761436, + "loss": 1.1238, + "step": 7836 + }, + { + "epoch": 0.75, + "grad_norm": 0.2748579199210656, + "learning_rate": 0.00014762073224279578, + "loss": 1.0366, + "step": 7837 + }, + { + "epoch": 0.75, + "grad_norm": 0.29359776941173216, + "learning_rate": 0.00014760682104623845, + "loss": 0.9695, + "step": 7838 + }, + { + "epoch": 0.75, + "grad_norm": 0.2671398097358774, + "learning_rate": 0.00014759290865829053, + "loss": 1.0089, + "step": 7839 + }, + { + "epoch": 0.75, + "eval_loss": 1.1271681785583496, + "eval_runtime": 4227.9142, + "eval_samples_per_second": 19.778, + "eval_steps_per_second": 2.472, + "step": 7839 + }, + { + "epoch": 0.75, + "grad_norm": 0.28108668366507505, + "learning_rate": 0.00014757899507930012, + "loss": 1.1518, + "step": 7840 + }, + { + "epoch": 0.75, + "grad_norm": 0.2889156834394568, + "learning_rate": 0.00014756508030961543, + "loss": 1.057, + "step": 7841 + }, + { + "epoch": 0.75, + "grad_norm": 0.23620442034659647, + "learning_rate": 0.00014755116434958477, + "loss": 1.0139, + "step": 7842 + }, + { + "epoch": 0.75, + "grad_norm": 0.26963408436529646, + "learning_rate": 0.00014753724719955634, + "loss": 1.2173, + "step": 7843 + }, + { + "epoch": 0.75, + "grad_norm": 0.2930307296639112, + "learning_rate": 0.0001475233288598784, + "loss": 1.1191, + "step": 7844 + }, + { + "epoch": 0.75, + "grad_norm": 0.26447900586343337, + "learning_rate": 0.00014750940933089927, + "loss": 1.0972, + "step": 7845 + }, + { + "epoch": 0.75, + "grad_norm": 0.28229996008223884, + "learning_rate": 0.00014749548861296734, + "loss": 1.1229, + "step": 7846 + }, + { + "epoch": 0.75, + "grad_norm": 0.2670260276637567, + "learning_rate": 0.00014748156670643097, + "loss": 1.2654, + "step": 7847 + }, + { + "epoch": 0.75, + "grad_norm": 0.29233732430571957, + "learning_rate": 0.00014746764361163854, + "loss": 1.1009, + "step": 7848 + }, + { + "epoch": 0.75, + "grad_norm": 0.2872352622058822, + "learning_rate": 0.00014745371932893848, + "loss": 1.0123, + "step": 7849 + }, + { + "epoch": 0.75, + "grad_norm": 0.26320292513113624, + "learning_rate": 0.00014743979385867928, + "loss": 1.056, + "step": 7850 + }, + { + "epoch": 0.75, + "grad_norm": 0.29353850314037555, + "learning_rate": 0.00014742586720120943, + "loss": 1.0711, + "step": 7851 + }, + { + "epoch": 0.75, + "grad_norm": 0.24643873017384735, + "learning_rate": 0.00014741193935687743, + "loss": 0.942, + "step": 7852 + }, + { + "epoch": 0.75, + "grad_norm": 0.30839731486986416, + "learning_rate": 0.00014739801032603186, + "loss": 0.932, + "step": 7853 + }, + { + "epoch": 0.75, + "grad_norm": 0.25565523985069327, + "learning_rate": 0.0001473840801090213, + "loss": 1.0941, + "step": 7854 + }, + { + "epoch": 0.75, + "grad_norm": 0.26831609426984876, + "learning_rate": 0.00014737014870619438, + "loss": 1.079, + "step": 7855 + }, + { + "epoch": 0.75, + "grad_norm": 0.2782662024037316, + "learning_rate": 0.00014735621611789963, + "loss": 1.085, + "step": 7856 + }, + { + "epoch": 0.75, + "grad_norm": 0.29723517036293495, + "learning_rate": 0.0001473422823444859, + "loss": 1.0157, + "step": 7857 + }, + { + "epoch": 0.75, + "grad_norm": 0.26674243695427563, + "learning_rate": 0.00014732834738630178, + "loss": 1.0748, + "step": 7858 + }, + { + "epoch": 0.75, + "grad_norm": 0.3178208105289837, + "learning_rate": 0.00014731441124369598, + "loss": 1.047, + "step": 7859 + }, + { + "epoch": 0.75, + "grad_norm": 0.3797982501806393, + "learning_rate": 0.00014730047391701737, + "loss": 0.9933, + "step": 7860 + }, + { + "epoch": 0.75, + "grad_norm": 0.2901368038846893, + "learning_rate": 0.00014728653540661463, + "loss": 1.0821, + "step": 7861 + }, + { + "epoch": 0.75, + "grad_norm": 0.2955915921047405, + "learning_rate": 0.0001472725957128366, + "loss": 0.9773, + "step": 7862 + }, + { + "epoch": 0.75, + "grad_norm": 0.29647306511427657, + "learning_rate": 0.00014725865483603218, + "loss": 1.1752, + "step": 7863 + }, + { + "epoch": 0.75, + "grad_norm": 0.2918367246052348, + "learning_rate": 0.00014724471277655023, + "loss": 0.9763, + "step": 7864 + }, + { + "epoch": 0.75, + "grad_norm": 0.2815720894283948, + "learning_rate": 0.00014723076953473965, + "loss": 1.098, + "step": 7865 + }, + { + "epoch": 0.75, + "grad_norm": 0.2930097766676262, + "learning_rate": 0.00014721682511094935, + "loss": 1.0655, + "step": 7866 + }, + { + "epoch": 0.75, + "grad_norm": 0.29679633835988606, + "learning_rate": 0.00014720287950552836, + "loss": 1.103, + "step": 7867 + }, + { + "epoch": 0.75, + "grad_norm": 0.29540666426295414, + "learning_rate": 0.00014718893271882562, + "loss": 1.1377, + "step": 7868 + }, + { + "epoch": 0.75, + "grad_norm": 0.28866732431657105, + "learning_rate": 0.0001471749847511902, + "loss": 1.1552, + "step": 7869 + }, + { + "epoch": 0.75, + "grad_norm": 0.2897250322336478, + "learning_rate": 0.00014716103560297116, + "loss": 1.1178, + "step": 7870 + }, + { + "epoch": 0.75, + "grad_norm": 0.2871031890874088, + "learning_rate": 0.00014714708527451752, + "loss": 1.0608, + "step": 7871 + }, + { + "epoch": 0.75, + "grad_norm": 0.2660012537881913, + "learning_rate": 0.00014713313376617845, + "loss": 0.9785, + "step": 7872 + }, + { + "epoch": 0.75, + "grad_norm": 0.30288071506941316, + "learning_rate": 0.00014711918107830312, + "loss": 1.0374, + "step": 7873 + }, + { + "epoch": 0.75, + "grad_norm": 0.2623082689214895, + "learning_rate": 0.00014710522721124062, + "loss": 1.1882, + "step": 7874 + }, + { + "epoch": 0.75, + "grad_norm": 0.29218912698161004, + "learning_rate": 0.00014709127216534025, + "loss": 1.1121, + "step": 7875 + }, + { + "epoch": 0.75, + "grad_norm": 0.2926792107209431, + "learning_rate": 0.00014707731594095118, + "loss": 1.1558, + "step": 7876 + }, + { + "epoch": 0.75, + "grad_norm": 0.3036786630685544, + "learning_rate": 0.00014706335853842268, + "loss": 1.1285, + "step": 7877 + }, + { + "epoch": 0.75, + "grad_norm": 0.3070257247499537, + "learning_rate": 0.00014704939995810408, + "loss": 1.1566, + "step": 7878 + }, + { + "epoch": 0.75, + "grad_norm": 0.2753834785951813, + "learning_rate": 0.00014703544020034463, + "loss": 1.1336, + "step": 7879 + }, + { + "epoch": 0.75, + "grad_norm": 0.30835133105536705, + "learning_rate": 0.00014702147926549377, + "loss": 1.0733, + "step": 7880 + }, + { + "epoch": 0.75, + "grad_norm": 0.30129377981823663, + "learning_rate": 0.00014700751715390084, + "loss": 1.1398, + "step": 7881 + }, + { + "epoch": 0.75, + "grad_norm": 0.26568110306618115, + "learning_rate": 0.00014699355386591523, + "loss": 1.0773, + "step": 7882 + }, + { + "epoch": 0.75, + "grad_norm": 0.28052397758769976, + "learning_rate": 0.00014697958940188642, + "loss": 1.0731, + "step": 7883 + }, + { + "epoch": 0.75, + "grad_norm": 0.2696382620489903, + "learning_rate": 0.00014696562376216381, + "loss": 0.9844, + "step": 7884 + }, + { + "epoch": 0.75, + "grad_norm": 0.2532216857709228, + "learning_rate": 0.000146951656947097, + "loss": 1.0863, + "step": 7885 + }, + { + "epoch": 0.75, + "grad_norm": 0.2622266542497967, + "learning_rate": 0.00014693768895703544, + "loss": 1.0305, + "step": 7886 + }, + { + "epoch": 0.75, + "grad_norm": 0.2800195946028661, + "learning_rate": 0.00014692371979232872, + "loss": 1.1075, + "step": 7887 + }, + { + "epoch": 0.75, + "grad_norm": 0.2953080493463318, + "learning_rate": 0.00014690974945332644, + "loss": 0.9948, + "step": 7888 + }, + { + "epoch": 0.75, + "grad_norm": 0.2886266231002391, + "learning_rate": 0.00014689577794037818, + "loss": 0.9764, + "step": 7889 + }, + { + "epoch": 0.75, + "grad_norm": 0.29706661719229843, + "learning_rate": 0.0001468818052538336, + "loss": 1.055, + "step": 7890 + }, + { + "epoch": 0.75, + "grad_norm": 0.28179682965161795, + "learning_rate": 0.0001468678313940424, + "loss": 1.0903, + "step": 7891 + }, + { + "epoch": 0.76, + "grad_norm": 0.3030242128434006, + "learning_rate": 0.00014685385636135426, + "loss": 1.1486, + "step": 7892 + }, + { + "epoch": 0.76, + "grad_norm": 0.2647061302202202, + "learning_rate": 0.00014683988015611892, + "loss": 1.0548, + "step": 7893 + }, + { + "epoch": 0.76, + "grad_norm": 0.300733623893973, + "learning_rate": 0.00014682590277868612, + "loss": 1.1367, + "step": 7894 + }, + { + "epoch": 0.76, + "grad_norm": 0.2995793418222508, + "learning_rate": 0.0001468119242294057, + "loss": 1.0878, + "step": 7895 + }, + { + "epoch": 0.76, + "grad_norm": 0.28708772163438406, + "learning_rate": 0.00014679794450862745, + "loss": 0.9396, + "step": 7896 + }, + { + "epoch": 0.76, + "grad_norm": 0.311063842263335, + "learning_rate": 0.0001467839636167012, + "loss": 1.1202, + "step": 7897 + }, + { + "epoch": 0.76, + "grad_norm": 0.2986192805747378, + "learning_rate": 0.0001467699815539769, + "loss": 1.0607, + "step": 7898 + }, + { + "epoch": 0.76, + "grad_norm": 0.258096740209968, + "learning_rate": 0.0001467559983208044, + "loss": 0.9521, + "step": 7899 + }, + { + "epoch": 0.76, + "grad_norm": 0.27283666156818637, + "learning_rate": 0.00014674201391753363, + "loss": 1.1342, + "step": 7900 + }, + { + "epoch": 0.76, + "grad_norm": 0.28773671837545195, + "learning_rate": 0.0001467280283445146, + "loss": 1.1128, + "step": 7901 + }, + { + "epoch": 0.76, + "grad_norm": 0.307556564712858, + "learning_rate": 0.00014671404160209733, + "loss": 0.9963, + "step": 7902 + }, + { + "epoch": 0.76, + "grad_norm": 0.2684719138920676, + "learning_rate": 0.0001467000536906318, + "loss": 1.0201, + "step": 7903 + }, + { + "epoch": 0.76, + "grad_norm": 0.26240667501651854, + "learning_rate": 0.00014668606461046806, + "loss": 1.0589, + "step": 7904 + }, + { + "epoch": 0.76, + "grad_norm": 0.27922442657332547, + "learning_rate": 0.0001466720743619562, + "loss": 1.0683, + "step": 7905 + }, + { + "epoch": 0.76, + "grad_norm": 0.2540084140712392, + "learning_rate": 0.00014665808294544633, + "loss": 0.9643, + "step": 7906 + }, + { + "epoch": 0.76, + "grad_norm": 0.31142085724795016, + "learning_rate": 0.00014664409036128866, + "loss": 1.045, + "step": 7907 + }, + { + "epoch": 0.76, + "grad_norm": 0.3206848269424768, + "learning_rate": 0.00014663009660983328, + "loss": 1.1147, + "step": 7908 + }, + { + "epoch": 0.76, + "grad_norm": 0.33340917807727916, + "learning_rate": 0.00014661610169143044, + "loss": 1.0744, + "step": 7909 + }, + { + "epoch": 0.76, + "grad_norm": 0.32071991522507715, + "learning_rate": 0.00014660210560643036, + "loss": 1.1297, + "step": 7910 + }, + { + "epoch": 0.76, + "grad_norm": 0.2761333180916678, + "learning_rate": 0.00014658810835518332, + "loss": 1.0477, + "step": 7911 + }, + { + "epoch": 0.76, + "grad_norm": 0.2859125633527157, + "learning_rate": 0.00014657410993803956, + "loss": 1.0781, + "step": 7912 + }, + { + "epoch": 0.76, + "grad_norm": 0.3274797315976737, + "learning_rate": 0.00014656011035534943, + "loss": 1.0144, + "step": 7913 + }, + { + "epoch": 0.76, + "grad_norm": 0.28598470593343445, + "learning_rate": 0.00014654610960746327, + "loss": 1.0887, + "step": 7914 + }, + { + "epoch": 0.76, + "grad_norm": 0.28926204149947815, + "learning_rate": 0.00014653210769473147, + "loss": 1.0627, + "step": 7915 + }, + { + "epoch": 0.76, + "grad_norm": 0.26324720206303615, + "learning_rate": 0.00014651810461750446, + "loss": 1.1053, + "step": 7916 + }, + { + "epoch": 0.76, + "grad_norm": 0.30675528347879105, + "learning_rate": 0.0001465041003761326, + "loss": 1.1054, + "step": 7917 + }, + { + "epoch": 0.76, + "grad_norm": 0.26066528416246476, + "learning_rate": 0.0001464900949709664, + "loss": 1.0405, + "step": 7918 + }, + { + "epoch": 0.76, + "grad_norm": 0.29329946245835375, + "learning_rate": 0.0001464760884023564, + "loss": 1.0615, + "step": 7919 + }, + { + "epoch": 0.76, + "grad_norm": 0.2679473490528041, + "learning_rate": 0.00014646208067065305, + "loss": 1.0988, + "step": 7920 + }, + { + "epoch": 0.76, + "grad_norm": 0.29866219772813785, + "learning_rate": 0.00014644807177620694, + "loss": 1.1193, + "step": 7921 + }, + { + "epoch": 0.76, + "grad_norm": 0.3004083603649936, + "learning_rate": 0.00014643406171936863, + "loss": 1.0458, + "step": 7922 + }, + { + "epoch": 0.76, + "grad_norm": 0.3255922513505334, + "learning_rate": 0.00014642005050048877, + "loss": 1.1641, + "step": 7923 + }, + { + "epoch": 0.76, + "grad_norm": 0.27331269973566386, + "learning_rate": 0.00014640603811991794, + "loss": 1.0904, + "step": 7924 + }, + { + "epoch": 0.76, + "grad_norm": 0.3021895089941293, + "learning_rate": 0.00014639202457800688, + "loss": 1.0426, + "step": 7925 + }, + { + "epoch": 0.76, + "grad_norm": 0.31651111587188924, + "learning_rate": 0.0001463780098751062, + "loss": 0.9561, + "step": 7926 + }, + { + "epoch": 0.76, + "grad_norm": 0.27245193343641233, + "learning_rate": 0.00014636399401156668, + "loss": 0.9084, + "step": 7927 + }, + { + "epoch": 0.76, + "grad_norm": 0.28171140443893944, + "learning_rate": 0.0001463499769877391, + "loss": 1.0122, + "step": 7928 + }, + { + "epoch": 0.76, + "grad_norm": 0.2722172920609029, + "learning_rate": 0.00014633595880397422, + "loss": 0.9773, + "step": 7929 + }, + { + "epoch": 0.76, + "grad_norm": 0.314701712266059, + "learning_rate": 0.00014632193946062283, + "loss": 1.0967, + "step": 7930 + }, + { + "epoch": 0.76, + "grad_norm": 0.2766120012080896, + "learning_rate": 0.0001463079189580358, + "loss": 1.1545, + "step": 7931 + }, + { + "epoch": 0.76, + "grad_norm": 0.26393567149311037, + "learning_rate": 0.00014629389729656399, + "loss": 0.8912, + "step": 7932 + }, + { + "epoch": 0.76, + "grad_norm": 0.30280952487557516, + "learning_rate": 0.0001462798744765583, + "loss": 1.1161, + "step": 7933 + }, + { + "epoch": 0.76, + "grad_norm": 0.31616202197131665, + "learning_rate": 0.0001462658504983697, + "loss": 1.179, + "step": 7934 + }, + { + "epoch": 0.76, + "grad_norm": 0.26565495442579756, + "learning_rate": 0.0001462518253623491, + "loss": 1.0568, + "step": 7935 + }, + { + "epoch": 0.76, + "grad_norm": 0.318280638548035, + "learning_rate": 0.00014623779906884748, + "loss": 1.1108, + "step": 7936 + }, + { + "epoch": 0.76, + "grad_norm": 0.2586299065330331, + "learning_rate": 0.00014622377161821587, + "loss": 1.148, + "step": 7937 + }, + { + "epoch": 0.76, + "grad_norm": 0.2715217059364236, + "learning_rate": 0.00014620974301080537, + "loss": 1.0559, + "step": 7938 + }, + { + "epoch": 0.76, + "grad_norm": 0.24985474931051407, + "learning_rate": 0.00014619571324696697, + "loss": 1.1047, + "step": 7939 + }, + { + "epoch": 0.76, + "grad_norm": 0.2961624550424931, + "learning_rate": 0.00014618168232705182, + "loss": 0.9659, + "step": 7940 + }, + { + "epoch": 0.76, + "grad_norm": 0.2966955141929237, + "learning_rate": 0.00014616765025141106, + "loss": 1.0266, + "step": 7941 + }, + { + "epoch": 0.76, + "grad_norm": 0.2986341505874412, + "learning_rate": 0.00014615361702039582, + "loss": 1.0165, + "step": 7942 + }, + { + "epoch": 0.76, + "grad_norm": 0.291306108234052, + "learning_rate": 0.00014613958263435734, + "loss": 1.0463, + "step": 7943 + }, + { + "epoch": 0.76, + "grad_norm": 0.24770026210630766, + "learning_rate": 0.00014612554709364677, + "loss": 1.0362, + "step": 7944 + }, + { + "epoch": 0.76, + "grad_norm": 0.30727250672800005, + "learning_rate": 0.00014611151039861542, + "loss": 1.0903, + "step": 7945 + }, + { + "epoch": 0.76, + "grad_norm": 0.30656468148580795, + "learning_rate": 0.00014609747254961452, + "loss": 1.0681, + "step": 7946 + }, + { + "epoch": 0.76, + "grad_norm": 0.28351015720243533, + "learning_rate": 0.0001460834335469954, + "loss": 1.166, + "step": 7947 + }, + { + "epoch": 0.76, + "grad_norm": 0.28579851887764157, + "learning_rate": 0.0001460693933911094, + "loss": 1.0082, + "step": 7948 + }, + { + "epoch": 0.76, + "grad_norm": 0.26715615685634764, + "learning_rate": 0.00014605535208230789, + "loss": 1.1272, + "step": 7949 + }, + { + "epoch": 0.76, + "grad_norm": 0.2908957018166822, + "learning_rate": 0.0001460413096209422, + "loss": 1.0386, + "step": 7950 + }, + { + "epoch": 0.76, + "grad_norm": 0.30421751423231963, + "learning_rate": 0.00014602726600736388, + "loss": 1.1368, + "step": 7951 + }, + { + "epoch": 0.76, + "grad_norm": 0.24654599232461855, + "learning_rate": 0.00014601322124192426, + "loss": 1.0704, + "step": 7952 + }, + { + "epoch": 0.76, + "grad_norm": 0.2858412396022753, + "learning_rate": 0.00014599917532497487, + "loss": 0.9629, + "step": 7953 + }, + { + "epoch": 0.76, + "grad_norm": 0.31986567150353395, + "learning_rate": 0.00014598512825686718, + "loss": 1.1515, + "step": 7954 + }, + { + "epoch": 0.76, + "grad_norm": 0.26767724316265096, + "learning_rate": 0.0001459710800379528, + "loss": 1.0711, + "step": 7955 + }, + { + "epoch": 0.76, + "grad_norm": 0.2683093058976096, + "learning_rate": 0.0001459570306685832, + "loss": 0.9834, + "step": 7956 + }, + { + "epoch": 0.76, + "grad_norm": 0.26418718399717656, + "learning_rate": 0.00014594298014911005, + "loss": 1.1891, + "step": 7957 + }, + { + "epoch": 0.76, + "grad_norm": 0.2997490177491894, + "learning_rate": 0.00014592892847988494, + "loss": 1.1303, + "step": 7958 + }, + { + "epoch": 0.76, + "grad_norm": 0.3092712454899324, + "learning_rate": 0.00014591487566125957, + "loss": 1.0605, + "step": 7959 + }, + { + "epoch": 0.76, + "grad_norm": 0.270510283984863, + "learning_rate": 0.00014590082169358554, + "loss": 1.0886, + "step": 7960 + }, + { + "epoch": 0.76, + "grad_norm": 0.24910690948882463, + "learning_rate": 0.0001458867665772146, + "loss": 1.0897, + "step": 7961 + }, + { + "epoch": 0.76, + "grad_norm": 0.29126461074740784, + "learning_rate": 0.0001458727103124985, + "loss": 1.1572, + "step": 7962 + }, + { + "epoch": 0.76, + "grad_norm": 0.32863806873359025, + "learning_rate": 0.000145858652899789, + "loss": 1.0036, + "step": 7963 + }, + { + "epoch": 0.76, + "grad_norm": 0.27115363531745906, + "learning_rate": 0.00014584459433943786, + "loss": 1.0609, + "step": 7964 + }, + { + "epoch": 0.76, + "grad_norm": 0.26632334307016087, + "learning_rate": 0.00014583053463179695, + "loss": 0.929, + "step": 7965 + }, + { + "epoch": 0.76, + "grad_norm": 0.30896648642067515, + "learning_rate": 0.00014581647377721812, + "loss": 1.1302, + "step": 7966 + }, + { + "epoch": 0.76, + "grad_norm": 0.31271610841373415, + "learning_rate": 0.00014580241177605322, + "loss": 1.0555, + "step": 7967 + }, + { + "epoch": 0.76, + "grad_norm": 0.2862084572430668, + "learning_rate": 0.0001457883486286542, + "loss": 1.1462, + "step": 7968 + }, + { + "epoch": 0.76, + "grad_norm": 0.2722638712262504, + "learning_rate": 0.00014577428433537297, + "loss": 1.0671, + "step": 7969 + }, + { + "epoch": 0.76, + "grad_norm": 0.29776545230446366, + "learning_rate": 0.0001457602188965615, + "loss": 1.0892, + "step": 7970 + }, + { + "epoch": 0.76, + "grad_norm": 0.28339621303738605, + "learning_rate": 0.00014574615231257177, + "loss": 1.0042, + "step": 7971 + }, + { + "epoch": 0.76, + "grad_norm": 0.2872649900886519, + "learning_rate": 0.00014573208458375586, + "loss": 1.0962, + "step": 7972 + }, + { + "epoch": 0.76, + "grad_norm": 0.29073436106492967, + "learning_rate": 0.0001457180157104658, + "loss": 1.096, + "step": 7973 + }, + { + "epoch": 0.76, + "grad_norm": 0.2782075864070279, + "learning_rate": 0.00014570394569305366, + "loss": 1.0278, + "step": 7974 + }, + { + "epoch": 0.76, + "grad_norm": 0.31350190862970956, + "learning_rate": 0.00014568987453187154, + "loss": 1.091, + "step": 7975 + }, + { + "epoch": 0.76, + "grad_norm": 0.2652268268602932, + "learning_rate": 0.0001456758022272716, + "loss": 1.128, + "step": 7976 + }, + { + "epoch": 0.76, + "grad_norm": 0.2810839334912306, + "learning_rate": 0.00014566172877960603, + "loss": 1.0408, + "step": 7977 + }, + { + "epoch": 0.76, + "grad_norm": 0.25013935049819563, + "learning_rate": 0.00014564765418922696, + "loss": 1.1124, + "step": 7978 + }, + { + "epoch": 0.76, + "grad_norm": 0.23668034177376465, + "learning_rate": 0.00014563357845648667, + "loss": 1.028, + "step": 7979 + }, + { + "epoch": 0.76, + "grad_norm": 0.2736819717949704, + "learning_rate": 0.0001456195015817374, + "loss": 0.9969, + "step": 7980 + }, + { + "epoch": 0.76, + "grad_norm": 0.2771456339595817, + "learning_rate": 0.00014560542356533142, + "loss": 1.1049, + "step": 7981 + }, + { + "epoch": 0.76, + "grad_norm": 0.294782138767455, + "learning_rate": 0.00014559134440762108, + "loss": 1.13, + "step": 7982 + }, + { + "epoch": 0.76, + "grad_norm": 0.26345886128180046, + "learning_rate": 0.0001455772641089587, + "loss": 1.1155, + "step": 7983 + }, + { + "epoch": 0.76, + "grad_norm": 0.2829069470535079, + "learning_rate": 0.00014556318266969656, + "loss": 1.0918, + "step": 7984 + }, + { + "epoch": 0.76, + "grad_norm": 0.28542191746861517, + "learning_rate": 0.00014554910009018722, + "loss": 1.0391, + "step": 7985 + }, + { + "epoch": 0.76, + "grad_norm": 0.2538797891395937, + "learning_rate": 0.000145535016370783, + "loss": 0.9585, + "step": 7986 + }, + { + "epoch": 0.76, + "grad_norm": 0.26884615169189024, + "learning_rate": 0.0001455209315118364, + "loss": 1.0022, + "step": 7987 + }, + { + "epoch": 0.76, + "grad_norm": 0.2716483059032336, + "learning_rate": 0.00014550684551369985, + "loss": 1.0702, + "step": 7988 + }, + { + "epoch": 0.76, + "grad_norm": 0.27843349775471815, + "learning_rate": 0.00014549275837672586, + "loss": 1.1401, + "step": 7989 + }, + { + "epoch": 0.76, + "grad_norm": 0.2518481347111056, + "learning_rate": 0.00014547867010126706, + "loss": 1.0111, + "step": 7990 + }, + { + "epoch": 0.76, + "grad_norm": 0.3216392004906807, + "learning_rate": 0.00014546458068767594, + "loss": 1.0472, + "step": 7991 + }, + { + "epoch": 0.76, + "grad_norm": 0.28909463374066535, + "learning_rate": 0.00014545049013630512, + "loss": 1.0201, + "step": 7992 + }, + { + "epoch": 0.76, + "grad_norm": 0.31788442875623607, + "learning_rate": 0.0001454363984475072, + "loss": 1.0955, + "step": 7993 + }, + { + "epoch": 0.76, + "grad_norm": 0.2758373823123496, + "learning_rate": 0.00014542230562163488, + "loss": 1.0873, + "step": 7994 + }, + { + "epoch": 0.76, + "grad_norm": 0.2818308726562138, + "learning_rate": 0.0001454082116590408, + "loss": 1.2114, + "step": 7995 + }, + { + "epoch": 0.76, + "grad_norm": 0.2641986472531651, + "learning_rate": 0.0001453941165600777, + "loss": 1.1287, + "step": 7996 + }, + { + "epoch": 0.77, + "grad_norm": 0.27941463601912914, + "learning_rate": 0.0001453800203250983, + "loss": 1.0574, + "step": 7997 + }, + { + "epoch": 0.77, + "grad_norm": 0.2756789148686793, + "learning_rate": 0.00014536592295445532, + "loss": 0.9556, + "step": 7998 + }, + { + "epoch": 0.77, + "grad_norm": 0.27533489457097576, + "learning_rate": 0.00014535182444850165, + "loss": 1.1021, + "step": 7999 + }, + { + "epoch": 0.77, + "grad_norm": 0.2964512342749194, + "learning_rate": 0.00014533772480759008, + "loss": 1.0421, + "step": 8000 + }, + { + "epoch": 0.77, + "grad_norm": 0.26212904131408093, + "learning_rate": 0.00014532362403207346, + "loss": 1.0015, + "step": 8001 + }, + { + "epoch": 0.77, + "grad_norm": 0.2949190595576002, + "learning_rate": 0.00014530952212230463, + "loss": 1.0511, + "step": 8002 + }, + { + "epoch": 0.77, + "grad_norm": 0.3066540690434103, + "learning_rate": 0.00014529541907863655, + "loss": 1.0643, + "step": 8003 + }, + { + "epoch": 0.77, + "grad_norm": 0.27584654577393586, + "learning_rate": 0.00014528131490142217, + "loss": 0.9773, + "step": 8004 + }, + { + "epoch": 0.77, + "grad_norm": 0.27774031420621953, + "learning_rate": 0.00014526720959101436, + "loss": 0.9925, + "step": 8005 + }, + { + "epoch": 0.77, + "grad_norm": 0.2512278740693556, + "learning_rate": 0.00014525310314776623, + "loss": 1.0008, + "step": 8006 + }, + { + "epoch": 0.77, + "grad_norm": 0.2630439424936692, + "learning_rate": 0.00014523899557203075, + "loss": 1.1098, + "step": 8007 + }, + { + "epoch": 0.77, + "grad_norm": 0.27426353058067837, + "learning_rate": 0.00014522488686416097, + "loss": 1.0728, + "step": 8008 + }, + { + "epoch": 0.77, + "grad_norm": 0.2763756735231871, + "learning_rate": 0.00014521077702450995, + "loss": 1.0639, + "step": 8009 + }, + { + "epoch": 0.77, + "grad_norm": 0.3325734526066199, + "learning_rate": 0.00014519666605343083, + "loss": 1.0027, + "step": 8010 + }, + { + "epoch": 0.77, + "grad_norm": 0.26469084339952254, + "learning_rate": 0.00014518255395127677, + "loss": 1.0773, + "step": 8011 + }, + { + "epoch": 0.77, + "grad_norm": 0.2755841427968526, + "learning_rate": 0.00014516844071840086, + "loss": 1.118, + "step": 8012 + }, + { + "epoch": 0.77, + "grad_norm": 0.2970488079219033, + "learning_rate": 0.00014515432635515635, + "loss": 1.0362, + "step": 8013 + }, + { + "epoch": 0.77, + "grad_norm": 0.2954323242845108, + "learning_rate": 0.00014514021086189645, + "loss": 1.0746, + "step": 8014 + }, + { + "epoch": 0.77, + "grad_norm": 0.2617764506782068, + "learning_rate": 0.00014512609423897438, + "loss": 1.0385, + "step": 8015 + }, + { + "epoch": 0.77, + "grad_norm": 0.30127357515303343, + "learning_rate": 0.00014511197648674348, + "loss": 1.0966, + "step": 8016 + }, + { + "epoch": 0.77, + "grad_norm": 0.3004406925080267, + "learning_rate": 0.00014509785760555697, + "loss": 1.0441, + "step": 8017 + }, + { + "epoch": 0.77, + "grad_norm": 0.25228201013427193, + "learning_rate": 0.00014508373759576824, + "loss": 1.0692, + "step": 8018 + }, + { + "epoch": 0.77, + "grad_norm": 0.3052520210235085, + "learning_rate": 0.00014506961645773068, + "loss": 1.1513, + "step": 8019 + }, + { + "epoch": 0.77, + "grad_norm": 0.2709522809700183, + "learning_rate": 0.00014505549419179765, + "loss": 0.991, + "step": 8020 + }, + { + "epoch": 0.77, + "grad_norm": 0.26525464351023736, + "learning_rate": 0.00014504137079832252, + "loss": 0.8964, + "step": 8021 + }, + { + "epoch": 0.77, + "grad_norm": 0.2877907446668608, + "learning_rate": 0.00014502724627765877, + "loss": 1.0461, + "step": 8022 + }, + { + "epoch": 0.77, + "grad_norm": 0.29764029690496574, + "learning_rate": 0.00014501312063015993, + "loss": 1.0284, + "step": 8023 + }, + { + "epoch": 0.77, + "grad_norm": 0.2883370401645907, + "learning_rate": 0.00014499899385617943, + "loss": 1.0268, + "step": 8024 + }, + { + "epoch": 0.77, + "grad_norm": 0.2563877310771201, + "learning_rate": 0.0001449848659560708, + "loss": 1.1029, + "step": 8025 + }, + { + "epoch": 0.77, + "grad_norm": 0.29710595080293883, + "learning_rate": 0.00014497073693018768, + "loss": 1.1315, + "step": 8026 + }, + { + "epoch": 0.77, + "grad_norm": 0.2559529397106755, + "learning_rate": 0.00014495660677888358, + "loss": 1.009, + "step": 8027 + }, + { + "epoch": 0.77, + "grad_norm": 0.29014013639604846, + "learning_rate": 0.00014494247550251213, + "loss": 1.0681, + "step": 8028 + }, + { + "epoch": 0.77, + "grad_norm": 0.2875485426750669, + "learning_rate": 0.00014492834310142702, + "loss": 1.1195, + "step": 8029 + }, + { + "epoch": 0.77, + "grad_norm": 0.286722769874282, + "learning_rate": 0.00014491420957598184, + "loss": 1.0529, + "step": 8030 + }, + { + "epoch": 0.77, + "grad_norm": 0.2674676429533566, + "learning_rate": 0.0001449000749265304, + "loss": 1.0475, + "step": 8031 + }, + { + "epoch": 0.77, + "grad_norm": 0.29840922095847167, + "learning_rate": 0.00014488593915342628, + "loss": 1.123, + "step": 8032 + }, + { + "epoch": 0.77, + "grad_norm": 0.29901175240287103, + "learning_rate": 0.0001448718022570234, + "loss": 1.148, + "step": 8033 + }, + { + "epoch": 0.77, + "grad_norm": 0.2915382990881953, + "learning_rate": 0.00014485766423767544, + "loss": 1.0401, + "step": 8034 + }, + { + "epoch": 0.77, + "grad_norm": 0.26281760733012527, + "learning_rate": 0.00014484352509573626, + "loss": 1.1292, + "step": 8035 + }, + { + "epoch": 0.77, + "grad_norm": 0.2756692483739733, + "learning_rate": 0.00014482938483155965, + "loss": 1.0544, + "step": 8036 + }, + { + "epoch": 0.77, + "grad_norm": 0.28042375304828776, + "learning_rate": 0.00014481524344549953, + "loss": 1.1094, + "step": 8037 + }, + { + "epoch": 0.77, + "grad_norm": 0.2666048364114282, + "learning_rate": 0.00014480110093790976, + "loss": 1.0748, + "step": 8038 + }, + { + "epoch": 0.77, + "grad_norm": 0.5049686356926107, + "learning_rate": 0.0001447869573091443, + "loss": 1.0392, + "step": 8039 + }, + { + "epoch": 0.77, + "grad_norm": 0.2898604153674101, + "learning_rate": 0.0001447728125595571, + "loss": 1.0527, + "step": 8040 + }, + { + "epoch": 0.77, + "grad_norm": 0.2938423642503213, + "learning_rate": 0.0001447586666895021, + "loss": 1.0593, + "step": 8041 + }, + { + "epoch": 0.77, + "grad_norm": 0.3199560544400549, + "learning_rate": 0.00014474451969933333, + "loss": 1.1253, + "step": 8042 + }, + { + "epoch": 0.77, + "grad_norm": 0.279058551647216, + "learning_rate": 0.00014473037158940484, + "loss": 1.0904, + "step": 8043 + }, + { + "epoch": 0.77, + "grad_norm": 0.3317633487011477, + "learning_rate": 0.0001447162223600707, + "loss": 1.0621, + "step": 8044 + }, + { + "epoch": 0.77, + "grad_norm": 0.31113174621821804, + "learning_rate": 0.00014470207201168497, + "loss": 1.1192, + "step": 8045 + }, + { + "epoch": 0.77, + "grad_norm": 0.2732527847810805, + "learning_rate": 0.00014468792054460184, + "loss": 1.0248, + "step": 8046 + }, + { + "epoch": 0.77, + "grad_norm": 0.29153439327487507, + "learning_rate": 0.00014467376795917537, + "loss": 1.124, + "step": 8047 + }, + { + "epoch": 0.77, + "grad_norm": 0.2860523862518081, + "learning_rate": 0.0001446596142557598, + "loss": 1.0621, + "step": 8048 + }, + { + "epoch": 0.77, + "grad_norm": 0.2862949081963216, + "learning_rate": 0.00014464545943470932, + "loss": 1.0017, + "step": 8049 + }, + { + "epoch": 0.77, + "grad_norm": 0.2968576640237807, + "learning_rate": 0.00014463130349637814, + "loss": 1.0521, + "step": 8050 + }, + { + "epoch": 0.77, + "grad_norm": 0.3203158707213885, + "learning_rate": 0.00014461714644112053, + "loss": 1.0283, + "step": 8051 + }, + { + "epoch": 0.77, + "grad_norm": 0.23962634495433535, + "learning_rate": 0.0001446029882692908, + "loss": 1.0052, + "step": 8052 + }, + { + "epoch": 0.77, + "grad_norm": 0.29338455713698386, + "learning_rate": 0.0001445888289812433, + "loss": 1.1304, + "step": 8053 + }, + { + "epoch": 0.77, + "grad_norm": 0.2650099607677807, + "learning_rate": 0.0001445746685773323, + "loss": 0.9916, + "step": 8054 + }, + { + "epoch": 0.77, + "grad_norm": 0.26198071838847975, + "learning_rate": 0.00014456050705791216, + "loss": 1.0875, + "step": 8055 + }, + { + "epoch": 0.77, + "grad_norm": 0.292914913997817, + "learning_rate": 0.00014454634442333738, + "loss": 1.0666, + "step": 8056 + }, + { + "epoch": 0.77, + "grad_norm": 0.3015285693686079, + "learning_rate": 0.00014453218067396231, + "loss": 1.1363, + "step": 8057 + }, + { + "epoch": 0.77, + "grad_norm": 0.30434995117078845, + "learning_rate": 0.00014451801581014147, + "loss": 1.0074, + "step": 8058 + }, + { + "epoch": 0.77, + "grad_norm": 0.30572536430411557, + "learning_rate": 0.00014450384983222926, + "loss": 1.0941, + "step": 8059 + }, + { + "epoch": 0.77, + "grad_norm": 0.2665325169928879, + "learning_rate": 0.00014448968274058025, + "loss": 1.0584, + "step": 8060 + }, + { + "epoch": 0.77, + "grad_norm": 0.291238529425458, + "learning_rate": 0.000144475514535549, + "loss": 0.9758, + "step": 8061 + }, + { + "epoch": 0.77, + "grad_norm": 0.320923065004721, + "learning_rate": 0.00014446134521749, + "loss": 0.9887, + "step": 8062 + }, + { + "epoch": 0.77, + "grad_norm": 0.2695062501972563, + "learning_rate": 0.00014444717478675792, + "loss": 1.063, + "step": 8063 + }, + { + "epoch": 0.77, + "grad_norm": 0.29036294842452276, + "learning_rate": 0.00014443300324370738, + "loss": 1.0825, + "step": 8064 + }, + { + "epoch": 0.77, + "grad_norm": 0.2667481516709462, + "learning_rate": 0.00014441883058869298, + "loss": 1.0139, + "step": 8065 + }, + { + "epoch": 0.77, + "grad_norm": 0.31330268446835163, + "learning_rate": 0.00014440465682206944, + "loss": 1.1519, + "step": 8066 + }, + { + "epoch": 0.77, + "grad_norm": 0.29426667401104506, + "learning_rate": 0.00014439048194419141, + "loss": 1.001, + "step": 8067 + }, + { + "epoch": 0.77, + "grad_norm": 0.30892190035705414, + "learning_rate": 0.00014437630595541374, + "loss": 1.0714, + "step": 8068 + }, + { + "epoch": 0.77, + "grad_norm": 0.28625371937125405, + "learning_rate": 0.00014436212885609106, + "loss": 1.1328, + "step": 8069 + }, + { + "epoch": 0.77, + "grad_norm": 0.2904730209899161, + "learning_rate": 0.00014434795064657827, + "loss": 0.9807, + "step": 8070 + }, + { + "epoch": 0.77, + "grad_norm": 0.29946573442663216, + "learning_rate": 0.0001443337713272301, + "loss": 1.108, + "step": 8071 + }, + { + "epoch": 0.77, + "grad_norm": 0.25138650851267935, + "learning_rate": 0.0001443195908984015, + "loss": 0.9354, + "step": 8072 + }, + { + "epoch": 0.77, + "grad_norm": 0.3046129695319591, + "learning_rate": 0.00014430540936044724, + "loss": 1.0464, + "step": 8073 + }, + { + "epoch": 0.77, + "grad_norm": 0.2815772869608019, + "learning_rate": 0.0001442912267137223, + "loss": 1.1689, + "step": 8074 + }, + { + "epoch": 0.77, + "grad_norm": 0.28788534180895897, + "learning_rate": 0.00014427704295858154, + "loss": 1.1042, + "step": 8075 + }, + { + "epoch": 0.77, + "grad_norm": 0.28620999022686416, + "learning_rate": 0.00014426285809537997, + "loss": 1.0644, + "step": 8076 + }, + { + "epoch": 0.77, + "grad_norm": 0.2718948107560663, + "learning_rate": 0.00014424867212447254, + "loss": 1.0555, + "step": 8077 + }, + { + "epoch": 0.77, + "grad_norm": 0.2745313908908246, + "learning_rate": 0.0001442344850462143, + "loss": 0.953, + "step": 8078 + }, + { + "epoch": 0.77, + "grad_norm": 0.2782122491457099, + "learning_rate": 0.0001442202968609603, + "loss": 1.098, + "step": 8079 + }, + { + "epoch": 0.77, + "grad_norm": 0.27832697424885927, + "learning_rate": 0.00014420610756906552, + "loss": 0.9896, + "step": 8080 + }, + { + "epoch": 0.77, + "grad_norm": 0.2850630781424915, + "learning_rate": 0.00014419191717088517, + "loss": 1.0579, + "step": 8081 + }, + { + "epoch": 0.77, + "grad_norm": 0.30698153794733524, + "learning_rate": 0.00014417772566677428, + "loss": 0.9522, + "step": 8082 + }, + { + "epoch": 0.77, + "grad_norm": 0.2412252387912062, + "learning_rate": 0.00014416353305708802, + "loss": 0.963, + "step": 8083 + }, + { + "epoch": 0.77, + "grad_norm": 0.2760933371029509, + "learning_rate": 0.00014414933934218165, + "loss": 1.039, + "step": 8084 + }, + { + "epoch": 0.77, + "grad_norm": 0.27478977307311525, + "learning_rate": 0.0001441351445224103, + "loss": 1.0191, + "step": 8085 + }, + { + "epoch": 0.77, + "grad_norm": 0.27460908719231536, + "learning_rate": 0.0001441209485981292, + "loss": 0.942, + "step": 8086 + }, + { + "epoch": 0.77, + "grad_norm": 0.300097647062556, + "learning_rate": 0.00014410675156969362, + "loss": 1.1083, + "step": 8087 + }, + { + "epoch": 0.77, + "grad_norm": 0.301434940986813, + "learning_rate": 0.0001440925534374589, + "loss": 1.1201, + "step": 8088 + }, + { + "epoch": 0.77, + "grad_norm": 0.34304082819084863, + "learning_rate": 0.00014407835420178028, + "loss": 1.2234, + "step": 8089 + }, + { + "epoch": 0.77, + "grad_norm": 0.30266233132954434, + "learning_rate": 0.00014406415386301319, + "loss": 1.0731, + "step": 8090 + }, + { + "epoch": 0.77, + "grad_norm": 0.29052074003899026, + "learning_rate": 0.00014404995242151293, + "loss": 1.0351, + "step": 8091 + }, + { + "epoch": 0.77, + "grad_norm": 0.31236829856827036, + "learning_rate": 0.00014403574987763493, + "loss": 1.1238, + "step": 8092 + }, + { + "epoch": 0.77, + "grad_norm": 0.27979550559379457, + "learning_rate": 0.0001440215462317346, + "loss": 1.142, + "step": 8093 + }, + { + "epoch": 0.77, + "grad_norm": 0.303167287374647, + "learning_rate": 0.00014400734148416742, + "loss": 1.1546, + "step": 8094 + }, + { + "epoch": 0.77, + "grad_norm": 0.2722206268311316, + "learning_rate": 0.00014399313563528886, + "loss": 1.0722, + "step": 8095 + }, + { + "epoch": 0.77, + "grad_norm": 0.29950103549697416, + "learning_rate": 0.00014397892868545442, + "loss": 1.0869, + "step": 8096 + }, + { + "epoch": 0.77, + "grad_norm": 0.29083600055293385, + "learning_rate": 0.00014396472063501968, + "loss": 1.0576, + "step": 8097 + }, + { + "epoch": 0.77, + "grad_norm": 0.2883189452608079, + "learning_rate": 0.00014395051148434015, + "loss": 0.9811, + "step": 8098 + }, + { + "epoch": 0.77, + "grad_norm": 0.2614201885833928, + "learning_rate": 0.0001439363012337715, + "loss": 1.0721, + "step": 8099 + }, + { + "epoch": 0.77, + "grad_norm": 0.27854811920048617, + "learning_rate": 0.00014392208988366921, + "loss": 1.0133, + "step": 8100 + }, + { + "epoch": 0.78, + "grad_norm": 0.33867360910508465, + "learning_rate": 0.00014390787743438907, + "loss": 1.0265, + "step": 8101 + }, + { + "epoch": 0.78, + "grad_norm": 0.27479933367603354, + "learning_rate": 0.0001438936638862867, + "loss": 0.9831, + "step": 8102 + }, + { + "epoch": 0.78, + "grad_norm": 0.3270439622691649, + "learning_rate": 0.00014387944923971782, + "loss": 1.1035, + "step": 8103 + }, + { + "epoch": 0.78, + "grad_norm": 0.29562739459578125, + "learning_rate": 0.0001438652334950381, + "loss": 1.0352, + "step": 8104 + }, + { + "epoch": 0.78, + "grad_norm": 0.2771019668036566, + "learning_rate": 0.00014385101665260338, + "loss": 1.0598, + "step": 8105 + }, + { + "epoch": 0.78, + "grad_norm": 0.26504488308797247, + "learning_rate": 0.0001438367987127694, + "loss": 1.1604, + "step": 8106 + }, + { + "epoch": 0.78, + "grad_norm": 0.3218630057212102, + "learning_rate": 0.000143822579675892, + "loss": 1.0142, + "step": 8107 + }, + { + "epoch": 0.78, + "grad_norm": 0.28024189474691924, + "learning_rate": 0.00014380835954232697, + "loss": 1.0721, + "step": 8108 + }, + { + "epoch": 0.78, + "grad_norm": 0.2852976599263691, + "learning_rate": 0.00014379413831243026, + "loss": 0.9966, + "step": 8109 + }, + { + "epoch": 0.78, + "grad_norm": 0.2904612714774512, + "learning_rate": 0.00014377991598655765, + "loss": 1.0675, + "step": 8110 + }, + { + "epoch": 0.78, + "grad_norm": 0.26588060857502965, + "learning_rate": 0.00014376569256506516, + "loss": 1.0215, + "step": 8111 + }, + { + "epoch": 0.78, + "grad_norm": 0.28963536427741476, + "learning_rate": 0.0001437514680483087, + "loss": 0.9806, + "step": 8112 + }, + { + "epoch": 0.78, + "grad_norm": 0.2669044448329335, + "learning_rate": 0.00014373724243664423, + "loss": 1.1047, + "step": 8113 + }, + { + "epoch": 0.78, + "grad_norm": 0.2904302678513009, + "learning_rate": 0.00014372301573042782, + "loss": 1.0147, + "step": 8114 + }, + { + "epoch": 0.78, + "grad_norm": 0.25491810409780064, + "learning_rate": 0.00014370878793001546, + "loss": 1.0453, + "step": 8115 + }, + { + "epoch": 0.78, + "grad_norm": 0.2592085806502541, + "learning_rate": 0.0001436945590357632, + "loss": 1.0276, + "step": 8116 + }, + { + "epoch": 0.78, + "grad_norm": 0.2821215453803441, + "learning_rate": 0.00014368032904802714, + "loss": 1.0488, + "step": 8117 + }, + { + "epoch": 0.78, + "grad_norm": 0.2615363952942239, + "learning_rate": 0.00014366609796716338, + "loss": 1.1121, + "step": 8118 + }, + { + "epoch": 0.78, + "grad_norm": 0.31736889976384736, + "learning_rate": 0.0001436518657935281, + "loss": 1.1233, + "step": 8119 + }, + { + "epoch": 0.78, + "grad_norm": 0.2598336195194818, + "learning_rate": 0.00014363763252747745, + "loss": 1.0805, + "step": 8120 + }, + { + "epoch": 0.78, + "grad_norm": 0.3284573374828336, + "learning_rate": 0.0001436233981693676, + "loss": 1.0575, + "step": 8121 + }, + { + "epoch": 0.78, + "grad_norm": 0.2908417184290244, + "learning_rate": 0.00014360916271955482, + "loss": 1.0383, + "step": 8122 + }, + { + "epoch": 0.78, + "grad_norm": 0.28781407930130587, + "learning_rate": 0.0001435949261783953, + "loss": 0.9912, + "step": 8123 + }, + { + "epoch": 0.78, + "grad_norm": 0.2929647743747517, + "learning_rate": 0.0001435806885462454, + "loss": 1.0553, + "step": 8124 + }, + { + "epoch": 0.78, + "grad_norm": 0.2753194722230229, + "learning_rate": 0.00014356644982346133, + "loss": 0.9769, + "step": 8125 + }, + { + "epoch": 0.78, + "grad_norm": 0.3013577274487869, + "learning_rate": 0.0001435522100103995, + "loss": 1.0244, + "step": 8126 + }, + { + "epoch": 0.78, + "grad_norm": 0.2901508132919651, + "learning_rate": 0.00014353796910741623, + "loss": 1.011, + "step": 8127 + }, + { + "epoch": 0.78, + "grad_norm": 0.29254447956246066, + "learning_rate": 0.0001435237271148679, + "loss": 1.0236, + "step": 8128 + }, + { + "epoch": 0.78, + "grad_norm": 0.3042957195012631, + "learning_rate": 0.000143509484033111, + "loss": 1.0665, + "step": 8129 + }, + { + "epoch": 0.78, + "grad_norm": 0.29428251771956543, + "learning_rate": 0.0001434952398625019, + "loss": 1.065, + "step": 8130 + }, + { + "epoch": 0.78, + "grad_norm": 0.2734033579364972, + "learning_rate": 0.00014348099460339707, + "loss": 1.001, + "step": 8131 + }, + { + "epoch": 0.78, + "grad_norm": 0.3303462730760466, + "learning_rate": 0.00014346674825615303, + "loss": 1.0916, + "step": 8132 + }, + { + "epoch": 0.78, + "grad_norm": 0.27668942643869243, + "learning_rate": 0.0001434525008211263, + "loss": 1.0658, + "step": 8133 + }, + { + "epoch": 0.78, + "grad_norm": 0.2953192209869138, + "learning_rate": 0.00014343825229867343, + "loss": 1.0055, + "step": 8134 + }, + { + "epoch": 0.78, + "grad_norm": 0.30590123544431624, + "learning_rate": 0.00014342400268915097, + "loss": 1.1254, + "step": 8135 + }, + { + "epoch": 0.78, + "grad_norm": 0.289254633390817, + "learning_rate": 0.00014340975199291558, + "loss": 0.8961, + "step": 8136 + }, + { + "epoch": 0.78, + "grad_norm": 0.2554256044992401, + "learning_rate": 0.00014339550021032384, + "loss": 1.0181, + "step": 8137 + }, + { + "epoch": 0.78, + "grad_norm": 0.3141083587137128, + "learning_rate": 0.00014338124734173245, + "loss": 1.044, + "step": 8138 + }, + { + "epoch": 0.78, + "grad_norm": 0.2569367079798127, + "learning_rate": 0.0001433669933874981, + "loss": 1.1267, + "step": 8139 + }, + { + "epoch": 0.78, + "grad_norm": 0.2678458091756298, + "learning_rate": 0.00014335273834797745, + "loss": 1.1156, + "step": 8140 + }, + { + "epoch": 0.78, + "grad_norm": 0.29169647370051666, + "learning_rate": 0.0001433384822235273, + "loss": 1.0323, + "step": 8141 + }, + { + "epoch": 0.78, + "grad_norm": 0.2742299999761089, + "learning_rate": 0.0001433242250145044, + "loss": 1.0005, + "step": 8142 + }, + { + "epoch": 0.78, + "grad_norm": 0.2685403344557798, + "learning_rate": 0.00014330996672126553, + "loss": 0.9437, + "step": 8143 + }, + { + "epoch": 0.78, + "grad_norm": 0.29515305992400265, + "learning_rate": 0.0001432957073441675, + "loss": 1.1712, + "step": 8144 + }, + { + "epoch": 0.78, + "grad_norm": 0.2932051822985775, + "learning_rate": 0.00014328144688356722, + "loss": 1.083, + "step": 8145 + }, + { + "epoch": 0.78, + "grad_norm": 0.28221973458585115, + "learning_rate": 0.00014326718533982154, + "loss": 1.0657, + "step": 8146 + }, + { + "epoch": 0.78, + "grad_norm": 0.2528985840125888, + "learning_rate": 0.00014325292271328733, + "loss": 1.1343, + "step": 8147 + }, + { + "epoch": 0.78, + "grad_norm": 0.3099604595171793, + "learning_rate": 0.00014323865900432153, + "loss": 1.1354, + "step": 8148 + }, + { + "epoch": 0.78, + "grad_norm": 0.28724423689538947, + "learning_rate": 0.00014322439421328114, + "loss": 1.0258, + "step": 8149 + }, + { + "epoch": 0.78, + "grad_norm": 0.29908396514945557, + "learning_rate": 0.0001432101283405231, + "loss": 1.0992, + "step": 8150 + }, + { + "epoch": 0.78, + "grad_norm": 0.25176928414336436, + "learning_rate": 0.00014319586138640447, + "loss": 1.127, + "step": 8151 + }, + { + "epoch": 0.78, + "grad_norm": 0.27010420300606885, + "learning_rate": 0.00014318159335128226, + "loss": 1.112, + "step": 8152 + }, + { + "epoch": 0.78, + "grad_norm": 0.27285928105597573, + "learning_rate": 0.0001431673242355135, + "loss": 1.0926, + "step": 8153 + }, + { + "epoch": 0.78, + "grad_norm": 0.2606896768983877, + "learning_rate": 0.00014315305403945534, + "loss": 1.0197, + "step": 8154 + }, + { + "epoch": 0.78, + "grad_norm": 0.2828962978680743, + "learning_rate": 0.0001431387827634649, + "loss": 1.1644, + "step": 8155 + }, + { + "epoch": 0.78, + "grad_norm": 0.30392229600605375, + "learning_rate": 0.00014312451040789928, + "loss": 1.0046, + "step": 8156 + }, + { + "epoch": 0.78, + "grad_norm": 0.22992598628886599, + "learning_rate": 0.0001431102369731157, + "loss": 1.1334, + "step": 8157 + }, + { + "epoch": 0.78, + "grad_norm": 0.27602772065447234, + "learning_rate": 0.00014309596245947134, + "loss": 1.1262, + "step": 8158 + }, + { + "epoch": 0.78, + "grad_norm": 0.25737443548054917, + "learning_rate": 0.00014308168686732344, + "loss": 0.9536, + "step": 8159 + }, + { + "epoch": 0.78, + "grad_norm": 0.2674977275846256, + "learning_rate": 0.00014306741019702926, + "loss": 1.1088, + "step": 8160 + }, + { + "epoch": 0.78, + "grad_norm": 0.26488508791506327, + "learning_rate": 0.00014305313244894604, + "loss": 1.0541, + "step": 8161 + }, + { + "epoch": 0.78, + "grad_norm": 0.2689193524609303, + "learning_rate": 0.00014303885362343115, + "loss": 1.1577, + "step": 8162 + }, + { + "epoch": 0.78, + "grad_norm": 0.2879200289332181, + "learning_rate": 0.00014302457372084192, + "loss": 1.1757, + "step": 8163 + }, + { + "epoch": 0.78, + "grad_norm": 0.2867128935586349, + "learning_rate": 0.00014301029274153563, + "loss": 1.0072, + "step": 8164 + }, + { + "epoch": 0.78, + "grad_norm": 0.24341160678709342, + "learning_rate": 0.00014299601068586978, + "loss": 1.0225, + "step": 8165 + }, + { + "epoch": 0.78, + "grad_norm": 0.2762861135172704, + "learning_rate": 0.00014298172755420173, + "loss": 1.1304, + "step": 8166 + }, + { + "epoch": 0.78, + "grad_norm": 0.2769763284115676, + "learning_rate": 0.00014296744334688893, + "loss": 1.0157, + "step": 8167 + }, + { + "epoch": 0.78, + "grad_norm": 0.26734305962867105, + "learning_rate": 0.0001429531580642889, + "loss": 1.1552, + "step": 8168 + }, + { + "epoch": 0.78, + "grad_norm": 0.26480907501831297, + "learning_rate": 0.00014293887170675903, + "loss": 0.9893, + "step": 8169 + }, + { + "epoch": 0.78, + "grad_norm": 0.29281043302440196, + "learning_rate": 0.00014292458427465695, + "loss": 0.9863, + "step": 8170 + }, + { + "epoch": 0.78, + "grad_norm": 0.2888301828367874, + "learning_rate": 0.00014291029576834013, + "loss": 1.1172, + "step": 8171 + }, + { + "epoch": 0.78, + "grad_norm": 0.3039150196032684, + "learning_rate": 0.00014289600618816627, + "loss": 1.0797, + "step": 8172 + }, + { + "epoch": 0.78, + "grad_norm": 0.2841118966636821, + "learning_rate": 0.0001428817155344928, + "loss": 1.1855, + "step": 8173 + }, + { + "epoch": 0.78, + "grad_norm": 0.28079473707368974, + "learning_rate": 0.0001428674238076775, + "loss": 1.0272, + "step": 8174 + }, + { + "epoch": 0.78, + "grad_norm": 0.2862456351557848, + "learning_rate": 0.00014285313100807797, + "loss": 1.0838, + "step": 8175 + }, + { + "epoch": 0.78, + "grad_norm": 0.2602397115203145, + "learning_rate": 0.00014283883713605192, + "loss": 1.083, + "step": 8176 + }, + { + "epoch": 0.78, + "grad_norm": 0.30104111034073594, + "learning_rate": 0.00014282454219195702, + "loss": 1.0324, + "step": 8177 + }, + { + "epoch": 0.78, + "grad_norm": 0.24143983810247652, + "learning_rate": 0.00014281024617615105, + "loss": 1.0873, + "step": 8178 + }, + { + "epoch": 0.78, + "grad_norm": 0.2932603381628741, + "learning_rate": 0.00014279594908899175, + "loss": 1.017, + "step": 8179 + }, + { + "epoch": 0.78, + "grad_norm": 0.3006594349278346, + "learning_rate": 0.00014278165093083696, + "loss": 1.0753, + "step": 8180 + }, + { + "epoch": 0.78, + "grad_norm": 0.28022927773599904, + "learning_rate": 0.00014276735170204444, + "loss": 1.0561, + "step": 8181 + }, + { + "epoch": 0.78, + "grad_norm": 0.27844733340976957, + "learning_rate": 0.0001427530514029721, + "loss": 1.1647, + "step": 8182 + }, + { + "epoch": 0.78, + "grad_norm": 0.2571022498864135, + "learning_rate": 0.00014273875003397774, + "loss": 0.92, + "step": 8183 + }, + { + "epoch": 0.78, + "grad_norm": 0.2692587322659136, + "learning_rate": 0.0001427244475954193, + "loss": 1.0409, + "step": 8184 + }, + { + "epoch": 0.78, + "grad_norm": 0.2868029667790984, + "learning_rate": 0.00014271014408765472, + "loss": 0.9207, + "step": 8185 + }, + { + "epoch": 0.78, + "grad_norm": 0.3122721251692393, + "learning_rate": 0.00014269583951104196, + "loss": 1.0165, + "step": 8186 + }, + { + "epoch": 0.78, + "grad_norm": 0.240316991720147, + "learning_rate": 0.00014268153386593898, + "loss": 1.1213, + "step": 8187 + }, + { + "epoch": 0.78, + "grad_norm": 0.283970934006317, + "learning_rate": 0.00014266722715270376, + "loss": 1.1421, + "step": 8188 + }, + { + "epoch": 0.78, + "grad_norm": 0.2848531576221525, + "learning_rate": 0.0001426529193716944, + "loss": 1.0823, + "step": 8189 + }, + { + "epoch": 0.78, + "grad_norm": 0.2683429644781905, + "learning_rate": 0.0001426386105232689, + "loss": 1.0785, + "step": 8190 + }, + { + "epoch": 0.78, + "grad_norm": 0.2862688618200794, + "learning_rate": 0.00014262430060778538, + "loss": 1.0695, + "step": 8191 + }, + { + "epoch": 0.78, + "grad_norm": 0.26946047893455116, + "learning_rate": 0.00014260998962560195, + "loss": 0.9804, + "step": 8192 + }, + { + "epoch": 0.78, + "grad_norm": 0.3044969174847609, + "learning_rate": 0.00014259567757707675, + "loss": 1.0052, + "step": 8193 + }, + { + "epoch": 0.78, + "grad_norm": 0.27698543650468876, + "learning_rate": 0.00014258136446256795, + "loss": 0.9854, + "step": 8194 + }, + { + "epoch": 0.78, + "grad_norm": 0.2971561209977439, + "learning_rate": 0.00014256705028243375, + "loss": 1.0358, + "step": 8195 + }, + { + "epoch": 0.78, + "grad_norm": 0.2770540731397757, + "learning_rate": 0.00014255273503703238, + "loss": 1.0042, + "step": 8196 + }, + { + "epoch": 0.78, + "grad_norm": 0.28561457756539305, + "learning_rate": 0.00014253841872672202, + "loss": 1.1068, + "step": 8197 + }, + { + "epoch": 0.78, + "grad_norm": 0.2813535778333693, + "learning_rate": 0.00014252410135186103, + "loss": 1.1568, + "step": 8198 + }, + { + "epoch": 0.78, + "grad_norm": 0.3051058643154024, + "learning_rate": 0.00014250978291280766, + "loss": 1.0966, + "step": 8199 + }, + { + "epoch": 0.78, + "grad_norm": 0.2995687199626439, + "learning_rate": 0.00014249546340992027, + "loss": 0.9566, + "step": 8200 + }, + { + "epoch": 0.78, + "grad_norm": 0.25038359737911414, + "learning_rate": 0.0001424811428435572, + "loss": 0.9642, + "step": 8201 + }, + { + "epoch": 0.78, + "grad_norm": 0.2755441297303153, + "learning_rate": 0.00014246682121407686, + "loss": 0.9964, + "step": 8202 + }, + { + "epoch": 0.78, + "grad_norm": 0.28771744037000346, + "learning_rate": 0.0001424524985218376, + "loss": 1.1178, + "step": 8203 + }, + { + "epoch": 0.78, + "grad_norm": 0.26096812303528166, + "learning_rate": 0.00014243817476719789, + "loss": 1.0602, + "step": 8204 + }, + { + "epoch": 0.78, + "grad_norm": 0.2910822777089926, + "learning_rate": 0.00014242384995051617, + "loss": 1.1028, + "step": 8205 + }, + { + "epoch": 0.79, + "grad_norm": 0.2946214010693757, + "learning_rate": 0.000142409524072151, + "loss": 1.0582, + "step": 8206 + }, + { + "epoch": 0.79, + "grad_norm": 0.2782362063554336, + "learning_rate": 0.00014239519713246077, + "loss": 1.0472, + "step": 8207 + }, + { + "epoch": 0.79, + "grad_norm": 0.3096824457906856, + "learning_rate": 0.00014238086913180407, + "loss": 1.1764, + "step": 8208 + }, + { + "epoch": 0.79, + "grad_norm": 0.2705085127084864, + "learning_rate": 0.00014236654007053956, + "loss": 1.0761, + "step": 8209 + }, + { + "epoch": 0.79, + "grad_norm": 0.2746196439259208, + "learning_rate": 0.00014235220994902572, + "loss": 1.0842, + "step": 8210 + }, + { + "epoch": 0.79, + "grad_norm": 0.2818643359206753, + "learning_rate": 0.0001423378787676212, + "loss": 1.0669, + "step": 8211 + }, + { + "epoch": 0.79, + "grad_norm": 0.3016521365181387, + "learning_rate": 0.0001423235465266847, + "loss": 1.1674, + "step": 8212 + }, + { + "epoch": 0.79, + "grad_norm": 0.3097236944936014, + "learning_rate": 0.0001423092132265748, + "loss": 1.019, + "step": 8213 + }, + { + "epoch": 0.79, + "grad_norm": 0.25229115162540394, + "learning_rate": 0.00014229487886765026, + "loss": 1.1445, + "step": 8214 + }, + { + "epoch": 0.79, + "grad_norm": 0.27446316421143896, + "learning_rate": 0.0001422805434502698, + "loss": 1.1201, + "step": 8215 + }, + { + "epoch": 0.79, + "grad_norm": 0.265891783959781, + "learning_rate": 0.00014226620697479217, + "loss": 0.9653, + "step": 8216 + }, + { + "epoch": 0.79, + "grad_norm": 0.2858901663372903, + "learning_rate": 0.00014225186944157614, + "loss": 1.1012, + "step": 8217 + }, + { + "epoch": 0.79, + "grad_norm": 0.24368459446009416, + "learning_rate": 0.00014223753085098052, + "loss": 1.0438, + "step": 8218 + }, + { + "epoch": 0.79, + "grad_norm": 0.2658297205018217, + "learning_rate": 0.00014222319120336415, + "loss": 1.0923, + "step": 8219 + }, + { + "epoch": 0.79, + "grad_norm": 0.27643416390290804, + "learning_rate": 0.0001422088504990859, + "loss": 1.0201, + "step": 8220 + }, + { + "epoch": 0.79, + "grad_norm": 0.27516078932172305, + "learning_rate": 0.00014219450873850464, + "loss": 0.9544, + "step": 8221 + }, + { + "epoch": 0.79, + "grad_norm": 0.27601315150275746, + "learning_rate": 0.00014218016592197925, + "loss": 1.0343, + "step": 8222 + }, + { + "epoch": 0.79, + "grad_norm": 0.24192057612707965, + "learning_rate": 0.00014216582204986872, + "loss": 1.0806, + "step": 8223 + }, + { + "epoch": 0.79, + "grad_norm": 0.28077409693408506, + "learning_rate": 0.000142151477122532, + "loss": 0.8763, + "step": 8224 + }, + { + "epoch": 0.79, + "grad_norm": 0.27896630435292175, + "learning_rate": 0.00014213713114032803, + "loss": 1.0493, + "step": 8225 + }, + { + "epoch": 0.79, + "grad_norm": 0.2737335624375359, + "learning_rate": 0.0001421227841036159, + "loss": 1.0675, + "step": 8226 + }, + { + "epoch": 0.79, + "grad_norm": 0.3407111785009769, + "learning_rate": 0.00014210843601275466, + "loss": 1.0628, + "step": 8227 + }, + { + "epoch": 0.79, + "grad_norm": 0.2617914284389135, + "learning_rate": 0.00014209408686810329, + "loss": 1.0987, + "step": 8228 + }, + { + "epoch": 0.79, + "grad_norm": 0.3013100303523883, + "learning_rate": 0.00014207973667002097, + "loss": 0.9312, + "step": 8229 + }, + { + "epoch": 0.79, + "grad_norm": 0.2846920379517571, + "learning_rate": 0.00014206538541886677, + "loss": 1.0237, + "step": 8230 + }, + { + "epoch": 0.79, + "grad_norm": 0.30283490816190023, + "learning_rate": 0.0001420510331149999, + "loss": 1.0535, + "step": 8231 + }, + { + "epoch": 0.79, + "grad_norm": 0.24872498659564674, + "learning_rate": 0.00014203667975877946, + "loss": 1.057, + "step": 8232 + }, + { + "epoch": 0.79, + "grad_norm": 0.27630382799540015, + "learning_rate": 0.00014202232535056472, + "loss": 1.0121, + "step": 8233 + }, + { + "epoch": 0.79, + "grad_norm": 0.28477175943154037, + "learning_rate": 0.00014200796989071487, + "loss": 1.045, + "step": 8234 + }, + { + "epoch": 0.79, + "grad_norm": 0.2719512123825706, + "learning_rate": 0.00014199361337958915, + "loss": 1.075, + "step": 8235 + }, + { + "epoch": 0.79, + "grad_norm": 0.27304851623487275, + "learning_rate": 0.0001419792558175469, + "loss": 0.9973, + "step": 8236 + }, + { + "epoch": 0.79, + "grad_norm": 0.2749777621769799, + "learning_rate": 0.0001419648972049474, + "loss": 1.0225, + "step": 8237 + }, + { + "epoch": 0.79, + "grad_norm": 0.26176184932542573, + "learning_rate": 0.0001419505375421499, + "loss": 1.061, + "step": 8238 + }, + { + "epoch": 0.79, + "grad_norm": 0.2825657955231182, + "learning_rate": 0.0001419361768295139, + "loss": 1.039, + "step": 8239 + }, + { + "epoch": 0.79, + "grad_norm": 0.26238648793597996, + "learning_rate": 0.00014192181506739868, + "loss": 0.9806, + "step": 8240 + }, + { + "epoch": 0.79, + "grad_norm": 0.2596777187775208, + "learning_rate": 0.0001419074522561637, + "loss": 1.0234, + "step": 8241 + }, + { + "epoch": 0.79, + "grad_norm": 0.2883448002693972, + "learning_rate": 0.0001418930883961684, + "loss": 1.113, + "step": 8242 + }, + { + "epoch": 0.79, + "grad_norm": 0.31700421775940696, + "learning_rate": 0.00014187872348777223, + "loss": 0.9246, + "step": 8243 + }, + { + "epoch": 0.79, + "grad_norm": 0.2502687124845023, + "learning_rate": 0.00014186435753133468, + "loss": 0.9632, + "step": 8244 + }, + { + "epoch": 0.79, + "grad_norm": 0.2721584066704403, + "learning_rate": 0.00014184999052721528, + "loss": 1.0489, + "step": 8245 + }, + { + "epoch": 0.79, + "grad_norm": 0.2672093711111685, + "learning_rate": 0.00014183562247577358, + "loss": 1.029, + "step": 8246 + }, + { + "epoch": 0.79, + "grad_norm": 0.31595242015080555, + "learning_rate": 0.00014182125337736912, + "loss": 1.0484, + "step": 8247 + }, + { + "epoch": 0.79, + "grad_norm": 0.217568494553999, + "learning_rate": 0.0001418068832323615, + "loss": 0.8719, + "step": 8248 + }, + { + "epoch": 0.79, + "grad_norm": 0.27618501018822034, + "learning_rate": 0.00014179251204111037, + "loss": 0.9553, + "step": 8249 + }, + { + "epoch": 0.79, + "grad_norm": 0.28614850226175115, + "learning_rate": 0.00014177813980397535, + "loss": 1.0244, + "step": 8250 + }, + { + "epoch": 0.79, + "grad_norm": 0.2804392960583254, + "learning_rate": 0.00014176376652131614, + "loss": 1.117, + "step": 8251 + }, + { + "epoch": 0.79, + "grad_norm": 0.296161246928998, + "learning_rate": 0.0001417493921934924, + "loss": 0.9899, + "step": 8252 + }, + { + "epoch": 0.79, + "grad_norm": 0.27500753507752407, + "learning_rate": 0.00014173501682086389, + "loss": 1.0332, + "step": 8253 + }, + { + "epoch": 0.79, + "grad_norm": 0.27240297524975027, + "learning_rate": 0.00014172064040379037, + "loss": 1.1169, + "step": 8254 + }, + { + "epoch": 0.79, + "grad_norm": 0.2862065599891984, + "learning_rate": 0.00014170626294263158, + "loss": 0.9608, + "step": 8255 + }, + { + "epoch": 0.79, + "grad_norm": 0.29049647009040475, + "learning_rate": 0.00014169188443774737, + "loss": 1.0737, + "step": 8256 + }, + { + "epoch": 0.79, + "grad_norm": 0.30650214779789686, + "learning_rate": 0.00014167750488949753, + "loss": 1.1447, + "step": 8257 + }, + { + "epoch": 0.79, + "grad_norm": 0.25682892320136824, + "learning_rate": 0.00014166312429824196, + "loss": 1.1008, + "step": 8258 + }, + { + "epoch": 0.79, + "grad_norm": 0.289522043516876, + "learning_rate": 0.0001416487426643405, + "loss": 0.9996, + "step": 8259 + }, + { + "epoch": 0.79, + "grad_norm": 0.2612733947909529, + "learning_rate": 0.00014163435998815308, + "loss": 1.0454, + "step": 8260 + }, + { + "epoch": 0.79, + "grad_norm": 0.3017746734519574, + "learning_rate": 0.00014161997627003964, + "loss": 1.1057, + "step": 8261 + }, + { + "epoch": 0.79, + "grad_norm": 0.3026134541065002, + "learning_rate": 0.0001416055915103601, + "loss": 1.0563, + "step": 8262 + }, + { + "epoch": 0.79, + "grad_norm": 0.2921549638360822, + "learning_rate": 0.00014159120570947454, + "loss": 1.2412, + "step": 8263 + }, + { + "epoch": 0.79, + "grad_norm": 0.27462991805196874, + "learning_rate": 0.00014157681886774293, + "loss": 1.1579, + "step": 8264 + }, + { + "epoch": 0.79, + "grad_norm": 0.30353840216135913, + "learning_rate": 0.0001415624309855253, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 0.79, + "grad_norm": 0.2628785239134127, + "learning_rate": 0.00014154804206318165, + "loss": 1.0783, + "step": 8266 + }, + { + "epoch": 0.79, + "grad_norm": 0.279824355560124, + "learning_rate": 0.00014153365210107217, + "loss": 0.992, + "step": 8267 + }, + { + "epoch": 0.79, + "grad_norm": 0.25845741171089465, + "learning_rate": 0.00014151926109955696, + "loss": 0.9018, + "step": 8268 + }, + { + "epoch": 0.79, + "grad_norm": 0.2949235009130459, + "learning_rate": 0.0001415048690589961, + "loss": 1.1514, + "step": 8269 + }, + { + "epoch": 0.79, + "grad_norm": 0.3091426382260449, + "learning_rate": 0.00014149047597974984, + "loss": 1.128, + "step": 8270 + }, + { + "epoch": 0.79, + "grad_norm": 0.261254800590016, + "learning_rate": 0.00014147608186217836, + "loss": 1.0248, + "step": 8271 + }, + { + "epoch": 0.79, + "grad_norm": 0.28155062611196985, + "learning_rate": 0.0001414616867066418, + "loss": 1.1088, + "step": 8272 + }, + { + "epoch": 0.79, + "grad_norm": 0.2802398085480191, + "learning_rate": 0.00014144729051350055, + "loss": 1.0411, + "step": 8273 + }, + { + "epoch": 0.79, + "grad_norm": 0.28950599861246645, + "learning_rate": 0.00014143289328311478, + "loss": 1.1909, + "step": 8274 + }, + { + "epoch": 0.79, + "grad_norm": 0.3156838267805892, + "learning_rate": 0.0001414184950158448, + "loss": 1.0001, + "step": 8275 + }, + { + "epoch": 0.79, + "grad_norm": 0.2761437699110209, + "learning_rate": 0.00014140409571205095, + "loss": 1.0747, + "step": 8276 + }, + { + "epoch": 0.79, + "grad_norm": 0.2556738607507545, + "learning_rate": 0.00014138969537209358, + "loss": 1.1727, + "step": 8277 + }, + { + "epoch": 0.79, + "grad_norm": 0.3116455068455913, + "learning_rate": 0.0001413752939963331, + "loss": 1.0714, + "step": 8278 + }, + { + "epoch": 0.79, + "grad_norm": 0.3023777310208979, + "learning_rate": 0.00014136089158512985, + "loss": 1.1541, + "step": 8279 + }, + { + "epoch": 0.79, + "grad_norm": 0.29881211105271765, + "learning_rate": 0.00014134648813884433, + "loss": 1.1441, + "step": 8280 + }, + { + "epoch": 0.79, + "grad_norm": 0.2564362293165476, + "learning_rate": 0.00014133208365783693, + "loss": 1.2231, + "step": 8281 + }, + { + "epoch": 0.79, + "grad_norm": 0.25459170299471473, + "learning_rate": 0.00014131767814246817, + "loss": 1.0638, + "step": 8282 + }, + { + "epoch": 0.79, + "grad_norm": 0.3164293509155231, + "learning_rate": 0.00014130327159309853, + "loss": 1.0238, + "step": 8283 + }, + { + "epoch": 0.79, + "grad_norm": 0.3161096823663635, + "learning_rate": 0.0001412888640100886, + "loss": 1.0997, + "step": 8284 + }, + { + "epoch": 0.79, + "grad_norm": 0.2926302601160675, + "learning_rate": 0.00014127445539379886, + "loss": 1.0269, + "step": 8285 + }, + { + "epoch": 0.79, + "grad_norm": 0.3026119265810598, + "learning_rate": 0.00014126004574458996, + "loss": 1.0535, + "step": 8286 + }, + { + "epoch": 0.79, + "grad_norm": 0.31239765394033814, + "learning_rate": 0.00014124563506282247, + "loss": 1.1042, + "step": 8287 + }, + { + "epoch": 0.79, + "grad_norm": 0.2730026191662197, + "learning_rate": 0.00014123122334885706, + "loss": 1.0406, + "step": 8288 + }, + { + "epoch": 0.79, + "grad_norm": 0.2802587764341756, + "learning_rate": 0.00014121681060305435, + "loss": 1.0393, + "step": 8289 + }, + { + "epoch": 0.79, + "grad_norm": 0.2965134782106928, + "learning_rate": 0.00014120239682577506, + "loss": 1.1487, + "step": 8290 + }, + { + "epoch": 0.79, + "grad_norm": 0.3604661013234594, + "learning_rate": 0.0001411879820173799, + "loss": 1.0004, + "step": 8291 + }, + { + "epoch": 0.79, + "grad_norm": 0.2754245502665909, + "learning_rate": 0.0001411735661782296, + "loss": 0.9554, + "step": 8292 + }, + { + "epoch": 0.79, + "grad_norm": 0.24638138967620207, + "learning_rate": 0.00014115914930868493, + "loss": 1.1353, + "step": 8293 + }, + { + "epoch": 0.79, + "grad_norm": 0.30600504343829077, + "learning_rate": 0.00014114473140910668, + "loss": 1.1201, + "step": 8294 + }, + { + "epoch": 0.79, + "grad_norm": 0.3035845944676315, + "learning_rate": 0.00014113031247985566, + "loss": 1.0861, + "step": 8295 + }, + { + "epoch": 0.79, + "grad_norm": 0.3384934195188291, + "learning_rate": 0.00014111589252129272, + "loss": 1.1501, + "step": 8296 + }, + { + "epoch": 0.79, + "grad_norm": 0.2716845796991304, + "learning_rate": 0.00014110147153377874, + "loss": 1.0534, + "step": 8297 + }, + { + "epoch": 0.79, + "grad_norm": 0.3062067246424864, + "learning_rate": 0.0001410870495176746, + "loss": 1.1802, + "step": 8298 + }, + { + "epoch": 0.79, + "grad_norm": 0.2673331008196946, + "learning_rate": 0.0001410726264733412, + "loss": 1.0404, + "step": 8299 + }, + { + "epoch": 0.79, + "grad_norm": 0.2860719333532579, + "learning_rate": 0.00014105820240113955, + "loss": 1.1296, + "step": 8300 + }, + { + "epoch": 0.79, + "grad_norm": 0.2924049271387296, + "learning_rate": 0.0001410437773014306, + "loss": 1.0039, + "step": 8301 + }, + { + "epoch": 0.79, + "grad_norm": 0.32493379544572254, + "learning_rate": 0.00014102935117457524, + "loss": 0.9896, + "step": 8302 + }, + { + "epoch": 0.79, + "grad_norm": 0.308689353469358, + "learning_rate": 0.00014101492402093463, + "loss": 0.9233, + "step": 8303 + }, + { + "epoch": 0.79, + "grad_norm": 0.2713635705097109, + "learning_rate": 0.00014100049584086979, + "loss": 1.0322, + "step": 8304 + }, + { + "epoch": 0.79, + "grad_norm": 0.2853711362692117, + "learning_rate": 0.00014098606663474176, + "loss": 1.0401, + "step": 8305 + }, + { + "epoch": 0.79, + "grad_norm": 0.25822823305650644, + "learning_rate": 0.00014097163640291164, + "loss": 1.0283, + "step": 8306 + }, + { + "epoch": 0.79, + "grad_norm": 0.3343555252521715, + "learning_rate": 0.00014095720514574058, + "loss": 1.0707, + "step": 8307 + }, + { + "epoch": 0.79, + "grad_norm": 0.28266257518096666, + "learning_rate": 0.00014094277286358972, + "loss": 1.1636, + "step": 8308 + }, + { + "epoch": 0.79, + "grad_norm": 0.2649055365462347, + "learning_rate": 0.00014092833955682026, + "loss": 1.0662, + "step": 8309 + }, + { + "epoch": 0.8, + "grad_norm": 0.31075468177448395, + "learning_rate": 0.00014091390522579333, + "loss": 1.0101, + "step": 8310 + }, + { + "epoch": 0.8, + "grad_norm": 0.3011177221612655, + "learning_rate": 0.00014089946987087023, + "loss": 1.136, + "step": 8311 + }, + { + "epoch": 0.8, + "grad_norm": 0.29851607856732926, + "learning_rate": 0.00014088503349241223, + "loss": 1.1139, + "step": 8312 + }, + { + "epoch": 0.8, + "grad_norm": 0.29855074002538673, + "learning_rate": 0.00014087059609078052, + "loss": 1.0843, + "step": 8313 + }, + { + "epoch": 0.8, + "grad_norm": 0.2768067900609332, + "learning_rate": 0.0001408561576663365, + "loss": 1.0972, + "step": 8314 + }, + { + "epoch": 0.8, + "grad_norm": 0.2819211049236916, + "learning_rate": 0.00014084171821944144, + "loss": 1.1288, + "step": 8315 + }, + { + "epoch": 0.8, + "grad_norm": 0.28684815717935336, + "learning_rate": 0.00014082727775045667, + "loss": 1.1531, + "step": 8316 + }, + { + "epoch": 0.8, + "grad_norm": 0.29080240408238495, + "learning_rate": 0.00014081283625974367, + "loss": 1.1591, + "step": 8317 + }, + { + "epoch": 0.8, + "grad_norm": 0.29429553590146723, + "learning_rate": 0.0001407983937476638, + "loss": 1.1297, + "step": 8318 + }, + { + "epoch": 0.8, + "grad_norm": 0.30787470171045134, + "learning_rate": 0.00014078395021457845, + "loss": 1.2024, + "step": 8319 + }, + { + "epoch": 0.8, + "grad_norm": 0.27975097914183483, + "learning_rate": 0.0001407695056608491, + "loss": 0.9402, + "step": 8320 + }, + { + "epoch": 0.8, + "grad_norm": 0.2832395257132092, + "learning_rate": 0.0001407550600868373, + "loss": 1.0379, + "step": 8321 + }, + { + "epoch": 0.8, + "grad_norm": 0.29437871608032573, + "learning_rate": 0.00014074061349290447, + "loss": 1.0362, + "step": 8322 + }, + { + "epoch": 0.8, + "grad_norm": 0.269143724602035, + "learning_rate": 0.00014072616587941218, + "loss": 1.0537, + "step": 8323 + }, + { + "epoch": 0.8, + "grad_norm": 0.29710110944603785, + "learning_rate": 0.00014071171724672202, + "loss": 1.0264, + "step": 8324 + }, + { + "epoch": 0.8, + "grad_norm": 0.30415387019725554, + "learning_rate": 0.00014069726759519553, + "loss": 1.231, + "step": 8325 + }, + { + "epoch": 0.8, + "grad_norm": 0.29030887320948506, + "learning_rate": 0.00014068281692519434, + "loss": 1.1106, + "step": 8326 + }, + { + "epoch": 0.8, + "grad_norm": 0.32054128469538806, + "learning_rate": 0.0001406683652370801, + "loss": 1.1057, + "step": 8327 + }, + { + "epoch": 0.8, + "grad_norm": 0.30650933574016137, + "learning_rate": 0.00014065391253121446, + "loss": 1.1166, + "step": 8328 + }, + { + "epoch": 0.8, + "grad_norm": 0.2694980594720075, + "learning_rate": 0.0001406394588079591, + "loss": 0.9278, + "step": 8329 + }, + { + "epoch": 0.8, + "grad_norm": 0.2696507606928678, + "learning_rate": 0.00014062500406767574, + "loss": 1.079, + "step": 8330 + }, + { + "epoch": 0.8, + "grad_norm": 0.2759447283420335, + "learning_rate": 0.00014061054831072614, + "loss": 0.9918, + "step": 8331 + }, + { + "epoch": 0.8, + "grad_norm": 0.2738734236935922, + "learning_rate": 0.00014059609153747204, + "loss": 1.1079, + "step": 8332 + }, + { + "epoch": 0.8, + "grad_norm": 0.28515443322196976, + "learning_rate": 0.00014058163374827521, + "loss": 1.0569, + "step": 8333 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080422030491291, + "learning_rate": 0.0001405671749434975, + "loss": 1.0172, + "step": 8334 + }, + { + "epoch": 0.8, + "grad_norm": 0.6582786045483043, + "learning_rate": 0.00014055271512350079, + "loss": 1.458, + "step": 8335 + }, + { + "epoch": 0.8, + "grad_norm": 0.29640316228647345, + "learning_rate": 0.00014053825428864686, + "loss": 0.9889, + "step": 8336 + }, + { + "epoch": 0.8, + "grad_norm": 0.3224808939799605, + "learning_rate": 0.00014052379243929762, + "loss": 1.143, + "step": 8337 + }, + { + "epoch": 0.8, + "grad_norm": 0.26780388495165136, + "learning_rate": 0.00014050932957581505, + "loss": 1.1065, + "step": 8338 + }, + { + "epoch": 0.8, + "grad_norm": 0.3007238046192213, + "learning_rate": 0.000140494865698561, + "loss": 1.2191, + "step": 8339 + }, + { + "epoch": 0.8, + "grad_norm": 0.3150280632023674, + "learning_rate": 0.00014048040080789752, + "loss": 1.1275, + "step": 8340 + }, + { + "epoch": 0.8, + "grad_norm": 0.25501610087568716, + "learning_rate": 0.00014046593490418656, + "loss": 1.0839, + "step": 8341 + }, + { + "epoch": 0.8, + "grad_norm": 0.3241245476281749, + "learning_rate": 0.00014045146798779014, + "loss": 1.0121, + "step": 8342 + }, + { + "epoch": 0.8, + "grad_norm": 0.28717103288948476, + "learning_rate": 0.00014043700005907033, + "loss": 1.0748, + "step": 8343 + }, + { + "epoch": 0.8, + "grad_norm": 0.2949240771446552, + "learning_rate": 0.00014042253111838917, + "loss": 1.1718, + "step": 8344 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080849599666515, + "learning_rate": 0.00014040806116610873, + "loss": 1.1352, + "step": 8345 + }, + { + "epoch": 0.8, + "grad_norm": 0.2757839282426602, + "learning_rate": 0.0001403935902025912, + "loss": 1.0233, + "step": 8346 + }, + { + "epoch": 0.8, + "grad_norm": 0.2993473446158299, + "learning_rate": 0.00014037911822819868, + "loss": 1.1544, + "step": 8347 + }, + { + "epoch": 0.8, + "grad_norm": 0.292778124826498, + "learning_rate": 0.00014036464524329337, + "loss": 1.1381, + "step": 8348 + }, + { + "epoch": 0.8, + "grad_norm": 0.2958157472586977, + "learning_rate": 0.00014035017124823743, + "loss": 1.0634, + "step": 8349 + }, + { + "epoch": 0.8, + "grad_norm": 0.2973614768918341, + "learning_rate": 0.00014033569624339308, + "loss": 1.0993, + "step": 8350 + }, + { + "epoch": 0.8, + "grad_norm": 0.2877291411238009, + "learning_rate": 0.0001403212202291226, + "loss": 1.1405, + "step": 8351 + }, + { + "epoch": 0.8, + "grad_norm": 0.3161483762081105, + "learning_rate": 0.00014030674320578823, + "loss": 1.0157, + "step": 8352 + }, + { + "epoch": 0.8, + "grad_norm": 0.3097883664079197, + "learning_rate": 0.0001402922651737523, + "loss": 1.1105, + "step": 8353 + }, + { + "epoch": 0.8, + "grad_norm": 0.3052718311315742, + "learning_rate": 0.00014027778613337708, + "loss": 1.1271, + "step": 8354 + }, + { + "epoch": 0.8, + "grad_norm": 0.24254225363550241, + "learning_rate": 0.00014026330608502496, + "loss": 0.9638, + "step": 8355 + }, + { + "epoch": 0.8, + "grad_norm": 0.28453519098553043, + "learning_rate": 0.00014024882502905833, + "loss": 1.095, + "step": 8356 + }, + { + "epoch": 0.8, + "grad_norm": 0.3148338032717982, + "learning_rate": 0.0001402343429658395, + "loss": 1.1137, + "step": 8357 + }, + { + "epoch": 0.8, + "grad_norm": 0.3117328371653278, + "learning_rate": 0.000140219859895731, + "loss": 1.1287, + "step": 8358 + }, + { + "epoch": 0.8, + "grad_norm": 0.26695229050684094, + "learning_rate": 0.00014020537581909524, + "loss": 1.0255, + "step": 8359 + }, + { + "epoch": 0.8, + "grad_norm": 0.28843280197149884, + "learning_rate": 0.00014019089073629464, + "loss": 1.0416, + "step": 8360 + }, + { + "epoch": 0.8, + "grad_norm": 0.3067274794496299, + "learning_rate": 0.00014017640464769176, + "loss": 1.0825, + "step": 8361 + }, + { + "epoch": 0.8, + "grad_norm": 0.2739855692621317, + "learning_rate": 0.00014016191755364908, + "loss": 1.0509, + "step": 8362 + }, + { + "epoch": 0.8, + "grad_norm": 0.2395995459480104, + "learning_rate": 0.0001401474294545292, + "loss": 1.0843, + "step": 8363 + }, + { + "epoch": 0.8, + "grad_norm": 0.2759339406448203, + "learning_rate": 0.00014013294035069467, + "loss": 1.1089, + "step": 8364 + }, + { + "epoch": 0.8, + "grad_norm": 0.3012501447545405, + "learning_rate": 0.00014011845024250805, + "loss": 1.0218, + "step": 8365 + }, + { + "epoch": 0.8, + "grad_norm": 0.2743545412783308, + "learning_rate": 0.00014010395913033202, + "loss": 0.9881, + "step": 8366 + }, + { + "epoch": 0.8, + "grad_norm": 0.22999177193777853, + "learning_rate": 0.00014008946701452921, + "loss": 1.0122, + "step": 8367 + }, + { + "epoch": 0.8, + "grad_norm": 0.3145753551754261, + "learning_rate": 0.00014007497389546228, + "loss": 1.1592, + "step": 8368 + }, + { + "epoch": 0.8, + "grad_norm": 0.29504190170329436, + "learning_rate": 0.000140060479773494, + "loss": 1.0783, + "step": 8369 + }, + { + "epoch": 0.8, + "grad_norm": 0.27650998107466573, + "learning_rate": 0.00014004598464898698, + "loss": 1.1227, + "step": 8370 + }, + { + "epoch": 0.8, + "grad_norm": 0.2594677164964031, + "learning_rate": 0.00014003148852230403, + "loss": 1.0184, + "step": 8371 + }, + { + "epoch": 0.8, + "grad_norm": 0.26498120130511177, + "learning_rate": 0.00014001699139380792, + "loss": 1.016, + "step": 8372 + }, + { + "epoch": 0.8, + "grad_norm": 0.33720870260470887, + "learning_rate": 0.00014000249326386147, + "loss": 0.9954, + "step": 8373 + }, + { + "epoch": 0.8, + "grad_norm": 0.27003696605507693, + "learning_rate": 0.0001399879941328275, + "loss": 1.0899, + "step": 8374 + }, + { + "epoch": 0.8, + "grad_norm": 0.25731301945171275, + "learning_rate": 0.0001399734940010688, + "loss": 1.1751, + "step": 8375 + }, + { + "epoch": 0.8, + "grad_norm": 0.288234443384105, + "learning_rate": 0.0001399589928689483, + "loss": 1.1105, + "step": 8376 + }, + { + "epoch": 0.8, + "grad_norm": 0.3086646943980646, + "learning_rate": 0.0001399444907368289, + "loss": 1.0816, + "step": 8377 + }, + { + "epoch": 0.8, + "grad_norm": 0.2857761154797601, + "learning_rate": 0.00013992998760507352, + "loss": 1.1379, + "step": 8378 + }, + { + "epoch": 0.8, + "grad_norm": 0.2848709741740318, + "learning_rate": 0.00013991548347404512, + "loss": 1.1337, + "step": 8379 + }, + { + "epoch": 0.8, + "grad_norm": 0.26144857764795576, + "learning_rate": 0.00013990097834410664, + "loss": 0.9858, + "step": 8380 + }, + { + "epoch": 0.8, + "grad_norm": 0.28140647243062356, + "learning_rate": 0.0001398864722156211, + "loss": 1.1319, + "step": 8381 + }, + { + "epoch": 0.8, + "grad_norm": 0.2757644481768791, + "learning_rate": 0.00013987196508895153, + "loss": 0.9251, + "step": 8382 + }, + { + "epoch": 0.8, + "grad_norm": 0.29461811148669015, + "learning_rate": 0.00013985745696446097, + "loss": 1.1373, + "step": 8383 + }, + { + "epoch": 0.8, + "grad_norm": 0.3141106450117521, + "learning_rate": 0.0001398429478425125, + "loss": 1.0046, + "step": 8384 + }, + { + "epoch": 0.8, + "grad_norm": 0.2932601643453963, + "learning_rate": 0.00013982843772346922, + "loss": 1.0836, + "step": 8385 + }, + { + "epoch": 0.8, + "grad_norm": 0.3080641745939993, + "learning_rate": 0.00013981392660769424, + "loss": 1.1587, + "step": 8386 + }, + { + "epoch": 0.8, + "grad_norm": 0.3161172120816325, + "learning_rate": 0.00013979941449555075, + "loss": 1.0562, + "step": 8387 + }, + { + "epoch": 0.8, + "grad_norm": 0.2919139460605944, + "learning_rate": 0.00013978490138740187, + "loss": 1.1742, + "step": 8388 + }, + { + "epoch": 0.8, + "grad_norm": 0.27923451724228243, + "learning_rate": 0.00013977038728361086, + "loss": 1.1083, + "step": 8389 + }, + { + "epoch": 0.8, + "grad_norm": 0.31621277440252016, + "learning_rate": 0.0001397558721845409, + "loss": 1.0143, + "step": 8390 + }, + { + "epoch": 0.8, + "grad_norm": 0.3053969306091787, + "learning_rate": 0.00013974135609055527, + "loss": 1.0729, + "step": 8391 + }, + { + "epoch": 0.8, + "grad_norm": 0.2825597454845708, + "learning_rate": 0.00013972683900201723, + "loss": 1.0619, + "step": 8392 + }, + { + "epoch": 0.8, + "grad_norm": 0.29068615722749824, + "learning_rate": 0.00013971232091929006, + "loss": 1.065, + "step": 8393 + }, + { + "epoch": 0.8, + "grad_norm": 0.2853464043964099, + "learning_rate": 0.00013969780184273705, + "loss": 0.9789, + "step": 8394 + }, + { + "epoch": 0.8, + "grad_norm": 0.2849572721200672, + "learning_rate": 0.0001396832817727217, + "loss": 1.0263, + "step": 8395 + }, + { + "epoch": 0.8, + "grad_norm": 0.28211914596567356, + "learning_rate": 0.00013966876070960722, + "loss": 1.1037, + "step": 8396 + }, + { + "epoch": 0.8, + "grad_norm": 0.2995697358994056, + "learning_rate": 0.00013965423865375712, + "loss": 1.1653, + "step": 8397 + }, + { + "epoch": 0.8, + "grad_norm": 0.29293561141447033, + "learning_rate": 0.0001396397156055347, + "loss": 1.094, + "step": 8398 + }, + { + "epoch": 0.8, + "grad_norm": 0.25956145112228035, + "learning_rate": 0.00013962519156530354, + "loss": 1.0856, + "step": 8399 + }, + { + "epoch": 0.8, + "grad_norm": 0.27947255917814445, + "learning_rate": 0.00013961066653342706, + "loss": 1.0245, + "step": 8400 + }, + { + "epoch": 0.8, + "grad_norm": 0.2745380656704247, + "learning_rate": 0.00013959614051026873, + "loss": 1.0483, + "step": 8401 + }, + { + "epoch": 0.8, + "grad_norm": 0.2980202904500265, + "learning_rate": 0.0001395816134961921, + "loss": 1.0607, + "step": 8402 + }, + { + "epoch": 0.8, + "grad_norm": 0.25864322411654844, + "learning_rate": 0.00013956708549156072, + "loss": 0.9817, + "step": 8403 + }, + { + "epoch": 0.8, + "grad_norm": 0.27059445234233453, + "learning_rate": 0.00013955255649673816, + "loss": 1.0721, + "step": 8404 + }, + { + "epoch": 0.8, + "grad_norm": 0.30462552256042574, + "learning_rate": 0.00013953802651208802, + "loss": 1.0382, + "step": 8405 + }, + { + "epoch": 0.8, + "grad_norm": 0.2863563756673883, + "learning_rate": 0.0001395234955379739, + "loss": 1.1493, + "step": 8406 + }, + { + "epoch": 0.8, + "grad_norm": 0.2862298189990451, + "learning_rate": 0.0001395089635747595, + "loss": 1.1067, + "step": 8407 + }, + { + "epoch": 0.8, + "grad_norm": 0.2993560987033485, + "learning_rate": 0.00013949443062280842, + "loss": 1.081, + "step": 8408 + }, + { + "epoch": 0.8, + "grad_norm": 0.2717526863604508, + "learning_rate": 0.00013947989668248442, + "loss": 1.0246, + "step": 8409 + }, + { + "epoch": 0.8, + "grad_norm": 0.2843109420913706, + "learning_rate": 0.00013946536175415118, + "loss": 1.1101, + "step": 8410 + }, + { + "epoch": 0.8, + "grad_norm": 0.31237587843637793, + "learning_rate": 0.00013945082583817245, + "loss": 1.0994, + "step": 8411 + }, + { + "epoch": 0.8, + "grad_norm": 0.2655591048449623, + "learning_rate": 0.00013943628893491202, + "loss": 1.1926, + "step": 8412 + }, + { + "epoch": 0.8, + "grad_norm": 0.2923686496830888, + "learning_rate": 0.0001394217510447337, + "loss": 1.1175, + "step": 8413 + }, + { + "epoch": 0.8, + "grad_norm": 0.26432275264898286, + "learning_rate": 0.00013940721216800127, + "loss": 0.9868, + "step": 8414 + }, + { + "epoch": 0.81, + "grad_norm": 0.2767290301083631, + "learning_rate": 0.00013939267230507856, + "loss": 0.9424, + "step": 8415 + }, + { + "epoch": 0.81, + "grad_norm": 0.2865371880691285, + "learning_rate": 0.0001393781314563295, + "loss": 1.1268, + "step": 8416 + }, + { + "epoch": 0.81, + "grad_norm": 0.27359931565493517, + "learning_rate": 0.00013936358962211794, + "loss": 0.894, + "step": 8417 + }, + { + "epoch": 0.81, + "grad_norm": 0.2682490335376522, + "learning_rate": 0.00013934904680280781, + "loss": 1.0634, + "step": 8418 + }, + { + "epoch": 0.81, + "grad_norm": 0.323666262645315, + "learning_rate": 0.00013933450299876305, + "loss": 1.0572, + "step": 8419 + }, + { + "epoch": 0.81, + "grad_norm": 0.313542678895405, + "learning_rate": 0.00013931995821034766, + "loss": 1.0228, + "step": 8420 + }, + { + "epoch": 0.81, + "grad_norm": 0.2741537767451146, + "learning_rate": 0.00013930541243792555, + "loss": 1.1194, + "step": 8421 + }, + { + "epoch": 0.81, + "grad_norm": 0.2727440924938454, + "learning_rate": 0.00013929086568186083, + "loss": 1.1567, + "step": 8422 + }, + { + "epoch": 0.81, + "grad_norm": 0.27596495736712096, + "learning_rate": 0.0001392763179425175, + "loss": 1.0803, + "step": 8423 + }, + { + "epoch": 0.81, + "grad_norm": 0.2789242022899579, + "learning_rate": 0.00013926176922025963, + "loss": 0.912, + "step": 8424 + }, + { + "epoch": 0.81, + "grad_norm": 0.28533265323613294, + "learning_rate": 0.00013924721951545128, + "loss": 1.1343, + "step": 8425 + }, + { + "epoch": 0.81, + "grad_norm": 0.29008984332342763, + "learning_rate": 0.00013923266882845666, + "loss": 1.0476, + "step": 8426 + }, + { + "epoch": 0.81, + "grad_norm": 0.24250004605487083, + "learning_rate": 0.00013921811715963977, + "loss": 1.062, + "step": 8427 + }, + { + "epoch": 0.81, + "grad_norm": 0.29532956323803095, + "learning_rate": 0.0001392035645093649, + "loss": 1.0561, + "step": 8428 + }, + { + "epoch": 0.81, + "grad_norm": 0.3001458878270363, + "learning_rate": 0.00013918901087799616, + "loss": 1.081, + "step": 8429 + }, + { + "epoch": 0.81, + "grad_norm": 0.3037811128687649, + "learning_rate": 0.0001391744562658978, + "loss": 1.098, + "step": 8430 + }, + { + "epoch": 0.81, + "grad_norm": 0.3304936166647709, + "learning_rate": 0.00013915990067343408, + "loss": 0.9485, + "step": 8431 + }, + { + "epoch": 0.81, + "grad_norm": 0.2831506029753717, + "learning_rate": 0.0001391453441009692, + "loss": 1.0727, + "step": 8432 + }, + { + "epoch": 0.81, + "grad_norm": 0.2528874867012647, + "learning_rate": 0.0001391307865488675, + "loss": 1.046, + "step": 8433 + }, + { + "epoch": 0.81, + "grad_norm": 0.2926620284475519, + "learning_rate": 0.00013911622801749326, + "loss": 1.066, + "step": 8434 + }, + { + "epoch": 0.81, + "grad_norm": 0.28658384259986724, + "learning_rate": 0.00013910166850721086, + "loss": 1.1164, + "step": 8435 + }, + { + "epoch": 0.81, + "grad_norm": 0.2611948977130402, + "learning_rate": 0.0001390871080183846, + "loss": 1.1024, + "step": 8436 + }, + { + "epoch": 0.81, + "grad_norm": 0.29623740606511145, + "learning_rate": 0.0001390725465513789, + "loss": 0.9396, + "step": 8437 + }, + { + "epoch": 0.81, + "grad_norm": 0.2759634164236242, + "learning_rate": 0.00013905798410655817, + "loss": 0.991, + "step": 8438 + }, + { + "epoch": 0.81, + "grad_norm": 0.2559555603571331, + "learning_rate": 0.00013904342068428688, + "loss": 0.9583, + "step": 8439 + }, + { + "epoch": 0.81, + "grad_norm": 0.2834397140569843, + "learning_rate": 0.00013902885628492938, + "loss": 1.1322, + "step": 8440 + }, + { + "epoch": 0.81, + "grad_norm": 0.2959073999001046, + "learning_rate": 0.00013901429090885028, + "loss": 1.0687, + "step": 8441 + }, + { + "epoch": 0.81, + "grad_norm": 0.25399362804027736, + "learning_rate": 0.000138999724556414, + "loss": 1.0374, + "step": 8442 + }, + { + "epoch": 0.81, + "grad_norm": 0.29065321813906453, + "learning_rate": 0.00013898515722798513, + "loss": 1.0508, + "step": 8443 + }, + { + "epoch": 0.81, + "grad_norm": 0.37210670978861016, + "learning_rate": 0.00013897058892392818, + "loss": 0.9855, + "step": 8444 + }, + { + "epoch": 0.81, + "grad_norm": 0.2800545837279327, + "learning_rate": 0.00013895601964460775, + "loss": 1.1169, + "step": 8445 + }, + { + "epoch": 0.81, + "grad_norm": 0.2926130011031746, + "learning_rate": 0.00013894144939038844, + "loss": 1.0492, + "step": 8446 + }, + { + "epoch": 0.81, + "grad_norm": 0.31446837274856276, + "learning_rate": 0.00013892687816163487, + "loss": 1.077, + "step": 8447 + }, + { + "epoch": 0.81, + "grad_norm": 0.2894693179667743, + "learning_rate": 0.00013891230595871175, + "loss": 1.1005, + "step": 8448 + }, + { + "epoch": 0.81, + "grad_norm": 0.2658040957365846, + "learning_rate": 0.0001388977327819837, + "loss": 1.129, + "step": 8449 + }, + { + "epoch": 0.81, + "grad_norm": 0.30570591750106335, + "learning_rate": 0.0001388831586318154, + "loss": 1.1457, + "step": 8450 + }, + { + "epoch": 0.81, + "grad_norm": 0.30282216470657447, + "learning_rate": 0.00013886858350857167, + "loss": 1.0694, + "step": 8451 + }, + { + "epoch": 0.81, + "grad_norm": 0.3179740476883664, + "learning_rate": 0.00013885400741261717, + "loss": 1.1539, + "step": 8452 + }, + { + "epoch": 0.81, + "grad_norm": 0.2714947725647423, + "learning_rate": 0.00013883943034431677, + "loss": 1.0887, + "step": 8453 + }, + { + "epoch": 0.81, + "grad_norm": 0.2617135621115103, + "learning_rate": 0.0001388248523040352, + "loss": 1.1484, + "step": 8454 + }, + { + "epoch": 0.81, + "grad_norm": 0.2589470456450907, + "learning_rate": 0.00013881027329213727, + "loss": 0.976, + "step": 8455 + }, + { + "epoch": 0.81, + "grad_norm": 0.2971405117388152, + "learning_rate": 0.0001387956933089879, + "loss": 1.1232, + "step": 8456 + }, + { + "epoch": 0.81, + "grad_norm": 0.2638200620165654, + "learning_rate": 0.0001387811123549519, + "loss": 1.1529, + "step": 8457 + }, + { + "epoch": 0.81, + "grad_norm": 0.2792026970736888, + "learning_rate": 0.00013876653043039418, + "loss": 1.1957, + "step": 8458 + }, + { + "epoch": 0.81, + "grad_norm": 0.27823742191201595, + "learning_rate": 0.0001387519475356797, + "loss": 1.0414, + "step": 8459 + }, + { + "epoch": 0.81, + "grad_norm": 0.25238962999634595, + "learning_rate": 0.00013873736367117336, + "loss": 1.0228, + "step": 8460 + }, + { + "epoch": 0.81, + "grad_norm": 0.28008622103734476, + "learning_rate": 0.00013872277883724015, + "loss": 1.023, + "step": 8461 + }, + { + "epoch": 0.81, + "grad_norm": 0.2777949786844138, + "learning_rate": 0.00013870819303424506, + "loss": 1.1302, + "step": 8462 + }, + { + "epoch": 0.81, + "grad_norm": 0.24701542829888543, + "learning_rate": 0.0001386936062625531, + "loss": 1.1174, + "step": 8463 + }, + { + "epoch": 0.81, + "grad_norm": 0.29033108955425646, + "learning_rate": 0.00013867901852252935, + "loss": 1.0872, + "step": 8464 + }, + { + "epoch": 0.81, + "grad_norm": 0.2747150319565986, + "learning_rate": 0.00013866442981453887, + "loss": 0.95, + "step": 8465 + }, + { + "epoch": 0.81, + "grad_norm": 0.28133751977616195, + "learning_rate": 0.00013864984013894669, + "loss": 1.04, + "step": 8466 + }, + { + "epoch": 0.81, + "grad_norm": 0.2603016003322199, + "learning_rate": 0.00013863524949611798, + "loss": 1.0926, + "step": 8467 + }, + { + "epoch": 0.81, + "grad_norm": 0.28873463989684345, + "learning_rate": 0.00013862065788641787, + "loss": 1.0727, + "step": 8468 + }, + { + "epoch": 0.81, + "grad_norm": 0.3043522634022851, + "learning_rate": 0.00013860606531021155, + "loss": 1.017, + "step": 8469 + }, + { + "epoch": 0.81, + "grad_norm": 0.258266209093827, + "learning_rate": 0.00013859147176786417, + "loss": 1.0572, + "step": 8470 + }, + { + "epoch": 0.81, + "grad_norm": 0.2960901666392521, + "learning_rate": 0.00013857687725974093, + "loss": 0.9861, + "step": 8471 + }, + { + "epoch": 0.81, + "grad_norm": 0.29910616519981614, + "learning_rate": 0.00013856228178620709, + "loss": 1.1596, + "step": 8472 + }, + { + "epoch": 0.81, + "grad_norm": 0.2990576464485771, + "learning_rate": 0.00013854768534762795, + "loss": 1.0265, + "step": 8473 + }, + { + "epoch": 0.81, + "grad_norm": 0.31441877420729514, + "learning_rate": 0.00013853308794436876, + "loss": 1.0613, + "step": 8474 + }, + { + "epoch": 0.81, + "grad_norm": 0.2634200444546087, + "learning_rate": 0.0001385184895767948, + "loss": 1.004, + "step": 8475 + }, + { + "epoch": 0.81, + "grad_norm": 0.2556240322403397, + "learning_rate": 0.0001385038902452714, + "loss": 1.1238, + "step": 8476 + }, + { + "epoch": 0.81, + "grad_norm": 0.27888482542250587, + "learning_rate": 0.00013848928995016403, + "loss": 1.1326, + "step": 8477 + }, + { + "epoch": 0.81, + "grad_norm": 0.27815317220422986, + "learning_rate": 0.00013847468869183796, + "loss": 1.123, + "step": 8478 + }, + { + "epoch": 0.81, + "grad_norm": 0.2831745331072525, + "learning_rate": 0.00013846008647065857, + "loss": 1.0664, + "step": 8479 + }, + { + "epoch": 0.81, + "grad_norm": 0.3041401975450099, + "learning_rate": 0.0001384454832869914, + "loss": 1.0162, + "step": 8480 + }, + { + "epoch": 0.81, + "grad_norm": 0.27417387963872447, + "learning_rate": 0.00013843087914120185, + "loss": 1.1522, + "step": 8481 + }, + { + "epoch": 0.81, + "grad_norm": 0.2551238865778478, + "learning_rate": 0.00013841627403365537, + "loss": 1.1159, + "step": 8482 + }, + { + "epoch": 0.81, + "grad_norm": 0.2772835560718097, + "learning_rate": 0.0001384016679647175, + "loss": 1.0014, + "step": 8483 + }, + { + "epoch": 0.81, + "grad_norm": 0.266588619864142, + "learning_rate": 0.00013838706093475379, + "loss": 1.0242, + "step": 8484 + }, + { + "epoch": 0.81, + "grad_norm": 0.2950429695678513, + "learning_rate": 0.0001383724529441297, + "loss": 1.0789, + "step": 8485 + }, + { + "epoch": 0.81, + "grad_norm": 0.26853491046599237, + "learning_rate": 0.00013835784399321088, + "loss": 1.1386, + "step": 8486 + }, + { + "epoch": 0.81, + "grad_norm": 0.27338834723695976, + "learning_rate": 0.0001383432340823629, + "loss": 0.9977, + "step": 8487 + }, + { + "epoch": 0.81, + "grad_norm": 0.2865977979325858, + "learning_rate": 0.00013832862321195143, + "loss": 1.0583, + "step": 8488 + }, + { + "epoch": 0.81, + "grad_norm": 0.28068233052657926, + "learning_rate": 0.000138314011382342, + "loss": 1.0611, + "step": 8489 + }, + { + "epoch": 0.81, + "grad_norm": 0.3053917578678752, + "learning_rate": 0.0001382993985939004, + "loss": 1.1556, + "step": 8490 + }, + { + "epoch": 0.81, + "grad_norm": 0.26177227751546944, + "learning_rate": 0.00013828478484699227, + "loss": 1.1853, + "step": 8491 + }, + { + "epoch": 0.81, + "grad_norm": 0.28197772368176605, + "learning_rate": 0.00013827017014198336, + "loss": 1.108, + "step": 8492 + }, + { + "epoch": 0.81, + "grad_norm": 0.25534797811164694, + "learning_rate": 0.00013825555447923935, + "loss": 1.1122, + "step": 8493 + }, + { + "epoch": 0.81, + "grad_norm": 0.2985449459023056, + "learning_rate": 0.00013824093785912609, + "loss": 1.0912, + "step": 8494 + }, + { + "epoch": 0.81, + "grad_norm": 0.3047078671768318, + "learning_rate": 0.0001382263202820093, + "loss": 0.909, + "step": 8495 + }, + { + "epoch": 0.81, + "grad_norm": 0.2566715818782515, + "learning_rate": 0.0001382117017482548, + "loss": 1.0322, + "step": 8496 + }, + { + "epoch": 0.81, + "grad_norm": 0.2612952596331514, + "learning_rate": 0.0001381970822582285, + "loss": 0.9924, + "step": 8497 + }, + { + "epoch": 0.81, + "grad_norm": 0.2933790624492483, + "learning_rate": 0.00013818246181229618, + "loss": 1.0607, + "step": 8498 + }, + { + "epoch": 0.81, + "grad_norm": 0.3051960049265422, + "learning_rate": 0.00013816784041082374, + "loss": 1.1308, + "step": 8499 + }, + { + "epoch": 0.81, + "grad_norm": 0.2790257819761861, + "learning_rate": 0.0001381532180541772, + "loss": 1.0386, + "step": 8500 + }, + { + "epoch": 0.81, + "grad_norm": 0.3118122919886825, + "learning_rate": 0.0001381385947427223, + "loss": 1.1372, + "step": 8501 + }, + { + "epoch": 0.81, + "grad_norm": 0.29570017068411963, + "learning_rate": 0.00013812397047682513, + "loss": 1.0147, + "step": 8502 + }, + { + "epoch": 0.81, + "grad_norm": 0.31683426314572966, + "learning_rate": 0.00013810934525685165, + "loss": 1.0712, + "step": 8503 + }, + { + "epoch": 0.81, + "grad_norm": 0.2909795486157777, + "learning_rate": 0.00013809471908316783, + "loss": 1.2234, + "step": 8504 + }, + { + "epoch": 0.81, + "grad_norm": 0.2955103667345504, + "learning_rate": 0.00013808009195613973, + "loss": 1.0783, + "step": 8505 + }, + { + "epoch": 0.81, + "grad_norm": 0.2867522171152345, + "learning_rate": 0.0001380654638761334, + "loss": 1.0277, + "step": 8506 + }, + { + "epoch": 0.81, + "grad_norm": 0.3681601782152508, + "learning_rate": 0.0001380508348435149, + "loss": 1.1069, + "step": 8507 + }, + { + "epoch": 0.81, + "grad_norm": 0.2932701192842754, + "learning_rate": 0.00013803620485865035, + "loss": 1.0882, + "step": 8508 + }, + { + "epoch": 0.81, + "grad_norm": 0.31028006826949395, + "learning_rate": 0.0001380215739219059, + "loss": 0.9705, + "step": 8509 + }, + { + "epoch": 0.81, + "grad_norm": 0.311953807926566, + "learning_rate": 0.00013800694203364763, + "loss": 1.1564, + "step": 8510 + }, + { + "epoch": 0.81, + "grad_norm": 0.2959800715326205, + "learning_rate": 0.00013799230919424175, + "loss": 1.0579, + "step": 8511 + }, + { + "epoch": 0.81, + "grad_norm": 0.2605263218657084, + "learning_rate": 0.00013797767540405447, + "loss": 1.0171, + "step": 8512 + }, + { + "epoch": 0.81, + "grad_norm": 0.30387836906175536, + "learning_rate": 0.00013796304066345197, + "loss": 0.9672, + "step": 8513 + }, + { + "epoch": 0.81, + "grad_norm": 0.299921065982018, + "learning_rate": 0.00013794840497280056, + "loss": 1.1199, + "step": 8514 + }, + { + "epoch": 0.81, + "grad_norm": 0.32487111245703615, + "learning_rate": 0.00013793376833246644, + "loss": 1.2367, + "step": 8515 + }, + { + "epoch": 0.81, + "grad_norm": 0.2847819157265213, + "learning_rate": 0.00013791913074281595, + "loss": 1.1113, + "step": 8516 + }, + { + "epoch": 0.81, + "grad_norm": 0.2868712720370698, + "learning_rate": 0.00013790449220421535, + "loss": 1.1692, + "step": 8517 + }, + { + "epoch": 0.81, + "grad_norm": 0.2678298209756187, + "learning_rate": 0.00013788985271703105, + "loss": 1.1474, + "step": 8518 + }, + { + "epoch": 0.82, + "grad_norm": 0.29099670328302274, + "learning_rate": 0.00013787521228162934, + "loss": 1.0656, + "step": 8519 + }, + { + "epoch": 0.82, + "grad_norm": 0.30474414530355776, + "learning_rate": 0.00013786057089837663, + "loss": 1.0057, + "step": 8520 + }, + { + "epoch": 0.82, + "grad_norm": 0.28018691357325703, + "learning_rate": 0.00013784592856763936, + "loss": 1.0637, + "step": 8521 + }, + { + "epoch": 0.82, + "grad_norm": 0.30281968491489586, + "learning_rate": 0.00013783128528978395, + "loss": 0.9859, + "step": 8522 + }, + { + "epoch": 0.82, + "grad_norm": 0.2833193148012386, + "learning_rate": 0.00013781664106517685, + "loss": 1.1878, + "step": 8523 + }, + { + "epoch": 0.82, + "grad_norm": 0.27855400568860456, + "learning_rate": 0.00013780199589418453, + "loss": 1.0736, + "step": 8524 + }, + { + "epoch": 0.82, + "grad_norm": 0.3023424425522343, + "learning_rate": 0.00013778734977717348, + "loss": 0.9647, + "step": 8525 + }, + { + "epoch": 0.82, + "grad_norm": 0.24163910511953707, + "learning_rate": 0.00013777270271451031, + "loss": 1.0617, + "step": 8526 + }, + { + "epoch": 0.82, + "grad_norm": 0.2643004336760188, + "learning_rate": 0.00013775805470656147, + "loss": 1.0083, + "step": 8527 + }, + { + "epoch": 0.82, + "grad_norm": 0.30690050558969756, + "learning_rate": 0.00013774340575369357, + "loss": 1.1863, + "step": 8528 + }, + { + "epoch": 0.82, + "grad_norm": 0.2803369681980211, + "learning_rate": 0.00013772875585627326, + "loss": 0.9811, + "step": 8529 + }, + { + "epoch": 0.82, + "grad_norm": 0.27882776591624236, + "learning_rate": 0.00013771410501466712, + "loss": 1.0438, + "step": 8530 + }, + { + "epoch": 0.82, + "grad_norm": 0.2866694179672929, + "learning_rate": 0.00013769945322924179, + "loss": 1.0089, + "step": 8531 + }, + { + "epoch": 0.82, + "grad_norm": 0.3193049906912792, + "learning_rate": 0.00013768480050036392, + "loss": 0.9945, + "step": 8532 + }, + { + "epoch": 0.82, + "grad_norm": 0.3134610280410675, + "learning_rate": 0.00013767014682840027, + "loss": 1.0041, + "step": 8533 + }, + { + "epoch": 0.82, + "grad_norm": 0.25858959641085166, + "learning_rate": 0.0001376554922137175, + "loss": 1.0951, + "step": 8534 + }, + { + "epoch": 0.82, + "grad_norm": 0.27495664936656555, + "learning_rate": 0.00013764083665668237, + "loss": 1.0078, + "step": 8535 + }, + { + "epoch": 0.82, + "grad_norm": 0.26221912952359344, + "learning_rate": 0.00013762618015766167, + "loss": 1.0953, + "step": 8536 + }, + { + "epoch": 0.82, + "grad_norm": 0.2969797589840666, + "learning_rate": 0.00013761152271702214, + "loss": 1.104, + "step": 8537 + }, + { + "epoch": 0.82, + "grad_norm": 0.3003478528891654, + "learning_rate": 0.00013759686433513062, + "loss": 1.0608, + "step": 8538 + }, + { + "epoch": 0.82, + "grad_norm": 0.35023226379753625, + "learning_rate": 0.00013758220501235396, + "loss": 1.0195, + "step": 8539 + }, + { + "epoch": 0.82, + "grad_norm": 0.2598816030866285, + "learning_rate": 0.000137567544749059, + "loss": 0.8741, + "step": 8540 + }, + { + "epoch": 0.82, + "grad_norm": 0.29603444421799885, + "learning_rate": 0.0001375528835456126, + "loss": 1.124, + "step": 8541 + }, + { + "epoch": 0.82, + "grad_norm": 0.24876455442000533, + "learning_rate": 0.0001375382214023817, + "loss": 0.9333, + "step": 8542 + }, + { + "epoch": 0.82, + "grad_norm": 0.29348410647341444, + "learning_rate": 0.00013752355831973324, + "loss": 0.9545, + "step": 8543 + }, + { + "epoch": 0.82, + "grad_norm": 0.31014489060350814, + "learning_rate": 0.00013750889429803412, + "loss": 1.0946, + "step": 8544 + }, + { + "epoch": 0.82, + "grad_norm": 0.2901149550164192, + "learning_rate": 0.00013749422933765135, + "loss": 1.1371, + "step": 8545 + }, + { + "epoch": 0.82, + "grad_norm": 0.2796054693890663, + "learning_rate": 0.00013747956343895194, + "loss": 1.0809, + "step": 8546 + }, + { + "epoch": 0.82, + "grad_norm": 0.2809110179945892, + "learning_rate": 0.00013746489660230288, + "loss": 1.0668, + "step": 8547 + }, + { + "epoch": 0.82, + "grad_norm": 0.29179871521386425, + "learning_rate": 0.00013745022882807127, + "loss": 1.1174, + "step": 8548 + }, + { + "epoch": 0.82, + "grad_norm": 0.310921629295036, + "learning_rate": 0.00013743556011662413, + "loss": 1.0533, + "step": 8549 + }, + { + "epoch": 0.82, + "grad_norm": 0.2538334627557585, + "learning_rate": 0.00013742089046832855, + "loss": 0.994, + "step": 8550 + }, + { + "epoch": 0.82, + "grad_norm": 0.2919456298279374, + "learning_rate": 0.00013740621988355168, + "loss": 1.0132, + "step": 8551 + }, + { + "epoch": 0.82, + "grad_norm": 0.2532952808062169, + "learning_rate": 0.00013739154836266064, + "loss": 0.9537, + "step": 8552 + }, + { + "epoch": 0.82, + "grad_norm": 0.31354688802504277, + "learning_rate": 0.0001373768759060226, + "loss": 1.1491, + "step": 8553 + }, + { + "epoch": 0.82, + "grad_norm": 0.2485754538763557, + "learning_rate": 0.00013736220251400478, + "loss": 1.1273, + "step": 8554 + }, + { + "epoch": 0.82, + "grad_norm": 0.29503224281382395, + "learning_rate": 0.00013734752818697434, + "loss": 1.0555, + "step": 8555 + }, + { + "epoch": 0.82, + "grad_norm": 0.2671248855695662, + "learning_rate": 0.00013733285292529855, + "loss": 1.001, + "step": 8556 + }, + { + "epoch": 0.82, + "grad_norm": 0.3018392538404123, + "learning_rate": 0.00013731817672934463, + "loss": 1.1354, + "step": 8557 + }, + { + "epoch": 0.82, + "grad_norm": 0.297525745763971, + "learning_rate": 0.0001373034995994799, + "loss": 1.1386, + "step": 8558 + }, + { + "epoch": 0.82, + "grad_norm": 0.3031989211002846, + "learning_rate": 0.00013728882153607165, + "loss": 1.0058, + "step": 8559 + }, + { + "epoch": 0.82, + "grad_norm": 0.2925383632398921, + "learning_rate": 0.00013727414253948719, + "loss": 0.9204, + "step": 8560 + }, + { + "epoch": 0.82, + "grad_norm": 0.28581280375824064, + "learning_rate": 0.0001372594626100939, + "loss": 1.1115, + "step": 8561 + }, + { + "epoch": 0.82, + "grad_norm": 0.2849917863325391, + "learning_rate": 0.00013724478174825916, + "loss": 1.0351, + "step": 8562 + }, + { + "epoch": 0.82, + "grad_norm": 0.2867357667558398, + "learning_rate": 0.0001372300999543503, + "loss": 0.9713, + "step": 8563 + }, + { + "epoch": 0.82, + "grad_norm": 0.2912838079543734, + "learning_rate": 0.00013721541722873484, + "loss": 1.0435, + "step": 8564 + }, + { + "epoch": 0.82, + "grad_norm": 0.3121331293675006, + "learning_rate": 0.00013720073357178017, + "loss": 1.1627, + "step": 8565 + }, + { + "epoch": 0.82, + "grad_norm": 0.29971607543863277, + "learning_rate": 0.00013718604898385375, + "loss": 1.1364, + "step": 8566 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740375080807219, + "learning_rate": 0.00013717136346532306, + "loss": 1.0376, + "step": 8567 + }, + { + "epoch": 0.82, + "grad_norm": 0.26055014984233066, + "learning_rate": 0.00013715667701655565, + "loss": 0.9892, + "step": 8568 + }, + { + "epoch": 0.82, + "grad_norm": 0.28985982137880956, + "learning_rate": 0.00013714198963791908, + "loss": 1.0258, + "step": 8569 + }, + { + "epoch": 0.82, + "grad_norm": 0.24153544179134467, + "learning_rate": 0.00013712730132978083, + "loss": 1.0979, + "step": 8570 + }, + { + "epoch": 0.82, + "grad_norm": 0.2819940528233919, + "learning_rate": 0.00013711261209250857, + "loss": 1.0627, + "step": 8571 + }, + { + "epoch": 0.82, + "grad_norm": 0.3014061072273147, + "learning_rate": 0.00013709792192646985, + "loss": 1.0036, + "step": 8572 + }, + { + "epoch": 0.82, + "grad_norm": 0.2943687582185628, + "learning_rate": 0.00013708323083203228, + "loss": 1.1475, + "step": 8573 + }, + { + "epoch": 0.82, + "grad_norm": 0.29418992771044433, + "learning_rate": 0.0001370685388095636, + "loss": 1.0402, + "step": 8574 + }, + { + "epoch": 0.82, + "grad_norm": 0.3135438299333758, + "learning_rate": 0.00013705384585943145, + "loss": 1.1125, + "step": 8575 + }, + { + "epoch": 0.82, + "grad_norm": 0.28733629193402527, + "learning_rate": 0.00013703915198200347, + "loss": 1.0161, + "step": 8576 + }, + { + "epoch": 0.82, + "grad_norm": 0.29180260421642634, + "learning_rate": 0.00013702445717764746, + "loss": 1.0476, + "step": 8577 + }, + { + "epoch": 0.82, + "grad_norm": 0.2980601220397487, + "learning_rate": 0.00013700976144673116, + "loss": 1.0602, + "step": 8578 + }, + { + "epoch": 0.82, + "grad_norm": 0.3066488522260643, + "learning_rate": 0.00013699506478962231, + "loss": 1.0332, + "step": 8579 + }, + { + "epoch": 0.82, + "grad_norm": 0.3353162200492158, + "learning_rate": 0.00013698036720668873, + "loss": 0.9495, + "step": 8580 + }, + { + "epoch": 0.82, + "grad_norm": 0.2816337655551374, + "learning_rate": 0.00013696566869829816, + "loss": 1.1274, + "step": 8581 + }, + { + "epoch": 0.82, + "grad_norm": 0.3100139223925439, + "learning_rate": 0.00013695096926481855, + "loss": 1.0753, + "step": 8582 + }, + { + "epoch": 0.82, + "grad_norm": 0.2746129531995423, + "learning_rate": 0.0001369362689066177, + "loss": 1.0354, + "step": 8583 + }, + { + "epoch": 0.82, + "grad_norm": 0.2697359432512603, + "learning_rate": 0.00013692156762406347, + "loss": 0.9328, + "step": 8584 + }, + { + "epoch": 0.82, + "grad_norm": 0.290636331567875, + "learning_rate": 0.00013690686541752384, + "loss": 1.0889, + "step": 8585 + }, + { + "epoch": 0.82, + "grad_norm": 0.28720885198550766, + "learning_rate": 0.0001368921622873667, + "loss": 1.1372, + "step": 8586 + }, + { + "epoch": 0.82, + "grad_norm": 0.30775748112991347, + "learning_rate": 0.00013687745823396007, + "loss": 1.1094, + "step": 8587 + }, + { + "epoch": 0.82, + "grad_norm": 0.2624465157124863, + "learning_rate": 0.0001368627532576718, + "loss": 0.9509, + "step": 8588 + }, + { + "epoch": 0.82, + "grad_norm": 0.26983758242587186, + "learning_rate": 0.00013684804735887, + "loss": 1.0852, + "step": 8589 + }, + { + "epoch": 0.82, + "grad_norm": 0.3019798930382428, + "learning_rate": 0.00013683334053792262, + "loss": 1.0498, + "step": 8590 + }, + { + "epoch": 0.82, + "grad_norm": 0.28333579655941465, + "learning_rate": 0.00013681863279519776, + "loss": 1.0397, + "step": 8591 + }, + { + "epoch": 0.82, + "grad_norm": 0.27677050640426154, + "learning_rate": 0.0001368039241310635, + "loss": 1.0302, + "step": 8592 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740272283421442, + "learning_rate": 0.00013678921454588787, + "loss": 0.9894, + "step": 8593 + }, + { + "epoch": 0.82, + "grad_norm": 0.30385602594318123, + "learning_rate": 0.00013677450404003905, + "loss": 0.9967, + "step": 8594 + }, + { + "epoch": 0.82, + "grad_norm": 0.2543360042755725, + "learning_rate": 0.0001367597926138851, + "loss": 1.1073, + "step": 8595 + }, + { + "epoch": 0.82, + "grad_norm": 0.30243436611460256, + "learning_rate": 0.0001367450802677943, + "loss": 1.0471, + "step": 8596 + }, + { + "epoch": 0.82, + "grad_norm": 0.298278417963026, + "learning_rate": 0.00013673036700213476, + "loss": 1.108, + "step": 8597 + }, + { + "epoch": 0.82, + "grad_norm": 0.344466314776565, + "learning_rate": 0.0001367156528172747, + "loss": 1.0931, + "step": 8598 + }, + { + "epoch": 0.82, + "grad_norm": 0.2597226647292173, + "learning_rate": 0.00013670093771358234, + "loss": 1.0871, + "step": 8599 + }, + { + "epoch": 0.82, + "grad_norm": 0.24557911295793564, + "learning_rate": 0.00013668622169142597, + "loss": 1.0619, + "step": 8600 + }, + { + "epoch": 0.82, + "grad_norm": 0.2747878102964318, + "learning_rate": 0.00013667150475117382, + "loss": 1.1528, + "step": 8601 + }, + { + "epoch": 0.82, + "grad_norm": 0.2853209445830605, + "learning_rate": 0.00013665678689319424, + "loss": 1.0169, + "step": 8602 + }, + { + "epoch": 0.82, + "grad_norm": 0.29466027746832646, + "learning_rate": 0.00013664206811785554, + "loss": 1.0538, + "step": 8603 + }, + { + "epoch": 0.82, + "grad_norm": 0.28940310201790015, + "learning_rate": 0.000136627348425526, + "loss": 0.9515, + "step": 8604 + }, + { + "epoch": 0.82, + "grad_norm": 0.28401605740381525, + "learning_rate": 0.0001366126278165741, + "loss": 1.1516, + "step": 8605 + }, + { + "epoch": 0.82, + "grad_norm": 0.2740954715279189, + "learning_rate": 0.00013659790629136817, + "loss": 1.0099, + "step": 8606 + }, + { + "epoch": 0.82, + "grad_norm": 0.2874894186741719, + "learning_rate": 0.00013658318385027665, + "loss": 0.9993, + "step": 8607 + }, + { + "epoch": 0.82, + "grad_norm": 0.2673309894002083, + "learning_rate": 0.0001365684604936679, + "loss": 1.1178, + "step": 8608 + }, + { + "epoch": 0.82, + "grad_norm": 0.28060184644025465, + "learning_rate": 0.0001365537362219105, + "loss": 1.1111, + "step": 8609 + }, + { + "epoch": 0.82, + "grad_norm": 0.262926201140649, + "learning_rate": 0.00013653901103537287, + "loss": 1.0677, + "step": 8610 + }, + { + "epoch": 0.82, + "grad_norm": 0.2603806384626445, + "learning_rate": 0.0001365242849344235, + "loss": 1.0661, + "step": 8611 + }, + { + "epoch": 0.82, + "grad_norm": 0.27821013084562063, + "learning_rate": 0.00013650955791943097, + "loss": 1.1453, + "step": 8612 + }, + { + "epoch": 0.82, + "grad_norm": 0.3025962932247498, + "learning_rate": 0.0001364948299907638, + "loss": 1.2105, + "step": 8613 + }, + { + "epoch": 0.82, + "grad_norm": 0.27742930612541283, + "learning_rate": 0.00013648010114879056, + "loss": 0.9934, + "step": 8614 + }, + { + "epoch": 0.82, + "grad_norm": 0.2436905652104897, + "learning_rate": 0.0001364653713938799, + "loss": 0.98, + "step": 8615 + }, + { + "epoch": 0.82, + "grad_norm": 0.308431748553756, + "learning_rate": 0.00013645064072640036, + "loss": 1.1859, + "step": 8616 + }, + { + "epoch": 0.82, + "grad_norm": 0.32785207627829216, + "learning_rate": 0.00013643590914672065, + "loss": 1.0128, + "step": 8617 + }, + { + "epoch": 0.82, + "grad_norm": 0.3089734401683715, + "learning_rate": 0.00013642117665520938, + "loss": 1.1553, + "step": 8618 + }, + { + "epoch": 0.82, + "grad_norm": 0.3077673526631775, + "learning_rate": 0.0001364064432522353, + "loss": 1.0379, + "step": 8619 + }, + { + "epoch": 0.82, + "grad_norm": 0.29158179240095367, + "learning_rate": 0.00013639170893816713, + "loss": 1.0954, + "step": 8620 + }, + { + "epoch": 0.82, + "grad_norm": 0.2899047445443715, + "learning_rate": 0.00013637697371337353, + "loss": 1.0493, + "step": 8621 + }, + { + "epoch": 0.82, + "grad_norm": 0.2409967915205593, + "learning_rate": 0.0001363622375782233, + "loss": 1.0478, + "step": 8622 + }, + { + "epoch": 0.82, + "grad_norm": 0.294664521205138, + "learning_rate": 0.00013634750053308524, + "loss": 1.1147, + "step": 8623 + }, + { + "epoch": 0.83, + "grad_norm": 0.2691622747456586, + "learning_rate": 0.00013633276257832814, + "loss": 1.0245, + "step": 8624 + }, + { + "epoch": 0.83, + "grad_norm": 0.2662427740875133, + "learning_rate": 0.0001363180237143208, + "loss": 1.1204, + "step": 8625 + }, + { + "epoch": 0.83, + "grad_norm": 0.315565873248549, + "learning_rate": 0.00013630328394143213, + "loss": 1.0977, + "step": 8626 + }, + { + "epoch": 0.83, + "grad_norm": 0.2958160614252357, + "learning_rate": 0.00013628854326003093, + "loss": 1.0883, + "step": 8627 + }, + { + "epoch": 0.83, + "grad_norm": 0.27828748808814535, + "learning_rate": 0.00013627380167048614, + "loss": 0.9, + "step": 8628 + }, + { + "epoch": 0.83, + "grad_norm": 0.2605213690147047, + "learning_rate": 0.00013625905917316665, + "loss": 1.1766, + "step": 8629 + }, + { + "epoch": 0.83, + "grad_norm": 0.28580811147831237, + "learning_rate": 0.00013624431576844144, + "loss": 1.142, + "step": 8630 + }, + { + "epoch": 0.83, + "grad_norm": 0.2645180239308723, + "learning_rate": 0.00013622957145667945, + "loss": 1.0222, + "step": 8631 + }, + { + "epoch": 0.83, + "grad_norm": 0.27332832418491226, + "learning_rate": 0.00013621482623824965, + "loss": 1.1453, + "step": 8632 + }, + { + "epoch": 0.83, + "grad_norm": 0.24290270100131645, + "learning_rate": 0.00013620008011352105, + "loss": 1.0981, + "step": 8633 + }, + { + "epoch": 0.83, + "grad_norm": 0.2738275452675957, + "learning_rate": 0.0001361853330828627, + "loss": 1.087, + "step": 8634 + }, + { + "epoch": 0.83, + "grad_norm": 0.2973345148003589, + "learning_rate": 0.00013617058514664367, + "loss": 1.0528, + "step": 8635 + }, + { + "epoch": 0.83, + "grad_norm": 0.30243238138049405, + "learning_rate": 0.000136155836305233, + "loss": 1.1979, + "step": 8636 + }, + { + "epoch": 0.83, + "grad_norm": 0.2973554678344532, + "learning_rate": 0.0001361410865589998, + "loss": 1.1496, + "step": 8637 + }, + { + "epoch": 0.83, + "grad_norm": 0.27907427949892327, + "learning_rate": 0.00013612633590831319, + "loss": 1.1112, + "step": 8638 + }, + { + "epoch": 0.83, + "grad_norm": 0.2546823283144357, + "learning_rate": 0.00013611158435354232, + "loss": 0.9379, + "step": 8639 + }, + { + "epoch": 0.83, + "grad_norm": 0.24491672704420625, + "learning_rate": 0.0001360968318950564, + "loss": 1.1017, + "step": 8640 + }, + { + "epoch": 0.83, + "grad_norm": 0.2397150988377803, + "learning_rate": 0.00013608207853322454, + "loss": 0.9593, + "step": 8641 + }, + { + "epoch": 0.83, + "grad_norm": 0.27419817711093586, + "learning_rate": 0.00013606732426841596, + "loss": 0.937, + "step": 8642 + }, + { + "epoch": 0.83, + "grad_norm": 0.3159034039720681, + "learning_rate": 0.00013605256910099997, + "loss": 1.1914, + "step": 8643 + }, + { + "epoch": 0.83, + "grad_norm": 0.27134589534535863, + "learning_rate": 0.00013603781303134576, + "loss": 1.1164, + "step": 8644 + }, + { + "epoch": 0.83, + "grad_norm": 0.3025553917420271, + "learning_rate": 0.00013602305605982262, + "loss": 1.0479, + "step": 8645 + }, + { + "epoch": 0.83, + "grad_norm": 0.27129927828178074, + "learning_rate": 0.0001360082981867999, + "loss": 1.1052, + "step": 8646 + }, + { + "epoch": 0.83, + "grad_norm": 0.31517001904029224, + "learning_rate": 0.00013599353941264684, + "loss": 1.0203, + "step": 8647 + }, + { + "epoch": 0.83, + "grad_norm": 0.2851479531322408, + "learning_rate": 0.0001359787797377329, + "loss": 1.0592, + "step": 8648 + }, + { + "epoch": 0.83, + "grad_norm": 0.2856016662841958, + "learning_rate": 0.00013596401916242732, + "loss": 1.0729, + "step": 8649 + }, + { + "epoch": 0.83, + "grad_norm": 0.2847359988614896, + "learning_rate": 0.00013594925768709959, + "loss": 1.0519, + "step": 8650 + }, + { + "epoch": 0.83, + "grad_norm": 0.2656376058979404, + "learning_rate": 0.00013593449531211908, + "loss": 1.0247, + "step": 8651 + }, + { + "epoch": 0.83, + "grad_norm": 0.2591314203645854, + "learning_rate": 0.00013591973203785524, + "loss": 1.099, + "step": 8652 + }, + { + "epoch": 0.83, + "grad_norm": 0.28779350087950495, + "learning_rate": 0.00013590496786467754, + "loss": 1.054, + "step": 8653 + }, + { + "epoch": 0.83, + "grad_norm": 0.2546447105719811, + "learning_rate": 0.00013589020279295544, + "loss": 1.0369, + "step": 8654 + }, + { + "epoch": 0.83, + "grad_norm": 0.33112794737534357, + "learning_rate": 0.00013587543682305847, + "loss": 1.0239, + "step": 8655 + }, + { + "epoch": 0.83, + "grad_norm": 0.261490090551106, + "learning_rate": 0.00013586066995535616, + "loss": 1.0201, + "step": 8656 + }, + { + "epoch": 0.83, + "grad_norm": 0.2880468769072062, + "learning_rate": 0.000135845902190218, + "loss": 1.1193, + "step": 8657 + }, + { + "epoch": 0.83, + "grad_norm": 0.288557868149757, + "learning_rate": 0.00013583113352801367, + "loss": 1.1048, + "step": 8658 + }, + { + "epoch": 0.83, + "grad_norm": 0.29517378742374495, + "learning_rate": 0.00013581636396911266, + "loss": 1.0905, + "step": 8659 + }, + { + "epoch": 0.83, + "grad_norm": 0.2617708019113313, + "learning_rate": 0.00013580159351388464, + "loss": 1.0682, + "step": 8660 + }, + { + "epoch": 0.83, + "grad_norm": 0.31436033052025997, + "learning_rate": 0.00013578682216269927, + "loss": 0.9827, + "step": 8661 + }, + { + "epoch": 0.83, + "grad_norm": 0.2710558291831824, + "learning_rate": 0.00013577204991592617, + "loss": 1.0678, + "step": 8662 + }, + { + "epoch": 0.83, + "grad_norm": 0.30589524951652153, + "learning_rate": 0.000135757276773935, + "loss": 1.0166, + "step": 8663 + }, + { + "epoch": 0.83, + "grad_norm": 0.31535012911315774, + "learning_rate": 0.00013574250273709555, + "loss": 1.0809, + "step": 8664 + }, + { + "epoch": 0.83, + "grad_norm": 0.34357963327522195, + "learning_rate": 0.0001357277278057775, + "loss": 1.0025, + "step": 8665 + }, + { + "epoch": 0.83, + "grad_norm": 0.29336089111450286, + "learning_rate": 0.0001357129519803506, + "loss": 1.14, + "step": 8666 + }, + { + "epoch": 0.83, + "grad_norm": 0.2988503565133178, + "learning_rate": 0.00013569817526118465, + "loss": 0.9728, + "step": 8667 + }, + { + "epoch": 0.83, + "grad_norm": 0.2586925578598481, + "learning_rate": 0.0001356833976486494, + "loss": 1.0163, + "step": 8668 + }, + { + "epoch": 0.83, + "grad_norm": 0.2937963873608329, + "learning_rate": 0.0001356686191431147, + "loss": 1.0555, + "step": 8669 + }, + { + "epoch": 0.83, + "grad_norm": 0.29571540658867496, + "learning_rate": 0.0001356538397449504, + "loss": 1.0379, + "step": 8670 + }, + { + "epoch": 0.83, + "grad_norm": 0.27868553879152896, + "learning_rate": 0.00013563905945452638, + "loss": 1.0352, + "step": 8671 + }, + { + "epoch": 0.83, + "grad_norm": 0.29105897745078474, + "learning_rate": 0.00013562427827221244, + "loss": 0.993, + "step": 8672 + }, + { + "epoch": 0.83, + "grad_norm": 0.24758183366141243, + "learning_rate": 0.0001356094961983786, + "loss": 0.8968, + "step": 8673 + }, + { + "epoch": 0.83, + "grad_norm": 0.2605888157367459, + "learning_rate": 0.0001355947132333947, + "loss": 1.0564, + "step": 8674 + }, + { + "epoch": 0.83, + "grad_norm": 0.2924270653891825, + "learning_rate": 0.00013557992937763077, + "loss": 1.0911, + "step": 8675 + }, + { + "epoch": 0.83, + "grad_norm": 0.3153612885581894, + "learning_rate": 0.00013556514463145672, + "loss": 1.0308, + "step": 8676 + }, + { + "epoch": 0.83, + "grad_norm": 0.27974012653020464, + "learning_rate": 0.00013555035899524257, + "loss": 1.0977, + "step": 8677 + }, + { + "epoch": 0.83, + "grad_norm": 0.2686146240839227, + "learning_rate": 0.00013553557246935834, + "loss": 0.9855, + "step": 8678 + }, + { + "epoch": 0.83, + "grad_norm": 0.2809894346574622, + "learning_rate": 0.00013552078505417412, + "loss": 1.1707, + "step": 8679 + }, + { + "epoch": 0.83, + "grad_norm": 0.2869178669204025, + "learning_rate": 0.00013550599675005986, + "loss": 1.1491, + "step": 8680 + }, + { + "epoch": 0.83, + "grad_norm": 0.2955879869986699, + "learning_rate": 0.00013549120755738576, + "loss": 1.0608, + "step": 8681 + }, + { + "epoch": 0.83, + "grad_norm": 0.2875101038157737, + "learning_rate": 0.00013547641747652187, + "loss": 1.0307, + "step": 8682 + }, + { + "epoch": 0.83, + "grad_norm": 0.28175365232286614, + "learning_rate": 0.00013546162650783836, + "loss": 1.0323, + "step": 8683 + }, + { + "epoch": 0.83, + "grad_norm": 0.27831603189171494, + "learning_rate": 0.00013544683465170537, + "loss": 1.057, + "step": 8684 + }, + { + "epoch": 0.83, + "grad_norm": 0.3073334793522284, + "learning_rate": 0.00013543204190849303, + "loss": 1.0045, + "step": 8685 + }, + { + "epoch": 0.83, + "grad_norm": 0.29215172333875483, + "learning_rate": 0.00013541724827857157, + "loss": 1.1192, + "step": 8686 + }, + { + "epoch": 0.83, + "grad_norm": 0.2771543039799703, + "learning_rate": 0.00013540245376231122, + "loss": 1.1609, + "step": 8687 + }, + { + "epoch": 0.83, + "grad_norm": 0.29777594479305014, + "learning_rate": 0.00013538765836008224, + "loss": 0.9681, + "step": 8688 + }, + { + "epoch": 0.83, + "grad_norm": 0.2545360895510685, + "learning_rate": 0.00013537286207225484, + "loss": 1.0139, + "step": 8689 + }, + { + "epoch": 0.83, + "grad_norm": 0.26647845539942555, + "learning_rate": 0.00013535806489919935, + "loss": 1.1347, + "step": 8690 + }, + { + "epoch": 0.83, + "grad_norm": 0.29722399863436927, + "learning_rate": 0.00013534326684128605, + "loss": 1.1584, + "step": 8691 + }, + { + "epoch": 0.83, + "grad_norm": 0.30891387675873, + "learning_rate": 0.00013532846789888532, + "loss": 1.0211, + "step": 8692 + }, + { + "epoch": 0.83, + "grad_norm": 0.31589225263631776, + "learning_rate": 0.00013531366807236742, + "loss": 1.0062, + "step": 8693 + }, + { + "epoch": 0.83, + "grad_norm": 0.2972943846537337, + "learning_rate": 0.00013529886736210285, + "loss": 1.0088, + "step": 8694 + }, + { + "epoch": 0.83, + "grad_norm": 0.28937720201774464, + "learning_rate": 0.00013528406576846189, + "loss": 0.9853, + "step": 8695 + }, + { + "epoch": 0.83, + "grad_norm": 0.24765821705508315, + "learning_rate": 0.000135269263291815, + "loss": 0.9191, + "step": 8696 + }, + { + "epoch": 0.83, + "grad_norm": 0.27414169282442363, + "learning_rate": 0.00013525445993253267, + "loss": 1.0309, + "step": 8697 + }, + { + "epoch": 0.83, + "grad_norm": 0.2688265567951683, + "learning_rate": 0.0001352396556909853, + "loss": 1.0866, + "step": 8698 + }, + { + "epoch": 0.83, + "grad_norm": 0.26180708297498495, + "learning_rate": 0.0001352248505675434, + "loss": 1.0702, + "step": 8699 + }, + { + "epoch": 0.83, + "grad_norm": 0.296931683455755, + "learning_rate": 0.00013521004456257748, + "loss": 1.0738, + "step": 8700 + }, + { + "epoch": 0.83, + "grad_norm": 0.2518973661220569, + "learning_rate": 0.0001351952376764581, + "loss": 1.0174, + "step": 8701 + }, + { + "epoch": 0.83, + "grad_norm": 0.30852287173282533, + "learning_rate": 0.00013518042990955575, + "loss": 1.0339, + "step": 8702 + }, + { + "epoch": 0.83, + "grad_norm": 0.28709043823004965, + "learning_rate": 0.000135165621262241, + "loss": 1.127, + "step": 8703 + }, + { + "epoch": 0.83, + "grad_norm": 0.2402225336023348, + "learning_rate": 0.00013515081173488453, + "loss": 1.0512, + "step": 8704 + }, + { + "epoch": 0.83, + "grad_norm": 0.3009799139239725, + "learning_rate": 0.00013513600132785688, + "loss": 0.995, + "step": 8705 + }, + { + "epoch": 0.83, + "grad_norm": 0.31841563271518275, + "learning_rate": 0.0001351211900415287, + "loss": 1.1342, + "step": 8706 + }, + { + "epoch": 0.83, + "grad_norm": 0.2726227896301979, + "learning_rate": 0.00013510637787627068, + "loss": 1.1005, + "step": 8707 + }, + { + "epoch": 0.83, + "grad_norm": 0.30718818182896745, + "learning_rate": 0.0001350915648324535, + "loss": 0.9629, + "step": 8708 + }, + { + "epoch": 0.83, + "grad_norm": 0.24754498668980748, + "learning_rate": 0.00013507675091044787, + "loss": 1.1344, + "step": 8709 + }, + { + "epoch": 0.83, + "grad_norm": 0.3341325246272737, + "learning_rate": 0.00013506193611062444, + "loss": 1.0433, + "step": 8710 + }, + { + "epoch": 0.83, + "grad_norm": 0.2831390747623922, + "learning_rate": 0.0001350471204333541, + "loss": 0.9654, + "step": 8711 + }, + { + "epoch": 0.83, + "grad_norm": 0.27958419761160536, + "learning_rate": 0.0001350323038790075, + "loss": 1.0065, + "step": 8712 + }, + { + "epoch": 0.83, + "grad_norm": 0.3219792061230287, + "learning_rate": 0.00013501748644795548, + "loss": 1.008, + "step": 8713 + }, + { + "epoch": 0.83, + "grad_norm": 0.27010290849276297, + "learning_rate": 0.00013500266814056886, + "loss": 0.9119, + "step": 8714 + }, + { + "epoch": 0.83, + "grad_norm": 0.3035143869003971, + "learning_rate": 0.0001349878489572185, + "loss": 1.0691, + "step": 8715 + }, + { + "epoch": 0.83, + "grad_norm": 0.27992327647557635, + "learning_rate": 0.0001349730288982752, + "loss": 1.067, + "step": 8716 + }, + { + "epoch": 0.83, + "grad_norm": 0.31109391121037, + "learning_rate": 0.00013495820796410987, + "loss": 1.0007, + "step": 8717 + }, + { + "epoch": 0.83, + "grad_norm": 0.30367401228401525, + "learning_rate": 0.00013494338615509344, + "loss": 1.1086, + "step": 8718 + }, + { + "epoch": 0.83, + "grad_norm": 0.27392132119871376, + "learning_rate": 0.00013492856347159678, + "loss": 0.9861, + "step": 8719 + }, + { + "epoch": 0.83, + "grad_norm": 0.318345859783819, + "learning_rate": 0.00013491373991399088, + "loss": 1.042, + "step": 8720 + }, + { + "epoch": 0.83, + "grad_norm": 0.3354485387598985, + "learning_rate": 0.0001348989154826467, + "loss": 1.1203, + "step": 8721 + }, + { + "epoch": 0.83, + "grad_norm": 0.3027876660339266, + "learning_rate": 0.0001348840901779352, + "loss": 1.1517, + "step": 8722 + }, + { + "epoch": 0.83, + "grad_norm": 0.3025089370772015, + "learning_rate": 0.00013486926400022744, + "loss": 1.2937, + "step": 8723 + }, + { + "epoch": 0.83, + "grad_norm": 0.27699234873722095, + "learning_rate": 0.00013485443694989443, + "loss": 1.0503, + "step": 8724 + }, + { + "epoch": 0.83, + "grad_norm": 0.30429620705081767, + "learning_rate": 0.00013483960902730725, + "loss": 1.0699, + "step": 8725 + }, + { + "epoch": 0.83, + "grad_norm": 0.3040291893430194, + "learning_rate": 0.00013482478023283694, + "loss": 1.0117, + "step": 8726 + }, + { + "epoch": 0.83, + "grad_norm": 0.28916049582699416, + "learning_rate": 0.00013480995056685462, + "loss": 1.0908, + "step": 8727 + }, + { + "epoch": 0.84, + "grad_norm": 0.24217865268899172, + "learning_rate": 0.00013479512002973143, + "loss": 1.056, + "step": 8728 + }, + { + "epoch": 0.84, + "grad_norm": 0.27032893449561624, + "learning_rate": 0.00013478028862183846, + "loss": 1.0668, + "step": 8729 + }, + { + "epoch": 0.84, + "grad_norm": 0.27542193970843054, + "learning_rate": 0.00013476545634354692, + "loss": 0.9492, + "step": 8730 + }, + { + "epoch": 0.84, + "grad_norm": 0.23543044415563721, + "learning_rate": 0.000134750623195228, + "loss": 1.0316, + "step": 8731 + }, + { + "epoch": 0.84, + "grad_norm": 0.3223797172938787, + "learning_rate": 0.0001347357891772529, + "loss": 1.1393, + "step": 8732 + }, + { + "epoch": 0.84, + "grad_norm": 0.2998222335002957, + "learning_rate": 0.0001347209542899928, + "loss": 1.1096, + "step": 8733 + }, + { + "epoch": 0.84, + "grad_norm": 0.2845913693982625, + "learning_rate": 0.00013470611853381905, + "loss": 1.0304, + "step": 8734 + }, + { + "epoch": 0.84, + "grad_norm": 0.312734748694851, + "learning_rate": 0.00013469128190910285, + "loss": 1.0816, + "step": 8735 + }, + { + "epoch": 0.84, + "grad_norm": 0.2784778983582345, + "learning_rate": 0.00013467644441621552, + "loss": 0.9477, + "step": 8736 + }, + { + "epoch": 0.84, + "grad_norm": 0.294746691747119, + "learning_rate": 0.00013466160605552836, + "loss": 1.1006, + "step": 8737 + }, + { + "epoch": 0.84, + "grad_norm": 0.2729555754078699, + "learning_rate": 0.00013464676682741275, + "loss": 1.165, + "step": 8738 + }, + { + "epoch": 0.84, + "grad_norm": 0.27914582732602733, + "learning_rate": 0.00013463192673223998, + "loss": 1.0035, + "step": 8739 + }, + { + "epoch": 0.84, + "grad_norm": 0.2759299004442835, + "learning_rate": 0.00013461708577038154, + "loss": 1.1369, + "step": 8740 + }, + { + "epoch": 0.84, + "grad_norm": 0.25997588040826375, + "learning_rate": 0.00013460224394220871, + "loss": 1.0008, + "step": 8741 + }, + { + "epoch": 0.84, + "grad_norm": 0.3006336454437912, + "learning_rate": 0.00013458740124809302, + "loss": 1.0918, + "step": 8742 + }, + { + "epoch": 0.84, + "grad_norm": 0.2865486476111894, + "learning_rate": 0.00013457255768840586, + "loss": 1.0318, + "step": 8743 + }, + { + "epoch": 0.84, + "grad_norm": 0.2610230590960148, + "learning_rate": 0.00013455771326351874, + "loss": 1.0608, + "step": 8744 + }, + { + "epoch": 0.84, + "grad_norm": 0.2747856141015647, + "learning_rate": 0.0001345428679738031, + "loss": 0.9236, + "step": 8745 + }, + { + "epoch": 0.84, + "grad_norm": 0.250840631667433, + "learning_rate": 0.0001345280218196305, + "loss": 1.076, + "step": 8746 + }, + { + "epoch": 0.84, + "grad_norm": 0.3427774684297268, + "learning_rate": 0.0001345131748013724, + "loss": 1.0931, + "step": 8747 + }, + { + "epoch": 0.84, + "grad_norm": 0.30101078325665653, + "learning_rate": 0.0001344983269194005, + "loss": 0.9994, + "step": 8748 + }, + { + "epoch": 0.84, + "grad_norm": 0.285861675215507, + "learning_rate": 0.00013448347817408623, + "loss": 1.0467, + "step": 8749 + }, + { + "epoch": 0.84, + "grad_norm": 0.28454224361070146, + "learning_rate": 0.00013446862856580127, + "loss": 1.0008, + "step": 8750 + }, + { + "epoch": 0.84, + "grad_norm": 0.33159442196308175, + "learning_rate": 0.0001344537780949172, + "loss": 1.206, + "step": 8751 + }, + { + "epoch": 0.84, + "grad_norm": 0.28094929027693405, + "learning_rate": 0.0001344389267618057, + "loss": 1.182, + "step": 8752 + }, + { + "epoch": 0.84, + "grad_norm": 0.2904531787454453, + "learning_rate": 0.0001344240745668384, + "loss": 0.982, + "step": 8753 + }, + { + "epoch": 0.84, + "grad_norm": 0.3124634413609586, + "learning_rate": 0.00013440922151038698, + "loss": 1.0623, + "step": 8754 + }, + { + "epoch": 0.84, + "grad_norm": 0.28809714187038504, + "learning_rate": 0.0001343943675928232, + "loss": 1.1198, + "step": 8755 + }, + { + "epoch": 0.84, + "grad_norm": 0.2873805808293868, + "learning_rate": 0.00013437951281451875, + "loss": 1.0509, + "step": 8756 + }, + { + "epoch": 0.84, + "grad_norm": 0.3130611893567796, + "learning_rate": 0.00013436465717584533, + "loss": 1.0459, + "step": 8757 + }, + { + "epoch": 0.84, + "grad_norm": 0.3002619940368619, + "learning_rate": 0.00013434980067717484, + "loss": 0.9741, + "step": 8758 + }, + { + "epoch": 0.84, + "grad_norm": 0.29994300371561994, + "learning_rate": 0.00013433494331887896, + "loss": 1.0306, + "step": 8759 + }, + { + "epoch": 0.84, + "grad_norm": 0.2709309546499614, + "learning_rate": 0.00013432008510132955, + "loss": 1.0472, + "step": 8760 + }, + { + "epoch": 0.84, + "grad_norm": 0.30275123120474723, + "learning_rate": 0.00013430522602489846, + "loss": 1.0934, + "step": 8761 + }, + { + "epoch": 0.84, + "grad_norm": 0.27267018063077797, + "learning_rate": 0.0001342903660899575, + "loss": 1.0372, + "step": 8762 + }, + { + "epoch": 0.84, + "grad_norm": 0.3179594835836974, + "learning_rate": 0.0001342755052968786, + "loss": 1.0563, + "step": 8763 + }, + { + "epoch": 0.84, + "grad_norm": 0.31426468902967697, + "learning_rate": 0.0001342606436460336, + "loss": 1.1932, + "step": 8764 + }, + { + "epoch": 0.84, + "grad_norm": 0.2715991486632539, + "learning_rate": 0.00013424578113779452, + "loss": 1.0489, + "step": 8765 + }, + { + "epoch": 0.84, + "grad_norm": 0.31900892380705087, + "learning_rate": 0.00013423091777253323, + "loss": 1.109, + "step": 8766 + }, + { + "epoch": 0.84, + "grad_norm": 0.2661697001151372, + "learning_rate": 0.0001342160535506217, + "loss": 1.0726, + "step": 8767 + }, + { + "epoch": 0.84, + "grad_norm": 0.29001722553751685, + "learning_rate": 0.00013420118847243191, + "loss": 1.0504, + "step": 8768 + }, + { + "epoch": 0.84, + "grad_norm": 0.26543562475469634, + "learning_rate": 0.0001341863225383359, + "loss": 1.0057, + "step": 8769 + }, + { + "epoch": 0.84, + "grad_norm": 0.30664308972274495, + "learning_rate": 0.0001341714557487057, + "loss": 1.0982, + "step": 8770 + }, + { + "epoch": 0.84, + "grad_norm": 0.29541897062886224, + "learning_rate": 0.0001341565881039133, + "loss": 1.0859, + "step": 8771 + }, + { + "epoch": 0.84, + "grad_norm": 0.280865811627392, + "learning_rate": 0.00013414171960433085, + "loss": 1.119, + "step": 8772 + }, + { + "epoch": 0.84, + "grad_norm": 0.24611125528299096, + "learning_rate": 0.00013412685025033038, + "loss": 1.1245, + "step": 8773 + }, + { + "epoch": 0.84, + "grad_norm": 0.31141735928913117, + "learning_rate": 0.00013411198004228405, + "loss": 1.0752, + "step": 8774 + }, + { + "epoch": 0.84, + "grad_norm": 0.23851136770012923, + "learning_rate": 0.000134097108980564, + "loss": 1.0644, + "step": 8775 + }, + { + "epoch": 0.84, + "grad_norm": 0.2371811186751139, + "learning_rate": 0.00013408223706554235, + "loss": 1.0813, + "step": 8776 + }, + { + "epoch": 0.84, + "grad_norm": 0.2987397006710603, + "learning_rate": 0.0001340673642975913, + "loss": 1.0765, + "step": 8777 + }, + { + "epoch": 0.84, + "grad_norm": 0.26831922729943924, + "learning_rate": 0.00013405249067708304, + "loss": 1.0317, + "step": 8778 + }, + { + "epoch": 0.84, + "grad_norm": 0.2899859345363733, + "learning_rate": 0.00013403761620438983, + "loss": 1.0402, + "step": 8779 + }, + { + "epoch": 0.84, + "grad_norm": 0.2878652206230178, + "learning_rate": 0.00013402274087988384, + "loss": 0.9933, + "step": 8780 + }, + { + "epoch": 0.84, + "grad_norm": 0.31702536400845066, + "learning_rate": 0.0001340078647039374, + "loss": 1.0253, + "step": 8781 + }, + { + "epoch": 0.84, + "grad_norm": 0.30001278045384056, + "learning_rate": 0.00013399298767692277, + "loss": 1.123, + "step": 8782 + }, + { + "epoch": 0.84, + "grad_norm": 0.2624717683492848, + "learning_rate": 0.00013397810979921227, + "loss": 1.021, + "step": 8783 + }, + { + "epoch": 0.84, + "grad_norm": 0.316080617053837, + "learning_rate": 0.0001339632310711782, + "loss": 1.133, + "step": 8784 + }, + { + "epoch": 0.84, + "grad_norm": 0.27598469027777045, + "learning_rate": 0.00013394835149319292, + "loss": 0.8538, + "step": 8785 + }, + { + "epoch": 0.84, + "grad_norm": 0.29532824662066726, + "learning_rate": 0.00013393347106562884, + "loss": 1.0311, + "step": 8786 + }, + { + "epoch": 0.84, + "grad_norm": 0.3293468526276591, + "learning_rate": 0.00013391858978885828, + "loss": 1.0673, + "step": 8787 + }, + { + "epoch": 0.84, + "grad_norm": 0.2643309414987189, + "learning_rate": 0.00013390370766325373, + "loss": 1.1561, + "step": 8788 + }, + { + "epoch": 0.84, + "grad_norm": 0.2763364535801901, + "learning_rate": 0.00013388882468918758, + "loss": 1.0627, + "step": 8789 + }, + { + "epoch": 0.84, + "grad_norm": 0.2862222208420642, + "learning_rate": 0.0001338739408670323, + "loss": 1.0606, + "step": 8790 + }, + { + "epoch": 0.84, + "grad_norm": 0.2927406980062334, + "learning_rate": 0.00013385905619716032, + "loss": 1.0987, + "step": 8791 + }, + { + "epoch": 0.84, + "grad_norm": 0.27073771357355814, + "learning_rate": 0.00013384417067994423, + "loss": 1.0864, + "step": 8792 + }, + { + "epoch": 0.84, + "grad_norm": 0.26681668946322573, + "learning_rate": 0.00013382928431575648, + "loss": 0.9944, + "step": 8793 + }, + { + "epoch": 0.84, + "grad_norm": 0.2872926853091687, + "learning_rate": 0.00013381439710496962, + "loss": 1.1929, + "step": 8794 + }, + { + "epoch": 0.84, + "grad_norm": 0.29473601888234835, + "learning_rate": 0.00013379950904795625, + "loss": 1.0424, + "step": 8795 + }, + { + "epoch": 0.84, + "grad_norm": 0.2767351991760329, + "learning_rate": 0.0001337846201450889, + "loss": 0.976, + "step": 8796 + }, + { + "epoch": 0.84, + "grad_norm": 0.31278846280951567, + "learning_rate": 0.00013376973039674019, + "loss": 1.0786, + "step": 8797 + }, + { + "epoch": 0.84, + "grad_norm": 0.29106202302244005, + "learning_rate": 0.00013375483980328275, + "loss": 1.0206, + "step": 8798 + }, + { + "epoch": 0.84, + "grad_norm": 0.3062557626049918, + "learning_rate": 0.00013373994836508925, + "loss": 1.0934, + "step": 8799 + }, + { + "epoch": 0.84, + "grad_norm": 0.2930477172648966, + "learning_rate": 0.00013372505608253235, + "loss": 1.0262, + "step": 8800 + }, + { + "epoch": 0.84, + "grad_norm": 0.34634712769514453, + "learning_rate": 0.0001337101629559847, + "loss": 0.9546, + "step": 8801 + }, + { + "epoch": 0.84, + "grad_norm": 0.2772018365006181, + "learning_rate": 0.00013369526898581902, + "loss": 1.0512, + "step": 8802 + }, + { + "epoch": 0.84, + "grad_norm": 0.2709787690909469, + "learning_rate": 0.00013368037417240807, + "loss": 0.994, + "step": 8803 + }, + { + "epoch": 0.84, + "grad_norm": 0.2814766172965834, + "learning_rate": 0.0001336654785161246, + "loss": 1.0845, + "step": 8804 + }, + { + "epoch": 0.84, + "grad_norm": 0.2853983711193067, + "learning_rate": 0.00013365058201734135, + "loss": 0.9669, + "step": 8805 + }, + { + "epoch": 0.84, + "grad_norm": 0.3256691908202577, + "learning_rate": 0.00013363568467643117, + "loss": 1.0751, + "step": 8806 + }, + { + "epoch": 0.84, + "grad_norm": 0.2878851657217033, + "learning_rate": 0.00013362078649376683, + "loss": 1.045, + "step": 8807 + }, + { + "epoch": 0.84, + "grad_norm": 0.30585377114209605, + "learning_rate": 0.00013360588746972118, + "loss": 1.0328, + "step": 8808 + }, + { + "epoch": 0.84, + "grad_norm": 0.2793018113302156, + "learning_rate": 0.00013359098760466707, + "loss": 1.123, + "step": 8809 + }, + { + "epoch": 0.84, + "grad_norm": 0.25457785398487776, + "learning_rate": 0.0001335760868989774, + "loss": 1.1305, + "step": 8810 + }, + { + "epoch": 0.84, + "grad_norm": 0.26931250840260823, + "learning_rate": 0.00013356118535302503, + "loss": 1.1142, + "step": 8811 + }, + { + "epoch": 0.84, + "grad_norm": 0.2779912568855338, + "learning_rate": 0.00013354628296718293, + "loss": 1.0774, + "step": 8812 + }, + { + "epoch": 0.84, + "grad_norm": 0.30698926954918604, + "learning_rate": 0.000133531379741824, + "loss": 1.1206, + "step": 8813 + }, + { + "epoch": 0.84, + "grad_norm": 0.28571250644527174, + "learning_rate": 0.0001335164756773212, + "loss": 1.1259, + "step": 8814 + }, + { + "epoch": 0.84, + "grad_norm": 0.30864009381335133, + "learning_rate": 0.00013350157077404755, + "loss": 0.9197, + "step": 8815 + }, + { + "epoch": 0.84, + "grad_norm": 0.2714746153198062, + "learning_rate": 0.00013348666503237603, + "loss": 0.9842, + "step": 8816 + }, + { + "epoch": 0.84, + "grad_norm": 0.28173495612303234, + "learning_rate": 0.0001334717584526797, + "loss": 0.9923, + "step": 8817 + }, + { + "epoch": 0.84, + "grad_norm": 0.3216688395468582, + "learning_rate": 0.00013345685103533154, + "loss": 1.0964, + "step": 8818 + }, + { + "epoch": 0.84, + "grad_norm": 0.2823984993319558, + "learning_rate": 0.00013344194278070467, + "loss": 1.0409, + "step": 8819 + }, + { + "epoch": 0.84, + "grad_norm": 0.257746108679655, + "learning_rate": 0.00013342703368917217, + "loss": 1.1301, + "step": 8820 + }, + { + "epoch": 0.84, + "grad_norm": 0.3051717861148401, + "learning_rate": 0.00013341212376110715, + "loss": 1.0672, + "step": 8821 + }, + { + "epoch": 0.84, + "grad_norm": 0.27852339454292335, + "learning_rate": 0.00013339721299688272, + "loss": 0.9949, + "step": 8822 + }, + { + "epoch": 0.84, + "grad_norm": 0.30905296931037346, + "learning_rate": 0.00013338230139687206, + "loss": 1.0062, + "step": 8823 + }, + { + "epoch": 0.84, + "grad_norm": 0.24655148826104478, + "learning_rate": 0.0001333673889614483, + "loss": 1.001, + "step": 8824 + }, + { + "epoch": 0.84, + "grad_norm": 0.30269701257489295, + "learning_rate": 0.00013335247569098467, + "loss": 1.0285, + "step": 8825 + }, + { + "epoch": 0.84, + "grad_norm": 0.29912297649204367, + "learning_rate": 0.00013333756158585437, + "loss": 1.1004, + "step": 8826 + }, + { + "epoch": 0.84, + "grad_norm": 0.29035499574876744, + "learning_rate": 0.00013332264664643067, + "loss": 1.053, + "step": 8827 + }, + { + "epoch": 0.84, + "grad_norm": 0.2869996333200353, + "learning_rate": 0.00013330773087308676, + "loss": 1.0357, + "step": 8828 + }, + { + "epoch": 0.84, + "grad_norm": 0.29232380763052546, + "learning_rate": 0.00013329281426619597, + "loss": 1.0288, + "step": 8829 + }, + { + "epoch": 0.84, + "grad_norm": 0.2495577755639189, + "learning_rate": 0.0001332778968261316, + "loss": 1.0611, + "step": 8830 + }, + { + "epoch": 0.84, + "grad_norm": 0.2979321424581524, + "learning_rate": 0.0001332629785532669, + "loss": 0.9719, + "step": 8831 + }, + { + "epoch": 0.84, + "grad_norm": 0.2821090849292814, + "learning_rate": 0.0001332480594479753, + "loss": 1.1135, + "step": 8832 + }, + { + "epoch": 0.85, + "grad_norm": 0.3016300483432541, + "learning_rate": 0.0001332331395106301, + "loss": 1.1395, + "step": 8833 + }, + { + "epoch": 0.85, + "grad_norm": 0.3017945019892449, + "learning_rate": 0.00013321821874160472, + "loss": 0.9148, + "step": 8834 + }, + { + "epoch": 0.85, + "grad_norm": 0.25675693104083414, + "learning_rate": 0.00013320329714127248, + "loss": 1.0208, + "step": 8835 + }, + { + "epoch": 0.85, + "grad_norm": 0.30879551976248903, + "learning_rate": 0.0001331883747100069, + "loss": 1.1158, + "step": 8836 + }, + { + "epoch": 0.85, + "grad_norm": 0.2910089252086533, + "learning_rate": 0.0001331734514481814, + "loss": 1.0531, + "step": 8837 + }, + { + "epoch": 0.85, + "grad_norm": 0.2800038334110076, + "learning_rate": 0.0001331585273561694, + "loss": 1.0722, + "step": 8838 + }, + { + "epoch": 0.85, + "grad_norm": 0.29386882904498585, + "learning_rate": 0.00013314360243434442, + "loss": 1.0914, + "step": 8839 + }, + { + "epoch": 0.85, + "grad_norm": 0.32822389242666516, + "learning_rate": 0.00013312867668307998, + "loss": 1.0664, + "step": 8840 + }, + { + "epoch": 0.85, + "grad_norm": 0.28150462602342563, + "learning_rate": 0.00013311375010274958, + "loss": 1.0505, + "step": 8841 + }, + { + "epoch": 0.85, + "grad_norm": 0.28190761168490724, + "learning_rate": 0.00013309882269372676, + "loss": 1.0243, + "step": 8842 + }, + { + "epoch": 0.85, + "grad_norm": 0.324588959170944, + "learning_rate": 0.00013308389445638508, + "loss": 1.1538, + "step": 8843 + }, + { + "epoch": 0.85, + "grad_norm": 0.313116228461298, + "learning_rate": 0.0001330689653910982, + "loss": 1.0932, + "step": 8844 + }, + { + "epoch": 0.85, + "grad_norm": 0.3028011762250344, + "learning_rate": 0.00013305403549823962, + "loss": 1.0032, + "step": 8845 + }, + { + "epoch": 0.85, + "grad_norm": 0.2679913864423184, + "learning_rate": 0.00013303910477818306, + "loss": 1.0489, + "step": 8846 + }, + { + "epoch": 0.85, + "grad_norm": 0.23253134929888603, + "learning_rate": 0.00013302417323130214, + "loss": 1.0339, + "step": 8847 + }, + { + "epoch": 0.85, + "grad_norm": 0.32940341276625706, + "learning_rate": 0.00013300924085797052, + "loss": 1.0542, + "step": 8848 + }, + { + "epoch": 0.85, + "grad_norm": 0.32063577062079657, + "learning_rate": 0.0001329943076585619, + "loss": 1.029, + "step": 8849 + }, + { + "epoch": 0.85, + "grad_norm": 0.30995947199370877, + "learning_rate": 0.00013297937363345, + "loss": 1.0355, + "step": 8850 + }, + { + "epoch": 0.85, + "grad_norm": 0.25561792258910343, + "learning_rate": 0.00013296443878300858, + "loss": 1.0827, + "step": 8851 + }, + { + "epoch": 0.85, + "grad_norm": 0.26801538052648793, + "learning_rate": 0.0001329495031076113, + "loss": 1.0586, + "step": 8852 + }, + { + "epoch": 0.85, + "grad_norm": 0.2728699949209647, + "learning_rate": 0.00013293456660763204, + "loss": 1.033, + "step": 8853 + }, + { + "epoch": 0.85, + "grad_norm": 0.2807693199525417, + "learning_rate": 0.00013291962928344456, + "loss": 1.0447, + "step": 8854 + }, + { + "epoch": 0.85, + "grad_norm": 0.30144064538223736, + "learning_rate": 0.00013290469113542264, + "loss": 1.0855, + "step": 8855 + }, + { + "epoch": 0.85, + "grad_norm": 0.2871386350695731, + "learning_rate": 0.00013288975216394015, + "loss": 1.045, + "step": 8856 + }, + { + "epoch": 0.85, + "grad_norm": 0.2783823690348202, + "learning_rate": 0.00013287481236937094, + "loss": 1.0184, + "step": 8857 + }, + { + "epoch": 0.85, + "grad_norm": 0.28082328661100076, + "learning_rate": 0.0001328598717520889, + "loss": 1.11, + "step": 8858 + }, + { + "epoch": 0.85, + "grad_norm": 0.2537605984201099, + "learning_rate": 0.00013284493031246792, + "loss": 1.0461, + "step": 8859 + }, + { + "epoch": 0.85, + "grad_norm": 0.29693120122561123, + "learning_rate": 0.00013282998805088191, + "loss": 0.9376, + "step": 8860 + }, + { + "epoch": 0.85, + "grad_norm": 0.2736943955631886, + "learning_rate": 0.0001328150449677048, + "loss": 1.1058, + "step": 8861 + }, + { + "epoch": 0.85, + "grad_norm": 0.3420821362503677, + "learning_rate": 0.00013280010106331058, + "loss": 1.1671, + "step": 8862 + }, + { + "epoch": 0.85, + "grad_norm": 0.2989694017306944, + "learning_rate": 0.00013278515633807322, + "loss": 1.1161, + "step": 8863 + }, + { + "epoch": 0.85, + "grad_norm": 0.24965561624077762, + "learning_rate": 0.00013277021079236673, + "loss": 0.9428, + "step": 8864 + }, + { + "epoch": 0.85, + "grad_norm": 0.27560232204493745, + "learning_rate": 0.0001327552644265651, + "loss": 1.0965, + "step": 8865 + }, + { + "epoch": 0.85, + "grad_norm": 0.31916956119622164, + "learning_rate": 0.0001327403172410424, + "loss": 1.1617, + "step": 8866 + }, + { + "epoch": 0.85, + "grad_norm": 0.26752857028568255, + "learning_rate": 0.00013272536923617266, + "loss": 0.9989, + "step": 8867 + }, + { + "epoch": 0.85, + "grad_norm": 0.2883358139977724, + "learning_rate": 0.00013271042041233003, + "loss": 0.9731, + "step": 8868 + }, + { + "epoch": 0.85, + "grad_norm": 0.33884977680481115, + "learning_rate": 0.00013269547076988854, + "loss": 1.1386, + "step": 8869 + }, + { + "epoch": 0.85, + "grad_norm": 0.32546200445254864, + "learning_rate": 0.00013268052030922237, + "loss": 0.9609, + "step": 8870 + }, + { + "epoch": 0.85, + "grad_norm": 0.2826007467613128, + "learning_rate": 0.00013266556903070563, + "loss": 1.0233, + "step": 8871 + }, + { + "epoch": 0.85, + "grad_norm": 0.2778261537338506, + "learning_rate": 0.00013265061693471246, + "loss": 1.0323, + "step": 8872 + }, + { + "epoch": 0.85, + "grad_norm": 0.26913753452272493, + "learning_rate": 0.00013263566402161713, + "loss": 0.9943, + "step": 8873 + }, + { + "epoch": 0.85, + "grad_norm": 0.26448807283032155, + "learning_rate": 0.0001326207102917938, + "loss": 1.0572, + "step": 8874 + }, + { + "epoch": 0.85, + "grad_norm": 0.3046847456760344, + "learning_rate": 0.00013260575574561666, + "loss": 1.0784, + "step": 8875 + }, + { + "epoch": 0.85, + "grad_norm": 0.297455531468288, + "learning_rate": 0.00013259080038345998, + "loss": 1.1246, + "step": 8876 + }, + { + "epoch": 0.85, + "grad_norm": 0.31150901502574146, + "learning_rate": 0.0001325758442056981, + "loss": 1.1541, + "step": 8877 + }, + { + "epoch": 0.85, + "grad_norm": 0.32499878866127807, + "learning_rate": 0.00013256088721270518, + "loss": 1.1098, + "step": 8878 + }, + { + "epoch": 0.85, + "grad_norm": 0.30353184520322335, + "learning_rate": 0.00013254592940485562, + "loss": 1.1114, + "step": 8879 + }, + { + "epoch": 0.85, + "grad_norm": 0.2765014106506231, + "learning_rate": 0.00013253097078252374, + "loss": 1.0607, + "step": 8880 + }, + { + "epoch": 0.85, + "grad_norm": 0.3232413136845846, + "learning_rate": 0.00013251601134608385, + "loss": 1.0267, + "step": 8881 + }, + { + "epoch": 0.85, + "grad_norm": 0.2576596041228671, + "learning_rate": 0.00013250105109591034, + "loss": 1.1395, + "step": 8882 + }, + { + "epoch": 0.85, + "grad_norm": 0.299087343065221, + "learning_rate": 0.00013248609003237762, + "loss": 1.0855, + "step": 8883 + }, + { + "epoch": 0.85, + "grad_norm": 0.31373524675219927, + "learning_rate": 0.00013247112815586008, + "loss": 1.0168, + "step": 8884 + }, + { + "epoch": 0.85, + "grad_norm": 0.29307123636308, + "learning_rate": 0.00013245616546673212, + "loss": 1.1391, + "step": 8885 + }, + { + "epoch": 0.85, + "grad_norm": 0.2892716141501793, + "learning_rate": 0.00013244120196536825, + "loss": 1.0368, + "step": 8886 + }, + { + "epoch": 0.85, + "grad_norm": 0.30030230235650646, + "learning_rate": 0.0001324262376521429, + "loss": 0.9953, + "step": 8887 + }, + { + "epoch": 0.85, + "grad_norm": 0.2958420718954474, + "learning_rate": 0.00013241127252743056, + "loss": 1.0635, + "step": 8888 + }, + { + "epoch": 0.85, + "grad_norm": 0.30426355904923175, + "learning_rate": 0.00013239630659160577, + "loss": 0.9941, + "step": 8889 + }, + { + "epoch": 0.85, + "grad_norm": 0.26204874988846405, + "learning_rate": 0.00013238133984504305, + "loss": 1.0014, + "step": 8890 + }, + { + "epoch": 0.85, + "grad_norm": 0.2808476806284864, + "learning_rate": 0.00013236637228811695, + "loss": 0.9811, + "step": 8891 + }, + { + "epoch": 0.85, + "grad_norm": 0.2613165358148853, + "learning_rate": 0.00013235140392120202, + "loss": 1.0464, + "step": 8892 + }, + { + "epoch": 0.85, + "grad_norm": 0.2614569034666025, + "learning_rate": 0.0001323364347446729, + "loss": 1.0346, + "step": 8893 + }, + { + "epoch": 0.85, + "grad_norm": 0.3048725191683244, + "learning_rate": 0.00013232146475890415, + "loss": 1.1072, + "step": 8894 + }, + { + "epoch": 0.85, + "grad_norm": 0.3051855377395641, + "learning_rate": 0.00013230649396427048, + "loss": 1.2126, + "step": 8895 + }, + { + "epoch": 0.85, + "grad_norm": 0.2885176846905764, + "learning_rate": 0.00013229152236114646, + "loss": 1.1485, + "step": 8896 + }, + { + "epoch": 0.85, + "grad_norm": 0.27063874711622693, + "learning_rate": 0.0001322765499499068, + "loss": 1.0701, + "step": 8897 + }, + { + "epoch": 0.85, + "grad_norm": 0.2650234430261914, + "learning_rate": 0.0001322615767309262, + "loss": 0.9788, + "step": 8898 + }, + { + "epoch": 0.85, + "grad_norm": 0.3332394548518117, + "learning_rate": 0.00013224660270457937, + "loss": 1.0501, + "step": 8899 + }, + { + "epoch": 0.85, + "grad_norm": 0.2622983793530935, + "learning_rate": 0.00013223162787124104, + "loss": 1.0524, + "step": 8900 + }, + { + "epoch": 0.85, + "grad_norm": 0.28082192393839644, + "learning_rate": 0.00013221665223128593, + "loss": 1.2141, + "step": 8901 + }, + { + "epoch": 0.85, + "grad_norm": 0.30805730414546934, + "learning_rate": 0.00013220167578508892, + "loss": 1.073, + "step": 8902 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984108714398881, + "learning_rate": 0.00013218669853302467, + "loss": 1.075, + "step": 8903 + }, + { + "epoch": 0.85, + "grad_norm": 0.24970867197130167, + "learning_rate": 0.0001321717204754681, + "loss": 0.9129, + "step": 8904 + }, + { + "epoch": 0.85, + "grad_norm": 0.2976975973384336, + "learning_rate": 0.00013215674161279402, + "loss": 1.0718, + "step": 8905 + }, + { + "epoch": 0.85, + "grad_norm": 0.26473164124738907, + "learning_rate": 0.00013214176194537722, + "loss": 1.1786, + "step": 8906 + }, + { + "epoch": 0.85, + "grad_norm": 0.28433983890808473, + "learning_rate": 0.00013212678147359267, + "loss": 1.1131, + "step": 8907 + }, + { + "epoch": 0.85, + "grad_norm": 0.268486990998593, + "learning_rate": 0.00013211180019781518, + "loss": 1.0515, + "step": 8908 + }, + { + "epoch": 0.85, + "grad_norm": 0.2737650744067927, + "learning_rate": 0.00013209681811841972, + "loss": 1.0995, + "step": 8909 + }, + { + "epoch": 0.85, + "grad_norm": 0.30007514688621423, + "learning_rate": 0.00013208183523578124, + "loss": 1.0624, + "step": 8910 + }, + { + "epoch": 0.85, + "grad_norm": 0.26052045873934276, + "learning_rate": 0.00013206685155027465, + "loss": 1.1553, + "step": 8911 + }, + { + "epoch": 0.85, + "grad_norm": 0.27092778505487736, + "learning_rate": 0.00013205186706227498, + "loss": 1.1842, + "step": 8912 + }, + { + "epoch": 0.85, + "grad_norm": 0.3258021626325885, + "learning_rate": 0.00013203688177215714, + "loss": 1.081, + "step": 8913 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984371253963151, + "learning_rate": 0.00013202189568029625, + "loss": 1.0585, + "step": 8914 + }, + { + "epoch": 0.85, + "grad_norm": 0.28356401437541234, + "learning_rate": 0.00013200690878706724, + "loss": 0.9656, + "step": 8915 + }, + { + "epoch": 0.85, + "grad_norm": 0.27533653430669, + "learning_rate": 0.00013199192109284526, + "loss": 1.0106, + "step": 8916 + }, + { + "epoch": 0.85, + "grad_norm": 0.28571972152442254, + "learning_rate": 0.00013197693259800534, + "loss": 0.9763, + "step": 8917 + }, + { + "epoch": 0.85, + "grad_norm": 0.2560488227643934, + "learning_rate": 0.0001319619433029226, + "loss": 1.182, + "step": 8918 + }, + { + "epoch": 0.85, + "grad_norm": 0.2984853514513387, + "learning_rate": 0.00013194695320797214, + "loss": 1.1465, + "step": 8919 + }, + { + "epoch": 0.85, + "grad_norm": 0.25687016849701755, + "learning_rate": 0.00013193196231352905, + "loss": 0.9482, + "step": 8920 + }, + { + "epoch": 0.85, + "grad_norm": 0.31648422763460987, + "learning_rate": 0.00013191697061996858, + "loss": 1.0639, + "step": 8921 + }, + { + "epoch": 0.85, + "grad_norm": 0.2499110549279051, + "learning_rate": 0.00013190197812766588, + "loss": 1.0542, + "step": 8922 + }, + { + "epoch": 0.85, + "grad_norm": 0.26107365347955813, + "learning_rate": 0.00013188698483699608, + "loss": 1.0763, + "step": 8923 + }, + { + "epoch": 0.85, + "grad_norm": 0.2890340873975123, + "learning_rate": 0.00013187199074833449, + "loss": 1.1517, + "step": 8924 + }, + { + "epoch": 0.85, + "grad_norm": 0.26683546682272963, + "learning_rate": 0.00013185699586205628, + "loss": 1.1019, + "step": 8925 + }, + { + "epoch": 0.85, + "grad_norm": 0.2751530786283772, + "learning_rate": 0.0001318420001785367, + "loss": 1.0552, + "step": 8926 + }, + { + "epoch": 0.85, + "grad_norm": 0.2805940758160267, + "learning_rate": 0.00013182700369815108, + "loss": 1.046, + "step": 8927 + }, + { + "epoch": 0.85, + "grad_norm": 0.2581306634911059, + "learning_rate": 0.00013181200642127468, + "loss": 0.9718, + "step": 8928 + }, + { + "epoch": 0.85, + "grad_norm": 0.2699851519716227, + "learning_rate": 0.00013179700834828282, + "loss": 1.1284, + "step": 8929 + }, + { + "epoch": 0.85, + "grad_norm": 0.2735704940152805, + "learning_rate": 0.00013178200947955087, + "loss": 0.9873, + "step": 8930 + }, + { + "epoch": 0.85, + "grad_norm": 0.29262357118516796, + "learning_rate": 0.00013176700981545414, + "loss": 1.052, + "step": 8931 + }, + { + "epoch": 0.85, + "grad_norm": 0.3021541436530508, + "learning_rate": 0.00013175200935636804, + "loss": 0.9968, + "step": 8932 + }, + { + "epoch": 0.85, + "grad_norm": 0.2994688515381643, + "learning_rate": 0.0001317370081026679, + "loss": 1.0423, + "step": 8933 + }, + { + "epoch": 0.85, + "grad_norm": 0.21622128251751344, + "learning_rate": 0.00013172200605472925, + "loss": 1.0596, + "step": 8934 + }, + { + "epoch": 0.85, + "grad_norm": 0.27357520538102, + "learning_rate": 0.00013170700321292746, + "loss": 1.0868, + "step": 8935 + }, + { + "epoch": 0.85, + "grad_norm": 0.32314314647789527, + "learning_rate": 0.00013169199957763797, + "loss": 1.1866, + "step": 8936 + }, + { + "epoch": 0.86, + "grad_norm": 0.27729229797866944, + "learning_rate": 0.00013167699514923624, + "loss": 1.0354, + "step": 8937 + }, + { + "epoch": 0.86, + "grad_norm": 0.2689304675066156, + "learning_rate": 0.00013166198992809784, + "loss": 1.118, + "step": 8938 + }, + { + "epoch": 0.86, + "grad_norm": 0.281878748265055, + "learning_rate": 0.00013164698391459823, + "loss": 1.0886, + "step": 8939 + }, + { + "epoch": 0.86, + "grad_norm": 0.31972214001347354, + "learning_rate": 0.00013163197710911294, + "loss": 1.0638, + "step": 8940 + }, + { + "epoch": 0.86, + "grad_norm": 0.30752537186921586, + "learning_rate": 0.00013161696951201755, + "loss": 1.0808, + "step": 8941 + }, + { + "epoch": 0.86, + "grad_norm": 0.3169449014565689, + "learning_rate": 0.00013160196112368765, + "loss": 1.0815, + "step": 8942 + }, + { + "epoch": 0.86, + "grad_norm": 0.279353465633034, + "learning_rate": 0.00013158695194449878, + "loss": 0.9856, + "step": 8943 + }, + { + "epoch": 0.86, + "grad_norm": 0.2506893330731972, + "learning_rate": 0.00013157194197482662, + "loss": 1.0427, + "step": 8944 + }, + { + "epoch": 0.86, + "grad_norm": 0.28379173348711845, + "learning_rate": 0.00013155693121504676, + "loss": 1.077, + "step": 8945 + }, + { + "epoch": 0.86, + "grad_norm": 0.35906069380401406, + "learning_rate": 0.00013154191966553488, + "loss": 1.0853, + "step": 8946 + }, + { + "epoch": 0.86, + "grad_norm": 0.2682913120576577, + "learning_rate": 0.0001315269073266666, + "loss": 1.011, + "step": 8947 + }, + { + "epoch": 0.86, + "grad_norm": 0.2686741445506597, + "learning_rate": 0.00013151189419881767, + "loss": 1.0058, + "step": 8948 + }, + { + "epoch": 0.86, + "grad_norm": 0.30034831308239196, + "learning_rate": 0.00013149688028236378, + "loss": 0.999, + "step": 8949 + }, + { + "epoch": 0.86, + "grad_norm": 0.2785651237874469, + "learning_rate": 0.00013148186557768065, + "loss": 1.0743, + "step": 8950 + }, + { + "epoch": 0.86, + "grad_norm": 0.26665224113450825, + "learning_rate": 0.00013146685008514405, + "loss": 1.1498, + "step": 8951 + }, + { + "epoch": 0.86, + "grad_norm": 0.2767238879256942, + "learning_rate": 0.00013145183380512977, + "loss": 0.9933, + "step": 8952 + }, + { + "epoch": 0.86, + "grad_norm": 0.27676417086874466, + "learning_rate": 0.0001314368167380136, + "loss": 1.048, + "step": 8953 + }, + { + "epoch": 0.86, + "grad_norm": 0.26331762630608485, + "learning_rate": 0.00013142179888417127, + "loss": 1.0725, + "step": 8954 + }, + { + "epoch": 0.86, + "grad_norm": 0.2744742895646326, + "learning_rate": 0.00013140678024397876, + "loss": 1.0698, + "step": 8955 + }, + { + "epoch": 0.86, + "grad_norm": 0.3284594696993571, + "learning_rate": 0.00013139176081781176, + "loss": 1.1246, + "step": 8956 + }, + { + "epoch": 0.86, + "grad_norm": 0.25822096562028224, + "learning_rate": 0.00013137674060604627, + "loss": 0.9168, + "step": 8957 + }, + { + "epoch": 0.86, + "grad_norm": 0.26107725204700727, + "learning_rate": 0.0001313617196090581, + "loss": 1.2021, + "step": 8958 + }, + { + "epoch": 0.86, + "grad_norm": 0.3179530192263277, + "learning_rate": 0.0001313466978272232, + "loss": 1.0834, + "step": 8959 + }, + { + "epoch": 0.86, + "grad_norm": 0.2754800386668831, + "learning_rate": 0.00013133167526091746, + "loss": 1.0735, + "step": 8960 + }, + { + "epoch": 0.86, + "grad_norm": 0.28969505946971996, + "learning_rate": 0.00013131665191051686, + "loss": 1.1824, + "step": 8961 + }, + { + "epoch": 0.86, + "grad_norm": 0.3050269917318578, + "learning_rate": 0.0001313016277763974, + "loss": 1.1192, + "step": 8962 + }, + { + "epoch": 0.86, + "grad_norm": 0.28997353889525096, + "learning_rate": 0.00013128660285893502, + "loss": 1.0988, + "step": 8963 + }, + { + "epoch": 0.86, + "grad_norm": 0.2639957610030003, + "learning_rate": 0.00013127157715850572, + "loss": 1.095, + "step": 8964 + }, + { + "epoch": 0.86, + "grad_norm": 0.25243571206061105, + "learning_rate": 0.00013125655067548555, + "loss": 1.0326, + "step": 8965 + }, + { + "epoch": 0.86, + "grad_norm": 0.27769176135258244, + "learning_rate": 0.00013124152341025057, + "loss": 1.0556, + "step": 8966 + }, + { + "epoch": 0.86, + "grad_norm": 0.31119684825896593, + "learning_rate": 0.00013122649536317682, + "loss": 1.0474, + "step": 8967 + }, + { + "epoch": 0.86, + "grad_norm": 0.2899032223624125, + "learning_rate": 0.0001312114665346404, + "loss": 1.1379, + "step": 8968 + }, + { + "epoch": 0.86, + "grad_norm": 0.26474621772474893, + "learning_rate": 0.00013119643692501742, + "loss": 0.9737, + "step": 8969 + }, + { + "epoch": 0.86, + "grad_norm": 0.296070787125954, + "learning_rate": 0.000131181406534684, + "loss": 1.1303, + "step": 8970 + }, + { + "epoch": 0.86, + "grad_norm": 0.28196083416898055, + "learning_rate": 0.00013116637536401626, + "loss": 0.9212, + "step": 8971 + }, + { + "epoch": 0.86, + "grad_norm": 0.33777731349783074, + "learning_rate": 0.00013115134341339042, + "loss": 1.1114, + "step": 8972 + }, + { + "epoch": 0.86, + "grad_norm": 0.2980471209168762, + "learning_rate": 0.00013113631068318262, + "loss": 1.0419, + "step": 8973 + }, + { + "epoch": 0.86, + "grad_norm": 0.34067752191686806, + "learning_rate": 0.00013112127717376906, + "loss": 1.0489, + "step": 8974 + }, + { + "epoch": 0.86, + "grad_norm": 0.2857921528011418, + "learning_rate": 0.000131106242885526, + "loss": 1.095, + "step": 8975 + }, + { + "epoch": 0.86, + "grad_norm": 0.30595650571474364, + "learning_rate": 0.0001310912078188297, + "loss": 1.0579, + "step": 8976 + }, + { + "epoch": 0.86, + "grad_norm": 0.3113974291276129, + "learning_rate": 0.00013107617197405632, + "loss": 1.1008, + "step": 8977 + }, + { + "epoch": 0.86, + "grad_norm": 0.2835220329748002, + "learning_rate": 0.00013106113535158223, + "loss": 1.1758, + "step": 8978 + }, + { + "epoch": 0.86, + "grad_norm": 0.30573600648132765, + "learning_rate": 0.00013104609795178373, + "loss": 1.0587, + "step": 8979 + }, + { + "epoch": 0.86, + "grad_norm": 0.2724899362196218, + "learning_rate": 0.00013103105977503712, + "loss": 1.0202, + "step": 8980 + }, + { + "epoch": 0.86, + "grad_norm": 0.28699936186440506, + "learning_rate": 0.0001310160208217187, + "loss": 1.1057, + "step": 8981 + }, + { + "epoch": 0.86, + "grad_norm": 0.26272510332229243, + "learning_rate": 0.00013100098109220486, + "loss": 1.0776, + "step": 8982 + }, + { + "epoch": 0.86, + "grad_norm": 0.27023197555661166, + "learning_rate": 0.00013098594058687203, + "loss": 1.1075, + "step": 8983 + }, + { + "epoch": 0.86, + "grad_norm": 0.27849525197693725, + "learning_rate": 0.00013097089930609653, + "loss": 1.0556, + "step": 8984 + }, + { + "epoch": 0.86, + "grad_norm": 0.2813628673155407, + "learning_rate": 0.00013095585725025481, + "loss": 0.9954, + "step": 8985 + }, + { + "epoch": 0.86, + "grad_norm": 0.23165959095448188, + "learning_rate": 0.00013094081441972333, + "loss": 1.0472, + "step": 8986 + }, + { + "epoch": 0.86, + "grad_norm": 0.2865879802186648, + "learning_rate": 0.0001309257708148785, + "loss": 1.0757, + "step": 8987 + }, + { + "epoch": 0.86, + "grad_norm": 0.28003705635045895, + "learning_rate": 0.00013091072643609683, + "loss": 1.027, + "step": 8988 + }, + { + "epoch": 0.86, + "grad_norm": 0.2518027494418857, + "learning_rate": 0.0001308956812837548, + "loss": 1.074, + "step": 8989 + }, + { + "epoch": 0.86, + "grad_norm": 0.2578937586135656, + "learning_rate": 0.0001308806353582289, + "loss": 1.0831, + "step": 8990 + }, + { + "epoch": 0.86, + "grad_norm": 0.27260063912350424, + "learning_rate": 0.00013086558865989576, + "loss": 1.0183, + "step": 8991 + }, + { + "epoch": 0.86, + "grad_norm": 0.29385408837160987, + "learning_rate": 0.0001308505411891318, + "loss": 1.0944, + "step": 8992 + }, + { + "epoch": 0.86, + "grad_norm": 0.293490265358121, + "learning_rate": 0.0001308354929463137, + "loss": 1.0724, + "step": 8993 + }, + { + "epoch": 0.86, + "grad_norm": 0.2822276360093321, + "learning_rate": 0.00013082044393181798, + "loss": 1.0708, + "step": 8994 + }, + { + "epoch": 0.86, + "grad_norm": 0.31600027722892315, + "learning_rate": 0.0001308053941460213, + "loss": 1.036, + "step": 8995 + }, + { + "epoch": 0.86, + "grad_norm": 0.31974027288364265, + "learning_rate": 0.00013079034358930028, + "loss": 1.1244, + "step": 8996 + }, + { + "epoch": 0.86, + "grad_norm": 0.2636215501400869, + "learning_rate": 0.00013077529226203155, + "loss": 0.9505, + "step": 8997 + }, + { + "epoch": 0.86, + "grad_norm": 0.2732300022362931, + "learning_rate": 0.00013076024016459177, + "loss": 1.0561, + "step": 8998 + }, + { + "epoch": 0.86, + "grad_norm": 0.33093910848931535, + "learning_rate": 0.0001307451872973577, + "loss": 1.0408, + "step": 8999 + }, + { + "epoch": 0.86, + "grad_norm": 0.2880821403188908, + "learning_rate": 0.00013073013366070595, + "loss": 1.052, + "step": 9000 + }, + { + "epoch": 0.86, + "grad_norm": 0.3061067194549959, + "learning_rate": 0.0001307150792550133, + "loss": 1.0279, + "step": 9001 + }, + { + "epoch": 0.86, + "grad_norm": 0.2723856651427697, + "learning_rate": 0.0001307000240806565, + "loss": 1.0259, + "step": 9002 + }, + { + "epoch": 0.86, + "grad_norm": 0.2997832873455384, + "learning_rate": 0.0001306849681380123, + "loss": 1.0171, + "step": 9003 + }, + { + "epoch": 0.86, + "grad_norm": 0.268324862068108, + "learning_rate": 0.00013066991142745746, + "loss": 1.0015, + "step": 9004 + }, + { + "epoch": 0.86, + "grad_norm": 0.3313758049625676, + "learning_rate": 0.00013065485394936886, + "loss": 1.0488, + "step": 9005 + }, + { + "epoch": 0.86, + "grad_norm": 0.2840867112984875, + "learning_rate": 0.00013063979570412324, + "loss": 1.0547, + "step": 9006 + }, + { + "epoch": 0.86, + "grad_norm": 0.2859672508903575, + "learning_rate": 0.0001306247366920975, + "loss": 1.0588, + "step": 9007 + }, + { + "epoch": 0.86, + "grad_norm": 0.29147422535829376, + "learning_rate": 0.00013060967691366844, + "loss": 1.126, + "step": 9008 + }, + { + "epoch": 0.86, + "grad_norm": 0.2736559351409344, + "learning_rate": 0.00013059461636921298, + "loss": 0.9615, + "step": 9009 + }, + { + "epoch": 0.86, + "grad_norm": 0.2772933874203004, + "learning_rate": 0.00013057955505910805, + "loss": 1.085, + "step": 9010 + }, + { + "epoch": 0.86, + "grad_norm": 0.29810412332818575, + "learning_rate": 0.00013056449298373053, + "loss": 1.0038, + "step": 9011 + }, + { + "epoch": 0.86, + "grad_norm": 0.2844185615999324, + "learning_rate": 0.00013054943014345732, + "loss": 1.1034, + "step": 9012 + }, + { + "epoch": 0.86, + "grad_norm": 0.3198834730415027, + "learning_rate": 0.0001305343665386655, + "loss": 1.0527, + "step": 9013 + }, + { + "epoch": 0.86, + "grad_norm": 0.29395624791864494, + "learning_rate": 0.00013051930216973192, + "loss": 1.1132, + "step": 9014 + }, + { + "epoch": 0.86, + "grad_norm": 0.28969238208023507, + "learning_rate": 0.0001305042370370336, + "loss": 0.8879, + "step": 9015 + }, + { + "epoch": 0.86, + "grad_norm": 0.2718247193165485, + "learning_rate": 0.0001304891711409476, + "loss": 0.9514, + "step": 9016 + }, + { + "epoch": 0.86, + "grad_norm": 0.2736779348162901, + "learning_rate": 0.00013047410448185096, + "loss": 1.0625, + "step": 9017 + }, + { + "epoch": 0.86, + "grad_norm": 0.3190676200490636, + "learning_rate": 0.00013045903706012066, + "loss": 1.1119, + "step": 9018 + }, + { + "epoch": 0.86, + "grad_norm": 0.2645211188163359, + "learning_rate": 0.00013044396887613383, + "loss": 1.0451, + "step": 9019 + }, + { + "epoch": 0.86, + "grad_norm": 0.27409629322747, + "learning_rate": 0.00013042889993026757, + "loss": 1.0542, + "step": 9020 + }, + { + "epoch": 0.86, + "grad_norm": 0.3057021252154314, + "learning_rate": 0.00013041383022289893, + "loss": 0.9845, + "step": 9021 + }, + { + "epoch": 0.86, + "grad_norm": 0.31255114430184533, + "learning_rate": 0.00013039875975440508, + "loss": 0.923, + "step": 9022 + }, + { + "epoch": 0.86, + "grad_norm": 0.28633949394067004, + "learning_rate": 0.00013038368852516318, + "loss": 1.0501, + "step": 9023 + }, + { + "epoch": 0.86, + "grad_norm": 0.2635390208946298, + "learning_rate": 0.00013036861653555038, + "loss": 1.0946, + "step": 9024 + }, + { + "epoch": 0.86, + "grad_norm": 0.2938192444403693, + "learning_rate": 0.00013035354378594384, + "loss": 1.0054, + "step": 9025 + }, + { + "epoch": 0.86, + "grad_norm": 0.30536276310624666, + "learning_rate": 0.0001303384702767208, + "loss": 1.0497, + "step": 9026 + }, + { + "epoch": 0.86, + "grad_norm": 0.2790718788746024, + "learning_rate": 0.0001303233960082585, + "loss": 1.0417, + "step": 9027 + }, + { + "epoch": 0.86, + "grad_norm": 0.2816516990804407, + "learning_rate": 0.00013030832098093412, + "loss": 0.9402, + "step": 9028 + }, + { + "epoch": 0.86, + "grad_norm": 0.31401103333965236, + "learning_rate": 0.00013029324519512497, + "loss": 0.9892, + "step": 9029 + }, + { + "epoch": 0.86, + "grad_norm": 0.26148363698940397, + "learning_rate": 0.00013027816865120834, + "loss": 1.0242, + "step": 9030 + }, + { + "epoch": 0.86, + "grad_norm": 0.2998596222624002, + "learning_rate": 0.0001302630913495615, + "loss": 1.0302, + "step": 9031 + }, + { + "epoch": 0.86, + "grad_norm": 0.29176595806240635, + "learning_rate": 0.00013024801329056178, + "loss": 1.2279, + "step": 9032 + }, + { + "epoch": 0.86, + "grad_norm": 0.3344098434780136, + "learning_rate": 0.00013023293447458648, + "loss": 1.0633, + "step": 9033 + }, + { + "epoch": 0.86, + "grad_norm": 0.34296449020035136, + "learning_rate": 0.00013021785490201305, + "loss": 1.2539, + "step": 9034 + }, + { + "epoch": 0.86, + "grad_norm": 0.2564122283684667, + "learning_rate": 0.00013020277457321877, + "loss": 1.0903, + "step": 9035 + }, + { + "epoch": 0.86, + "grad_norm": 0.2917999377183397, + "learning_rate": 0.00013018769348858107, + "loss": 0.9977, + "step": 9036 + }, + { + "epoch": 0.86, + "grad_norm": 0.31170385260658784, + "learning_rate": 0.00013017261164847743, + "loss": 0.8905, + "step": 9037 + }, + { + "epoch": 0.86, + "grad_norm": 0.30015787859342813, + "learning_rate": 0.00013015752905328514, + "loss": 1.0676, + "step": 9038 + }, + { + "epoch": 0.86, + "grad_norm": 0.43157315991514567, + "learning_rate": 0.00013014244570338178, + "loss": 1.1419, + "step": 9039 + }, + { + "epoch": 0.86, + "grad_norm": 0.25061895805404616, + "learning_rate": 0.0001301273615991448, + "loss": 0.9809, + "step": 9040 + }, + { + "epoch": 0.86, + "grad_norm": 0.2678433671294428, + "learning_rate": 0.00013011227674095162, + "loss": 0.951, + "step": 9041 + }, + { + "epoch": 0.87, + "grad_norm": 0.2886930597012415, + "learning_rate": 0.00013009719112917978, + "loss": 1.0616, + "step": 9042 + }, + { + "epoch": 0.87, + "grad_norm": 0.25785608296938134, + "learning_rate": 0.00013008210476420684, + "loss": 1.0351, + "step": 9043 + }, + { + "epoch": 0.87, + "grad_norm": 0.25783062009926794, + "learning_rate": 0.0001300670176464103, + "loss": 1.0019, + "step": 9044 + }, + { + "epoch": 0.87, + "grad_norm": 0.30369123418177635, + "learning_rate": 0.00013005192977616777, + "loss": 0.8641, + "step": 9045 + }, + { + "epoch": 0.87, + "grad_norm": 0.3225428130185902, + "learning_rate": 0.0001300368411538568, + "loss": 1.1576, + "step": 9046 + }, + { + "epoch": 0.87, + "grad_norm": 0.2967976819197218, + "learning_rate": 0.00013002175177985502, + "loss": 1.1703, + "step": 9047 + }, + { + "epoch": 0.87, + "grad_norm": 0.3014813200032349, + "learning_rate": 0.00013000666165454, + "loss": 1.1789, + "step": 9048 + }, + { + "epoch": 0.87, + "grad_norm": 0.2501779735414716, + "learning_rate": 0.00012999157077828944, + "loss": 1.105, + "step": 9049 + }, + { + "epoch": 0.87, + "grad_norm": 0.32187566601028167, + "learning_rate": 0.000129976479151481, + "loss": 1.0835, + "step": 9050 + }, + { + "epoch": 0.87, + "grad_norm": 0.27746438481780444, + "learning_rate": 0.0001299613867744923, + "loss": 1.0821, + "step": 9051 + }, + { + "epoch": 0.87, + "grad_norm": 0.28339489212125185, + "learning_rate": 0.00012994629364770102, + "loss": 1.0247, + "step": 9052 + }, + { + "epoch": 0.87, + "grad_norm": 0.27520392235897523, + "learning_rate": 0.00012993119977148499, + "loss": 0.9562, + "step": 9053 + }, + { + "epoch": 0.87, + "grad_norm": 0.2575142187774334, + "learning_rate": 0.0001299161051462218, + "loss": 1.0923, + "step": 9054 + }, + { + "epoch": 0.87, + "grad_norm": 0.2934030821339123, + "learning_rate": 0.00012990100977228934, + "loss": 1.0186, + "step": 9055 + }, + { + "epoch": 0.87, + "grad_norm": 0.2825842909756287, + "learning_rate": 0.0001298859136500653, + "loss": 1.0244, + "step": 9056 + }, + { + "epoch": 0.87, + "grad_norm": 0.2732801101975911, + "learning_rate": 0.0001298708167799275, + "loss": 1.1346, + "step": 9057 + }, + { + "epoch": 0.87, + "grad_norm": 0.2835565308919408, + "learning_rate": 0.0001298557191622537, + "loss": 0.9529, + "step": 9058 + }, + { + "epoch": 0.87, + "grad_norm": 0.32615560594023263, + "learning_rate": 0.00012984062079742181, + "loss": 1.1499, + "step": 9059 + }, + { + "epoch": 0.87, + "grad_norm": 0.2869135232591449, + "learning_rate": 0.00012982552168580962, + "loss": 0.9194, + "step": 9060 + }, + { + "epoch": 0.87, + "grad_norm": 0.3134787513394852, + "learning_rate": 0.000129810421827795, + "loss": 1.1952, + "step": 9061 + }, + { + "epoch": 0.87, + "grad_norm": 0.32346472417562094, + "learning_rate": 0.0001297953212237558, + "loss": 1.023, + "step": 9062 + }, + { + "epoch": 0.87, + "grad_norm": 0.3367904265131591, + "learning_rate": 0.00012978021987407004, + "loss": 1.1234, + "step": 9063 + }, + { + "epoch": 0.87, + "grad_norm": 0.30432907150859745, + "learning_rate": 0.0001297651177791155, + "loss": 0.9762, + "step": 9064 + }, + { + "epoch": 0.87, + "grad_norm": 0.3161449071160964, + "learning_rate": 0.00012975001493927018, + "loss": 0.9887, + "step": 9065 + }, + { + "epoch": 0.87, + "grad_norm": 0.24592648677051562, + "learning_rate": 0.00012973491135491206, + "loss": 1.0069, + "step": 9066 + }, + { + "epoch": 0.87, + "grad_norm": 0.26653722611873165, + "learning_rate": 0.00012971980702641912, + "loss": 1.0546, + "step": 9067 + }, + { + "epoch": 0.87, + "grad_norm": 0.2862693690630182, + "learning_rate": 0.00012970470195416931, + "loss": 1.1177, + "step": 9068 + }, + { + "epoch": 0.87, + "grad_norm": 0.29138542499171116, + "learning_rate": 0.00012968959613854063, + "loss": 1.1238, + "step": 9069 + }, + { + "epoch": 0.87, + "grad_norm": 0.2859847406201628, + "learning_rate": 0.0001296744895799112, + "loss": 1.047, + "step": 9070 + }, + { + "epoch": 0.87, + "grad_norm": 0.2807399557511384, + "learning_rate": 0.000129659382278659, + "loss": 0.9589, + "step": 9071 + }, + { + "epoch": 0.87, + "grad_norm": 0.28450650670607563, + "learning_rate": 0.0001296442742351621, + "loss": 0.9999, + "step": 9072 + }, + { + "epoch": 0.87, + "grad_norm": 0.31676772977788115, + "learning_rate": 0.0001296291654497986, + "loss": 0.978, + "step": 9073 + }, + { + "epoch": 0.87, + "grad_norm": 0.2784851835452743, + "learning_rate": 0.00012961405592294665, + "loss": 1.0332, + "step": 9074 + }, + { + "epoch": 0.87, + "grad_norm": 0.26832152402985765, + "learning_rate": 0.0001295989456549843, + "loss": 1.0811, + "step": 9075 + }, + { + "epoch": 0.87, + "grad_norm": 0.29162865744847255, + "learning_rate": 0.00012958383464628975, + "loss": 1.1361, + "step": 9076 + }, + { + "epoch": 0.87, + "grad_norm": 0.26672809187008667, + "learning_rate": 0.00012956872289724116, + "loss": 1.0796, + "step": 9077 + }, + { + "epoch": 0.87, + "grad_norm": 0.27076549176531384, + "learning_rate": 0.0001295536104082167, + "loss": 1.1287, + "step": 9078 + }, + { + "epoch": 0.87, + "grad_norm": 0.28696821632322866, + "learning_rate": 0.00012953849717959454, + "loss": 1.1795, + "step": 9079 + }, + { + "epoch": 0.87, + "grad_norm": 0.2955303786908336, + "learning_rate": 0.00012952338321175293, + "loss": 1.0168, + "step": 9080 + }, + { + "epoch": 0.87, + "grad_norm": 0.3039172006339761, + "learning_rate": 0.00012950826850507011, + "loss": 0.9734, + "step": 9081 + }, + { + "epoch": 0.87, + "grad_norm": 0.2680076109464228, + "learning_rate": 0.00012949315305992433, + "loss": 1.1325, + "step": 9082 + }, + { + "epoch": 0.87, + "grad_norm": 0.3197355029722075, + "learning_rate": 0.00012947803687669385, + "loss": 1.1474, + "step": 9083 + }, + { + "epoch": 0.87, + "grad_norm": 0.260643212258335, + "learning_rate": 0.00012946291995575697, + "loss": 1.0691, + "step": 9084 + }, + { + "epoch": 0.87, + "grad_norm": 0.28094689856026667, + "learning_rate": 0.00012944780229749201, + "loss": 1.1027, + "step": 9085 + }, + { + "epoch": 0.87, + "grad_norm": 0.3354262462770161, + "learning_rate": 0.00012943268390227727, + "loss": 0.9943, + "step": 9086 + }, + { + "epoch": 0.87, + "grad_norm": 0.2863437127555651, + "learning_rate": 0.00012941756477049114, + "loss": 1.101, + "step": 9087 + }, + { + "epoch": 0.87, + "grad_norm": 0.24300912831153612, + "learning_rate": 0.00012940244490251197, + "loss": 0.9077, + "step": 9088 + }, + { + "epoch": 0.87, + "grad_norm": 0.259039073094172, + "learning_rate": 0.0001293873242987181, + "loss": 1.117, + "step": 9089 + }, + { + "epoch": 0.87, + "grad_norm": 0.2512219349760124, + "learning_rate": 0.000129372202959488, + "loss": 0.9785, + "step": 9090 + }, + { + "epoch": 0.87, + "grad_norm": 0.27273837493945713, + "learning_rate": 0.00012935708088520007, + "loss": 0.9931, + "step": 9091 + }, + { + "epoch": 0.87, + "grad_norm": 0.27530538774617, + "learning_rate": 0.0001293419580762327, + "loss": 1.0153, + "step": 9092 + }, + { + "epoch": 0.87, + "grad_norm": 0.2926747917554771, + "learning_rate": 0.0001293268345329644, + "loss": 1.0461, + "step": 9093 + }, + { + "epoch": 0.87, + "grad_norm": 0.2500212462256167, + "learning_rate": 0.00012931171025577366, + "loss": 1.0736, + "step": 9094 + }, + { + "epoch": 0.87, + "grad_norm": 0.29111956874370015, + "learning_rate": 0.00012929658524503894, + "loss": 1.0124, + "step": 9095 + }, + { + "epoch": 0.87, + "grad_norm": 0.263782277974996, + "learning_rate": 0.00012928145950113877, + "loss": 0.9819, + "step": 9096 + }, + { + "epoch": 0.87, + "grad_norm": 0.2905200592972542, + "learning_rate": 0.00012926633302445164, + "loss": 1.0002, + "step": 9097 + }, + { + "epoch": 0.87, + "grad_norm": 0.29496184216102744, + "learning_rate": 0.00012925120581535614, + "loss": 1.0787, + "step": 9098 + }, + { + "epoch": 0.87, + "grad_norm": 0.3117510201468771, + "learning_rate": 0.00012923607787423085, + "loss": 1.0186, + "step": 9099 + }, + { + "epoch": 0.87, + "grad_norm": 0.32624449591492205, + "learning_rate": 0.00012922094920145432, + "loss": 1.1453, + "step": 9100 + }, + { + "epoch": 0.87, + "grad_norm": 0.33313379774620716, + "learning_rate": 0.0001292058197974052, + "loss": 1.1103, + "step": 9101 + }, + { + "epoch": 0.87, + "grad_norm": 0.30476232541112913, + "learning_rate": 0.0001291906896624621, + "loss": 1.0197, + "step": 9102 + }, + { + "epoch": 0.87, + "grad_norm": 0.2966527557371042, + "learning_rate": 0.00012917555879700358, + "loss": 1.0609, + "step": 9103 + }, + { + "epoch": 0.87, + "grad_norm": 0.3050234189493314, + "learning_rate": 0.0001291604272014084, + "loss": 1.098, + "step": 9104 + }, + { + "epoch": 0.87, + "grad_norm": 0.27720296715762255, + "learning_rate": 0.0001291452948760552, + "loss": 1.0512, + "step": 9105 + }, + { + "epoch": 0.87, + "grad_norm": 0.3042006172159329, + "learning_rate": 0.00012913016182132268, + "loss": 0.9907, + "step": 9106 + }, + { + "epoch": 0.87, + "grad_norm": 0.3068804882948529, + "learning_rate": 0.00012911502803758954, + "loss": 1.1273, + "step": 9107 + }, + { + "epoch": 0.87, + "grad_norm": 0.25437915672597605, + "learning_rate": 0.00012909989352523455, + "loss": 1.0053, + "step": 9108 + }, + { + "epoch": 0.87, + "grad_norm": 0.2588485696040596, + "learning_rate": 0.00012908475828463643, + "loss": 1.069, + "step": 9109 + }, + { + "epoch": 0.87, + "grad_norm": 0.288489014040999, + "learning_rate": 0.00012906962231617396, + "loss": 1.1162, + "step": 9110 + }, + { + "epoch": 0.87, + "grad_norm": 0.2902499883183751, + "learning_rate": 0.00012905448562022592, + "loss": 0.959, + "step": 9111 + }, + { + "epoch": 0.87, + "grad_norm": 0.2885389820207888, + "learning_rate": 0.00012903934819717108, + "loss": 1.0781, + "step": 9112 + }, + { + "epoch": 0.87, + "grad_norm": 0.2434151493288888, + "learning_rate": 0.00012902421004738833, + "loss": 1.1026, + "step": 9113 + }, + { + "epoch": 0.87, + "grad_norm": 0.27724613447577373, + "learning_rate": 0.0001290090711712565, + "loss": 0.9479, + "step": 9114 + }, + { + "epoch": 0.87, + "grad_norm": 0.2638723940275514, + "learning_rate": 0.00012899393156915438, + "loss": 1.0193, + "step": 9115 + }, + { + "epoch": 0.87, + "grad_norm": 0.3146091758887233, + "learning_rate": 0.00012897879124146094, + "loss": 1.0415, + "step": 9116 + }, + { + "epoch": 0.87, + "grad_norm": 0.2781231516386979, + "learning_rate": 0.00012896365018855502, + "loss": 1.1148, + "step": 9117 + }, + { + "epoch": 0.87, + "grad_norm": 0.2814021450123451, + "learning_rate": 0.00012894850841081555, + "loss": 0.9947, + "step": 9118 + }, + { + "epoch": 0.87, + "grad_norm": 0.2951572978095963, + "learning_rate": 0.0001289333659086215, + "loss": 1.0666, + "step": 9119 + }, + { + "epoch": 0.87, + "grad_norm": 0.2789547266879171, + "learning_rate": 0.00012891822268235175, + "loss": 1.1119, + "step": 9120 + }, + { + "epoch": 0.87, + "grad_norm": 0.2944178727397588, + "learning_rate": 0.0001289030787323853, + "loss": 1.0622, + "step": 9121 + }, + { + "epoch": 0.87, + "grad_norm": 0.3002264804462691, + "learning_rate": 0.00012888793405910117, + "loss": 1.1301, + "step": 9122 + }, + { + "epoch": 0.87, + "grad_norm": 0.29596892828518445, + "learning_rate": 0.0001288727886628783, + "loss": 0.9955, + "step": 9123 + }, + { + "epoch": 0.87, + "grad_norm": 0.28581897006530516, + "learning_rate": 0.00012885764254409577, + "loss": 1.0991, + "step": 9124 + }, + { + "epoch": 0.87, + "grad_norm": 0.2848802047710548, + "learning_rate": 0.0001288424957031326, + "loss": 1.1973, + "step": 9125 + }, + { + "epoch": 0.87, + "grad_norm": 0.30273121467305963, + "learning_rate": 0.00012882734814036783, + "loss": 1.1433, + "step": 9126 + }, + { + "epoch": 0.87, + "grad_norm": 0.26901216004525036, + "learning_rate": 0.00012881219985618058, + "loss": 0.9651, + "step": 9127 + }, + { + "epoch": 0.87, + "grad_norm": 0.29357748445834536, + "learning_rate": 0.0001287970508509499, + "loss": 1.0533, + "step": 9128 + }, + { + "epoch": 0.87, + "grad_norm": 0.28302245682622934, + "learning_rate": 0.00012878190112505496, + "loss": 1.1715, + "step": 9129 + }, + { + "epoch": 0.87, + "grad_norm": 0.32220788805164724, + "learning_rate": 0.0001287667506788748, + "loss": 1.037, + "step": 9130 + }, + { + "epoch": 0.87, + "grad_norm": 0.2838656933293558, + "learning_rate": 0.00012875159951278867, + "loss": 0.9909, + "step": 9131 + }, + { + "epoch": 0.87, + "grad_norm": 0.31812680763347256, + "learning_rate": 0.0001287364476271757, + "loss": 1.0599, + "step": 9132 + }, + { + "epoch": 0.87, + "grad_norm": 0.2729154895834484, + "learning_rate": 0.00012872129502241502, + "loss": 1.0977, + "step": 9133 + }, + { + "epoch": 0.87, + "grad_norm": 0.3125996237172212, + "learning_rate": 0.0001287061416988859, + "loss": 1.0514, + "step": 9134 + }, + { + "epoch": 0.87, + "grad_norm": 0.2807631918763277, + "learning_rate": 0.00012869098765696757, + "loss": 1.051, + "step": 9135 + }, + { + "epoch": 0.87, + "grad_norm": 0.31219828383217524, + "learning_rate": 0.0001286758328970392, + "loss": 1.0464, + "step": 9136 + }, + { + "epoch": 0.87, + "grad_norm": 0.2639358482520139, + "learning_rate": 0.0001286606774194801, + "loss": 1.0578, + "step": 9137 + }, + { + "epoch": 0.87, + "grad_norm": 0.2974661876294612, + "learning_rate": 0.00012864552122466956, + "loss": 0.9595, + "step": 9138 + }, + { + "epoch": 0.87, + "grad_norm": 0.2926598180262982, + "learning_rate": 0.00012863036431298684, + "loss": 1.1708, + "step": 9139 + }, + { + "epoch": 0.87, + "grad_norm": 0.25929024999451794, + "learning_rate": 0.00012861520668481122, + "loss": 0.9799, + "step": 9140 + }, + { + "epoch": 0.87, + "grad_norm": 0.29654960563184474, + "learning_rate": 0.0001286000483405221, + "loss": 1.1523, + "step": 9141 + }, + { + "epoch": 0.87, + "grad_norm": 0.27691848301002864, + "learning_rate": 0.00012858488928049882, + "loss": 1.0334, + "step": 9142 + }, + { + "epoch": 0.87, + "grad_norm": 0.29295271269701484, + "learning_rate": 0.00012856972950512068, + "loss": 1.0546, + "step": 9143 + }, + { + "epoch": 0.87, + "grad_norm": 0.2803146245701744, + "learning_rate": 0.00012855456901476712, + "loss": 0.9898, + "step": 9144 + }, + { + "epoch": 0.87, + "grad_norm": 0.2863989751594063, + "learning_rate": 0.00012853940780981751, + "loss": 1.1504, + "step": 9145 + }, + { + "epoch": 0.88, + "grad_norm": 0.2739972569019825, + "learning_rate": 0.00012852424589065132, + "loss": 0.923, + "step": 9146 + }, + { + "epoch": 0.88, + "grad_norm": 0.25052615701520137, + "learning_rate": 0.0001285090832576479, + "loss": 1.0061, + "step": 9147 + }, + { + "epoch": 0.88, + "grad_norm": 0.3030142194521633, + "learning_rate": 0.00012849391991118683, + "loss": 1.0694, + "step": 9148 + }, + { + "epoch": 0.88, + "grad_norm": 0.2714696456433213, + "learning_rate": 0.00012847875585164745, + "loss": 1.0009, + "step": 9149 + }, + { + "epoch": 0.88, + "grad_norm": 0.28875446135458116, + "learning_rate": 0.00012846359107940931, + "loss": 1.1649, + "step": 9150 + }, + { + "epoch": 0.88, + "grad_norm": 0.3150764287272924, + "learning_rate": 0.00012844842559485192, + "loss": 0.9091, + "step": 9151 + }, + { + "epoch": 0.88, + "grad_norm": 0.27701993372597217, + "learning_rate": 0.00012843325939835483, + "loss": 1.083, + "step": 9152 + }, + { + "epoch": 0.88, + "grad_norm": 0.29338798312190123, + "learning_rate": 0.00012841809249029747, + "loss": 0.9318, + "step": 9153 + }, + { + "epoch": 0.88, + "grad_norm": 0.30076117068870556, + "learning_rate": 0.00012840292487105955, + "loss": 1.0577, + "step": 9154 + }, + { + "epoch": 0.88, + "grad_norm": 0.26916632631906984, + "learning_rate": 0.0001283877565410206, + "loss": 1.0901, + "step": 9155 + }, + { + "epoch": 0.88, + "grad_norm": 0.2863676566010998, + "learning_rate": 0.00012837258750056016, + "loss": 1.0916, + "step": 9156 + }, + { + "epoch": 0.88, + "grad_norm": 0.311513288826919, + "learning_rate": 0.0001283574177500579, + "loss": 0.9895, + "step": 9157 + }, + { + "epoch": 0.88, + "grad_norm": 0.2788747044437777, + "learning_rate": 0.00012834224728989344, + "loss": 1.0058, + "step": 9158 + }, + { + "epoch": 0.88, + "grad_norm": 0.31246123148504296, + "learning_rate": 0.00012832707612044642, + "loss": 1.1208, + "step": 9159 + }, + { + "epoch": 0.88, + "grad_norm": 0.3153487374039508, + "learning_rate": 0.00012831190424209655, + "loss": 1.0619, + "step": 9160 + }, + { + "epoch": 0.88, + "grad_norm": 0.2979832645966896, + "learning_rate": 0.00012829673165522343, + "loss": 1.0788, + "step": 9161 + }, + { + "epoch": 0.88, + "grad_norm": 0.27761648021394475, + "learning_rate": 0.00012828155836020687, + "loss": 1.0676, + "step": 9162 + }, + { + "epoch": 0.88, + "grad_norm": 0.2966190620374098, + "learning_rate": 0.00012826638435742654, + "loss": 1.127, + "step": 9163 + }, + { + "epoch": 0.88, + "grad_norm": 0.2677449239886606, + "learning_rate": 0.0001282512096472621, + "loss": 1.1182, + "step": 9164 + }, + { + "epoch": 0.88, + "grad_norm": 0.2955895529683899, + "learning_rate": 0.00012823603423009347, + "loss": 0.8931, + "step": 9165 + }, + { + "epoch": 0.88, + "grad_norm": 0.2864122311531748, + "learning_rate": 0.0001282208581063003, + "loss": 1.1078, + "step": 9166 + }, + { + "epoch": 0.88, + "grad_norm": 0.2901266851305628, + "learning_rate": 0.00012820568127626242, + "loss": 1.0239, + "step": 9167 + }, + { + "epoch": 0.88, + "grad_norm": 0.3049079996092775, + "learning_rate": 0.00012819050374035962, + "loss": 1.0681, + "step": 9168 + }, + { + "epoch": 0.88, + "grad_norm": 0.27371968074107056, + "learning_rate": 0.0001281753254989718, + "loss": 1.0393, + "step": 9169 + }, + { + "epoch": 0.88, + "grad_norm": 0.31873095311089905, + "learning_rate": 0.0001281601465524787, + "loss": 1.0495, + "step": 9170 + }, + { + "epoch": 0.88, + "grad_norm": 0.33430466740169595, + "learning_rate": 0.00012814496690126027, + "loss": 1.0828, + "step": 9171 + }, + { + "epoch": 0.88, + "grad_norm": 0.2793091890823801, + "learning_rate": 0.00012812978654569635, + "loss": 1.0, + "step": 9172 + }, + { + "epoch": 0.88, + "grad_norm": 0.28127672160759526, + "learning_rate": 0.00012811460548616682, + "loss": 1.1068, + "step": 9173 + }, + { + "epoch": 0.88, + "grad_norm": 0.2822256164444625, + "learning_rate": 0.00012809942372305164, + "loss": 1.1039, + "step": 9174 + }, + { + "epoch": 0.88, + "grad_norm": 0.2989368506283751, + "learning_rate": 0.0001280842412567307, + "loss": 1.0661, + "step": 9175 + }, + { + "epoch": 0.88, + "grad_norm": 0.2787834185347091, + "learning_rate": 0.000128069058087584, + "loss": 1.0754, + "step": 9176 + }, + { + "epoch": 0.88, + "grad_norm": 0.26459335862211264, + "learning_rate": 0.00012805387421599144, + "loss": 1.0921, + "step": 9177 + }, + { + "epoch": 0.88, + "grad_norm": 0.2717879411221796, + "learning_rate": 0.0001280386896423331, + "loss": 1.0256, + "step": 9178 + }, + { + "epoch": 0.88, + "grad_norm": 0.3392781499730835, + "learning_rate": 0.00012802350436698888, + "loss": 0.963, + "step": 9179 + }, + { + "epoch": 0.88, + "grad_norm": 0.3008890143060967, + "learning_rate": 0.0001280083183903389, + "loss": 1.0453, + "step": 9180 + }, + { + "epoch": 0.88, + "grad_norm": 0.2859398503777074, + "learning_rate": 0.00012799313171276308, + "loss": 1.095, + "step": 9181 + }, + { + "epoch": 0.88, + "grad_norm": 0.269640842666525, + "learning_rate": 0.0001279779443346416, + "loss": 1.0724, + "step": 9182 + }, + { + "epoch": 0.88, + "grad_norm": 0.28038845119609257, + "learning_rate": 0.0001279627562563545, + "loss": 1.0738, + "step": 9183 + }, + { + "epoch": 0.88, + "grad_norm": 0.2887167863295914, + "learning_rate": 0.00012794756747828179, + "loss": 1.0641, + "step": 9184 + }, + { + "epoch": 0.88, + "grad_norm": 0.29441549775626563, + "learning_rate": 0.00012793237800080365, + "loss": 1.1956, + "step": 9185 + }, + { + "epoch": 0.88, + "grad_norm": 0.2614449210833528, + "learning_rate": 0.00012791718782430024, + "loss": 0.9941, + "step": 9186 + }, + { + "epoch": 0.88, + "grad_norm": 0.27898220786899924, + "learning_rate": 0.00012790199694915163, + "loss": 1.0125, + "step": 9187 + }, + { + "epoch": 0.88, + "grad_norm": 0.2623293441843174, + "learning_rate": 0.000127886805375738, + "loss": 1.018, + "step": 9188 + }, + { + "epoch": 0.88, + "grad_norm": 0.24474445860858335, + "learning_rate": 0.00012787161310443958, + "loss": 0.9925, + "step": 9189 + }, + { + "epoch": 0.88, + "grad_norm": 0.26691886103805196, + "learning_rate": 0.0001278564201356365, + "loss": 1.0262, + "step": 9190 + }, + { + "epoch": 0.88, + "grad_norm": 0.2996480479285695, + "learning_rate": 0.000127841226469709, + "loss": 1.1404, + "step": 9191 + }, + { + "epoch": 0.88, + "grad_norm": 0.2831005610084299, + "learning_rate": 0.0001278260321070373, + "loss": 1.0199, + "step": 9192 + }, + { + "epoch": 0.88, + "grad_norm": 0.3016403069097424, + "learning_rate": 0.00012781083704800167, + "loss": 1.1395, + "step": 9193 + }, + { + "epoch": 0.88, + "grad_norm": 0.258599684442815, + "learning_rate": 0.00012779564129298233, + "loss": 1.0072, + "step": 9194 + }, + { + "epoch": 0.88, + "grad_norm": 0.28981820294114957, + "learning_rate": 0.00012778044484235964, + "loss": 1.0872, + "step": 9195 + }, + { + "epoch": 0.88, + "grad_norm": 0.2944671368276984, + "learning_rate": 0.0001277652476965139, + "loss": 0.9736, + "step": 9196 + }, + { + "epoch": 0.88, + "grad_norm": 0.32250052614896707, + "learning_rate": 0.0001277500498558253, + "loss": 1.0861, + "step": 9197 + }, + { + "epoch": 0.88, + "grad_norm": 0.33413812354501804, + "learning_rate": 0.00012773485132067428, + "loss": 0.9882, + "step": 9198 + }, + { + "epoch": 0.88, + "grad_norm": 0.2396696634982347, + "learning_rate": 0.00012771965209144122, + "loss": 0.999, + "step": 9199 + }, + { + "epoch": 0.88, + "grad_norm": 0.2871461793248707, + "learning_rate": 0.00012770445216850638, + "loss": 0.9985, + "step": 9200 + }, + { + "epoch": 0.88, + "grad_norm": 0.27681646471608834, + "learning_rate": 0.00012768925155225025, + "loss": 0.9664, + "step": 9201 + }, + { + "epoch": 0.88, + "grad_norm": 0.27957278139129027, + "learning_rate": 0.00012767405024305322, + "loss": 1.0305, + "step": 9202 + }, + { + "epoch": 0.88, + "grad_norm": 0.28753086690595353, + "learning_rate": 0.00012765884824129565, + "loss": 1.0237, + "step": 9203 + }, + { + "epoch": 0.88, + "grad_norm": 0.3111440689789316, + "learning_rate": 0.000127643645547358, + "loss": 1.2017, + "step": 9204 + }, + { + "epoch": 0.88, + "grad_norm": 0.2921443717707693, + "learning_rate": 0.0001276284421616208, + "loss": 1.029, + "step": 9205 + }, + { + "epoch": 0.88, + "grad_norm": 0.30261904265294104, + "learning_rate": 0.00012761323808446447, + "loss": 1.0758, + "step": 9206 + }, + { + "epoch": 0.88, + "grad_norm": 0.25772792131049205, + "learning_rate": 0.00012759803331626948, + "loss": 1.0139, + "step": 9207 + }, + { + "epoch": 0.88, + "grad_norm": 0.3046831790519165, + "learning_rate": 0.00012758282785741638, + "loss": 1.0825, + "step": 9208 + }, + { + "epoch": 0.88, + "grad_norm": 0.3183296731093666, + "learning_rate": 0.00012756762170828566, + "loss": 0.9899, + "step": 9209 + }, + { + "epoch": 0.88, + "grad_norm": 0.2817939566711674, + "learning_rate": 0.0001275524148692579, + "loss": 0.9752, + "step": 9210 + }, + { + "epoch": 0.88, + "grad_norm": 0.2665373598422484, + "learning_rate": 0.0001275372073407136, + "loss": 1.0575, + "step": 9211 + }, + { + "epoch": 0.88, + "grad_norm": 0.2725700632449814, + "learning_rate": 0.00012752199912303345, + "loss": 1.0651, + "step": 9212 + }, + { + "epoch": 0.88, + "grad_norm": 0.29400308946907394, + "learning_rate": 0.00012750679021659794, + "loss": 1.0696, + "step": 9213 + }, + { + "epoch": 0.88, + "grad_norm": 0.297641959611062, + "learning_rate": 0.00012749158062178769, + "loss": 1.0051, + "step": 9214 + }, + { + "epoch": 0.88, + "grad_norm": 0.24521868429916097, + "learning_rate": 0.0001274763703389834, + "loss": 0.9303, + "step": 9215 + }, + { + "epoch": 0.88, + "grad_norm": 0.29297299579392244, + "learning_rate": 0.00012746115936856564, + "loss": 1.1431, + "step": 9216 + }, + { + "epoch": 0.88, + "grad_norm": 0.26851527665671227, + "learning_rate": 0.00012744594771091513, + "loss": 1.0198, + "step": 9217 + }, + { + "epoch": 0.88, + "grad_norm": 0.2819204613169668, + "learning_rate": 0.0001274307353664125, + "loss": 1.1456, + "step": 9218 + }, + { + "epoch": 0.88, + "grad_norm": 0.3038986733258005, + "learning_rate": 0.00012741552233543852, + "loss": 0.9887, + "step": 9219 + }, + { + "epoch": 0.88, + "grad_norm": 0.3047117203847732, + "learning_rate": 0.0001274003086183738, + "loss": 1.069, + "step": 9220 + }, + { + "epoch": 0.88, + "grad_norm": 0.2920407974764749, + "learning_rate": 0.0001273850942155992, + "loss": 1.113, + "step": 9221 + }, + { + "epoch": 0.88, + "grad_norm": 0.28997161723843756, + "learning_rate": 0.0001273698791274954, + "loss": 0.9928, + "step": 9222 + }, + { + "epoch": 0.88, + "grad_norm": 0.2601062102739676, + "learning_rate": 0.00012735466335444314, + "loss": 1.0277, + "step": 9223 + }, + { + "epoch": 0.88, + "grad_norm": 0.24196462571085023, + "learning_rate": 0.00012733944689682325, + "loss": 1.0118, + "step": 9224 + }, + { + "epoch": 0.88, + "grad_norm": 0.27692067285133026, + "learning_rate": 0.00012732422975501653, + "loss": 1.1095, + "step": 9225 + }, + { + "epoch": 0.88, + "grad_norm": 0.25760809183221073, + "learning_rate": 0.0001273090119294038, + "loss": 0.9548, + "step": 9226 + }, + { + "epoch": 0.88, + "grad_norm": 0.25859993613149374, + "learning_rate": 0.00012729379342036587, + "loss": 0.9637, + "step": 9227 + }, + { + "epoch": 0.88, + "grad_norm": 0.2540462826516634, + "learning_rate": 0.00012727857422828359, + "loss": 1.1307, + "step": 9228 + }, + { + "epoch": 0.88, + "grad_norm": 0.2921227707514754, + "learning_rate": 0.00012726335435353785, + "loss": 1.1008, + "step": 9229 + }, + { + "epoch": 0.88, + "grad_norm": 0.2984915373574552, + "learning_rate": 0.00012724813379650954, + "loss": 1.0848, + "step": 9230 + }, + { + "epoch": 0.88, + "grad_norm": 0.3095681979770164, + "learning_rate": 0.00012723291255757957, + "loss": 1.1033, + "step": 9231 + }, + { + "epoch": 0.88, + "grad_norm": 0.28318769682527445, + "learning_rate": 0.00012721769063712884, + "loss": 1.0971, + "step": 9232 + }, + { + "epoch": 0.88, + "grad_norm": 0.27525707454107234, + "learning_rate": 0.00012720246803553828, + "loss": 1.0534, + "step": 9233 + }, + { + "epoch": 0.88, + "grad_norm": 0.28825651516813144, + "learning_rate": 0.0001271872447531889, + "loss": 1.1621, + "step": 9234 + }, + { + "epoch": 0.88, + "grad_norm": 0.3010094105143729, + "learning_rate": 0.0001271720207904616, + "loss": 1.072, + "step": 9235 + }, + { + "epoch": 0.88, + "grad_norm": 0.24956098318442146, + "learning_rate": 0.00012715679614773738, + "loss": 1.2222, + "step": 9236 + }, + { + "epoch": 0.88, + "grad_norm": 0.2764944338239069, + "learning_rate": 0.00012714157082539733, + "loss": 1.0995, + "step": 9237 + }, + { + "epoch": 0.88, + "grad_norm": 0.27634606116433363, + "learning_rate": 0.00012712634482382238, + "loss": 1.036, + "step": 9238 + }, + { + "epoch": 0.88, + "grad_norm": 0.2794870761616325, + "learning_rate": 0.0001271111181433936, + "loss": 1.1254, + "step": 9239 + }, + { + "epoch": 0.88, + "grad_norm": 0.2601345404562058, + "learning_rate": 0.00012709589078449204, + "loss": 1.1053, + "step": 9240 + }, + { + "epoch": 0.88, + "grad_norm": 0.2622935878916725, + "learning_rate": 0.0001270806627474988, + "loss": 1.0455, + "step": 9241 + }, + { + "epoch": 0.88, + "grad_norm": 0.29883490193917805, + "learning_rate": 0.00012706543403279497, + "loss": 0.9762, + "step": 9242 + }, + { + "epoch": 0.88, + "grad_norm": 0.22869752618561462, + "learning_rate": 0.0001270502046407616, + "loss": 0.9916, + "step": 9243 + }, + { + "epoch": 0.88, + "grad_norm": 0.29444348606081083, + "learning_rate": 0.00012703497457177988, + "loss": 0.9847, + "step": 9244 + }, + { + "epoch": 0.88, + "grad_norm": 0.2938071041973925, + "learning_rate": 0.00012701974382623094, + "loss": 1.0545, + "step": 9245 + }, + { + "epoch": 0.88, + "grad_norm": 0.26894685253793144, + "learning_rate": 0.00012700451240449593, + "loss": 1.0102, + "step": 9246 + }, + { + "epoch": 0.88, + "grad_norm": 0.3295306409371883, + "learning_rate": 0.00012698928030695602, + "loss": 1.1135, + "step": 9247 + }, + { + "epoch": 0.88, + "grad_norm": 0.29519953492147355, + "learning_rate": 0.0001269740475339924, + "loss": 1.1716, + "step": 9248 + }, + { + "epoch": 0.88, + "grad_norm": 0.31644779195394845, + "learning_rate": 0.0001269588140859863, + "loss": 1.0338, + "step": 9249 + }, + { + "epoch": 0.88, + "grad_norm": 0.28862661343334506, + "learning_rate": 0.00012694357996331893, + "loss": 0.9805, + "step": 9250 + }, + { + "epoch": 0.89, + "grad_norm": 0.29109092097690287, + "learning_rate": 0.00012692834516637156, + "loss": 1.0948, + "step": 9251 + }, + { + "epoch": 0.89, + "grad_norm": 0.2755208136905377, + "learning_rate": 0.00012691310969552538, + "loss": 1.0859, + "step": 9252 + }, + { + "epoch": 0.89, + "grad_norm": 0.2947036722853116, + "learning_rate": 0.00012689787355116177, + "loss": 1.0358, + "step": 9253 + }, + { + "epoch": 0.89, + "grad_norm": 0.2871135631256799, + "learning_rate": 0.00012688263673366195, + "loss": 1.0249, + "step": 9254 + }, + { + "epoch": 0.89, + "grad_norm": 0.2562542263019815, + "learning_rate": 0.0001268673992434072, + "loss": 1.0182, + "step": 9255 + }, + { + "epoch": 0.89, + "grad_norm": 0.31659182199731395, + "learning_rate": 0.00012685216108077895, + "loss": 1.0291, + "step": 9256 + }, + { + "epoch": 0.89, + "grad_norm": 0.2933670124167318, + "learning_rate": 0.0001268369222461585, + "loss": 1.0631, + "step": 9257 + }, + { + "epoch": 0.89, + "grad_norm": 0.2657225784590937, + "learning_rate": 0.0001268216827399272, + "loss": 1.119, + "step": 9258 + }, + { + "epoch": 0.89, + "grad_norm": 0.2797518168945411, + "learning_rate": 0.00012680644256246642, + "loss": 0.9506, + "step": 9259 + }, + { + "epoch": 0.89, + "grad_norm": 0.3020240632022724, + "learning_rate": 0.00012679120171415757, + "loss": 0.9763, + "step": 9260 + }, + { + "epoch": 0.89, + "grad_norm": 0.2637706589429481, + "learning_rate": 0.00012677596019538206, + "loss": 1.0194, + "step": 9261 + }, + { + "epoch": 0.89, + "grad_norm": 0.31418144937011505, + "learning_rate": 0.0001267607180065213, + "loss": 1.1327, + "step": 9262 + }, + { + "epoch": 0.89, + "grad_norm": 0.2952874649931668, + "learning_rate": 0.00012674547514795675, + "loss": 1.0817, + "step": 9263 + }, + { + "epoch": 0.89, + "grad_norm": 0.3150374662852804, + "learning_rate": 0.00012673023162006989, + "loss": 1.0925, + "step": 9264 + }, + { + "epoch": 0.89, + "grad_norm": 0.3198349522638215, + "learning_rate": 0.0001267149874232422, + "loss": 1.1409, + "step": 9265 + }, + { + "epoch": 0.89, + "grad_norm": 0.25854203157035266, + "learning_rate": 0.00012669974255785516, + "loss": 1.138, + "step": 9266 + }, + { + "epoch": 0.89, + "grad_norm": 0.2608990015480636, + "learning_rate": 0.00012668449702429028, + "loss": 1.0271, + "step": 9267 + }, + { + "epoch": 0.89, + "grad_norm": 0.29909886728393076, + "learning_rate": 0.0001266692508229291, + "loss": 1.0969, + "step": 9268 + }, + { + "epoch": 0.89, + "grad_norm": 0.23909051250886207, + "learning_rate": 0.0001266540039541531, + "loss": 0.9744, + "step": 9269 + }, + { + "epoch": 0.89, + "grad_norm": 0.28087485327219586, + "learning_rate": 0.00012663875641834394, + "loss": 1.1004, + "step": 9270 + }, + { + "epoch": 0.89, + "grad_norm": 0.2552216763566098, + "learning_rate": 0.0001266235082158832, + "loss": 0.9802, + "step": 9271 + }, + { + "epoch": 0.89, + "grad_norm": 0.28763301871999136, + "learning_rate": 0.00012660825934715235, + "loss": 1.0887, + "step": 9272 + }, + { + "epoch": 0.89, + "grad_norm": 0.29630933468752285, + "learning_rate": 0.00012659300981253315, + "loss": 1.1228, + "step": 9273 + }, + { + "epoch": 0.89, + "grad_norm": 0.29194868948582664, + "learning_rate": 0.00012657775961240713, + "loss": 1.0174, + "step": 9274 + }, + { + "epoch": 0.89, + "grad_norm": 0.2617941239283223, + "learning_rate": 0.000126562508747156, + "loss": 1.0721, + "step": 9275 + }, + { + "epoch": 0.89, + "grad_norm": 0.28279751520459484, + "learning_rate": 0.00012654725721716138, + "loss": 1.0261, + "step": 9276 + }, + { + "epoch": 0.89, + "grad_norm": 0.30103442627581245, + "learning_rate": 0.00012653200502280498, + "loss": 1.1181, + "step": 9277 + }, + { + "epoch": 0.89, + "grad_norm": 0.27808862986365707, + "learning_rate": 0.00012651675216446848, + "loss": 1.0745, + "step": 9278 + }, + { + "epoch": 0.89, + "grad_norm": 0.29656831264442357, + "learning_rate": 0.00012650149864253357, + "loss": 1.1474, + "step": 9279 + }, + { + "epoch": 0.89, + "grad_norm": 0.27503893982251604, + "learning_rate": 0.000126486244457382, + "loss": 1.0173, + "step": 9280 + }, + { + "epoch": 0.89, + "grad_norm": 0.2657056986153241, + "learning_rate": 0.00012647098960939554, + "loss": 0.9806, + "step": 9281 + }, + { + "epoch": 0.89, + "grad_norm": 0.25653477829913407, + "learning_rate": 0.0001264557340989559, + "loss": 1.0358, + "step": 9282 + }, + { + "epoch": 0.89, + "grad_norm": 0.29619037014025956, + "learning_rate": 0.0001264404779264449, + "loss": 1.1536, + "step": 9283 + }, + { + "epoch": 0.89, + "grad_norm": 0.2822157801841811, + "learning_rate": 0.00012642522109224434, + "loss": 1.1011, + "step": 9284 + }, + { + "epoch": 0.89, + "grad_norm": 0.31455283414554064, + "learning_rate": 0.000126409963596736, + "loss": 0.9813, + "step": 9285 + }, + { + "epoch": 0.89, + "grad_norm": 0.2775822826731524, + "learning_rate": 0.0001263947054403017, + "loss": 0.951, + "step": 9286 + }, + { + "epoch": 0.89, + "grad_norm": 0.2575236283104343, + "learning_rate": 0.00012637944662332332, + "loss": 0.9371, + "step": 9287 + }, + { + "epoch": 0.89, + "grad_norm": 0.24149231079738087, + "learning_rate": 0.00012636418714618273, + "loss": 0.9715, + "step": 9288 + }, + { + "epoch": 0.89, + "grad_norm": 0.2831753975004831, + "learning_rate": 0.00012634892700926178, + "loss": 1.0039, + "step": 9289 + }, + { + "epoch": 0.89, + "grad_norm": 0.29422901876914576, + "learning_rate": 0.00012633366621294238, + "loss": 1.1093, + "step": 9290 + }, + { + "epoch": 0.89, + "grad_norm": 0.27392177332876777, + "learning_rate": 0.00012631840475760644, + "loss": 1.1547, + "step": 9291 + }, + { + "epoch": 0.89, + "grad_norm": 0.27799764214008904, + "learning_rate": 0.00012630314264363584, + "loss": 1.1274, + "step": 9292 + }, + { + "epoch": 0.89, + "grad_norm": 0.31072988212098307, + "learning_rate": 0.0001262878798714126, + "loss": 1.0469, + "step": 9293 + }, + { + "epoch": 0.89, + "grad_norm": 0.31492709765160437, + "learning_rate": 0.00012627261644131862, + "loss": 0.9578, + "step": 9294 + }, + { + "epoch": 0.89, + "grad_norm": 0.2528035347256275, + "learning_rate": 0.00012625735235373593, + "loss": 1.0668, + "step": 9295 + }, + { + "epoch": 0.89, + "grad_norm": 0.3155214475191695, + "learning_rate": 0.00012624208760904647, + "loss": 0.9916, + "step": 9296 + }, + { + "epoch": 0.89, + "grad_norm": 0.302087235196872, + "learning_rate": 0.00012622682220763228, + "loss": 1.0884, + "step": 9297 + }, + { + "epoch": 0.89, + "grad_norm": 0.2990095102194339, + "learning_rate": 0.00012621155614987538, + "loss": 1.0101, + "step": 9298 + }, + { + "epoch": 0.89, + "grad_norm": 0.26706525825653776, + "learning_rate": 0.00012619628943615782, + "loss": 1.0278, + "step": 9299 + }, + { + "epoch": 0.89, + "grad_norm": 0.27756280255006816, + "learning_rate": 0.00012618102206686166, + "loss": 1.0099, + "step": 9300 + }, + { + "epoch": 0.89, + "grad_norm": 0.30586196916112135, + "learning_rate": 0.00012616575404236899, + "loss": 1.0915, + "step": 9301 + }, + { + "epoch": 0.89, + "grad_norm": 0.25272225815414157, + "learning_rate": 0.0001261504853630618, + "loss": 1.0799, + "step": 9302 + }, + { + "epoch": 0.89, + "grad_norm": 0.3258028723785073, + "learning_rate": 0.00012613521602932237, + "loss": 0.9907, + "step": 9303 + }, + { + "epoch": 0.89, + "grad_norm": 0.3105133570818196, + "learning_rate": 0.00012611994604153269, + "loss": 1.0494, + "step": 9304 + }, + { + "epoch": 0.89, + "grad_norm": 0.2853288951774398, + "learning_rate": 0.00012610467540007494, + "loss": 1.0367, + "step": 9305 + }, + { + "epoch": 0.89, + "grad_norm": 0.30136724902716244, + "learning_rate": 0.00012608940410533127, + "loss": 1.0905, + "step": 9306 + }, + { + "epoch": 0.89, + "grad_norm": 0.27766552230271924, + "learning_rate": 0.00012607413215768388, + "loss": 1.0067, + "step": 9307 + }, + { + "epoch": 0.89, + "grad_norm": 0.2839527667018618, + "learning_rate": 0.00012605885955751497, + "loss": 0.9767, + "step": 9308 + }, + { + "epoch": 0.89, + "grad_norm": 0.30687156353062944, + "learning_rate": 0.0001260435863052067, + "loss": 1.0725, + "step": 9309 + }, + { + "epoch": 0.89, + "grad_norm": 0.3156040190026411, + "learning_rate": 0.0001260283124011413, + "loss": 1.0457, + "step": 9310 + }, + { + "epoch": 0.89, + "grad_norm": 0.3104920835342392, + "learning_rate": 0.00012601303784570106, + "loss": 1.1438, + "step": 9311 + }, + { + "epoch": 0.89, + "grad_norm": 0.27017582473335783, + "learning_rate": 0.0001259977626392682, + "loss": 1.052, + "step": 9312 + }, + { + "epoch": 0.89, + "grad_norm": 0.27144499750698986, + "learning_rate": 0.00012598248678222498, + "loss": 1.0559, + "step": 9313 + }, + { + "epoch": 0.89, + "grad_norm": 0.2675880100657242, + "learning_rate": 0.0001259672102749537, + "loss": 0.9763, + "step": 9314 + }, + { + "epoch": 0.89, + "grad_norm": 0.2955864081602817, + "learning_rate": 0.00012595193311783665, + "loss": 0.9867, + "step": 9315 + }, + { + "epoch": 0.89, + "grad_norm": 0.2813400142046753, + "learning_rate": 0.00012593665531125615, + "loss": 1.0049, + "step": 9316 + }, + { + "epoch": 0.89, + "grad_norm": 0.2831320746009813, + "learning_rate": 0.00012592137685559458, + "loss": 1.029, + "step": 9317 + }, + { + "epoch": 0.89, + "grad_norm": 0.29831547442685147, + "learning_rate": 0.00012590609775123426, + "loss": 1.116, + "step": 9318 + }, + { + "epoch": 0.89, + "grad_norm": 0.2936254006184292, + "learning_rate": 0.00012589081799855756, + "loss": 1.0433, + "step": 9319 + }, + { + "epoch": 0.89, + "grad_norm": 0.30202822877591445, + "learning_rate": 0.00012587553759794683, + "loss": 1.067, + "step": 9320 + }, + { + "epoch": 0.89, + "grad_norm": 0.31153441957615685, + "learning_rate": 0.00012586025654978458, + "loss": 1.0185, + "step": 9321 + }, + { + "epoch": 0.89, + "grad_norm": 0.31241827225447677, + "learning_rate": 0.0001258449748544531, + "loss": 1.0975, + "step": 9322 + }, + { + "epoch": 0.89, + "grad_norm": 0.27730753920433476, + "learning_rate": 0.0001258296925123349, + "loss": 1.069, + "step": 9323 + }, + { + "epoch": 0.89, + "grad_norm": 0.2897016648466612, + "learning_rate": 0.00012581440952381243, + "loss": 1.067, + "step": 9324 + }, + { + "epoch": 0.89, + "grad_norm": 0.24652987673338808, + "learning_rate": 0.0001257991258892681, + "loss": 1.1338, + "step": 9325 + }, + { + "epoch": 0.89, + "grad_norm": 0.3071238167163373, + "learning_rate": 0.00012578384160908445, + "loss": 1.1402, + "step": 9326 + }, + { + "epoch": 0.89, + "grad_norm": 0.29112654720110537, + "learning_rate": 0.00012576855668364396, + "loss": 1.0772, + "step": 9327 + }, + { + "epoch": 0.89, + "grad_norm": 0.28108564003717257, + "learning_rate": 0.00012575327111332912, + "loss": 1.0895, + "step": 9328 + }, + { + "epoch": 0.89, + "grad_norm": 0.2697345251089792, + "learning_rate": 0.00012573798489852253, + "loss": 1.1332, + "step": 9329 + }, + { + "epoch": 0.89, + "grad_norm": 0.304121790273392, + "learning_rate": 0.00012572269803960665, + "loss": 1.0902, + "step": 9330 + }, + { + "epoch": 0.89, + "grad_norm": 0.28853126394768475, + "learning_rate": 0.00012570741053696412, + "loss": 1.0721, + "step": 9331 + }, + { + "epoch": 0.89, + "grad_norm": 0.2851724014266135, + "learning_rate": 0.0001256921223909775, + "loss": 1.1567, + "step": 9332 + }, + { + "epoch": 0.89, + "grad_norm": 0.28817110546138114, + "learning_rate": 0.0001256768336020293, + "loss": 1.1432, + "step": 9333 + }, + { + "epoch": 0.89, + "grad_norm": 0.2674528738842632, + "learning_rate": 0.00012566154417050225, + "loss": 1.1189, + "step": 9334 + }, + { + "epoch": 0.89, + "grad_norm": 0.27322750722188155, + "learning_rate": 0.00012564625409677895, + "loss": 1.0305, + "step": 9335 + }, + { + "epoch": 0.89, + "grad_norm": 0.29977545197889227, + "learning_rate": 0.000125630963381242, + "loss": 1.1117, + "step": 9336 + }, + { + "epoch": 0.89, + "grad_norm": 0.29111294479338623, + "learning_rate": 0.00012561567202427407, + "loss": 0.9916, + "step": 9337 + }, + { + "epoch": 0.89, + "grad_norm": 0.3036268811633583, + "learning_rate": 0.00012560038002625788, + "loss": 1.0984, + "step": 9338 + }, + { + "epoch": 0.89, + "grad_norm": 0.2549172822007168, + "learning_rate": 0.00012558508738757604, + "loss": 1.1815, + "step": 9339 + }, + { + "epoch": 0.89, + "grad_norm": 0.2675936118833171, + "learning_rate": 0.00012556979410861135, + "loss": 0.962, + "step": 9340 + }, + { + "epoch": 0.89, + "grad_norm": 0.2773919289873, + "learning_rate": 0.00012555450018974647, + "loss": 1.1229, + "step": 9341 + }, + { + "epoch": 0.89, + "grad_norm": 0.2813050855847403, + "learning_rate": 0.00012553920563136418, + "loss": 1.1681, + "step": 9342 + }, + { + "epoch": 0.89, + "grad_norm": 0.2798799129707208, + "learning_rate": 0.00012552391043384718, + "loss": 1.1568, + "step": 9343 + }, + { + "epoch": 0.89, + "grad_norm": 0.30424213764742, + "learning_rate": 0.00012550861459757835, + "loss": 1.1593, + "step": 9344 + }, + { + "epoch": 0.89, + "grad_norm": 0.32261170593030764, + "learning_rate": 0.00012549331812294033, + "loss": 1.0733, + "step": 9345 + }, + { + "epoch": 0.89, + "grad_norm": 0.28695317695752437, + "learning_rate": 0.00012547802101031604, + "loss": 1.0534, + "step": 9346 + }, + { + "epoch": 0.89, + "grad_norm": 0.297270830182022, + "learning_rate": 0.00012546272326008828, + "loss": 0.9348, + "step": 9347 + }, + { + "epoch": 0.89, + "grad_norm": 0.32798136328441224, + "learning_rate": 0.00012544742487263983, + "loss": 1.2133, + "step": 9348 + }, + { + "epoch": 0.89, + "grad_norm": 0.276438841709617, + "learning_rate": 0.00012543212584835363, + "loss": 0.9188, + "step": 9349 + }, + { + "epoch": 0.89, + "grad_norm": 0.30713210860480056, + "learning_rate": 0.00012541682618761243, + "loss": 0.936, + "step": 9350 + }, + { + "epoch": 0.89, + "grad_norm": 0.3058264322272426, + "learning_rate": 0.00012540152589079922, + "loss": 1.0953, + "step": 9351 + }, + { + "epoch": 0.89, + "grad_norm": 0.3086810532233868, + "learning_rate": 0.00012538622495829687, + "loss": 1.0574, + "step": 9352 + }, + { + "epoch": 0.89, + "grad_norm": 0.26492015598732366, + "learning_rate": 0.00012537092339048829, + "loss": 1.0913, + "step": 9353 + }, + { + "epoch": 0.89, + "grad_norm": 0.2613494790949844, + "learning_rate": 0.00012535562118775638, + "loss": 1.0103, + "step": 9354 + }, + { + "epoch": 0.9, + "grad_norm": 0.2745257568238265, + "learning_rate": 0.00012534031835048412, + "loss": 1.016, + "step": 9355 + }, + { + "epoch": 0.9, + "grad_norm": 0.32547504865315513, + "learning_rate": 0.00012532501487905447, + "loss": 1.0429, + "step": 9356 + }, + { + "epoch": 0.9, + "grad_norm": 0.25947151074133556, + "learning_rate": 0.0001253097107738504, + "loss": 1.0684, + "step": 9357 + }, + { + "epoch": 0.9, + "grad_norm": 0.2749564028335704, + "learning_rate": 0.00012529440603525495, + "loss": 1.0026, + "step": 9358 + }, + { + "epoch": 0.9, + "grad_norm": 0.26444889751740525, + "learning_rate": 0.00012527910066365108, + "loss": 1.1251, + "step": 9359 + }, + { + "epoch": 0.9, + "grad_norm": 0.24760688189734706, + "learning_rate": 0.00012526379465942179, + "loss": 1.0365, + "step": 9360 + }, + { + "epoch": 0.9, + "grad_norm": 0.2952587878180295, + "learning_rate": 0.00012524848802295018, + "loss": 1.1582, + "step": 9361 + }, + { + "epoch": 0.9, + "grad_norm": 0.27540696444772433, + "learning_rate": 0.0001252331807546193, + "loss": 1.1061, + "step": 9362 + }, + { + "epoch": 0.9, + "grad_norm": 0.27595222755388815, + "learning_rate": 0.00012521787285481222, + "loss": 1.0053, + "step": 9363 + }, + { + "epoch": 0.9, + "grad_norm": 0.3078665527994498, + "learning_rate": 0.00012520256432391197, + "loss": 1.0609, + "step": 9364 + }, + { + "epoch": 0.9, + "grad_norm": 0.30912162354934847, + "learning_rate": 0.00012518725516230176, + "loss": 1.092, + "step": 9365 + }, + { + "epoch": 0.9, + "grad_norm": 0.24850758690847607, + "learning_rate": 0.00012517194537036463, + "loss": 0.9636, + "step": 9366 + }, + { + "epoch": 0.9, + "grad_norm": 0.25462201249484734, + "learning_rate": 0.00012515663494848378, + "loss": 1.0612, + "step": 9367 + }, + { + "epoch": 0.9, + "grad_norm": 0.30057529749822187, + "learning_rate": 0.0001251413238970423, + "loss": 1.1008, + "step": 9368 + }, + { + "epoch": 0.9, + "grad_norm": 0.28606400293322526, + "learning_rate": 0.00012512601221642338, + "loss": 0.9646, + "step": 9369 + }, + { + "epoch": 0.9, + "grad_norm": 0.2718761895079578, + "learning_rate": 0.00012511069990701022, + "loss": 1.1127, + "step": 9370 + }, + { + "epoch": 0.9, + "grad_norm": 0.26749842342835667, + "learning_rate": 0.00012509538696918606, + "loss": 1.0016, + "step": 9371 + }, + { + "epoch": 0.9, + "grad_norm": 0.24422845567032508, + "learning_rate": 0.00012508007340333402, + "loss": 1.0075, + "step": 9372 + }, + { + "epoch": 0.9, + "grad_norm": 0.26485906439128654, + "learning_rate": 0.00012506475920983742, + "loss": 1.0334, + "step": 9373 + }, + { + "epoch": 0.9, + "grad_norm": 0.23407388212556063, + "learning_rate": 0.00012504944438907945, + "loss": 0.9974, + "step": 9374 + }, + { + "epoch": 0.9, + "grad_norm": 0.2948257650153299, + "learning_rate": 0.00012503412894144337, + "loss": 1.0004, + "step": 9375 + }, + { + "epoch": 0.9, + "grad_norm": 0.2624520239691215, + "learning_rate": 0.0001250188128673125, + "loss": 1.113, + "step": 9376 + }, + { + "epoch": 0.9, + "grad_norm": 0.28609464800481466, + "learning_rate": 0.00012500349616707013, + "loss": 0.9897, + "step": 9377 + }, + { + "epoch": 0.9, + "grad_norm": 0.2647060271102632, + "learning_rate": 0.0001249881788410995, + "loss": 1.1142, + "step": 9378 + }, + { + "epoch": 0.9, + "grad_norm": 0.3319227336941671, + "learning_rate": 0.00012497286088978407, + "loss": 1.1371, + "step": 9379 + }, + { + "epoch": 0.9, + "grad_norm": 0.290890609063919, + "learning_rate": 0.00012495754231350704, + "loss": 0.9454, + "step": 9380 + }, + { + "epoch": 0.9, + "grad_norm": 0.275011083994787, + "learning_rate": 0.00012494222311265185, + "loss": 1.0498, + "step": 9381 + }, + { + "epoch": 0.9, + "grad_norm": 0.29328964264372026, + "learning_rate": 0.00012492690328760184, + "loss": 1.2401, + "step": 9382 + }, + { + "epoch": 0.9, + "grad_norm": 0.2849134201992309, + "learning_rate": 0.00012491158283874042, + "loss": 1.0384, + "step": 9383 + }, + { + "epoch": 0.9, + "grad_norm": 0.2644889309340255, + "learning_rate": 0.00012489626176645098, + "loss": 1.0729, + "step": 9384 + }, + { + "epoch": 0.9, + "grad_norm": 0.2671846138345653, + "learning_rate": 0.00012488094007111694, + "loss": 1.0643, + "step": 9385 + }, + { + "epoch": 0.9, + "grad_norm": 0.28342755899615574, + "learning_rate": 0.00012486561775312176, + "loss": 1.0652, + "step": 9386 + }, + { + "epoch": 0.9, + "grad_norm": 0.2708718366533827, + "learning_rate": 0.00012485029481284883, + "loss": 1.0131, + "step": 9387 + }, + { + "epoch": 0.9, + "grad_norm": 0.2893275903976366, + "learning_rate": 0.00012483497125068168, + "loss": 1.1254, + "step": 9388 + }, + { + "epoch": 0.9, + "grad_norm": 0.29651203987564034, + "learning_rate": 0.00012481964706700374, + "loss": 1.0964, + "step": 9389 + }, + { + "epoch": 0.9, + "grad_norm": 0.2668243860412548, + "learning_rate": 0.00012480432226219857, + "loss": 1.0646, + "step": 9390 + }, + { + "epoch": 0.9, + "grad_norm": 0.2607022670832829, + "learning_rate": 0.0001247889968366496, + "loss": 1.094, + "step": 9391 + }, + { + "epoch": 0.9, + "grad_norm": 0.3032946090686901, + "learning_rate": 0.00012477367079074045, + "loss": 1.0649, + "step": 9392 + }, + { + "epoch": 0.9, + "grad_norm": 0.31332018726986793, + "learning_rate": 0.0001247583441248546, + "loss": 1.0273, + "step": 9393 + }, + { + "epoch": 0.9, + "grad_norm": 0.25149055452189184, + "learning_rate": 0.00012474301683937562, + "loss": 1.0565, + "step": 9394 + }, + { + "epoch": 0.9, + "grad_norm": 0.31857711281294315, + "learning_rate": 0.00012472768893468712, + "loss": 1.1011, + "step": 9395 + }, + { + "epoch": 0.9, + "grad_norm": 0.27584446547396924, + "learning_rate": 0.00012471236041117263, + "loss": 1.1056, + "step": 9396 + }, + { + "epoch": 0.9, + "grad_norm": 0.2710593279658963, + "learning_rate": 0.00012469703126921582, + "loss": 1.0088, + "step": 9397 + }, + { + "epoch": 0.9, + "grad_norm": 0.2924314964891264, + "learning_rate": 0.00012468170150920028, + "loss": 1.0443, + "step": 9398 + }, + { + "epoch": 0.9, + "grad_norm": 0.3100262243859713, + "learning_rate": 0.00012466637113150964, + "loss": 1.068, + "step": 9399 + }, + { + "epoch": 0.9, + "grad_norm": 0.3203008665877059, + "learning_rate": 0.00012465104013652755, + "loss": 1.0662, + "step": 9400 + }, + { + "epoch": 0.9, + "grad_norm": 0.28159144815769815, + "learning_rate": 0.00012463570852463767, + "loss": 1.0416, + "step": 9401 + }, + { + "epoch": 0.9, + "grad_norm": 0.2670293148269375, + "learning_rate": 0.00012462037629622374, + "loss": 1.0158, + "step": 9402 + }, + { + "epoch": 0.9, + "grad_norm": 0.28251780180802666, + "learning_rate": 0.00012460504345166942, + "loss": 1.0844, + "step": 9403 + }, + { + "epoch": 0.9, + "grad_norm": 0.2762339006289285, + "learning_rate": 0.00012458970999135839, + "loss": 1.0557, + "step": 9404 + }, + { + "epoch": 0.9, + "grad_norm": 0.26743346835091364, + "learning_rate": 0.00012457437591567442, + "loss": 1.0952, + "step": 9405 + }, + { + "epoch": 0.9, + "grad_norm": 0.28037210231320464, + "learning_rate": 0.00012455904122500128, + "loss": 0.9932, + "step": 9406 + }, + { + "epoch": 0.9, + "grad_norm": 0.2629358216186376, + "learning_rate": 0.00012454370591972268, + "loss": 0.9753, + "step": 9407 + }, + { + "epoch": 0.9, + "grad_norm": 0.2749296239864377, + "learning_rate": 0.0001245283700002224, + "loss": 1.0294, + "step": 9408 + }, + { + "epoch": 0.9, + "grad_norm": 0.3005194418484203, + "learning_rate": 0.00012451303346688424, + "loss": 0.9774, + "step": 9409 + }, + { + "epoch": 0.9, + "grad_norm": 0.2994528395818742, + "learning_rate": 0.00012449769632009205, + "loss": 1.0247, + "step": 9410 + }, + { + "epoch": 0.9, + "grad_norm": 0.2491825962382407, + "learning_rate": 0.00012448235856022958, + "loss": 1.0416, + "step": 9411 + }, + { + "epoch": 0.9, + "grad_norm": 0.24946941218248858, + "learning_rate": 0.0001244670201876807, + "loss": 1.0701, + "step": 9412 + }, + { + "epoch": 0.9, + "grad_norm": 0.30337869321931366, + "learning_rate": 0.0001244516812028293, + "loss": 1.1049, + "step": 9413 + }, + { + "epoch": 0.9, + "grad_norm": 0.30488539436890616, + "learning_rate": 0.00012443634160605918, + "loss": 1.0473, + "step": 9414 + }, + { + "epoch": 0.9, + "grad_norm": 0.307492284380008, + "learning_rate": 0.00012442100139775425, + "loss": 1.0344, + "step": 9415 + }, + { + "epoch": 0.9, + "grad_norm": 0.3091356412029321, + "learning_rate": 0.00012440566057829843, + "loss": 1.0037, + "step": 9416 + }, + { + "epoch": 0.9, + "grad_norm": 0.27659203912070673, + "learning_rate": 0.0001243903191480756, + "loss": 0.9788, + "step": 9417 + }, + { + "epoch": 0.9, + "grad_norm": 0.295213821134966, + "learning_rate": 0.00012437497710746974, + "loss": 1.0793, + "step": 9418 + }, + { + "epoch": 0.9, + "grad_norm": 0.2921351405650045, + "learning_rate": 0.00012435963445686472, + "loss": 1.0692, + "step": 9419 + }, + { + "epoch": 0.9, + "grad_norm": 0.32103907136316706, + "learning_rate": 0.00012434429119664457, + "loss": 1.1415, + "step": 9420 + }, + { + "epoch": 0.9, + "grad_norm": 0.30821488009357795, + "learning_rate": 0.0001243289473271932, + "loss": 1.1574, + "step": 9421 + }, + { + "epoch": 0.9, + "grad_norm": 0.3020517741494569, + "learning_rate": 0.00012431360284889464, + "loss": 1.1252, + "step": 9422 + }, + { + "epoch": 0.9, + "grad_norm": 0.30111749627739215, + "learning_rate": 0.0001242982577621329, + "loss": 1.0815, + "step": 9423 + }, + { + "epoch": 0.9, + "grad_norm": 0.2654478388360173, + "learning_rate": 0.000124282912067292, + "loss": 1.0287, + "step": 9424 + }, + { + "epoch": 0.9, + "grad_norm": 0.2845065279316126, + "learning_rate": 0.00012426756576475593, + "loss": 1.0703, + "step": 9425 + }, + { + "epoch": 0.9, + "grad_norm": 0.24604429019373228, + "learning_rate": 0.00012425221885490882, + "loss": 0.982, + "step": 9426 + }, + { + "epoch": 0.9, + "grad_norm": 0.3204878522592504, + "learning_rate": 0.00012423687133813466, + "loss": 0.9874, + "step": 9427 + }, + { + "epoch": 0.9, + "grad_norm": 0.2698848880583595, + "learning_rate": 0.00012422152321481754, + "loss": 1.1232, + "step": 9428 + }, + { + "epoch": 0.9, + "grad_norm": 0.2956334280898767, + "learning_rate": 0.00012420617448534162, + "loss": 1.1245, + "step": 9429 + }, + { + "epoch": 0.9, + "grad_norm": 0.28215348025828507, + "learning_rate": 0.00012419082515009093, + "loss": 1.0598, + "step": 9430 + }, + { + "epoch": 0.9, + "grad_norm": 0.29800328339448867, + "learning_rate": 0.00012417547520944967, + "loss": 0.9549, + "step": 9431 + }, + { + "epoch": 0.9, + "grad_norm": 0.2758307830968502, + "learning_rate": 0.00012416012466380194, + "loss": 1.0299, + "step": 9432 + }, + { + "epoch": 0.9, + "grad_norm": 0.27789615338120255, + "learning_rate": 0.00012414477351353192, + "loss": 0.955, + "step": 9433 + }, + { + "epoch": 0.9, + "grad_norm": 0.28216151632355363, + "learning_rate": 0.00012412942175902376, + "loss": 1.0228, + "step": 9434 + }, + { + "epoch": 0.9, + "grad_norm": 0.3646066899027749, + "learning_rate": 0.00012411406940066163, + "loss": 1.2245, + "step": 9435 + }, + { + "epoch": 0.9, + "grad_norm": 0.28097256754194205, + "learning_rate": 0.0001240987164388298, + "loss": 1.0332, + "step": 9436 + }, + { + "epoch": 0.9, + "grad_norm": 0.25589371187045934, + "learning_rate": 0.00012408336287391243, + "loss": 1.0571, + "step": 9437 + }, + { + "epoch": 0.9, + "grad_norm": 0.2924108280699943, + "learning_rate": 0.00012406800870629373, + "loss": 1.0727, + "step": 9438 + }, + { + "epoch": 0.9, + "grad_norm": 0.26309668552925775, + "learning_rate": 0.00012405265393635804, + "loss": 0.9953, + "step": 9439 + }, + { + "epoch": 0.9, + "grad_norm": 0.3055036695706396, + "learning_rate": 0.00012403729856448956, + "loss": 1.1705, + "step": 9440 + }, + { + "epoch": 0.9, + "grad_norm": 0.26404553824001786, + "learning_rate": 0.00012402194259107256, + "loss": 0.8035, + "step": 9441 + }, + { + "epoch": 0.9, + "grad_norm": 0.263456979982817, + "learning_rate": 0.00012400658601649135, + "loss": 1.0515, + "step": 9442 + }, + { + "epoch": 0.9, + "grad_norm": 0.2697295571066528, + "learning_rate": 0.00012399122884113024, + "loss": 1.0783, + "step": 9443 + }, + { + "epoch": 0.9, + "grad_norm": 0.3032996814397787, + "learning_rate": 0.00012397587106537355, + "loss": 1.107, + "step": 9444 + }, + { + "epoch": 0.9, + "grad_norm": 0.30209747282548055, + "learning_rate": 0.0001239605126896056, + "loss": 1.0748, + "step": 9445 + }, + { + "epoch": 0.9, + "grad_norm": 0.2541798522215149, + "learning_rate": 0.0001239451537142108, + "loss": 0.9973, + "step": 9446 + }, + { + "epoch": 0.9, + "grad_norm": 0.26055885474176194, + "learning_rate": 0.0001239297941395735, + "loss": 0.9793, + "step": 9447 + }, + { + "epoch": 0.9, + "grad_norm": 0.2937502415391966, + "learning_rate": 0.00012391443396607798, + "loss": 1.0501, + "step": 9448 + }, + { + "epoch": 0.9, + "grad_norm": 0.2484096340317221, + "learning_rate": 0.00012389907319410877, + "loss": 0.9737, + "step": 9449 + }, + { + "epoch": 0.9, + "grad_norm": 0.34708042996433486, + "learning_rate": 0.00012388371182405023, + "loss": 1.0665, + "step": 9450 + }, + { + "epoch": 0.9, + "grad_norm": 0.2870344403909073, + "learning_rate": 0.0001238683498562868, + "loss": 1.0346, + "step": 9451 + }, + { + "epoch": 0.9, + "grad_norm": 0.32368230775180656, + "learning_rate": 0.00012385298729120287, + "loss": 0.9915, + "step": 9452 + }, + { + "epoch": 0.9, + "grad_norm": 0.2902156260509134, + "learning_rate": 0.00012383762412918297, + "loss": 1.0838, + "step": 9453 + }, + { + "epoch": 0.9, + "grad_norm": 0.26483538433540343, + "learning_rate": 0.00012382226037061157, + "loss": 1.0152, + "step": 9454 + }, + { + "epoch": 0.9, + "grad_norm": 0.31506688541873007, + "learning_rate": 0.0001238068960158731, + "loss": 1.0498, + "step": 9455 + }, + { + "epoch": 0.9, + "grad_norm": 0.31606368328175294, + "learning_rate": 0.00012379153106535212, + "loss": 1.0568, + "step": 9456 + }, + { + "epoch": 0.9, + "grad_norm": 0.26749250980306544, + "learning_rate": 0.00012377616551943312, + "loss": 1.1441, + "step": 9457 + }, + { + "epoch": 0.9, + "grad_norm": 0.2805411852480236, + "learning_rate": 0.0001237607993785006, + "loss": 1.0273, + "step": 9458 + }, + { + "epoch": 0.9, + "grad_norm": 0.29590637384614576, + "learning_rate": 0.0001237454326429392, + "loss": 1.2318, + "step": 9459 + }, + { + "epoch": 0.91, + "grad_norm": 0.277157897803464, + "learning_rate": 0.00012373006531313338, + "loss": 1.0606, + "step": 9460 + }, + { + "epoch": 0.91, + "grad_norm": 0.2820422335962146, + "learning_rate": 0.0001237146973894678, + "loss": 1.0724, + "step": 9461 + }, + { + "epoch": 0.91, + "grad_norm": 0.28970884694413745, + "learning_rate": 0.00012369932887232695, + "loss": 1.0501, + "step": 9462 + }, + { + "epoch": 0.91, + "grad_norm": 0.26821460214493453, + "learning_rate": 0.00012368395976209554, + "loss": 1.1002, + "step": 9463 + }, + { + "epoch": 0.91, + "grad_norm": 0.3020120162788039, + "learning_rate": 0.00012366859005915817, + "loss": 1.2129, + "step": 9464 + }, + { + "epoch": 0.91, + "grad_norm": 0.25290616792649784, + "learning_rate": 0.00012365321976389942, + "loss": 0.9608, + "step": 9465 + }, + { + "epoch": 0.91, + "grad_norm": 0.3118979180099709, + "learning_rate": 0.000123637848876704, + "loss": 1.0709, + "step": 9466 + }, + { + "epoch": 0.91, + "grad_norm": 0.31154007441669707, + "learning_rate": 0.00012362247739795658, + "loss": 1.0235, + "step": 9467 + }, + { + "epoch": 0.91, + "grad_norm": 0.28995095468089827, + "learning_rate": 0.00012360710532804178, + "loss": 1.201, + "step": 9468 + }, + { + "epoch": 0.91, + "grad_norm": 0.3110913941186855, + "learning_rate": 0.00012359173266734435, + "loss": 1.1567, + "step": 9469 + }, + { + "epoch": 0.91, + "grad_norm": 0.27514587325982237, + "learning_rate": 0.00012357635941624898, + "loss": 1.0559, + "step": 9470 + }, + { + "epoch": 0.91, + "grad_norm": 0.30517981996469423, + "learning_rate": 0.00012356098557514037, + "loss": 1.0068, + "step": 9471 + }, + { + "epoch": 0.91, + "grad_norm": 0.2725388181887218, + "learning_rate": 0.00012354561114440334, + "loss": 0.9899, + "step": 9472 + }, + { + "epoch": 0.91, + "grad_norm": 0.297899718417481, + "learning_rate": 0.00012353023612442254, + "loss": 1.0983, + "step": 9473 + }, + { + "epoch": 0.91, + "grad_norm": 0.2848396496021322, + "learning_rate": 0.00012351486051558283, + "loss": 1.0539, + "step": 9474 + }, + { + "epoch": 0.91, + "grad_norm": 0.2852186312128984, + "learning_rate": 0.00012349948431826895, + "loss": 1.0044, + "step": 9475 + }, + { + "epoch": 0.91, + "grad_norm": 0.29012020789960563, + "learning_rate": 0.0001234841075328657, + "loss": 0.9673, + "step": 9476 + }, + { + "epoch": 0.91, + "grad_norm": 0.24672860443444639, + "learning_rate": 0.0001234687301597579, + "loss": 1.0161, + "step": 9477 + }, + { + "epoch": 0.91, + "grad_norm": 0.2726877358325203, + "learning_rate": 0.0001234533521993304, + "loss": 1.0621, + "step": 9478 + }, + { + "epoch": 0.91, + "grad_norm": 0.28948332424230283, + "learning_rate": 0.00012343797365196797, + "loss": 0.9913, + "step": 9479 + }, + { + "epoch": 0.91, + "grad_norm": 0.26969250387705407, + "learning_rate": 0.00012342259451805557, + "loss": 1.1202, + "step": 9480 + }, + { + "epoch": 0.91, + "grad_norm": 0.3258916039185605, + "learning_rate": 0.000123407214797978, + "loss": 1.0168, + "step": 9481 + }, + { + "epoch": 0.91, + "grad_norm": 0.25973605245424924, + "learning_rate": 0.00012339183449212017, + "loss": 1.0874, + "step": 9482 + }, + { + "epoch": 0.91, + "grad_norm": 0.2676443845035316, + "learning_rate": 0.00012337645360086698, + "loss": 0.9667, + "step": 9483 + }, + { + "epoch": 0.91, + "grad_norm": 0.30203903279304206, + "learning_rate": 0.00012336107212460338, + "loss": 1.0537, + "step": 9484 + }, + { + "epoch": 0.91, + "grad_norm": 0.27476191579196646, + "learning_rate": 0.00012334569006371422, + "loss": 1.0224, + "step": 9485 + }, + { + "epoch": 0.91, + "grad_norm": 0.26328678406410266, + "learning_rate": 0.0001233303074185845, + "loss": 1.1169, + "step": 9486 + }, + { + "epoch": 0.91, + "grad_norm": 0.29307038009665376, + "learning_rate": 0.0001233149241895992, + "loss": 1.0547, + "step": 9487 + }, + { + "epoch": 0.91, + "grad_norm": 0.3029822127610837, + "learning_rate": 0.00012329954037714326, + "loss": 1.0455, + "step": 9488 + }, + { + "epoch": 0.91, + "grad_norm": 0.31316815984313306, + "learning_rate": 0.00012328415598160167, + "loss": 1.0461, + "step": 9489 + }, + { + "epoch": 0.91, + "grad_norm": 0.26416179108140025, + "learning_rate": 0.00012326877100335946, + "loss": 0.9973, + "step": 9490 + }, + { + "epoch": 0.91, + "grad_norm": 0.27917647838809806, + "learning_rate": 0.0001232533854428016, + "loss": 1.1014, + "step": 9491 + }, + { + "epoch": 0.91, + "grad_norm": 0.2880924083809222, + "learning_rate": 0.00012323799930031318, + "loss": 1.1198, + "step": 9492 + }, + { + "epoch": 0.91, + "grad_norm": 0.2738883771003005, + "learning_rate": 0.0001232226125762792, + "loss": 0.9753, + "step": 9493 + }, + { + "epoch": 0.91, + "grad_norm": 0.28505026809880435, + "learning_rate": 0.00012320722527108476, + "loss": 1.0131, + "step": 9494 + }, + { + "epoch": 0.91, + "grad_norm": 0.2700388433926024, + "learning_rate": 0.00012319183738511495, + "loss": 1.0627, + "step": 9495 + }, + { + "epoch": 0.91, + "grad_norm": 0.21724216250627967, + "learning_rate": 0.0001231764489187548, + "loss": 0.969, + "step": 9496 + }, + { + "epoch": 0.91, + "grad_norm": 0.30567642004976847, + "learning_rate": 0.00012316105987238946, + "loss": 1.1282, + "step": 9497 + }, + { + "epoch": 0.91, + "grad_norm": 0.2960952448251748, + "learning_rate": 0.00012314567024640405, + "loss": 1.0719, + "step": 9498 + }, + { + "epoch": 0.91, + "grad_norm": 0.2679330934652599, + "learning_rate": 0.00012313028004118368, + "loss": 1.0671, + "step": 9499 + }, + { + "epoch": 0.91, + "grad_norm": 0.30977923520733813, + "learning_rate": 0.00012311488925711352, + "loss": 1.0649, + "step": 9500 + }, + { + "epoch": 0.91, + "grad_norm": 0.2877059133094227, + "learning_rate": 0.00012309949789457872, + "loss": 0.99, + "step": 9501 + }, + { + "epoch": 0.91, + "grad_norm": 0.2747597630168646, + "learning_rate": 0.0001230841059539645, + "loss": 1.0708, + "step": 9502 + }, + { + "epoch": 0.91, + "grad_norm": 0.2849182513539573, + "learning_rate": 0.00012306871343565598, + "loss": 1.1457, + "step": 9503 + }, + { + "epoch": 0.91, + "grad_norm": 0.2816422560556322, + "learning_rate": 0.00012305332034003843, + "loss": 1.0653, + "step": 9504 + }, + { + "epoch": 0.91, + "grad_norm": 0.2449196404919519, + "learning_rate": 0.00012303792666749704, + "loss": 1.0025, + "step": 9505 + }, + { + "epoch": 0.91, + "grad_norm": 0.28277588561340145, + "learning_rate": 0.00012302253241841705, + "loss": 1.0981, + "step": 9506 + }, + { + "epoch": 0.91, + "grad_norm": 0.28881985968739604, + "learning_rate": 0.00012300713759318374, + "loss": 0.8564, + "step": 9507 + }, + { + "epoch": 0.91, + "grad_norm": 0.2658935819955603, + "learning_rate": 0.00012299174219218236, + "loss": 0.9685, + "step": 9508 + }, + { + "epoch": 0.91, + "grad_norm": 0.33536541922122687, + "learning_rate": 0.00012297634621579815, + "loss": 1.0644, + "step": 9509 + }, + { + "epoch": 0.91, + "grad_norm": 0.2883666884244152, + "learning_rate": 0.00012296094966441644, + "loss": 1.0163, + "step": 9510 + }, + { + "epoch": 0.91, + "grad_norm": 0.2874022639714324, + "learning_rate": 0.00012294555253842258, + "loss": 1.0547, + "step": 9511 + }, + { + "epoch": 0.91, + "grad_norm": 0.28615282013866633, + "learning_rate": 0.0001229301548382018, + "loss": 1.0267, + "step": 9512 + }, + { + "epoch": 0.91, + "grad_norm": 0.31559288068892855, + "learning_rate": 0.0001229147565641395, + "loss": 1.0427, + "step": 9513 + }, + { + "epoch": 0.91, + "grad_norm": 0.29067613727477026, + "learning_rate": 0.000122899357716621, + "loss": 1.0771, + "step": 9514 + }, + { + "epoch": 0.91, + "grad_norm": 0.3091379944841265, + "learning_rate": 0.00012288395829603168, + "loss": 1.1228, + "step": 9515 + }, + { + "epoch": 0.91, + "grad_norm": 0.34288634284416575, + "learning_rate": 0.0001228685583027569, + "loss": 1.07, + "step": 9516 + }, + { + "epoch": 0.91, + "grad_norm": 0.25806670007511917, + "learning_rate": 0.00012285315773718215, + "loss": 1.177, + "step": 9517 + }, + { + "epoch": 0.91, + "grad_norm": 0.28617035042389005, + "learning_rate": 0.00012283775659969272, + "loss": 0.9902, + "step": 9518 + }, + { + "epoch": 0.91, + "grad_norm": 0.2699590057863018, + "learning_rate": 0.00012282235489067406, + "loss": 1.0114, + "step": 9519 + }, + { + "epoch": 0.91, + "grad_norm": 0.2668076542427271, + "learning_rate": 0.00012280695261051168, + "loss": 1.0479, + "step": 9520 + }, + { + "epoch": 0.91, + "grad_norm": 0.27069952546890097, + "learning_rate": 0.00012279154975959093, + "loss": 0.9784, + "step": 9521 + }, + { + "epoch": 0.91, + "grad_norm": 0.26601560597027246, + "learning_rate": 0.00012277614633829736, + "loss": 0.9275, + "step": 9522 + }, + { + "epoch": 0.91, + "grad_norm": 0.34915490477463507, + "learning_rate": 0.00012276074234701637, + "loss": 1.0747, + "step": 9523 + }, + { + "epoch": 0.91, + "grad_norm": 0.2963938117016832, + "learning_rate": 0.00012274533778613354, + "loss": 0.968, + "step": 9524 + }, + { + "epoch": 0.91, + "grad_norm": 0.28627744365026025, + "learning_rate": 0.00012272993265603432, + "loss": 1.0285, + "step": 9525 + }, + { + "epoch": 0.91, + "grad_norm": 0.2543047213053924, + "learning_rate": 0.00012271452695710423, + "loss": 0.9785, + "step": 9526 + }, + { + "epoch": 0.91, + "grad_norm": 0.30369638655565206, + "learning_rate": 0.00012269912068972887, + "loss": 1.0088, + "step": 9527 + }, + { + "epoch": 0.91, + "grad_norm": 0.30048051465973585, + "learning_rate": 0.00012268371385429371, + "loss": 1.0032, + "step": 9528 + }, + { + "epoch": 0.91, + "grad_norm": 0.24363694434494604, + "learning_rate": 0.0001226683064511844, + "loss": 1.083, + "step": 9529 + }, + { + "epoch": 0.91, + "grad_norm": 0.27300145834174794, + "learning_rate": 0.0001226528984807864, + "loss": 1.056, + "step": 9530 + }, + { + "epoch": 0.91, + "grad_norm": 0.2840057926179831, + "learning_rate": 0.00012263748994348543, + "loss": 1.0906, + "step": 9531 + }, + { + "epoch": 0.91, + "grad_norm": 0.2867767195608796, + "learning_rate": 0.00012262208083966707, + "loss": 1.0343, + "step": 9532 + }, + { + "epoch": 0.91, + "grad_norm": 0.30097203291437813, + "learning_rate": 0.00012260667116971687, + "loss": 1.1535, + "step": 9533 + }, + { + "epoch": 0.91, + "grad_norm": 0.26057797605093236, + "learning_rate": 0.0001225912609340205, + "loss": 0.9956, + "step": 9534 + }, + { + "epoch": 0.91, + "grad_norm": 0.2722485625219177, + "learning_rate": 0.00012257585013296368, + "loss": 0.9609, + "step": 9535 + }, + { + "epoch": 0.91, + "grad_norm": 0.31674933662782234, + "learning_rate": 0.00012256043876693199, + "loss": 1.1484, + "step": 9536 + }, + { + "epoch": 0.91, + "grad_norm": 0.24732657574619976, + "learning_rate": 0.00012254502683631114, + "loss": 1.11, + "step": 9537 + }, + { + "epoch": 0.91, + "grad_norm": 0.302444859564973, + "learning_rate": 0.00012252961434148685, + "loss": 1.1601, + "step": 9538 + }, + { + "epoch": 0.91, + "grad_norm": 0.3039631031521198, + "learning_rate": 0.0001225142012828448, + "loss": 1.1319, + "step": 9539 + }, + { + "epoch": 0.91, + "grad_norm": 0.2833615843410249, + "learning_rate": 0.0001224987876607707, + "loss": 1.1211, + "step": 9540 + }, + { + "epoch": 0.91, + "grad_norm": 0.24087751384697798, + "learning_rate": 0.0001224833734756503, + "loss": 0.9828, + "step": 9541 + }, + { + "epoch": 0.91, + "grad_norm": 0.3022341870078711, + "learning_rate": 0.00012246795872786938, + "loss": 1.0778, + "step": 9542 + }, + { + "epoch": 0.91, + "grad_norm": 0.3125008887305215, + "learning_rate": 0.00012245254341781362, + "loss": 0.9715, + "step": 9543 + }, + { + "epoch": 0.91, + "grad_norm": 0.26961102066302006, + "learning_rate": 0.0001224371275458689, + "loss": 1.0219, + "step": 9544 + }, + { + "epoch": 0.91, + "grad_norm": 0.30552221121015993, + "learning_rate": 0.00012242171111242093, + "loss": 0.9941, + "step": 9545 + }, + { + "epoch": 0.91, + "grad_norm": 0.30365825120398526, + "learning_rate": 0.00012240629411785557, + "loss": 0.9531, + "step": 9546 + }, + { + "epoch": 0.91, + "grad_norm": 0.27544518032809634, + "learning_rate": 0.0001223908765625586, + "loss": 1.0634, + "step": 9547 + }, + { + "epoch": 0.91, + "grad_norm": 0.2469345335356712, + "learning_rate": 0.00012237545844691585, + "loss": 1.0724, + "step": 9548 + }, + { + "epoch": 0.91, + "grad_norm": 0.2956105861133632, + "learning_rate": 0.00012236003977131324, + "loss": 1.1523, + "step": 9549 + }, + { + "epoch": 0.91, + "grad_norm": 0.2968897322282517, + "learning_rate": 0.00012234462053613653, + "loss": 1.07, + "step": 9550 + }, + { + "epoch": 0.91, + "grad_norm": 0.26572173065636373, + "learning_rate": 0.0001223292007417717, + "loss": 1.0555, + "step": 9551 + }, + { + "epoch": 0.91, + "grad_norm": 0.24837107366849048, + "learning_rate": 0.00012231378038860455, + "loss": 1.1674, + "step": 9552 + }, + { + "epoch": 0.91, + "grad_norm": 0.27292429243707583, + "learning_rate": 0.00012229835947702103, + "loss": 1.1317, + "step": 9553 + }, + { + "epoch": 0.91, + "grad_norm": 0.31026913454307414, + "learning_rate": 0.00012228293800740705, + "loss": 1.1446, + "step": 9554 + }, + { + "epoch": 0.91, + "grad_norm": 0.3072735487423723, + "learning_rate": 0.00012226751598014854, + "loss": 1.1987, + "step": 9555 + }, + { + "epoch": 0.91, + "grad_norm": 0.24847902065517485, + "learning_rate": 0.00012225209339563145, + "loss": 1.1041, + "step": 9556 + }, + { + "epoch": 0.91, + "grad_norm": 0.2924020829498602, + "learning_rate": 0.00012223667025424172, + "loss": 1.0721, + "step": 9557 + }, + { + "epoch": 0.91, + "grad_norm": 0.2613223633837016, + "learning_rate": 0.00012222124655636538, + "loss": 1.1266, + "step": 9558 + }, + { + "epoch": 0.91, + "grad_norm": 0.2894905328595956, + "learning_rate": 0.00012220582230238839, + "loss": 1.0732, + "step": 9559 + }, + { + "epoch": 0.91, + "grad_norm": 0.28681410216353886, + "learning_rate": 0.00012219039749269668, + "loss": 1.0915, + "step": 9560 + }, + { + "epoch": 0.91, + "grad_norm": 0.2943586385473255, + "learning_rate": 0.00012217497212767636, + "loss": 1.0788, + "step": 9561 + }, + { + "epoch": 0.91, + "grad_norm": 0.285929389887662, + "learning_rate": 0.00012215954620771344, + "loss": 1.1005, + "step": 9562 + }, + { + "epoch": 0.91, + "grad_norm": 0.3149078861825459, + "learning_rate": 0.00012214411973319396, + "loss": 1.0933, + "step": 9563 + }, + { + "epoch": 0.91, + "grad_norm": 0.24280112314190866, + "learning_rate": 0.0001221286927045039, + "loss": 0.954, + "step": 9564 + }, + { + "epoch": 0.92, + "grad_norm": 0.24225580439211833, + "learning_rate": 0.00012211326512202945, + "loss": 0.9918, + "step": 9565 + }, + { + "epoch": 0.92, + "grad_norm": 0.3066920417822219, + "learning_rate": 0.00012209783698615665, + "loss": 1.0828, + "step": 9566 + }, + { + "epoch": 0.92, + "grad_norm": 0.3112335184826233, + "learning_rate": 0.00012208240829727156, + "loss": 1.1426, + "step": 9567 + }, + { + "epoch": 0.92, + "grad_norm": 0.263752238595472, + "learning_rate": 0.00012206697905576034, + "loss": 1.0959, + "step": 9568 + }, + { + "epoch": 0.92, + "grad_norm": 0.3013509779853578, + "learning_rate": 0.0001220515492620091, + "loss": 1.0237, + "step": 9569 + }, + { + "epoch": 0.92, + "grad_norm": 0.29198852460496516, + "learning_rate": 0.00012203611891640398, + "loss": 1.1627, + "step": 9570 + }, + { + "epoch": 0.92, + "grad_norm": 0.26850931222903324, + "learning_rate": 0.00012202068801933112, + "loss": 1.1425, + "step": 9571 + }, + { + "epoch": 0.92, + "grad_norm": 0.30251674022231434, + "learning_rate": 0.00012200525657117673, + "loss": 1.1273, + "step": 9572 + }, + { + "epoch": 0.92, + "grad_norm": 0.3018015700917101, + "learning_rate": 0.00012198982457232698, + "loss": 0.9809, + "step": 9573 + }, + { + "epoch": 0.92, + "grad_norm": 0.273598131245597, + "learning_rate": 0.000121974392023168, + "loss": 1.0571, + "step": 9574 + }, + { + "epoch": 0.92, + "grad_norm": 0.30277174215701025, + "learning_rate": 0.00012195895892408609, + "loss": 0.9031, + "step": 9575 + }, + { + "epoch": 0.92, + "grad_norm": 0.28490196205477425, + "learning_rate": 0.00012194352527546739, + "loss": 1.0131, + "step": 9576 + }, + { + "epoch": 0.92, + "grad_norm": 0.2715100350623435, + "learning_rate": 0.0001219280910776982, + "loss": 1.0771, + "step": 9577 + }, + { + "epoch": 0.92, + "grad_norm": 0.2948896697832583, + "learning_rate": 0.00012191265633116473, + "loss": 1.0838, + "step": 9578 + }, + { + "epoch": 0.92, + "grad_norm": 0.2570476008819864, + "learning_rate": 0.00012189722103625332, + "loss": 1.0897, + "step": 9579 + }, + { + "epoch": 0.92, + "grad_norm": 0.2923182653513327, + "learning_rate": 0.00012188178519335014, + "loss": 1.063, + "step": 9580 + }, + { + "epoch": 0.92, + "grad_norm": 0.24843565354969446, + "learning_rate": 0.00012186634880284155, + "loss": 1.0832, + "step": 9581 + }, + { + "epoch": 0.92, + "grad_norm": 0.30405197780168924, + "learning_rate": 0.00012185091186511383, + "loss": 1.0964, + "step": 9582 + }, + { + "epoch": 0.92, + "grad_norm": 0.28552359904155467, + "learning_rate": 0.00012183547438055334, + "loss": 1.0048, + "step": 9583 + }, + { + "epoch": 0.92, + "grad_norm": 0.25623311580886565, + "learning_rate": 0.00012182003634954635, + "loss": 1.1127, + "step": 9584 + }, + { + "epoch": 0.92, + "grad_norm": 0.28043711484669764, + "learning_rate": 0.00012180459777247924, + "loss": 0.9836, + "step": 9585 + }, + { + "epoch": 0.92, + "grad_norm": 0.29306608514635557, + "learning_rate": 0.00012178915864973839, + "loss": 1.0981, + "step": 9586 + }, + { + "epoch": 0.92, + "grad_norm": 0.2509994097045669, + "learning_rate": 0.00012177371898171011, + "loss": 1.0955, + "step": 9587 + }, + { + "epoch": 0.92, + "grad_norm": 0.25595385108043606, + "learning_rate": 0.00012175827876878085, + "loss": 1.0905, + "step": 9588 + }, + { + "epoch": 0.92, + "grad_norm": 0.29970697558699483, + "learning_rate": 0.00012174283801133701, + "loss": 1.08, + "step": 9589 + }, + { + "epoch": 0.92, + "grad_norm": 0.25132811288192697, + "learning_rate": 0.00012172739670976497, + "loss": 0.9933, + "step": 9590 + }, + { + "epoch": 0.92, + "grad_norm": 0.2795967397200255, + "learning_rate": 0.00012171195486445115, + "loss": 1.0812, + "step": 9591 + }, + { + "epoch": 0.92, + "grad_norm": 0.27861509817589664, + "learning_rate": 0.00012169651247578205, + "loss": 1.0368, + "step": 9592 + }, + { + "epoch": 0.92, + "grad_norm": 0.2916254831937168, + "learning_rate": 0.00012168106954414406, + "loss": 1.0809, + "step": 9593 + }, + { + "epoch": 0.92, + "grad_norm": 0.2922945494777507, + "learning_rate": 0.00012166562606992368, + "loss": 1.1979, + "step": 9594 + }, + { + "epoch": 0.92, + "grad_norm": 0.27637344653802837, + "learning_rate": 0.0001216501820535074, + "loss": 0.957, + "step": 9595 + }, + { + "epoch": 0.92, + "grad_norm": 0.2890101477293797, + "learning_rate": 0.00012163473749528169, + "loss": 1.0743, + "step": 9596 + }, + { + "epoch": 0.92, + "grad_norm": 0.27375555736611096, + "learning_rate": 0.0001216192923956331, + "loss": 1.0294, + "step": 9597 + }, + { + "epoch": 0.92, + "grad_norm": 0.3020964878540027, + "learning_rate": 0.00012160384675494811, + "loss": 1.117, + "step": 9598 + }, + { + "epoch": 0.92, + "grad_norm": 0.31564811381117275, + "learning_rate": 0.00012158840057361332, + "loss": 1.1155, + "step": 9599 + }, + { + "epoch": 0.92, + "grad_norm": 0.30570724996592125, + "learning_rate": 0.00012157295385201522, + "loss": 1.0954, + "step": 9600 + }, + { + "epoch": 0.92, + "grad_norm": 0.2772559483117768, + "learning_rate": 0.00012155750659054035, + "loss": 1.0597, + "step": 9601 + }, + { + "epoch": 0.92, + "grad_norm": 0.27996328612745197, + "learning_rate": 0.00012154205878957539, + "loss": 1.0686, + "step": 9602 + }, + { + "epoch": 0.92, + "grad_norm": 0.2862321218720333, + "learning_rate": 0.00012152661044950684, + "loss": 0.9934, + "step": 9603 + }, + { + "epoch": 0.92, + "grad_norm": 0.2853452333425892, + "learning_rate": 0.00012151116157072132, + "loss": 1.1348, + "step": 9604 + }, + { + "epoch": 0.92, + "grad_norm": 0.2617628600038958, + "learning_rate": 0.00012149571215360547, + "loss": 0.97, + "step": 9605 + }, + { + "epoch": 0.92, + "grad_norm": 0.25696280278663025, + "learning_rate": 0.00012148026219854594, + "loss": 1.0261, + "step": 9606 + }, + { + "epoch": 0.92, + "grad_norm": 0.310031454374293, + "learning_rate": 0.00012146481170592933, + "loss": 1.0158, + "step": 9607 + }, + { + "epoch": 0.92, + "grad_norm": 0.2790355327820539, + "learning_rate": 0.0001214493606761423, + "loss": 1.1757, + "step": 9608 + }, + { + "epoch": 0.92, + "grad_norm": 0.30770923269815564, + "learning_rate": 0.00012143390910957157, + "loss": 1.1518, + "step": 9609 + }, + { + "epoch": 0.92, + "grad_norm": 0.30029358695910546, + "learning_rate": 0.00012141845700660379, + "loss": 1.1968, + "step": 9610 + }, + { + "epoch": 0.92, + "grad_norm": 0.3098726943709629, + "learning_rate": 0.00012140300436762564, + "loss": 1.0352, + "step": 9611 + }, + { + "epoch": 0.92, + "grad_norm": 0.2959419907143475, + "learning_rate": 0.00012138755119302388, + "loss": 1.0186, + "step": 9612 + }, + { + "epoch": 0.92, + "grad_norm": 0.27785698197656766, + "learning_rate": 0.00012137209748318521, + "loss": 1.0375, + "step": 9613 + }, + { + "epoch": 0.92, + "grad_norm": 0.2930400342362595, + "learning_rate": 0.00012135664323849634, + "loss": 0.9418, + "step": 9614 + }, + { + "epoch": 0.92, + "grad_norm": 0.28085631013272555, + "learning_rate": 0.00012134118845934405, + "loss": 1.0465, + "step": 9615 + }, + { + "epoch": 0.92, + "grad_norm": 0.2541256790591151, + "learning_rate": 0.00012132573314611516, + "loss": 1.0293, + "step": 9616 + }, + { + "epoch": 0.92, + "grad_norm": 0.28908641994102546, + "learning_rate": 0.00012131027729919633, + "loss": 1.078, + "step": 9617 + }, + { + "epoch": 0.92, + "grad_norm": 0.2772893250241189, + "learning_rate": 0.00012129482091897446, + "loss": 1.1151, + "step": 9618 + }, + { + "epoch": 0.92, + "grad_norm": 0.327164314875272, + "learning_rate": 0.00012127936400583629, + "loss": 1.0591, + "step": 9619 + }, + { + "epoch": 0.92, + "grad_norm": 0.2886231869799765, + "learning_rate": 0.00012126390656016866, + "loss": 1.0113, + "step": 9620 + }, + { + "epoch": 0.92, + "grad_norm": 0.283680263633664, + "learning_rate": 0.0001212484485823584, + "loss": 1.0249, + "step": 9621 + }, + { + "epoch": 0.92, + "grad_norm": 0.26111321954434336, + "learning_rate": 0.00012123299007279238, + "loss": 1.0342, + "step": 9622 + }, + { + "epoch": 0.92, + "grad_norm": 0.2955195301556354, + "learning_rate": 0.00012121753103185745, + "loss": 0.9461, + "step": 9623 + }, + { + "epoch": 0.92, + "grad_norm": 0.27409190677236495, + "learning_rate": 0.00012120207145994045, + "loss": 1.0637, + "step": 9624 + }, + { + "epoch": 0.92, + "grad_norm": 0.26924614303903166, + "learning_rate": 0.00012118661135742828, + "loss": 0.9253, + "step": 9625 + }, + { + "epoch": 0.92, + "grad_norm": 0.2692477485784918, + "learning_rate": 0.00012117115072470788, + "loss": 1.0827, + "step": 9626 + }, + { + "epoch": 0.92, + "grad_norm": 0.26774970205249055, + "learning_rate": 0.0001211556895621661, + "loss": 1.0528, + "step": 9627 + }, + { + "epoch": 0.92, + "grad_norm": 0.29867507929874776, + "learning_rate": 0.00012114022787018988, + "loss": 1.0436, + "step": 9628 + }, + { + "epoch": 0.92, + "grad_norm": 0.3208070747287111, + "learning_rate": 0.00012112476564916622, + "loss": 1.1365, + "step": 9629 + }, + { + "epoch": 0.92, + "grad_norm": 0.2972966476878782, + "learning_rate": 0.000121109302899482, + "loss": 1.1186, + "step": 9630 + }, + { + "epoch": 0.92, + "grad_norm": 0.30544954734093066, + "learning_rate": 0.00012109383962152416, + "loss": 1.1047, + "step": 9631 + }, + { + "epoch": 0.92, + "grad_norm": 0.2967657674672275, + "learning_rate": 0.00012107837581567977, + "loss": 1.072, + "step": 9632 + }, + { + "epoch": 0.92, + "grad_norm": 0.3033096939499795, + "learning_rate": 0.00012106291148233579, + "loss": 1.0254, + "step": 9633 + }, + { + "epoch": 0.92, + "grad_norm": 0.26626395824315857, + "learning_rate": 0.00012104744662187922, + "loss": 0.9941, + "step": 9634 + }, + { + "epoch": 0.92, + "grad_norm": 0.2895890185705288, + "learning_rate": 0.00012103198123469704, + "loss": 1.0531, + "step": 9635 + }, + { + "epoch": 0.92, + "grad_norm": 0.2957215592529329, + "learning_rate": 0.00012101651532117632, + "loss": 1.0199, + "step": 9636 + }, + { + "epoch": 0.92, + "grad_norm": 0.2966423558454665, + "learning_rate": 0.00012100104888170407, + "loss": 0.9559, + "step": 9637 + }, + { + "epoch": 0.92, + "grad_norm": 0.3146917874326988, + "learning_rate": 0.00012098558191666742, + "loss": 1.1623, + "step": 9638 + }, + { + "epoch": 0.92, + "grad_norm": 0.33430522861396766, + "learning_rate": 0.00012097011442645337, + "loss": 1.0295, + "step": 9639 + }, + { + "epoch": 0.92, + "grad_norm": 0.2725271575959663, + "learning_rate": 0.00012095464641144902, + "loss": 1.0745, + "step": 9640 + }, + { + "epoch": 0.92, + "grad_norm": 0.32836523657529365, + "learning_rate": 0.00012093917787204148, + "loss": 1.0739, + "step": 9641 + }, + { + "epoch": 0.92, + "grad_norm": 0.2905477137452141, + "learning_rate": 0.00012092370880861786, + "loss": 0.9218, + "step": 9642 + }, + { + "epoch": 0.92, + "grad_norm": 0.28312434963987126, + "learning_rate": 0.00012090823922156526, + "loss": 1.1302, + "step": 9643 + }, + { + "epoch": 0.92, + "grad_norm": 0.31119878698668474, + "learning_rate": 0.00012089276911127088, + "loss": 1.1067, + "step": 9644 + }, + { + "epoch": 0.92, + "grad_norm": 0.24999334472319124, + "learning_rate": 0.00012087729847812176, + "loss": 1.0741, + "step": 9645 + }, + { + "epoch": 0.92, + "grad_norm": 0.2543284979471403, + "learning_rate": 0.00012086182732250517, + "loss": 0.9579, + "step": 9646 + }, + { + "epoch": 0.92, + "grad_norm": 0.2594464862081111, + "learning_rate": 0.00012084635564480824, + "loss": 1.0921, + "step": 9647 + }, + { + "epoch": 0.92, + "grad_norm": 0.3158864173446626, + "learning_rate": 0.00012083088344541813, + "loss": 1.1075, + "step": 9648 + }, + { + "epoch": 0.92, + "grad_norm": 0.29277153816070817, + "learning_rate": 0.00012081541072472208, + "loss": 1.0819, + "step": 9649 + }, + { + "epoch": 0.92, + "grad_norm": 0.285551700825859, + "learning_rate": 0.00012079993748310729, + "loss": 1.1289, + "step": 9650 + }, + { + "epoch": 0.92, + "grad_norm": 0.26578837779844633, + "learning_rate": 0.00012078446372096102, + "loss": 1.0786, + "step": 9651 + }, + { + "epoch": 0.92, + "grad_norm": 0.2770368943423453, + "learning_rate": 0.00012076898943867046, + "loss": 1.2504, + "step": 9652 + }, + { + "epoch": 0.92, + "grad_norm": 0.2915791206912121, + "learning_rate": 0.0001207535146366229, + "loss": 1.0691, + "step": 9653 + }, + { + "epoch": 0.92, + "grad_norm": 0.29701485898418156, + "learning_rate": 0.00012073803931520557, + "loss": 1.0343, + "step": 9654 + }, + { + "epoch": 0.92, + "grad_norm": 0.3164931927910207, + "learning_rate": 0.0001207225634748058, + "loss": 1.0555, + "step": 9655 + }, + { + "epoch": 0.92, + "grad_norm": 0.26381700408017583, + "learning_rate": 0.00012070708711581083, + "loss": 1.1291, + "step": 9656 + }, + { + "epoch": 0.92, + "grad_norm": 0.29962327944839096, + "learning_rate": 0.00012069161023860802, + "loss": 1.039, + "step": 9657 + }, + { + "epoch": 0.92, + "grad_norm": 0.3144393601015309, + "learning_rate": 0.00012067613284358461, + "loss": 1.1407, + "step": 9658 + }, + { + "epoch": 0.92, + "grad_norm": 0.29485729590956794, + "learning_rate": 0.00012066065493112803, + "loss": 0.9772, + "step": 9659 + }, + { + "epoch": 0.92, + "grad_norm": 0.3317695176709224, + "learning_rate": 0.00012064517650162555, + "loss": 1.0321, + "step": 9660 + }, + { + "epoch": 0.92, + "grad_norm": 0.2795257128963298, + "learning_rate": 0.00012062969755546456, + "loss": 1.1162, + "step": 9661 + }, + { + "epoch": 0.92, + "grad_norm": 0.3308060051457206, + "learning_rate": 0.00012061421809303241, + "loss": 0.9852, + "step": 9662 + }, + { + "epoch": 0.92, + "grad_norm": 0.284967097310632, + "learning_rate": 0.00012059873811471651, + "loss": 0.9865, + "step": 9663 + }, + { + "epoch": 0.92, + "grad_norm": 0.31468502602583576, + "learning_rate": 0.00012058325762090426, + "loss": 1.0112, + "step": 9664 + }, + { + "epoch": 0.92, + "grad_norm": 0.2760079699322398, + "learning_rate": 0.00012056777661198301, + "loss": 0.9597, + "step": 9665 + }, + { + "epoch": 0.92, + "grad_norm": 0.2911837084304875, + "learning_rate": 0.00012055229508834027, + "loss": 1.0258, + "step": 9666 + }, + { + "epoch": 0.92, + "grad_norm": 0.30981871374584724, + "learning_rate": 0.00012053681305036342, + "loss": 1.1091, + "step": 9667 + }, + { + "epoch": 0.92, + "grad_norm": 0.28199493834041306, + "learning_rate": 0.00012052133049843992, + "loss": 0.9613, + "step": 9668 + }, + { + "epoch": 0.93, + "grad_norm": 0.2833626576757252, + "learning_rate": 0.00012050584743295718, + "loss": 1.0918, + "step": 9669 + }, + { + "epoch": 0.93, + "grad_norm": 0.32179313490464345, + "learning_rate": 0.00012049036385430277, + "loss": 1.0589, + "step": 9670 + }, + { + "epoch": 0.93, + "grad_norm": 0.3529820688530236, + "learning_rate": 0.0001204748797628641, + "loss": 1.0392, + "step": 9671 + }, + { + "epoch": 0.93, + "grad_norm": 0.316182685666834, + "learning_rate": 0.00012045939515902872, + "loss": 1.091, + "step": 9672 + }, + { + "epoch": 0.93, + "grad_norm": 0.3132427307808237, + "learning_rate": 0.00012044391004318409, + "loss": 1.1227, + "step": 9673 + }, + { + "epoch": 0.93, + "grad_norm": 0.2997545510559344, + "learning_rate": 0.0001204284244157178, + "loss": 1.0994, + "step": 9674 + }, + { + "epoch": 0.93, + "grad_norm": 0.2536805673335262, + "learning_rate": 0.00012041293827701729, + "loss": 0.9636, + "step": 9675 + }, + { + "epoch": 0.93, + "grad_norm": 0.2876440010059215, + "learning_rate": 0.00012039745162747022, + "loss": 1.1536, + "step": 9676 + }, + { + "epoch": 0.93, + "grad_norm": 0.2691609443880876, + "learning_rate": 0.0001203819644674641, + "loss": 1.1113, + "step": 9677 + }, + { + "epoch": 0.93, + "grad_norm": 0.25701727543705255, + "learning_rate": 0.00012036647679738649, + "loss": 0.9819, + "step": 9678 + }, + { + "epoch": 0.93, + "grad_norm": 0.29817078458954743, + "learning_rate": 0.00012035098861762502, + "loss": 0.9931, + "step": 9679 + }, + { + "epoch": 0.93, + "grad_norm": 0.2642831901167002, + "learning_rate": 0.00012033549992856726, + "loss": 1.0036, + "step": 9680 + }, + { + "epoch": 0.93, + "grad_norm": 0.29959073213056514, + "learning_rate": 0.00012032001073060082, + "loss": 1.1449, + "step": 9681 + }, + { + "epoch": 0.93, + "grad_norm": 0.32204374587710666, + "learning_rate": 0.00012030452102411333, + "loss": 0.9079, + "step": 9682 + }, + { + "epoch": 0.93, + "grad_norm": 0.30732832828204176, + "learning_rate": 0.00012028903080949248, + "loss": 0.8944, + "step": 9683 + }, + { + "epoch": 0.93, + "grad_norm": 0.27203865429830504, + "learning_rate": 0.00012027354008712588, + "loss": 0.9959, + "step": 9684 + }, + { + "epoch": 0.93, + "grad_norm": 0.31723507213333874, + "learning_rate": 0.0001202580488574012, + "loss": 1.0008, + "step": 9685 + }, + { + "epoch": 0.93, + "grad_norm": 0.2753881383976869, + "learning_rate": 0.00012024255712070607, + "loss": 0.9831, + "step": 9686 + }, + { + "epoch": 0.93, + "grad_norm": 0.2905972632173138, + "learning_rate": 0.00012022706487742827, + "loss": 0.9718, + "step": 9687 + }, + { + "epoch": 0.93, + "grad_norm": 0.3441351333384633, + "learning_rate": 0.00012021157212795544, + "loss": 1.1038, + "step": 9688 + }, + { + "epoch": 0.93, + "grad_norm": 0.30829573394192783, + "learning_rate": 0.00012019607887267532, + "loss": 1.0747, + "step": 9689 + }, + { + "epoch": 0.93, + "grad_norm": 0.3116599969382794, + "learning_rate": 0.00012018058511197563, + "loss": 0.9906, + "step": 9690 + }, + { + "epoch": 0.93, + "grad_norm": 0.32153118671256326, + "learning_rate": 0.00012016509084624413, + "loss": 1.0987, + "step": 9691 + }, + { + "epoch": 0.93, + "grad_norm": 0.30020496331906704, + "learning_rate": 0.00012014959607586853, + "loss": 0.9893, + "step": 9692 + }, + { + "epoch": 0.93, + "grad_norm": 0.2488961913424749, + "learning_rate": 0.00012013410080123666, + "loss": 1.0128, + "step": 9693 + }, + { + "epoch": 0.93, + "grad_norm": 0.2894864987548495, + "learning_rate": 0.00012011860502273625, + "loss": 1.0898, + "step": 9694 + }, + { + "epoch": 0.93, + "grad_norm": 0.339459814596149, + "learning_rate": 0.0001201031087407551, + "loss": 0.972, + "step": 9695 + }, + { + "epoch": 0.93, + "grad_norm": 0.28936642655855416, + "learning_rate": 0.00012008761195568101, + "loss": 1.1054, + "step": 9696 + }, + { + "epoch": 0.93, + "grad_norm": 0.29865336998518743, + "learning_rate": 0.00012007211466790183, + "loss": 1.0518, + "step": 9697 + }, + { + "epoch": 0.93, + "grad_norm": 0.29410810287938477, + "learning_rate": 0.00012005661687780537, + "loss": 1.0348, + "step": 9698 + }, + { + "epoch": 0.93, + "grad_norm": 0.28715006026168827, + "learning_rate": 0.00012004111858577942, + "loss": 1.0277, + "step": 9699 + }, + { + "epoch": 0.93, + "grad_norm": 0.3246035439472215, + "learning_rate": 0.00012002561979221191, + "loss": 1.0978, + "step": 9700 + }, + { + "epoch": 0.93, + "grad_norm": 0.28714503427033067, + "learning_rate": 0.00012001012049749067, + "loss": 1.0528, + "step": 9701 + }, + { + "epoch": 0.93, + "grad_norm": 0.2999877264540504, + "learning_rate": 0.0001199946207020036, + "loss": 1.0535, + "step": 9702 + }, + { + "epoch": 0.93, + "grad_norm": 0.2831842528289797, + "learning_rate": 0.00011997912040613856, + "loss": 1.0861, + "step": 9703 + }, + { + "epoch": 0.93, + "grad_norm": 0.27863916487179596, + "learning_rate": 0.00011996361961028351, + "loss": 1.0172, + "step": 9704 + }, + { + "epoch": 0.93, + "grad_norm": 0.3076428593870626, + "learning_rate": 0.00011994811831482633, + "loss": 1.154, + "step": 9705 + }, + { + "epoch": 0.93, + "grad_norm": 0.3178394621331127, + "learning_rate": 0.00011993261652015493, + "loss": 1.0459, + "step": 9706 + }, + { + "epoch": 0.93, + "grad_norm": 0.2834780610000245, + "learning_rate": 0.00011991711422665728, + "loss": 0.8991, + "step": 9707 + }, + { + "epoch": 0.93, + "grad_norm": 0.28248971237831144, + "learning_rate": 0.00011990161143472134, + "loss": 1.0412, + "step": 9708 + }, + { + "epoch": 0.93, + "grad_norm": 0.37293406629713854, + "learning_rate": 0.00011988610814473504, + "loss": 1.1236, + "step": 9709 + }, + { + "epoch": 0.93, + "grad_norm": 0.29203647947031824, + "learning_rate": 0.00011987060435708643, + "loss": 0.972, + "step": 9710 + }, + { + "epoch": 0.93, + "grad_norm": 0.2784065820475021, + "learning_rate": 0.00011985510007216343, + "loss": 1.0975, + "step": 9711 + }, + { + "epoch": 0.93, + "grad_norm": 0.2967922411536282, + "learning_rate": 0.0001198395952903541, + "loss": 1.0979, + "step": 9712 + }, + { + "epoch": 0.93, + "grad_norm": 0.29200788752529727, + "learning_rate": 0.00011982409001204637, + "loss": 1.0726, + "step": 9713 + }, + { + "epoch": 0.93, + "grad_norm": 0.2692696205517818, + "learning_rate": 0.00011980858423762837, + "loss": 1.0631, + "step": 9714 + }, + { + "epoch": 0.93, + "grad_norm": 0.2965913502544387, + "learning_rate": 0.00011979307796748811, + "loss": 1.1334, + "step": 9715 + }, + { + "epoch": 0.93, + "grad_norm": 0.28309793277521694, + "learning_rate": 0.0001197775712020136, + "loss": 1.1148, + "step": 9716 + }, + { + "epoch": 0.93, + "grad_norm": 0.2842362819659992, + "learning_rate": 0.00011976206394159297, + "loss": 1.089, + "step": 9717 + }, + { + "epoch": 0.93, + "grad_norm": 0.30869020384451756, + "learning_rate": 0.00011974655618661425, + "loss": 1.1634, + "step": 9718 + }, + { + "epoch": 0.93, + "grad_norm": 0.2919947030319265, + "learning_rate": 0.00011973104793746554, + "loss": 0.9976, + "step": 9719 + }, + { + "epoch": 0.93, + "grad_norm": 0.28107687880463733, + "learning_rate": 0.000119715539194535, + "loss": 1.0656, + "step": 9720 + }, + { + "epoch": 0.93, + "grad_norm": 0.2538194528442572, + "learning_rate": 0.00011970002995821069, + "loss": 0.9763, + "step": 9721 + }, + { + "epoch": 0.93, + "grad_norm": 0.3042314896103001, + "learning_rate": 0.0001196845202288807, + "loss": 1.1705, + "step": 9722 + }, + { + "epoch": 0.93, + "grad_norm": 0.2572188222835515, + "learning_rate": 0.00011966901000693325, + "loss": 1.0154, + "step": 9723 + }, + { + "epoch": 0.93, + "grad_norm": 0.28736803000564753, + "learning_rate": 0.00011965349929275646, + "loss": 0.9974, + "step": 9724 + }, + { + "epoch": 0.93, + "grad_norm": 0.31110944588178774, + "learning_rate": 0.00011963798808673852, + "loss": 1.0147, + "step": 9725 + }, + { + "epoch": 0.93, + "grad_norm": 0.2875577964788188, + "learning_rate": 0.00011962247638926755, + "loss": 1.0678, + "step": 9726 + }, + { + "epoch": 0.93, + "grad_norm": 0.2700897281383975, + "learning_rate": 0.00011960696420073181, + "loss": 1.1177, + "step": 9727 + }, + { + "epoch": 0.93, + "grad_norm": 0.2806117435648341, + "learning_rate": 0.00011959145152151947, + "loss": 1.012, + "step": 9728 + }, + { + "epoch": 0.93, + "grad_norm": 0.31209826283604314, + "learning_rate": 0.00011957593835201875, + "loss": 0.9591, + "step": 9729 + }, + { + "epoch": 0.93, + "grad_norm": 0.2750239320324656, + "learning_rate": 0.00011956042469261781, + "loss": 1.1095, + "step": 9730 + }, + { + "epoch": 0.93, + "grad_norm": 0.28264832045438565, + "learning_rate": 0.000119544910543705, + "loss": 1.037, + "step": 9731 + }, + { + "epoch": 0.93, + "grad_norm": 0.3300651441181935, + "learning_rate": 0.00011952939590566852, + "loss": 1.0304, + "step": 9732 + }, + { + "epoch": 0.93, + "grad_norm": 0.2859521337438915, + "learning_rate": 0.0001195138807788966, + "loss": 0.969, + "step": 9733 + }, + { + "epoch": 0.93, + "grad_norm": 0.2828356666065811, + "learning_rate": 0.00011949836516377759, + "loss": 1.004, + "step": 9734 + }, + { + "epoch": 0.93, + "grad_norm": 0.2696647965723336, + "learning_rate": 0.00011948284906069974, + "loss": 1.089, + "step": 9735 + }, + { + "epoch": 0.93, + "grad_norm": 0.2708560032776781, + "learning_rate": 0.00011946733247005131, + "loss": 0.9534, + "step": 9736 + }, + { + "epoch": 0.93, + "grad_norm": 0.3142091199851108, + "learning_rate": 0.00011945181539222065, + "loss": 1.1159, + "step": 9737 + }, + { + "epoch": 0.93, + "grad_norm": 0.2580939496791897, + "learning_rate": 0.00011943629782759611, + "loss": 0.9785, + "step": 9738 + }, + { + "epoch": 0.93, + "grad_norm": 0.29871072295327306, + "learning_rate": 0.00011942077977656601, + "loss": 1.072, + "step": 9739 + }, + { + "epoch": 0.93, + "grad_norm": 0.28134297529949587, + "learning_rate": 0.00011940526123951865, + "loss": 1.0097, + "step": 9740 + }, + { + "epoch": 0.93, + "grad_norm": 0.27030245416409626, + "learning_rate": 0.00011938974221684248, + "loss": 1.0362, + "step": 9741 + }, + { + "epoch": 0.93, + "grad_norm": 0.27891127006754407, + "learning_rate": 0.00011937422270892578, + "loss": 1.1503, + "step": 9742 + }, + { + "epoch": 0.93, + "grad_norm": 0.2527819203001018, + "learning_rate": 0.00011935870271615701, + "loss": 1.0682, + "step": 9743 + }, + { + "epoch": 0.93, + "grad_norm": 0.25930692949225115, + "learning_rate": 0.00011934318223892451, + "loss": 1.062, + "step": 9744 + }, + { + "epoch": 0.93, + "grad_norm": 0.2875000837745984, + "learning_rate": 0.00011932766127761675, + "loss": 0.9477, + "step": 9745 + }, + { + "epoch": 0.93, + "grad_norm": 0.325246715534001, + "learning_rate": 0.00011931213983262211, + "loss": 1.0165, + "step": 9746 + }, + { + "epoch": 0.93, + "grad_norm": 0.25662864976415845, + "learning_rate": 0.00011929661790432903, + "loss": 1.0786, + "step": 9747 + }, + { + "epoch": 0.93, + "grad_norm": 0.260785274906786, + "learning_rate": 0.00011928109549312596, + "loss": 1.0819, + "step": 9748 + }, + { + "epoch": 0.93, + "grad_norm": 0.24971660647552188, + "learning_rate": 0.00011926557259940137, + "loss": 1.1172, + "step": 9749 + }, + { + "epoch": 0.93, + "grad_norm": 0.2634144133215548, + "learning_rate": 0.00011925004922354368, + "loss": 1.0457, + "step": 9750 + }, + { + "epoch": 0.93, + "grad_norm": 0.28481353084008365, + "learning_rate": 0.00011923452536594144, + "loss": 1.0667, + "step": 9751 + }, + { + "epoch": 0.93, + "grad_norm": 0.2480420939966222, + "learning_rate": 0.00011921900102698312, + "loss": 1.0387, + "step": 9752 + }, + { + "epoch": 0.93, + "grad_norm": 0.290303015413711, + "learning_rate": 0.00011920347620705719, + "loss": 0.9624, + "step": 9753 + }, + { + "epoch": 0.93, + "grad_norm": 0.30582174621304614, + "learning_rate": 0.00011918795090655221, + "loss": 0.982, + "step": 9754 + }, + { + "epoch": 0.93, + "grad_norm": 0.2655028218340249, + "learning_rate": 0.00011917242512585674, + "loss": 1.0536, + "step": 9755 + }, + { + "epoch": 0.93, + "grad_norm": 0.27477226526624493, + "learning_rate": 0.00011915689886535923, + "loss": 0.9856, + "step": 9756 + }, + { + "epoch": 0.93, + "grad_norm": 0.28682754344560973, + "learning_rate": 0.00011914137212544831, + "loss": 1.1707, + "step": 9757 + }, + { + "epoch": 0.93, + "grad_norm": 0.26153050639845216, + "learning_rate": 0.00011912584490651253, + "loss": 1.0497, + "step": 9758 + }, + { + "epoch": 0.93, + "grad_norm": 0.28229038067976947, + "learning_rate": 0.00011911031720894046, + "loss": 0.9965, + "step": 9759 + }, + { + "epoch": 0.93, + "grad_norm": 0.28006029083616607, + "learning_rate": 0.00011909478903312066, + "loss": 1.0668, + "step": 9760 + }, + { + "epoch": 0.93, + "grad_norm": 0.27740453645729457, + "learning_rate": 0.00011907926037944179, + "loss": 1.0366, + "step": 9761 + }, + { + "epoch": 0.93, + "grad_norm": 0.2384257527120478, + "learning_rate": 0.00011906373124829244, + "loss": 1.0827, + "step": 9762 + }, + { + "epoch": 0.93, + "grad_norm": 0.293678354692542, + "learning_rate": 0.0001190482016400612, + "loss": 1.0645, + "step": 9763 + }, + { + "epoch": 0.93, + "grad_norm": 0.2938241245866292, + "learning_rate": 0.00011903267155513677, + "loss": 1.055, + "step": 9764 + }, + { + "epoch": 0.93, + "grad_norm": 0.26615653569909886, + "learning_rate": 0.00011901714099390777, + "loss": 1.0972, + "step": 9765 + }, + { + "epoch": 0.93, + "grad_norm": 0.28171980966420157, + "learning_rate": 0.00011900160995676288, + "loss": 0.9422, + "step": 9766 + }, + { + "epoch": 0.93, + "grad_norm": 0.26740119824230185, + "learning_rate": 0.00011898607844409073, + "loss": 1.0326, + "step": 9767 + }, + { + "epoch": 0.93, + "grad_norm": 0.27023112187042453, + "learning_rate": 0.00011897054645628005, + "loss": 0.9777, + "step": 9768 + }, + { + "epoch": 0.93, + "grad_norm": 0.3200885822581613, + "learning_rate": 0.00011895501399371953, + "loss": 1.0587, + "step": 9769 + }, + { + "epoch": 0.93, + "grad_norm": 0.31831634758810756, + "learning_rate": 0.00011893948105679787, + "loss": 1.0764, + "step": 9770 + }, + { + "epoch": 0.93, + "grad_norm": 0.3081776019474058, + "learning_rate": 0.00011892394764590378, + "loss": 1.0104, + "step": 9771 + }, + { + "epoch": 0.93, + "grad_norm": 0.28957521753453463, + "learning_rate": 0.00011890841376142603, + "loss": 1.0592, + "step": 9772 + }, + { + "epoch": 0.93, + "grad_norm": 0.28207411325497395, + "learning_rate": 0.00011889287940375334, + "loss": 1.104, + "step": 9773 + }, + { + "epoch": 0.94, + "grad_norm": 0.36180324747623, + "learning_rate": 0.00011887734457327443, + "loss": 1.0525, + "step": 9774 + }, + { + "epoch": 0.94, + "grad_norm": 0.2910045433010645, + "learning_rate": 0.00011886180927037815, + "loss": 1.087, + "step": 9775 + }, + { + "epoch": 0.94, + "grad_norm": 0.28543697105123744, + "learning_rate": 0.00011884627349545323, + "loss": 1.0952, + "step": 9776 + }, + { + "epoch": 0.94, + "grad_norm": 0.2849455865427251, + "learning_rate": 0.00011883073724888844, + "loss": 1.0036, + "step": 9777 + }, + { + "epoch": 0.94, + "grad_norm": 0.3445968201656461, + "learning_rate": 0.00011881520053107267, + "loss": 1.0498, + "step": 9778 + }, + { + "epoch": 0.94, + "grad_norm": 0.2506321633298875, + "learning_rate": 0.00011879966334239466, + "loss": 1.1624, + "step": 9779 + }, + { + "epoch": 0.94, + "grad_norm": 0.29162103318787663, + "learning_rate": 0.00011878412568324322, + "loss": 1.0233, + "step": 9780 + }, + { + "epoch": 0.94, + "grad_norm": 0.29500101419963837, + "learning_rate": 0.00011876858755400728, + "loss": 1.0922, + "step": 9781 + }, + { + "epoch": 0.94, + "grad_norm": 0.3009884592875106, + "learning_rate": 0.00011875304895507562, + "loss": 1.1165, + "step": 9782 + }, + { + "epoch": 0.94, + "grad_norm": 0.27565694136640945, + "learning_rate": 0.00011873750988683712, + "loss": 1.0688, + "step": 9783 + }, + { + "epoch": 0.94, + "grad_norm": 0.3273050375332514, + "learning_rate": 0.00011872197034968067, + "loss": 1.1118, + "step": 9784 + }, + { + "epoch": 0.94, + "grad_norm": 0.29960996186748606, + "learning_rate": 0.00011870643034399514, + "loss": 1.2125, + "step": 9785 + }, + { + "epoch": 0.94, + "grad_norm": 0.2964779408152491, + "learning_rate": 0.00011869088987016943, + "loss": 1.1393, + "step": 9786 + }, + { + "epoch": 0.94, + "grad_norm": 0.2858838476728597, + "learning_rate": 0.00011867534892859244, + "loss": 1.0267, + "step": 9787 + }, + { + "epoch": 0.94, + "grad_norm": 0.2852488900654838, + "learning_rate": 0.00011865980751965313, + "loss": 1.1121, + "step": 9788 + }, + { + "epoch": 0.94, + "grad_norm": 0.29119397522264384, + "learning_rate": 0.00011864426564374043, + "loss": 1.0825, + "step": 9789 + }, + { + "epoch": 0.94, + "grad_norm": 0.2594508280784169, + "learning_rate": 0.00011862872330124324, + "loss": 1.154, + "step": 9790 + }, + { + "epoch": 0.94, + "grad_norm": 0.2975677876895368, + "learning_rate": 0.00011861318049255052, + "loss": 1.0173, + "step": 9791 + }, + { + "epoch": 0.94, + "grad_norm": 0.2648318009564742, + "learning_rate": 0.00011859763721805128, + "loss": 1.0818, + "step": 9792 + }, + { + "epoch": 0.94, + "grad_norm": 0.2475611229418435, + "learning_rate": 0.00011858209347813449, + "loss": 0.9974, + "step": 9793 + }, + { + "epoch": 0.94, + "grad_norm": 0.24368772705519215, + "learning_rate": 0.00011856654927318914, + "loss": 1.0076, + "step": 9794 + }, + { + "epoch": 0.94, + "grad_norm": 0.2589222900696365, + "learning_rate": 0.0001185510046036042, + "loss": 1.0072, + "step": 9795 + }, + { + "epoch": 0.94, + "grad_norm": 0.2518421468766372, + "learning_rate": 0.00011853545946976874, + "loss": 1.1606, + "step": 9796 + }, + { + "epoch": 0.94, + "grad_norm": 0.295292430219631, + "learning_rate": 0.00011851991387207171, + "loss": 0.98, + "step": 9797 + }, + { + "epoch": 0.94, + "grad_norm": 0.29245516327817817, + "learning_rate": 0.00011850436781090223, + "loss": 1.0599, + "step": 9798 + }, + { + "epoch": 0.94, + "grad_norm": 0.2914565208305866, + "learning_rate": 0.00011848882128664933, + "loss": 0.9911, + "step": 9799 + }, + { + "epoch": 0.94, + "grad_norm": 0.3078689389856212, + "learning_rate": 0.00011847327429970203, + "loss": 0.9396, + "step": 9800 + }, + { + "epoch": 0.94, + "grad_norm": 0.3127304428982239, + "learning_rate": 0.00011845772685044945, + "loss": 1.0645, + "step": 9801 + }, + { + "epoch": 0.94, + "grad_norm": 0.2873883647576085, + "learning_rate": 0.00011844217893928064, + "loss": 0.9597, + "step": 9802 + }, + { + "epoch": 0.94, + "grad_norm": 0.29524076369164753, + "learning_rate": 0.00011842663056658471, + "loss": 1.054, + "step": 9803 + }, + { + "epoch": 0.94, + "grad_norm": 0.30216797407479606, + "learning_rate": 0.00011841108173275078, + "loss": 1.0366, + "step": 9804 + }, + { + "epoch": 0.94, + "grad_norm": 0.2683352824632881, + "learning_rate": 0.00011839553243816794, + "loss": 1.1457, + "step": 9805 + }, + { + "epoch": 0.94, + "grad_norm": 0.3088671649045264, + "learning_rate": 0.00011837998268322535, + "loss": 0.9811, + "step": 9806 + }, + { + "epoch": 0.94, + "grad_norm": 0.3127798071067639, + "learning_rate": 0.00011836443246831215, + "loss": 1.0054, + "step": 9807 + }, + { + "epoch": 0.94, + "grad_norm": 0.2958330910482311, + "learning_rate": 0.00011834888179381746, + "loss": 0.9717, + "step": 9808 + }, + { + "epoch": 0.94, + "grad_norm": 0.24632414148720827, + "learning_rate": 0.00011833333066013051, + "loss": 1.0858, + "step": 9809 + }, + { + "epoch": 0.94, + "grad_norm": 0.2723921866158821, + "learning_rate": 0.00011831777906764044, + "loss": 1.0244, + "step": 9810 + }, + { + "epoch": 0.94, + "grad_norm": 0.2975337869324926, + "learning_rate": 0.00011830222701673639, + "loss": 1.0093, + "step": 9811 + }, + { + "epoch": 0.94, + "grad_norm": 0.28650604212889696, + "learning_rate": 0.00011828667450780764, + "loss": 1.069, + "step": 9812 + }, + { + "epoch": 0.94, + "grad_norm": 0.27777382822296665, + "learning_rate": 0.00011827112154124338, + "loss": 1.1207, + "step": 9813 + }, + { + "epoch": 0.94, + "grad_norm": 0.2870567816741802, + "learning_rate": 0.00011825556811743279, + "loss": 1.1142, + "step": 9814 + }, + { + "epoch": 0.94, + "grad_norm": 0.30386027251939157, + "learning_rate": 0.00011824001423676513, + "loss": 1.1055, + "step": 9815 + }, + { + "epoch": 0.94, + "grad_norm": 0.2705336512672914, + "learning_rate": 0.00011822445989962969, + "loss": 1.1277, + "step": 9816 + }, + { + "epoch": 0.94, + "grad_norm": 0.29969456459847577, + "learning_rate": 0.00011820890510641567, + "loss": 1.0124, + "step": 9817 + }, + { + "epoch": 0.94, + "grad_norm": 0.30298340488413317, + "learning_rate": 0.00011819334985751233, + "loss": 1.0302, + "step": 9818 + }, + { + "epoch": 0.94, + "grad_norm": 0.27778928676842507, + "learning_rate": 0.00011817779415330901, + "loss": 1.0885, + "step": 9819 + }, + { + "epoch": 0.94, + "grad_norm": 0.300820511952422, + "learning_rate": 0.00011816223799419497, + "loss": 1.0825, + "step": 9820 + }, + { + "epoch": 0.94, + "grad_norm": 0.3444116599675863, + "learning_rate": 0.00011814668138055947, + "loss": 1.098, + "step": 9821 + }, + { + "epoch": 0.94, + "grad_norm": 0.27136608383916344, + "learning_rate": 0.0001181311243127919, + "loss": 1.1083, + "step": 9822 + }, + { + "epoch": 0.94, + "grad_norm": 0.29143092481382066, + "learning_rate": 0.00011811556679128153, + "loss": 1.0637, + "step": 9823 + }, + { + "epoch": 0.94, + "grad_norm": 0.25973448401962235, + "learning_rate": 0.00011810000881641771, + "loss": 1.0963, + "step": 9824 + }, + { + "epoch": 0.94, + "grad_norm": 0.2774775599714097, + "learning_rate": 0.00011808445038858982, + "loss": 1.0984, + "step": 9825 + }, + { + "epoch": 0.94, + "grad_norm": 0.263532587397457, + "learning_rate": 0.00011806889150818716, + "loss": 0.9681, + "step": 9826 + }, + { + "epoch": 0.94, + "grad_norm": 0.3141288079300534, + "learning_rate": 0.00011805333217559918, + "loss": 1.0527, + "step": 9827 + }, + { + "epoch": 0.94, + "grad_norm": 0.31147150343587526, + "learning_rate": 0.00011803777239121516, + "loss": 1.0198, + "step": 9828 + }, + { + "epoch": 0.94, + "grad_norm": 0.2732581866158368, + "learning_rate": 0.00011802221215542459, + "loss": 1.0903, + "step": 9829 + }, + { + "epoch": 0.94, + "grad_norm": 0.28505766654611703, + "learning_rate": 0.00011800665146861683, + "loss": 1.0171, + "step": 9830 + }, + { + "epoch": 0.94, + "grad_norm": 0.28825531298858104, + "learning_rate": 0.00011799109033118127, + "loss": 1.1042, + "step": 9831 + }, + { + "epoch": 0.94, + "grad_norm": 0.28681592008334156, + "learning_rate": 0.00011797552874350739, + "loss": 1.0476, + "step": 9832 + }, + { + "epoch": 0.94, + "grad_norm": 0.3123668239526817, + "learning_rate": 0.00011795996670598462, + "loss": 1.0743, + "step": 9833 + }, + { + "epoch": 0.94, + "grad_norm": 0.2418704079745117, + "learning_rate": 0.0001179444042190024, + "loss": 1.0915, + "step": 9834 + }, + { + "epoch": 0.94, + "grad_norm": 0.3233001919313393, + "learning_rate": 0.00011792884128295014, + "loss": 1.1112, + "step": 9835 + }, + { + "epoch": 0.94, + "grad_norm": 0.23543282622618317, + "learning_rate": 0.0001179132778982174, + "loss": 0.9408, + "step": 9836 + }, + { + "epoch": 0.94, + "grad_norm": 0.27953787864369156, + "learning_rate": 0.00011789771406519361, + "loss": 1.0951, + "step": 9837 + }, + { + "epoch": 0.94, + "grad_norm": 0.26888014225579154, + "learning_rate": 0.00011788214978426827, + "loss": 1.0421, + "step": 9838 + }, + { + "epoch": 0.94, + "grad_norm": 0.28757566533635587, + "learning_rate": 0.0001178665850558309, + "loss": 1.1287, + "step": 9839 + }, + { + "epoch": 0.94, + "grad_norm": 0.3053049659715436, + "learning_rate": 0.00011785101988027103, + "loss": 1.0709, + "step": 9840 + }, + { + "epoch": 0.94, + "grad_norm": 0.2693832593847435, + "learning_rate": 0.00011783545425797813, + "loss": 1.178, + "step": 9841 + }, + { + "epoch": 0.94, + "grad_norm": 0.29150202296705147, + "learning_rate": 0.0001178198881893418, + "loss": 1.1276, + "step": 9842 + }, + { + "epoch": 0.94, + "grad_norm": 0.28026413298185754, + "learning_rate": 0.00011780432167475157, + "loss": 1.1026, + "step": 9843 + }, + { + "epoch": 0.94, + "grad_norm": 0.2883237105659363, + "learning_rate": 0.00011778875471459703, + "loss": 1.029, + "step": 9844 + }, + { + "epoch": 0.94, + "grad_norm": 0.2882413609766784, + "learning_rate": 0.00011777318730926768, + "loss": 1.1348, + "step": 9845 + }, + { + "epoch": 0.94, + "grad_norm": 0.3024147348726725, + "learning_rate": 0.00011775761945915315, + "loss": 1.1326, + "step": 9846 + }, + { + "epoch": 0.94, + "grad_norm": 0.27230068738152696, + "learning_rate": 0.00011774205116464304, + "loss": 1.1274, + "step": 9847 + }, + { + "epoch": 0.94, + "grad_norm": 0.2671829447086581, + "learning_rate": 0.00011772648242612694, + "loss": 0.999, + "step": 9848 + }, + { + "epoch": 0.94, + "grad_norm": 0.27183496557565345, + "learning_rate": 0.00011771091324399447, + "loss": 0.9818, + "step": 9849 + }, + { + "epoch": 0.94, + "grad_norm": 0.30565783107867467, + "learning_rate": 0.0001176953436186353, + "loss": 1.0334, + "step": 9850 + }, + { + "epoch": 0.94, + "grad_norm": 0.2672952617830373, + "learning_rate": 0.00011767977355043902, + "loss": 1.014, + "step": 9851 + }, + { + "epoch": 0.94, + "grad_norm": 0.25977834030458224, + "learning_rate": 0.00011766420303979528, + "loss": 1.0847, + "step": 9852 + }, + { + "epoch": 0.94, + "grad_norm": 0.29226617360753704, + "learning_rate": 0.00011764863208709378, + "loss": 1.1198, + "step": 9853 + }, + { + "epoch": 0.94, + "grad_norm": 0.2710445438667288, + "learning_rate": 0.00011763306069272415, + "loss": 1.1502, + "step": 9854 + }, + { + "epoch": 0.94, + "grad_norm": 0.2683767291753003, + "learning_rate": 0.00011761748885707611, + "loss": 1.0218, + "step": 9855 + }, + { + "epoch": 0.94, + "grad_norm": 0.30996151508397074, + "learning_rate": 0.00011760191658053933, + "loss": 1.0196, + "step": 9856 + }, + { + "epoch": 0.94, + "grad_norm": 0.2836262926909511, + "learning_rate": 0.00011758634386350353, + "loss": 0.9859, + "step": 9857 + }, + { + "epoch": 0.94, + "grad_norm": 0.2835130303902405, + "learning_rate": 0.00011757077070635842, + "loss": 0.9518, + "step": 9858 + }, + { + "epoch": 0.94, + "grad_norm": 0.29179997921864015, + "learning_rate": 0.00011755519710949375, + "loss": 1.0851, + "step": 9859 + }, + { + "epoch": 0.94, + "grad_norm": 0.2580784420137259, + "learning_rate": 0.0001175396230732992, + "loss": 1.0792, + "step": 9860 + }, + { + "epoch": 0.94, + "grad_norm": 0.2980701381253143, + "learning_rate": 0.00011752404859816459, + "loss": 1.1523, + "step": 9861 + }, + { + "epoch": 0.94, + "grad_norm": 0.30650598237537346, + "learning_rate": 0.00011750847368447963, + "loss": 1.0509, + "step": 9862 + }, + { + "epoch": 0.94, + "grad_norm": 0.2829225311103543, + "learning_rate": 0.00011749289833263413, + "loss": 1.0698, + "step": 9863 + }, + { + "epoch": 0.94, + "grad_norm": 0.3150147271464283, + "learning_rate": 0.00011747732254301786, + "loss": 1.0618, + "step": 9864 + }, + { + "epoch": 0.94, + "grad_norm": 0.26617589803804353, + "learning_rate": 0.00011746174631602059, + "loss": 0.9886, + "step": 9865 + }, + { + "epoch": 0.94, + "grad_norm": 0.28821178558985444, + "learning_rate": 0.00011744616965203214, + "loss": 0.9826, + "step": 9866 + }, + { + "epoch": 0.94, + "grad_norm": 0.264781657352801, + "learning_rate": 0.00011743059255144233, + "loss": 0.8746, + "step": 9867 + }, + { + "epoch": 0.94, + "grad_norm": 0.29886418377952234, + "learning_rate": 0.000117415015014641, + "loss": 1.0563, + "step": 9868 + }, + { + "epoch": 0.94, + "grad_norm": 0.3002368289446441, + "learning_rate": 0.00011739943704201796, + "loss": 1.0341, + "step": 9869 + }, + { + "epoch": 0.94, + "grad_norm": 0.28512524718120663, + "learning_rate": 0.00011738385863396311, + "loss": 0.9591, + "step": 9870 + }, + { + "epoch": 0.94, + "grad_norm": 0.2900458256329998, + "learning_rate": 0.00011736827979086625, + "loss": 1.0893, + "step": 9871 + }, + { + "epoch": 0.94, + "grad_norm": 0.28440813574018564, + "learning_rate": 0.00011735270051311724, + "loss": 1.052, + "step": 9872 + }, + { + "epoch": 0.94, + "grad_norm": 0.2469565537916454, + "learning_rate": 0.00011733712080110603, + "loss": 0.857, + "step": 9873 + }, + { + "epoch": 0.94, + "grad_norm": 0.2501421742546667, + "learning_rate": 0.00011732154065522247, + "loss": 1.1517, + "step": 9874 + }, + { + "epoch": 0.94, + "grad_norm": 0.2943711294105304, + "learning_rate": 0.00011730596007585646, + "loss": 1.0131, + "step": 9875 + }, + { + "epoch": 0.94, + "grad_norm": 0.2759019579633036, + "learning_rate": 0.00011729037906339795, + "loss": 1.0827, + "step": 9876 + }, + { + "epoch": 0.94, + "grad_norm": 0.2734166501679395, + "learning_rate": 0.00011727479761823683, + "loss": 1.017, + "step": 9877 + }, + { + "epoch": 0.95, + "grad_norm": 0.2945759953073775, + "learning_rate": 0.00011725921574076305, + "loss": 1.048, + "step": 9878 + }, + { + "epoch": 0.95, + "grad_norm": 0.2847423179857022, + "learning_rate": 0.00011724363343136651, + "loss": 0.9387, + "step": 9879 + }, + { + "epoch": 0.95, + "grad_norm": 0.2731998133242859, + "learning_rate": 0.00011722805069043724, + "loss": 1.1269, + "step": 9880 + }, + { + "epoch": 0.95, + "grad_norm": 0.31097755473044353, + "learning_rate": 0.00011721246751836514, + "loss": 1.1183, + "step": 9881 + }, + { + "epoch": 0.95, + "grad_norm": 0.24077282326706342, + "learning_rate": 0.00011719688391554024, + "loss": 1.1315, + "step": 9882 + }, + { + "epoch": 0.95, + "grad_norm": 0.28290762006495423, + "learning_rate": 0.00011718129988235251, + "loss": 1.0301, + "step": 9883 + }, + { + "epoch": 0.95, + "grad_norm": 0.28799605220582924, + "learning_rate": 0.00011716571541919197, + "loss": 1.1275, + "step": 9884 + }, + { + "epoch": 0.95, + "grad_norm": 0.2506151300260542, + "learning_rate": 0.00011715013052644859, + "loss": 1.0488, + "step": 9885 + }, + { + "epoch": 0.95, + "grad_norm": 0.3003785975679057, + "learning_rate": 0.00011713454520451243, + "loss": 1.1445, + "step": 9886 + }, + { + "epoch": 0.95, + "grad_norm": 0.2871681991986448, + "learning_rate": 0.00011711895945377351, + "loss": 1.1341, + "step": 9887 + }, + { + "epoch": 0.95, + "grad_norm": 0.263469877916956, + "learning_rate": 0.00011710337327462186, + "loss": 1.0684, + "step": 9888 + }, + { + "epoch": 0.95, + "grad_norm": 0.25031317343613296, + "learning_rate": 0.00011708778666744756, + "loss": 1.0692, + "step": 9889 + }, + { + "epoch": 0.95, + "grad_norm": 0.27978745157205426, + "learning_rate": 0.00011707219963264063, + "loss": 0.9031, + "step": 9890 + }, + { + "epoch": 0.95, + "grad_norm": 0.2771417917801048, + "learning_rate": 0.00011705661217059121, + "loss": 1.0672, + "step": 9891 + }, + { + "epoch": 0.95, + "grad_norm": 0.27046659772554993, + "learning_rate": 0.00011704102428168931, + "loss": 1.1261, + "step": 9892 + }, + { + "epoch": 0.95, + "grad_norm": 0.2627203026681411, + "learning_rate": 0.00011702543596632512, + "loss": 1.0424, + "step": 9893 + }, + { + "epoch": 0.95, + "grad_norm": 0.24842638162917163, + "learning_rate": 0.00011700984722488865, + "loss": 1.0639, + "step": 9894 + }, + { + "epoch": 0.95, + "grad_norm": 0.297596295779575, + "learning_rate": 0.00011699425805777008, + "loss": 1.1003, + "step": 9895 + }, + { + "epoch": 0.95, + "grad_norm": 0.27759923733902636, + "learning_rate": 0.00011697866846535953, + "loss": 1.1192, + "step": 9896 + }, + { + "epoch": 0.95, + "grad_norm": 0.30285981266810275, + "learning_rate": 0.00011696307844804713, + "loss": 1.1918, + "step": 9897 + }, + { + "epoch": 0.95, + "grad_norm": 0.31450365765502614, + "learning_rate": 0.00011694748800622301, + "loss": 1.0818, + "step": 9898 + }, + { + "epoch": 0.95, + "grad_norm": 0.3141057225598429, + "learning_rate": 0.00011693189714027737, + "loss": 0.9055, + "step": 9899 + }, + { + "epoch": 0.95, + "grad_norm": 0.2718320654630044, + "learning_rate": 0.00011691630585060036, + "loss": 1.182, + "step": 9900 + }, + { + "epoch": 0.95, + "grad_norm": 0.2642413261096164, + "learning_rate": 0.00011690071413758217, + "loss": 1.0622, + "step": 9901 + }, + { + "epoch": 0.95, + "grad_norm": 0.2781872909054998, + "learning_rate": 0.00011688512200161297, + "loss": 1.0917, + "step": 9902 + }, + { + "epoch": 0.95, + "grad_norm": 0.2933665399467418, + "learning_rate": 0.00011686952944308298, + "loss": 1.0172, + "step": 9903 + }, + { + "epoch": 0.95, + "grad_norm": 0.2986798343224596, + "learning_rate": 0.00011685393646238243, + "loss": 1.0515, + "step": 9904 + }, + { + "epoch": 0.95, + "grad_norm": 0.27008347335950034, + "learning_rate": 0.00011683834305990154, + "loss": 1.0519, + "step": 9905 + }, + { + "epoch": 0.95, + "grad_norm": 0.27146126457125797, + "learning_rate": 0.00011682274923603049, + "loss": 1.0363, + "step": 9906 + }, + { + "epoch": 0.95, + "grad_norm": 0.24730678500742276, + "learning_rate": 0.00011680715499115959, + "loss": 1.0349, + "step": 9907 + }, + { + "epoch": 0.95, + "grad_norm": 0.274923895199804, + "learning_rate": 0.00011679156032567911, + "loss": 1.0628, + "step": 9908 + }, + { + "epoch": 0.95, + "grad_norm": 0.28568564007828456, + "learning_rate": 0.00011677596523997922, + "loss": 1.0989, + "step": 9909 + }, + { + "epoch": 0.95, + "grad_norm": 0.26786733880651786, + "learning_rate": 0.00011676036973445028, + "loss": 0.9306, + "step": 9910 + }, + { + "epoch": 0.95, + "grad_norm": 0.30754840140433903, + "learning_rate": 0.00011674477380948255, + "loss": 1.0696, + "step": 9911 + }, + { + "epoch": 0.95, + "grad_norm": 0.25852789773425133, + "learning_rate": 0.00011672917746546634, + "loss": 0.8838, + "step": 9912 + }, + { + "epoch": 0.95, + "grad_norm": 0.2558247705235392, + "learning_rate": 0.00011671358070279193, + "loss": 0.99, + "step": 9913 + }, + { + "epoch": 0.95, + "grad_norm": 0.29138329913400457, + "learning_rate": 0.00011669798352184968, + "loss": 0.9954, + "step": 9914 + }, + { + "epoch": 0.95, + "grad_norm": 0.30260586220267804, + "learning_rate": 0.0001166823859230299, + "loss": 1.0312, + "step": 9915 + }, + { + "epoch": 0.95, + "grad_norm": 0.2827165548781267, + "learning_rate": 0.0001166667879067229, + "loss": 1.131, + "step": 9916 + }, + { + "epoch": 0.95, + "grad_norm": 0.29392517958616593, + "learning_rate": 0.0001166511894733191, + "loss": 1.0896, + "step": 9917 + }, + { + "epoch": 0.95, + "grad_norm": 0.3260991950351522, + "learning_rate": 0.00011663559062320878, + "loss": 1.0871, + "step": 9918 + }, + { + "epoch": 0.95, + "grad_norm": 0.2561723519059954, + "learning_rate": 0.00011661999135678237, + "loss": 1.1483, + "step": 9919 + }, + { + "epoch": 0.95, + "grad_norm": 0.309722846552505, + "learning_rate": 0.00011660439167443022, + "loss": 1.0889, + "step": 9920 + }, + { + "epoch": 0.95, + "grad_norm": 0.25190002272893053, + "learning_rate": 0.00011658879157654276, + "loss": 1.0626, + "step": 9921 + }, + { + "epoch": 0.95, + "grad_norm": 0.3259479008437906, + "learning_rate": 0.00011657319106351035, + "loss": 1.0441, + "step": 9922 + }, + { + "epoch": 0.95, + "grad_norm": 0.26095427143336863, + "learning_rate": 0.0001165575901357234, + "loss": 1.108, + "step": 9923 + }, + { + "epoch": 0.95, + "grad_norm": 0.3334009293399356, + "learning_rate": 0.00011654198879357236, + "loss": 0.9941, + "step": 9924 + }, + { + "epoch": 0.95, + "grad_norm": 0.2479075214464415, + "learning_rate": 0.00011652638703744769, + "loss": 0.9997, + "step": 9925 + }, + { + "epoch": 0.95, + "grad_norm": 0.2699529303379684, + "learning_rate": 0.00011651078486773974, + "loss": 1.0008, + "step": 9926 + }, + { + "epoch": 0.95, + "grad_norm": 0.2793875562565832, + "learning_rate": 0.00011649518228483907, + "loss": 1.1141, + "step": 9927 + }, + { + "epoch": 0.95, + "grad_norm": 0.30640720124070897, + "learning_rate": 0.00011647957928913606, + "loss": 1.088, + "step": 9928 + }, + { + "epoch": 0.95, + "grad_norm": 0.29450119862900453, + "learning_rate": 0.00011646397588102123, + "loss": 1.0208, + "step": 9929 + }, + { + "epoch": 0.95, + "grad_norm": 0.2986882197988017, + "learning_rate": 0.00011644837206088508, + "loss": 1.1405, + "step": 9930 + }, + { + "epoch": 0.95, + "grad_norm": 0.28289114316471997, + "learning_rate": 0.00011643276782911805, + "loss": 1.0624, + "step": 9931 + }, + { + "epoch": 0.95, + "grad_norm": 0.25290685088339804, + "learning_rate": 0.0001164171631861107, + "loss": 0.9819, + "step": 9932 + }, + { + "epoch": 0.95, + "grad_norm": 0.29613853648276806, + "learning_rate": 0.00011640155813225348, + "loss": 0.9768, + "step": 9933 + }, + { + "epoch": 0.95, + "grad_norm": 0.30470164680746314, + "learning_rate": 0.00011638595266793701, + "loss": 1.0121, + "step": 9934 + }, + { + "epoch": 0.95, + "grad_norm": 0.26617140467629113, + "learning_rate": 0.00011637034679355176, + "loss": 1.0911, + "step": 9935 + }, + { + "epoch": 0.95, + "grad_norm": 0.29756347865505667, + "learning_rate": 0.00011635474050948829, + "loss": 1.0299, + "step": 9936 + }, + { + "epoch": 0.95, + "grad_norm": 0.2496000722355499, + "learning_rate": 0.00011633913381613717, + "loss": 0.9958, + "step": 9937 + }, + { + "epoch": 0.95, + "grad_norm": 0.28395690278817887, + "learning_rate": 0.00011632352671388898, + "loss": 1.0131, + "step": 9938 + }, + { + "epoch": 0.95, + "grad_norm": 0.2922841640955166, + "learning_rate": 0.00011630791920313425, + "loss": 1.0615, + "step": 9939 + }, + { + "epoch": 0.95, + "grad_norm": 0.3033319129166104, + "learning_rate": 0.00011629231128426356, + "loss": 1.1185, + "step": 9940 + }, + { + "epoch": 0.95, + "grad_norm": 0.2696093697556044, + "learning_rate": 0.00011627670295766759, + "loss": 0.9108, + "step": 9941 + }, + { + "epoch": 0.95, + "grad_norm": 0.275903786765533, + "learning_rate": 0.00011626109422373688, + "loss": 0.9584, + "step": 9942 + }, + { + "epoch": 0.95, + "grad_norm": 0.2593587234240088, + "learning_rate": 0.00011624548508286206, + "loss": 1.0215, + "step": 9943 + }, + { + "epoch": 0.95, + "grad_norm": 0.25466792159457646, + "learning_rate": 0.00011622987553543376, + "loss": 1.0309, + "step": 9944 + }, + { + "epoch": 0.95, + "grad_norm": 0.28160232777966615, + "learning_rate": 0.00011621426558184265, + "loss": 1.036, + "step": 9945 + }, + { + "epoch": 0.95, + "grad_norm": 0.2665942842206432, + "learning_rate": 0.00011619865522247933, + "loss": 1.0767, + "step": 9946 + }, + { + "epoch": 0.95, + "grad_norm": 0.29079511088912224, + "learning_rate": 0.00011618304445773451, + "loss": 1.0276, + "step": 9947 + }, + { + "epoch": 0.95, + "grad_norm": 0.2803873259698023, + "learning_rate": 0.00011616743328799881, + "loss": 1.0198, + "step": 9948 + }, + { + "epoch": 0.95, + "grad_norm": 0.2555819038764869, + "learning_rate": 0.00011615182171366297, + "loss": 1.0222, + "step": 9949 + }, + { + "epoch": 0.95, + "grad_norm": 0.31734739800334083, + "learning_rate": 0.00011613620973511758, + "loss": 1.1547, + "step": 9950 + }, + { + "epoch": 0.95, + "grad_norm": 0.28917459887517855, + "learning_rate": 0.00011612059735275342, + "loss": 1.1102, + "step": 9951 + }, + { + "epoch": 0.95, + "grad_norm": 0.24841835752111638, + "learning_rate": 0.00011610498456696119, + "loss": 1.1191, + "step": 9952 + }, + { + "epoch": 0.95, + "grad_norm": 0.29654024864058454, + "learning_rate": 0.00011608937137813161, + "loss": 0.9583, + "step": 9953 + }, + { + "epoch": 0.95, + "grad_norm": 0.3147918475594126, + "learning_rate": 0.00011607375778665536, + "loss": 1.1377, + "step": 9954 + }, + { + "epoch": 0.95, + "grad_norm": 0.2772820949372226, + "learning_rate": 0.00011605814379292325, + "loss": 1.0474, + "step": 9955 + }, + { + "epoch": 0.95, + "grad_norm": 0.30109169932300395, + "learning_rate": 0.00011604252939732601, + "loss": 1.0939, + "step": 9956 + }, + { + "epoch": 0.95, + "grad_norm": 0.2868717018893174, + "learning_rate": 0.00011602691460025437, + "loss": 0.9908, + "step": 9957 + }, + { + "epoch": 0.95, + "grad_norm": 0.2675493227671909, + "learning_rate": 0.00011601129940209911, + "loss": 1.0693, + "step": 9958 + }, + { + "epoch": 0.95, + "grad_norm": 0.31375505859098296, + "learning_rate": 0.00011599568380325106, + "loss": 1.1187, + "step": 9959 + }, + { + "epoch": 0.95, + "grad_norm": 0.295148232115398, + "learning_rate": 0.00011598006780410091, + "loss": 1.0427, + "step": 9960 + }, + { + "epoch": 0.95, + "grad_norm": 0.28936388619727343, + "learning_rate": 0.00011596445140503957, + "loss": 0.9698, + "step": 9961 + }, + { + "epoch": 0.95, + "grad_norm": 0.28216120171352943, + "learning_rate": 0.0001159488346064578, + "loss": 1.0421, + "step": 9962 + }, + { + "epoch": 0.95, + "grad_norm": 0.28268688737805364, + "learning_rate": 0.00011593321740874639, + "loss": 1.0307, + "step": 9963 + }, + { + "epoch": 0.95, + "grad_norm": 0.27612823655418073, + "learning_rate": 0.00011591759981229622, + "loss": 0.9663, + "step": 9964 + }, + { + "epoch": 0.95, + "grad_norm": 0.2740428689355272, + "learning_rate": 0.00011590198181749811, + "loss": 1.0961, + "step": 9965 + }, + { + "epoch": 0.95, + "grad_norm": 0.2499724504263894, + "learning_rate": 0.0001158863634247429, + "loss": 1.0651, + "step": 9966 + }, + { + "epoch": 0.95, + "grad_norm": 0.2757709129720417, + "learning_rate": 0.00011587074463442147, + "loss": 1.0643, + "step": 9967 + }, + { + "epoch": 0.95, + "grad_norm": 0.2985207172650015, + "learning_rate": 0.00011585512544692467, + "loss": 1.1078, + "step": 9968 + }, + { + "epoch": 0.95, + "grad_norm": 0.3045737397007342, + "learning_rate": 0.00011583950586264343, + "loss": 1.1594, + "step": 9969 + }, + { + "epoch": 0.95, + "grad_norm": 0.27637988509985273, + "learning_rate": 0.00011582388588196855, + "loss": 1.0231, + "step": 9970 + }, + { + "epoch": 0.95, + "grad_norm": 0.2821414694181937, + "learning_rate": 0.000115808265505291, + "loss": 1.1147, + "step": 9971 + }, + { + "epoch": 0.95, + "grad_norm": 0.29155806162009856, + "learning_rate": 0.00011579264473300167, + "loss": 1.0431, + "step": 9972 + }, + { + "epoch": 0.95, + "grad_norm": 0.27636257246936596, + "learning_rate": 0.00011577702356549149, + "loss": 1.0565, + "step": 9973 + }, + { + "epoch": 0.95, + "grad_norm": 0.31102867461907796, + "learning_rate": 0.00011576140200315135, + "loss": 1.1162, + "step": 9974 + }, + { + "epoch": 0.95, + "grad_norm": 0.28686875083828667, + "learning_rate": 0.00011574578004637226, + "loss": 0.9938, + "step": 9975 + }, + { + "epoch": 0.95, + "grad_norm": 0.2829922774553789, + "learning_rate": 0.00011573015769554512, + "loss": 1.043, + "step": 9976 + }, + { + "epoch": 0.95, + "grad_norm": 0.30378521566991484, + "learning_rate": 0.00011571453495106086, + "loss": 1.125, + "step": 9977 + }, + { + "epoch": 0.95, + "grad_norm": 0.2882399502995974, + "learning_rate": 0.00011569891181331054, + "loss": 1.0316, + "step": 9978 + }, + { + "epoch": 0.95, + "grad_norm": 0.2546929646781789, + "learning_rate": 0.00011568328828268506, + "loss": 1.0347, + "step": 9979 + }, + { + "epoch": 0.95, + "grad_norm": 0.27885757954603946, + "learning_rate": 0.00011566766435957541, + "loss": 1.0919, + "step": 9980 + }, + { + "epoch": 0.95, + "grad_norm": 0.26559169992251663, + "learning_rate": 0.00011565204004437267, + "loss": 1.0969, + "step": 9981 + }, + { + "epoch": 0.95, + "grad_norm": 0.33065813751668216, + "learning_rate": 0.00011563641533746774, + "loss": 1.0411, + "step": 9982 + }, + { + "epoch": 0.96, + "grad_norm": 0.33089614226385505, + "learning_rate": 0.00011562079023925172, + "loss": 1.0129, + "step": 9983 + }, + { + "epoch": 0.96, + "grad_norm": 0.29760832729104647, + "learning_rate": 0.00011560516475011558, + "loss": 1.0865, + "step": 9984 + }, + { + "epoch": 0.96, + "grad_norm": 0.2982860380062189, + "learning_rate": 0.00011558953887045041, + "loss": 1.0879, + "step": 9985 + }, + { + "epoch": 0.96, + "grad_norm": 0.3089962049085526, + "learning_rate": 0.00011557391260064723, + "loss": 1.062, + "step": 9986 + }, + { + "epoch": 0.96, + "grad_norm": 0.2907036025220388, + "learning_rate": 0.00011555828594109707, + "loss": 1.0863, + "step": 9987 + }, + { + "epoch": 0.96, + "grad_norm": 0.31013815730431993, + "learning_rate": 0.00011554265889219106, + "loss": 1.0049, + "step": 9988 + }, + { + "epoch": 0.96, + "grad_norm": 0.29883777425014585, + "learning_rate": 0.00011552703145432025, + "loss": 1.0613, + "step": 9989 + }, + { + "epoch": 0.96, + "grad_norm": 0.29975907234168436, + "learning_rate": 0.0001155114036278757, + "loss": 1.0005, + "step": 9990 + }, + { + "epoch": 0.96, + "grad_norm": 0.30684771167447394, + "learning_rate": 0.0001154957754132485, + "loss": 1.1158, + "step": 9991 + }, + { + "epoch": 0.96, + "grad_norm": 0.26335079161415853, + "learning_rate": 0.00011548014681082981, + "loss": 1.021, + "step": 9992 + }, + { + "epoch": 0.96, + "grad_norm": 0.27648270125908575, + "learning_rate": 0.00011546451782101071, + "loss": 1.1223, + "step": 9993 + }, + { + "epoch": 0.96, + "grad_norm": 0.3475752040212275, + "learning_rate": 0.00011544888844418233, + "loss": 1.1233, + "step": 9994 + }, + { + "epoch": 0.96, + "grad_norm": 0.30111765274598085, + "learning_rate": 0.0001154332586807358, + "loss": 1.0963, + "step": 9995 + }, + { + "epoch": 0.96, + "grad_norm": 0.2970605582481724, + "learning_rate": 0.0001154176285310623, + "loss": 0.99, + "step": 9996 + }, + { + "epoch": 0.96, + "grad_norm": 0.305235597238056, + "learning_rate": 0.00011540199799555294, + "loss": 1.132, + "step": 9997 + }, + { + "epoch": 0.96, + "grad_norm": 0.2670713296870493, + "learning_rate": 0.00011538636707459889, + "loss": 1.0476, + "step": 9998 + }, + { + "epoch": 0.96, + "grad_norm": 0.2677367396231706, + "learning_rate": 0.00011537073576859136, + "loss": 1.069, + "step": 9999 + }, + { + "epoch": 0.96, + "grad_norm": 0.2525746570305202, + "learning_rate": 0.00011535510407792149, + "loss": 0.9385, + "step": 10000 + }, + { + "epoch": 0.96, + "grad_norm": 0.2758414503350628, + "learning_rate": 0.0001153394720029805, + "loss": 1.0631, + "step": 10001 + }, + { + "epoch": 0.96, + "grad_norm": 0.3047038095467878, + "learning_rate": 0.00011532383954415957, + "loss": 1.0311, + "step": 10002 + }, + { + "epoch": 0.96, + "grad_norm": 0.28111478399862794, + "learning_rate": 0.00011530820670184995, + "loss": 1.0573, + "step": 10003 + }, + { + "epoch": 0.96, + "grad_norm": 0.28036170451809095, + "learning_rate": 0.0001152925734764428, + "loss": 1.1742, + "step": 10004 + }, + { + "epoch": 0.96, + "grad_norm": 0.2963156258676252, + "learning_rate": 0.00011527693986832942, + "loss": 1.016, + "step": 10005 + }, + { + "epoch": 0.96, + "grad_norm": 0.2815524009206448, + "learning_rate": 0.000115261305877901, + "loss": 1.0306, + "step": 10006 + }, + { + "epoch": 0.96, + "grad_norm": 0.29474398852141115, + "learning_rate": 0.00011524567150554881, + "loss": 1.0829, + "step": 10007 + }, + { + "epoch": 0.96, + "grad_norm": 0.29358155011217263, + "learning_rate": 0.00011523003675166411, + "loss": 1.0714, + "step": 10008 + }, + { + "epoch": 0.96, + "grad_norm": 0.2844867269668011, + "learning_rate": 0.00011521440161663819, + "loss": 1.0348, + "step": 10009 + }, + { + "epoch": 0.96, + "grad_norm": 0.255280580277318, + "learning_rate": 0.00011519876610086229, + "loss": 1.1012, + "step": 10010 + }, + { + "epoch": 0.96, + "grad_norm": 0.31413925882198807, + "learning_rate": 0.00011518313020472768, + "loss": 1.0744, + "step": 10011 + }, + { + "epoch": 0.96, + "grad_norm": 0.31994016396189484, + "learning_rate": 0.00011516749392862576, + "loss": 0.9883, + "step": 10012 + }, + { + "epoch": 0.96, + "grad_norm": 0.27390552772942023, + "learning_rate": 0.00011515185727294771, + "loss": 0.9983, + "step": 10013 + }, + { + "epoch": 0.96, + "grad_norm": 0.2853534170678965, + "learning_rate": 0.00011513622023808495, + "loss": 1.0904, + "step": 10014 + }, + { + "epoch": 0.96, + "grad_norm": 0.25465697705481827, + "learning_rate": 0.00011512058282442874, + "loss": 0.922, + "step": 10015 + }, + { + "epoch": 0.96, + "grad_norm": 0.26802643575831, + "learning_rate": 0.00011510494503237046, + "loss": 1.0313, + "step": 10016 + }, + { + "epoch": 0.96, + "grad_norm": 0.3193729552335535, + "learning_rate": 0.00011508930686230146, + "loss": 0.9854, + "step": 10017 + }, + { + "epoch": 0.96, + "grad_norm": 0.2879367749732901, + "learning_rate": 0.00011507366831461302, + "loss": 1.1051, + "step": 10018 + }, + { + "epoch": 0.96, + "grad_norm": 0.30097969407853326, + "learning_rate": 0.0001150580293896966, + "loss": 1.0425, + "step": 10019 + }, + { + "epoch": 0.96, + "grad_norm": 0.30624544517915264, + "learning_rate": 0.0001150423900879435, + "loss": 1.0311, + "step": 10020 + }, + { + "epoch": 0.96, + "grad_norm": 0.2767941099814116, + "learning_rate": 0.00011502675040974516, + "loss": 0.9427, + "step": 10021 + }, + { + "epoch": 0.96, + "grad_norm": 0.28490801229190077, + "learning_rate": 0.00011501111035549295, + "loss": 1.1947, + "step": 10022 + }, + { + "epoch": 0.96, + "grad_norm": 0.2784277522206461, + "learning_rate": 0.00011499546992557826, + "loss": 0.9624, + "step": 10023 + }, + { + "epoch": 0.96, + "grad_norm": 0.2666766109775799, + "learning_rate": 0.00011497982912039249, + "loss": 0.9757, + "step": 10024 + }, + { + "epoch": 0.96, + "grad_norm": 0.32891260560151236, + "learning_rate": 0.00011496418794032711, + "loss": 1.1256, + "step": 10025 + }, + { + "epoch": 0.96, + "grad_norm": 0.3086769971151652, + "learning_rate": 0.0001149485463857735, + "loss": 0.9429, + "step": 10026 + }, + { + "epoch": 0.96, + "grad_norm": 0.25897740279910364, + "learning_rate": 0.00011493290445712315, + "loss": 0.8941, + "step": 10027 + }, + { + "epoch": 0.96, + "grad_norm": 0.27367523231686985, + "learning_rate": 0.00011491726215476746, + "loss": 0.961, + "step": 10028 + }, + { + "epoch": 0.96, + "grad_norm": 0.2902717009917645, + "learning_rate": 0.0001149016194790979, + "loss": 1.0828, + "step": 10029 + }, + { + "epoch": 0.96, + "grad_norm": 0.2767681614076228, + "learning_rate": 0.00011488597643050598, + "loss": 1.1453, + "step": 10030 + }, + { + "epoch": 0.96, + "grad_norm": 0.2857144403267616, + "learning_rate": 0.0001148703330093831, + "loss": 1.1737, + "step": 10031 + }, + { + "epoch": 0.96, + "grad_norm": 0.28150370452068746, + "learning_rate": 0.00011485468921612084, + "loss": 1.1734, + "step": 10032 + }, + { + "epoch": 0.96, + "grad_norm": 0.3290209532225155, + "learning_rate": 0.00011483904505111063, + "loss": 1.1331, + "step": 10033 + }, + { + "epoch": 0.96, + "grad_norm": 0.3138396953516879, + "learning_rate": 0.00011482340051474396, + "loss": 1.0148, + "step": 10034 + }, + { + "epoch": 0.96, + "grad_norm": 0.30110096644908, + "learning_rate": 0.00011480775560741239, + "loss": 1.0134, + "step": 10035 + }, + { + "epoch": 0.96, + "grad_norm": 0.3203655127160484, + "learning_rate": 0.00011479211032950743, + "loss": 1.0475, + "step": 10036 + }, + { + "epoch": 0.96, + "grad_norm": 0.2772142324547969, + "learning_rate": 0.00011477646468142062, + "loss": 1.0685, + "step": 10037 + }, + { + "epoch": 0.96, + "grad_norm": 0.2917042525821615, + "learning_rate": 0.0001147608186635435, + "loss": 1.1421, + "step": 10038 + }, + { + "epoch": 0.96, + "grad_norm": 0.26698256397886955, + "learning_rate": 0.00011474517227626762, + "loss": 1.1066, + "step": 10039 + }, + { + "epoch": 0.96, + "grad_norm": 0.2675194389510324, + "learning_rate": 0.00011472952551998452, + "loss": 0.9672, + "step": 10040 + }, + { + "epoch": 0.96, + "grad_norm": 0.2925787442554594, + "learning_rate": 0.0001147138783950858, + "loss": 1.0619, + "step": 10041 + }, + { + "epoch": 0.96, + "grad_norm": 0.2925652633706256, + "learning_rate": 0.00011469823090196303, + "loss": 1.067, + "step": 10042 + }, + { + "epoch": 0.96, + "grad_norm": 0.26146199790712404, + "learning_rate": 0.00011468258304100779, + "loss": 1.0669, + "step": 10043 + }, + { + "epoch": 0.96, + "grad_norm": 0.29387442287441945, + "learning_rate": 0.00011466693481261168, + "loss": 1.1529, + "step": 10044 + }, + { + "epoch": 0.96, + "grad_norm": 0.30566339364004746, + "learning_rate": 0.0001146512862171663, + "loss": 1.0711, + "step": 10045 + }, + { + "epoch": 0.96, + "grad_norm": 0.2623345882172563, + "learning_rate": 0.00011463563725506328, + "loss": 1.1218, + "step": 10046 + }, + { + "epoch": 0.96, + "grad_norm": 0.3090594432747929, + "learning_rate": 0.00011461998792669426, + "loss": 1.0833, + "step": 10047 + }, + { + "epoch": 0.96, + "grad_norm": 0.3168446002514091, + "learning_rate": 0.0001146043382324508, + "loss": 1.0041, + "step": 10048 + }, + { + "epoch": 0.96, + "grad_norm": 0.26118984316659555, + "learning_rate": 0.00011458868817272465, + "loss": 1.075, + "step": 10049 + }, + { + "epoch": 0.96, + "grad_norm": 0.3084194238337441, + "learning_rate": 0.0001145730377479074, + "loss": 0.9865, + "step": 10050 + }, + { + "epoch": 0.96, + "grad_norm": 0.2916812390686762, + "learning_rate": 0.00011455738695839071, + "loss": 1.1019, + "step": 10051 + }, + { + "epoch": 0.96, + "grad_norm": 0.2853685064498807, + "learning_rate": 0.00011454173580456627, + "loss": 1.1053, + "step": 10052 + }, + { + "epoch": 0.96, + "grad_norm": 0.28381729559022867, + "learning_rate": 0.00011452608428682574, + "loss": 1.0872, + "step": 10053 + }, + { + "epoch": 0.96, + "grad_norm": 0.35555285136726095, + "learning_rate": 0.0001145104324055608, + "loss": 1.0321, + "step": 10054 + }, + { + "epoch": 0.96, + "grad_norm": 0.29725262158547283, + "learning_rate": 0.00011449478016116322, + "loss": 1.0607, + "step": 10055 + }, + { + "epoch": 0.96, + "grad_norm": 0.32988647361187395, + "learning_rate": 0.00011447912755402463, + "loss": 1.0194, + "step": 10056 + }, + { + "epoch": 0.96, + "grad_norm": 0.2986055841309832, + "learning_rate": 0.00011446347458453677, + "loss": 1.0673, + "step": 10057 + }, + { + "epoch": 0.96, + "grad_norm": 0.2696942404700563, + "learning_rate": 0.00011444782125309137, + "loss": 1.0442, + "step": 10058 + }, + { + "epoch": 0.96, + "grad_norm": 0.3064172789572965, + "learning_rate": 0.00011443216756008017, + "loss": 0.9705, + "step": 10059 + }, + { + "epoch": 0.96, + "grad_norm": 0.29986340116379595, + "learning_rate": 0.00011441651350589493, + "loss": 1.042, + "step": 10060 + }, + { + "epoch": 0.96, + "grad_norm": 0.29196996307249695, + "learning_rate": 0.00011440085909092735, + "loss": 1.1514, + "step": 10061 + }, + { + "epoch": 0.96, + "grad_norm": 0.26305531388271913, + "learning_rate": 0.00011438520431556923, + "loss": 1.0125, + "step": 10062 + }, + { + "epoch": 0.96, + "grad_norm": 0.28626327759234815, + "learning_rate": 0.00011436954918021232, + "loss": 1.1407, + "step": 10063 + }, + { + "epoch": 0.96, + "grad_norm": 0.27422258071461836, + "learning_rate": 0.00011435389368524842, + "loss": 1.055, + "step": 10064 + }, + { + "epoch": 0.96, + "grad_norm": 0.29139299897756993, + "learning_rate": 0.0001143382378310693, + "loss": 1.015, + "step": 10065 + }, + { + "epoch": 0.96, + "grad_norm": 0.2590303991609533, + "learning_rate": 0.0001143225816180668, + "loss": 0.9926, + "step": 10066 + }, + { + "epoch": 0.96, + "grad_norm": 0.28910079655703913, + "learning_rate": 0.00011430692504663265, + "loss": 0.9727, + "step": 10067 + }, + { + "epoch": 0.96, + "grad_norm": 0.3312654664415843, + "learning_rate": 0.00011429126811715872, + "loss": 1.101, + "step": 10068 + }, + { + "epoch": 0.96, + "grad_norm": 0.30000869026615357, + "learning_rate": 0.00011427561083003683, + "loss": 1.0738, + "step": 10069 + }, + { + "epoch": 0.96, + "grad_norm": 0.28159858071588617, + "learning_rate": 0.00011425995318565883, + "loss": 1.0995, + "step": 10070 + }, + { + "epoch": 0.96, + "grad_norm": 0.3123705444016819, + "learning_rate": 0.00011424429518441653, + "loss": 0.9907, + "step": 10071 + }, + { + "epoch": 0.96, + "grad_norm": 0.25560829274875024, + "learning_rate": 0.00011422863682670176, + "loss": 1.0387, + "step": 10072 + }, + { + "epoch": 0.96, + "grad_norm": 0.3047604879614061, + "learning_rate": 0.00011421297811290643, + "loss": 1.0803, + "step": 10073 + }, + { + "epoch": 0.96, + "grad_norm": 0.27464921456414265, + "learning_rate": 0.0001141973190434224, + "loss": 1.1457, + "step": 10074 + }, + { + "epoch": 0.96, + "grad_norm": 0.3015661166736289, + "learning_rate": 0.00011418165961864151, + "loss": 0.9435, + "step": 10075 + }, + { + "epoch": 0.96, + "grad_norm": 0.3288482467287445, + "learning_rate": 0.0001141659998389557, + "loss": 1.0268, + "step": 10076 + }, + { + "epoch": 0.96, + "grad_norm": 0.2880879802413768, + "learning_rate": 0.00011415033970475682, + "loss": 1.1155, + "step": 10077 + }, + { + "epoch": 0.96, + "grad_norm": 0.3046710592105044, + "learning_rate": 0.00011413467921643681, + "loss": 1.0854, + "step": 10078 + }, + { + "epoch": 0.96, + "grad_norm": 0.23189332771051346, + "learning_rate": 0.00011411901837438757, + "loss": 0.8959, + "step": 10079 + }, + { + "epoch": 0.96, + "grad_norm": 0.2860860545736034, + "learning_rate": 0.00011410335717900102, + "loss": 1.004, + "step": 10080 + }, + { + "epoch": 0.96, + "grad_norm": 0.29819011560467884, + "learning_rate": 0.0001140876956306691, + "loss": 1.0271, + "step": 10081 + }, + { + "epoch": 0.96, + "grad_norm": 0.2961548882133075, + "learning_rate": 0.00011407203372978372, + "loss": 1.0964, + "step": 10082 + }, + { + "epoch": 0.96, + "grad_norm": 0.27643172524881576, + "learning_rate": 0.00011405637147673688, + "loss": 1.0848, + "step": 10083 + }, + { + "epoch": 0.96, + "grad_norm": 0.26601937387043634, + "learning_rate": 0.00011404070887192051, + "loss": 1.0771, + "step": 10084 + }, + { + "epoch": 0.96, + "grad_norm": 0.29423767837815973, + "learning_rate": 0.00011402504591572656, + "loss": 1.1087, + "step": 10085 + }, + { + "epoch": 0.96, + "grad_norm": 0.31957243246751704, + "learning_rate": 0.00011400938260854703, + "loss": 1.1154, + "step": 10086 + }, + { + "epoch": 0.97, + "grad_norm": 0.34871076842626053, + "learning_rate": 0.00011399371895077389, + "loss": 1.0691, + "step": 10087 + }, + { + "epoch": 0.97, + "grad_norm": 0.30378790346074774, + "learning_rate": 0.00011397805494279916, + "loss": 1.096, + "step": 10088 + }, + { + "epoch": 0.97, + "grad_norm": 0.25772589878682645, + "learning_rate": 0.00011396239058501476, + "loss": 1.0342, + "step": 10089 + }, + { + "epoch": 0.97, + "grad_norm": 0.2823080995120186, + "learning_rate": 0.00011394672587781284, + "loss": 1.1017, + "step": 10090 + }, + { + "epoch": 0.97, + "grad_norm": 0.312629120636968, + "learning_rate": 0.0001139310608215853, + "loss": 1.0579, + "step": 10091 + }, + { + "epoch": 0.97, + "grad_norm": 0.3054424461560633, + "learning_rate": 0.00011391539541672418, + "loss": 1.0553, + "step": 10092 + }, + { + "epoch": 0.97, + "grad_norm": 0.28281060096224714, + "learning_rate": 0.00011389972966362159, + "loss": 1.0612, + "step": 10093 + }, + { + "epoch": 0.97, + "grad_norm": 0.27166297516714194, + "learning_rate": 0.00011388406356266951, + "loss": 1.0524, + "step": 10094 + }, + { + "epoch": 0.97, + "grad_norm": 0.31743027731547635, + "learning_rate": 0.00011386839711426003, + "loss": 1.0024, + "step": 10095 + }, + { + "epoch": 0.97, + "grad_norm": 0.32534625159202174, + "learning_rate": 0.00011385273031878516, + "loss": 1.0885, + "step": 10096 + }, + { + "epoch": 0.97, + "grad_norm": 0.30049574798069184, + "learning_rate": 0.00011383706317663705, + "loss": 1.0135, + "step": 10097 + }, + { + "epoch": 0.97, + "grad_norm": 0.3266379229812977, + "learning_rate": 0.00011382139568820771, + "loss": 1.0434, + "step": 10098 + }, + { + "epoch": 0.97, + "grad_norm": 0.3039958500677989, + "learning_rate": 0.00011380572785388923, + "loss": 1.1338, + "step": 10099 + }, + { + "epoch": 0.97, + "grad_norm": 0.2820873927102382, + "learning_rate": 0.0001137900596740738, + "loss": 0.9901, + "step": 10100 + }, + { + "epoch": 0.97, + "grad_norm": 0.25895906852181655, + "learning_rate": 0.00011377439114915343, + "loss": 1.0197, + "step": 10101 + }, + { + "epoch": 0.97, + "grad_norm": 0.2736675525144609, + "learning_rate": 0.00011375872227952024, + "loss": 1.0666, + "step": 10102 + }, + { + "epoch": 0.97, + "grad_norm": 0.27812994654877704, + "learning_rate": 0.00011374305306556641, + "loss": 0.9667, + "step": 10103 + }, + { + "epoch": 0.97, + "grad_norm": 0.24944483327099637, + "learning_rate": 0.00011372738350768404, + "loss": 1.1377, + "step": 10104 + }, + { + "epoch": 0.97, + "grad_norm": 0.3052298238193706, + "learning_rate": 0.00011371171360626528, + "loss": 1.0432, + "step": 10105 + }, + { + "epoch": 0.97, + "grad_norm": 0.27306031709563633, + "learning_rate": 0.00011369604336170221, + "loss": 1.0682, + "step": 10106 + }, + { + "epoch": 0.97, + "grad_norm": 0.3167325272734246, + "learning_rate": 0.0001136803727743871, + "loss": 1.1381, + "step": 10107 + }, + { + "epoch": 0.97, + "grad_norm": 0.3310696902768376, + "learning_rate": 0.00011366470184471206, + "loss": 0.9888, + "step": 10108 + }, + { + "epoch": 0.97, + "grad_norm": 0.2920470605038157, + "learning_rate": 0.00011364903057306923, + "loss": 1.0723, + "step": 10109 + }, + { + "epoch": 0.97, + "grad_norm": 0.28721162596130884, + "learning_rate": 0.00011363335895985087, + "loss": 1.0795, + "step": 10110 + }, + { + "epoch": 0.97, + "grad_norm": 0.3123155431147055, + "learning_rate": 0.00011361768700544915, + "loss": 1.0195, + "step": 10111 + }, + { + "epoch": 0.97, + "grad_norm": 0.31432924829965664, + "learning_rate": 0.00011360201471025625, + "loss": 1.0262, + "step": 10112 + }, + { + "epoch": 0.97, + "grad_norm": 0.30371743385151373, + "learning_rate": 0.00011358634207466434, + "loss": 1.1198, + "step": 10113 + }, + { + "epoch": 0.97, + "grad_norm": 0.30399866249564966, + "learning_rate": 0.0001135706690990657, + "loss": 1.0732, + "step": 10114 + }, + { + "epoch": 0.97, + "grad_norm": 0.32184020587696033, + "learning_rate": 0.00011355499578385256, + "loss": 1.1721, + "step": 10115 + }, + { + "epoch": 0.97, + "grad_norm": 0.30947844684370757, + "learning_rate": 0.00011353932212941709, + "loss": 0.9335, + "step": 10116 + }, + { + "epoch": 0.97, + "grad_norm": 0.30689326685860674, + "learning_rate": 0.00011352364813615159, + "loss": 1.154, + "step": 10117 + }, + { + "epoch": 0.97, + "grad_norm": 0.35179570677545263, + "learning_rate": 0.0001135079738044483, + "loss": 1.1623, + "step": 10118 + }, + { + "epoch": 0.97, + "grad_norm": 0.2895264616854555, + "learning_rate": 0.00011349229913469948, + "loss": 1.1446, + "step": 10119 + }, + { + "epoch": 0.97, + "grad_norm": 0.31453418960821145, + "learning_rate": 0.00011347662412729738, + "loss": 1.1001, + "step": 10120 + }, + { + "epoch": 0.97, + "grad_norm": 0.2761763344790832, + "learning_rate": 0.00011346094878263431, + "loss": 0.9588, + "step": 10121 + }, + { + "epoch": 0.97, + "grad_norm": 0.30740807652927366, + "learning_rate": 0.00011344527310110256, + "loss": 1.0145, + "step": 10122 + }, + { + "epoch": 0.97, + "grad_norm": 0.2852359412921041, + "learning_rate": 0.00011342959708309435, + "loss": 0.9789, + "step": 10123 + }, + { + "epoch": 0.97, + "grad_norm": 0.2870738764698151, + "learning_rate": 0.00011341392072900205, + "loss": 1.0349, + "step": 10124 + }, + { + "epoch": 0.97, + "grad_norm": 0.3079179973519736, + "learning_rate": 0.00011339824403921797, + "loss": 1.0653, + "step": 10125 + }, + { + "epoch": 0.97, + "grad_norm": 0.256710403425938, + "learning_rate": 0.0001133825670141344, + "loss": 0.9875, + "step": 10126 + }, + { + "epoch": 0.97, + "grad_norm": 0.3171962319601306, + "learning_rate": 0.00011336688965414369, + "loss": 1.1135, + "step": 10127 + }, + { + "epoch": 0.97, + "grad_norm": 0.2964396038679801, + "learning_rate": 0.00011335121195963813, + "loss": 1.0385, + "step": 10128 + }, + { + "epoch": 0.97, + "grad_norm": 0.2750463815164073, + "learning_rate": 0.00011333553393101013, + "loss": 0.9907, + "step": 10129 + }, + { + "epoch": 0.97, + "grad_norm": 0.3245742501930826, + "learning_rate": 0.00011331985556865201, + "loss": 1.0089, + "step": 10130 + }, + { + "epoch": 0.97, + "grad_norm": 0.2976235078728115, + "learning_rate": 0.00011330417687295614, + "loss": 1.0739, + "step": 10131 + }, + { + "epoch": 0.97, + "grad_norm": 0.30953794415640545, + "learning_rate": 0.00011328849784431488, + "loss": 1.1504, + "step": 10132 + }, + { + "epoch": 0.97, + "grad_norm": 0.2779345999485772, + "learning_rate": 0.00011327281848312059, + "loss": 1.1356, + "step": 10133 + }, + { + "epoch": 0.97, + "grad_norm": 0.2802541009558856, + "learning_rate": 0.0001132571387897657, + "loss": 1.0917, + "step": 10134 + }, + { + "epoch": 0.97, + "grad_norm": 0.27849384342581196, + "learning_rate": 0.00011324145876464259, + "loss": 1.0923, + "step": 10135 + }, + { + "epoch": 0.97, + "grad_norm": 0.3318350576277318, + "learning_rate": 0.00011322577840814361, + "loss": 0.9506, + "step": 10136 + }, + { + "epoch": 0.97, + "grad_norm": 0.3459815664705523, + "learning_rate": 0.00011321009772066124, + "loss": 1.0129, + "step": 10137 + }, + { + "epoch": 0.97, + "grad_norm": 0.3037533943998662, + "learning_rate": 0.00011319441670258788, + "loss": 1.0418, + "step": 10138 + }, + { + "epoch": 0.97, + "grad_norm": 0.2520746414367993, + "learning_rate": 0.00011317873535431591, + "loss": 1.0216, + "step": 10139 + }, + { + "epoch": 0.97, + "grad_norm": 0.2661292266112255, + "learning_rate": 0.00011316305367623785, + "loss": 1.1706, + "step": 10140 + }, + { + "epoch": 0.97, + "grad_norm": 0.3063821264146261, + "learning_rate": 0.00011314737166874607, + "loss": 1.0417, + "step": 10141 + }, + { + "epoch": 0.97, + "grad_norm": 0.26323311620303635, + "learning_rate": 0.00011313168933223306, + "loss": 1.0204, + "step": 10142 + }, + { + "epoch": 0.97, + "grad_norm": 0.27238653921733935, + "learning_rate": 0.00011311600666709126, + "loss": 1.0871, + "step": 10143 + }, + { + "epoch": 0.97, + "grad_norm": 0.3013746324023429, + "learning_rate": 0.00011310032367371317, + "loss": 1.0356, + "step": 10144 + }, + { + "epoch": 0.97, + "grad_norm": 0.2574291995601369, + "learning_rate": 0.00011308464035249125, + "loss": 1.0812, + "step": 10145 + }, + { + "epoch": 0.97, + "grad_norm": 0.2869306171965447, + "learning_rate": 0.00011306895670381797, + "loss": 1.0173, + "step": 10146 + }, + { + "epoch": 0.97, + "grad_norm": 0.23922368061607324, + "learning_rate": 0.00011305327272808583, + "loss": 1.0289, + "step": 10147 + }, + { + "epoch": 0.97, + "grad_norm": 0.25159221628276673, + "learning_rate": 0.00011303758842568735, + "loss": 0.999, + "step": 10148 + }, + { + "epoch": 0.97, + "grad_norm": 0.2868191888678923, + "learning_rate": 0.00011302190379701503, + "loss": 1.0263, + "step": 10149 + }, + { + "epoch": 0.97, + "grad_norm": 0.361397853429607, + "learning_rate": 0.00011300621884246136, + "loss": 1.1683, + "step": 10150 + }, + { + "epoch": 0.97, + "grad_norm": 0.2826280133952878, + "learning_rate": 0.00011299053356241891, + "loss": 1.0963, + "step": 10151 + }, + { + "epoch": 0.97, + "grad_norm": 0.28972955414613405, + "learning_rate": 0.00011297484795728019, + "loss": 1.0244, + "step": 10152 + }, + { + "epoch": 0.97, + "grad_norm": 0.3320877897903177, + "learning_rate": 0.00011295916202743773, + "loss": 1.0985, + "step": 10153 + }, + { + "epoch": 0.97, + "grad_norm": 0.2928253376533602, + "learning_rate": 0.00011294347577328412, + "loss": 1.0557, + "step": 10154 + }, + { + "epoch": 0.97, + "grad_norm": 0.33688760633249853, + "learning_rate": 0.00011292778919521189, + "loss": 1.149, + "step": 10155 + }, + { + "epoch": 0.97, + "grad_norm": 0.27676103323848317, + "learning_rate": 0.00011291210229361362, + "loss": 1.1117, + "step": 10156 + }, + { + "epoch": 0.97, + "grad_norm": 0.29026256750352997, + "learning_rate": 0.00011289641506888182, + "loss": 1.0704, + "step": 10157 + }, + { + "epoch": 0.97, + "grad_norm": 0.29334540208045223, + "learning_rate": 0.0001128807275214092, + "loss": 0.9988, + "step": 10158 + }, + { + "epoch": 0.97, + "grad_norm": 0.2818605763391759, + "learning_rate": 0.00011286503965158822, + "loss": 1.0227, + "step": 10159 + }, + { + "epoch": 0.97, + "grad_norm": 0.27801097090219445, + "learning_rate": 0.00011284935145981157, + "loss": 0.9915, + "step": 10160 + }, + { + "epoch": 0.97, + "grad_norm": 0.2805850444759864, + "learning_rate": 0.0001128336629464718, + "loss": 1.0897, + "step": 10161 + }, + { + "epoch": 0.97, + "grad_norm": 0.3032491771748229, + "learning_rate": 0.00011281797411196156, + "loss": 1.0666, + "step": 10162 + }, + { + "epoch": 0.97, + "grad_norm": 0.300948028153539, + "learning_rate": 0.00011280228495667346, + "loss": 1.1215, + "step": 10163 + }, + { + "epoch": 0.97, + "grad_norm": 0.32505655549938, + "learning_rate": 0.00011278659548100015, + "loss": 1.0754, + "step": 10164 + }, + { + "epoch": 0.97, + "grad_norm": 0.2830441409159092, + "learning_rate": 0.00011277090568533424, + "loss": 1.1184, + "step": 10165 + }, + { + "epoch": 0.97, + "grad_norm": 0.27235511959421765, + "learning_rate": 0.0001127552155700684, + "loss": 1.0373, + "step": 10166 + }, + { + "epoch": 0.97, + "grad_norm": 0.3112142074976755, + "learning_rate": 0.00011273952513559525, + "loss": 1.0028, + "step": 10167 + }, + { + "epoch": 0.97, + "grad_norm": 0.27216215210937883, + "learning_rate": 0.0001127238343823075, + "loss": 1.0513, + "step": 10168 + }, + { + "epoch": 0.97, + "grad_norm": 0.2953527746729821, + "learning_rate": 0.0001127081433105978, + "loss": 1.0803, + "step": 10169 + }, + { + "epoch": 0.97, + "grad_norm": 0.2897815242186968, + "learning_rate": 0.0001126924519208588, + "loss": 1.0248, + "step": 10170 + }, + { + "epoch": 0.97, + "grad_norm": 0.28105812749428083, + "learning_rate": 0.00011267676021348323, + "loss": 1.013, + "step": 10171 + }, + { + "epoch": 0.97, + "grad_norm": 0.30722992978249025, + "learning_rate": 0.00011266106818886377, + "loss": 1.0101, + "step": 10172 + }, + { + "epoch": 0.97, + "grad_norm": 0.3033918428966205, + "learning_rate": 0.00011264537584739314, + "loss": 1.0618, + "step": 10173 + }, + { + "epoch": 0.97, + "grad_norm": 0.28043324409698095, + "learning_rate": 0.00011262968318946398, + "loss": 0.9475, + "step": 10174 + }, + { + "epoch": 0.97, + "grad_norm": 0.2609838079910845, + "learning_rate": 0.00011261399021546912, + "loss": 0.9683, + "step": 10175 + }, + { + "epoch": 0.97, + "grad_norm": 0.3252941464289039, + "learning_rate": 0.00011259829692580119, + "loss": 1.0946, + "step": 10176 + }, + { + "epoch": 0.97, + "grad_norm": 0.3102405322061509, + "learning_rate": 0.00011258260332085298, + "loss": 1.0548, + "step": 10177 + }, + { + "epoch": 0.97, + "grad_norm": 0.27928723327298144, + "learning_rate": 0.0001125669094010172, + "loss": 1.0014, + "step": 10178 + }, + { + "epoch": 0.97, + "grad_norm": 0.3043342368498864, + "learning_rate": 0.00011255121516668663, + "loss": 1.0738, + "step": 10179 + }, + { + "epoch": 0.97, + "grad_norm": 0.28720036803685933, + "learning_rate": 0.00011253552061825398, + "loss": 1.0355, + "step": 10180 + }, + { + "epoch": 0.97, + "grad_norm": 0.284815493784025, + "learning_rate": 0.00011251982575611209, + "loss": 1.0894, + "step": 10181 + }, + { + "epoch": 0.97, + "grad_norm": 0.2556934414454461, + "learning_rate": 0.00011250413058065365, + "loss": 0.9576, + "step": 10182 + }, + { + "epoch": 0.97, + "grad_norm": 0.26083285136088175, + "learning_rate": 0.00011248843509227152, + "loss": 0.945, + "step": 10183 + }, + { + "epoch": 0.97, + "grad_norm": 0.28387550518261473, + "learning_rate": 0.00011247273929135841, + "loss": 1.1312, + "step": 10184 + }, + { + "epoch": 0.97, + "grad_norm": 0.33228217976195246, + "learning_rate": 0.00011245704317830721, + "loss": 1.0897, + "step": 10185 + }, + { + "epoch": 0.97, + "grad_norm": 0.31237530100881616, + "learning_rate": 0.00011244134675351066, + "loss": 1.1166, + "step": 10186 + }, + { + "epoch": 0.97, + "grad_norm": 0.30031596382794623, + "learning_rate": 0.00011242565001736159, + "loss": 1.0411, + "step": 10187 + }, + { + "epoch": 0.97, + "grad_norm": 0.3052672362577054, + "learning_rate": 0.00011240995297025281, + "loss": 0.9902, + "step": 10188 + }, + { + "epoch": 0.97, + "grad_norm": 0.306885450267344, + "learning_rate": 0.00011239425561257717, + "loss": 1.0137, + "step": 10189 + }, + { + "epoch": 0.97, + "grad_norm": 0.29453938744454133, + "learning_rate": 0.00011237855794472748, + "loss": 1.0604, + "step": 10190 + }, + { + "epoch": 0.97, + "grad_norm": 0.24569112604498705, + "learning_rate": 0.00011236285996709659, + "loss": 1.128, + "step": 10191 + }, + { + "epoch": 0.98, + "grad_norm": 0.30552330478731954, + "learning_rate": 0.00011234716168007737, + "loss": 1.0256, + "step": 10192 + }, + { + "epoch": 0.98, + "grad_norm": 0.29514963250865506, + "learning_rate": 0.00011233146308406268, + "loss": 1.1758, + "step": 10193 + }, + { + "epoch": 0.98, + "grad_norm": 0.2895811885576239, + "learning_rate": 0.00011231576417944536, + "loss": 1.0462, + "step": 10194 + }, + { + "epoch": 0.98, + "grad_norm": 0.2674722074074362, + "learning_rate": 0.00011230006496661831, + "loss": 1.2296, + "step": 10195 + }, + { + "epoch": 0.98, + "grad_norm": 0.31631409210970807, + "learning_rate": 0.00011228436544597442, + "loss": 1.0523, + "step": 10196 + }, + { + "epoch": 0.98, + "grad_norm": 0.2383251012221512, + "learning_rate": 0.00011226866561790653, + "loss": 0.9241, + "step": 10197 + }, + { + "epoch": 0.98, + "grad_norm": 0.3245079733703714, + "learning_rate": 0.00011225296548280759, + "loss": 1.0727, + "step": 10198 + }, + { + "epoch": 0.98, + "grad_norm": 0.28495612155493694, + "learning_rate": 0.0001122372650410705, + "loss": 0.9841, + "step": 10199 + }, + { + "epoch": 0.98, + "grad_norm": 0.30013506380723926, + "learning_rate": 0.00011222156429308812, + "loss": 0.9932, + "step": 10200 + }, + { + "epoch": 0.98, + "grad_norm": 0.2786075519347718, + "learning_rate": 0.00011220586323925346, + "loss": 1.0858, + "step": 10201 + }, + { + "epoch": 0.98, + "grad_norm": 0.3138110381506769, + "learning_rate": 0.00011219016187995937, + "loss": 1.1049, + "step": 10202 + }, + { + "epoch": 0.98, + "grad_norm": 0.27581534819480813, + "learning_rate": 0.00011217446021559883, + "loss": 0.9373, + "step": 10203 + }, + { + "epoch": 0.98, + "grad_norm": 0.3292494819868315, + "learning_rate": 0.00011215875824656477, + "loss": 0.9851, + "step": 10204 + }, + { + "epoch": 0.98, + "grad_norm": 0.2655547214014843, + "learning_rate": 0.00011214305597325015, + "loss": 0.9125, + "step": 10205 + }, + { + "epoch": 0.98, + "grad_norm": 0.2747966640064946, + "learning_rate": 0.00011212735339604792, + "loss": 0.9829, + "step": 10206 + }, + { + "epoch": 0.98, + "grad_norm": 0.26757047246677396, + "learning_rate": 0.00011211165051535104, + "loss": 0.9451, + "step": 10207 + }, + { + "epoch": 0.98, + "grad_norm": 0.27347785909550587, + "learning_rate": 0.00011209594733155251, + "loss": 1.0511, + "step": 10208 + }, + { + "epoch": 0.98, + "grad_norm": 0.250251681777428, + "learning_rate": 0.00011208024384504527, + "loss": 1.0769, + "step": 10209 + }, + { + "epoch": 0.98, + "grad_norm": 0.2977263384929927, + "learning_rate": 0.00011206454005622237, + "loss": 1.0488, + "step": 10210 + }, + { + "epoch": 0.98, + "grad_norm": 0.26128674737229673, + "learning_rate": 0.00011204883596547676, + "loss": 0.9721, + "step": 10211 + }, + { + "epoch": 0.98, + "grad_norm": 0.2864777874901107, + "learning_rate": 0.00011203313157320146, + "loss": 0.9903, + "step": 10212 + }, + { + "epoch": 0.98, + "grad_norm": 0.2930793218779365, + "learning_rate": 0.00011201742687978946, + "loss": 1.0835, + "step": 10213 + }, + { + "epoch": 0.98, + "grad_norm": 0.2726110420231643, + "learning_rate": 0.0001120017218856338, + "loss": 1.0418, + "step": 10214 + }, + { + "epoch": 0.98, + "grad_norm": 0.26500863939608205, + "learning_rate": 0.00011198601659112753, + "loss": 0.9941, + "step": 10215 + }, + { + "epoch": 0.98, + "grad_norm": 0.35922537571617147, + "learning_rate": 0.00011197031099666366, + "loss": 1.1, + "step": 10216 + }, + { + "epoch": 0.98, + "grad_norm": 0.26403707584086666, + "learning_rate": 0.00011195460510263523, + "loss": 1.0841, + "step": 10217 + }, + { + "epoch": 0.98, + "grad_norm": 0.30153285229911025, + "learning_rate": 0.00011193889890943528, + "loss": 0.9745, + "step": 10218 + }, + { + "epoch": 0.98, + "grad_norm": 0.27058298329981373, + "learning_rate": 0.0001119231924174569, + "loss": 1.0014, + "step": 10219 + }, + { + "epoch": 0.98, + "grad_norm": 0.2890597865454572, + "learning_rate": 0.00011190748562709314, + "loss": 1.1932, + "step": 10220 + }, + { + "epoch": 0.98, + "grad_norm": 0.2559758601582075, + "learning_rate": 0.00011189177853873705, + "loss": 1.035, + "step": 10221 + }, + { + "epoch": 0.98, + "grad_norm": 0.2675198324495206, + "learning_rate": 0.00011187607115278173, + "loss": 1.0852, + "step": 10222 + }, + { + "epoch": 0.98, + "grad_norm": 0.2984560752244984, + "learning_rate": 0.00011186036346962025, + "loss": 1.0197, + "step": 10223 + }, + { + "epoch": 0.98, + "grad_norm": 0.2746333508360344, + "learning_rate": 0.00011184465548964575, + "loss": 1.1096, + "step": 10224 + }, + { + "epoch": 0.98, + "grad_norm": 0.2911661232884761, + "learning_rate": 0.00011182894721325128, + "loss": 1.0876, + "step": 10225 + }, + { + "epoch": 0.98, + "grad_norm": 0.2545901819238632, + "learning_rate": 0.00011181323864082999, + "loss": 1.008, + "step": 10226 + }, + { + "epoch": 0.98, + "grad_norm": 0.27438906716939765, + "learning_rate": 0.00011179752977277498, + "loss": 0.9276, + "step": 10227 + }, + { + "epoch": 0.98, + "grad_norm": 0.30116616901446114, + "learning_rate": 0.00011178182060947935, + "loss": 0.9543, + "step": 10228 + }, + { + "epoch": 0.98, + "grad_norm": 0.2752406915494225, + "learning_rate": 0.00011176611115133628, + "loss": 1.0355, + "step": 10229 + }, + { + "epoch": 0.98, + "grad_norm": 0.27687088339188193, + "learning_rate": 0.00011175040139873889, + "loss": 0.9992, + "step": 10230 + }, + { + "epoch": 0.98, + "grad_norm": 0.2650331819262323, + "learning_rate": 0.00011173469135208028, + "loss": 0.9954, + "step": 10231 + }, + { + "epoch": 0.98, + "grad_norm": 0.29487040103252415, + "learning_rate": 0.00011171898101175369, + "loss": 0.9526, + "step": 10232 + }, + { + "epoch": 0.98, + "grad_norm": 0.2784410390172419, + "learning_rate": 0.0001117032703781522, + "loss": 1.1314, + "step": 10233 + }, + { + "epoch": 0.98, + "grad_norm": 0.34321818169181995, + "learning_rate": 0.00011168755945166905, + "loss": 1.0451, + "step": 10234 + }, + { + "epoch": 0.98, + "grad_norm": 0.2934212795606079, + "learning_rate": 0.00011167184823269735, + "loss": 0.9916, + "step": 10235 + }, + { + "epoch": 0.98, + "grad_norm": 0.27720714282535736, + "learning_rate": 0.00011165613672163032, + "loss": 1.052, + "step": 10236 + }, + { + "epoch": 0.98, + "grad_norm": 0.2827387556355499, + "learning_rate": 0.00011164042491886115, + "loss": 1.0031, + "step": 10237 + }, + { + "epoch": 0.98, + "grad_norm": 0.3020361414751645, + "learning_rate": 0.00011162471282478299, + "loss": 1.1295, + "step": 10238 + }, + { + "epoch": 0.98, + "grad_norm": 0.2926727806203999, + "learning_rate": 0.00011160900043978915, + "loss": 1.0201, + "step": 10239 + }, + { + "epoch": 0.98, + "grad_norm": 0.29327346670931664, + "learning_rate": 0.00011159328776427274, + "loss": 1.0045, + "step": 10240 + }, + { + "epoch": 0.98, + "grad_norm": 0.3197321134655832, + "learning_rate": 0.00011157757479862701, + "loss": 0.9832, + "step": 10241 + }, + { + "epoch": 0.98, + "grad_norm": 0.2834245023692689, + "learning_rate": 0.00011156186154324522, + "loss": 0.9674, + "step": 10242 + }, + { + "epoch": 0.98, + "grad_norm": 0.2925004499767554, + "learning_rate": 0.00011154614799852055, + "loss": 1.0646, + "step": 10243 + }, + { + "epoch": 0.98, + "grad_norm": 0.25673872599009095, + "learning_rate": 0.0001115304341648463, + "loss": 1.04, + "step": 10244 + }, + { + "epoch": 0.98, + "grad_norm": 0.2808493957291415, + "learning_rate": 0.00011151472004261565, + "loss": 1.0743, + "step": 10245 + }, + { + "epoch": 0.98, + "grad_norm": 0.2908844238513358, + "learning_rate": 0.00011149900563222193, + "loss": 1.0376, + "step": 10246 + }, + { + "epoch": 0.98, + "grad_norm": 0.2887693925794027, + "learning_rate": 0.00011148329093405836, + "loss": 1.018, + "step": 10247 + }, + { + "epoch": 0.98, + "grad_norm": 0.29234813333297677, + "learning_rate": 0.0001114675759485182, + "loss": 1.0377, + "step": 10248 + }, + { + "epoch": 0.98, + "grad_norm": 0.28841549577391123, + "learning_rate": 0.00011145186067599478, + "loss": 1.1507, + "step": 10249 + }, + { + "epoch": 0.98, + "grad_norm": 0.31377105859709736, + "learning_rate": 0.00011143614511688132, + "loss": 0.9281, + "step": 10250 + }, + { + "epoch": 0.98, + "grad_norm": 0.27520050582114036, + "learning_rate": 0.00011142042927157114, + "loss": 1.0736, + "step": 10251 + }, + { + "epoch": 0.98, + "grad_norm": 0.2836489124077, + "learning_rate": 0.00011140471314045755, + "loss": 1.1397, + "step": 10252 + }, + { + "epoch": 0.98, + "grad_norm": 0.3196462341558277, + "learning_rate": 0.00011138899672393386, + "loss": 1.1109, + "step": 10253 + }, + { + "epoch": 0.98, + "grad_norm": 0.2919022639490787, + "learning_rate": 0.00011137328002239335, + "loss": 1.0623, + "step": 10254 + }, + { + "epoch": 0.98, + "grad_norm": 0.2668993177956516, + "learning_rate": 0.00011135756303622937, + "loss": 0.9756, + "step": 10255 + }, + { + "epoch": 0.98, + "grad_norm": 0.29098900383647164, + "learning_rate": 0.00011134184576583525, + "loss": 0.981, + "step": 10256 + }, + { + "epoch": 0.98, + "grad_norm": 0.2741127564871344, + "learning_rate": 0.00011132612821160428, + "loss": 1.0042, + "step": 10257 + }, + { + "epoch": 0.98, + "grad_norm": 0.3179337291974993, + "learning_rate": 0.00011131041037392984, + "loss": 0.9453, + "step": 10258 + }, + { + "epoch": 0.98, + "grad_norm": 0.2903023541952086, + "learning_rate": 0.00011129469225320527, + "loss": 1.1199, + "step": 10259 + }, + { + "epoch": 0.98, + "grad_norm": 0.2762878923470942, + "learning_rate": 0.00011127897384982396, + "loss": 1.0547, + "step": 10260 + }, + { + "epoch": 0.98, + "grad_norm": 0.29633017173721615, + "learning_rate": 0.00011126325516417921, + "loss": 1.0946, + "step": 10261 + }, + { + "epoch": 0.98, + "grad_norm": 0.30069391214448493, + "learning_rate": 0.00011124753619666441, + "loss": 1.1047, + "step": 10262 + }, + { + "epoch": 0.98, + "grad_norm": 0.30197537398471846, + "learning_rate": 0.000111231816947673, + "loss": 1.0716, + "step": 10263 + }, + { + "epoch": 0.98, + "grad_norm": 0.22716259808522002, + "learning_rate": 0.00011121609741759824, + "loss": 1.1495, + "step": 10264 + }, + { + "epoch": 0.98, + "grad_norm": 0.33447762653923324, + "learning_rate": 0.00011120037760683364, + "loss": 1.0615, + "step": 10265 + }, + { + "epoch": 0.98, + "grad_norm": 0.2677195592128131, + "learning_rate": 0.00011118465751577254, + "loss": 1.0661, + "step": 10266 + }, + { + "epoch": 0.98, + "grad_norm": 0.25916514691347986, + "learning_rate": 0.00011116893714480836, + "loss": 1.0618, + "step": 10267 + }, + { + "epoch": 0.98, + "grad_norm": 0.2735392836561538, + "learning_rate": 0.0001111532164943345, + "loss": 1.1209, + "step": 10268 + }, + { + "epoch": 0.98, + "grad_norm": 0.306277634485208, + "learning_rate": 0.0001111374955647444, + "loss": 1.0481, + "step": 10269 + }, + { + "epoch": 0.98, + "grad_norm": 0.24999171813245338, + "learning_rate": 0.00011112177435643147, + "loss": 0.9234, + "step": 10270 + }, + { + "epoch": 0.98, + "grad_norm": 0.26942078573492634, + "learning_rate": 0.00011110605286978914, + "loss": 1.0864, + "step": 10271 + }, + { + "epoch": 0.98, + "grad_norm": 0.26245411051479745, + "learning_rate": 0.00011109033110521086, + "loss": 1.1104, + "step": 10272 + }, + { + "epoch": 0.98, + "grad_norm": 0.31836586363546904, + "learning_rate": 0.00011107460906309008, + "loss": 0.9678, + "step": 10273 + }, + { + "epoch": 0.98, + "grad_norm": 0.3164102794109734, + "learning_rate": 0.00011105888674382025, + "loss": 1.063, + "step": 10274 + }, + { + "epoch": 0.98, + "grad_norm": 0.3168927687531968, + "learning_rate": 0.0001110431641477948, + "loss": 1.0617, + "step": 10275 + }, + { + "epoch": 0.98, + "grad_norm": 0.2901612455077041, + "learning_rate": 0.00011102744127540728, + "loss": 1.0798, + "step": 10276 + }, + { + "epoch": 0.98, + "grad_norm": 0.2808843978397793, + "learning_rate": 0.0001110117181270511, + "loss": 1.083, + "step": 10277 + }, + { + "epoch": 0.98, + "grad_norm": 0.28149992419883424, + "learning_rate": 0.00011099599470311972, + "loss": 1.0127, + "step": 10278 + }, + { + "epoch": 0.98, + "grad_norm": 0.31449595403015373, + "learning_rate": 0.00011098027100400667, + "loss": 1.1207, + "step": 10279 + }, + { + "epoch": 0.98, + "grad_norm": 0.243827419121298, + "learning_rate": 0.00011096454703010546, + "loss": 0.9526, + "step": 10280 + }, + { + "epoch": 0.98, + "grad_norm": 0.27945107442928213, + "learning_rate": 0.00011094882278180956, + "loss": 0.9607, + "step": 10281 + }, + { + "epoch": 0.98, + "grad_norm": 0.27291199800336474, + "learning_rate": 0.00011093309825951245, + "loss": 1.0538, + "step": 10282 + }, + { + "epoch": 0.98, + "grad_norm": 0.30765998442794057, + "learning_rate": 0.00011091737346360773, + "loss": 1.1567, + "step": 10283 + }, + { + "epoch": 0.98, + "grad_norm": 0.28547060490679954, + "learning_rate": 0.00011090164839448887, + "loss": 0.9414, + "step": 10284 + }, + { + "epoch": 0.98, + "grad_norm": 0.28516651031906826, + "learning_rate": 0.00011088592305254939, + "loss": 1.1272, + "step": 10285 + }, + { + "epoch": 0.98, + "grad_norm": 0.28976084124588714, + "learning_rate": 0.00011087019743818288, + "loss": 1.067, + "step": 10286 + }, + { + "epoch": 0.98, + "grad_norm": 0.30715911284891306, + "learning_rate": 0.00011085447155178279, + "loss": 1.0638, + "step": 10287 + }, + { + "epoch": 0.98, + "grad_norm": 0.26407524082018785, + "learning_rate": 0.00011083874539374277, + "loss": 1.0787, + "step": 10288 + }, + { + "epoch": 0.98, + "grad_norm": 0.2694386589995852, + "learning_rate": 0.00011082301896445633, + "loss": 1.0979, + "step": 10289 + }, + { + "epoch": 0.98, + "grad_norm": 0.27217656363948856, + "learning_rate": 0.00011080729226431703, + "loss": 1.141, + "step": 10290 + }, + { + "epoch": 0.98, + "grad_norm": 0.2794911223689666, + "learning_rate": 0.00011079156529371846, + "loss": 1.049, + "step": 10291 + }, + { + "epoch": 0.98, + "grad_norm": 0.27195359614242276, + "learning_rate": 0.00011077583805305418, + "loss": 0.9813, + "step": 10292 + }, + { + "epoch": 0.98, + "grad_norm": 0.29038357957473354, + "learning_rate": 0.00011076011054271778, + "loss": 1.1996, + "step": 10293 + }, + { + "epoch": 0.98, + "grad_norm": 0.28199425499385034, + "learning_rate": 0.00011074438276310287, + "loss": 1.1444, + "step": 10294 + }, + { + "epoch": 0.98, + "grad_norm": 0.30433530645019496, + "learning_rate": 0.00011072865471460301, + "loss": 1.0179, + "step": 10295 + }, + { + "epoch": 0.99, + "grad_norm": 0.26891004746639385, + "learning_rate": 0.00011071292639761181, + "loss": 1.0334, + "step": 10296 + }, + { + "epoch": 0.99, + "grad_norm": 0.29402614889194606, + "learning_rate": 0.0001106971978125229, + "loss": 1.0908, + "step": 10297 + }, + { + "epoch": 0.99, + "grad_norm": 0.2559539105537088, + "learning_rate": 0.00011068146895972993, + "loss": 1.1747, + "step": 10298 + }, + { + "epoch": 0.99, + "grad_norm": 0.2845273763898639, + "learning_rate": 0.00011066573983962642, + "loss": 1.0114, + "step": 10299 + }, + { + "epoch": 0.99, + "grad_norm": 0.3025475977583579, + "learning_rate": 0.00011065001045260613, + "loss": 1.0435, + "step": 10300 + }, + { + "epoch": 0.99, + "grad_norm": 0.2747465345583122, + "learning_rate": 0.00011063428079906259, + "loss": 1.1137, + "step": 10301 + }, + { + "epoch": 0.99, + "grad_norm": 0.2990549222362699, + "learning_rate": 0.0001106185508793895, + "loss": 1.0761, + "step": 10302 + }, + { + "epoch": 0.99, + "grad_norm": 0.3060379979713365, + "learning_rate": 0.00011060282069398052, + "loss": 1.0701, + "step": 10303 + }, + { + "epoch": 0.99, + "grad_norm": 0.25935605894516695, + "learning_rate": 0.00011058709024322929, + "loss": 1.0854, + "step": 10304 + }, + { + "epoch": 0.99, + "grad_norm": 0.284568963561047, + "learning_rate": 0.00011057135952752943, + "loss": 0.9767, + "step": 10305 + }, + { + "epoch": 0.99, + "grad_norm": 0.268579961216263, + "learning_rate": 0.00011055562854727471, + "loss": 1.0459, + "step": 10306 + }, + { + "epoch": 0.99, + "grad_norm": 0.26453950568162055, + "learning_rate": 0.00011053989730285869, + "loss": 1.0633, + "step": 10307 + }, + { + "epoch": 0.99, + "grad_norm": 0.2840144675918696, + "learning_rate": 0.00011052416579467518, + "loss": 0.8992, + "step": 10308 + }, + { + "epoch": 0.99, + "grad_norm": 0.2526273337620793, + "learning_rate": 0.00011050843402311777, + "loss": 1.0391, + "step": 10309 + }, + { + "epoch": 0.99, + "grad_norm": 0.2907758381732847, + "learning_rate": 0.00011049270198858019, + "loss": 1.1091, + "step": 10310 + }, + { + "epoch": 0.99, + "grad_norm": 0.29330977370977224, + "learning_rate": 0.00011047696969145618, + "loss": 1.1539, + "step": 10311 + }, + { + "epoch": 0.99, + "grad_norm": 0.28027168248307766, + "learning_rate": 0.00011046123713213939, + "loss": 1.0466, + "step": 10312 + }, + { + "epoch": 0.99, + "grad_norm": 0.29433174543880003, + "learning_rate": 0.00011044550431102358, + "loss": 1.0868, + "step": 10313 + }, + { + "epoch": 0.99, + "grad_norm": 0.28747079837458267, + "learning_rate": 0.00011042977122850247, + "loss": 0.9917, + "step": 10314 + }, + { + "epoch": 0.99, + "grad_norm": 0.2841412064889096, + "learning_rate": 0.00011041403788496976, + "loss": 1.23, + "step": 10315 + }, + { + "epoch": 0.99, + "grad_norm": 0.26187375710340965, + "learning_rate": 0.0001103983042808192, + "loss": 1.0055, + "step": 10316 + }, + { + "epoch": 0.99, + "grad_norm": 0.32286275918136254, + "learning_rate": 0.00011038257041644455, + "loss": 0.993, + "step": 10317 + }, + { + "epoch": 0.99, + "grad_norm": 0.28197212224874324, + "learning_rate": 0.00011036683629223958, + "loss": 1.003, + "step": 10318 + }, + { + "epoch": 0.99, + "grad_norm": 0.25905993814852957, + "learning_rate": 0.00011035110190859796, + "loss": 1.0416, + "step": 10319 + }, + { + "epoch": 0.99, + "grad_norm": 0.2894375367829976, + "learning_rate": 0.00011033536726591356, + "loss": 1.0795, + "step": 10320 + }, + { + "epoch": 0.99, + "grad_norm": 0.32587938767421387, + "learning_rate": 0.00011031963236458008, + "loss": 1.0801, + "step": 10321 + }, + { + "epoch": 0.99, + "grad_norm": 0.2678020975589778, + "learning_rate": 0.00011030389720499132, + "loss": 1.1145, + "step": 10322 + }, + { + "epoch": 0.99, + "grad_norm": 0.30583316214893913, + "learning_rate": 0.00011028816178754104, + "loss": 1.0778, + "step": 10323 + }, + { + "epoch": 0.99, + "grad_norm": 0.32483933693639894, + "learning_rate": 0.00011027242611262306, + "loss": 1.0419, + "step": 10324 + }, + { + "epoch": 0.99, + "grad_norm": 0.31586042610524007, + "learning_rate": 0.00011025669018063116, + "loss": 1.0958, + "step": 10325 + }, + { + "epoch": 0.99, + "grad_norm": 0.2878776556420424, + "learning_rate": 0.00011024095399195913, + "loss": 1.0656, + "step": 10326 + }, + { + "epoch": 0.99, + "grad_norm": 0.2830198312625606, + "learning_rate": 0.0001102252175470008, + "loss": 1.0493, + "step": 10327 + }, + { + "epoch": 0.99, + "grad_norm": 0.2803658012096297, + "learning_rate": 0.00011020948084614995, + "loss": 0.9544, + "step": 10328 + }, + { + "epoch": 0.99, + "grad_norm": 0.3160224185691885, + "learning_rate": 0.00011019374388980046, + "loss": 0.9955, + "step": 10329 + }, + { + "epoch": 0.99, + "grad_norm": 0.28845814066617365, + "learning_rate": 0.0001101780066783461, + "loss": 1.1675, + "step": 10330 + }, + { + "epoch": 0.99, + "grad_norm": 0.2892793784279133, + "learning_rate": 0.00011016226921218074, + "loss": 0.9273, + "step": 10331 + }, + { + "epoch": 0.99, + "grad_norm": 0.27909437771191264, + "learning_rate": 0.0001101465314916982, + "loss": 1.1146, + "step": 10332 + }, + { + "epoch": 0.99, + "grad_norm": 0.3021556255362156, + "learning_rate": 0.00011013079351729232, + "loss": 1.0174, + "step": 10333 + }, + { + "epoch": 0.99, + "grad_norm": 0.3088606738990712, + "learning_rate": 0.000110115055289357, + "loss": 1.0696, + "step": 10334 + }, + { + "epoch": 0.99, + "grad_norm": 0.2532028338885159, + "learning_rate": 0.00011009931680828604, + "loss": 1.0509, + "step": 10335 + }, + { + "epoch": 0.99, + "grad_norm": 0.2644967377127452, + "learning_rate": 0.00011008357807447334, + "loss": 1.011, + "step": 10336 + }, + { + "epoch": 0.99, + "grad_norm": 0.25911644503390924, + "learning_rate": 0.00011006783908831275, + "loss": 0.9568, + "step": 10337 + }, + { + "epoch": 0.99, + "grad_norm": 0.3093805850346706, + "learning_rate": 0.00011005209985019817, + "loss": 1.0442, + "step": 10338 + }, + { + "epoch": 0.99, + "grad_norm": 0.26605780506457216, + "learning_rate": 0.00011003636036052347, + "loss": 1.12, + "step": 10339 + }, + { + "epoch": 0.99, + "grad_norm": 0.2738908065467255, + "learning_rate": 0.00011002062061968255, + "loss": 1.0423, + "step": 10340 + }, + { + "epoch": 0.99, + "grad_norm": 0.38868786111630993, + "learning_rate": 0.00011000488062806929, + "loss": 1.0834, + "step": 10341 + }, + { + "epoch": 0.99, + "grad_norm": 0.3172034502101059, + "learning_rate": 0.00010998914038607762, + "loss": 1.0222, + "step": 10342 + }, + { + "epoch": 0.99, + "grad_norm": 0.2858843799078492, + "learning_rate": 0.0001099733998941014, + "loss": 1.0284, + "step": 10343 + }, + { + "epoch": 0.99, + "grad_norm": 0.2646838577150544, + "learning_rate": 0.00010995765915253462, + "loss": 1.1413, + "step": 10344 + }, + { + "epoch": 0.99, + "grad_norm": 0.27314529183764996, + "learning_rate": 0.00010994191816177115, + "loss": 1.0144, + "step": 10345 + }, + { + "epoch": 0.99, + "grad_norm": 0.25795307029613257, + "learning_rate": 0.0001099261769222049, + "loss": 1.1011, + "step": 10346 + }, + { + "epoch": 0.99, + "grad_norm": 0.269654266329036, + "learning_rate": 0.00010991043543422987, + "loss": 1.093, + "step": 10347 + }, + { + "epoch": 0.99, + "grad_norm": 0.2873513143757092, + "learning_rate": 0.00010989469369823993, + "loss": 1.0575, + "step": 10348 + }, + { + "epoch": 0.99, + "grad_norm": 0.2966705516629044, + "learning_rate": 0.0001098789517146291, + "loss": 0.9566, + "step": 10349 + }, + { + "epoch": 0.99, + "grad_norm": 0.30235296838928694, + "learning_rate": 0.00010986320948379125, + "loss": 1.0973, + "step": 10350 + }, + { + "epoch": 0.99, + "grad_norm": 0.22856559446563995, + "learning_rate": 0.00010984746700612043, + "loss": 1.0409, + "step": 10351 + }, + { + "epoch": 0.99, + "grad_norm": 0.29679007545691055, + "learning_rate": 0.00010983172428201055, + "loss": 1.0134, + "step": 10352 + }, + { + "epoch": 0.99, + "grad_norm": 0.3089775230936333, + "learning_rate": 0.00010981598131185558, + "loss": 1.0447, + "step": 10353 + }, + { + "epoch": 0.99, + "grad_norm": 0.29725469954817196, + "learning_rate": 0.00010980023809604951, + "loss": 1.0729, + "step": 10354 + }, + { + "epoch": 0.99, + "grad_norm": 0.2626951615117108, + "learning_rate": 0.00010978449463498632, + "loss": 0.9726, + "step": 10355 + }, + { + "epoch": 0.99, + "grad_norm": 0.27242407027742316, + "learning_rate": 0.00010976875092906003, + "loss": 1.0434, + "step": 10356 + }, + { + "epoch": 0.99, + "grad_norm": 0.2947561469672809, + "learning_rate": 0.00010975300697866456, + "loss": 1.0845, + "step": 10357 + }, + { + "epoch": 0.99, + "grad_norm": 0.29370486489779185, + "learning_rate": 0.00010973726278419398, + "loss": 0.9322, + "step": 10358 + }, + { + "epoch": 0.99, + "grad_norm": 0.2928377780175839, + "learning_rate": 0.00010972151834604229, + "loss": 1.0208, + "step": 10359 + }, + { + "epoch": 0.99, + "grad_norm": 0.32099727413336887, + "learning_rate": 0.00010970577366460349, + "loss": 1.0144, + "step": 10360 + }, + { + "epoch": 0.99, + "grad_norm": 0.2763946202818681, + "learning_rate": 0.00010969002874027161, + "loss": 1.206, + "step": 10361 + }, + { + "epoch": 0.99, + "grad_norm": 0.29031856568736375, + "learning_rate": 0.00010967428357344067, + "loss": 1.0013, + "step": 10362 + }, + { + "epoch": 0.99, + "grad_norm": 0.2600423194599826, + "learning_rate": 0.00010965853816450469, + "loss": 1.1104, + "step": 10363 + }, + { + "epoch": 0.99, + "grad_norm": 0.262125842100789, + "learning_rate": 0.00010964279251385774, + "loss": 1.0854, + "step": 10364 + }, + { + "epoch": 0.99, + "grad_norm": 0.2983375163299008, + "learning_rate": 0.00010962704662189383, + "loss": 1.0486, + "step": 10365 + }, + { + "epoch": 0.99, + "grad_norm": 0.27574617002830204, + "learning_rate": 0.00010961130048900705, + "loss": 1.0465, + "step": 10366 + }, + { + "epoch": 0.99, + "grad_norm": 0.2474828198501618, + "learning_rate": 0.0001095955541155914, + "loss": 1.0645, + "step": 10367 + }, + { + "epoch": 0.99, + "grad_norm": 0.33229191383287654, + "learning_rate": 0.000109579807502041, + "loss": 0.9869, + "step": 10368 + }, + { + "epoch": 0.99, + "grad_norm": 0.2982566291542284, + "learning_rate": 0.0001095640606487499, + "loss": 1.0616, + "step": 10369 + }, + { + "epoch": 0.99, + "grad_norm": 0.3011080575254373, + "learning_rate": 0.00010954831355611215, + "loss": 1.0475, + "step": 10370 + }, + { + "epoch": 0.99, + "grad_norm": 0.2954042572701708, + "learning_rate": 0.00010953256622452185, + "loss": 1.0212, + "step": 10371 + }, + { + "epoch": 0.99, + "grad_norm": 0.3015044088263099, + "learning_rate": 0.0001095168186543731, + "loss": 1.0839, + "step": 10372 + }, + { + "epoch": 0.99, + "grad_norm": 0.36161978315568827, + "learning_rate": 0.00010950107084605998, + "loss": 0.9966, + "step": 10373 + }, + { + "epoch": 0.99, + "grad_norm": 0.2947245497967197, + "learning_rate": 0.00010948532279997664, + "loss": 1.0052, + "step": 10374 + }, + { + "epoch": 0.99, + "grad_norm": 0.278629231797063, + "learning_rate": 0.00010946957451651709, + "loss": 0.974, + "step": 10375 + }, + { + "epoch": 0.99, + "grad_norm": 0.3081240007766056, + "learning_rate": 0.0001094538259960755, + "loss": 0.9513, + "step": 10376 + }, + { + "epoch": 0.99, + "grad_norm": 0.37939549753517904, + "learning_rate": 0.00010943807723904593, + "loss": 1.1191, + "step": 10377 + }, + { + "epoch": 0.99, + "grad_norm": 0.24633288069760065, + "learning_rate": 0.0001094223282458226, + "loss": 1.0712, + "step": 10378 + }, + { + "epoch": 0.99, + "grad_norm": 0.3096687909717489, + "learning_rate": 0.00010940657901679956, + "loss": 0.9914, + "step": 10379 + }, + { + "epoch": 0.99, + "grad_norm": 0.3063298337056405, + "learning_rate": 0.00010939082955237096, + "loss": 1.1432, + "step": 10380 + }, + { + "epoch": 0.99, + "grad_norm": 0.2745100518027808, + "learning_rate": 0.00010937507985293098, + "loss": 1.0335, + "step": 10381 + }, + { + "epoch": 0.99, + "grad_norm": 0.25605757624283976, + "learning_rate": 0.00010935932991887372, + "loss": 0.9888, + "step": 10382 + }, + { + "epoch": 0.99, + "grad_norm": 0.27108913673944884, + "learning_rate": 0.00010934357975059334, + "loss": 1.1554, + "step": 10383 + }, + { + "epoch": 0.99, + "grad_norm": 0.3465731349344641, + "learning_rate": 0.000109327829348484, + "loss": 1.0209, + "step": 10384 + }, + { + "epoch": 0.99, + "grad_norm": 0.2661351715791654, + "learning_rate": 0.00010931207871293987, + "loss": 1.1225, + "step": 10385 + }, + { + "epoch": 0.99, + "grad_norm": 0.2992239551094853, + "learning_rate": 0.00010929632784435513, + "loss": 1.1448, + "step": 10386 + }, + { + "epoch": 0.99, + "grad_norm": 0.2714660858795361, + "learning_rate": 0.00010928057674312393, + "loss": 1.1303, + "step": 10387 + }, + { + "epoch": 0.99, + "grad_norm": 0.3560152461135453, + "learning_rate": 0.00010926482540964047, + "loss": 1.0939, + "step": 10388 + }, + { + "epoch": 0.99, + "grad_norm": 0.33230188942732725, + "learning_rate": 0.00010924907384429892, + "loss": 1.0654, + "step": 10389 + }, + { + "epoch": 0.99, + "grad_norm": 0.2648343149387062, + "learning_rate": 0.0001092333220474935, + "loss": 1.0357, + "step": 10390 + }, + { + "epoch": 0.99, + "grad_norm": 0.2959171922667274, + "learning_rate": 0.00010921757001961839, + "loss": 1.0695, + "step": 10391 + }, + { + "epoch": 0.99, + "grad_norm": 0.26506456061345024, + "learning_rate": 0.00010920181776106779, + "loss": 1.1593, + "step": 10392 + }, + { + "epoch": 0.99, + "grad_norm": 0.3204886263895637, + "learning_rate": 0.00010918606527223593, + "loss": 1.0391, + "step": 10393 + }, + { + "epoch": 0.99, + "grad_norm": 0.24450093898680367, + "learning_rate": 0.00010917031255351699, + "loss": 0.9267, + "step": 10394 + }, + { + "epoch": 0.99, + "grad_norm": 0.28085674386349635, + "learning_rate": 0.00010915455960530526, + "loss": 1.0893, + "step": 10395 + }, + { + "epoch": 0.99, + "grad_norm": 0.28257173644304456, + "learning_rate": 0.00010913880642799488, + "loss": 1.0546, + "step": 10396 + }, + { + "epoch": 0.99, + "grad_norm": 0.3084129119694214, + "learning_rate": 0.00010912305302198014, + "loss": 1.0221, + "step": 10397 + }, + { + "epoch": 0.99, + "grad_norm": 0.2778700782623621, + "learning_rate": 0.00010910729938765528, + "loss": 1.0197, + "step": 10398 + }, + { + "epoch": 0.99, + "grad_norm": 0.24322367397269376, + "learning_rate": 0.00010909154552541449, + "loss": 0.9353, + "step": 10399 + }, + { + "epoch": 0.99, + "grad_norm": 0.27278544222892476, + "learning_rate": 0.0001090757914356521, + "loss": 1.076, + "step": 10400 + }, + { + "epoch": 1.0, + "grad_norm": 0.2869093663218051, + "learning_rate": 0.00010906003711876229, + "loss": 0.9683, + "step": 10401 + }, + { + "epoch": 1.0, + "grad_norm": 0.2941519770990777, + "learning_rate": 0.00010904428257513939, + "loss": 1.0158, + "step": 10402 + }, + { + "epoch": 1.0, + "grad_norm": 0.27338008253801777, + "learning_rate": 0.00010902852780517763, + "loss": 1.0386, + "step": 10403 + }, + { + "epoch": 1.0, + "grad_norm": 0.28389255664388, + "learning_rate": 0.00010901277280927124, + "loss": 1.1183, + "step": 10404 + }, + { + "epoch": 1.0, + "grad_norm": 0.3443690879581628, + "learning_rate": 0.0001089970175878146, + "loss": 1.098, + "step": 10405 + }, + { + "epoch": 1.0, + "grad_norm": 0.2551799172999775, + "learning_rate": 0.00010898126214120194, + "loss": 1.0581, + "step": 10406 + }, + { + "epoch": 1.0, + "grad_norm": 0.282716317579218, + "learning_rate": 0.0001089655064698275, + "loss": 0.9715, + "step": 10407 + }, + { + "epoch": 1.0, + "grad_norm": 0.31282993584728686, + "learning_rate": 0.00010894975057408568, + "loss": 1.047, + "step": 10408 + }, + { + "epoch": 1.0, + "grad_norm": 0.30036322798930976, + "learning_rate": 0.00010893399445437071, + "loss": 1.188, + "step": 10409 + }, + { + "epoch": 1.0, + "grad_norm": 0.3325931928877523, + "learning_rate": 0.0001089182381110769, + "loss": 0.9747, + "step": 10410 + }, + { + "epoch": 1.0, + "grad_norm": 0.3117101250717752, + "learning_rate": 0.00010890248154459858, + "loss": 1.0603, + "step": 10411 + }, + { + "epoch": 1.0, + "grad_norm": 0.28934935878323786, + "learning_rate": 0.00010888672475533006, + "loss": 1.093, + "step": 10412 + }, + { + "epoch": 1.0, + "grad_norm": 0.2716220894787029, + "learning_rate": 0.0001088709677436657, + "loss": 1.1221, + "step": 10413 + }, + { + "epoch": 1.0, + "grad_norm": 0.30194803612986443, + "learning_rate": 0.00010885521050999976, + "loss": 1.0063, + "step": 10414 + }, + { + "epoch": 1.0, + "grad_norm": 0.28931680238920227, + "learning_rate": 0.00010883945305472662, + "loss": 0.9928, + "step": 10415 + }, + { + "epoch": 1.0, + "grad_norm": 0.3229463397228268, + "learning_rate": 0.00010882369537824064, + "loss": 1.0712, + "step": 10416 + }, + { + "epoch": 1.0, + "grad_norm": 0.2954911003697874, + "learning_rate": 0.00010880793748093615, + "loss": 1.0046, + "step": 10417 + }, + { + "epoch": 1.0, + "grad_norm": 0.2872966086169036, + "learning_rate": 0.00010879217936320743, + "loss": 1.0814, + "step": 10418 + }, + { + "epoch": 1.0, + "grad_norm": 0.3310289766979793, + "learning_rate": 0.00010877642102544891, + "loss": 0.9988, + "step": 10419 + }, + { + "epoch": 1.0, + "grad_norm": 0.2661925275942424, + "learning_rate": 0.00010876066246805496, + "loss": 1.0533, + "step": 10420 + }, + { + "epoch": 1.0, + "grad_norm": 0.2868056408701368, + "learning_rate": 0.00010874490369141991, + "loss": 1.0454, + "step": 10421 + }, + { + "epoch": 1.0, + "grad_norm": 0.25382291752633346, + "learning_rate": 0.00010872914469593816, + "loss": 1.0787, + "step": 10422 + }, + { + "epoch": 1.0, + "grad_norm": 0.32950064922007866, + "learning_rate": 0.00010871338548200409, + "loss": 1.0575, + "step": 10423 + }, + { + "epoch": 1.0, + "grad_norm": 0.28542084992905475, + "learning_rate": 0.00010869762605001204, + "loss": 1.0273, + "step": 10424 + }, + { + "epoch": 1.0, + "grad_norm": 0.274404919205005, + "learning_rate": 0.00010868186640035645, + "loss": 1.1522, + "step": 10425 + }, + { + "epoch": 1.0, + "grad_norm": 0.27910073576406763, + "learning_rate": 0.0001086661065334317, + "loss": 0.9747, + "step": 10426 + }, + { + "epoch": 1.0, + "grad_norm": 0.2754135797678086, + "learning_rate": 0.00010865034644963219, + "loss": 1.0593, + "step": 10427 + }, + { + "epoch": 1.0, + "grad_norm": 0.3036570067584713, + "learning_rate": 0.00010863458614935228, + "loss": 0.9586, + "step": 10428 + }, + { + "epoch": 1.0, + "grad_norm": 0.25912219351365356, + "learning_rate": 0.00010861882563298648, + "loss": 1.1329, + "step": 10429 + }, + { + "epoch": 1.0, + "grad_norm": 0.30796423013527807, + "learning_rate": 0.00010860306490092916, + "loss": 0.9919, + "step": 10430 + }, + { + "epoch": 1.0, + "grad_norm": 0.2939383687791828, + "learning_rate": 0.00010858730395357468, + "loss": 0.9745, + "step": 10431 + }, + { + "epoch": 1.0, + "grad_norm": 0.27337663742366286, + "learning_rate": 0.00010857154279131754, + "loss": 0.9214, + "step": 10432 + }, + { + "epoch": 1.0, + "grad_norm": 0.26204823094059426, + "learning_rate": 0.00010855578141455216, + "loss": 1.0411, + "step": 10433 + }, + { + "epoch": 1.0, + "grad_norm": 0.26527110818824695, + "learning_rate": 0.00010854001982367296, + "loss": 1.0424, + "step": 10434 + }, + { + "epoch": 1.0, + "grad_norm": 0.25265488574920947, + "learning_rate": 0.00010852425801907442, + "loss": 0.9679, + "step": 10435 + }, + { + "epoch": 1.0, + "grad_norm": 0.26557773202249946, + "learning_rate": 0.00010850849600115096, + "loss": 1.0498, + "step": 10436 + }, + { + "epoch": 1.0, + "grad_norm": 0.3095199638697367, + "learning_rate": 0.00010849273377029705, + "loss": 0.9924, + "step": 10437 + }, + { + "epoch": 1.0, + "grad_norm": 0.25086882514656306, + "learning_rate": 0.00010847697132690713, + "loss": 1.0061, + "step": 10438 + }, + { + "epoch": 1.0, + "grad_norm": 0.26227634701423524, + "learning_rate": 0.00010846120867137567, + "loss": 1.022, + "step": 10439 + }, + { + "epoch": 1.0, + "grad_norm": 0.272747535197043, + "learning_rate": 0.00010844544580409717, + "loss": 1.049, + "step": 10440 + }, + { + "epoch": 1.0, + "grad_norm": 0.30486635134117596, + "learning_rate": 0.00010842968272546603, + "loss": 0.9928, + "step": 10441 + }, + { + "epoch": 1.0, + "grad_norm": 0.2911807248936414, + "learning_rate": 0.00010841391943587682, + "loss": 1.0824, + "step": 10442 + }, + { + "epoch": 1.0, + "grad_norm": 0.24115347100619006, + "learning_rate": 0.00010839815593572398, + "loss": 1.0671, + "step": 10443 + }, + { + "epoch": 1.0, + "grad_norm": 0.2749883873988357, + "learning_rate": 0.00010838239222540203, + "loss": 1.1308, + "step": 10444 + }, + { + "epoch": 1.0, + "grad_norm": 0.25592035706779787, + "learning_rate": 0.00010836662830530539, + "loss": 1.0291, + "step": 10445 + }, + { + "epoch": 1.0, + "grad_norm": 0.27534119534699625, + "learning_rate": 0.00010835086417582867, + "loss": 1.075, + "step": 10446 + }, + { + "epoch": 1.0, + "grad_norm": 0.29613831949975483, + "learning_rate": 0.00010833509983736632, + "loss": 1.0167, + "step": 10447 + }, + { + "epoch": 1.0, + "grad_norm": 0.2971382111384855, + "learning_rate": 0.00010831933529031284, + "loss": 1.0102, + "step": 10448 + }, + { + "epoch": 1.0, + "grad_norm": 0.2833249451857255, + "learning_rate": 0.00010830357053506277, + "loss": 1.1205, + "step": 10449 + }, + { + "epoch": 1.0, + "grad_norm": 0.28978501988105787, + "learning_rate": 0.00010828780557201066, + "loss": 1.0154, + "step": 10450 + }, + { + "epoch": 1.0, + "grad_norm": 0.32646478206285645, + "learning_rate": 0.00010827204040155094, + "loss": 0.9872, + "step": 10451 + }, + { + "epoch": 1.0, + "grad_norm": 0.28028521065676737, + "learning_rate": 0.00010825627502407827, + "loss": 0.9816, + "step": 10452 + }, + { + "epoch": 1.0, + "eval_loss": 1.123805046081543, + "eval_runtime": 4227.9159, + "eval_samples_per_second": 19.778, + "eval_steps_per_second": 2.472, + "step": 10452 + } + ], + "logging_steps": 1, + "max_steps": 20904, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10452, + "total_flos": 2.304536495731507e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}