diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,80941 @@ +{ + "best_metric": 12.103847094587513, + "best_model_checkpoint": "./hviske-v3/checkpoint-11548", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 11548, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006927606511950121, + "grad_norm": 12.292132377624512, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2402, + "step": 1 + }, + { + "epoch": 0.0013855213023900243, + "grad_norm": 11.564336776733398, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1541, + "step": 2 + }, + { + "epoch": 0.0020782819535850364, + "grad_norm": 9.734325408935547, + "learning_rate": 3e-06, + "loss": 1.0122, + "step": 3 + }, + { + "epoch": 0.0027710426047800486, + "grad_norm": 7.753880977630615, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8952, + "step": 4 + }, + { + "epoch": 0.0034638032559750607, + "grad_norm": 6.528608798980713, + "learning_rate": 5e-06, + "loss": 0.7526, + "step": 5 + }, + { + "epoch": 0.004156563907170073, + "grad_norm": 5.133320331573486, + "learning_rate": 6e-06, + "loss": 0.5005, + "step": 6 + }, + { + "epoch": 0.0048493245583650845, + "grad_norm": 3.6035006046295166, + "learning_rate": 7e-06, + "loss": 0.4519, + "step": 7 + }, + { + "epoch": 0.005542085209560097, + "grad_norm": 2.8495588302612305, + "learning_rate": 8.000000000000001e-06, + "loss": 0.3598, + "step": 8 + }, + { + "epoch": 0.006234845860755109, + "grad_norm": 2.3552119731903076, + "learning_rate": 9e-06, + "loss": 0.337, + "step": 9 + }, + { + "epoch": 0.006927606511950121, + "grad_norm": 2.3528480529785156, + "learning_rate": 1e-05, + "loss": 0.3892, + "step": 10 + }, + { + "epoch": 0.007620367163145133, + "grad_norm": 2.1666362285614014, + "learning_rate": 9.999306518723996e-06, + "loss": 0.3178, + "step": 11 + }, + { + "epoch": 0.008313127814340146, + "grad_norm": 1.9824934005737305, + "learning_rate": 9.998613037447989e-06, + "loss": 0.3137, + "step": 12 + }, + { + "epoch": 0.009005888465535157, + "grad_norm": 1.8904988765716553, + "learning_rate": 9.997919556171984e-06, + "loss": 0.3366, + "step": 13 + }, + { + "epoch": 0.009698649116730169, + "grad_norm": 1.7962043285369873, + "learning_rate": 9.997226074895979e-06, + "loss": 0.3415, + "step": 14 + }, + { + "epoch": 0.010391409767925183, + "grad_norm": 1.5924854278564453, + "learning_rate": 9.996532593619974e-06, + "loss": 0.3083, + "step": 15 + }, + { + "epoch": 0.011084170419120194, + "grad_norm": 1.5507971048355103, + "learning_rate": 9.995839112343969e-06, + "loss": 0.2547, + "step": 16 + }, + { + "epoch": 0.011776931070315206, + "grad_norm": 1.625256061553955, + "learning_rate": 9.995145631067962e-06, + "loss": 0.3259, + "step": 17 + }, + { + "epoch": 0.012469691721510218, + "grad_norm": 1.5093649625778198, + "learning_rate": 9.994452149791957e-06, + "loss": 0.2925, + "step": 18 + }, + { + "epoch": 0.013162452372705231, + "grad_norm": 1.570859432220459, + "learning_rate": 9.99375866851595e-06, + "loss": 0.3171, + "step": 19 + }, + { + "epoch": 0.013855213023900243, + "grad_norm": 1.4971448183059692, + "learning_rate": 9.993065187239945e-06, + "loss": 0.3069, + "step": 20 + }, + { + "epoch": 0.014547973675095255, + "grad_norm": 1.3889166116714478, + "learning_rate": 9.99237170596394e-06, + "loss": 0.2657, + "step": 21 + }, + { + "epoch": 0.015240734326290266, + "grad_norm": 1.4377930164337158, + "learning_rate": 9.991678224687933e-06, + "loss": 0.2713, + "step": 22 + }, + { + "epoch": 0.01593349497748528, + "grad_norm": 1.4533603191375732, + "learning_rate": 9.990984743411928e-06, + "loss": 0.2843, + "step": 23 + }, + { + "epoch": 0.01662625562868029, + "grad_norm": 1.4159753322601318, + "learning_rate": 9.990291262135923e-06, + "loss": 0.3001, + "step": 24 + }, + { + "epoch": 0.017319016279875303, + "grad_norm": 1.3404206037521362, + "learning_rate": 9.989597780859918e-06, + "loss": 0.2786, + "step": 25 + }, + { + "epoch": 0.018011776931070315, + "grad_norm": 1.5711417198181152, + "learning_rate": 9.988904299583913e-06, + "loss": 0.2985, + "step": 26 + }, + { + "epoch": 0.018704537582265326, + "grad_norm": 1.3879624605178833, + "learning_rate": 9.988210818307906e-06, + "loss": 0.2675, + "step": 27 + }, + { + "epoch": 0.019397298233460338, + "grad_norm": 1.4611817598342896, + "learning_rate": 9.987517337031901e-06, + "loss": 0.2618, + "step": 28 + }, + { + "epoch": 0.02009005888465535, + "grad_norm": 1.5529396533966064, + "learning_rate": 9.986823855755894e-06, + "loss": 0.2912, + "step": 29 + }, + { + "epoch": 0.020782819535850365, + "grad_norm": 1.4379888772964478, + "learning_rate": 9.98613037447989e-06, + "loss": 0.2967, + "step": 30 + }, + { + "epoch": 0.021475580187045377, + "grad_norm": 1.515025019645691, + "learning_rate": 9.985436893203884e-06, + "loss": 0.3046, + "step": 31 + }, + { + "epoch": 0.02216834083824039, + "grad_norm": 1.5616956949234009, + "learning_rate": 9.98474341192788e-06, + "loss": 0.2605, + "step": 32 + }, + { + "epoch": 0.0228611014894354, + "grad_norm": 1.435473084449768, + "learning_rate": 9.984049930651874e-06, + "loss": 0.2748, + "step": 33 + }, + { + "epoch": 0.023553862140630412, + "grad_norm": 1.5559055805206299, + "learning_rate": 9.983356449375867e-06, + "loss": 0.2775, + "step": 34 + }, + { + "epoch": 0.024246622791825424, + "grad_norm": 1.3089165687561035, + "learning_rate": 9.982662968099862e-06, + "loss": 0.2415, + "step": 35 + }, + { + "epoch": 0.024939383443020435, + "grad_norm": 1.2515681982040405, + "learning_rate": 9.981969486823857e-06, + "loss": 0.2404, + "step": 36 + }, + { + "epoch": 0.025632144094215447, + "grad_norm": 1.357074499130249, + "learning_rate": 9.98127600554785e-06, + "loss": 0.2519, + "step": 37 + }, + { + "epoch": 0.026324904745410462, + "grad_norm": 1.4473412036895752, + "learning_rate": 9.980582524271845e-06, + "loss": 0.2975, + "step": 38 + }, + { + "epoch": 0.027017665396605474, + "grad_norm": 1.442353367805481, + "learning_rate": 9.97988904299584e-06, + "loss": 0.2451, + "step": 39 + }, + { + "epoch": 0.027710426047800486, + "grad_norm": 1.4204617738723755, + "learning_rate": 9.979195561719834e-06, + "loss": 0.2315, + "step": 40 + }, + { + "epoch": 0.028403186698995497, + "grad_norm": 1.443942904472351, + "learning_rate": 9.978502080443829e-06, + "loss": 0.2895, + "step": 41 + }, + { + "epoch": 0.02909594735019051, + "grad_norm": 1.2991156578063965, + "learning_rate": 9.977808599167823e-06, + "loss": 0.2266, + "step": 42 + }, + { + "epoch": 0.02978870800138552, + "grad_norm": 1.3881120681762695, + "learning_rate": 9.977115117891818e-06, + "loss": 0.2772, + "step": 43 + }, + { + "epoch": 0.030481468652580532, + "grad_norm": 1.3863579034805298, + "learning_rate": 9.976421636615812e-06, + "loss": 0.2694, + "step": 44 + }, + { + "epoch": 0.031174229303775544, + "grad_norm": 1.4252607822418213, + "learning_rate": 9.975728155339807e-06, + "loss": 0.2603, + "step": 45 + }, + { + "epoch": 0.03186698995497056, + "grad_norm": 1.3916229009628296, + "learning_rate": 9.975034674063801e-06, + "loss": 0.2634, + "step": 46 + }, + { + "epoch": 0.03255975060616557, + "grad_norm": 1.3882328271865845, + "learning_rate": 9.974341192787795e-06, + "loss": 0.2361, + "step": 47 + }, + { + "epoch": 0.03325251125736058, + "grad_norm": 1.3797417879104614, + "learning_rate": 9.97364771151179e-06, + "loss": 0.2611, + "step": 48 + }, + { + "epoch": 0.03394527190855559, + "grad_norm": 1.3497843742370605, + "learning_rate": 9.972954230235785e-06, + "loss": 0.256, + "step": 49 + }, + { + "epoch": 0.034638032559750606, + "grad_norm": 1.432814359664917, + "learning_rate": 9.97226074895978e-06, + "loss": 0.1958, + "step": 50 + }, + { + "epoch": 0.03533079321094562, + "grad_norm": 1.3722063302993774, + "learning_rate": 9.971567267683774e-06, + "loss": 0.2559, + "step": 51 + }, + { + "epoch": 0.03602355386214063, + "grad_norm": 1.3759533166885376, + "learning_rate": 9.970873786407768e-06, + "loss": 0.2528, + "step": 52 + }, + { + "epoch": 0.036716314513335645, + "grad_norm": 1.4539951086044312, + "learning_rate": 9.970180305131763e-06, + "loss": 0.2481, + "step": 53 + }, + { + "epoch": 0.03740907516453065, + "grad_norm": 1.3330899477005005, + "learning_rate": 9.969486823855756e-06, + "loss": 0.2212, + "step": 54 + }, + { + "epoch": 0.03810183581572567, + "grad_norm": 1.3839563131332397, + "learning_rate": 9.96879334257975e-06, + "loss": 0.2538, + "step": 55 + }, + { + "epoch": 0.038794596466920676, + "grad_norm": 1.4515337944030762, + "learning_rate": 9.968099861303746e-06, + "loss": 0.2505, + "step": 56 + }, + { + "epoch": 0.03948735711811569, + "grad_norm": 1.3861531019210815, + "learning_rate": 9.96740638002774e-06, + "loss": 0.2625, + "step": 57 + }, + { + "epoch": 0.0401801177693107, + "grad_norm": 1.364999771118164, + "learning_rate": 9.966712898751736e-06, + "loss": 0.2652, + "step": 58 + }, + { + "epoch": 0.040872878420505715, + "grad_norm": 1.4211515188217163, + "learning_rate": 9.966019417475729e-06, + "loss": 0.2569, + "step": 59 + }, + { + "epoch": 0.04156563907170073, + "grad_norm": 1.3074867725372314, + "learning_rate": 9.965325936199724e-06, + "loss": 0.2365, + "step": 60 + }, + { + "epoch": 0.04225839972289574, + "grad_norm": 1.4820562601089478, + "learning_rate": 9.964632454923719e-06, + "loss": 0.2756, + "step": 61 + }, + { + "epoch": 0.042951160374090754, + "grad_norm": 1.3363418579101562, + "learning_rate": 9.963938973647712e-06, + "loss": 0.2178, + "step": 62 + }, + { + "epoch": 0.04364392102528576, + "grad_norm": 1.466212272644043, + "learning_rate": 9.963245492371707e-06, + "loss": 0.2638, + "step": 63 + }, + { + "epoch": 0.04433668167648078, + "grad_norm": 1.5230470895767212, + "learning_rate": 9.9625520110957e-06, + "loss": 0.284, + "step": 64 + }, + { + "epoch": 0.045029442327675785, + "grad_norm": 1.2839550971984863, + "learning_rate": 9.961858529819695e-06, + "loss": 0.2314, + "step": 65 + }, + { + "epoch": 0.0457222029788708, + "grad_norm": 1.468959927558899, + "learning_rate": 9.96116504854369e-06, + "loss": 0.2499, + "step": 66 + }, + { + "epoch": 0.046414963630065816, + "grad_norm": 1.2798144817352295, + "learning_rate": 9.960471567267685e-06, + "loss": 0.2303, + "step": 67 + }, + { + "epoch": 0.047107724281260824, + "grad_norm": 1.324418544769287, + "learning_rate": 9.95977808599168e-06, + "loss": 0.214, + "step": 68 + }, + { + "epoch": 0.04780048493245584, + "grad_norm": 1.3903834819793701, + "learning_rate": 9.959084604715673e-06, + "loss": 0.2675, + "step": 69 + }, + { + "epoch": 0.04849324558365085, + "grad_norm": 1.4964789152145386, + "learning_rate": 9.958391123439668e-06, + "loss": 0.2548, + "step": 70 + }, + { + "epoch": 0.04918600623484586, + "grad_norm": 1.252150058746338, + "learning_rate": 9.957697642163663e-06, + "loss": 0.2366, + "step": 71 + }, + { + "epoch": 0.04987876688604087, + "grad_norm": 1.445634126663208, + "learning_rate": 9.957004160887656e-06, + "loss": 0.2491, + "step": 72 + }, + { + "epoch": 0.050571527537235886, + "grad_norm": 1.4249677658081055, + "learning_rate": 9.956310679611651e-06, + "loss": 0.2385, + "step": 73 + }, + { + "epoch": 0.051264288188430894, + "grad_norm": 1.3850736618041992, + "learning_rate": 9.955617198335646e-06, + "loss": 0.2443, + "step": 74 + }, + { + "epoch": 0.05195704883962591, + "grad_norm": 1.3256713151931763, + "learning_rate": 9.954923717059641e-06, + "loss": 0.2381, + "step": 75 + }, + { + "epoch": 0.052649809490820924, + "grad_norm": 1.3021267652511597, + "learning_rate": 9.954230235783636e-06, + "loss": 0.2498, + "step": 76 + }, + { + "epoch": 0.05334257014201593, + "grad_norm": 1.2603353261947632, + "learning_rate": 9.95353675450763e-06, + "loss": 0.2668, + "step": 77 + }, + { + "epoch": 0.05403533079321095, + "grad_norm": 1.2531903982162476, + "learning_rate": 9.952843273231624e-06, + "loss": 0.2266, + "step": 78 + }, + { + "epoch": 0.054728091444405956, + "grad_norm": 1.3433337211608887, + "learning_rate": 9.952149791955617e-06, + "loss": 0.2532, + "step": 79 + }, + { + "epoch": 0.05542085209560097, + "grad_norm": 1.2393718957901, + "learning_rate": 9.951456310679612e-06, + "loss": 0.237, + "step": 80 + }, + { + "epoch": 0.05611361274679598, + "grad_norm": 1.3576997518539429, + "learning_rate": 9.950762829403607e-06, + "loss": 0.2519, + "step": 81 + }, + { + "epoch": 0.056806373397990995, + "grad_norm": 1.3664438724517822, + "learning_rate": 9.9500693481276e-06, + "loss": 0.25, + "step": 82 + }, + { + "epoch": 0.057499134049186, + "grad_norm": 1.4981049299240112, + "learning_rate": 9.949375866851595e-06, + "loss": 0.2901, + "step": 83 + }, + { + "epoch": 0.05819189470038102, + "grad_norm": 1.4166090488433838, + "learning_rate": 9.94868238557559e-06, + "loss": 0.2465, + "step": 84 + }, + { + "epoch": 0.05888465535157603, + "grad_norm": 1.2040208578109741, + "learning_rate": 9.947988904299585e-06, + "loss": 0.1856, + "step": 85 + }, + { + "epoch": 0.05957741600277104, + "grad_norm": 1.189120888710022, + "learning_rate": 9.94729542302358e-06, + "loss": 0.2086, + "step": 86 + }, + { + "epoch": 0.06027017665396606, + "grad_norm": 1.3155075311660767, + "learning_rate": 9.946601941747573e-06, + "loss": 0.2299, + "step": 87 + }, + { + "epoch": 0.060962937305161065, + "grad_norm": 1.4105749130249023, + "learning_rate": 9.945908460471568e-06, + "loss": 0.267, + "step": 88 + }, + { + "epoch": 0.06165569795635608, + "grad_norm": 1.3087950944900513, + "learning_rate": 9.945214979195562e-06, + "loss": 0.2267, + "step": 89 + }, + { + "epoch": 0.06234845860755109, + "grad_norm": 1.3032172918319702, + "learning_rate": 9.944521497919557e-06, + "loss": 0.2376, + "step": 90 + }, + { + "epoch": 0.0630412192587461, + "grad_norm": 1.2708215713500977, + "learning_rate": 9.943828016643551e-06, + "loss": 0.2141, + "step": 91 + }, + { + "epoch": 0.06373397990994112, + "grad_norm": 1.1809252500534058, + "learning_rate": 9.943134535367546e-06, + "loss": 0.2102, + "step": 92 + }, + { + "epoch": 0.06442674056113613, + "grad_norm": 1.3737300634384155, + "learning_rate": 9.942441054091541e-06, + "loss": 0.2512, + "step": 93 + }, + { + "epoch": 0.06511950121233114, + "grad_norm": 1.261559009552002, + "learning_rate": 9.941747572815535e-06, + "loss": 0.218, + "step": 94 + }, + { + "epoch": 0.06581226186352615, + "grad_norm": 1.394752860069275, + "learning_rate": 9.94105409153953e-06, + "loss": 0.2524, + "step": 95 + }, + { + "epoch": 0.06650502251472117, + "grad_norm": 1.3859493732452393, + "learning_rate": 9.940360610263524e-06, + "loss": 0.1946, + "step": 96 + }, + { + "epoch": 0.06719778316591618, + "grad_norm": 1.346633791923523, + "learning_rate": 9.939667128987518e-06, + "loss": 0.2555, + "step": 97 + }, + { + "epoch": 0.06789054381711118, + "grad_norm": 1.2180049419403076, + "learning_rate": 9.938973647711513e-06, + "loss": 0.2117, + "step": 98 + }, + { + "epoch": 0.0685833044683062, + "grad_norm": 1.2323360443115234, + "learning_rate": 9.938280166435506e-06, + "loss": 0.2402, + "step": 99 + }, + { + "epoch": 0.06927606511950121, + "grad_norm": 1.1962711811065674, + "learning_rate": 9.9375866851595e-06, + "loss": 0.2081, + "step": 100 + }, + { + "epoch": 0.06996882577069623, + "grad_norm": 1.3550431728363037, + "learning_rate": 9.936893203883496e-06, + "loss": 0.24, + "step": 101 + }, + { + "epoch": 0.07066158642189124, + "grad_norm": 1.2796891927719116, + "learning_rate": 9.93619972260749e-06, + "loss": 0.2375, + "step": 102 + }, + { + "epoch": 0.07135434707308624, + "grad_norm": 1.254270076751709, + "learning_rate": 9.935506241331486e-06, + "loss": 0.2052, + "step": 103 + }, + { + "epoch": 0.07204710772428126, + "grad_norm": 1.1684892177581787, + "learning_rate": 9.934812760055479e-06, + "loss": 0.1916, + "step": 104 + }, + { + "epoch": 0.07273986837547627, + "grad_norm": 1.381418228149414, + "learning_rate": 9.934119278779474e-06, + "loss": 0.2708, + "step": 105 + }, + { + "epoch": 0.07343262902667129, + "grad_norm": 1.2438158988952637, + "learning_rate": 9.933425797503469e-06, + "loss": 0.2002, + "step": 106 + }, + { + "epoch": 0.07412538967786629, + "grad_norm": 1.244842529296875, + "learning_rate": 9.932732316227462e-06, + "loss": 0.2351, + "step": 107 + }, + { + "epoch": 0.0748181503290613, + "grad_norm": 1.1751757860183716, + "learning_rate": 9.932038834951457e-06, + "loss": 0.2049, + "step": 108 + }, + { + "epoch": 0.07551091098025632, + "grad_norm": 1.2962253093719482, + "learning_rate": 9.931345353675452e-06, + "loss": 0.2092, + "step": 109 + }, + { + "epoch": 0.07620367163145134, + "grad_norm": 1.2747116088867188, + "learning_rate": 9.930651872399447e-06, + "loss": 0.2346, + "step": 110 + }, + { + "epoch": 0.07689643228264635, + "grad_norm": 1.279923915863037, + "learning_rate": 9.929958391123442e-06, + "loss": 0.2346, + "step": 111 + }, + { + "epoch": 0.07758919293384135, + "grad_norm": 1.406819224357605, + "learning_rate": 9.929264909847435e-06, + "loss": 0.2081, + "step": 112 + }, + { + "epoch": 0.07828195358503637, + "grad_norm": 1.172568678855896, + "learning_rate": 9.92857142857143e-06, + "loss": 0.1966, + "step": 113 + }, + { + "epoch": 0.07897471423623138, + "grad_norm": 1.2985316514968872, + "learning_rate": 9.927877947295423e-06, + "loss": 0.2204, + "step": 114 + }, + { + "epoch": 0.0796674748874264, + "grad_norm": 1.4315857887268066, + "learning_rate": 9.927184466019418e-06, + "loss": 0.2115, + "step": 115 + }, + { + "epoch": 0.0803602355386214, + "grad_norm": 1.6077250242233276, + "learning_rate": 9.926490984743413e-06, + "loss": 0.2492, + "step": 116 + }, + { + "epoch": 0.08105299618981641, + "grad_norm": 1.518119215965271, + "learning_rate": 9.925797503467406e-06, + "loss": 0.2278, + "step": 117 + }, + { + "epoch": 0.08174575684101143, + "grad_norm": 1.2023013830184937, + "learning_rate": 9.925104022191401e-06, + "loss": 0.2105, + "step": 118 + }, + { + "epoch": 0.08243851749220645, + "grad_norm": 1.2681522369384766, + "learning_rate": 9.924410540915396e-06, + "loss": 0.2107, + "step": 119 + }, + { + "epoch": 0.08313127814340146, + "grad_norm": 1.2564748525619507, + "learning_rate": 9.923717059639391e-06, + "loss": 0.2202, + "step": 120 + }, + { + "epoch": 0.08382403879459646, + "grad_norm": 1.2870888710021973, + "learning_rate": 9.923023578363386e-06, + "loss": 0.2215, + "step": 121 + }, + { + "epoch": 0.08451679944579148, + "grad_norm": 1.364901065826416, + "learning_rate": 9.922330097087379e-06, + "loss": 0.2106, + "step": 122 + }, + { + "epoch": 0.08520956009698649, + "grad_norm": 1.326931118965149, + "learning_rate": 9.921636615811374e-06, + "loss": 0.2641, + "step": 123 + }, + { + "epoch": 0.08590232074818151, + "grad_norm": 1.2108707427978516, + "learning_rate": 9.920943134535367e-06, + "loss": 0.2265, + "step": 124 + }, + { + "epoch": 0.08659508139937652, + "grad_norm": 1.4389864206314087, + "learning_rate": 9.920249653259362e-06, + "loss": 0.2426, + "step": 125 + }, + { + "epoch": 0.08728784205057152, + "grad_norm": 1.3699928522109985, + "learning_rate": 9.919556171983357e-06, + "loss": 0.2315, + "step": 126 + }, + { + "epoch": 0.08798060270176654, + "grad_norm": 1.321111798286438, + "learning_rate": 9.918862690707352e-06, + "loss": 0.2347, + "step": 127 + }, + { + "epoch": 0.08867336335296155, + "grad_norm": 1.1944401264190674, + "learning_rate": 9.918169209431347e-06, + "loss": 0.2258, + "step": 128 + }, + { + "epoch": 0.08936612400415657, + "grad_norm": 1.245202898979187, + "learning_rate": 9.91747572815534e-06, + "loss": 0.2356, + "step": 129 + }, + { + "epoch": 0.09005888465535157, + "grad_norm": 1.2784806489944458, + "learning_rate": 9.916782246879335e-06, + "loss": 0.2452, + "step": 130 + }, + { + "epoch": 0.09075164530654659, + "grad_norm": 1.1236475706100464, + "learning_rate": 9.91608876560333e-06, + "loss": 0.188, + "step": 131 + }, + { + "epoch": 0.0914444059577416, + "grad_norm": 1.326741337776184, + "learning_rate": 9.915395284327323e-06, + "loss": 0.2342, + "step": 132 + }, + { + "epoch": 0.09213716660893662, + "grad_norm": 1.403367280960083, + "learning_rate": 9.914701803051318e-06, + "loss": 0.2539, + "step": 133 + }, + { + "epoch": 0.09282992726013163, + "grad_norm": 1.344679355621338, + "learning_rate": 9.914008321775313e-06, + "loss": 0.2288, + "step": 134 + }, + { + "epoch": 0.09352268791132663, + "grad_norm": 1.3344292640686035, + "learning_rate": 9.913314840499308e-06, + "loss": 0.2419, + "step": 135 + }, + { + "epoch": 0.09421544856252165, + "grad_norm": 1.2694120407104492, + "learning_rate": 9.912621359223301e-06, + "loss": 0.2186, + "step": 136 + }, + { + "epoch": 0.09490820921371666, + "grad_norm": 1.0931838750839233, + "learning_rate": 9.911927877947296e-06, + "loss": 0.1731, + "step": 137 + }, + { + "epoch": 0.09560096986491168, + "grad_norm": 1.4262518882751465, + "learning_rate": 9.911234396671291e-06, + "loss": 0.2536, + "step": 138 + }, + { + "epoch": 0.09629373051610668, + "grad_norm": 1.3521032333374023, + "learning_rate": 9.910540915395285e-06, + "loss": 0.2245, + "step": 139 + }, + { + "epoch": 0.0969864911673017, + "grad_norm": 1.4264510869979858, + "learning_rate": 9.90984743411928e-06, + "loss": 0.2639, + "step": 140 + }, + { + "epoch": 0.09767925181849671, + "grad_norm": 1.265846610069275, + "learning_rate": 9.909153952843274e-06, + "loss": 0.1989, + "step": 141 + }, + { + "epoch": 0.09837201246969172, + "grad_norm": 1.3544530868530273, + "learning_rate": 9.908460471567268e-06, + "loss": 0.2206, + "step": 142 + }, + { + "epoch": 0.09906477312088674, + "grad_norm": 1.1606234312057495, + "learning_rate": 9.907766990291263e-06, + "loss": 0.1849, + "step": 143 + }, + { + "epoch": 0.09975753377208174, + "grad_norm": 1.3297104835510254, + "learning_rate": 9.907073509015258e-06, + "loss": 0.2128, + "step": 144 + }, + { + "epoch": 0.10045029442327676, + "grad_norm": 1.2785829305648804, + "learning_rate": 9.906380027739252e-06, + "loss": 0.1987, + "step": 145 + }, + { + "epoch": 0.10114305507447177, + "grad_norm": 1.2113291025161743, + "learning_rate": 9.905686546463247e-06, + "loss": 0.2208, + "step": 146 + }, + { + "epoch": 0.10183581572566679, + "grad_norm": 1.3514881134033203, + "learning_rate": 9.90499306518724e-06, + "loss": 0.2283, + "step": 147 + }, + { + "epoch": 0.10252857637686179, + "grad_norm": 1.2799649238586426, + "learning_rate": 9.904299583911236e-06, + "loss": 0.2348, + "step": 148 + }, + { + "epoch": 0.1032213370280568, + "grad_norm": 1.2758800983428955, + "learning_rate": 9.903606102635229e-06, + "loss": 0.2051, + "step": 149 + }, + { + "epoch": 0.10391409767925182, + "grad_norm": 1.4284158945083618, + "learning_rate": 9.902912621359224e-06, + "loss": 0.2407, + "step": 150 + }, + { + "epoch": 0.10460685833044683, + "grad_norm": 1.3322699069976807, + "learning_rate": 9.902219140083219e-06, + "loss": 0.2362, + "step": 151 + }, + { + "epoch": 0.10529961898164185, + "grad_norm": 1.2086516618728638, + "learning_rate": 9.901525658807214e-06, + "loss": 0.1891, + "step": 152 + }, + { + "epoch": 0.10599237963283685, + "grad_norm": 1.2374615669250488, + "learning_rate": 9.900832177531209e-06, + "loss": 0.2212, + "step": 153 + }, + { + "epoch": 0.10668514028403187, + "grad_norm": 1.2254961729049683, + "learning_rate": 9.900138696255202e-06, + "loss": 0.193, + "step": 154 + }, + { + "epoch": 0.10737790093522688, + "grad_norm": 1.230727195739746, + "learning_rate": 9.899445214979197e-06, + "loss": 0.2077, + "step": 155 + }, + { + "epoch": 0.1080706615864219, + "grad_norm": 1.1526367664337158, + "learning_rate": 9.898751733703192e-06, + "loss": 0.1706, + "step": 156 + }, + { + "epoch": 0.1087634222376169, + "grad_norm": 1.2905890941619873, + "learning_rate": 9.898058252427185e-06, + "loss": 0.2072, + "step": 157 + }, + { + "epoch": 0.10945618288881191, + "grad_norm": 1.2018944025039673, + "learning_rate": 9.89736477115118e-06, + "loss": 0.2084, + "step": 158 + }, + { + "epoch": 0.11014894354000693, + "grad_norm": 1.2835495471954346, + "learning_rate": 9.896671289875173e-06, + "loss": 0.2156, + "step": 159 + }, + { + "epoch": 0.11084170419120194, + "grad_norm": 1.24733304977417, + "learning_rate": 9.895977808599168e-06, + "loss": 0.2158, + "step": 160 + }, + { + "epoch": 0.11153446484239696, + "grad_norm": 1.1799983978271484, + "learning_rate": 9.895284327323163e-06, + "loss": 0.2064, + "step": 161 + }, + { + "epoch": 0.11222722549359196, + "grad_norm": 1.3932280540466309, + "learning_rate": 9.894590846047158e-06, + "loss": 0.2286, + "step": 162 + }, + { + "epoch": 0.11291998614478697, + "grad_norm": 1.2786810398101807, + "learning_rate": 9.893897364771153e-06, + "loss": 0.1983, + "step": 163 + }, + { + "epoch": 0.11361274679598199, + "grad_norm": 1.2713406085968018, + "learning_rate": 9.893203883495146e-06, + "loss": 0.2441, + "step": 164 + }, + { + "epoch": 0.114305507447177, + "grad_norm": 1.3331621885299683, + "learning_rate": 9.892510402219141e-06, + "loss": 0.2472, + "step": 165 + }, + { + "epoch": 0.114998268098372, + "grad_norm": 1.3456852436065674, + "learning_rate": 9.891816920943136e-06, + "loss": 0.2302, + "step": 166 + }, + { + "epoch": 0.11569102874956702, + "grad_norm": 1.2964712381362915, + "learning_rate": 9.891123439667129e-06, + "loss": 0.2219, + "step": 167 + }, + { + "epoch": 0.11638378940076204, + "grad_norm": 1.3069645166397095, + "learning_rate": 9.890429958391124e-06, + "loss": 0.2082, + "step": 168 + }, + { + "epoch": 0.11707655005195705, + "grad_norm": 1.1544204950332642, + "learning_rate": 9.889736477115119e-06, + "loss": 0.1808, + "step": 169 + }, + { + "epoch": 0.11776931070315207, + "grad_norm": 1.2361173629760742, + "learning_rate": 9.889042995839114e-06, + "loss": 0.2082, + "step": 170 + }, + { + "epoch": 0.11846207135434707, + "grad_norm": 1.2170089483261108, + "learning_rate": 9.888349514563109e-06, + "loss": 0.2217, + "step": 171 + }, + { + "epoch": 0.11915483200554208, + "grad_norm": 1.2207200527191162, + "learning_rate": 9.887656033287102e-06, + "loss": 0.2112, + "step": 172 + }, + { + "epoch": 0.1198475926567371, + "grad_norm": 1.2350168228149414, + "learning_rate": 9.886962552011097e-06, + "loss": 0.2186, + "step": 173 + }, + { + "epoch": 0.12054035330793211, + "grad_norm": 1.233114242553711, + "learning_rate": 9.88626907073509e-06, + "loss": 0.2126, + "step": 174 + }, + { + "epoch": 0.12123311395912713, + "grad_norm": 1.2821439504623413, + "learning_rate": 9.885575589459085e-06, + "loss": 0.2111, + "step": 175 + }, + { + "epoch": 0.12192587461032213, + "grad_norm": 1.2624083757400513, + "learning_rate": 9.88488210818308e-06, + "loss": 0.2094, + "step": 176 + }, + { + "epoch": 0.12261863526151714, + "grad_norm": 1.2297890186309814, + "learning_rate": 9.884188626907073e-06, + "loss": 0.2192, + "step": 177 + }, + { + "epoch": 0.12331139591271216, + "grad_norm": 1.2710490226745605, + "learning_rate": 9.883495145631068e-06, + "loss": 0.1832, + "step": 178 + }, + { + "epoch": 0.12400415656390718, + "grad_norm": 1.1201629638671875, + "learning_rate": 9.882801664355063e-06, + "loss": 0.1738, + "step": 179 + }, + { + "epoch": 0.12469691721510218, + "grad_norm": 1.30121648311615, + "learning_rate": 9.882108183079058e-06, + "loss": 0.2266, + "step": 180 + }, + { + "epoch": 0.1253896778662972, + "grad_norm": 1.3483548164367676, + "learning_rate": 9.881414701803053e-06, + "loss": 0.2142, + "step": 181 + }, + { + "epoch": 0.1260824385174922, + "grad_norm": 1.202298879623413, + "learning_rate": 9.880721220527046e-06, + "loss": 0.1916, + "step": 182 + }, + { + "epoch": 0.1267751991686872, + "grad_norm": 1.410788893699646, + "learning_rate": 9.880027739251041e-06, + "loss": 0.2421, + "step": 183 + }, + { + "epoch": 0.12746795981988224, + "grad_norm": 1.306676983833313, + "learning_rate": 9.879334257975035e-06, + "loss": 0.2341, + "step": 184 + }, + { + "epoch": 0.12816072047107724, + "grad_norm": 1.2322577238082886, + "learning_rate": 9.87864077669903e-06, + "loss": 0.1775, + "step": 185 + }, + { + "epoch": 0.12885348112227227, + "grad_norm": 1.1031079292297363, + "learning_rate": 9.877947295423024e-06, + "loss": 0.1945, + "step": 186 + }, + { + "epoch": 0.12954624177346727, + "grad_norm": 1.3930292129516602, + "learning_rate": 9.87725381414702e-06, + "loss": 0.2425, + "step": 187 + }, + { + "epoch": 0.13023900242466227, + "grad_norm": 1.2810267210006714, + "learning_rate": 9.876560332871014e-06, + "loss": 0.2209, + "step": 188 + }, + { + "epoch": 0.1309317630758573, + "grad_norm": 1.2307170629501343, + "learning_rate": 9.875866851595008e-06, + "loss": 0.202, + "step": 189 + }, + { + "epoch": 0.1316245237270523, + "grad_norm": 1.3543944358825684, + "learning_rate": 9.875173370319002e-06, + "loss": 0.2147, + "step": 190 + }, + { + "epoch": 0.1323172843782473, + "grad_norm": 1.176788330078125, + "learning_rate": 9.874479889042997e-06, + "loss": 0.1863, + "step": 191 + }, + { + "epoch": 0.13301004502944233, + "grad_norm": 1.282935380935669, + "learning_rate": 9.87378640776699e-06, + "loss": 0.196, + "step": 192 + }, + { + "epoch": 0.13370280568063733, + "grad_norm": 1.364748477935791, + "learning_rate": 9.873092926490986e-06, + "loss": 0.2352, + "step": 193 + }, + { + "epoch": 0.13439556633183236, + "grad_norm": 1.2242869138717651, + "learning_rate": 9.872399445214979e-06, + "loss": 0.2178, + "step": 194 + }, + { + "epoch": 0.13508832698302736, + "grad_norm": 1.2392072677612305, + "learning_rate": 9.871705963938974e-06, + "loss": 0.2012, + "step": 195 + }, + { + "epoch": 0.13578108763422236, + "grad_norm": 1.2447081804275513, + "learning_rate": 9.871012482662969e-06, + "loss": 0.198, + "step": 196 + }, + { + "epoch": 0.1364738482854174, + "grad_norm": 1.3111966848373413, + "learning_rate": 9.870319001386964e-06, + "loss": 0.2187, + "step": 197 + }, + { + "epoch": 0.1371666089366124, + "grad_norm": 1.3184764385223389, + "learning_rate": 9.869625520110959e-06, + "loss": 0.2719, + "step": 198 + }, + { + "epoch": 0.13785936958780742, + "grad_norm": 1.1946929693222046, + "learning_rate": 9.868932038834952e-06, + "loss": 0.1987, + "step": 199 + }, + { + "epoch": 0.13855213023900242, + "grad_norm": 1.234924554824829, + "learning_rate": 9.868238557558947e-06, + "loss": 0.2382, + "step": 200 + }, + { + "epoch": 0.13924489089019743, + "grad_norm": 1.2631289958953857, + "learning_rate": 9.867545076282942e-06, + "loss": 0.23, + "step": 201 + }, + { + "epoch": 0.13993765154139245, + "grad_norm": 1.2285739183425903, + "learning_rate": 9.866851595006935e-06, + "loss": 0.1768, + "step": 202 + }, + { + "epoch": 0.14063041219258746, + "grad_norm": 1.2478169202804565, + "learning_rate": 9.86615811373093e-06, + "loss": 0.1902, + "step": 203 + }, + { + "epoch": 0.14132317284378249, + "grad_norm": 1.2999858856201172, + "learning_rate": 9.865464632454925e-06, + "loss": 0.2109, + "step": 204 + }, + { + "epoch": 0.1420159334949775, + "grad_norm": 1.2435208559036255, + "learning_rate": 9.86477115117892e-06, + "loss": 0.1972, + "step": 205 + }, + { + "epoch": 0.1427086941461725, + "grad_norm": 1.2937616109848022, + "learning_rate": 9.864077669902915e-06, + "loss": 0.2116, + "step": 206 + }, + { + "epoch": 0.14340145479736752, + "grad_norm": 1.2655960321426392, + "learning_rate": 9.863384188626908e-06, + "loss": 0.2176, + "step": 207 + }, + { + "epoch": 0.14409421544856252, + "grad_norm": 1.202903389930725, + "learning_rate": 9.862690707350903e-06, + "loss": 0.1913, + "step": 208 + }, + { + "epoch": 0.14478697609975755, + "grad_norm": 1.2482086420059204, + "learning_rate": 9.861997226074896e-06, + "loss": 0.1854, + "step": 209 + }, + { + "epoch": 0.14547973675095255, + "grad_norm": 1.2493541240692139, + "learning_rate": 9.861303744798891e-06, + "loss": 0.2055, + "step": 210 + }, + { + "epoch": 0.14617249740214755, + "grad_norm": 1.0790982246398926, + "learning_rate": 9.860610263522886e-06, + "loss": 0.1891, + "step": 211 + }, + { + "epoch": 0.14686525805334258, + "grad_norm": 1.2813910245895386, + "learning_rate": 9.85991678224688e-06, + "loss": 0.2208, + "step": 212 + }, + { + "epoch": 0.14755801870453758, + "grad_norm": 1.2617616653442383, + "learning_rate": 9.859223300970874e-06, + "loss": 0.2139, + "step": 213 + }, + { + "epoch": 0.14825077935573258, + "grad_norm": 1.3398065567016602, + "learning_rate": 9.858529819694869e-06, + "loss": 0.1873, + "step": 214 + }, + { + "epoch": 0.1489435400069276, + "grad_norm": 1.2965143918991089, + "learning_rate": 9.857836338418864e-06, + "loss": 0.2068, + "step": 215 + }, + { + "epoch": 0.1496363006581226, + "grad_norm": 1.24671471118927, + "learning_rate": 9.857142857142859e-06, + "loss": 0.2225, + "step": 216 + }, + { + "epoch": 0.15032906130931764, + "grad_norm": 1.31780207157135, + "learning_rate": 9.856449375866852e-06, + "loss": 0.236, + "step": 217 + }, + { + "epoch": 0.15102182196051264, + "grad_norm": 1.3348220586776733, + "learning_rate": 9.855755894590847e-06, + "loss": 0.228, + "step": 218 + }, + { + "epoch": 0.15171458261170764, + "grad_norm": 1.3435728549957275, + "learning_rate": 9.85506241331484e-06, + "loss": 0.2072, + "step": 219 + }, + { + "epoch": 0.15240734326290267, + "grad_norm": 1.2767689228057861, + "learning_rate": 9.854368932038835e-06, + "loss": 0.2147, + "step": 220 + }, + { + "epoch": 0.15310010391409767, + "grad_norm": 1.2819961309432983, + "learning_rate": 9.85367545076283e-06, + "loss": 0.2037, + "step": 221 + }, + { + "epoch": 0.1537928645652927, + "grad_norm": 1.2598323822021484, + "learning_rate": 9.852981969486825e-06, + "loss": 0.2072, + "step": 222 + }, + { + "epoch": 0.1544856252164877, + "grad_norm": 1.3809995651245117, + "learning_rate": 9.85228848821082e-06, + "loss": 0.227, + "step": 223 + }, + { + "epoch": 0.1551783858676827, + "grad_norm": 1.1424462795257568, + "learning_rate": 9.851595006934813e-06, + "loss": 0.2237, + "step": 224 + }, + { + "epoch": 0.15587114651887773, + "grad_norm": 1.2016152143478394, + "learning_rate": 9.850901525658808e-06, + "loss": 0.187, + "step": 225 + }, + { + "epoch": 0.15656390717007274, + "grad_norm": 1.221073865890503, + "learning_rate": 9.850208044382803e-06, + "loss": 0.1899, + "step": 226 + }, + { + "epoch": 0.15725666782126776, + "grad_norm": 1.2531520128250122, + "learning_rate": 9.849514563106796e-06, + "loss": 0.2314, + "step": 227 + }, + { + "epoch": 0.15794942847246277, + "grad_norm": 1.2226425409317017, + "learning_rate": 9.848821081830791e-06, + "loss": 0.2108, + "step": 228 + }, + { + "epoch": 0.15864218912365777, + "grad_norm": 1.1877517700195312, + "learning_rate": 9.848127600554786e-06, + "loss": 0.225, + "step": 229 + }, + { + "epoch": 0.1593349497748528, + "grad_norm": 1.371266484260559, + "learning_rate": 9.847434119278781e-06, + "loss": 0.2285, + "step": 230 + }, + { + "epoch": 0.1600277104260478, + "grad_norm": 1.1313170194625854, + "learning_rate": 9.846740638002776e-06, + "loss": 0.2077, + "step": 231 + }, + { + "epoch": 0.1607204710772428, + "grad_norm": 1.3221700191497803, + "learning_rate": 9.84604715672677e-06, + "loss": 0.2128, + "step": 232 + }, + { + "epoch": 0.16141323172843783, + "grad_norm": 1.402754783630371, + "learning_rate": 9.845353675450764e-06, + "loss": 0.2218, + "step": 233 + }, + { + "epoch": 0.16210599237963283, + "grad_norm": 1.1608260869979858, + "learning_rate": 9.844660194174757e-06, + "loss": 0.2163, + "step": 234 + }, + { + "epoch": 0.16279875303082786, + "grad_norm": 1.2287039756774902, + "learning_rate": 9.843966712898752e-06, + "loss": 0.2145, + "step": 235 + }, + { + "epoch": 0.16349151368202286, + "grad_norm": 1.31821608543396, + "learning_rate": 9.843273231622747e-06, + "loss": 0.1889, + "step": 236 + }, + { + "epoch": 0.16418427433321786, + "grad_norm": 1.2445365190505981, + "learning_rate": 9.84257975034674e-06, + "loss": 0.2296, + "step": 237 + }, + { + "epoch": 0.1648770349844129, + "grad_norm": 1.3389273881912231, + "learning_rate": 9.841886269070736e-06, + "loss": 0.2008, + "step": 238 + }, + { + "epoch": 0.1655697956356079, + "grad_norm": 1.1889816522598267, + "learning_rate": 9.84119278779473e-06, + "loss": 0.2062, + "step": 239 + }, + { + "epoch": 0.16626255628680292, + "grad_norm": 1.1592460870742798, + "learning_rate": 9.840499306518725e-06, + "loss": 0.1967, + "step": 240 + }, + { + "epoch": 0.16695531693799792, + "grad_norm": 1.087691068649292, + "learning_rate": 9.83980582524272e-06, + "loss": 0.1709, + "step": 241 + }, + { + "epoch": 0.16764807758919292, + "grad_norm": 1.0733919143676758, + "learning_rate": 9.839112343966714e-06, + "loss": 0.1641, + "step": 242 + }, + { + "epoch": 0.16834083824038795, + "grad_norm": 1.19711172580719, + "learning_rate": 9.838418862690708e-06, + "loss": 0.2138, + "step": 243 + }, + { + "epoch": 0.16903359889158295, + "grad_norm": 1.1129231452941895, + "learning_rate": 9.837725381414702e-06, + "loss": 0.1721, + "step": 244 + }, + { + "epoch": 0.16972635954277798, + "grad_norm": 1.3185864686965942, + "learning_rate": 9.837031900138697e-06, + "loss": 0.2135, + "step": 245 + }, + { + "epoch": 0.17041912019397298, + "grad_norm": 1.4742556810379028, + "learning_rate": 9.836338418862692e-06, + "loss": 0.2573, + "step": 246 + }, + { + "epoch": 0.17111188084516799, + "grad_norm": 1.267116904258728, + "learning_rate": 9.835644937586687e-06, + "loss": 0.1905, + "step": 247 + }, + { + "epoch": 0.17180464149636301, + "grad_norm": 1.193515419960022, + "learning_rate": 9.834951456310681e-06, + "loss": 0.1993, + "step": 248 + }, + { + "epoch": 0.17249740214755802, + "grad_norm": 1.228421926498413, + "learning_rate": 9.834257975034675e-06, + "loss": 0.191, + "step": 249 + }, + { + "epoch": 0.17319016279875304, + "grad_norm": 1.1218215227127075, + "learning_rate": 9.83356449375867e-06, + "loss": 0.196, + "step": 250 + }, + { + "epoch": 0.17388292344994805, + "grad_norm": 1.4057866334915161, + "learning_rate": 9.832871012482665e-06, + "loss": 0.2423, + "step": 251 + }, + { + "epoch": 0.17457568410114305, + "grad_norm": 1.2802729606628418, + "learning_rate": 9.832177531206658e-06, + "loss": 0.1895, + "step": 252 + }, + { + "epoch": 0.17526844475233808, + "grad_norm": 1.1412217617034912, + "learning_rate": 9.831484049930653e-06, + "loss": 0.1842, + "step": 253 + }, + { + "epoch": 0.17596120540353308, + "grad_norm": 1.2268813848495483, + "learning_rate": 9.830790568654646e-06, + "loss": 0.1816, + "step": 254 + }, + { + "epoch": 0.17665396605472808, + "grad_norm": 1.364072561264038, + "learning_rate": 9.830097087378641e-06, + "loss": 0.2048, + "step": 255 + }, + { + "epoch": 0.1773467267059231, + "grad_norm": 1.3302061557769775, + "learning_rate": 9.829403606102636e-06, + "loss": 0.2034, + "step": 256 + }, + { + "epoch": 0.1780394873571181, + "grad_norm": 1.359153389930725, + "learning_rate": 9.82871012482663e-06, + "loss": 0.2518, + "step": 257 + }, + { + "epoch": 0.17873224800831314, + "grad_norm": 1.1203558444976807, + "learning_rate": 9.828016643550626e-06, + "loss": 0.1812, + "step": 258 + }, + { + "epoch": 0.17942500865950814, + "grad_norm": 1.5545564889907837, + "learning_rate": 9.827323162274619e-06, + "loss": 0.2388, + "step": 259 + }, + { + "epoch": 0.18011776931070314, + "grad_norm": 1.2660139799118042, + "learning_rate": 9.826629680998614e-06, + "loss": 0.1896, + "step": 260 + }, + { + "epoch": 0.18081052996189817, + "grad_norm": 1.1667258739471436, + "learning_rate": 9.825936199722609e-06, + "loss": 0.1964, + "step": 261 + }, + { + "epoch": 0.18150329061309317, + "grad_norm": 1.3700950145721436, + "learning_rate": 9.825242718446602e-06, + "loss": 0.2092, + "step": 262 + }, + { + "epoch": 0.1821960512642882, + "grad_norm": 1.4845219850540161, + "learning_rate": 9.824549237170597e-06, + "loss": 0.2104, + "step": 263 + }, + { + "epoch": 0.1828888119154832, + "grad_norm": 1.2829970121383667, + "learning_rate": 9.823855755894592e-06, + "loss": 0.1913, + "step": 264 + }, + { + "epoch": 0.1835815725666782, + "grad_norm": 1.3625524044036865, + "learning_rate": 9.823162274618587e-06, + "loss": 0.1969, + "step": 265 + }, + { + "epoch": 0.18427433321787323, + "grad_norm": 1.195306658744812, + "learning_rate": 9.822468793342582e-06, + "loss": 0.1846, + "step": 266 + }, + { + "epoch": 0.18496709386906823, + "grad_norm": 1.247125267982483, + "learning_rate": 9.821775312066575e-06, + "loss": 0.1967, + "step": 267 + }, + { + "epoch": 0.18565985452026326, + "grad_norm": 1.1861320734024048, + "learning_rate": 9.82108183079057e-06, + "loss": 0.1864, + "step": 268 + }, + { + "epoch": 0.18635261517145826, + "grad_norm": 1.2261962890625, + "learning_rate": 9.820388349514563e-06, + "loss": 0.2117, + "step": 269 + }, + { + "epoch": 0.18704537582265326, + "grad_norm": 1.260686993598938, + "learning_rate": 9.819694868238558e-06, + "loss": 0.2126, + "step": 270 + }, + { + "epoch": 0.1877381364738483, + "grad_norm": 1.3203638792037964, + "learning_rate": 9.819001386962553e-06, + "loss": 0.2191, + "step": 271 + }, + { + "epoch": 0.1884308971250433, + "grad_norm": 1.3990230560302734, + "learning_rate": 9.818307905686546e-06, + "loss": 0.195, + "step": 272 + }, + { + "epoch": 0.1891236577762383, + "grad_norm": 1.2345364093780518, + "learning_rate": 9.817614424410541e-06, + "loss": 0.2135, + "step": 273 + }, + { + "epoch": 0.18981641842743333, + "grad_norm": 1.2527592182159424, + "learning_rate": 9.816920943134536e-06, + "loss": 0.2312, + "step": 274 + }, + { + "epoch": 0.19050917907862833, + "grad_norm": 1.3236708641052246, + "learning_rate": 9.816227461858531e-06, + "loss": 0.226, + "step": 275 + }, + { + "epoch": 0.19120193972982336, + "grad_norm": 1.2532308101654053, + "learning_rate": 9.815533980582526e-06, + "loss": 0.2258, + "step": 276 + }, + { + "epoch": 0.19189470038101836, + "grad_norm": 1.1519466638565063, + "learning_rate": 9.81484049930652e-06, + "loss": 0.1831, + "step": 277 + }, + { + "epoch": 0.19258746103221336, + "grad_norm": 1.0873594284057617, + "learning_rate": 9.814147018030514e-06, + "loss": 0.1596, + "step": 278 + }, + { + "epoch": 0.1932802216834084, + "grad_norm": 1.0936123132705688, + "learning_rate": 9.813453536754507e-06, + "loss": 0.1909, + "step": 279 + }, + { + "epoch": 0.1939729823346034, + "grad_norm": 1.2175698280334473, + "learning_rate": 9.812760055478502e-06, + "loss": 0.1901, + "step": 280 + }, + { + "epoch": 0.19466574298579842, + "grad_norm": 1.2072296142578125, + "learning_rate": 9.812066574202497e-06, + "loss": 0.2185, + "step": 281 + }, + { + "epoch": 0.19535850363699342, + "grad_norm": 1.2681595087051392, + "learning_rate": 9.811373092926492e-06, + "loss": 0.1889, + "step": 282 + }, + { + "epoch": 0.19605126428818842, + "grad_norm": 1.2664990425109863, + "learning_rate": 9.810679611650487e-06, + "loss": 0.2113, + "step": 283 + }, + { + "epoch": 0.19674402493938345, + "grad_norm": 1.1895562410354614, + "learning_rate": 9.80998613037448e-06, + "loss": 0.1695, + "step": 284 + }, + { + "epoch": 0.19743678559057845, + "grad_norm": 1.3624688386917114, + "learning_rate": 9.809292649098475e-06, + "loss": 0.2142, + "step": 285 + }, + { + "epoch": 0.19812954624177348, + "grad_norm": 1.0687636137008667, + "learning_rate": 9.80859916782247e-06, + "loss": 0.1879, + "step": 286 + }, + { + "epoch": 0.19882230689296848, + "grad_norm": 1.2875828742980957, + "learning_rate": 9.807905686546464e-06, + "loss": 0.1725, + "step": 287 + }, + { + "epoch": 0.19951506754416348, + "grad_norm": 1.3224164247512817, + "learning_rate": 9.807212205270458e-06, + "loss": 0.2365, + "step": 288 + }, + { + "epoch": 0.2002078281953585, + "grad_norm": 1.1853020191192627, + "learning_rate": 9.806518723994453e-06, + "loss": 0.178, + "step": 289 + }, + { + "epoch": 0.2009005888465535, + "grad_norm": 1.1800174713134766, + "learning_rate": 9.805825242718447e-06, + "loss": 0.2122, + "step": 290 + }, + { + "epoch": 0.20159334949774851, + "grad_norm": 1.3669440746307373, + "learning_rate": 9.805131761442442e-06, + "loss": 0.1868, + "step": 291 + }, + { + "epoch": 0.20228611014894354, + "grad_norm": 1.2624220848083496, + "learning_rate": 9.804438280166437e-06, + "loss": 0.1918, + "step": 292 + }, + { + "epoch": 0.20297887080013854, + "grad_norm": 1.2374008893966675, + "learning_rate": 9.803744798890431e-06, + "loss": 0.2083, + "step": 293 + }, + { + "epoch": 0.20367163145133357, + "grad_norm": 1.2160348892211914, + "learning_rate": 9.803051317614425e-06, + "loss": 0.2094, + "step": 294 + }, + { + "epoch": 0.20436439210252857, + "grad_norm": 1.2681633234024048, + "learning_rate": 9.80235783633842e-06, + "loss": 0.1929, + "step": 295 + }, + { + "epoch": 0.20505715275372358, + "grad_norm": 1.4707847833633423, + "learning_rate": 9.801664355062415e-06, + "loss": 0.2017, + "step": 296 + }, + { + "epoch": 0.2057499134049186, + "grad_norm": 1.315453290939331, + "learning_rate": 9.800970873786408e-06, + "loss": 0.2484, + "step": 297 + }, + { + "epoch": 0.2064426740561136, + "grad_norm": 1.1209535598754883, + "learning_rate": 9.800277392510403e-06, + "loss": 0.2102, + "step": 298 + }, + { + "epoch": 0.20713543470730864, + "grad_norm": 1.1134284734725952, + "learning_rate": 9.799583911234398e-06, + "loss": 0.1657, + "step": 299 + }, + { + "epoch": 0.20782819535850364, + "grad_norm": 1.4142677783966064, + "learning_rate": 9.798890429958393e-06, + "loss": 0.278, + "step": 300 + }, + { + "epoch": 0.20852095600969864, + "grad_norm": 1.3863179683685303, + "learning_rate": 9.798196948682388e-06, + "loss": 0.199, + "step": 301 + }, + { + "epoch": 0.20921371666089367, + "grad_norm": 1.3656364679336548, + "learning_rate": 9.79750346740638e-06, + "loss": 0.2104, + "step": 302 + }, + { + "epoch": 0.20990647731208867, + "grad_norm": 1.2141101360321045, + "learning_rate": 9.796809986130376e-06, + "loss": 0.195, + "step": 303 + }, + { + "epoch": 0.2105992379632837, + "grad_norm": 0.9968119859695435, + "learning_rate": 9.796116504854369e-06, + "loss": 0.1642, + "step": 304 + }, + { + "epoch": 0.2112919986144787, + "grad_norm": 1.1695276498794556, + "learning_rate": 9.795423023578364e-06, + "loss": 0.1836, + "step": 305 + }, + { + "epoch": 0.2119847592656737, + "grad_norm": 1.1669930219650269, + "learning_rate": 9.794729542302359e-06, + "loss": 0.18, + "step": 306 + }, + { + "epoch": 0.21267751991686873, + "grad_norm": 1.2998963594436646, + "learning_rate": 9.794036061026354e-06, + "loss": 0.1898, + "step": 307 + }, + { + "epoch": 0.21337028056806373, + "grad_norm": 1.1873587369918823, + "learning_rate": 9.793342579750349e-06, + "loss": 0.1659, + "step": 308 + }, + { + "epoch": 0.21406304121925876, + "grad_norm": 1.2754130363464355, + "learning_rate": 9.792649098474342e-06, + "loss": 0.244, + "step": 309 + }, + { + "epoch": 0.21475580187045376, + "grad_norm": 1.1994528770446777, + "learning_rate": 9.791955617198337e-06, + "loss": 0.1897, + "step": 310 + }, + { + "epoch": 0.21544856252164876, + "grad_norm": 1.1074354648590088, + "learning_rate": 9.791262135922332e-06, + "loss": 0.2014, + "step": 311 + }, + { + "epoch": 0.2161413231728438, + "grad_norm": 1.0358245372772217, + "learning_rate": 9.790568654646325e-06, + "loss": 0.1665, + "step": 312 + }, + { + "epoch": 0.2168340838240388, + "grad_norm": 1.2764501571655273, + "learning_rate": 9.78987517337032e-06, + "loss": 0.2242, + "step": 313 + }, + { + "epoch": 0.2175268444752338, + "grad_norm": 1.310734748840332, + "learning_rate": 9.789181692094313e-06, + "loss": 0.2104, + "step": 314 + }, + { + "epoch": 0.21821960512642882, + "grad_norm": 1.4064656496047974, + "learning_rate": 9.788488210818308e-06, + "loss": 0.2077, + "step": 315 + }, + { + "epoch": 0.21891236577762382, + "grad_norm": 1.2232162952423096, + "learning_rate": 9.787794729542303e-06, + "loss": 0.214, + "step": 316 + }, + { + "epoch": 0.21960512642881885, + "grad_norm": 1.4126719236373901, + "learning_rate": 9.787101248266298e-06, + "loss": 0.2201, + "step": 317 + }, + { + "epoch": 0.22029788708001385, + "grad_norm": 1.2731772661209106, + "learning_rate": 9.786407766990293e-06, + "loss": 0.2398, + "step": 318 + }, + { + "epoch": 0.22099064773120886, + "grad_norm": 1.2599573135375977, + "learning_rate": 9.785714285714286e-06, + "loss": 0.1858, + "step": 319 + }, + { + "epoch": 0.22168340838240388, + "grad_norm": 1.168334722518921, + "learning_rate": 9.785020804438281e-06, + "loss": 0.1803, + "step": 320 + }, + { + "epoch": 0.22237616903359889, + "grad_norm": 1.2936931848526, + "learning_rate": 9.784327323162276e-06, + "loss": 0.1773, + "step": 321 + }, + { + "epoch": 0.22306892968479392, + "grad_norm": 1.203903317451477, + "learning_rate": 9.78363384188627e-06, + "loss": 0.19, + "step": 322 + }, + { + "epoch": 0.22376169033598892, + "grad_norm": 1.1261348724365234, + "learning_rate": 9.782940360610264e-06, + "loss": 0.2082, + "step": 323 + }, + { + "epoch": 0.22445445098718392, + "grad_norm": 1.328880786895752, + "learning_rate": 9.782246879334259e-06, + "loss": 0.1634, + "step": 324 + }, + { + "epoch": 0.22514721163837895, + "grad_norm": 1.2127742767333984, + "learning_rate": 9.781553398058254e-06, + "loss": 0.1974, + "step": 325 + }, + { + "epoch": 0.22583997228957395, + "grad_norm": 1.1583975553512573, + "learning_rate": 9.780859916782249e-06, + "loss": 0.1962, + "step": 326 + }, + { + "epoch": 0.22653273294076898, + "grad_norm": 1.3292402029037476, + "learning_rate": 9.780166435506242e-06, + "loss": 0.2285, + "step": 327 + }, + { + "epoch": 0.22722549359196398, + "grad_norm": 1.1186047792434692, + "learning_rate": 9.779472954230237e-06, + "loss": 0.1795, + "step": 328 + }, + { + "epoch": 0.22791825424315898, + "grad_norm": 1.1951663494110107, + "learning_rate": 9.77877947295423e-06, + "loss": 0.1856, + "step": 329 + }, + { + "epoch": 0.228611014894354, + "grad_norm": 1.2000247240066528, + "learning_rate": 9.778085991678225e-06, + "loss": 0.2217, + "step": 330 + }, + { + "epoch": 0.229303775545549, + "grad_norm": 1.190314531326294, + "learning_rate": 9.77739251040222e-06, + "loss": 0.1898, + "step": 331 + }, + { + "epoch": 0.229996536196744, + "grad_norm": 1.2973747253417969, + "learning_rate": 9.776699029126214e-06, + "loss": 0.2458, + "step": 332 + }, + { + "epoch": 0.23068929684793904, + "grad_norm": 1.2944179773330688, + "learning_rate": 9.776005547850208e-06, + "loss": 0.2192, + "step": 333 + }, + { + "epoch": 0.23138205749913404, + "grad_norm": 1.2244884967803955, + "learning_rate": 9.775312066574203e-06, + "loss": 0.195, + "step": 334 + }, + { + "epoch": 0.23207481815032907, + "grad_norm": 1.1542786359786987, + "learning_rate": 9.774618585298198e-06, + "loss": 0.2109, + "step": 335 + }, + { + "epoch": 0.23276757880152407, + "grad_norm": 1.220381498336792, + "learning_rate": 9.773925104022193e-06, + "loss": 0.1793, + "step": 336 + }, + { + "epoch": 0.23346033945271907, + "grad_norm": 1.3393840789794922, + "learning_rate": 9.773231622746186e-06, + "loss": 0.2216, + "step": 337 + }, + { + "epoch": 0.2341531001039141, + "grad_norm": 1.2374804019927979, + "learning_rate": 9.772538141470181e-06, + "loss": 0.1691, + "step": 338 + }, + { + "epoch": 0.2348458607551091, + "grad_norm": 1.1657991409301758, + "learning_rate": 9.771844660194175e-06, + "loss": 0.1849, + "step": 339 + }, + { + "epoch": 0.23553862140630413, + "grad_norm": 1.2341691255569458, + "learning_rate": 9.77115117891817e-06, + "loss": 0.22, + "step": 340 + }, + { + "epoch": 0.23623138205749913, + "grad_norm": 1.1729243993759155, + "learning_rate": 9.770457697642165e-06, + "loss": 0.1999, + "step": 341 + }, + { + "epoch": 0.23692414270869414, + "grad_norm": 1.2024861574172974, + "learning_rate": 9.76976421636616e-06, + "loss": 0.2065, + "step": 342 + }, + { + "epoch": 0.23761690335988916, + "grad_norm": 1.1806306838989258, + "learning_rate": 9.769070735090154e-06, + "loss": 0.2052, + "step": 343 + }, + { + "epoch": 0.23830966401108417, + "grad_norm": 1.2302980422973633, + "learning_rate": 9.768377253814148e-06, + "loss": 0.1886, + "step": 344 + }, + { + "epoch": 0.2390024246622792, + "grad_norm": 1.370265007019043, + "learning_rate": 9.767683772538143e-06, + "loss": 0.2156, + "step": 345 + }, + { + "epoch": 0.2396951853134742, + "grad_norm": 1.2476282119750977, + "learning_rate": 9.766990291262138e-06, + "loss": 0.2018, + "step": 346 + }, + { + "epoch": 0.2403879459646692, + "grad_norm": 1.2265602350234985, + "learning_rate": 9.76629680998613e-06, + "loss": 0.2077, + "step": 347 + }, + { + "epoch": 0.24108070661586423, + "grad_norm": 1.2666821479797363, + "learning_rate": 9.765603328710126e-06, + "loss": 0.2045, + "step": 348 + }, + { + "epoch": 0.24177346726705923, + "grad_norm": 1.2393244504928589, + "learning_rate": 9.764909847434119e-06, + "loss": 0.2169, + "step": 349 + }, + { + "epoch": 0.24246622791825426, + "grad_norm": 1.1078341007232666, + "learning_rate": 9.764216366158114e-06, + "loss": 0.1778, + "step": 350 + }, + { + "epoch": 0.24315898856944926, + "grad_norm": 1.1971571445465088, + "learning_rate": 9.763522884882109e-06, + "loss": 0.2142, + "step": 351 + }, + { + "epoch": 0.24385174922064426, + "grad_norm": 1.0961097478866577, + "learning_rate": 9.762829403606104e-06, + "loss": 0.1795, + "step": 352 + }, + { + "epoch": 0.2445445098718393, + "grad_norm": 1.141653299331665, + "learning_rate": 9.762135922330099e-06, + "loss": 0.1912, + "step": 353 + }, + { + "epoch": 0.2452372705230343, + "grad_norm": 1.2854044437408447, + "learning_rate": 9.761442441054092e-06, + "loss": 0.215, + "step": 354 + }, + { + "epoch": 0.2459300311742293, + "grad_norm": 1.32241690158844, + "learning_rate": 9.760748959778087e-06, + "loss": 0.2358, + "step": 355 + }, + { + "epoch": 0.24662279182542432, + "grad_norm": 1.2686291933059692, + "learning_rate": 9.760055478502082e-06, + "loss": 0.2248, + "step": 356 + }, + { + "epoch": 0.24731555247661932, + "grad_norm": 1.0646724700927734, + "learning_rate": 9.759361997226075e-06, + "loss": 0.1383, + "step": 357 + }, + { + "epoch": 0.24800831312781435, + "grad_norm": 1.18061363697052, + "learning_rate": 9.75866851595007e-06, + "loss": 0.2103, + "step": 358 + }, + { + "epoch": 0.24870107377900935, + "grad_norm": 1.1441906690597534, + "learning_rate": 9.757975034674065e-06, + "loss": 0.1733, + "step": 359 + }, + { + "epoch": 0.24939383443020435, + "grad_norm": 1.244655966758728, + "learning_rate": 9.75728155339806e-06, + "loss": 0.2094, + "step": 360 + }, + { + "epoch": 0.25008659508139935, + "grad_norm": 1.1576495170593262, + "learning_rate": 9.756588072122055e-06, + "loss": 0.1762, + "step": 361 + }, + { + "epoch": 0.2507793557325944, + "grad_norm": 1.246250867843628, + "learning_rate": 9.755894590846048e-06, + "loss": 0.1998, + "step": 362 + }, + { + "epoch": 0.2514721163837894, + "grad_norm": 1.1842297315597534, + "learning_rate": 9.755201109570043e-06, + "loss": 0.1827, + "step": 363 + }, + { + "epoch": 0.2521648770349844, + "grad_norm": 1.2482850551605225, + "learning_rate": 9.754507628294036e-06, + "loss": 0.1867, + "step": 364 + }, + { + "epoch": 0.2528576376861794, + "grad_norm": 1.3382683992385864, + "learning_rate": 9.753814147018031e-06, + "loss": 0.2192, + "step": 365 + }, + { + "epoch": 0.2535503983373744, + "grad_norm": 1.0633735656738281, + "learning_rate": 9.753120665742026e-06, + "loss": 0.163, + "step": 366 + }, + { + "epoch": 0.2542431589885695, + "grad_norm": 1.0362846851348877, + "learning_rate": 9.752427184466021e-06, + "loss": 0.172, + "step": 367 + }, + { + "epoch": 0.2549359196397645, + "grad_norm": 1.1413606405258179, + "learning_rate": 9.751733703190014e-06, + "loss": 0.2118, + "step": 368 + }, + { + "epoch": 0.2556286802909595, + "grad_norm": 1.2740391492843628, + "learning_rate": 9.751040221914009e-06, + "loss": 0.2133, + "step": 369 + }, + { + "epoch": 0.2563214409421545, + "grad_norm": 1.2702744007110596, + "learning_rate": 9.750346740638004e-06, + "loss": 0.2098, + "step": 370 + }, + { + "epoch": 0.2570142015933495, + "grad_norm": 1.3101216554641724, + "learning_rate": 9.749653259361997e-06, + "loss": 0.2287, + "step": 371 + }, + { + "epoch": 0.25770696224454454, + "grad_norm": 1.1721727848052979, + "learning_rate": 9.748959778085992e-06, + "loss": 0.1813, + "step": 372 + }, + { + "epoch": 0.25839972289573954, + "grad_norm": 1.2729319334030151, + "learning_rate": 9.748266296809987e-06, + "loss": 0.1981, + "step": 373 + }, + { + "epoch": 0.25909248354693454, + "grad_norm": 1.223813533782959, + "learning_rate": 9.74757281553398e-06, + "loss": 0.1912, + "step": 374 + }, + { + "epoch": 0.25978524419812954, + "grad_norm": 1.198010802268982, + "learning_rate": 9.746879334257975e-06, + "loss": 0.1891, + "step": 375 + }, + { + "epoch": 0.26047800484932454, + "grad_norm": 1.2105743885040283, + "learning_rate": 9.74618585298197e-06, + "loss": 0.2123, + "step": 376 + }, + { + "epoch": 0.2611707655005196, + "grad_norm": 1.2053524255752563, + "learning_rate": 9.745492371705965e-06, + "loss": 0.2044, + "step": 377 + }, + { + "epoch": 0.2618635261517146, + "grad_norm": 1.3305010795593262, + "learning_rate": 9.74479889042996e-06, + "loss": 0.203, + "step": 378 + }, + { + "epoch": 0.2625562868029096, + "grad_norm": 1.1689729690551758, + "learning_rate": 9.744105409153953e-06, + "loss": 0.2056, + "step": 379 + }, + { + "epoch": 0.2632490474541046, + "grad_norm": 1.2126179933547974, + "learning_rate": 9.743411927877948e-06, + "loss": 0.2246, + "step": 380 + }, + { + "epoch": 0.2639418081052996, + "grad_norm": 1.2313309907913208, + "learning_rate": 9.742718446601942e-06, + "loss": 0.1887, + "step": 381 + }, + { + "epoch": 0.2646345687564946, + "grad_norm": 1.238054633140564, + "learning_rate": 9.742024965325936e-06, + "loss": 0.2449, + "step": 382 + }, + { + "epoch": 0.26532732940768966, + "grad_norm": 1.1343685388565063, + "learning_rate": 9.741331484049931e-06, + "loss": 0.1905, + "step": 383 + }, + { + "epoch": 0.26602009005888466, + "grad_norm": 1.0633238554000854, + "learning_rate": 9.740638002773926e-06, + "loss": 0.1662, + "step": 384 + }, + { + "epoch": 0.26671285071007966, + "grad_norm": 1.267633318901062, + "learning_rate": 9.739944521497921e-06, + "loss": 0.1999, + "step": 385 + }, + { + "epoch": 0.26740561136127466, + "grad_norm": 1.1595088243484497, + "learning_rate": 9.739251040221915e-06, + "loss": 0.2027, + "step": 386 + }, + { + "epoch": 0.26809837201246967, + "grad_norm": 1.138271689414978, + "learning_rate": 9.73855755894591e-06, + "loss": 0.1548, + "step": 387 + }, + { + "epoch": 0.2687911326636647, + "grad_norm": 1.0888426303863525, + "learning_rate": 9.737864077669904e-06, + "loss": 0.2006, + "step": 388 + }, + { + "epoch": 0.2694838933148597, + "grad_norm": 1.2251331806182861, + "learning_rate": 9.737170596393898e-06, + "loss": 0.2073, + "step": 389 + }, + { + "epoch": 0.2701766539660547, + "grad_norm": 1.1277856826782227, + "learning_rate": 9.736477115117893e-06, + "loss": 0.1727, + "step": 390 + }, + { + "epoch": 0.2708694146172497, + "grad_norm": 1.1262153387069702, + "learning_rate": 9.735783633841886e-06, + "loss": 0.2016, + "step": 391 + }, + { + "epoch": 0.2715621752684447, + "grad_norm": 1.321863055229187, + "learning_rate": 9.73509015256588e-06, + "loss": 0.2055, + "step": 392 + }, + { + "epoch": 0.2722549359196398, + "grad_norm": 1.271531343460083, + "learning_rate": 9.734396671289876e-06, + "loss": 0.2554, + "step": 393 + }, + { + "epoch": 0.2729476965708348, + "grad_norm": 1.216313123703003, + "learning_rate": 9.73370319001387e-06, + "loss": 0.2341, + "step": 394 + }, + { + "epoch": 0.2736404572220298, + "grad_norm": 1.1416480541229248, + "learning_rate": 9.733009708737866e-06, + "loss": 0.1699, + "step": 395 + }, + { + "epoch": 0.2743332178732248, + "grad_norm": 1.0104820728302002, + "learning_rate": 9.732316227461859e-06, + "loss": 0.1712, + "step": 396 + }, + { + "epoch": 0.2750259785244198, + "grad_norm": 1.174569845199585, + "learning_rate": 9.731622746185854e-06, + "loss": 0.1882, + "step": 397 + }, + { + "epoch": 0.27571873917561485, + "grad_norm": 1.202559471130371, + "learning_rate": 9.730929264909849e-06, + "loss": 0.2103, + "step": 398 + }, + { + "epoch": 0.27641149982680985, + "grad_norm": 1.1985924243927002, + "learning_rate": 9.730235783633842e-06, + "loss": 0.1878, + "step": 399 + }, + { + "epoch": 0.27710426047800485, + "grad_norm": 1.156169056892395, + "learning_rate": 9.729542302357837e-06, + "loss": 0.2074, + "step": 400 + }, + { + "epoch": 0.27779702112919985, + "grad_norm": 1.2795816659927368, + "learning_rate": 9.728848821081832e-06, + "loss": 0.2074, + "step": 401 + }, + { + "epoch": 0.27848978178039485, + "grad_norm": 1.1505229473114014, + "learning_rate": 9.728155339805827e-06, + "loss": 0.192, + "step": 402 + }, + { + "epoch": 0.2791825424315899, + "grad_norm": 1.221264362335205, + "learning_rate": 9.727461858529822e-06, + "loss": 0.1976, + "step": 403 + }, + { + "epoch": 0.2798753030827849, + "grad_norm": 1.379112720489502, + "learning_rate": 9.726768377253815e-06, + "loss": 0.1998, + "step": 404 + }, + { + "epoch": 0.2805680637339799, + "grad_norm": 1.002105951309204, + "learning_rate": 9.72607489597781e-06, + "loss": 0.1537, + "step": 405 + }, + { + "epoch": 0.2812608243851749, + "grad_norm": 1.2976447343826294, + "learning_rate": 9.725381414701803e-06, + "loss": 0.2045, + "step": 406 + }, + { + "epoch": 0.2819535850363699, + "grad_norm": 1.293208360671997, + "learning_rate": 9.724687933425798e-06, + "loss": 0.2234, + "step": 407 + }, + { + "epoch": 0.28264634568756497, + "grad_norm": 1.0355128049850464, + "learning_rate": 9.723994452149793e-06, + "loss": 0.1715, + "step": 408 + }, + { + "epoch": 0.28333910633875997, + "grad_norm": 1.1013727188110352, + "learning_rate": 9.723300970873786e-06, + "loss": 0.156, + "step": 409 + }, + { + "epoch": 0.284031866989955, + "grad_norm": 1.2460412979125977, + "learning_rate": 9.722607489597781e-06, + "loss": 0.226, + "step": 410 + }, + { + "epoch": 0.28472462764115, + "grad_norm": 1.16079843044281, + "learning_rate": 9.721914008321776e-06, + "loss": 0.1926, + "step": 411 + }, + { + "epoch": 0.285417388292345, + "grad_norm": 1.0862517356872559, + "learning_rate": 9.721220527045771e-06, + "loss": 0.1744, + "step": 412 + }, + { + "epoch": 0.28611014894354003, + "grad_norm": 1.147106409072876, + "learning_rate": 9.720527045769766e-06, + "loss": 0.1853, + "step": 413 + }, + { + "epoch": 0.28680290959473503, + "grad_norm": 1.2101041078567505, + "learning_rate": 9.719833564493759e-06, + "loss": 0.2021, + "step": 414 + }, + { + "epoch": 0.28749567024593004, + "grad_norm": 1.2365410327911377, + "learning_rate": 9.719140083217754e-06, + "loss": 0.2283, + "step": 415 + }, + { + "epoch": 0.28818843089712504, + "grad_norm": 1.1555569171905518, + "learning_rate": 9.718446601941747e-06, + "loss": 0.2091, + "step": 416 + }, + { + "epoch": 0.28888119154832004, + "grad_norm": 1.1707910299301147, + "learning_rate": 9.717753120665742e-06, + "loss": 0.1849, + "step": 417 + }, + { + "epoch": 0.2895739521995151, + "grad_norm": 1.1495592594146729, + "learning_rate": 9.717059639389737e-06, + "loss": 0.185, + "step": 418 + }, + { + "epoch": 0.2902667128507101, + "grad_norm": 1.1243607997894287, + "learning_rate": 9.716366158113732e-06, + "loss": 0.1699, + "step": 419 + }, + { + "epoch": 0.2909594735019051, + "grad_norm": 1.0783498287200928, + "learning_rate": 9.715672676837727e-06, + "loss": 0.1778, + "step": 420 + }, + { + "epoch": 0.2916522341531001, + "grad_norm": 1.098490834236145, + "learning_rate": 9.71497919556172e-06, + "loss": 0.185, + "step": 421 + }, + { + "epoch": 0.2923449948042951, + "grad_norm": 1.2742648124694824, + "learning_rate": 9.714285714285715e-06, + "loss": 0.193, + "step": 422 + }, + { + "epoch": 0.2930377554554901, + "grad_norm": 1.2350236177444458, + "learning_rate": 9.71359223300971e-06, + "loss": 0.2115, + "step": 423 + }, + { + "epoch": 0.29373051610668516, + "grad_norm": 1.175158977508545, + "learning_rate": 9.712898751733703e-06, + "loss": 0.2067, + "step": 424 + }, + { + "epoch": 0.29442327675788016, + "grad_norm": 1.1748855113983154, + "learning_rate": 9.712205270457698e-06, + "loss": 0.1961, + "step": 425 + }, + { + "epoch": 0.29511603740907516, + "grad_norm": 1.1294324398040771, + "learning_rate": 9.711511789181692e-06, + "loss": 0.2103, + "step": 426 + }, + { + "epoch": 0.29580879806027016, + "grad_norm": 1.0847774744033813, + "learning_rate": 9.710818307905686e-06, + "loss": 0.1919, + "step": 427 + }, + { + "epoch": 0.29650155871146516, + "grad_norm": 1.0319700241088867, + "learning_rate": 9.710124826629681e-06, + "loss": 0.1564, + "step": 428 + }, + { + "epoch": 0.2971943193626602, + "grad_norm": 1.1214897632598877, + "learning_rate": 9.709431345353676e-06, + "loss": 0.187, + "step": 429 + }, + { + "epoch": 0.2978870800138552, + "grad_norm": 1.1951547861099243, + "learning_rate": 9.708737864077671e-06, + "loss": 0.1894, + "step": 430 + }, + { + "epoch": 0.2985798406650502, + "grad_norm": 1.082057237625122, + "learning_rate": 9.708044382801664e-06, + "loss": 0.1784, + "step": 431 + }, + { + "epoch": 0.2992726013162452, + "grad_norm": 1.1025521755218506, + "learning_rate": 9.70735090152566e-06, + "loss": 0.1705, + "step": 432 + }, + { + "epoch": 0.2999653619674402, + "grad_norm": 1.2060528993606567, + "learning_rate": 9.706657420249654e-06, + "loss": 0.2305, + "step": 433 + }, + { + "epoch": 0.3006581226186353, + "grad_norm": 1.2095818519592285, + "learning_rate": 9.705963938973648e-06, + "loss": 0.1757, + "step": 434 + }, + { + "epoch": 0.3013508832698303, + "grad_norm": 1.1277533769607544, + "learning_rate": 9.705270457697643e-06, + "loss": 0.1559, + "step": 435 + }, + { + "epoch": 0.3020436439210253, + "grad_norm": 1.0360640287399292, + "learning_rate": 9.704576976421637e-06, + "loss": 0.169, + "step": 436 + }, + { + "epoch": 0.3027364045722203, + "grad_norm": 1.2301961183547974, + "learning_rate": 9.703883495145632e-06, + "loss": 0.198, + "step": 437 + }, + { + "epoch": 0.3034291652234153, + "grad_norm": 1.2964677810668945, + "learning_rate": 9.703190013869627e-06, + "loss": 0.2049, + "step": 438 + }, + { + "epoch": 0.30412192587461034, + "grad_norm": 1.098554015159607, + "learning_rate": 9.70249653259362e-06, + "loss": 0.1893, + "step": 439 + }, + { + "epoch": 0.30481468652580535, + "grad_norm": 1.2005589008331299, + "learning_rate": 9.701803051317616e-06, + "loss": 0.211, + "step": 440 + }, + { + "epoch": 0.30550744717700035, + "grad_norm": 1.1574318408966064, + "learning_rate": 9.701109570041609e-06, + "loss": 0.1909, + "step": 441 + }, + { + "epoch": 0.30620020782819535, + "grad_norm": 1.2841503620147705, + "learning_rate": 9.700416088765604e-06, + "loss": 0.1951, + "step": 442 + }, + { + "epoch": 0.30689296847939035, + "grad_norm": 1.0907950401306152, + "learning_rate": 9.699722607489599e-06, + "loss": 0.1671, + "step": 443 + }, + { + "epoch": 0.3075857291305854, + "grad_norm": 1.0942476987838745, + "learning_rate": 9.699029126213594e-06, + "loss": 0.1738, + "step": 444 + }, + { + "epoch": 0.3082784897817804, + "grad_norm": 1.1258342266082764, + "learning_rate": 9.698335644937587e-06, + "loss": 0.1602, + "step": 445 + }, + { + "epoch": 0.3089712504329754, + "grad_norm": 1.1836743354797363, + "learning_rate": 9.697642163661582e-06, + "loss": 0.1926, + "step": 446 + }, + { + "epoch": 0.3096640110841704, + "grad_norm": 1.2888097763061523, + "learning_rate": 9.696948682385577e-06, + "loss": 0.1945, + "step": 447 + }, + { + "epoch": 0.3103567717353654, + "grad_norm": 1.0280569791793823, + "learning_rate": 9.696255201109572e-06, + "loss": 0.152, + "step": 448 + }, + { + "epoch": 0.31104953238656047, + "grad_norm": 1.1025164127349854, + "learning_rate": 9.695561719833565e-06, + "loss": 0.2013, + "step": 449 + }, + { + "epoch": 0.31174229303775547, + "grad_norm": 1.1613831520080566, + "learning_rate": 9.69486823855756e-06, + "loss": 0.1909, + "step": 450 + }, + { + "epoch": 0.31243505368895047, + "grad_norm": 1.1448906660079956, + "learning_rate": 9.694174757281553e-06, + "loss": 0.209, + "step": 451 + }, + { + "epoch": 0.31312781434014547, + "grad_norm": 1.0306289196014404, + "learning_rate": 9.693481276005548e-06, + "loss": 0.1618, + "step": 452 + }, + { + "epoch": 0.3138205749913405, + "grad_norm": 1.0736443996429443, + "learning_rate": 9.692787794729543e-06, + "loss": 0.1656, + "step": 453 + }, + { + "epoch": 0.31451333564253553, + "grad_norm": 1.131273627281189, + "learning_rate": 9.692094313453538e-06, + "loss": 0.1749, + "step": 454 + }, + { + "epoch": 0.31520609629373053, + "grad_norm": 1.082929253578186, + "learning_rate": 9.691400832177533e-06, + "loss": 0.1634, + "step": 455 + }, + { + "epoch": 0.31589885694492553, + "grad_norm": 1.1273523569107056, + "learning_rate": 9.690707350901526e-06, + "loss": 0.1873, + "step": 456 + }, + { + "epoch": 0.31659161759612053, + "grad_norm": 1.0584896802902222, + "learning_rate": 9.690013869625521e-06, + "loss": 0.1625, + "step": 457 + }, + { + "epoch": 0.31728437824731553, + "grad_norm": 1.2214630842208862, + "learning_rate": 9.689320388349516e-06, + "loss": 0.1886, + "step": 458 + }, + { + "epoch": 0.3179771388985106, + "grad_norm": 1.0907474756240845, + "learning_rate": 9.688626907073509e-06, + "loss": 0.1835, + "step": 459 + }, + { + "epoch": 0.3186698995497056, + "grad_norm": 1.190325379371643, + "learning_rate": 9.687933425797504e-06, + "loss": 0.1808, + "step": 460 + }, + { + "epoch": 0.3193626602009006, + "grad_norm": 1.0343101024627686, + "learning_rate": 9.687239944521499e-06, + "loss": 0.1647, + "step": 461 + }, + { + "epoch": 0.3200554208520956, + "grad_norm": 1.1271415948867798, + "learning_rate": 9.686546463245494e-06, + "loss": 0.1749, + "step": 462 + }, + { + "epoch": 0.3207481815032906, + "grad_norm": 1.165846347808838, + "learning_rate": 9.685852981969489e-06, + "loss": 0.1609, + "step": 463 + }, + { + "epoch": 0.3214409421544856, + "grad_norm": 1.1843103170394897, + "learning_rate": 9.685159500693482e-06, + "loss": 0.1854, + "step": 464 + }, + { + "epoch": 0.32213370280568066, + "grad_norm": 1.0988398790359497, + "learning_rate": 9.684466019417477e-06, + "loss": 0.1786, + "step": 465 + }, + { + "epoch": 0.32282646345687566, + "grad_norm": 1.2949365377426147, + "learning_rate": 9.68377253814147e-06, + "loss": 0.1947, + "step": 466 + }, + { + "epoch": 0.32351922410807066, + "grad_norm": 1.3218495845794678, + "learning_rate": 9.683079056865465e-06, + "loss": 0.2363, + "step": 467 + }, + { + "epoch": 0.32421198475926566, + "grad_norm": 1.0944557189941406, + "learning_rate": 9.68238557558946e-06, + "loss": 0.1645, + "step": 468 + }, + { + "epoch": 0.32490474541046066, + "grad_norm": 1.2187862396240234, + "learning_rate": 9.681692094313453e-06, + "loss": 0.1986, + "step": 469 + }, + { + "epoch": 0.3255975060616557, + "grad_norm": 1.1893078088760376, + "learning_rate": 9.680998613037448e-06, + "loss": 0.1879, + "step": 470 + }, + { + "epoch": 0.3262902667128507, + "grad_norm": 1.2025196552276611, + "learning_rate": 9.680305131761443e-06, + "loss": 0.1843, + "step": 471 + }, + { + "epoch": 0.3269830273640457, + "grad_norm": 1.064837098121643, + "learning_rate": 9.679611650485438e-06, + "loss": 0.161, + "step": 472 + }, + { + "epoch": 0.3276757880152407, + "grad_norm": 1.1834200620651245, + "learning_rate": 9.678918169209433e-06, + "loss": 0.2052, + "step": 473 + }, + { + "epoch": 0.3283685486664357, + "grad_norm": 1.117592215538025, + "learning_rate": 9.678224687933426e-06, + "loss": 0.1722, + "step": 474 + }, + { + "epoch": 0.3290613093176308, + "grad_norm": 1.1554933786392212, + "learning_rate": 9.677531206657421e-06, + "loss": 0.1696, + "step": 475 + }, + { + "epoch": 0.3297540699688258, + "grad_norm": 1.256576418876648, + "learning_rate": 9.676837725381414e-06, + "loss": 0.2044, + "step": 476 + }, + { + "epoch": 0.3304468306200208, + "grad_norm": 1.1218692064285278, + "learning_rate": 9.67614424410541e-06, + "loss": 0.1754, + "step": 477 + }, + { + "epoch": 0.3311395912712158, + "grad_norm": 1.0906285047531128, + "learning_rate": 9.675450762829404e-06, + "loss": 0.1782, + "step": 478 + }, + { + "epoch": 0.3318323519224108, + "grad_norm": 1.1338284015655518, + "learning_rate": 9.6747572815534e-06, + "loss": 0.1699, + "step": 479 + }, + { + "epoch": 0.33252511257360584, + "grad_norm": 1.0718350410461426, + "learning_rate": 9.674063800277394e-06, + "loss": 0.1797, + "step": 480 + }, + { + "epoch": 0.33321787322480084, + "grad_norm": 1.0006433725357056, + "learning_rate": 9.673370319001387e-06, + "loss": 0.1455, + "step": 481 + }, + { + "epoch": 0.33391063387599584, + "grad_norm": 0.9580618143081665, + "learning_rate": 9.672676837725382e-06, + "loss": 0.1499, + "step": 482 + }, + { + "epoch": 0.33460339452719084, + "grad_norm": 1.1401749849319458, + "learning_rate": 9.671983356449377e-06, + "loss": 0.1873, + "step": 483 + }, + { + "epoch": 0.33529615517838585, + "grad_norm": 1.089127779006958, + "learning_rate": 9.67128987517337e-06, + "loss": 0.165, + "step": 484 + }, + { + "epoch": 0.3359889158295809, + "grad_norm": 1.109618902206421, + "learning_rate": 9.670596393897365e-06, + "loss": 0.1922, + "step": 485 + }, + { + "epoch": 0.3366816764807759, + "grad_norm": 1.332227349281311, + "learning_rate": 9.669902912621359e-06, + "loss": 0.2299, + "step": 486 + }, + { + "epoch": 0.3373744371319709, + "grad_norm": 1.1735680103302002, + "learning_rate": 9.669209431345354e-06, + "loss": 0.1865, + "step": 487 + }, + { + "epoch": 0.3380671977831659, + "grad_norm": 1.0729219913482666, + "learning_rate": 9.668515950069349e-06, + "loss": 0.1925, + "step": 488 + }, + { + "epoch": 0.3387599584343609, + "grad_norm": 1.0912272930145264, + "learning_rate": 9.667822468793344e-06, + "loss": 0.1672, + "step": 489 + }, + { + "epoch": 0.33945271908555597, + "grad_norm": 1.1081303358078003, + "learning_rate": 9.667128987517338e-06, + "loss": 0.1879, + "step": 490 + }, + { + "epoch": 0.34014547973675097, + "grad_norm": 1.1520613431930542, + "learning_rate": 9.666435506241332e-06, + "loss": 0.1807, + "step": 491 + }, + { + "epoch": 0.34083824038794597, + "grad_norm": 1.1199016571044922, + "learning_rate": 9.665742024965327e-06, + "loss": 0.1783, + "step": 492 + }, + { + "epoch": 0.34153100103914097, + "grad_norm": 1.2753344774246216, + "learning_rate": 9.665048543689322e-06, + "loss": 0.2021, + "step": 493 + }, + { + "epoch": 0.34222376169033597, + "grad_norm": 1.1093775033950806, + "learning_rate": 9.664355062413315e-06, + "loss": 0.1505, + "step": 494 + }, + { + "epoch": 0.342916522341531, + "grad_norm": 1.1510506868362427, + "learning_rate": 9.66366158113731e-06, + "loss": 0.1739, + "step": 495 + }, + { + "epoch": 0.34360928299272603, + "grad_norm": 1.119981050491333, + "learning_rate": 9.662968099861305e-06, + "loss": 0.1804, + "step": 496 + }, + { + "epoch": 0.34430204364392103, + "grad_norm": 1.1446335315704346, + "learning_rate": 9.6622746185853e-06, + "loss": 0.1772, + "step": 497 + }, + { + "epoch": 0.34499480429511603, + "grad_norm": 1.0427706241607666, + "learning_rate": 9.661581137309295e-06, + "loss": 0.1481, + "step": 498 + }, + { + "epoch": 0.34568756494631103, + "grad_norm": 1.0160114765167236, + "learning_rate": 9.660887656033288e-06, + "loss": 0.1445, + "step": 499 + }, + { + "epoch": 0.3463803255975061, + "grad_norm": 1.117127537727356, + "learning_rate": 9.660194174757283e-06, + "loss": 0.1576, + "step": 500 + }, + { + "epoch": 0.3470730862487011, + "grad_norm": 1.1823601722717285, + "learning_rate": 9.659500693481276e-06, + "loss": 0.2048, + "step": 501 + }, + { + "epoch": 0.3477658468998961, + "grad_norm": 1.1818424463272095, + "learning_rate": 9.658807212205271e-06, + "loss": 0.2025, + "step": 502 + }, + { + "epoch": 0.3484586075510911, + "grad_norm": 1.2560657262802124, + "learning_rate": 9.658113730929266e-06, + "loss": 0.1853, + "step": 503 + }, + { + "epoch": 0.3491513682022861, + "grad_norm": 1.2229050397872925, + "learning_rate": 9.657420249653259e-06, + "loss": 0.2385, + "step": 504 + }, + { + "epoch": 0.3498441288534811, + "grad_norm": 1.2232409715652466, + "learning_rate": 9.656726768377254e-06, + "loss": 0.212, + "step": 505 + }, + { + "epoch": 0.35053688950467615, + "grad_norm": 1.0253208875656128, + "learning_rate": 9.656033287101249e-06, + "loss": 0.1585, + "step": 506 + }, + { + "epoch": 0.35122965015587115, + "grad_norm": 1.1161078214645386, + "learning_rate": 9.655339805825244e-06, + "loss": 0.1998, + "step": 507 + }, + { + "epoch": 0.35192241080706615, + "grad_norm": 1.1084858179092407, + "learning_rate": 9.654646324549239e-06, + "loss": 0.1627, + "step": 508 + }, + { + "epoch": 0.35261517145826116, + "grad_norm": 1.2024314403533936, + "learning_rate": 9.653952843273232e-06, + "loss": 0.1539, + "step": 509 + }, + { + "epoch": 0.35330793210945616, + "grad_norm": 1.1474061012268066, + "learning_rate": 9.653259361997227e-06, + "loss": 0.1886, + "step": 510 + }, + { + "epoch": 0.3540006927606512, + "grad_norm": 1.1132161617279053, + "learning_rate": 9.65256588072122e-06, + "loss": 0.1882, + "step": 511 + }, + { + "epoch": 0.3546934534118462, + "grad_norm": 1.0473583936691284, + "learning_rate": 9.651872399445215e-06, + "loss": 0.1735, + "step": 512 + }, + { + "epoch": 0.3553862140630412, + "grad_norm": 1.031063199043274, + "learning_rate": 9.65117891816921e-06, + "loss": 0.1658, + "step": 513 + }, + { + "epoch": 0.3560789747142362, + "grad_norm": 1.1989202499389648, + "learning_rate": 9.650485436893205e-06, + "loss": 0.1977, + "step": 514 + }, + { + "epoch": 0.3567717353654312, + "grad_norm": 1.1673550605773926, + "learning_rate": 9.6497919556172e-06, + "loss": 0.1803, + "step": 515 + }, + { + "epoch": 0.3574644960166263, + "grad_norm": 1.2909184694290161, + "learning_rate": 9.649098474341193e-06, + "loss": 0.1833, + "step": 516 + }, + { + "epoch": 0.3581572566678213, + "grad_norm": 1.2750986814498901, + "learning_rate": 9.648404993065188e-06, + "loss": 0.1966, + "step": 517 + }, + { + "epoch": 0.3588500173190163, + "grad_norm": 1.183593988418579, + "learning_rate": 9.647711511789183e-06, + "loss": 0.1902, + "step": 518 + }, + { + "epoch": 0.3595427779702113, + "grad_norm": 1.1403534412384033, + "learning_rate": 9.647018030513176e-06, + "loss": 0.1636, + "step": 519 + }, + { + "epoch": 0.3602355386214063, + "grad_norm": 1.127642273902893, + "learning_rate": 9.646324549237171e-06, + "loss": 0.1958, + "step": 520 + }, + { + "epoch": 0.36092829927260134, + "grad_norm": 1.1878528594970703, + "learning_rate": 9.645631067961166e-06, + "loss": 0.1899, + "step": 521 + }, + { + "epoch": 0.36162105992379634, + "grad_norm": 1.1126781702041626, + "learning_rate": 9.64493758668516e-06, + "loss": 0.1657, + "step": 522 + }, + { + "epoch": 0.36231382057499134, + "grad_norm": 1.1267461776733398, + "learning_rate": 9.644244105409154e-06, + "loss": 0.1706, + "step": 523 + }, + { + "epoch": 0.36300658122618634, + "grad_norm": 1.2233935594558716, + "learning_rate": 9.64355062413315e-06, + "loss": 0.1979, + "step": 524 + }, + { + "epoch": 0.36369934187738134, + "grad_norm": 1.3893072605133057, + "learning_rate": 9.642857142857144e-06, + "loss": 0.2063, + "step": 525 + }, + { + "epoch": 0.3643921025285764, + "grad_norm": 0.9728911519050598, + "learning_rate": 9.642163661581137e-06, + "loss": 0.1617, + "step": 526 + }, + { + "epoch": 0.3650848631797714, + "grad_norm": 1.113114833831787, + "learning_rate": 9.641470180305132e-06, + "loss": 0.1787, + "step": 527 + }, + { + "epoch": 0.3657776238309664, + "grad_norm": 1.1457524299621582, + "learning_rate": 9.640776699029127e-06, + "loss": 0.1738, + "step": 528 + }, + { + "epoch": 0.3664703844821614, + "grad_norm": 0.9557360410690308, + "learning_rate": 9.64008321775312e-06, + "loss": 0.1419, + "step": 529 + }, + { + "epoch": 0.3671631451333564, + "grad_norm": 1.0903773307800293, + "learning_rate": 9.639389736477115e-06, + "loss": 0.1851, + "step": 530 + }, + { + "epoch": 0.36785590578455146, + "grad_norm": 1.0435748100280762, + "learning_rate": 9.63869625520111e-06, + "loss": 0.1541, + "step": 531 + }, + { + "epoch": 0.36854866643574646, + "grad_norm": 1.2730120420455933, + "learning_rate": 9.638002773925105e-06, + "loss": 0.2004, + "step": 532 + }, + { + "epoch": 0.36924142708694146, + "grad_norm": 1.177163004875183, + "learning_rate": 9.6373092926491e-06, + "loss": 0.1913, + "step": 533 + }, + { + "epoch": 0.36993418773813647, + "grad_norm": 1.2473503351211548, + "learning_rate": 9.636615811373094e-06, + "loss": 0.1925, + "step": 534 + }, + { + "epoch": 0.37062694838933147, + "grad_norm": 1.0158774852752686, + "learning_rate": 9.635922330097088e-06, + "loss": 0.1535, + "step": 535 + }, + { + "epoch": 0.3713197090405265, + "grad_norm": 1.147966742515564, + "learning_rate": 9.635228848821082e-06, + "loss": 0.1947, + "step": 536 + }, + { + "epoch": 0.3720124696917215, + "grad_norm": 1.1904044151306152, + "learning_rate": 9.634535367545077e-06, + "loss": 0.1971, + "step": 537 + }, + { + "epoch": 0.3727052303429165, + "grad_norm": 1.0254184007644653, + "learning_rate": 9.633841886269072e-06, + "loss": 0.1567, + "step": 538 + }, + { + "epoch": 0.37339799099411153, + "grad_norm": 1.207237958908081, + "learning_rate": 9.633148404993066e-06, + "loss": 0.1931, + "step": 539 + }, + { + "epoch": 0.37409075164530653, + "grad_norm": 1.2658352851867676, + "learning_rate": 9.632454923717061e-06, + "loss": 0.171, + "step": 540 + }, + { + "epoch": 0.3747835122965016, + "grad_norm": 1.3020246028900146, + "learning_rate": 9.631761442441055e-06, + "loss": 0.1924, + "step": 541 + }, + { + "epoch": 0.3754762729476966, + "grad_norm": 1.0947773456573486, + "learning_rate": 9.63106796116505e-06, + "loss": 0.1948, + "step": 542 + }, + { + "epoch": 0.3761690335988916, + "grad_norm": 1.281325340270996, + "learning_rate": 9.630374479889045e-06, + "loss": 0.2063, + "step": 543 + }, + { + "epoch": 0.3768617942500866, + "grad_norm": 1.1554181575775146, + "learning_rate": 9.629680998613038e-06, + "loss": 0.1859, + "step": 544 + }, + { + "epoch": 0.3775545549012816, + "grad_norm": 1.1861845254898071, + "learning_rate": 9.628987517337033e-06, + "loss": 0.184, + "step": 545 + }, + { + "epoch": 0.3782473155524766, + "grad_norm": 1.0844807624816895, + "learning_rate": 9.628294036061026e-06, + "loss": 0.1549, + "step": 546 + }, + { + "epoch": 0.37894007620367165, + "grad_norm": 1.0676765441894531, + "learning_rate": 9.627600554785021e-06, + "loss": 0.1719, + "step": 547 + }, + { + "epoch": 0.37963283685486665, + "grad_norm": 1.0964168310165405, + "learning_rate": 9.626907073509016e-06, + "loss": 0.1632, + "step": 548 + }, + { + "epoch": 0.38032559750606165, + "grad_norm": 1.1241437196731567, + "learning_rate": 9.62621359223301e-06, + "loss": 0.1702, + "step": 549 + }, + { + "epoch": 0.38101835815725665, + "grad_norm": 0.990709662437439, + "learning_rate": 9.625520110957006e-06, + "loss": 0.1441, + "step": 550 + }, + { + "epoch": 0.38171111880845165, + "grad_norm": 1.131478190422058, + "learning_rate": 9.624826629680999e-06, + "loss": 0.1793, + "step": 551 + }, + { + "epoch": 0.3824038794596467, + "grad_norm": 1.108688473701477, + "learning_rate": 9.624133148404994e-06, + "loss": 0.1485, + "step": 552 + }, + { + "epoch": 0.3830966401108417, + "grad_norm": 1.3076444864273071, + "learning_rate": 9.623439667128989e-06, + "loss": 0.2132, + "step": 553 + }, + { + "epoch": 0.3837894007620367, + "grad_norm": 1.1900216341018677, + "learning_rate": 9.622746185852982e-06, + "loss": 0.1925, + "step": 554 + }, + { + "epoch": 0.3844821614132317, + "grad_norm": 1.1948908567428589, + "learning_rate": 9.622052704576977e-06, + "loss": 0.1964, + "step": 555 + }, + { + "epoch": 0.3851749220644267, + "grad_norm": 1.0484943389892578, + "learning_rate": 9.621359223300972e-06, + "loss": 0.1665, + "step": 556 + }, + { + "epoch": 0.3858676827156218, + "grad_norm": 1.1652822494506836, + "learning_rate": 9.620665742024967e-06, + "loss": 0.2059, + "step": 557 + }, + { + "epoch": 0.3865604433668168, + "grad_norm": 1.1573383808135986, + "learning_rate": 9.619972260748962e-06, + "loss": 0.1829, + "step": 558 + }, + { + "epoch": 0.3872532040180118, + "grad_norm": 1.075415015220642, + "learning_rate": 9.619278779472955e-06, + "loss": 0.1573, + "step": 559 + }, + { + "epoch": 0.3879459646692068, + "grad_norm": 1.114197850227356, + "learning_rate": 9.61858529819695e-06, + "loss": 0.183, + "step": 560 + }, + { + "epoch": 0.3886387253204018, + "grad_norm": 1.2600793838500977, + "learning_rate": 9.617891816920943e-06, + "loss": 0.1897, + "step": 561 + }, + { + "epoch": 0.38933148597159684, + "grad_norm": 1.266296625137329, + "learning_rate": 9.617198335644938e-06, + "loss": 0.1765, + "step": 562 + }, + { + "epoch": 0.39002424662279184, + "grad_norm": 1.1767463684082031, + "learning_rate": 9.616504854368933e-06, + "loss": 0.1755, + "step": 563 + }, + { + "epoch": 0.39071700727398684, + "grad_norm": 1.203078269958496, + "learning_rate": 9.615811373092926e-06, + "loss": 0.2081, + "step": 564 + }, + { + "epoch": 0.39140976792518184, + "grad_norm": 1.230280876159668, + "learning_rate": 9.615117891816921e-06, + "loss": 0.1554, + "step": 565 + }, + { + "epoch": 0.39210252857637684, + "grad_norm": 1.1045949459075928, + "learning_rate": 9.614424410540916e-06, + "loss": 0.1666, + "step": 566 + }, + { + "epoch": 0.3927952892275719, + "grad_norm": 1.3441511392593384, + "learning_rate": 9.613730929264911e-06, + "loss": 0.1952, + "step": 567 + }, + { + "epoch": 0.3934880498787669, + "grad_norm": 1.3405990600585938, + "learning_rate": 9.613037447988906e-06, + "loss": 0.2042, + "step": 568 + }, + { + "epoch": 0.3941808105299619, + "grad_norm": 1.2307276725769043, + "learning_rate": 9.6123439667129e-06, + "loss": 0.2099, + "step": 569 + }, + { + "epoch": 0.3948735711811569, + "grad_norm": 1.0651723146438599, + "learning_rate": 9.611650485436894e-06, + "loss": 0.1796, + "step": 570 + }, + { + "epoch": 0.3955663318323519, + "grad_norm": 1.2337777614593506, + "learning_rate": 9.610957004160887e-06, + "loss": 0.2137, + "step": 571 + }, + { + "epoch": 0.39625909248354696, + "grad_norm": 1.0993244647979736, + "learning_rate": 9.610263522884882e-06, + "loss": 0.1714, + "step": 572 + }, + { + "epoch": 0.39695185313474196, + "grad_norm": 1.1756548881530762, + "learning_rate": 9.609570041608877e-06, + "loss": 0.1684, + "step": 573 + }, + { + "epoch": 0.39764461378593696, + "grad_norm": 1.0881178379058838, + "learning_rate": 9.608876560332872e-06, + "loss": 0.1784, + "step": 574 + }, + { + "epoch": 0.39833737443713196, + "grad_norm": 1.1008052825927734, + "learning_rate": 9.608183079056867e-06, + "loss": 0.1593, + "step": 575 + }, + { + "epoch": 0.39903013508832696, + "grad_norm": 1.2633024454116821, + "learning_rate": 9.60748959778086e-06, + "loss": 0.1946, + "step": 576 + }, + { + "epoch": 0.399722895739522, + "grad_norm": 1.450706958770752, + "learning_rate": 9.606796116504855e-06, + "loss": 0.2175, + "step": 577 + }, + { + "epoch": 0.400415656390717, + "grad_norm": 1.2185648679733276, + "learning_rate": 9.60610263522885e-06, + "loss": 0.1849, + "step": 578 + }, + { + "epoch": 0.401108417041912, + "grad_norm": 1.235177755355835, + "learning_rate": 9.605409153952843e-06, + "loss": 0.1845, + "step": 579 + }, + { + "epoch": 0.401801177693107, + "grad_norm": 1.2215522527694702, + "learning_rate": 9.604715672676838e-06, + "loss": 0.1945, + "step": 580 + }, + { + "epoch": 0.402493938344302, + "grad_norm": 1.1482781171798706, + "learning_rate": 9.604022191400832e-06, + "loss": 0.1685, + "step": 581 + }, + { + "epoch": 0.40318669899549703, + "grad_norm": 1.165074348449707, + "learning_rate": 9.603328710124827e-06, + "loss": 0.1873, + "step": 582 + }, + { + "epoch": 0.4038794596466921, + "grad_norm": 1.2293803691864014, + "learning_rate": 9.602635228848822e-06, + "loss": 0.1879, + "step": 583 + }, + { + "epoch": 0.4045722202978871, + "grad_norm": 1.126194953918457, + "learning_rate": 9.601941747572816e-06, + "loss": 0.1751, + "step": 584 + }, + { + "epoch": 0.4052649809490821, + "grad_norm": 1.2499005794525146, + "learning_rate": 9.601248266296811e-06, + "loss": 0.1672, + "step": 585 + }, + { + "epoch": 0.4059577416002771, + "grad_norm": 1.1256407499313354, + "learning_rate": 9.600554785020805e-06, + "loss": 0.1786, + "step": 586 + }, + { + "epoch": 0.4066505022514721, + "grad_norm": 1.1976889371871948, + "learning_rate": 9.5998613037448e-06, + "loss": 0.1695, + "step": 587 + }, + { + "epoch": 0.40734326290266715, + "grad_norm": 1.0952256917953491, + "learning_rate": 9.599167822468795e-06, + "loss": 0.1888, + "step": 588 + }, + { + "epoch": 0.40803602355386215, + "grad_norm": 1.134501338005066, + "learning_rate": 9.598474341192788e-06, + "loss": 0.165, + "step": 589 + }, + { + "epoch": 0.40872878420505715, + "grad_norm": 1.1301794052124023, + "learning_rate": 9.597780859916783e-06, + "loss": 0.1749, + "step": 590 + }, + { + "epoch": 0.40942154485625215, + "grad_norm": 1.1504310369491577, + "learning_rate": 9.597087378640778e-06, + "loss": 0.1703, + "step": 591 + }, + { + "epoch": 0.41011430550744715, + "grad_norm": 1.1471697092056274, + "learning_rate": 9.596393897364773e-06, + "loss": 0.1733, + "step": 592 + }, + { + "epoch": 0.4108070661586422, + "grad_norm": 1.0838083028793335, + "learning_rate": 9.595700416088767e-06, + "loss": 0.1634, + "step": 593 + }, + { + "epoch": 0.4114998268098372, + "grad_norm": 1.149314284324646, + "learning_rate": 9.59500693481276e-06, + "loss": 0.1681, + "step": 594 + }, + { + "epoch": 0.4121925874610322, + "grad_norm": 1.072961688041687, + "learning_rate": 9.594313453536756e-06, + "loss": 0.1765, + "step": 595 + }, + { + "epoch": 0.4128853481122272, + "grad_norm": 1.0739357471466064, + "learning_rate": 9.593619972260749e-06, + "loss": 0.1644, + "step": 596 + }, + { + "epoch": 0.4135781087634222, + "grad_norm": 1.1856303215026855, + "learning_rate": 9.592926490984744e-06, + "loss": 0.1729, + "step": 597 + }, + { + "epoch": 0.41427086941461727, + "grad_norm": 1.0169947147369385, + "learning_rate": 9.592233009708739e-06, + "loss": 0.1483, + "step": 598 + }, + { + "epoch": 0.4149636300658123, + "grad_norm": 1.1429182291030884, + "learning_rate": 9.591539528432732e-06, + "loss": 0.1909, + "step": 599 + }, + { + "epoch": 0.4156563907170073, + "grad_norm": 1.009037971496582, + "learning_rate": 9.590846047156727e-06, + "loss": 0.1578, + "step": 600 + }, + { + "epoch": 0.4163491513682023, + "grad_norm": 1.1267091035842896, + "learning_rate": 9.590152565880722e-06, + "loss": 0.1718, + "step": 601 + }, + { + "epoch": 0.4170419120193973, + "grad_norm": 1.2382667064666748, + "learning_rate": 9.589459084604717e-06, + "loss": 0.2238, + "step": 602 + }, + { + "epoch": 0.41773467267059233, + "grad_norm": 1.196805715560913, + "learning_rate": 9.588765603328712e-06, + "loss": 0.1702, + "step": 603 + }, + { + "epoch": 0.41842743332178733, + "grad_norm": 1.0227054357528687, + "learning_rate": 9.588072122052705e-06, + "loss": 0.1671, + "step": 604 + }, + { + "epoch": 0.41912019397298234, + "grad_norm": 1.0865464210510254, + "learning_rate": 9.5873786407767e-06, + "loss": 0.163, + "step": 605 + }, + { + "epoch": 0.41981295462417734, + "grad_norm": 1.0725364685058594, + "learning_rate": 9.586685159500693e-06, + "loss": 0.1627, + "step": 606 + }, + { + "epoch": 0.42050571527537234, + "grad_norm": 1.1005607843399048, + "learning_rate": 9.585991678224688e-06, + "loss": 0.1541, + "step": 607 + }, + { + "epoch": 0.4211984759265674, + "grad_norm": 1.1609758138656616, + "learning_rate": 9.585298196948683e-06, + "loss": 0.1646, + "step": 608 + }, + { + "epoch": 0.4218912365777624, + "grad_norm": 0.9930508136749268, + "learning_rate": 9.584604715672678e-06, + "loss": 0.1479, + "step": 609 + }, + { + "epoch": 0.4225839972289574, + "grad_norm": 1.3058476448059082, + "learning_rate": 9.583911234396673e-06, + "loss": 0.1913, + "step": 610 + }, + { + "epoch": 0.4232767578801524, + "grad_norm": 1.258899211883545, + "learning_rate": 9.583217753120666e-06, + "loss": 0.2204, + "step": 611 + }, + { + "epoch": 0.4239695185313474, + "grad_norm": 1.1631954908370972, + "learning_rate": 9.582524271844661e-06, + "loss": 0.1746, + "step": 612 + }, + { + "epoch": 0.42466227918254246, + "grad_norm": 1.2395148277282715, + "learning_rate": 9.581830790568656e-06, + "loss": 0.1733, + "step": 613 + }, + { + "epoch": 0.42535503983373746, + "grad_norm": 1.0607619285583496, + "learning_rate": 9.58113730929265e-06, + "loss": 0.1603, + "step": 614 + }, + { + "epoch": 0.42604780048493246, + "grad_norm": 1.1310009956359863, + "learning_rate": 9.580443828016644e-06, + "loss": 0.1972, + "step": 615 + }, + { + "epoch": 0.42674056113612746, + "grad_norm": 1.1180874109268188, + "learning_rate": 9.579750346740639e-06, + "loss": 0.1816, + "step": 616 + }, + { + "epoch": 0.42743332178732246, + "grad_norm": 1.2065924406051636, + "learning_rate": 9.579056865464634e-06, + "loss": 0.2039, + "step": 617 + }, + { + "epoch": 0.4281260824385175, + "grad_norm": 1.1499533653259277, + "learning_rate": 9.578363384188627e-06, + "loss": 0.1819, + "step": 618 + }, + { + "epoch": 0.4288188430897125, + "grad_norm": 1.1326128244400024, + "learning_rate": 9.577669902912622e-06, + "loss": 0.1827, + "step": 619 + }, + { + "epoch": 0.4295116037409075, + "grad_norm": 1.0215452909469604, + "learning_rate": 9.576976421636617e-06, + "loss": 0.1887, + "step": 620 + }, + { + "epoch": 0.4302043643921025, + "grad_norm": 1.0503754615783691, + "learning_rate": 9.57628294036061e-06, + "loss": 0.1753, + "step": 621 + }, + { + "epoch": 0.4308971250432975, + "grad_norm": 1.2217227220535278, + "learning_rate": 9.575589459084605e-06, + "loss": 0.1885, + "step": 622 + }, + { + "epoch": 0.4315898856944925, + "grad_norm": 1.1326897144317627, + "learning_rate": 9.5748959778086e-06, + "loss": 0.2021, + "step": 623 + }, + { + "epoch": 0.4322826463456876, + "grad_norm": 1.143894910812378, + "learning_rate": 9.574202496532593e-06, + "loss": 0.2095, + "step": 624 + }, + { + "epoch": 0.4329754069968826, + "grad_norm": 1.1931277513504028, + "learning_rate": 9.573509015256588e-06, + "loss": 0.222, + "step": 625 + }, + { + "epoch": 0.4336681676480776, + "grad_norm": 1.1747947931289673, + "learning_rate": 9.572815533980583e-06, + "loss": 0.217, + "step": 626 + }, + { + "epoch": 0.4343609282992726, + "grad_norm": 1.1393777132034302, + "learning_rate": 9.572122052704578e-06, + "loss": 0.191, + "step": 627 + }, + { + "epoch": 0.4350536889504676, + "grad_norm": 1.1151050329208374, + "learning_rate": 9.571428571428573e-06, + "loss": 0.1918, + "step": 628 + }, + { + "epoch": 0.43574644960166264, + "grad_norm": 1.0819246768951416, + "learning_rate": 9.570735090152566e-06, + "loss": 0.1976, + "step": 629 + }, + { + "epoch": 0.43643921025285765, + "grad_norm": 0.962162971496582, + "learning_rate": 9.570041608876561e-06, + "loss": 0.1707, + "step": 630 + }, + { + "epoch": 0.43713197090405265, + "grad_norm": 1.1759436130523682, + "learning_rate": 9.569348127600555e-06, + "loss": 0.2099, + "step": 631 + }, + { + "epoch": 0.43782473155524765, + "grad_norm": 1.060920000076294, + "learning_rate": 9.56865464632455e-06, + "loss": 0.161, + "step": 632 + }, + { + "epoch": 0.43851749220644265, + "grad_norm": 1.1771917343139648, + "learning_rate": 9.567961165048544e-06, + "loss": 0.18, + "step": 633 + }, + { + "epoch": 0.4392102528576377, + "grad_norm": 1.1630181074142456, + "learning_rate": 9.56726768377254e-06, + "loss": 0.2122, + "step": 634 + }, + { + "epoch": 0.4399030135088327, + "grad_norm": 1.141360878944397, + "learning_rate": 9.566574202496534e-06, + "loss": 0.1777, + "step": 635 + }, + { + "epoch": 0.4405957741600277, + "grad_norm": 1.2425981760025024, + "learning_rate": 9.565880721220528e-06, + "loss": 0.1852, + "step": 636 + }, + { + "epoch": 0.4412885348112227, + "grad_norm": 1.1600677967071533, + "learning_rate": 9.565187239944523e-06, + "loss": 0.1837, + "step": 637 + }, + { + "epoch": 0.4419812954624177, + "grad_norm": 1.2033652067184448, + "learning_rate": 9.564493758668517e-06, + "loss": 0.1933, + "step": 638 + }, + { + "epoch": 0.44267405611361277, + "grad_norm": 1.1137166023254395, + "learning_rate": 9.56380027739251e-06, + "loss": 0.1688, + "step": 639 + }, + { + "epoch": 0.44336681676480777, + "grad_norm": 1.1078693866729736, + "learning_rate": 9.563106796116506e-06, + "loss": 0.1839, + "step": 640 + }, + { + "epoch": 0.44405957741600277, + "grad_norm": 1.2090137004852295, + "learning_rate": 9.562413314840499e-06, + "loss": 0.1989, + "step": 641 + }, + { + "epoch": 0.44475233806719777, + "grad_norm": 1.0715160369873047, + "learning_rate": 9.561719833564494e-06, + "loss": 0.1677, + "step": 642 + }, + { + "epoch": 0.4454450987183928, + "grad_norm": 1.1578370332717896, + "learning_rate": 9.561026352288489e-06, + "loss": 0.1691, + "step": 643 + }, + { + "epoch": 0.44613785936958783, + "grad_norm": 1.098793864250183, + "learning_rate": 9.560332871012484e-06, + "loss": 0.145, + "step": 644 + }, + { + "epoch": 0.44683062002078283, + "grad_norm": 1.1655832529067993, + "learning_rate": 9.559639389736479e-06, + "loss": 0.2167, + "step": 645 + }, + { + "epoch": 0.44752338067197783, + "grad_norm": 1.0528911352157593, + "learning_rate": 9.558945908460472e-06, + "loss": 0.155, + "step": 646 + }, + { + "epoch": 0.44821614132317283, + "grad_norm": 1.0747157335281372, + "learning_rate": 9.558252427184467e-06, + "loss": 0.1867, + "step": 647 + }, + { + "epoch": 0.44890890197436784, + "grad_norm": 1.0972528457641602, + "learning_rate": 9.557558945908462e-06, + "loss": 0.1949, + "step": 648 + }, + { + "epoch": 0.4496016626255629, + "grad_norm": 1.099705696105957, + "learning_rate": 9.556865464632455e-06, + "loss": 0.1808, + "step": 649 + }, + { + "epoch": 0.4502944232767579, + "grad_norm": 1.1693402528762817, + "learning_rate": 9.55617198335645e-06, + "loss": 0.1955, + "step": 650 + }, + { + "epoch": 0.4509871839279529, + "grad_norm": 1.0074596405029297, + "learning_rate": 9.555478502080445e-06, + "loss": 0.1664, + "step": 651 + }, + { + "epoch": 0.4516799445791479, + "grad_norm": 1.0684326887130737, + "learning_rate": 9.55478502080444e-06, + "loss": 0.1709, + "step": 652 + }, + { + "epoch": 0.4523727052303429, + "grad_norm": 1.0958446264266968, + "learning_rate": 9.554091539528435e-06, + "loss": 0.1665, + "step": 653 + }, + { + "epoch": 0.45306546588153795, + "grad_norm": 1.1592566967010498, + "learning_rate": 9.553398058252428e-06, + "loss": 0.2131, + "step": 654 + }, + { + "epoch": 0.45375822653273296, + "grad_norm": 1.1357659101486206, + "learning_rate": 9.552704576976423e-06, + "loss": 0.185, + "step": 655 + }, + { + "epoch": 0.45445098718392796, + "grad_norm": 1.0651899576187134, + "learning_rate": 9.552011095700416e-06, + "loss": 0.1653, + "step": 656 + }, + { + "epoch": 0.45514374783512296, + "grad_norm": 1.1451846361160278, + "learning_rate": 9.551317614424411e-06, + "loss": 0.1967, + "step": 657 + }, + { + "epoch": 0.45583650848631796, + "grad_norm": 1.0786046981811523, + "learning_rate": 9.550624133148406e-06, + "loss": 0.196, + "step": 658 + }, + { + "epoch": 0.456529269137513, + "grad_norm": 0.9809712171554565, + "learning_rate": 9.5499306518724e-06, + "loss": 0.1568, + "step": 659 + }, + { + "epoch": 0.457222029788708, + "grad_norm": 1.2117514610290527, + "learning_rate": 9.549237170596394e-06, + "loss": 0.1804, + "step": 660 + }, + { + "epoch": 0.457914790439903, + "grad_norm": 1.1225571632385254, + "learning_rate": 9.548543689320389e-06, + "loss": 0.1837, + "step": 661 + }, + { + "epoch": 0.458607551091098, + "grad_norm": 1.0222355127334595, + "learning_rate": 9.547850208044384e-06, + "loss": 0.1809, + "step": 662 + }, + { + "epoch": 0.459300311742293, + "grad_norm": 1.2313727140426636, + "learning_rate": 9.547156726768379e-06, + "loss": 0.2136, + "step": 663 + }, + { + "epoch": 0.459993072393488, + "grad_norm": 1.2407517433166504, + "learning_rate": 9.546463245492372e-06, + "loss": 0.1986, + "step": 664 + }, + { + "epoch": 0.4606858330446831, + "grad_norm": 1.126572847366333, + "learning_rate": 9.545769764216367e-06, + "loss": 0.1767, + "step": 665 + }, + { + "epoch": 0.4613785936958781, + "grad_norm": 1.0602760314941406, + "learning_rate": 9.54507628294036e-06, + "loss": 0.1603, + "step": 666 + }, + { + "epoch": 0.4620713543470731, + "grad_norm": 1.2250707149505615, + "learning_rate": 9.544382801664355e-06, + "loss": 0.1639, + "step": 667 + }, + { + "epoch": 0.4627641149982681, + "grad_norm": 1.2366909980773926, + "learning_rate": 9.54368932038835e-06, + "loss": 0.2106, + "step": 668 + }, + { + "epoch": 0.4634568756494631, + "grad_norm": 1.2094905376434326, + "learning_rate": 9.542995839112345e-06, + "loss": 0.2088, + "step": 669 + }, + { + "epoch": 0.46414963630065814, + "grad_norm": 1.1645236015319824, + "learning_rate": 9.54230235783634e-06, + "loss": 0.17, + "step": 670 + }, + { + "epoch": 0.46484239695185314, + "grad_norm": 1.1013844013214111, + "learning_rate": 9.541608876560333e-06, + "loss": 0.1776, + "step": 671 + }, + { + "epoch": 0.46553515760304814, + "grad_norm": 1.1031912565231323, + "learning_rate": 9.540915395284328e-06, + "loss": 0.1666, + "step": 672 + }, + { + "epoch": 0.46622791825424315, + "grad_norm": 1.1358929872512817, + "learning_rate": 9.540221914008323e-06, + "loss": 0.1939, + "step": 673 + }, + { + "epoch": 0.46692067890543815, + "grad_norm": 1.0319671630859375, + "learning_rate": 9.539528432732316e-06, + "loss": 0.16, + "step": 674 + }, + { + "epoch": 0.4676134395566332, + "grad_norm": 1.1813427209854126, + "learning_rate": 9.538834951456311e-06, + "loss": 0.1536, + "step": 675 + }, + { + "epoch": 0.4683062002078282, + "grad_norm": 1.269011378288269, + "learning_rate": 9.538141470180306e-06, + "loss": 0.19, + "step": 676 + }, + { + "epoch": 0.4689989608590232, + "grad_norm": 1.142793893814087, + "learning_rate": 9.5374479889043e-06, + "loss": 0.1707, + "step": 677 + }, + { + "epoch": 0.4696917215102182, + "grad_norm": 1.0003671646118164, + "learning_rate": 9.536754507628294e-06, + "loss": 0.1606, + "step": 678 + }, + { + "epoch": 0.4703844821614132, + "grad_norm": 1.093482255935669, + "learning_rate": 9.53606102635229e-06, + "loss": 0.1654, + "step": 679 + }, + { + "epoch": 0.47107724281260827, + "grad_norm": 1.0161356925964355, + "learning_rate": 9.535367545076284e-06, + "loss": 0.1546, + "step": 680 + }, + { + "epoch": 0.47177000346380327, + "grad_norm": 1.2691065073013306, + "learning_rate": 9.534674063800278e-06, + "loss": 0.1511, + "step": 681 + }, + { + "epoch": 0.47246276411499827, + "grad_norm": 1.1133615970611572, + "learning_rate": 9.533980582524273e-06, + "loss": 0.1745, + "step": 682 + }, + { + "epoch": 0.47315552476619327, + "grad_norm": 1.132333517074585, + "learning_rate": 9.533287101248267e-06, + "loss": 0.158, + "step": 683 + }, + { + "epoch": 0.47384828541738827, + "grad_norm": 1.217966914176941, + "learning_rate": 9.53259361997226e-06, + "loss": 0.1904, + "step": 684 + }, + { + "epoch": 0.4745410460685833, + "grad_norm": 1.2397990226745605, + "learning_rate": 9.531900138696256e-06, + "loss": 0.1963, + "step": 685 + }, + { + "epoch": 0.47523380671977833, + "grad_norm": 1.2267277240753174, + "learning_rate": 9.53120665742025e-06, + "loss": 0.1831, + "step": 686 + }, + { + "epoch": 0.47592656737097333, + "grad_norm": 1.0864651203155518, + "learning_rate": 9.530513176144245e-06, + "loss": 0.1771, + "step": 687 + }, + { + "epoch": 0.47661932802216833, + "grad_norm": 1.104586124420166, + "learning_rate": 9.52981969486824e-06, + "loss": 0.1497, + "step": 688 + }, + { + "epoch": 0.47731208867336333, + "grad_norm": 1.0452547073364258, + "learning_rate": 9.529126213592234e-06, + "loss": 0.1816, + "step": 689 + }, + { + "epoch": 0.4780048493245584, + "grad_norm": 1.189637541770935, + "learning_rate": 9.528432732316229e-06, + "loss": 0.2085, + "step": 690 + }, + { + "epoch": 0.4786976099757534, + "grad_norm": 0.9842264652252197, + "learning_rate": 9.527739251040222e-06, + "loss": 0.168, + "step": 691 + }, + { + "epoch": 0.4793903706269484, + "grad_norm": 1.2393513917922974, + "learning_rate": 9.527045769764217e-06, + "loss": 0.1678, + "step": 692 + }, + { + "epoch": 0.4800831312781434, + "grad_norm": 1.2825803756713867, + "learning_rate": 9.526352288488212e-06, + "loss": 0.1998, + "step": 693 + }, + { + "epoch": 0.4807758919293384, + "grad_norm": 1.0857014656066895, + "learning_rate": 9.525658807212207e-06, + "loss": 0.1661, + "step": 694 + }, + { + "epoch": 0.48146865258053345, + "grad_norm": 1.0867372751235962, + "learning_rate": 9.524965325936202e-06, + "loss": 0.194, + "step": 695 + }, + { + "epoch": 0.48216141323172845, + "grad_norm": 1.1201382875442505, + "learning_rate": 9.524271844660195e-06, + "loss": 0.1708, + "step": 696 + }, + { + "epoch": 0.48285417388292345, + "grad_norm": 1.0329430103302002, + "learning_rate": 9.52357836338419e-06, + "loss": 0.1904, + "step": 697 + }, + { + "epoch": 0.48354693453411846, + "grad_norm": 1.0161997079849243, + "learning_rate": 9.522884882108185e-06, + "loss": 0.1558, + "step": 698 + }, + { + "epoch": 0.48423969518531346, + "grad_norm": 1.0819106101989746, + "learning_rate": 9.522191400832178e-06, + "loss": 0.1486, + "step": 699 + }, + { + "epoch": 0.4849324558365085, + "grad_norm": 1.1887950897216797, + "learning_rate": 9.521497919556173e-06, + "loss": 0.1777, + "step": 700 + }, + { + "epoch": 0.4856252164877035, + "grad_norm": 1.1242791414260864, + "learning_rate": 9.520804438280166e-06, + "loss": 0.1696, + "step": 701 + }, + { + "epoch": 0.4863179771388985, + "grad_norm": 1.224696397781372, + "learning_rate": 9.520110957004161e-06, + "loss": 0.2119, + "step": 702 + }, + { + "epoch": 0.4870107377900935, + "grad_norm": 1.1201990842819214, + "learning_rate": 9.519417475728156e-06, + "loss": 0.1641, + "step": 703 + }, + { + "epoch": 0.4877034984412885, + "grad_norm": 1.097137689590454, + "learning_rate": 9.518723994452151e-06, + "loss": 0.1775, + "step": 704 + }, + { + "epoch": 0.4883962590924835, + "grad_norm": 1.062699556350708, + "learning_rate": 9.518030513176146e-06, + "loss": 0.1501, + "step": 705 + }, + { + "epoch": 0.4890890197436786, + "grad_norm": 1.1051220893859863, + "learning_rate": 9.517337031900139e-06, + "loss": 0.1586, + "step": 706 + }, + { + "epoch": 0.4897817803948736, + "grad_norm": 1.1447747945785522, + "learning_rate": 9.516643550624134e-06, + "loss": 0.1961, + "step": 707 + }, + { + "epoch": 0.4904745410460686, + "grad_norm": 1.0780168771743774, + "learning_rate": 9.515950069348129e-06, + "loss": 0.1665, + "step": 708 + }, + { + "epoch": 0.4911673016972636, + "grad_norm": 1.154279112815857, + "learning_rate": 9.515256588072122e-06, + "loss": 0.1913, + "step": 709 + }, + { + "epoch": 0.4918600623484586, + "grad_norm": 1.137214183807373, + "learning_rate": 9.514563106796117e-06, + "loss": 0.1734, + "step": 710 + }, + { + "epoch": 0.49255282299965364, + "grad_norm": 1.1921672821044922, + "learning_rate": 9.513869625520112e-06, + "loss": 0.1781, + "step": 711 + }, + { + "epoch": 0.49324558365084864, + "grad_norm": 1.0072582960128784, + "learning_rate": 9.513176144244107e-06, + "loss": 0.1362, + "step": 712 + }, + { + "epoch": 0.49393834430204364, + "grad_norm": 1.1574749946594238, + "learning_rate": 9.512482662968102e-06, + "loss": 0.1693, + "step": 713 + }, + { + "epoch": 0.49463110495323864, + "grad_norm": 1.2283178567886353, + "learning_rate": 9.511789181692095e-06, + "loss": 0.1753, + "step": 714 + }, + { + "epoch": 0.49532386560443364, + "grad_norm": 0.8855034708976746, + "learning_rate": 9.51109570041609e-06, + "loss": 0.131, + "step": 715 + }, + { + "epoch": 0.4960166262556287, + "grad_norm": 1.0271499156951904, + "learning_rate": 9.510402219140083e-06, + "loss": 0.1416, + "step": 716 + }, + { + "epoch": 0.4967093869068237, + "grad_norm": 1.055769920349121, + "learning_rate": 9.509708737864078e-06, + "loss": 0.1802, + "step": 717 + }, + { + "epoch": 0.4974021475580187, + "grad_norm": 1.1768474578857422, + "learning_rate": 9.509015256588073e-06, + "loss": 0.1647, + "step": 718 + }, + { + "epoch": 0.4980949082092137, + "grad_norm": 1.1791810989379883, + "learning_rate": 9.508321775312066e-06, + "loss": 0.216, + "step": 719 + }, + { + "epoch": 0.4987876688604087, + "grad_norm": 1.0952759981155396, + "learning_rate": 9.507628294036061e-06, + "loss": 0.1976, + "step": 720 + }, + { + "epoch": 0.49948042951160376, + "grad_norm": 1.055741548538208, + "learning_rate": 9.506934812760056e-06, + "loss": 0.1744, + "step": 721 + }, + { + "epoch": 0.5001731901627987, + "grad_norm": 1.1406736373901367, + "learning_rate": 9.506241331484051e-06, + "loss": 0.1761, + "step": 722 + }, + { + "epoch": 0.5008659508139938, + "grad_norm": 1.1879351139068604, + "learning_rate": 9.505547850208046e-06, + "loss": 0.1842, + "step": 723 + }, + { + "epoch": 0.5015587114651888, + "grad_norm": 1.1181124448776245, + "learning_rate": 9.50485436893204e-06, + "loss": 0.1799, + "step": 724 + }, + { + "epoch": 0.5022514721163838, + "grad_norm": 1.1482161283493042, + "learning_rate": 9.504160887656034e-06, + "loss": 0.1711, + "step": 725 + }, + { + "epoch": 0.5029442327675788, + "grad_norm": 1.0373079776763916, + "learning_rate": 9.503467406380028e-06, + "loss": 0.1639, + "step": 726 + }, + { + "epoch": 0.5036369934187738, + "grad_norm": 0.9980860948562622, + "learning_rate": 9.502773925104022e-06, + "loss": 0.1604, + "step": 727 + }, + { + "epoch": 0.5043297540699688, + "grad_norm": 1.0549565553665161, + "learning_rate": 9.502080443828017e-06, + "loss": 0.1876, + "step": 728 + }, + { + "epoch": 0.5050225147211639, + "grad_norm": 1.1054435968399048, + "learning_rate": 9.501386962552012e-06, + "loss": 0.1753, + "step": 729 + }, + { + "epoch": 0.5057152753723588, + "grad_norm": 1.103574514389038, + "learning_rate": 9.500693481276007e-06, + "loss": 0.1581, + "step": 730 + }, + { + "epoch": 0.5064080360235539, + "grad_norm": 1.071014165878296, + "learning_rate": 9.5e-06, + "loss": 0.1746, + "step": 731 + }, + { + "epoch": 0.5071007966747488, + "grad_norm": 1.1004180908203125, + "learning_rate": 9.499306518723995e-06, + "loss": 0.1914, + "step": 732 + }, + { + "epoch": 0.5077935573259439, + "grad_norm": 1.2091470956802368, + "learning_rate": 9.49861303744799e-06, + "loss": 0.1894, + "step": 733 + }, + { + "epoch": 0.508486317977139, + "grad_norm": 1.163714051246643, + "learning_rate": 9.497919556171984e-06, + "loss": 0.1919, + "step": 734 + }, + { + "epoch": 0.5091790786283339, + "grad_norm": 1.2377830743789673, + "learning_rate": 9.497226074895979e-06, + "loss": 0.1979, + "step": 735 + }, + { + "epoch": 0.509871839279529, + "grad_norm": 1.0097078084945679, + "learning_rate": 9.496532593619972e-06, + "loss": 0.1594, + "step": 736 + }, + { + "epoch": 0.5105645999307239, + "grad_norm": 1.0955952405929565, + "learning_rate": 9.495839112343967e-06, + "loss": 0.1671, + "step": 737 + }, + { + "epoch": 0.511257360581919, + "grad_norm": 1.084110975265503, + "learning_rate": 9.495145631067962e-06, + "loss": 0.1764, + "step": 738 + }, + { + "epoch": 0.511950121233114, + "grad_norm": 1.0516014099121094, + "learning_rate": 9.494452149791957e-06, + "loss": 0.1606, + "step": 739 + }, + { + "epoch": 0.512642881884309, + "grad_norm": 1.0338329076766968, + "learning_rate": 9.493758668515952e-06, + "loss": 0.1611, + "step": 740 + }, + { + "epoch": 0.513335642535504, + "grad_norm": 1.1432766914367676, + "learning_rate": 9.493065187239945e-06, + "loss": 0.1695, + "step": 741 + }, + { + "epoch": 0.514028403186699, + "grad_norm": 1.0821613073349, + "learning_rate": 9.49237170596394e-06, + "loss": 0.1746, + "step": 742 + }, + { + "epoch": 0.514721163837894, + "grad_norm": 1.1612175703048706, + "learning_rate": 9.491678224687935e-06, + "loss": 0.1675, + "step": 743 + }, + { + "epoch": 0.5154139244890891, + "grad_norm": 1.1131502389907837, + "learning_rate": 9.490984743411928e-06, + "loss": 0.1655, + "step": 744 + }, + { + "epoch": 0.516106685140284, + "grad_norm": 1.0816477537155151, + "learning_rate": 9.490291262135923e-06, + "loss": 0.1672, + "step": 745 + }, + { + "epoch": 0.5167994457914791, + "grad_norm": 1.1418256759643555, + "learning_rate": 9.489597780859918e-06, + "loss": 0.1934, + "step": 746 + }, + { + "epoch": 0.517492206442674, + "grad_norm": 1.1041661500930786, + "learning_rate": 9.488904299583913e-06, + "loss": 0.1733, + "step": 747 + }, + { + "epoch": 0.5181849670938691, + "grad_norm": 1.169494867324829, + "learning_rate": 9.488210818307908e-06, + "loss": 0.1676, + "step": 748 + }, + { + "epoch": 0.5188777277450641, + "grad_norm": 1.0808500051498413, + "learning_rate": 9.487517337031901e-06, + "loss": 0.1641, + "step": 749 + }, + { + "epoch": 0.5195704883962591, + "grad_norm": 1.0408234596252441, + "learning_rate": 9.486823855755896e-06, + "loss": 0.1712, + "step": 750 + }, + { + "epoch": 0.5202632490474541, + "grad_norm": 1.096534252166748, + "learning_rate": 9.486130374479889e-06, + "loss": 0.1697, + "step": 751 + }, + { + "epoch": 0.5209560096986491, + "grad_norm": 1.1266546249389648, + "learning_rate": 9.485436893203884e-06, + "loss": 0.1733, + "step": 752 + }, + { + "epoch": 0.5216487703498441, + "grad_norm": 1.0809706449508667, + "learning_rate": 9.484743411927879e-06, + "loss": 0.1488, + "step": 753 + }, + { + "epoch": 0.5223415310010392, + "grad_norm": 1.0462276935577393, + "learning_rate": 9.484049930651872e-06, + "loss": 0.1563, + "step": 754 + }, + { + "epoch": 0.5230342916522341, + "grad_norm": 0.9936279654502869, + "learning_rate": 9.483356449375867e-06, + "loss": 0.1465, + "step": 755 + }, + { + "epoch": 0.5237270523034292, + "grad_norm": 1.135073184967041, + "learning_rate": 9.482662968099862e-06, + "loss": 0.2139, + "step": 756 + }, + { + "epoch": 0.5244198129546241, + "grad_norm": 1.115220069885254, + "learning_rate": 9.481969486823857e-06, + "loss": 0.1782, + "step": 757 + }, + { + "epoch": 0.5251125736058192, + "grad_norm": 1.0991733074188232, + "learning_rate": 9.481276005547852e-06, + "loss": 0.1593, + "step": 758 + }, + { + "epoch": 0.5258053342570143, + "grad_norm": 1.1360588073730469, + "learning_rate": 9.480582524271845e-06, + "loss": 0.1931, + "step": 759 + }, + { + "epoch": 0.5264980949082092, + "grad_norm": 1.1279994249343872, + "learning_rate": 9.47988904299584e-06, + "loss": 0.1854, + "step": 760 + }, + { + "epoch": 0.5271908555594043, + "grad_norm": 1.1006906032562256, + "learning_rate": 9.479195561719833e-06, + "loss": 0.1527, + "step": 761 + }, + { + "epoch": 0.5278836162105992, + "grad_norm": 1.2920564413070679, + "learning_rate": 9.478502080443828e-06, + "loss": 0.2194, + "step": 762 + }, + { + "epoch": 0.5285763768617943, + "grad_norm": 1.049129605293274, + "learning_rate": 9.477808599167823e-06, + "loss": 0.1614, + "step": 763 + }, + { + "epoch": 0.5292691375129892, + "grad_norm": 1.2203729152679443, + "learning_rate": 9.477115117891818e-06, + "loss": 0.1753, + "step": 764 + }, + { + "epoch": 0.5299618981641843, + "grad_norm": 1.2329607009887695, + "learning_rate": 9.476421636615813e-06, + "loss": 0.1734, + "step": 765 + }, + { + "epoch": 0.5306546588153793, + "grad_norm": 1.0419095754623413, + "learning_rate": 9.475728155339806e-06, + "loss": 0.1663, + "step": 766 + }, + { + "epoch": 0.5313474194665743, + "grad_norm": 1.1187750101089478, + "learning_rate": 9.475034674063801e-06, + "loss": 0.1609, + "step": 767 + }, + { + "epoch": 0.5320401801177693, + "grad_norm": 0.9896152019500732, + "learning_rate": 9.474341192787796e-06, + "loss": 0.1529, + "step": 768 + }, + { + "epoch": 0.5327329407689643, + "grad_norm": 1.0580388307571411, + "learning_rate": 9.47364771151179e-06, + "loss": 0.177, + "step": 769 + }, + { + "epoch": 0.5334257014201593, + "grad_norm": 1.0653841495513916, + "learning_rate": 9.472954230235784e-06, + "loss": 0.1877, + "step": 770 + }, + { + "epoch": 0.5341184620713544, + "grad_norm": 1.161211609840393, + "learning_rate": 9.47226074895978e-06, + "loss": 0.1858, + "step": 771 + }, + { + "epoch": 0.5348112227225493, + "grad_norm": 1.0894384384155273, + "learning_rate": 9.471567267683774e-06, + "loss": 0.2039, + "step": 772 + }, + { + "epoch": 0.5355039833737444, + "grad_norm": 1.0264865159988403, + "learning_rate": 9.470873786407767e-06, + "loss": 0.1519, + "step": 773 + }, + { + "epoch": 0.5361967440249393, + "grad_norm": 1.2336781024932861, + "learning_rate": 9.470180305131762e-06, + "loss": 0.1956, + "step": 774 + }, + { + "epoch": 0.5368895046761344, + "grad_norm": 1.064982533454895, + "learning_rate": 9.469486823855757e-06, + "loss": 0.1591, + "step": 775 + }, + { + "epoch": 0.5375822653273294, + "grad_norm": 1.0894606113433838, + "learning_rate": 9.46879334257975e-06, + "loss": 0.1708, + "step": 776 + }, + { + "epoch": 0.5382750259785244, + "grad_norm": 0.9786089062690735, + "learning_rate": 9.468099861303745e-06, + "loss": 0.1295, + "step": 777 + }, + { + "epoch": 0.5389677866297194, + "grad_norm": 1.1241366863250732, + "learning_rate": 9.46740638002774e-06, + "loss": 0.1713, + "step": 778 + }, + { + "epoch": 0.5396605472809144, + "grad_norm": 1.055454969406128, + "learning_rate": 9.466712898751734e-06, + "loss": 0.1754, + "step": 779 + }, + { + "epoch": 0.5403533079321095, + "grad_norm": 1.2188910245895386, + "learning_rate": 9.466019417475729e-06, + "loss": 0.2029, + "step": 780 + }, + { + "epoch": 0.5410460685833045, + "grad_norm": 0.9836776852607727, + "learning_rate": 9.465325936199723e-06, + "loss": 0.1491, + "step": 781 + }, + { + "epoch": 0.5417388292344995, + "grad_norm": 1.0582470893859863, + "learning_rate": 9.464632454923718e-06, + "loss": 0.1693, + "step": 782 + }, + { + "epoch": 0.5424315898856945, + "grad_norm": 1.2043572664260864, + "learning_rate": 9.463938973647713e-06, + "loss": 0.1891, + "step": 783 + }, + { + "epoch": 0.5431243505368895, + "grad_norm": 1.095377802848816, + "learning_rate": 9.463245492371707e-06, + "loss": 0.1809, + "step": 784 + }, + { + "epoch": 0.5438171111880845, + "grad_norm": 1.0977164506912231, + "learning_rate": 9.462552011095702e-06, + "loss": 0.1703, + "step": 785 + }, + { + "epoch": 0.5445098718392796, + "grad_norm": 1.1873396635055542, + "learning_rate": 9.461858529819695e-06, + "loss": 0.142, + "step": 786 + }, + { + "epoch": 0.5452026324904745, + "grad_norm": 1.0897413492202759, + "learning_rate": 9.46116504854369e-06, + "loss": 0.1657, + "step": 787 + }, + { + "epoch": 0.5458953931416696, + "grad_norm": 1.0887234210968018, + "learning_rate": 9.460471567267685e-06, + "loss": 0.1708, + "step": 788 + }, + { + "epoch": 0.5465881537928645, + "grad_norm": 1.056678056716919, + "learning_rate": 9.45977808599168e-06, + "loss": 0.1517, + "step": 789 + }, + { + "epoch": 0.5472809144440596, + "grad_norm": 1.0199482440948486, + "learning_rate": 9.459084604715674e-06, + "loss": 0.1447, + "step": 790 + }, + { + "epoch": 0.5479736750952546, + "grad_norm": 1.1114829778671265, + "learning_rate": 9.458391123439668e-06, + "loss": 0.1788, + "step": 791 + }, + { + "epoch": 0.5486664357464496, + "grad_norm": 1.0675311088562012, + "learning_rate": 9.457697642163663e-06, + "loss": 0.1678, + "step": 792 + }, + { + "epoch": 0.5493591963976446, + "grad_norm": 1.1933588981628418, + "learning_rate": 9.457004160887658e-06, + "loss": 0.1897, + "step": 793 + }, + { + "epoch": 0.5500519570488396, + "grad_norm": 0.9565851092338562, + "learning_rate": 9.45631067961165e-06, + "loss": 0.1556, + "step": 794 + }, + { + "epoch": 0.5507447177000346, + "grad_norm": 1.0294686555862427, + "learning_rate": 9.455617198335646e-06, + "loss": 0.1564, + "step": 795 + }, + { + "epoch": 0.5514374783512297, + "grad_norm": 1.1544939279556274, + "learning_rate": 9.454923717059639e-06, + "loss": 0.1808, + "step": 796 + }, + { + "epoch": 0.5521302390024246, + "grad_norm": 1.0158432722091675, + "learning_rate": 9.454230235783634e-06, + "loss": 0.1758, + "step": 797 + }, + { + "epoch": 0.5528229996536197, + "grad_norm": 1.0488780736923218, + "learning_rate": 9.453536754507629e-06, + "loss": 0.1514, + "step": 798 + }, + { + "epoch": 0.5535157603048146, + "grad_norm": 1.0166054964065552, + "learning_rate": 9.452843273231624e-06, + "loss": 0.1699, + "step": 799 + }, + { + "epoch": 0.5542085209560097, + "grad_norm": 1.273810863494873, + "learning_rate": 9.452149791955619e-06, + "loss": 0.1872, + "step": 800 + }, + { + "epoch": 0.5549012816072048, + "grad_norm": 1.1096241474151611, + "learning_rate": 9.451456310679612e-06, + "loss": 0.2005, + "step": 801 + }, + { + "epoch": 0.5555940422583997, + "grad_norm": 0.938686192035675, + "learning_rate": 9.450762829403607e-06, + "loss": 0.1365, + "step": 802 + }, + { + "epoch": 0.5562868029095948, + "grad_norm": 1.1645365953445435, + "learning_rate": 9.450069348127602e-06, + "loss": 0.1654, + "step": 803 + }, + { + "epoch": 0.5569795635607897, + "grad_norm": 1.0550135374069214, + "learning_rate": 9.449375866851595e-06, + "loss": 0.147, + "step": 804 + }, + { + "epoch": 0.5576723242119848, + "grad_norm": 1.1085137128829956, + "learning_rate": 9.44868238557559e-06, + "loss": 0.17, + "step": 805 + }, + { + "epoch": 0.5583650848631798, + "grad_norm": 1.0627312660217285, + "learning_rate": 9.447988904299585e-06, + "loss": 0.166, + "step": 806 + }, + { + "epoch": 0.5590578455143748, + "grad_norm": 1.2298170328140259, + "learning_rate": 9.44729542302358e-06, + "loss": 0.1824, + "step": 807 + }, + { + "epoch": 0.5597506061655698, + "grad_norm": 1.083337426185608, + "learning_rate": 9.446601941747575e-06, + "loss": 0.1601, + "step": 808 + }, + { + "epoch": 0.5604433668167648, + "grad_norm": 1.0384013652801514, + "learning_rate": 9.445908460471568e-06, + "loss": 0.159, + "step": 809 + }, + { + "epoch": 0.5611361274679598, + "grad_norm": 1.178229808807373, + "learning_rate": 9.445214979195563e-06, + "loss": 0.1669, + "step": 810 + }, + { + "epoch": 0.5618288881191549, + "grad_norm": 1.1300045251846313, + "learning_rate": 9.444521497919556e-06, + "loss": 0.1729, + "step": 811 + }, + { + "epoch": 0.5625216487703498, + "grad_norm": 0.940787672996521, + "learning_rate": 9.443828016643551e-06, + "loss": 0.1478, + "step": 812 + }, + { + "epoch": 0.5632144094215449, + "grad_norm": 1.126418948173523, + "learning_rate": 9.443134535367546e-06, + "loss": 0.1962, + "step": 813 + }, + { + "epoch": 0.5639071700727398, + "grad_norm": 1.0317989587783813, + "learning_rate": 9.44244105409154e-06, + "loss": 0.1548, + "step": 814 + }, + { + "epoch": 0.5645999307239349, + "grad_norm": 1.2981680631637573, + "learning_rate": 9.441747572815534e-06, + "loss": 0.1905, + "step": 815 + }, + { + "epoch": 0.5652926913751299, + "grad_norm": 0.9974861741065979, + "learning_rate": 9.44105409153953e-06, + "loss": 0.1447, + "step": 816 + }, + { + "epoch": 0.5659854520263249, + "grad_norm": 1.0251084566116333, + "learning_rate": 9.440360610263524e-06, + "loss": 0.146, + "step": 817 + }, + { + "epoch": 0.5666782126775199, + "grad_norm": 0.9431449174880981, + "learning_rate": 9.439667128987519e-06, + "loss": 0.1516, + "step": 818 + }, + { + "epoch": 0.5673709733287149, + "grad_norm": 1.076874852180481, + "learning_rate": 9.438973647711512e-06, + "loss": 0.1747, + "step": 819 + }, + { + "epoch": 0.56806373397991, + "grad_norm": 1.1013811826705933, + "learning_rate": 9.438280166435507e-06, + "loss": 0.1674, + "step": 820 + }, + { + "epoch": 0.568756494631105, + "grad_norm": 1.1640971899032593, + "learning_rate": 9.4375866851595e-06, + "loss": 0.1457, + "step": 821 + }, + { + "epoch": 0.5694492552823, + "grad_norm": 1.1467467546463013, + "learning_rate": 9.436893203883495e-06, + "loss": 0.1752, + "step": 822 + }, + { + "epoch": 0.570142015933495, + "grad_norm": 1.1216028928756714, + "learning_rate": 9.43619972260749e-06, + "loss": 0.169, + "step": 823 + }, + { + "epoch": 0.57083477658469, + "grad_norm": 1.068642020225525, + "learning_rate": 9.435506241331485e-06, + "loss": 0.1912, + "step": 824 + }, + { + "epoch": 0.571527537235885, + "grad_norm": 1.0372047424316406, + "learning_rate": 9.43481276005548e-06, + "loss": 0.1722, + "step": 825 + }, + { + "epoch": 0.5722202978870801, + "grad_norm": 1.0227689743041992, + "learning_rate": 9.434119278779473e-06, + "loss": 0.1651, + "step": 826 + }, + { + "epoch": 0.572913058538275, + "grad_norm": 0.9313246607780457, + "learning_rate": 9.433425797503468e-06, + "loss": 0.1704, + "step": 827 + }, + { + "epoch": 0.5736058191894701, + "grad_norm": 0.9683116674423218, + "learning_rate": 9.432732316227463e-06, + "loss": 0.1536, + "step": 828 + }, + { + "epoch": 0.574298579840665, + "grad_norm": 1.049546241760254, + "learning_rate": 9.432038834951457e-06, + "loss": 0.1934, + "step": 829 + }, + { + "epoch": 0.5749913404918601, + "grad_norm": 1.0880478620529175, + "learning_rate": 9.431345353675451e-06, + "loss": 0.167, + "step": 830 + }, + { + "epoch": 0.5756841011430551, + "grad_norm": 0.9867199659347534, + "learning_rate": 9.430651872399445e-06, + "loss": 0.1683, + "step": 831 + }, + { + "epoch": 0.5763768617942501, + "grad_norm": 1.1325767040252686, + "learning_rate": 9.42995839112344e-06, + "loss": 0.1769, + "step": 832 + }, + { + "epoch": 0.5770696224454451, + "grad_norm": 1.094765543937683, + "learning_rate": 9.429264909847435e-06, + "loss": 0.1488, + "step": 833 + }, + { + "epoch": 0.5777623830966401, + "grad_norm": 1.150747537612915, + "learning_rate": 9.42857142857143e-06, + "loss": 0.1855, + "step": 834 + }, + { + "epoch": 0.5784551437478351, + "grad_norm": 1.2596840858459473, + "learning_rate": 9.427877947295424e-06, + "loss": 0.1901, + "step": 835 + }, + { + "epoch": 0.5791479043990302, + "grad_norm": 1.0859004259109497, + "learning_rate": 9.427184466019418e-06, + "loss": 0.1711, + "step": 836 + }, + { + "epoch": 0.5798406650502251, + "grad_norm": 1.0970268249511719, + "learning_rate": 9.426490984743413e-06, + "loss": 0.1829, + "step": 837 + }, + { + "epoch": 0.5805334257014202, + "grad_norm": 0.987734317779541, + "learning_rate": 9.425797503467408e-06, + "loss": 0.1337, + "step": 838 + }, + { + "epoch": 0.5812261863526151, + "grad_norm": 0.9403496980667114, + "learning_rate": 9.4251040221914e-06, + "loss": 0.1426, + "step": 839 + }, + { + "epoch": 0.5819189470038102, + "grad_norm": 1.0452890396118164, + "learning_rate": 9.424410540915396e-06, + "loss": 0.1731, + "step": 840 + }, + { + "epoch": 0.5826117076550053, + "grad_norm": 1.207574486732483, + "learning_rate": 9.42371705963939e-06, + "loss": 0.1846, + "step": 841 + }, + { + "epoch": 0.5833044683062002, + "grad_norm": 1.1449249982833862, + "learning_rate": 9.423023578363386e-06, + "loss": 0.1648, + "step": 842 + }, + { + "epoch": 0.5839972289573953, + "grad_norm": 1.1291990280151367, + "learning_rate": 9.42233009708738e-06, + "loss": 0.1725, + "step": 843 + }, + { + "epoch": 0.5846899896085902, + "grad_norm": 1.0940741300582886, + "learning_rate": 9.421636615811374e-06, + "loss": 0.1713, + "step": 844 + }, + { + "epoch": 0.5853827502597853, + "grad_norm": 1.090114712715149, + "learning_rate": 9.420943134535369e-06, + "loss": 0.1899, + "step": 845 + }, + { + "epoch": 0.5860755109109802, + "grad_norm": 1.1002274751663208, + "learning_rate": 9.420249653259362e-06, + "loss": 0.1623, + "step": 846 + }, + { + "epoch": 0.5867682715621753, + "grad_norm": 1.200971245765686, + "learning_rate": 9.419556171983357e-06, + "loss": 0.1813, + "step": 847 + }, + { + "epoch": 0.5874610322133703, + "grad_norm": 1.0063960552215576, + "learning_rate": 9.418862690707352e-06, + "loss": 0.1723, + "step": 848 + }, + { + "epoch": 0.5881537928645653, + "grad_norm": 1.2347016334533691, + "learning_rate": 9.418169209431347e-06, + "loss": 0.1865, + "step": 849 + }, + { + "epoch": 0.5888465535157603, + "grad_norm": 1.1809277534484863, + "learning_rate": 9.41747572815534e-06, + "loss": 0.1778, + "step": 850 + }, + { + "epoch": 0.5895393141669553, + "grad_norm": 0.9739560484886169, + "learning_rate": 9.416782246879335e-06, + "loss": 0.1391, + "step": 851 + }, + { + "epoch": 0.5902320748181503, + "grad_norm": 1.0778708457946777, + "learning_rate": 9.41608876560333e-06, + "loss": 0.1648, + "step": 852 + }, + { + "epoch": 0.5909248354693454, + "grad_norm": 1.2798731327056885, + "learning_rate": 9.415395284327325e-06, + "loss": 0.1685, + "step": 853 + }, + { + "epoch": 0.5916175961205403, + "grad_norm": 1.5980770587921143, + "learning_rate": 9.414701803051318e-06, + "loss": 0.1503, + "step": 854 + }, + { + "epoch": 0.5923103567717354, + "grad_norm": 1.0819963216781616, + "learning_rate": 9.414008321775313e-06, + "loss": 0.1435, + "step": 855 + }, + { + "epoch": 0.5930031174229303, + "grad_norm": 1.1285098791122437, + "learning_rate": 9.413314840499306e-06, + "loss": 0.171, + "step": 856 + }, + { + "epoch": 0.5936958780741254, + "grad_norm": 0.9913773536682129, + "learning_rate": 9.412621359223301e-06, + "loss": 0.1453, + "step": 857 + }, + { + "epoch": 0.5943886387253204, + "grad_norm": 1.1429187059402466, + "learning_rate": 9.411927877947296e-06, + "loss": 0.1559, + "step": 858 + }, + { + "epoch": 0.5950813993765154, + "grad_norm": 1.103598952293396, + "learning_rate": 9.411234396671291e-06, + "loss": 0.1334, + "step": 859 + }, + { + "epoch": 0.5957741600277104, + "grad_norm": 1.1337062120437622, + "learning_rate": 9.410540915395286e-06, + "loss": 0.1663, + "step": 860 + }, + { + "epoch": 0.5964669206789054, + "grad_norm": 1.2121385335922241, + "learning_rate": 9.40984743411928e-06, + "loss": 0.1767, + "step": 861 + }, + { + "epoch": 0.5971596813301004, + "grad_norm": 1.0137088298797607, + "learning_rate": 9.409153952843274e-06, + "loss": 0.1552, + "step": 862 + }, + { + "epoch": 0.5978524419812955, + "grad_norm": 1.0943708419799805, + "learning_rate": 9.408460471567269e-06, + "loss": 0.1593, + "step": 863 + }, + { + "epoch": 0.5985452026324904, + "grad_norm": 1.1034287214279175, + "learning_rate": 9.407766990291262e-06, + "loss": 0.1607, + "step": 864 + }, + { + "epoch": 0.5992379632836855, + "grad_norm": 1.1569900512695312, + "learning_rate": 9.407073509015257e-06, + "loss": 0.1645, + "step": 865 + }, + { + "epoch": 0.5999307239348804, + "grad_norm": 1.1737624406814575, + "learning_rate": 9.406380027739252e-06, + "loss": 0.1903, + "step": 866 + }, + { + "epoch": 0.6006234845860755, + "grad_norm": 0.9703179001808167, + "learning_rate": 9.405686546463247e-06, + "loss": 0.1402, + "step": 867 + }, + { + "epoch": 0.6013162452372706, + "grad_norm": 1.1867642402648926, + "learning_rate": 9.404993065187242e-06, + "loss": 0.1722, + "step": 868 + }, + { + "epoch": 0.6020090058884655, + "grad_norm": 1.0768351554870605, + "learning_rate": 9.404299583911235e-06, + "loss": 0.1674, + "step": 869 + }, + { + "epoch": 0.6027017665396606, + "grad_norm": 1.1360597610473633, + "learning_rate": 9.40360610263523e-06, + "loss": 0.1669, + "step": 870 + }, + { + "epoch": 0.6033945271908555, + "grad_norm": 1.0119037628173828, + "learning_rate": 9.402912621359223e-06, + "loss": 0.16, + "step": 871 + }, + { + "epoch": 0.6040872878420506, + "grad_norm": 1.1194970607757568, + "learning_rate": 9.402219140083218e-06, + "loss": 0.1936, + "step": 872 + }, + { + "epoch": 0.6047800484932456, + "grad_norm": 1.0727914571762085, + "learning_rate": 9.401525658807213e-06, + "loss": 0.152, + "step": 873 + }, + { + "epoch": 0.6054728091444406, + "grad_norm": 1.087863564491272, + "learning_rate": 9.400832177531207e-06, + "loss": 0.15, + "step": 874 + }, + { + "epoch": 0.6061655697956356, + "grad_norm": 0.9721863269805908, + "learning_rate": 9.400138696255201e-06, + "loss": 0.1494, + "step": 875 + }, + { + "epoch": 0.6068583304468306, + "grad_norm": 1.1647229194641113, + "learning_rate": 9.399445214979196e-06, + "loss": 0.1827, + "step": 876 + }, + { + "epoch": 0.6075510910980256, + "grad_norm": 1.0240576267242432, + "learning_rate": 9.398751733703191e-06, + "loss": 0.157, + "step": 877 + }, + { + "epoch": 0.6082438517492207, + "grad_norm": 1.128947377204895, + "learning_rate": 9.398058252427186e-06, + "loss": 0.171, + "step": 878 + }, + { + "epoch": 0.6089366124004156, + "grad_norm": 1.165592908859253, + "learning_rate": 9.39736477115118e-06, + "loss": 0.1586, + "step": 879 + }, + { + "epoch": 0.6096293730516107, + "grad_norm": 1.0030488967895508, + "learning_rate": 9.396671289875174e-06, + "loss": 0.1414, + "step": 880 + }, + { + "epoch": 0.6103221337028056, + "grad_norm": 1.1792995929718018, + "learning_rate": 9.395977808599168e-06, + "loss": 0.2094, + "step": 881 + }, + { + "epoch": 0.6110148943540007, + "grad_norm": 1.0264792442321777, + "learning_rate": 9.395284327323163e-06, + "loss": 0.1515, + "step": 882 + }, + { + "epoch": 0.6117076550051957, + "grad_norm": 1.0973026752471924, + "learning_rate": 9.394590846047158e-06, + "loss": 0.1903, + "step": 883 + }, + { + "epoch": 0.6124004156563907, + "grad_norm": 1.005173921585083, + "learning_rate": 9.393897364771152e-06, + "loss": 0.1404, + "step": 884 + }, + { + "epoch": 0.6130931763075858, + "grad_norm": 1.0814557075500488, + "learning_rate": 9.393203883495147e-06, + "loss": 0.2041, + "step": 885 + }, + { + "epoch": 0.6137859369587807, + "grad_norm": 1.2196296453475952, + "learning_rate": 9.39251040221914e-06, + "loss": 0.1873, + "step": 886 + }, + { + "epoch": 0.6144786976099758, + "grad_norm": 0.9337116479873657, + "learning_rate": 9.391816920943136e-06, + "loss": 0.1468, + "step": 887 + }, + { + "epoch": 0.6151714582611708, + "grad_norm": 0.9686228036880493, + "learning_rate": 9.39112343966713e-06, + "loss": 0.1357, + "step": 888 + }, + { + "epoch": 0.6158642189123658, + "grad_norm": 1.1271620988845825, + "learning_rate": 9.390429958391124e-06, + "loss": 0.1891, + "step": 889 + }, + { + "epoch": 0.6165569795635608, + "grad_norm": 1.0006288290023804, + "learning_rate": 9.389736477115119e-06, + "loss": 0.1632, + "step": 890 + }, + { + "epoch": 0.6172497402147558, + "grad_norm": 1.1276105642318726, + "learning_rate": 9.389042995839112e-06, + "loss": 0.1847, + "step": 891 + }, + { + "epoch": 0.6179425008659508, + "grad_norm": 0.9748163223266602, + "learning_rate": 9.388349514563107e-06, + "loss": 0.1561, + "step": 892 + }, + { + "epoch": 0.6186352615171459, + "grad_norm": 1.104117751121521, + "learning_rate": 9.387656033287102e-06, + "loss": 0.184, + "step": 893 + }, + { + "epoch": 0.6193280221683408, + "grad_norm": 0.9743916392326355, + "learning_rate": 9.386962552011097e-06, + "loss": 0.1528, + "step": 894 + }, + { + "epoch": 0.6200207828195359, + "grad_norm": 1.0051432847976685, + "learning_rate": 9.386269070735092e-06, + "loss": 0.1343, + "step": 895 + }, + { + "epoch": 0.6207135434707308, + "grad_norm": 1.1347147226333618, + "learning_rate": 9.385575589459085e-06, + "loss": 0.1689, + "step": 896 + }, + { + "epoch": 0.6214063041219259, + "grad_norm": 1.051589012145996, + "learning_rate": 9.38488210818308e-06, + "loss": 0.1561, + "step": 897 + }, + { + "epoch": 0.6220990647731209, + "grad_norm": 1.0692572593688965, + "learning_rate": 9.384188626907075e-06, + "loss": 0.1738, + "step": 898 + }, + { + "epoch": 0.6227918254243159, + "grad_norm": 0.9588863253593445, + "learning_rate": 9.383495145631068e-06, + "loss": 0.1536, + "step": 899 + }, + { + "epoch": 0.6234845860755109, + "grad_norm": 0.991447389125824, + "learning_rate": 9.382801664355063e-06, + "loss": 0.1435, + "step": 900 + }, + { + "epoch": 0.6241773467267059, + "grad_norm": 1.0007715225219727, + "learning_rate": 9.382108183079058e-06, + "loss": 0.1481, + "step": 901 + }, + { + "epoch": 0.6248701073779009, + "grad_norm": 1.085169792175293, + "learning_rate": 9.381414701803053e-06, + "loss": 0.1542, + "step": 902 + }, + { + "epoch": 0.625562868029096, + "grad_norm": 0.9361264109611511, + "learning_rate": 9.380721220527048e-06, + "loss": 0.1316, + "step": 903 + }, + { + "epoch": 0.6262556286802909, + "grad_norm": 1.097924828529358, + "learning_rate": 9.380027739251041e-06, + "loss": 0.1684, + "step": 904 + }, + { + "epoch": 0.626948389331486, + "grad_norm": 1.149580717086792, + "learning_rate": 9.379334257975036e-06, + "loss": 0.1706, + "step": 905 + }, + { + "epoch": 0.627641149982681, + "grad_norm": 1.1573041677474976, + "learning_rate": 9.37864077669903e-06, + "loss": 0.1787, + "step": 906 + }, + { + "epoch": 0.628333910633876, + "grad_norm": 1.0068886280059814, + "learning_rate": 9.377947295423024e-06, + "loss": 0.1437, + "step": 907 + }, + { + "epoch": 0.6290266712850711, + "grad_norm": 1.1588560342788696, + "learning_rate": 9.377253814147019e-06, + "loss": 0.1749, + "step": 908 + }, + { + "epoch": 0.629719431936266, + "grad_norm": 1.0714843273162842, + "learning_rate": 9.376560332871012e-06, + "loss": 0.1578, + "step": 909 + }, + { + "epoch": 0.6304121925874611, + "grad_norm": 1.195635199546814, + "learning_rate": 9.375866851595007e-06, + "loss": 0.181, + "step": 910 + }, + { + "epoch": 0.631104953238656, + "grad_norm": 1.0656758546829224, + "learning_rate": 9.375173370319002e-06, + "loss": 0.1799, + "step": 911 + }, + { + "epoch": 0.6317977138898511, + "grad_norm": 0.9909989833831787, + "learning_rate": 9.374479889042997e-06, + "loss": 0.1494, + "step": 912 + }, + { + "epoch": 0.6324904745410461, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.373786407766992e-06, + "loss": 0.1609, + "step": 913 + }, + { + "epoch": 0.6331832351922411, + "grad_norm": 1.131453514099121, + "learning_rate": 9.373092926490985e-06, + "loss": 0.1554, + "step": 914 + }, + { + "epoch": 0.6338759958434361, + "grad_norm": 1.067887783050537, + "learning_rate": 9.37239944521498e-06, + "loss": 0.1474, + "step": 915 + }, + { + "epoch": 0.6345687564946311, + "grad_norm": 1.0186927318572998, + "learning_rate": 9.371705963938973e-06, + "loss": 0.1753, + "step": 916 + }, + { + "epoch": 0.6352615171458261, + "grad_norm": 1.1581684350967407, + "learning_rate": 9.371012482662968e-06, + "loss": 0.1716, + "step": 917 + }, + { + "epoch": 0.6359542777970212, + "grad_norm": 1.0602717399597168, + "learning_rate": 9.370319001386963e-06, + "loss": 0.1518, + "step": 918 + }, + { + "epoch": 0.6366470384482161, + "grad_norm": 0.9391573071479797, + "learning_rate": 9.369625520110958e-06, + "loss": 0.154, + "step": 919 + }, + { + "epoch": 0.6373397990994112, + "grad_norm": 0.9334474802017212, + "learning_rate": 9.368932038834953e-06, + "loss": 0.1729, + "step": 920 + }, + { + "epoch": 0.6380325597506061, + "grad_norm": 1.1120349168777466, + "learning_rate": 9.368238557558946e-06, + "loss": 0.1823, + "step": 921 + }, + { + "epoch": 0.6387253204018012, + "grad_norm": 1.159719705581665, + "learning_rate": 9.367545076282941e-06, + "loss": 0.1734, + "step": 922 + }, + { + "epoch": 0.6394180810529961, + "grad_norm": 0.9645901322364807, + "learning_rate": 9.366851595006936e-06, + "loss": 0.1622, + "step": 923 + }, + { + "epoch": 0.6401108417041912, + "grad_norm": 1.1247612237930298, + "learning_rate": 9.36615811373093e-06, + "loss": 0.1832, + "step": 924 + }, + { + "epoch": 0.6408036023553862, + "grad_norm": 1.1481128931045532, + "learning_rate": 9.365464632454924e-06, + "loss": 0.1846, + "step": 925 + }, + { + "epoch": 0.6414963630065812, + "grad_norm": 1.0706948041915894, + "learning_rate": 9.36477115117892e-06, + "loss": 0.1957, + "step": 926 + }, + { + "epoch": 0.6421891236577763, + "grad_norm": 1.136802315711975, + "learning_rate": 9.364077669902913e-06, + "loss": 0.1927, + "step": 927 + }, + { + "epoch": 0.6428818843089712, + "grad_norm": 1.1108746528625488, + "learning_rate": 9.363384188626908e-06, + "loss": 0.1714, + "step": 928 + }, + { + "epoch": 0.6435746449601663, + "grad_norm": 1.1542079448699951, + "learning_rate": 9.362690707350902e-06, + "loss": 0.1624, + "step": 929 + }, + { + "epoch": 0.6442674056113613, + "grad_norm": 0.9668132066726685, + "learning_rate": 9.361997226074897e-06, + "loss": 0.1498, + "step": 930 + }, + { + "epoch": 0.6449601662625563, + "grad_norm": 0.997539222240448, + "learning_rate": 9.36130374479889e-06, + "loss": 0.1684, + "step": 931 + }, + { + "epoch": 0.6456529269137513, + "grad_norm": 1.1347752809524536, + "learning_rate": 9.360610263522886e-06, + "loss": 0.1916, + "step": 932 + }, + { + "epoch": 0.6463456875649463, + "grad_norm": 1.0294948816299438, + "learning_rate": 9.35991678224688e-06, + "loss": 0.1711, + "step": 933 + }, + { + "epoch": 0.6470384482161413, + "grad_norm": 1.0586535930633545, + "learning_rate": 9.359223300970874e-06, + "loss": 0.1698, + "step": 934 + }, + { + "epoch": 0.6477312088673364, + "grad_norm": 0.9234961867332458, + "learning_rate": 9.358529819694869e-06, + "loss": 0.1513, + "step": 935 + }, + { + "epoch": 0.6484239695185313, + "grad_norm": 1.0497515201568604, + "learning_rate": 9.357836338418864e-06, + "loss": 0.1767, + "step": 936 + }, + { + "epoch": 0.6491167301697264, + "grad_norm": 1.0686051845550537, + "learning_rate": 9.357142857142859e-06, + "loss": 0.1691, + "step": 937 + }, + { + "epoch": 0.6498094908209213, + "grad_norm": 1.0756900310516357, + "learning_rate": 9.356449375866853e-06, + "loss": 0.169, + "step": 938 + }, + { + "epoch": 0.6505022514721164, + "grad_norm": 1.0305697917938232, + "learning_rate": 9.355755894590847e-06, + "loss": 0.1614, + "step": 939 + }, + { + "epoch": 0.6511950121233114, + "grad_norm": 1.0718286037445068, + "learning_rate": 9.355062413314842e-06, + "loss": 0.1552, + "step": 940 + }, + { + "epoch": 0.6518877727745064, + "grad_norm": 0.9616384506225586, + "learning_rate": 9.354368932038835e-06, + "loss": 0.1328, + "step": 941 + }, + { + "epoch": 0.6525805334257014, + "grad_norm": 1.1087232828140259, + "learning_rate": 9.35367545076283e-06, + "loss": 0.1723, + "step": 942 + }, + { + "epoch": 0.6532732940768964, + "grad_norm": 1.084382176399231, + "learning_rate": 9.352981969486825e-06, + "loss": 0.1689, + "step": 943 + }, + { + "epoch": 0.6539660547280914, + "grad_norm": 1.0610822439193726, + "learning_rate": 9.35228848821082e-06, + "loss": 0.161, + "step": 944 + }, + { + "epoch": 0.6546588153792865, + "grad_norm": 1.0572192668914795, + "learning_rate": 9.351595006934815e-06, + "loss": 0.1412, + "step": 945 + }, + { + "epoch": 0.6553515760304814, + "grad_norm": 0.9759120941162109, + "learning_rate": 9.350901525658808e-06, + "loss": 0.146, + "step": 946 + }, + { + "epoch": 0.6560443366816765, + "grad_norm": 1.085342526435852, + "learning_rate": 9.350208044382803e-06, + "loss": 0.1497, + "step": 947 + }, + { + "epoch": 0.6567370973328714, + "grad_norm": 0.9910585284233093, + "learning_rate": 9.349514563106798e-06, + "loss": 0.1466, + "step": 948 + }, + { + "epoch": 0.6574298579840665, + "grad_norm": 0.9044740200042725, + "learning_rate": 9.348821081830791e-06, + "loss": 0.1232, + "step": 949 + }, + { + "epoch": 0.6581226186352616, + "grad_norm": 1.0807905197143555, + "learning_rate": 9.348127600554786e-06, + "loss": 0.1817, + "step": 950 + }, + { + "epoch": 0.6588153792864565, + "grad_norm": 1.0731427669525146, + "learning_rate": 9.347434119278779e-06, + "loss": 0.1786, + "step": 951 + }, + { + "epoch": 0.6595081399376516, + "grad_norm": 1.138004183769226, + "learning_rate": 9.346740638002774e-06, + "loss": 0.1871, + "step": 952 + }, + { + "epoch": 0.6602009005888465, + "grad_norm": 1.1381019353866577, + "learning_rate": 9.346047156726769e-06, + "loss": 0.186, + "step": 953 + }, + { + "epoch": 0.6608936612400416, + "grad_norm": 1.1066994667053223, + "learning_rate": 9.345353675450764e-06, + "loss": 0.1664, + "step": 954 + }, + { + "epoch": 0.6615864218912366, + "grad_norm": 1.055285096168518, + "learning_rate": 9.344660194174759e-06, + "loss": 0.1848, + "step": 955 + }, + { + "epoch": 0.6622791825424316, + "grad_norm": 1.1309658288955688, + "learning_rate": 9.343966712898752e-06, + "loss": 0.1718, + "step": 956 + }, + { + "epoch": 0.6629719431936266, + "grad_norm": 1.0387343168258667, + "learning_rate": 9.343273231622747e-06, + "loss": 0.1842, + "step": 957 + }, + { + "epoch": 0.6636647038448216, + "grad_norm": 0.9196197986602783, + "learning_rate": 9.342579750346742e-06, + "loss": 0.1239, + "step": 958 + }, + { + "epoch": 0.6643574644960166, + "grad_norm": 1.1281828880310059, + "learning_rate": 9.341886269070735e-06, + "loss": 0.2056, + "step": 959 + }, + { + "epoch": 0.6650502251472117, + "grad_norm": 0.9667572975158691, + "learning_rate": 9.34119278779473e-06, + "loss": 0.1448, + "step": 960 + }, + { + "epoch": 0.6657429857984066, + "grad_norm": 1.164891004562378, + "learning_rate": 9.340499306518725e-06, + "loss": 0.1525, + "step": 961 + }, + { + "epoch": 0.6664357464496017, + "grad_norm": 1.0004878044128418, + "learning_rate": 9.33980582524272e-06, + "loss": 0.1797, + "step": 962 + }, + { + "epoch": 0.6671285071007966, + "grad_norm": 1.0409966707229614, + "learning_rate": 9.339112343966715e-06, + "loss": 0.1636, + "step": 963 + }, + { + "epoch": 0.6678212677519917, + "grad_norm": 1.0098788738250732, + "learning_rate": 9.338418862690708e-06, + "loss": 0.1641, + "step": 964 + }, + { + "epoch": 0.6685140284031867, + "grad_norm": 1.109473466873169, + "learning_rate": 9.337725381414703e-06, + "loss": 0.1467, + "step": 965 + }, + { + "epoch": 0.6692067890543817, + "grad_norm": 1.0836076736450195, + "learning_rate": 9.337031900138696e-06, + "loss": 0.1467, + "step": 966 + }, + { + "epoch": 0.6698995497055767, + "grad_norm": 1.0216044187545776, + "learning_rate": 9.336338418862691e-06, + "loss": 0.1667, + "step": 967 + }, + { + "epoch": 0.6705923103567717, + "grad_norm": 0.8676590919494629, + "learning_rate": 9.335644937586686e-06, + "loss": 0.1303, + "step": 968 + }, + { + "epoch": 0.6712850710079667, + "grad_norm": 1.0340557098388672, + "learning_rate": 9.33495145631068e-06, + "loss": 0.146, + "step": 969 + }, + { + "epoch": 0.6719778316591618, + "grad_norm": 1.0925043821334839, + "learning_rate": 9.334257975034674e-06, + "loss": 0.1651, + "step": 970 + }, + { + "epoch": 0.6726705923103568, + "grad_norm": 1.1762585639953613, + "learning_rate": 9.33356449375867e-06, + "loss": 0.1832, + "step": 971 + }, + { + "epoch": 0.6733633529615518, + "grad_norm": 1.0555940866470337, + "learning_rate": 9.332871012482664e-06, + "loss": 0.175, + "step": 972 + }, + { + "epoch": 0.6740561136127468, + "grad_norm": 1.0787198543548584, + "learning_rate": 9.33217753120666e-06, + "loss": 0.1557, + "step": 973 + }, + { + "epoch": 0.6747488742639418, + "grad_norm": 1.0291804075241089, + "learning_rate": 9.331484049930652e-06, + "loss": 0.1605, + "step": 974 + }, + { + "epoch": 0.6754416349151369, + "grad_norm": 1.0406602621078491, + "learning_rate": 9.330790568654647e-06, + "loss": 0.1642, + "step": 975 + }, + { + "epoch": 0.6761343955663318, + "grad_norm": 0.9932894110679626, + "learning_rate": 9.33009708737864e-06, + "loss": 0.1631, + "step": 976 + }, + { + "epoch": 0.6768271562175269, + "grad_norm": 1.0147134065628052, + "learning_rate": 9.329403606102636e-06, + "loss": 0.1375, + "step": 977 + }, + { + "epoch": 0.6775199168687218, + "grad_norm": 1.0169036388397217, + "learning_rate": 9.32871012482663e-06, + "loss": 0.1525, + "step": 978 + }, + { + "epoch": 0.6782126775199169, + "grad_norm": 1.1349810361862183, + "learning_rate": 9.328016643550625e-06, + "loss": 0.1808, + "step": 979 + }, + { + "epoch": 0.6789054381711119, + "grad_norm": 0.9810749292373657, + "learning_rate": 9.32732316227462e-06, + "loss": 0.1394, + "step": 980 + }, + { + "epoch": 0.6795981988223069, + "grad_norm": 1.1747621297836304, + "learning_rate": 9.326629680998614e-06, + "loss": 0.1777, + "step": 981 + }, + { + "epoch": 0.6802909594735019, + "grad_norm": 1.1135560274124146, + "learning_rate": 9.325936199722609e-06, + "loss": 0.1543, + "step": 982 + }, + { + "epoch": 0.6809837201246969, + "grad_norm": 1.1176321506500244, + "learning_rate": 9.325242718446603e-06, + "loss": 0.1572, + "step": 983 + }, + { + "epoch": 0.6816764807758919, + "grad_norm": 1.020430564880371, + "learning_rate": 9.324549237170597e-06, + "loss": 0.1628, + "step": 984 + }, + { + "epoch": 0.682369241427087, + "grad_norm": 1.08804190158844, + "learning_rate": 9.323855755894592e-06, + "loss": 0.157, + "step": 985 + }, + { + "epoch": 0.6830620020782819, + "grad_norm": 1.0574597120285034, + "learning_rate": 9.323162274618585e-06, + "loss": 0.1508, + "step": 986 + }, + { + "epoch": 0.683754762729477, + "grad_norm": 1.0497900247573853, + "learning_rate": 9.32246879334258e-06, + "loss": 0.1711, + "step": 987 + }, + { + "epoch": 0.6844475233806719, + "grad_norm": 1.1266224384307861, + "learning_rate": 9.321775312066575e-06, + "loss": 0.1483, + "step": 988 + }, + { + "epoch": 0.685140284031867, + "grad_norm": 1.0701349973678589, + "learning_rate": 9.32108183079057e-06, + "loss": 0.1666, + "step": 989 + }, + { + "epoch": 0.685833044683062, + "grad_norm": 1.1562618017196655, + "learning_rate": 9.320388349514565e-06, + "loss": 0.1717, + "step": 990 + }, + { + "epoch": 0.686525805334257, + "grad_norm": 1.1472821235656738, + "learning_rate": 9.319694868238558e-06, + "loss": 0.1639, + "step": 991 + }, + { + "epoch": 0.6872185659854521, + "grad_norm": 1.0069383382797241, + "learning_rate": 9.319001386962553e-06, + "loss": 0.1675, + "step": 992 + }, + { + "epoch": 0.687911326636647, + "grad_norm": 1.1724313497543335, + "learning_rate": 9.318307905686548e-06, + "loss": 0.1765, + "step": 993 + }, + { + "epoch": 0.6886040872878421, + "grad_norm": 0.966568112373352, + "learning_rate": 9.317614424410541e-06, + "loss": 0.1397, + "step": 994 + }, + { + "epoch": 0.6892968479390371, + "grad_norm": 1.0228078365325928, + "learning_rate": 9.316920943134536e-06, + "loss": 0.148, + "step": 995 + }, + { + "epoch": 0.6899896085902321, + "grad_norm": 1.229444980621338, + "learning_rate": 9.31622746185853e-06, + "loss": 0.1855, + "step": 996 + }, + { + "epoch": 0.6906823692414271, + "grad_norm": 1.1054368019104004, + "learning_rate": 9.315533980582526e-06, + "loss": 0.2115, + "step": 997 + }, + { + "epoch": 0.6913751298926221, + "grad_norm": 1.0250862836837769, + "learning_rate": 9.31484049930652e-06, + "loss": 0.1479, + "step": 998 + }, + { + "epoch": 0.6920678905438171, + "grad_norm": 1.1154900789260864, + "learning_rate": 9.314147018030514e-06, + "loss": 0.1697, + "step": 999 + }, + { + "epoch": 0.6927606511950122, + "grad_norm": 1.1613211631774902, + "learning_rate": 9.313453536754509e-06, + "loss": 0.1604, + "step": 1000 + }, + { + "epoch": 0.6934534118462071, + "grad_norm": 1.1166093349456787, + "learning_rate": 9.312760055478502e-06, + "loss": 0.1304, + "step": 1001 + }, + { + "epoch": 0.6941461724974022, + "grad_norm": 1.0721238851547241, + "learning_rate": 9.312066574202497e-06, + "loss": 0.1699, + "step": 1002 + }, + { + "epoch": 0.6948389331485971, + "grad_norm": 1.1429399251937866, + "learning_rate": 9.311373092926492e-06, + "loss": 0.1922, + "step": 1003 + }, + { + "epoch": 0.6955316937997922, + "grad_norm": 0.9935634732246399, + "learning_rate": 9.310679611650487e-06, + "loss": 0.162, + "step": 1004 + }, + { + "epoch": 0.6962244544509871, + "grad_norm": 1.2032567262649536, + "learning_rate": 9.30998613037448e-06, + "loss": 0.1599, + "step": 1005 + }, + { + "epoch": 0.6969172151021822, + "grad_norm": 1.0629124641418457, + "learning_rate": 9.309292649098475e-06, + "loss": 0.169, + "step": 1006 + }, + { + "epoch": 0.6976099757533772, + "grad_norm": 0.9449934363365173, + "learning_rate": 9.30859916782247e-06, + "loss": 0.1529, + "step": 1007 + }, + { + "epoch": 0.6983027364045722, + "grad_norm": 1.0493741035461426, + "learning_rate": 9.307905686546465e-06, + "loss": 0.1621, + "step": 1008 + }, + { + "epoch": 0.6989954970557672, + "grad_norm": 1.028087854385376, + "learning_rate": 9.307212205270458e-06, + "loss": 0.1714, + "step": 1009 + }, + { + "epoch": 0.6996882577069622, + "grad_norm": 1.0470004081726074, + "learning_rate": 9.306518723994453e-06, + "loss": 0.1527, + "step": 1010 + }, + { + "epoch": 0.7003810183581572, + "grad_norm": 1.0218693017959595, + "learning_rate": 9.305825242718446e-06, + "loss": 0.143, + "step": 1011 + }, + { + "epoch": 0.7010737790093523, + "grad_norm": 1.211381196975708, + "learning_rate": 9.305131761442441e-06, + "loss": 0.1908, + "step": 1012 + }, + { + "epoch": 0.7017665396605473, + "grad_norm": 1.0903083086013794, + "learning_rate": 9.304438280166436e-06, + "loss": 0.1742, + "step": 1013 + }, + { + "epoch": 0.7024593003117423, + "grad_norm": 1.0702459812164307, + "learning_rate": 9.303744798890431e-06, + "loss": 0.163, + "step": 1014 + }, + { + "epoch": 0.7031520609629373, + "grad_norm": 1.0752493143081665, + "learning_rate": 9.303051317614426e-06, + "loss": 0.1461, + "step": 1015 + }, + { + "epoch": 0.7038448216141323, + "grad_norm": 1.2007278203964233, + "learning_rate": 9.30235783633842e-06, + "loss": 0.1593, + "step": 1016 + }, + { + "epoch": 0.7045375822653274, + "grad_norm": 1.0017162561416626, + "learning_rate": 9.301664355062414e-06, + "loss": 0.1573, + "step": 1017 + }, + { + "epoch": 0.7052303429165223, + "grad_norm": 1.1920140981674194, + "learning_rate": 9.30097087378641e-06, + "loss": 0.1812, + "step": 1018 + }, + { + "epoch": 0.7059231035677174, + "grad_norm": 1.0638477802276611, + "learning_rate": 9.300277392510402e-06, + "loss": 0.1658, + "step": 1019 + }, + { + "epoch": 0.7066158642189123, + "grad_norm": 1.1916426420211792, + "learning_rate": 9.299583911234397e-06, + "loss": 0.1749, + "step": 1020 + }, + { + "epoch": 0.7073086248701074, + "grad_norm": 0.9912903904914856, + "learning_rate": 9.298890429958392e-06, + "loss": 0.1566, + "step": 1021 + }, + { + "epoch": 0.7080013855213024, + "grad_norm": 0.9812451004981995, + "learning_rate": 9.298196948682387e-06, + "loss": 0.1295, + "step": 1022 + }, + { + "epoch": 0.7086941461724974, + "grad_norm": 1.126921534538269, + "learning_rate": 9.297503467406382e-06, + "loss": 0.1689, + "step": 1023 + }, + { + "epoch": 0.7093869068236924, + "grad_norm": 1.038448691368103, + "learning_rate": 9.296809986130375e-06, + "loss": 0.1529, + "step": 1024 + }, + { + "epoch": 0.7100796674748874, + "grad_norm": 1.2521450519561768, + "learning_rate": 9.29611650485437e-06, + "loss": 0.1731, + "step": 1025 + }, + { + "epoch": 0.7107724281260824, + "grad_norm": 0.9496546983718872, + "learning_rate": 9.295423023578364e-06, + "loss": 0.1349, + "step": 1026 + }, + { + "epoch": 0.7114651887772775, + "grad_norm": 1.1061389446258545, + "learning_rate": 9.294729542302359e-06, + "loss": 0.1638, + "step": 1027 + }, + { + "epoch": 0.7121579494284724, + "grad_norm": 1.0456463098526, + "learning_rate": 9.294036061026353e-06, + "loss": 0.1557, + "step": 1028 + }, + { + "epoch": 0.7128507100796675, + "grad_norm": 0.9909372329711914, + "learning_rate": 9.293342579750347e-06, + "loss": 0.154, + "step": 1029 + }, + { + "epoch": 0.7135434707308624, + "grad_norm": 1.135169506072998, + "learning_rate": 9.292649098474342e-06, + "loss": 0.171, + "step": 1030 + }, + { + "epoch": 0.7142362313820575, + "grad_norm": 1.1106867790222168, + "learning_rate": 9.291955617198337e-06, + "loss": 0.1695, + "step": 1031 + }, + { + "epoch": 0.7149289920332526, + "grad_norm": 1.0818274021148682, + "learning_rate": 9.291262135922331e-06, + "loss": 0.1751, + "step": 1032 + }, + { + "epoch": 0.7156217526844475, + "grad_norm": 1.0426416397094727, + "learning_rate": 9.290568654646326e-06, + "loss": 0.155, + "step": 1033 + }, + { + "epoch": 0.7163145133356426, + "grad_norm": 1.0632688999176025, + "learning_rate": 9.28987517337032e-06, + "loss": 0.1724, + "step": 1034 + }, + { + "epoch": 0.7170072739868375, + "grad_norm": 1.0620195865631104, + "learning_rate": 9.289181692094315e-06, + "loss": 0.1766, + "step": 1035 + }, + { + "epoch": 0.7177000346380326, + "grad_norm": 1.0451894998550415, + "learning_rate": 9.288488210818308e-06, + "loss": 0.1543, + "step": 1036 + }, + { + "epoch": 0.7183927952892276, + "grad_norm": 0.9844353795051575, + "learning_rate": 9.287794729542303e-06, + "loss": 0.1433, + "step": 1037 + }, + { + "epoch": 0.7190855559404226, + "grad_norm": 1.168127179145813, + "learning_rate": 9.287101248266298e-06, + "loss": 0.1524, + "step": 1038 + }, + { + "epoch": 0.7197783165916176, + "grad_norm": 0.9377881288528442, + "learning_rate": 9.286407766990293e-06, + "loss": 0.1134, + "step": 1039 + }, + { + "epoch": 0.7204710772428126, + "grad_norm": 1.0388245582580566, + "learning_rate": 9.285714285714288e-06, + "loss": 0.1528, + "step": 1040 + }, + { + "epoch": 0.7211638378940076, + "grad_norm": 1.082821249961853, + "learning_rate": 9.28502080443828e-06, + "loss": 0.1531, + "step": 1041 + }, + { + "epoch": 0.7218565985452027, + "grad_norm": 0.903874397277832, + "learning_rate": 9.284327323162276e-06, + "loss": 0.1338, + "step": 1042 + }, + { + "epoch": 0.7225493591963976, + "grad_norm": 1.0584840774536133, + "learning_rate": 9.28363384188627e-06, + "loss": 0.1678, + "step": 1043 + }, + { + "epoch": 0.7232421198475927, + "grad_norm": 1.0977245569229126, + "learning_rate": 9.282940360610264e-06, + "loss": 0.1528, + "step": 1044 + }, + { + "epoch": 0.7239348804987876, + "grad_norm": 1.2373993396759033, + "learning_rate": 9.282246879334259e-06, + "loss": 0.1749, + "step": 1045 + }, + { + "epoch": 0.7246276411499827, + "grad_norm": 1.0567039251327515, + "learning_rate": 9.281553398058252e-06, + "loss": 0.1766, + "step": 1046 + }, + { + "epoch": 0.7253204018011777, + "grad_norm": 0.9542796015739441, + "learning_rate": 9.280859916782247e-06, + "loss": 0.1533, + "step": 1047 + }, + { + "epoch": 0.7260131624523727, + "grad_norm": 1.0102726221084595, + "learning_rate": 9.280166435506242e-06, + "loss": 0.1343, + "step": 1048 + }, + { + "epoch": 0.7267059231035677, + "grad_norm": 1.1852046251296997, + "learning_rate": 9.279472954230237e-06, + "loss": 0.1793, + "step": 1049 + }, + { + "epoch": 0.7273986837547627, + "grad_norm": 1.1553641557693481, + "learning_rate": 9.278779472954232e-06, + "loss": 0.1757, + "step": 1050 + }, + { + "epoch": 0.7280914444059577, + "grad_norm": 1.290137529373169, + "learning_rate": 9.278085991678225e-06, + "loss": 0.1827, + "step": 1051 + }, + { + "epoch": 0.7287842050571528, + "grad_norm": 1.0179033279418945, + "learning_rate": 9.27739251040222e-06, + "loss": 0.1448, + "step": 1052 + }, + { + "epoch": 0.7294769657083477, + "grad_norm": 1.0473142862319946, + "learning_rate": 9.276699029126215e-06, + "loss": 0.1529, + "step": 1053 + }, + { + "epoch": 0.7301697263595428, + "grad_norm": 1.0068601369857788, + "learning_rate": 9.276005547850208e-06, + "loss": 0.1602, + "step": 1054 + }, + { + "epoch": 0.7308624870107377, + "grad_norm": 1.0162426233291626, + "learning_rate": 9.275312066574203e-06, + "loss": 0.1511, + "step": 1055 + }, + { + "epoch": 0.7315552476619328, + "grad_norm": 1.029964566230774, + "learning_rate": 9.274618585298198e-06, + "loss": 0.1479, + "step": 1056 + }, + { + "epoch": 0.7322480083131279, + "grad_norm": 1.010059118270874, + "learning_rate": 9.273925104022193e-06, + "loss": 0.1388, + "step": 1057 + }, + { + "epoch": 0.7329407689643228, + "grad_norm": 0.9803706407546997, + "learning_rate": 9.273231622746188e-06, + "loss": 0.1521, + "step": 1058 + }, + { + "epoch": 0.7336335296155179, + "grad_norm": 1.1875274181365967, + "learning_rate": 9.272538141470181e-06, + "loss": 0.1636, + "step": 1059 + }, + { + "epoch": 0.7343262902667128, + "grad_norm": 1.0571045875549316, + "learning_rate": 9.271844660194176e-06, + "loss": 0.1642, + "step": 1060 + }, + { + "epoch": 0.7350190509179079, + "grad_norm": 1.180854320526123, + "learning_rate": 9.27115117891817e-06, + "loss": 0.2017, + "step": 1061 + }, + { + "epoch": 0.7357118115691029, + "grad_norm": 1.120179533958435, + "learning_rate": 9.270457697642164e-06, + "loss": 0.1575, + "step": 1062 + }, + { + "epoch": 0.7364045722202979, + "grad_norm": 0.9948523044586182, + "learning_rate": 9.26976421636616e-06, + "loss": 0.1329, + "step": 1063 + }, + { + "epoch": 0.7370973328714929, + "grad_norm": 1.1517481803894043, + "learning_rate": 9.269070735090152e-06, + "loss": 0.1882, + "step": 1064 + }, + { + "epoch": 0.7377900935226879, + "grad_norm": 1.1129299402236938, + "learning_rate": 9.268377253814147e-06, + "loss": 0.1755, + "step": 1065 + }, + { + "epoch": 0.7384828541738829, + "grad_norm": 1.118369698524475, + "learning_rate": 9.267683772538142e-06, + "loss": 0.1657, + "step": 1066 + }, + { + "epoch": 0.739175614825078, + "grad_norm": 1.0388578176498413, + "learning_rate": 9.266990291262137e-06, + "loss": 0.1499, + "step": 1067 + }, + { + "epoch": 0.7398683754762729, + "grad_norm": 1.312852144241333, + "learning_rate": 9.266296809986132e-06, + "loss": 0.1345, + "step": 1068 + }, + { + "epoch": 0.740561136127468, + "grad_norm": 1.011390209197998, + "learning_rate": 9.265603328710125e-06, + "loss": 0.1409, + "step": 1069 + }, + { + "epoch": 0.7412538967786629, + "grad_norm": 0.9556867480278015, + "learning_rate": 9.26490984743412e-06, + "loss": 0.1349, + "step": 1070 + }, + { + "epoch": 0.741946657429858, + "grad_norm": 1.0241023302078247, + "learning_rate": 9.264216366158114e-06, + "loss": 0.1419, + "step": 1071 + }, + { + "epoch": 0.742639418081053, + "grad_norm": 1.0253700017929077, + "learning_rate": 9.263522884882108e-06, + "loss": 0.1783, + "step": 1072 + }, + { + "epoch": 0.743332178732248, + "grad_norm": 1.0557609796524048, + "learning_rate": 9.262829403606103e-06, + "loss": 0.1725, + "step": 1073 + }, + { + "epoch": 0.744024939383443, + "grad_norm": 1.0009117126464844, + "learning_rate": 9.262135922330098e-06, + "loss": 0.1578, + "step": 1074 + }, + { + "epoch": 0.744717700034638, + "grad_norm": 0.9810420274734497, + "learning_rate": 9.261442441054093e-06, + "loss": 0.1463, + "step": 1075 + }, + { + "epoch": 0.745410460685833, + "grad_norm": 1.3629769086837769, + "learning_rate": 9.260748959778087e-06, + "loss": 0.163, + "step": 1076 + }, + { + "epoch": 0.7461032213370281, + "grad_norm": 1.088313341140747, + "learning_rate": 9.260055478502081e-06, + "loss": 0.1408, + "step": 1077 + }, + { + "epoch": 0.7467959819882231, + "grad_norm": 0.9942247867584229, + "learning_rate": 9.259361997226076e-06, + "loss": 0.1337, + "step": 1078 + }, + { + "epoch": 0.7474887426394181, + "grad_norm": 1.0539976358413696, + "learning_rate": 9.25866851595007e-06, + "loss": 0.1659, + "step": 1079 + }, + { + "epoch": 0.7481815032906131, + "grad_norm": 1.0356535911560059, + "learning_rate": 9.257975034674065e-06, + "loss": 0.1572, + "step": 1080 + }, + { + "epoch": 0.7488742639418081, + "grad_norm": 1.2949830293655396, + "learning_rate": 9.25728155339806e-06, + "loss": 0.2064, + "step": 1081 + }, + { + "epoch": 0.7495670245930032, + "grad_norm": 0.9998825192451477, + "learning_rate": 9.256588072122053e-06, + "loss": 0.1268, + "step": 1082 + }, + { + "epoch": 0.7502597852441981, + "grad_norm": 1.1159971952438354, + "learning_rate": 9.255894590846048e-06, + "loss": 0.1631, + "step": 1083 + }, + { + "epoch": 0.7509525458953932, + "grad_norm": 0.9983909726142883, + "learning_rate": 9.255201109570043e-06, + "loss": 0.1426, + "step": 1084 + }, + { + "epoch": 0.7516453065465881, + "grad_norm": 1.0740413665771484, + "learning_rate": 9.254507628294038e-06, + "loss": 0.1622, + "step": 1085 + }, + { + "epoch": 0.7523380671977832, + "grad_norm": 1.0700693130493164, + "learning_rate": 9.25381414701803e-06, + "loss": 0.1583, + "step": 1086 + }, + { + "epoch": 0.7530308278489781, + "grad_norm": 1.0357578992843628, + "learning_rate": 9.253120665742026e-06, + "loss": 0.1579, + "step": 1087 + }, + { + "epoch": 0.7537235885001732, + "grad_norm": 1.112646222114563, + "learning_rate": 9.25242718446602e-06, + "loss": 0.1759, + "step": 1088 + }, + { + "epoch": 0.7544163491513682, + "grad_norm": 0.9094178676605225, + "learning_rate": 9.251733703190014e-06, + "loss": 0.1235, + "step": 1089 + }, + { + "epoch": 0.7551091098025632, + "grad_norm": 1.0854482650756836, + "learning_rate": 9.251040221914009e-06, + "loss": 0.1562, + "step": 1090 + }, + { + "epoch": 0.7558018704537582, + "grad_norm": 1.2528469562530518, + "learning_rate": 9.250346740638004e-06, + "loss": 0.2051, + "step": 1091 + }, + { + "epoch": 0.7564946311049532, + "grad_norm": 0.8920885324478149, + "learning_rate": 9.249653259361999e-06, + "loss": 0.1357, + "step": 1092 + }, + { + "epoch": 0.7571873917561482, + "grad_norm": 0.9023600816726685, + "learning_rate": 9.248959778085992e-06, + "loss": 0.1237, + "step": 1093 + }, + { + "epoch": 0.7578801524073433, + "grad_norm": 1.0625343322753906, + "learning_rate": 9.248266296809987e-06, + "loss": 0.1655, + "step": 1094 + }, + { + "epoch": 0.7585729130585382, + "grad_norm": 1.1089736223220825, + "learning_rate": 9.247572815533982e-06, + "loss": 0.1783, + "step": 1095 + }, + { + "epoch": 0.7592656737097333, + "grad_norm": 1.058384656906128, + "learning_rate": 9.246879334257975e-06, + "loss": 0.1581, + "step": 1096 + }, + { + "epoch": 0.7599584343609282, + "grad_norm": 1.105261206626892, + "learning_rate": 9.24618585298197e-06, + "loss": 0.1815, + "step": 1097 + }, + { + "epoch": 0.7606511950121233, + "grad_norm": 0.9062468409538269, + "learning_rate": 9.245492371705965e-06, + "loss": 0.1406, + "step": 1098 + }, + { + "epoch": 0.7613439556633184, + "grad_norm": 0.981909990310669, + "learning_rate": 9.24479889042996e-06, + "loss": 0.1778, + "step": 1099 + }, + { + "epoch": 0.7620367163145133, + "grad_norm": 1.1163431406021118, + "learning_rate": 9.244105409153955e-06, + "loss": 0.1896, + "step": 1100 + }, + { + "epoch": 0.7627294769657084, + "grad_norm": 1.1027591228485107, + "learning_rate": 9.243411927877948e-06, + "loss": 0.1287, + "step": 1101 + }, + { + "epoch": 0.7634222376169033, + "grad_norm": 1.017777442932129, + "learning_rate": 9.242718446601943e-06, + "loss": 0.1676, + "step": 1102 + }, + { + "epoch": 0.7641149982680984, + "grad_norm": 1.1382927894592285, + "learning_rate": 9.242024965325936e-06, + "loss": 0.1813, + "step": 1103 + }, + { + "epoch": 0.7648077589192934, + "grad_norm": 1.0597646236419678, + "learning_rate": 9.241331484049931e-06, + "loss": 0.1337, + "step": 1104 + }, + { + "epoch": 0.7655005195704884, + "grad_norm": 0.8857343196868896, + "learning_rate": 9.240638002773926e-06, + "loss": 0.1308, + "step": 1105 + }, + { + "epoch": 0.7661932802216834, + "grad_norm": 1.1816799640655518, + "learning_rate": 9.23994452149792e-06, + "loss": 0.2012, + "step": 1106 + }, + { + "epoch": 0.7668860408728784, + "grad_norm": 1.0828983783721924, + "learning_rate": 9.239251040221914e-06, + "loss": 0.1488, + "step": 1107 + }, + { + "epoch": 0.7675788015240734, + "grad_norm": 0.993518590927124, + "learning_rate": 9.23855755894591e-06, + "loss": 0.119, + "step": 1108 + }, + { + "epoch": 0.7682715621752685, + "grad_norm": 1.0390045642852783, + "learning_rate": 9.237864077669904e-06, + "loss": 0.1567, + "step": 1109 + }, + { + "epoch": 0.7689643228264634, + "grad_norm": 1.031101107597351, + "learning_rate": 9.237170596393899e-06, + "loss": 0.1593, + "step": 1110 + }, + { + "epoch": 0.7696570834776585, + "grad_norm": 1.0703082084655762, + "learning_rate": 9.236477115117892e-06, + "loss": 0.16, + "step": 1111 + }, + { + "epoch": 0.7703498441288534, + "grad_norm": 0.9922031760215759, + "learning_rate": 9.235783633841887e-06, + "loss": 0.1457, + "step": 1112 + }, + { + "epoch": 0.7710426047800485, + "grad_norm": 1.1097067594528198, + "learning_rate": 9.23509015256588e-06, + "loss": 0.1583, + "step": 1113 + }, + { + "epoch": 0.7717353654312435, + "grad_norm": 1.2743453979492188, + "learning_rate": 9.234396671289875e-06, + "loss": 0.1684, + "step": 1114 + }, + { + "epoch": 0.7724281260824385, + "grad_norm": 0.9603894948959351, + "learning_rate": 9.23370319001387e-06, + "loss": 0.1342, + "step": 1115 + }, + { + "epoch": 0.7731208867336335, + "grad_norm": 1.0211710929870605, + "learning_rate": 9.233009708737865e-06, + "loss": 0.148, + "step": 1116 + }, + { + "epoch": 0.7738136473848285, + "grad_norm": 1.2108137607574463, + "learning_rate": 9.23231622746186e-06, + "loss": 0.1791, + "step": 1117 + }, + { + "epoch": 0.7745064080360236, + "grad_norm": 1.0738354921340942, + "learning_rate": 9.231622746185853e-06, + "loss": 0.1775, + "step": 1118 + }, + { + "epoch": 0.7751991686872186, + "grad_norm": 0.9560322165489197, + "learning_rate": 9.230929264909848e-06, + "loss": 0.1362, + "step": 1119 + }, + { + "epoch": 0.7758919293384136, + "grad_norm": 0.9589406847953796, + "learning_rate": 9.230235783633843e-06, + "loss": 0.1538, + "step": 1120 + }, + { + "epoch": 0.7765846899896086, + "grad_norm": 0.9562289714813232, + "learning_rate": 9.229542302357837e-06, + "loss": 0.1369, + "step": 1121 + }, + { + "epoch": 0.7772774506408036, + "grad_norm": 1.0238457918167114, + "learning_rate": 9.228848821081831e-06, + "loss": 0.1484, + "step": 1122 + }, + { + "epoch": 0.7779702112919986, + "grad_norm": 1.0500246286392212, + "learning_rate": 9.228155339805825e-06, + "loss": 0.1431, + "step": 1123 + }, + { + "epoch": 0.7786629719431937, + "grad_norm": 0.9847618937492371, + "learning_rate": 9.22746185852982e-06, + "loss": 0.1379, + "step": 1124 + }, + { + "epoch": 0.7793557325943886, + "grad_norm": 1.0581865310668945, + "learning_rate": 9.226768377253815e-06, + "loss": 0.1758, + "step": 1125 + }, + { + "epoch": 0.7800484932455837, + "grad_norm": 1.0479624271392822, + "learning_rate": 9.22607489597781e-06, + "loss": 0.1748, + "step": 1126 + }, + { + "epoch": 0.7807412538967786, + "grad_norm": 1.0092641115188599, + "learning_rate": 9.225381414701804e-06, + "loss": 0.1671, + "step": 1127 + }, + { + "epoch": 0.7814340145479737, + "grad_norm": 0.9954952597618103, + "learning_rate": 9.224687933425798e-06, + "loss": 0.1442, + "step": 1128 + }, + { + "epoch": 0.7821267751991687, + "grad_norm": 0.9774269461631775, + "learning_rate": 9.223994452149793e-06, + "loss": 0.1558, + "step": 1129 + }, + { + "epoch": 0.7828195358503637, + "grad_norm": 1.1028008460998535, + "learning_rate": 9.223300970873788e-06, + "loss": 0.162, + "step": 1130 + }, + { + "epoch": 0.7835122965015587, + "grad_norm": 1.1184663772583008, + "learning_rate": 9.22260748959778e-06, + "loss": 0.1866, + "step": 1131 + }, + { + "epoch": 0.7842050571527537, + "grad_norm": 0.9057531356811523, + "learning_rate": 9.221914008321776e-06, + "loss": 0.1432, + "step": 1132 + }, + { + "epoch": 0.7848978178039487, + "grad_norm": 1.1297880411148071, + "learning_rate": 9.22122052704577e-06, + "loss": 0.1589, + "step": 1133 + }, + { + "epoch": 0.7855905784551438, + "grad_norm": 1.0383490324020386, + "learning_rate": 9.220527045769766e-06, + "loss": 0.1496, + "step": 1134 + }, + { + "epoch": 0.7862833391063387, + "grad_norm": 0.9274728298187256, + "learning_rate": 9.21983356449376e-06, + "loss": 0.1353, + "step": 1135 + }, + { + "epoch": 0.7869760997575338, + "grad_norm": 1.059129238128662, + "learning_rate": 9.219140083217754e-06, + "loss": 0.1473, + "step": 1136 + }, + { + "epoch": 0.7876688604087287, + "grad_norm": 0.9455123543739319, + "learning_rate": 9.218446601941749e-06, + "loss": 0.1427, + "step": 1137 + }, + { + "epoch": 0.7883616210599238, + "grad_norm": 0.9490697383880615, + "learning_rate": 9.217753120665742e-06, + "loss": 0.1236, + "step": 1138 + }, + { + "epoch": 0.7890543817111189, + "grad_norm": 1.120455265045166, + "learning_rate": 9.217059639389737e-06, + "loss": 0.1466, + "step": 1139 + }, + { + "epoch": 0.7897471423623138, + "grad_norm": 1.126746654510498, + "learning_rate": 9.216366158113732e-06, + "loss": 0.1668, + "step": 1140 + }, + { + "epoch": 0.7904399030135089, + "grad_norm": 1.1976633071899414, + "learning_rate": 9.215672676837725e-06, + "loss": 0.1642, + "step": 1141 + }, + { + "epoch": 0.7911326636647038, + "grad_norm": 1.0458179712295532, + "learning_rate": 9.21497919556172e-06, + "loss": 0.1449, + "step": 1142 + }, + { + "epoch": 0.7918254243158989, + "grad_norm": 0.9306297302246094, + "learning_rate": 9.214285714285715e-06, + "loss": 0.1553, + "step": 1143 + }, + { + "epoch": 0.7925181849670939, + "grad_norm": 1.0312331914901733, + "learning_rate": 9.21359223300971e-06, + "loss": 0.1533, + "step": 1144 + }, + { + "epoch": 0.7932109456182889, + "grad_norm": 1.0560755729675293, + "learning_rate": 9.212898751733705e-06, + "loss": 0.1542, + "step": 1145 + }, + { + "epoch": 0.7939037062694839, + "grad_norm": 1.2565730810165405, + "learning_rate": 9.212205270457698e-06, + "loss": 0.2271, + "step": 1146 + }, + { + "epoch": 0.7945964669206789, + "grad_norm": 1.0296990871429443, + "learning_rate": 9.211511789181693e-06, + "loss": 0.1656, + "step": 1147 + }, + { + "epoch": 0.7952892275718739, + "grad_norm": 1.0432108640670776, + "learning_rate": 9.210818307905686e-06, + "loss": 0.1572, + "step": 1148 + }, + { + "epoch": 0.795981988223069, + "grad_norm": 1.1227762699127197, + "learning_rate": 9.210124826629681e-06, + "loss": 0.1803, + "step": 1149 + }, + { + "epoch": 0.7966747488742639, + "grad_norm": 1.0830938816070557, + "learning_rate": 9.209431345353676e-06, + "loss": 0.1744, + "step": 1150 + }, + { + "epoch": 0.797367509525459, + "grad_norm": 1.0725040435791016, + "learning_rate": 9.208737864077671e-06, + "loss": 0.1465, + "step": 1151 + }, + { + "epoch": 0.7980602701766539, + "grad_norm": 1.0585803985595703, + "learning_rate": 9.208044382801666e-06, + "loss": 0.166, + "step": 1152 + }, + { + "epoch": 0.798753030827849, + "grad_norm": 1.043522834777832, + "learning_rate": 9.207350901525659e-06, + "loss": 0.1609, + "step": 1153 + }, + { + "epoch": 0.799445791479044, + "grad_norm": 1.0265485048294067, + "learning_rate": 9.206657420249654e-06, + "loss": 0.1596, + "step": 1154 + }, + { + "epoch": 0.800138552130239, + "grad_norm": 1.074607491493225, + "learning_rate": 9.205963938973649e-06, + "loss": 0.1983, + "step": 1155 + }, + { + "epoch": 0.800831312781434, + "grad_norm": 1.0008004903793335, + "learning_rate": 9.205270457697642e-06, + "loss": 0.1739, + "step": 1156 + }, + { + "epoch": 0.801524073432629, + "grad_norm": 1.0192253589630127, + "learning_rate": 9.204576976421637e-06, + "loss": 0.1589, + "step": 1157 + }, + { + "epoch": 0.802216834083824, + "grad_norm": 1.1635502576828003, + "learning_rate": 9.203883495145632e-06, + "loss": 0.1731, + "step": 1158 + }, + { + "epoch": 0.8029095947350191, + "grad_norm": 1.1289515495300293, + "learning_rate": 9.203190013869625e-06, + "loss": 0.161, + "step": 1159 + }, + { + "epoch": 0.803602355386214, + "grad_norm": 0.9179483652114868, + "learning_rate": 9.20249653259362e-06, + "loss": 0.144, + "step": 1160 + }, + { + "epoch": 0.8042951160374091, + "grad_norm": 1.172469139099121, + "learning_rate": 9.201803051317615e-06, + "loss": 0.1731, + "step": 1161 + }, + { + "epoch": 0.804987876688604, + "grad_norm": 1.0264207124710083, + "learning_rate": 9.20110957004161e-06, + "loss": 0.154, + "step": 1162 + }, + { + "epoch": 0.8056806373397991, + "grad_norm": 1.1471171379089355, + "learning_rate": 9.200416088765603e-06, + "loss": 0.1777, + "step": 1163 + }, + { + "epoch": 0.8063733979909941, + "grad_norm": 1.0495991706848145, + "learning_rate": 9.199722607489598e-06, + "loss": 0.1834, + "step": 1164 + }, + { + "epoch": 0.8070661586421891, + "grad_norm": 1.0372450351715088, + "learning_rate": 9.199029126213593e-06, + "loss": 0.1576, + "step": 1165 + }, + { + "epoch": 0.8077589192933842, + "grad_norm": 1.057462453842163, + "learning_rate": 9.198335644937586e-06, + "loss": 0.1754, + "step": 1166 + }, + { + "epoch": 0.8084516799445791, + "grad_norm": 1.0398510694503784, + "learning_rate": 9.197642163661581e-06, + "loss": 0.1693, + "step": 1167 + }, + { + "epoch": 0.8091444405957742, + "grad_norm": 0.997722864151001, + "learning_rate": 9.196948682385576e-06, + "loss": 0.1603, + "step": 1168 + }, + { + "epoch": 0.8098372012469691, + "grad_norm": 1.0216574668884277, + "learning_rate": 9.196255201109571e-06, + "loss": 0.1597, + "step": 1169 + }, + { + "epoch": 0.8105299618981642, + "grad_norm": 0.9988551139831543, + "learning_rate": 9.195561719833566e-06, + "loss": 0.1481, + "step": 1170 + }, + { + "epoch": 0.8112227225493592, + "grad_norm": 1.1010034084320068, + "learning_rate": 9.19486823855756e-06, + "loss": 0.1441, + "step": 1171 + }, + { + "epoch": 0.8119154832005542, + "grad_norm": 1.089496374130249, + "learning_rate": 9.194174757281554e-06, + "loss": 0.1717, + "step": 1172 + }, + { + "epoch": 0.8126082438517492, + "grad_norm": 1.0673224925994873, + "learning_rate": 9.193481276005548e-06, + "loss": 0.1638, + "step": 1173 + }, + { + "epoch": 0.8133010045029442, + "grad_norm": 0.9966147541999817, + "learning_rate": 9.192787794729543e-06, + "loss": 0.1386, + "step": 1174 + }, + { + "epoch": 0.8139937651541392, + "grad_norm": 1.1094818115234375, + "learning_rate": 9.192094313453538e-06, + "loss": 0.1182, + "step": 1175 + }, + { + "epoch": 0.8146865258053343, + "grad_norm": 1.0231292247772217, + "learning_rate": 9.191400832177532e-06, + "loss": 0.1565, + "step": 1176 + }, + { + "epoch": 0.8153792864565292, + "grad_norm": 1.0623294115066528, + "learning_rate": 9.190707350901527e-06, + "loss": 0.165, + "step": 1177 + }, + { + "epoch": 0.8160720471077243, + "grad_norm": 1.0557681322097778, + "learning_rate": 9.19001386962552e-06, + "loss": 0.1681, + "step": 1178 + }, + { + "epoch": 0.8167648077589192, + "grad_norm": 1.084099292755127, + "learning_rate": 9.189320388349516e-06, + "loss": 0.1722, + "step": 1179 + }, + { + "epoch": 0.8174575684101143, + "grad_norm": 1.0746034383773804, + "learning_rate": 9.18862690707351e-06, + "loss": 0.1596, + "step": 1180 + }, + { + "epoch": 0.8181503290613094, + "grad_norm": 1.026454210281372, + "learning_rate": 9.187933425797504e-06, + "loss": 0.1605, + "step": 1181 + }, + { + "epoch": 0.8188430897125043, + "grad_norm": 0.9294998645782471, + "learning_rate": 9.187239944521499e-06, + "loss": 0.1333, + "step": 1182 + }, + { + "epoch": 0.8195358503636994, + "grad_norm": 1.0550793409347534, + "learning_rate": 9.186546463245492e-06, + "loss": 0.1332, + "step": 1183 + }, + { + "epoch": 0.8202286110148943, + "grad_norm": 0.9574646353721619, + "learning_rate": 9.185852981969487e-06, + "loss": 0.143, + "step": 1184 + }, + { + "epoch": 0.8209213716660894, + "grad_norm": 1.0283784866333008, + "learning_rate": 9.185159500693482e-06, + "loss": 0.1682, + "step": 1185 + }, + { + "epoch": 0.8216141323172844, + "grad_norm": 1.0605374574661255, + "learning_rate": 9.184466019417477e-06, + "loss": 0.1476, + "step": 1186 + }, + { + "epoch": 0.8223068929684794, + "grad_norm": 1.0304421186447144, + "learning_rate": 9.183772538141472e-06, + "loss": 0.1095, + "step": 1187 + }, + { + "epoch": 0.8229996536196744, + "grad_norm": 1.1015008687973022, + "learning_rate": 9.183079056865465e-06, + "loss": 0.1349, + "step": 1188 + }, + { + "epoch": 0.8236924142708694, + "grad_norm": 1.1376676559448242, + "learning_rate": 9.18238557558946e-06, + "loss": 0.1681, + "step": 1189 + }, + { + "epoch": 0.8243851749220644, + "grad_norm": 1.0262609720230103, + "learning_rate": 9.181692094313455e-06, + "loss": 0.1517, + "step": 1190 + }, + { + "epoch": 0.8250779355732595, + "grad_norm": 0.9977340698242188, + "learning_rate": 9.180998613037448e-06, + "loss": 0.152, + "step": 1191 + }, + { + "epoch": 0.8257706962244544, + "grad_norm": 0.984466016292572, + "learning_rate": 9.180305131761443e-06, + "loss": 0.1403, + "step": 1192 + }, + { + "epoch": 0.8264634568756495, + "grad_norm": 1.0744271278381348, + "learning_rate": 9.179611650485438e-06, + "loss": 0.134, + "step": 1193 + }, + { + "epoch": 0.8271562175268444, + "grad_norm": 1.0406293869018555, + "learning_rate": 9.178918169209433e-06, + "loss": 0.1605, + "step": 1194 + }, + { + "epoch": 0.8278489781780395, + "grad_norm": 1.2142112255096436, + "learning_rate": 9.178224687933428e-06, + "loss": 0.1864, + "step": 1195 + }, + { + "epoch": 0.8285417388292345, + "grad_norm": 0.9912194609642029, + "learning_rate": 9.177531206657421e-06, + "loss": 0.1439, + "step": 1196 + }, + { + "epoch": 0.8292344994804295, + "grad_norm": 1.0492942333221436, + "learning_rate": 9.176837725381416e-06, + "loss": 0.1695, + "step": 1197 + }, + { + "epoch": 0.8299272601316245, + "grad_norm": 0.9441617131233215, + "learning_rate": 9.176144244105409e-06, + "loss": 0.1256, + "step": 1198 + }, + { + "epoch": 0.8306200207828195, + "grad_norm": 1.0140005350112915, + "learning_rate": 9.175450762829404e-06, + "loss": 0.1355, + "step": 1199 + }, + { + "epoch": 0.8313127814340145, + "grad_norm": 1.0498930215835571, + "learning_rate": 9.174757281553399e-06, + "loss": 0.1533, + "step": 1200 + }, + { + "epoch": 0.8320055420852096, + "grad_norm": 1.0157033205032349, + "learning_rate": 9.174063800277392e-06, + "loss": 0.1648, + "step": 1201 + }, + { + "epoch": 0.8326983027364045, + "grad_norm": 0.989496111869812, + "learning_rate": 9.173370319001387e-06, + "loss": 0.1488, + "step": 1202 + }, + { + "epoch": 0.8333910633875996, + "grad_norm": 0.9286702871322632, + "learning_rate": 9.172676837725382e-06, + "loss": 0.1183, + "step": 1203 + }, + { + "epoch": 0.8340838240387946, + "grad_norm": 1.0913113355636597, + "learning_rate": 9.171983356449377e-06, + "loss": 0.1497, + "step": 1204 + }, + { + "epoch": 0.8347765846899896, + "grad_norm": 1.0152077674865723, + "learning_rate": 9.171289875173372e-06, + "loss": 0.1533, + "step": 1205 + }, + { + "epoch": 0.8354693453411847, + "grad_norm": 0.9213568568229675, + "learning_rate": 9.170596393897365e-06, + "loss": 0.1348, + "step": 1206 + }, + { + "epoch": 0.8361621059923796, + "grad_norm": 0.9164698719978333, + "learning_rate": 9.16990291262136e-06, + "loss": 0.1322, + "step": 1207 + }, + { + "epoch": 0.8368548666435747, + "grad_norm": 1.0275636911392212, + "learning_rate": 9.169209431345353e-06, + "loss": 0.1684, + "step": 1208 + }, + { + "epoch": 0.8375476272947696, + "grad_norm": 1.00309157371521, + "learning_rate": 9.168515950069348e-06, + "loss": 0.1364, + "step": 1209 + }, + { + "epoch": 0.8382403879459647, + "grad_norm": 1.0757734775543213, + "learning_rate": 9.167822468793343e-06, + "loss": 0.128, + "step": 1210 + }, + { + "epoch": 0.8389331485971597, + "grad_norm": 0.9592078924179077, + "learning_rate": 9.167128987517338e-06, + "loss": 0.1383, + "step": 1211 + }, + { + "epoch": 0.8396259092483547, + "grad_norm": 0.9673621654510498, + "learning_rate": 9.166435506241333e-06, + "loss": 0.1313, + "step": 1212 + }, + { + "epoch": 0.8403186698995497, + "grad_norm": 1.0815045833587646, + "learning_rate": 9.165742024965326e-06, + "loss": 0.184, + "step": 1213 + }, + { + "epoch": 0.8410114305507447, + "grad_norm": 1.0106115341186523, + "learning_rate": 9.165048543689321e-06, + "loss": 0.1692, + "step": 1214 + }, + { + "epoch": 0.8417041912019397, + "grad_norm": 1.0461982488632202, + "learning_rate": 9.164355062413316e-06, + "loss": 0.142, + "step": 1215 + }, + { + "epoch": 0.8423969518531348, + "grad_norm": 1.1969220638275146, + "learning_rate": 9.16366158113731e-06, + "loss": 0.1573, + "step": 1216 + }, + { + "epoch": 0.8430897125043297, + "grad_norm": 1.0679327249526978, + "learning_rate": 9.162968099861304e-06, + "loss": 0.1659, + "step": 1217 + }, + { + "epoch": 0.8437824731555248, + "grad_norm": 1.1530208587646484, + "learning_rate": 9.162274618585298e-06, + "loss": 0.1521, + "step": 1218 + }, + { + "epoch": 0.8444752338067197, + "grad_norm": 1.0931788682937622, + "learning_rate": 9.161581137309293e-06, + "loss": 0.1483, + "step": 1219 + }, + { + "epoch": 0.8451679944579148, + "grad_norm": 0.9220978617668152, + "learning_rate": 9.160887656033287e-06, + "loss": 0.14, + "step": 1220 + }, + { + "epoch": 0.8458607551091099, + "grad_norm": 1.0384410619735718, + "learning_rate": 9.160194174757282e-06, + "loss": 0.1656, + "step": 1221 + }, + { + "epoch": 0.8465535157603048, + "grad_norm": 1.0730808973312378, + "learning_rate": 9.159500693481277e-06, + "loss": 0.1542, + "step": 1222 + }, + { + "epoch": 0.8472462764114999, + "grad_norm": 0.91897052526474, + "learning_rate": 9.15880721220527e-06, + "loss": 0.1471, + "step": 1223 + }, + { + "epoch": 0.8479390370626948, + "grad_norm": 1.0821276903152466, + "learning_rate": 9.158113730929266e-06, + "loss": 0.1534, + "step": 1224 + }, + { + "epoch": 0.8486317977138899, + "grad_norm": 1.1234447956085205, + "learning_rate": 9.15742024965326e-06, + "loss": 0.1592, + "step": 1225 + }, + { + "epoch": 0.8493245583650849, + "grad_norm": 0.9577057361602783, + "learning_rate": 9.156726768377254e-06, + "loss": 0.1246, + "step": 1226 + }, + { + "epoch": 0.8500173190162799, + "grad_norm": 1.0541062355041504, + "learning_rate": 9.156033287101249e-06, + "loss": 0.1688, + "step": 1227 + }, + { + "epoch": 0.8507100796674749, + "grad_norm": 1.027347445487976, + "learning_rate": 9.155339805825244e-06, + "loss": 0.1746, + "step": 1228 + }, + { + "epoch": 0.8514028403186699, + "grad_norm": 0.9498367309570312, + "learning_rate": 9.154646324549238e-06, + "loss": 0.1384, + "step": 1229 + }, + { + "epoch": 0.8520956009698649, + "grad_norm": 1.069927453994751, + "learning_rate": 9.153952843273233e-06, + "loss": 0.1328, + "step": 1230 + }, + { + "epoch": 0.85278836162106, + "grad_norm": 0.9595523476600647, + "learning_rate": 9.153259361997227e-06, + "loss": 0.1171, + "step": 1231 + }, + { + "epoch": 0.8534811222722549, + "grad_norm": 1.2975196838378906, + "learning_rate": 9.152565880721222e-06, + "loss": 0.1949, + "step": 1232 + }, + { + "epoch": 0.85417388292345, + "grad_norm": 1.0584015846252441, + "learning_rate": 9.151872399445215e-06, + "loss": 0.1662, + "step": 1233 + }, + { + "epoch": 0.8548666435746449, + "grad_norm": 0.9537229537963867, + "learning_rate": 9.15117891816921e-06, + "loss": 0.1583, + "step": 1234 + }, + { + "epoch": 0.85555940422584, + "grad_norm": 1.0296034812927246, + "learning_rate": 9.150485436893205e-06, + "loss": 0.1647, + "step": 1235 + }, + { + "epoch": 0.856252164877035, + "grad_norm": 1.0385677814483643, + "learning_rate": 9.149791955617198e-06, + "loss": 0.144, + "step": 1236 + }, + { + "epoch": 0.85694492552823, + "grad_norm": 1.0480293035507202, + "learning_rate": 9.149098474341193e-06, + "loss": 0.1494, + "step": 1237 + }, + { + "epoch": 0.857637686179425, + "grad_norm": 1.0525736808776855, + "learning_rate": 9.148404993065188e-06, + "loss": 0.1506, + "step": 1238 + }, + { + "epoch": 0.85833044683062, + "grad_norm": 1.0605429410934448, + "learning_rate": 9.147711511789183e-06, + "loss": 0.1378, + "step": 1239 + }, + { + "epoch": 0.859023207481815, + "grad_norm": 1.1563390493392944, + "learning_rate": 9.147018030513178e-06, + "loss": 0.1696, + "step": 1240 + }, + { + "epoch": 0.8597159681330101, + "grad_norm": 1.1349605321884155, + "learning_rate": 9.146324549237171e-06, + "loss": 0.1645, + "step": 1241 + }, + { + "epoch": 0.860408728784205, + "grad_norm": 0.9695345759391785, + "learning_rate": 9.145631067961166e-06, + "loss": 0.1487, + "step": 1242 + }, + { + "epoch": 0.8611014894354001, + "grad_norm": 0.9568183422088623, + "learning_rate": 9.144937586685159e-06, + "loss": 0.1329, + "step": 1243 + }, + { + "epoch": 0.861794250086595, + "grad_norm": 1.025154709815979, + "learning_rate": 9.144244105409154e-06, + "loss": 0.1299, + "step": 1244 + }, + { + "epoch": 0.8624870107377901, + "grad_norm": 1.0970348119735718, + "learning_rate": 9.143550624133149e-06, + "loss": 0.1621, + "step": 1245 + }, + { + "epoch": 0.863179771388985, + "grad_norm": 1.0218442678451538, + "learning_rate": 9.142857142857144e-06, + "loss": 0.1256, + "step": 1246 + }, + { + "epoch": 0.8638725320401801, + "grad_norm": 1.2018505334854126, + "learning_rate": 9.142163661581139e-06, + "loss": 0.1697, + "step": 1247 + }, + { + "epoch": 0.8645652926913752, + "grad_norm": 1.018684983253479, + "learning_rate": 9.141470180305132e-06, + "loss": 0.1705, + "step": 1248 + }, + { + "epoch": 0.8652580533425701, + "grad_norm": 1.0717487335205078, + "learning_rate": 9.140776699029127e-06, + "loss": 0.1784, + "step": 1249 + }, + { + "epoch": 0.8659508139937652, + "grad_norm": 0.947426974773407, + "learning_rate": 9.140083217753122e-06, + "loss": 0.121, + "step": 1250 + }, + { + "epoch": 0.8666435746449601, + "grad_norm": 1.0973340272903442, + "learning_rate": 9.139389736477115e-06, + "loss": 0.1572, + "step": 1251 + }, + { + "epoch": 0.8673363352961552, + "grad_norm": 1.0337198972702026, + "learning_rate": 9.13869625520111e-06, + "loss": 0.1555, + "step": 1252 + }, + { + "epoch": 0.8680290959473502, + "grad_norm": 0.9271091818809509, + "learning_rate": 9.138002773925105e-06, + "loss": 0.1254, + "step": 1253 + }, + { + "epoch": 0.8687218565985452, + "grad_norm": 1.0046056509017944, + "learning_rate": 9.1373092926491e-06, + "loss": 0.1498, + "step": 1254 + }, + { + "epoch": 0.8694146172497402, + "grad_norm": 1.0540696382522583, + "learning_rate": 9.136615811373093e-06, + "loss": 0.1684, + "step": 1255 + }, + { + "epoch": 0.8701073779009352, + "grad_norm": 1.0148552656173706, + "learning_rate": 9.135922330097088e-06, + "loss": 0.1676, + "step": 1256 + }, + { + "epoch": 0.8708001385521302, + "grad_norm": 0.9528785347938538, + "learning_rate": 9.135228848821083e-06, + "loss": 0.1379, + "step": 1257 + }, + { + "epoch": 0.8714928992033253, + "grad_norm": 1.084505319595337, + "learning_rate": 9.134535367545076e-06, + "loss": 0.1533, + "step": 1258 + }, + { + "epoch": 0.8721856598545202, + "grad_norm": 0.9560432434082031, + "learning_rate": 9.133841886269071e-06, + "loss": 0.1566, + "step": 1259 + }, + { + "epoch": 0.8728784205057153, + "grad_norm": 1.0317646265029907, + "learning_rate": 9.133148404993066e-06, + "loss": 0.1636, + "step": 1260 + }, + { + "epoch": 0.8735711811569102, + "grad_norm": 1.1282131671905518, + "learning_rate": 9.13245492371706e-06, + "loss": 0.1798, + "step": 1261 + }, + { + "epoch": 0.8742639418081053, + "grad_norm": 0.9957450032234192, + "learning_rate": 9.131761442441054e-06, + "loss": 0.1419, + "step": 1262 + }, + { + "epoch": 0.8749567024593004, + "grad_norm": 1.069015383720398, + "learning_rate": 9.13106796116505e-06, + "loss": 0.1485, + "step": 1263 + }, + { + "epoch": 0.8756494631104953, + "grad_norm": 1.1129701137542725, + "learning_rate": 9.130374479889044e-06, + "loss": 0.169, + "step": 1264 + }, + { + "epoch": 0.8763422237616904, + "grad_norm": 1.0611717700958252, + "learning_rate": 9.12968099861304e-06, + "loss": 0.1422, + "step": 1265 + }, + { + "epoch": 0.8770349844128853, + "grad_norm": 1.0714110136032104, + "learning_rate": 9.128987517337032e-06, + "loss": 0.1913, + "step": 1266 + }, + { + "epoch": 0.8777277450640804, + "grad_norm": 1.039005160331726, + "learning_rate": 9.128294036061027e-06, + "loss": 0.1547, + "step": 1267 + }, + { + "epoch": 0.8784205057152754, + "grad_norm": 1.1571332216262817, + "learning_rate": 9.12760055478502e-06, + "loss": 0.1821, + "step": 1268 + }, + { + "epoch": 0.8791132663664704, + "grad_norm": 1.127876877784729, + "learning_rate": 9.126907073509016e-06, + "loss": 0.1656, + "step": 1269 + }, + { + "epoch": 0.8798060270176654, + "grad_norm": 0.9981972575187683, + "learning_rate": 9.12621359223301e-06, + "loss": 0.1552, + "step": 1270 + }, + { + "epoch": 0.8804987876688604, + "grad_norm": 1.0318617820739746, + "learning_rate": 9.125520110957005e-06, + "loss": 0.1624, + "step": 1271 + }, + { + "epoch": 0.8811915483200554, + "grad_norm": 1.4229623079299927, + "learning_rate": 9.124826629681e-06, + "loss": 0.1844, + "step": 1272 + }, + { + "epoch": 0.8818843089712505, + "grad_norm": 1.1267119646072388, + "learning_rate": 9.124133148404994e-06, + "loss": 0.2063, + "step": 1273 + }, + { + "epoch": 0.8825770696224454, + "grad_norm": 1.0590894222259521, + "learning_rate": 9.123439667128988e-06, + "loss": 0.1354, + "step": 1274 + }, + { + "epoch": 0.8832698302736405, + "grad_norm": 0.9794348478317261, + "learning_rate": 9.122746185852983e-06, + "loss": 0.176, + "step": 1275 + }, + { + "epoch": 0.8839625909248354, + "grad_norm": 1.099520206451416, + "learning_rate": 9.122052704576977e-06, + "loss": 0.183, + "step": 1276 + }, + { + "epoch": 0.8846553515760305, + "grad_norm": 1.0151985883712769, + "learning_rate": 9.121359223300972e-06, + "loss": 0.1597, + "step": 1277 + }, + { + "epoch": 0.8853481122272255, + "grad_norm": 0.9931125640869141, + "learning_rate": 9.120665742024965e-06, + "loss": 0.1436, + "step": 1278 + }, + { + "epoch": 0.8860408728784205, + "grad_norm": 0.897212028503418, + "learning_rate": 9.11997226074896e-06, + "loss": 0.133, + "step": 1279 + }, + { + "epoch": 0.8867336335296155, + "grad_norm": 1.0599122047424316, + "learning_rate": 9.119278779472955e-06, + "loss": 0.1428, + "step": 1280 + }, + { + "epoch": 0.8874263941808105, + "grad_norm": 1.0727603435516357, + "learning_rate": 9.11858529819695e-06, + "loss": 0.1512, + "step": 1281 + }, + { + "epoch": 0.8881191548320055, + "grad_norm": 1.020111083984375, + "learning_rate": 9.117891816920945e-06, + "loss": 0.1463, + "step": 1282 + }, + { + "epoch": 0.8888119154832006, + "grad_norm": 1.0650858879089355, + "learning_rate": 9.117198335644938e-06, + "loss": 0.1516, + "step": 1283 + }, + { + "epoch": 0.8895046761343955, + "grad_norm": 0.9823405146598816, + "learning_rate": 9.116504854368933e-06, + "loss": 0.131, + "step": 1284 + }, + { + "epoch": 0.8901974367855906, + "grad_norm": 1.0441343784332275, + "learning_rate": 9.115811373092928e-06, + "loss": 0.1512, + "step": 1285 + }, + { + "epoch": 0.8908901974367855, + "grad_norm": 0.9891427159309387, + "learning_rate": 9.115117891816921e-06, + "loss": 0.109, + "step": 1286 + }, + { + "epoch": 0.8915829580879806, + "grad_norm": 1.0274287462234497, + "learning_rate": 9.114424410540916e-06, + "loss": 0.1653, + "step": 1287 + }, + { + "epoch": 0.8922757187391757, + "grad_norm": 1.041791558265686, + "learning_rate": 9.11373092926491e-06, + "loss": 0.154, + "step": 1288 + }, + { + "epoch": 0.8929684793903706, + "grad_norm": 0.9982712268829346, + "learning_rate": 9.113037447988906e-06, + "loss": 0.1295, + "step": 1289 + }, + { + "epoch": 0.8936612400415657, + "grad_norm": 1.0003913640975952, + "learning_rate": 9.1123439667129e-06, + "loss": 0.1516, + "step": 1290 + }, + { + "epoch": 0.8943540006927606, + "grad_norm": 1.0033366680145264, + "learning_rate": 9.111650485436894e-06, + "loss": 0.1534, + "step": 1291 + }, + { + "epoch": 0.8950467613439557, + "grad_norm": 1.1789953708648682, + "learning_rate": 9.110957004160889e-06, + "loss": 0.1808, + "step": 1292 + }, + { + "epoch": 0.8957395219951507, + "grad_norm": 1.0715515613555908, + "learning_rate": 9.110263522884882e-06, + "loss": 0.1636, + "step": 1293 + }, + { + "epoch": 0.8964322826463457, + "grad_norm": 0.8517835736274719, + "learning_rate": 9.109570041608877e-06, + "loss": 0.1375, + "step": 1294 + }, + { + "epoch": 0.8971250432975407, + "grad_norm": 0.9976966381072998, + "learning_rate": 9.108876560332872e-06, + "loss": 0.1512, + "step": 1295 + }, + { + "epoch": 0.8978178039487357, + "grad_norm": 1.0409802198410034, + "learning_rate": 9.108183079056865e-06, + "loss": 0.1633, + "step": 1296 + }, + { + "epoch": 0.8985105645999307, + "grad_norm": 1.0344579219818115, + "learning_rate": 9.10748959778086e-06, + "loss": 0.1239, + "step": 1297 + }, + { + "epoch": 0.8992033252511258, + "grad_norm": 1.0247353315353394, + "learning_rate": 9.106796116504855e-06, + "loss": 0.1605, + "step": 1298 + }, + { + "epoch": 0.8998960859023207, + "grad_norm": 0.9276660680770874, + "learning_rate": 9.10610263522885e-06, + "loss": 0.1316, + "step": 1299 + }, + { + "epoch": 0.9005888465535158, + "grad_norm": 1.0993589162826538, + "learning_rate": 9.105409153952845e-06, + "loss": 0.1622, + "step": 1300 + }, + { + "epoch": 0.9012816072047107, + "grad_norm": 1.0584131479263306, + "learning_rate": 9.104715672676838e-06, + "loss": 0.1425, + "step": 1301 + }, + { + "epoch": 0.9019743678559058, + "grad_norm": 1.1870489120483398, + "learning_rate": 9.104022191400833e-06, + "loss": 0.1862, + "step": 1302 + }, + { + "epoch": 0.9026671285071008, + "grad_norm": 1.0090534687042236, + "learning_rate": 9.103328710124826e-06, + "loss": 0.1605, + "step": 1303 + }, + { + "epoch": 0.9033598891582958, + "grad_norm": 0.8897512555122375, + "learning_rate": 9.102635228848821e-06, + "loss": 0.1438, + "step": 1304 + }, + { + "epoch": 0.9040526498094908, + "grad_norm": 1.1026703119277954, + "learning_rate": 9.101941747572816e-06, + "loss": 0.1527, + "step": 1305 + }, + { + "epoch": 0.9047454104606858, + "grad_norm": 0.974031925201416, + "learning_rate": 9.101248266296811e-06, + "loss": 0.1516, + "step": 1306 + }, + { + "epoch": 0.9054381711118809, + "grad_norm": 1.025345802307129, + "learning_rate": 9.100554785020806e-06, + "loss": 0.1565, + "step": 1307 + }, + { + "epoch": 0.9061309317630759, + "grad_norm": 1.143355131149292, + "learning_rate": 9.0998613037448e-06, + "loss": 0.1692, + "step": 1308 + }, + { + "epoch": 0.9068236924142709, + "grad_norm": 1.1369552612304688, + "learning_rate": 9.099167822468794e-06, + "loss": 0.1573, + "step": 1309 + }, + { + "epoch": 0.9075164530654659, + "grad_norm": 0.9756319522857666, + "learning_rate": 9.098474341192789e-06, + "loss": 0.1448, + "step": 1310 + }, + { + "epoch": 0.9082092137166609, + "grad_norm": 1.089788556098938, + "learning_rate": 9.097780859916782e-06, + "loss": 0.1767, + "step": 1311 + }, + { + "epoch": 0.9089019743678559, + "grad_norm": 0.9750568270683289, + "learning_rate": 9.097087378640777e-06, + "loss": 0.1628, + "step": 1312 + }, + { + "epoch": 0.909594735019051, + "grad_norm": 0.8872171640396118, + "learning_rate": 9.096393897364772e-06, + "loss": 0.093, + "step": 1313 + }, + { + "epoch": 0.9102874956702459, + "grad_norm": 1.1109753847122192, + "learning_rate": 9.095700416088765e-06, + "loss": 0.1749, + "step": 1314 + }, + { + "epoch": 0.910980256321441, + "grad_norm": 0.9836993217468262, + "learning_rate": 9.09500693481276e-06, + "loss": 0.1632, + "step": 1315 + }, + { + "epoch": 0.9116730169726359, + "grad_norm": 1.0990700721740723, + "learning_rate": 9.094313453536755e-06, + "loss": 0.1645, + "step": 1316 + }, + { + "epoch": 0.912365777623831, + "grad_norm": 0.9555688500404358, + "learning_rate": 9.09361997226075e-06, + "loss": 0.136, + "step": 1317 + }, + { + "epoch": 0.913058538275026, + "grad_norm": 1.057181477546692, + "learning_rate": 9.092926490984744e-06, + "loss": 0.1388, + "step": 1318 + }, + { + "epoch": 0.913751298926221, + "grad_norm": 1.0919501781463623, + "learning_rate": 9.092233009708738e-06, + "loss": 0.146, + "step": 1319 + }, + { + "epoch": 0.914444059577416, + "grad_norm": 0.9598453640937805, + "learning_rate": 9.091539528432733e-06, + "loss": 0.1565, + "step": 1320 + }, + { + "epoch": 0.915136820228611, + "grad_norm": 0.9124870896339417, + "learning_rate": 9.090846047156727e-06, + "loss": 0.1346, + "step": 1321 + }, + { + "epoch": 0.915829580879806, + "grad_norm": 1.0432202816009521, + "learning_rate": 9.090152565880722e-06, + "loss": 0.1416, + "step": 1322 + }, + { + "epoch": 0.9165223415310011, + "grad_norm": 0.9280151128768921, + "learning_rate": 9.089459084604716e-06, + "loss": 0.1473, + "step": 1323 + }, + { + "epoch": 0.917215102182196, + "grad_norm": 0.9922134876251221, + "learning_rate": 9.088765603328711e-06, + "loss": 0.1355, + "step": 1324 + }, + { + "epoch": 0.9179078628333911, + "grad_norm": 0.9754739999771118, + "learning_rate": 9.088072122052706e-06, + "loss": 0.1455, + "step": 1325 + }, + { + "epoch": 0.918600623484586, + "grad_norm": 1.052607774734497, + "learning_rate": 9.0873786407767e-06, + "loss": 0.1508, + "step": 1326 + }, + { + "epoch": 0.9192933841357811, + "grad_norm": 1.0654557943344116, + "learning_rate": 9.086685159500695e-06, + "loss": 0.1437, + "step": 1327 + }, + { + "epoch": 0.919986144786976, + "grad_norm": 1.0544451475143433, + "learning_rate": 9.085991678224688e-06, + "loss": 0.1565, + "step": 1328 + }, + { + "epoch": 0.9206789054381711, + "grad_norm": 1.0749974250793457, + "learning_rate": 9.085298196948683e-06, + "loss": 0.1582, + "step": 1329 + }, + { + "epoch": 0.9213716660893662, + "grad_norm": 0.9863892793655396, + "learning_rate": 9.084604715672678e-06, + "loss": 0.1438, + "step": 1330 + }, + { + "epoch": 0.9220644267405611, + "grad_norm": 1.0883053541183472, + "learning_rate": 9.083911234396673e-06, + "loss": 0.1725, + "step": 1331 + }, + { + "epoch": 0.9227571873917562, + "grad_norm": 0.9873329401016235, + "learning_rate": 9.083217753120668e-06, + "loss": 0.132, + "step": 1332 + }, + { + "epoch": 0.9234499480429511, + "grad_norm": 1.089841365814209, + "learning_rate": 9.08252427184466e-06, + "loss": 0.1607, + "step": 1333 + }, + { + "epoch": 0.9241427086941462, + "grad_norm": 1.1246720552444458, + "learning_rate": 9.081830790568656e-06, + "loss": 0.1873, + "step": 1334 + }, + { + "epoch": 0.9248354693453412, + "grad_norm": 0.9264191389083862, + "learning_rate": 9.08113730929265e-06, + "loss": 0.1419, + "step": 1335 + }, + { + "epoch": 0.9255282299965362, + "grad_norm": 1.1533194780349731, + "learning_rate": 9.080443828016644e-06, + "loss": 0.171, + "step": 1336 + }, + { + "epoch": 0.9262209906477312, + "grad_norm": 1.1083037853240967, + "learning_rate": 9.079750346740639e-06, + "loss": 0.1362, + "step": 1337 + }, + { + "epoch": 0.9269137512989262, + "grad_norm": 1.1557817459106445, + "learning_rate": 9.079056865464632e-06, + "loss": 0.1533, + "step": 1338 + }, + { + "epoch": 0.9276065119501212, + "grad_norm": 0.9090328812599182, + "learning_rate": 9.078363384188627e-06, + "loss": 0.1521, + "step": 1339 + }, + { + "epoch": 0.9282992726013163, + "grad_norm": 0.977802574634552, + "learning_rate": 9.077669902912622e-06, + "loss": 0.1684, + "step": 1340 + }, + { + "epoch": 0.9289920332525112, + "grad_norm": 0.9049829244613647, + "learning_rate": 9.076976421636617e-06, + "loss": 0.138, + "step": 1341 + }, + { + "epoch": 0.9296847939037063, + "grad_norm": 1.0639560222625732, + "learning_rate": 9.076282940360612e-06, + "loss": 0.1769, + "step": 1342 + }, + { + "epoch": 0.9303775545549012, + "grad_norm": 0.9426189064979553, + "learning_rate": 9.075589459084605e-06, + "loss": 0.1313, + "step": 1343 + }, + { + "epoch": 0.9310703152060963, + "grad_norm": 0.97137850522995, + "learning_rate": 9.0748959778086e-06, + "loss": 0.1614, + "step": 1344 + }, + { + "epoch": 0.9317630758572913, + "grad_norm": 1.0211855173110962, + "learning_rate": 9.074202496532595e-06, + "loss": 0.1647, + "step": 1345 + }, + { + "epoch": 0.9324558365084863, + "grad_norm": 0.9157480597496033, + "learning_rate": 9.073509015256588e-06, + "loss": 0.1517, + "step": 1346 + }, + { + "epoch": 0.9331485971596813, + "grad_norm": 1.0401955842971802, + "learning_rate": 9.072815533980583e-06, + "loss": 0.1677, + "step": 1347 + }, + { + "epoch": 0.9338413578108763, + "grad_norm": 1.2260417938232422, + "learning_rate": 9.072122052704578e-06, + "loss": 0.1758, + "step": 1348 + }, + { + "epoch": 0.9345341184620714, + "grad_norm": 1.0605547428131104, + "learning_rate": 9.071428571428573e-06, + "loss": 0.1495, + "step": 1349 + }, + { + "epoch": 0.9352268791132664, + "grad_norm": 0.9659693241119385, + "learning_rate": 9.070735090152568e-06, + "loss": 0.1332, + "step": 1350 + }, + { + "epoch": 0.9359196397644614, + "grad_norm": 0.9812120199203491, + "learning_rate": 9.070041608876561e-06, + "loss": 0.1333, + "step": 1351 + }, + { + "epoch": 0.9366124004156564, + "grad_norm": 0.9992308616638184, + "learning_rate": 9.069348127600556e-06, + "loss": 0.1607, + "step": 1352 + }, + { + "epoch": 0.9373051610668514, + "grad_norm": 1.1059376001358032, + "learning_rate": 9.06865464632455e-06, + "loss": 0.1627, + "step": 1353 + }, + { + "epoch": 0.9379979217180464, + "grad_norm": 1.1383405923843384, + "learning_rate": 9.067961165048544e-06, + "loss": 0.1438, + "step": 1354 + }, + { + "epoch": 0.9386906823692415, + "grad_norm": 1.1224113702774048, + "learning_rate": 9.067267683772539e-06, + "loss": 0.1716, + "step": 1355 + }, + { + "epoch": 0.9393834430204364, + "grad_norm": 1.0540070533752441, + "learning_rate": 9.066574202496532e-06, + "loss": 0.1475, + "step": 1356 + }, + { + "epoch": 0.9400762036716315, + "grad_norm": 0.9050190448760986, + "learning_rate": 9.065880721220527e-06, + "loss": 0.1358, + "step": 1357 + }, + { + "epoch": 0.9407689643228264, + "grad_norm": 0.995415985584259, + "learning_rate": 9.065187239944522e-06, + "loss": 0.1481, + "step": 1358 + }, + { + "epoch": 0.9414617249740215, + "grad_norm": 1.6251403093338013, + "learning_rate": 9.064493758668517e-06, + "loss": 0.1532, + "step": 1359 + }, + { + "epoch": 0.9421544856252165, + "grad_norm": 1.0649985074996948, + "learning_rate": 9.063800277392512e-06, + "loss": 0.1583, + "step": 1360 + }, + { + "epoch": 0.9428472462764115, + "grad_norm": 1.1055551767349243, + "learning_rate": 9.063106796116505e-06, + "loss": 0.1727, + "step": 1361 + }, + { + "epoch": 0.9435400069276065, + "grad_norm": 1.0746663808822632, + "learning_rate": 9.0624133148405e-06, + "loss": 0.1673, + "step": 1362 + }, + { + "epoch": 0.9442327675788015, + "grad_norm": 0.9959187507629395, + "learning_rate": 9.061719833564494e-06, + "loss": 0.1528, + "step": 1363 + }, + { + "epoch": 0.9449255282299965, + "grad_norm": 1.0603806972503662, + "learning_rate": 9.061026352288488e-06, + "loss": 0.1448, + "step": 1364 + }, + { + "epoch": 0.9456182888811916, + "grad_norm": 1.0264545679092407, + "learning_rate": 9.060332871012483e-06, + "loss": 0.1269, + "step": 1365 + }, + { + "epoch": 0.9463110495323865, + "grad_norm": 1.0569376945495605, + "learning_rate": 9.059639389736478e-06, + "loss": 0.16, + "step": 1366 + }, + { + "epoch": 0.9470038101835816, + "grad_norm": 0.9829663038253784, + "learning_rate": 9.058945908460473e-06, + "loss": 0.1507, + "step": 1367 + }, + { + "epoch": 0.9476965708347765, + "grad_norm": 1.0312482118606567, + "learning_rate": 9.058252427184466e-06, + "loss": 0.1346, + "step": 1368 + }, + { + "epoch": 0.9483893314859716, + "grad_norm": 1.0962169170379639, + "learning_rate": 9.057558945908461e-06, + "loss": 0.1874, + "step": 1369 + }, + { + "epoch": 0.9490820921371667, + "grad_norm": 1.0141427516937256, + "learning_rate": 9.056865464632456e-06, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 0.9497748527883616, + "grad_norm": 1.0795923471450806, + "learning_rate": 9.05617198335645e-06, + "loss": 0.1631, + "step": 1371 + }, + { + "epoch": 0.9504676134395567, + "grad_norm": 0.9833242297172546, + "learning_rate": 9.055478502080445e-06, + "loss": 0.1337, + "step": 1372 + }, + { + "epoch": 0.9511603740907516, + "grad_norm": 1.0700855255126953, + "learning_rate": 9.054785020804438e-06, + "loss": 0.1398, + "step": 1373 + }, + { + "epoch": 0.9518531347419467, + "grad_norm": 0.9360601902008057, + "learning_rate": 9.054091539528433e-06, + "loss": 0.1494, + "step": 1374 + }, + { + "epoch": 0.9525458953931417, + "grad_norm": 1.0651384592056274, + "learning_rate": 9.053398058252428e-06, + "loss": 0.1443, + "step": 1375 + }, + { + "epoch": 0.9532386560443367, + "grad_norm": 1.0867021083831787, + "learning_rate": 9.052704576976423e-06, + "loss": 0.1655, + "step": 1376 + }, + { + "epoch": 0.9539314166955317, + "grad_norm": 1.0285042524337769, + "learning_rate": 9.052011095700417e-06, + "loss": 0.15, + "step": 1377 + }, + { + "epoch": 0.9546241773467267, + "grad_norm": 1.0461286306381226, + "learning_rate": 9.05131761442441e-06, + "loss": 0.1586, + "step": 1378 + }, + { + "epoch": 0.9553169379979217, + "grad_norm": 1.0037174224853516, + "learning_rate": 9.050624133148406e-06, + "loss": 0.1109, + "step": 1379 + }, + { + "epoch": 0.9560096986491168, + "grad_norm": 0.9715021848678589, + "learning_rate": 9.0499306518724e-06, + "loss": 0.1604, + "step": 1380 + }, + { + "epoch": 0.9567024593003117, + "grad_norm": 1.1860789060592651, + "learning_rate": 9.049237170596394e-06, + "loss": 0.1528, + "step": 1381 + }, + { + "epoch": 0.9573952199515068, + "grad_norm": 0.952221155166626, + "learning_rate": 9.048543689320389e-06, + "loss": 0.1345, + "step": 1382 + }, + { + "epoch": 0.9580879806027017, + "grad_norm": 1.0256909132003784, + "learning_rate": 9.047850208044384e-06, + "loss": 0.1694, + "step": 1383 + }, + { + "epoch": 0.9587807412538968, + "grad_norm": 1.0124690532684326, + "learning_rate": 9.047156726768379e-06, + "loss": 0.1369, + "step": 1384 + }, + { + "epoch": 0.9594735019050918, + "grad_norm": 1.1516591310501099, + "learning_rate": 9.046463245492374e-06, + "loss": 0.1482, + "step": 1385 + }, + { + "epoch": 0.9601662625562868, + "grad_norm": 1.0505039691925049, + "learning_rate": 9.045769764216367e-06, + "loss": 0.1585, + "step": 1386 + }, + { + "epoch": 0.9608590232074818, + "grad_norm": 1.0213027000427246, + "learning_rate": 9.045076282940362e-06, + "loss": 0.1236, + "step": 1387 + }, + { + "epoch": 0.9615517838586768, + "grad_norm": 0.9856483936309814, + "learning_rate": 9.044382801664355e-06, + "loss": 0.133, + "step": 1388 + }, + { + "epoch": 0.9622445445098718, + "grad_norm": 1.0869338512420654, + "learning_rate": 9.04368932038835e-06, + "loss": 0.1758, + "step": 1389 + }, + { + "epoch": 0.9629373051610669, + "grad_norm": 0.9334696531295776, + "learning_rate": 9.042995839112345e-06, + "loss": 0.1199, + "step": 1390 + }, + { + "epoch": 0.9636300658122618, + "grad_norm": 1.0358421802520752, + "learning_rate": 9.042302357836338e-06, + "loss": 0.1334, + "step": 1391 + }, + { + "epoch": 0.9643228264634569, + "grad_norm": 1.0631022453308105, + "learning_rate": 9.041608876560333e-06, + "loss": 0.1376, + "step": 1392 + }, + { + "epoch": 0.9650155871146519, + "grad_norm": 0.982193112373352, + "learning_rate": 9.040915395284328e-06, + "loss": 0.1393, + "step": 1393 + }, + { + "epoch": 0.9657083477658469, + "grad_norm": 0.9237812161445618, + "learning_rate": 9.040221914008323e-06, + "loss": 0.1341, + "step": 1394 + }, + { + "epoch": 0.966401108417042, + "grad_norm": 0.9706209897994995, + "learning_rate": 9.039528432732318e-06, + "loss": 0.1439, + "step": 1395 + }, + { + "epoch": 0.9670938690682369, + "grad_norm": 0.952503502368927, + "learning_rate": 9.038834951456311e-06, + "loss": 0.1498, + "step": 1396 + }, + { + "epoch": 0.967786629719432, + "grad_norm": 1.0859705209732056, + "learning_rate": 9.038141470180306e-06, + "loss": 0.1552, + "step": 1397 + }, + { + "epoch": 0.9684793903706269, + "grad_norm": 1.0285351276397705, + "learning_rate": 9.0374479889043e-06, + "loss": 0.1636, + "step": 1398 + }, + { + "epoch": 0.969172151021822, + "grad_norm": 1.1443272829055786, + "learning_rate": 9.036754507628294e-06, + "loss": 0.1717, + "step": 1399 + }, + { + "epoch": 0.969864911673017, + "grad_norm": 1.1059259176254272, + "learning_rate": 9.036061026352289e-06, + "loss": 0.1876, + "step": 1400 + }, + { + "epoch": 0.970557672324212, + "grad_norm": 1.0529000759124756, + "learning_rate": 9.035367545076284e-06, + "loss": 0.1507, + "step": 1401 + }, + { + "epoch": 0.971250432975407, + "grad_norm": 1.1878215074539185, + "learning_rate": 9.034674063800279e-06, + "loss": 0.1675, + "step": 1402 + }, + { + "epoch": 0.971943193626602, + "grad_norm": 1.0230395793914795, + "learning_rate": 9.033980582524272e-06, + "loss": 0.1408, + "step": 1403 + }, + { + "epoch": 0.972635954277797, + "grad_norm": 0.9114059805870056, + "learning_rate": 9.033287101248267e-06, + "loss": 0.155, + "step": 1404 + }, + { + "epoch": 0.973328714928992, + "grad_norm": 0.9873482584953308, + "learning_rate": 9.032593619972262e-06, + "loss": 0.143, + "step": 1405 + }, + { + "epoch": 0.974021475580187, + "grad_norm": 0.9903661012649536, + "learning_rate": 9.031900138696255e-06, + "loss": 0.1368, + "step": 1406 + }, + { + "epoch": 0.9747142362313821, + "grad_norm": 0.9592527747154236, + "learning_rate": 9.03120665742025e-06, + "loss": 0.14, + "step": 1407 + }, + { + "epoch": 0.975406996882577, + "grad_norm": 0.9270055294036865, + "learning_rate": 9.030513176144245e-06, + "loss": 0.1252, + "step": 1408 + }, + { + "epoch": 0.9760997575337721, + "grad_norm": 1.0739645957946777, + "learning_rate": 9.02981969486824e-06, + "loss": 0.1686, + "step": 1409 + }, + { + "epoch": 0.976792518184967, + "grad_norm": 0.9901601076126099, + "learning_rate": 9.029126213592233e-06, + "loss": 0.1307, + "step": 1410 + }, + { + "epoch": 0.9774852788361621, + "grad_norm": 1.122162103652954, + "learning_rate": 9.028432732316228e-06, + "loss": 0.1714, + "step": 1411 + }, + { + "epoch": 0.9781780394873572, + "grad_norm": 1.008817195892334, + "learning_rate": 9.027739251040223e-06, + "loss": 0.161, + "step": 1412 + }, + { + "epoch": 0.9788708001385521, + "grad_norm": 1.0328139066696167, + "learning_rate": 9.027045769764216e-06, + "loss": 0.1472, + "step": 1413 + }, + { + "epoch": 0.9795635607897472, + "grad_norm": 1.090971827507019, + "learning_rate": 9.026352288488211e-06, + "loss": 0.1416, + "step": 1414 + }, + { + "epoch": 0.9802563214409421, + "grad_norm": 0.9473955035209656, + "learning_rate": 9.025658807212206e-06, + "loss": 0.141, + "step": 1415 + }, + { + "epoch": 0.9809490820921372, + "grad_norm": 0.8541167378425598, + "learning_rate": 9.0249653259362e-06, + "loss": 0.1232, + "step": 1416 + }, + { + "epoch": 0.9816418427433322, + "grad_norm": 0.8578402400016785, + "learning_rate": 9.024271844660194e-06, + "loss": 0.1119, + "step": 1417 + }, + { + "epoch": 0.9823346033945272, + "grad_norm": 0.9830331206321716, + "learning_rate": 9.02357836338419e-06, + "loss": 0.136, + "step": 1418 + }, + { + "epoch": 0.9830273640457222, + "grad_norm": 1.096903681755066, + "learning_rate": 9.022884882108184e-06, + "loss": 0.1689, + "step": 1419 + }, + { + "epoch": 0.9837201246969172, + "grad_norm": 0.9177114367485046, + "learning_rate": 9.02219140083218e-06, + "loss": 0.1104, + "step": 1420 + }, + { + "epoch": 0.9844128853481122, + "grad_norm": 0.9560793042182922, + "learning_rate": 9.021497919556173e-06, + "loss": 0.1328, + "step": 1421 + }, + { + "epoch": 0.9851056459993073, + "grad_norm": 1.094313383102417, + "learning_rate": 9.020804438280167e-06, + "loss": 0.1503, + "step": 1422 + }, + { + "epoch": 0.9857984066505022, + "grad_norm": 1.0043598413467407, + "learning_rate": 9.02011095700416e-06, + "loss": 0.1547, + "step": 1423 + }, + { + "epoch": 0.9864911673016973, + "grad_norm": 0.963378369808197, + "learning_rate": 9.019417475728156e-06, + "loss": 0.1303, + "step": 1424 + }, + { + "epoch": 0.9871839279528922, + "grad_norm": 1.1278101205825806, + "learning_rate": 9.01872399445215e-06, + "loss": 0.1754, + "step": 1425 + }, + { + "epoch": 0.9878766886040873, + "grad_norm": 1.0143046379089355, + "learning_rate": 9.018030513176146e-06, + "loss": 0.1554, + "step": 1426 + }, + { + "epoch": 0.9885694492552823, + "grad_norm": 1.0028659105300903, + "learning_rate": 9.01733703190014e-06, + "loss": 0.1473, + "step": 1427 + }, + { + "epoch": 0.9892622099064773, + "grad_norm": 1.020531177520752, + "learning_rate": 9.016643550624134e-06, + "loss": 0.1401, + "step": 1428 + }, + { + "epoch": 0.9899549705576723, + "grad_norm": 0.9271031022071838, + "learning_rate": 9.015950069348129e-06, + "loss": 0.1372, + "step": 1429 + }, + { + "epoch": 0.9906477312088673, + "grad_norm": 1.1084705591201782, + "learning_rate": 9.015256588072124e-06, + "loss": 0.1726, + "step": 1430 + }, + { + "epoch": 0.9913404918600623, + "grad_norm": 0.9905039072036743, + "learning_rate": 9.014563106796117e-06, + "loss": 0.1245, + "step": 1431 + }, + { + "epoch": 0.9920332525112574, + "grad_norm": 1.114532470703125, + "learning_rate": 9.013869625520112e-06, + "loss": 0.1773, + "step": 1432 + }, + { + "epoch": 0.9927260131624523, + "grad_norm": 0.9326059818267822, + "learning_rate": 9.013176144244105e-06, + "loss": 0.1575, + "step": 1433 + }, + { + "epoch": 0.9934187738136474, + "grad_norm": 1.106297254562378, + "learning_rate": 9.0124826629681e-06, + "loss": 0.1667, + "step": 1434 + }, + { + "epoch": 0.9941115344648424, + "grad_norm": 1.1321109533309937, + "learning_rate": 9.011789181692095e-06, + "loss": 0.1443, + "step": 1435 + }, + { + "epoch": 0.9948042951160374, + "grad_norm": 0.9964682459831238, + "learning_rate": 9.01109570041609e-06, + "loss": 0.1427, + "step": 1436 + }, + { + "epoch": 0.9954970557672325, + "grad_norm": 1.0049324035644531, + "learning_rate": 9.010402219140085e-06, + "loss": 0.142, + "step": 1437 + }, + { + "epoch": 0.9961898164184274, + "grad_norm": 1.120507001876831, + "learning_rate": 9.009708737864078e-06, + "loss": 0.1791, + "step": 1438 + }, + { + "epoch": 0.9968825770696225, + "grad_norm": 1.191520094871521, + "learning_rate": 9.009015256588073e-06, + "loss": 0.1568, + "step": 1439 + }, + { + "epoch": 0.9975753377208174, + "grad_norm": 1.0339500904083252, + "learning_rate": 9.008321775312068e-06, + "loss": 0.1435, + "step": 1440 + }, + { + "epoch": 0.9982680983720125, + "grad_norm": 1.1104273796081543, + "learning_rate": 9.007628294036061e-06, + "loss": 0.1516, + "step": 1441 + }, + { + "epoch": 0.9989608590232075, + "grad_norm": 1.2329273223876953, + "learning_rate": 9.006934812760056e-06, + "loss": 0.1517, + "step": 1442 + }, + { + "epoch": 0.9996536196744025, + "grad_norm": 1.001638650894165, + "learning_rate": 9.006241331484051e-06, + "loss": 0.1419, + "step": 1443 + }, + { + "epoch": 0.9996536196744025, + "eval_loss": 0.25841060280799866, + "eval_runtime": 7627.0462, + "eval_samples_per_second": 1.049, + "eval_steps_per_second": 0.033, + "eval_wer": 14.441454977462525, + "step": 1443 + }, + { + "epoch": 1.0003463803255974, + "grad_norm": 0.8515987992286682, + "learning_rate": 9.005547850208046e-06, + "loss": 0.1159, + "step": 1444 + }, + { + "epoch": 1.0010391409767925, + "grad_norm": 0.6200939416885376, + "learning_rate": 9.00485436893204e-06, + "loss": 0.0749, + "step": 1445 + }, + { + "epoch": 1.0017319016279875, + "grad_norm": 0.7645689249038696, + "learning_rate": 9.004160887656034e-06, + "loss": 0.1186, + "step": 1446 + }, + { + "epoch": 1.0024246622791826, + "grad_norm": 0.8087880611419678, + "learning_rate": 9.003467406380029e-06, + "loss": 0.1118, + "step": 1447 + }, + { + "epoch": 1.0031174229303776, + "grad_norm": 0.8951125741004944, + "learning_rate": 9.002773925104022e-06, + "loss": 0.1138, + "step": 1448 + }, + { + "epoch": 1.0038101835815725, + "grad_norm": 0.8107717037200928, + "learning_rate": 9.002080443828017e-06, + "loss": 0.1061, + "step": 1449 + }, + { + "epoch": 1.0045029442327675, + "grad_norm": 0.6840798258781433, + "learning_rate": 9.001386962552012e-06, + "loss": 0.0901, + "step": 1450 + }, + { + "epoch": 1.0051957048839626, + "grad_norm": 0.7962857484817505, + "learning_rate": 9.000693481276005e-06, + "loss": 0.0908, + "step": 1451 + }, + { + "epoch": 1.0058884655351576, + "grad_norm": 0.7427263855934143, + "learning_rate": 9e-06, + "loss": 0.091, + "step": 1452 + }, + { + "epoch": 1.0065812261863527, + "grad_norm": 0.9398947954177856, + "learning_rate": 8.999306518723995e-06, + "loss": 0.1119, + "step": 1453 + }, + { + "epoch": 1.0072739868375475, + "grad_norm": 0.801214873790741, + "learning_rate": 8.99861303744799e-06, + "loss": 0.1067, + "step": 1454 + }, + { + "epoch": 1.0079667474887426, + "grad_norm": 0.7821488976478577, + "learning_rate": 8.997919556171985e-06, + "loss": 0.1015, + "step": 1455 + }, + { + "epoch": 1.0086595081399377, + "grad_norm": 0.9047034382820129, + "learning_rate": 8.997226074895978e-06, + "loss": 0.1166, + "step": 1456 + }, + { + "epoch": 1.0093522687911327, + "grad_norm": 1.458219051361084, + "learning_rate": 8.996532593619973e-06, + "loss": 0.0968, + "step": 1457 + }, + { + "epoch": 1.0100450294423278, + "grad_norm": 0.8063825964927673, + "learning_rate": 8.995839112343966e-06, + "loss": 0.0941, + "step": 1458 + }, + { + "epoch": 1.0107377900935226, + "grad_norm": 0.8319393396377563, + "learning_rate": 8.995145631067961e-06, + "loss": 0.0862, + "step": 1459 + }, + { + "epoch": 1.0114305507447177, + "grad_norm": 0.9785870909690857, + "learning_rate": 8.994452149791956e-06, + "loss": 0.1093, + "step": 1460 + }, + { + "epoch": 1.0121233113959127, + "grad_norm": 0.8586106300354004, + "learning_rate": 8.993758668515951e-06, + "loss": 0.102, + "step": 1461 + }, + { + "epoch": 1.0128160720471078, + "grad_norm": 0.8335251808166504, + "learning_rate": 8.993065187239946e-06, + "loss": 0.1003, + "step": 1462 + }, + { + "epoch": 1.0135088326983028, + "grad_norm": 0.8018205165863037, + "learning_rate": 8.99237170596394e-06, + "loss": 0.0983, + "step": 1463 + }, + { + "epoch": 1.0142015933494977, + "grad_norm": 0.824069619178772, + "learning_rate": 8.991678224687934e-06, + "loss": 0.1018, + "step": 1464 + }, + { + "epoch": 1.0148943540006927, + "grad_norm": 0.8895369172096252, + "learning_rate": 8.99098474341193e-06, + "loss": 0.0986, + "step": 1465 + }, + { + "epoch": 1.0155871146518878, + "grad_norm": 0.9495205879211426, + "learning_rate": 8.990291262135923e-06, + "loss": 0.111, + "step": 1466 + }, + { + "epoch": 1.0162798753030828, + "grad_norm": 0.823100209236145, + "learning_rate": 8.989597780859917e-06, + "loss": 0.1117, + "step": 1467 + }, + { + "epoch": 1.016972635954278, + "grad_norm": 0.7357906103134155, + "learning_rate": 8.98890429958391e-06, + "loss": 0.0822, + "step": 1468 + }, + { + "epoch": 1.0176653966054727, + "grad_norm": 0.7708436846733093, + "learning_rate": 8.988210818307906e-06, + "loss": 0.0922, + "step": 1469 + }, + { + "epoch": 1.0183581572566678, + "grad_norm": 0.8186256885528564, + "learning_rate": 8.9875173370319e-06, + "loss": 0.0869, + "step": 1470 + }, + { + "epoch": 1.0190509179078628, + "grad_norm": 0.8974500894546509, + "learning_rate": 8.986823855755895e-06, + "loss": 0.1017, + "step": 1471 + }, + { + "epoch": 1.019743678559058, + "grad_norm": 0.9176140427589417, + "learning_rate": 8.98613037447989e-06, + "loss": 0.1124, + "step": 1472 + }, + { + "epoch": 1.020436439210253, + "grad_norm": 0.7861721515655518, + "learning_rate": 8.985436893203884e-06, + "loss": 0.0885, + "step": 1473 + }, + { + "epoch": 1.0211291998614478, + "grad_norm": 0.8407741189002991, + "learning_rate": 8.984743411927879e-06, + "loss": 0.0964, + "step": 1474 + }, + { + "epoch": 1.0218219605126428, + "grad_norm": 0.7929752469062805, + "learning_rate": 8.984049930651874e-06, + "loss": 0.0907, + "step": 1475 + }, + { + "epoch": 1.022514721163838, + "grad_norm": 0.8151705265045166, + "learning_rate": 8.983356449375867e-06, + "loss": 0.0844, + "step": 1476 + }, + { + "epoch": 1.023207481815033, + "grad_norm": 0.9558108448982239, + "learning_rate": 8.982662968099862e-06, + "loss": 0.1124, + "step": 1477 + }, + { + "epoch": 1.023900242466228, + "grad_norm": 0.9324179291725159, + "learning_rate": 8.981969486823857e-06, + "loss": 0.1119, + "step": 1478 + }, + { + "epoch": 1.0245930031174229, + "grad_norm": 0.8692248463630676, + "learning_rate": 8.981276005547852e-06, + "loss": 0.1018, + "step": 1479 + }, + { + "epoch": 1.025285763768618, + "grad_norm": 0.80369633436203, + "learning_rate": 8.980582524271847e-06, + "loss": 0.0885, + "step": 1480 + }, + { + "epoch": 1.025978524419813, + "grad_norm": 0.779391884803772, + "learning_rate": 8.97988904299584e-06, + "loss": 0.0921, + "step": 1481 + }, + { + "epoch": 1.026671285071008, + "grad_norm": 0.7021351456642151, + "learning_rate": 8.979195561719835e-06, + "loss": 0.0738, + "step": 1482 + }, + { + "epoch": 1.027364045722203, + "grad_norm": 0.7086558938026428, + "learning_rate": 8.978502080443828e-06, + "loss": 0.0689, + "step": 1483 + }, + { + "epoch": 1.028056806373398, + "grad_norm": 0.7233958840370178, + "learning_rate": 8.977808599167823e-06, + "loss": 0.0841, + "step": 1484 + }, + { + "epoch": 1.028749567024593, + "grad_norm": 0.8533040881156921, + "learning_rate": 8.977115117891818e-06, + "loss": 0.1049, + "step": 1485 + }, + { + "epoch": 1.029442327675788, + "grad_norm": 0.8435163497924805, + "learning_rate": 8.976421636615813e-06, + "loss": 0.0876, + "step": 1486 + }, + { + "epoch": 1.030135088326983, + "grad_norm": 0.843229353427887, + "learning_rate": 8.975728155339806e-06, + "loss": 0.0919, + "step": 1487 + }, + { + "epoch": 1.0308278489781781, + "grad_norm": 0.8355031609535217, + "learning_rate": 8.975034674063801e-06, + "loss": 0.1028, + "step": 1488 + }, + { + "epoch": 1.031520609629373, + "grad_norm": 0.9322400093078613, + "learning_rate": 8.974341192787796e-06, + "loss": 0.0955, + "step": 1489 + }, + { + "epoch": 1.032213370280568, + "grad_norm": 0.7547881603240967, + "learning_rate": 8.97364771151179e-06, + "loss": 0.0795, + "step": 1490 + }, + { + "epoch": 1.032906130931763, + "grad_norm": 0.8543777465820312, + "learning_rate": 8.972954230235784e-06, + "loss": 0.0909, + "step": 1491 + }, + { + "epoch": 1.0335988915829581, + "grad_norm": 0.8199926614761353, + "learning_rate": 8.972260748959779e-06, + "loss": 0.0874, + "step": 1492 + }, + { + "epoch": 1.0342916522341532, + "grad_norm": 0.9519786238670349, + "learning_rate": 8.971567267683772e-06, + "loss": 0.1191, + "step": 1493 + }, + { + "epoch": 1.034984412885348, + "grad_norm": 0.7860363125801086, + "learning_rate": 8.970873786407767e-06, + "loss": 0.0838, + "step": 1494 + }, + { + "epoch": 1.035677173536543, + "grad_norm": 0.703731894493103, + "learning_rate": 8.970180305131762e-06, + "loss": 0.085, + "step": 1495 + }, + { + "epoch": 1.0363699341877382, + "grad_norm": 0.8515883088111877, + "learning_rate": 8.969486823855757e-06, + "loss": 0.0977, + "step": 1496 + }, + { + "epoch": 1.0370626948389332, + "grad_norm": 0.6951834559440613, + "learning_rate": 8.968793342579752e-06, + "loss": 0.0806, + "step": 1497 + }, + { + "epoch": 1.0377554554901283, + "grad_norm": 0.852260172367096, + "learning_rate": 8.968099861303745e-06, + "loss": 0.0939, + "step": 1498 + }, + { + "epoch": 1.038448216141323, + "grad_norm": 0.8648838996887207, + "learning_rate": 8.96740638002774e-06, + "loss": 0.1255, + "step": 1499 + }, + { + "epoch": 1.0391409767925182, + "grad_norm": 0.9019761681556702, + "learning_rate": 8.966712898751735e-06, + "loss": 0.1007, + "step": 1500 + }, + { + "epoch": 1.0398337374437132, + "grad_norm": 0.8064924478530884, + "learning_rate": 8.966019417475728e-06, + "loss": 0.1192, + "step": 1501 + }, + { + "epoch": 1.0405264980949083, + "grad_norm": 0.8667044043540955, + "learning_rate": 8.965325936199723e-06, + "loss": 0.0965, + "step": 1502 + }, + { + "epoch": 1.0412192587461033, + "grad_norm": 0.7102158069610596, + "learning_rate": 8.964632454923718e-06, + "loss": 0.0817, + "step": 1503 + }, + { + "epoch": 1.0419120193972982, + "grad_norm": 0.817099928855896, + "learning_rate": 8.963938973647713e-06, + "loss": 0.0991, + "step": 1504 + }, + { + "epoch": 1.0426047800484932, + "grad_norm": 0.9012547135353088, + "learning_rate": 8.963245492371708e-06, + "loss": 0.0949, + "step": 1505 + }, + { + "epoch": 1.0432975406996883, + "grad_norm": 0.7682470679283142, + "learning_rate": 8.962552011095701e-06, + "loss": 0.0781, + "step": 1506 + }, + { + "epoch": 1.0439903013508833, + "grad_norm": 0.7825487852096558, + "learning_rate": 8.961858529819696e-06, + "loss": 0.0961, + "step": 1507 + }, + { + "epoch": 1.0446830620020784, + "grad_norm": 0.7672064900398254, + "learning_rate": 8.96116504854369e-06, + "loss": 0.088, + "step": 1508 + }, + { + "epoch": 1.0453758226532732, + "grad_norm": 0.7983643412590027, + "learning_rate": 8.960471567267684e-06, + "loss": 0.087, + "step": 1509 + }, + { + "epoch": 1.0460685833044683, + "grad_norm": 0.9235342144966125, + "learning_rate": 8.95977808599168e-06, + "loss": 0.0993, + "step": 1510 + }, + { + "epoch": 1.0467613439556633, + "grad_norm": 0.8425189852714539, + "learning_rate": 8.959084604715673e-06, + "loss": 0.093, + "step": 1511 + }, + { + "epoch": 1.0474541046068584, + "grad_norm": 0.875891923904419, + "learning_rate": 8.958391123439667e-06, + "loss": 0.0995, + "step": 1512 + }, + { + "epoch": 1.0481468652580532, + "grad_norm": 0.8321284055709839, + "learning_rate": 8.957697642163662e-06, + "loss": 0.0837, + "step": 1513 + }, + { + "epoch": 1.0488396259092483, + "grad_norm": 1.001866340637207, + "learning_rate": 8.957004160887657e-06, + "loss": 0.0948, + "step": 1514 + }, + { + "epoch": 1.0495323865604433, + "grad_norm": 0.9124066233634949, + "learning_rate": 8.956310679611652e-06, + "loss": 0.0972, + "step": 1515 + }, + { + "epoch": 1.0502251472116384, + "grad_norm": 0.7963526844978333, + "learning_rate": 8.955617198335645e-06, + "loss": 0.0929, + "step": 1516 + }, + { + "epoch": 1.0509179078628335, + "grad_norm": 0.7871224880218506, + "learning_rate": 8.95492371705964e-06, + "loss": 0.0979, + "step": 1517 + }, + { + "epoch": 1.0516106685140283, + "grad_norm": 0.9203463792800903, + "learning_rate": 8.954230235783634e-06, + "loss": 0.0992, + "step": 1518 + }, + { + "epoch": 1.0523034291652233, + "grad_norm": 0.7667852640151978, + "learning_rate": 8.953536754507629e-06, + "loss": 0.081, + "step": 1519 + }, + { + "epoch": 1.0529961898164184, + "grad_norm": 0.912794291973114, + "learning_rate": 8.952843273231624e-06, + "loss": 0.0986, + "step": 1520 + }, + { + "epoch": 1.0536889504676135, + "grad_norm": 0.7972784042358398, + "learning_rate": 8.952149791955618e-06, + "loss": 0.0957, + "step": 1521 + }, + { + "epoch": 1.0543817111188085, + "grad_norm": 0.8647158741950989, + "learning_rate": 8.951456310679613e-06, + "loss": 0.092, + "step": 1522 + }, + { + "epoch": 1.0550744717700034, + "grad_norm": 0.8256508111953735, + "learning_rate": 8.950762829403607e-06, + "loss": 0.0988, + "step": 1523 + }, + { + "epoch": 1.0557672324211984, + "grad_norm": 0.7755040526390076, + "learning_rate": 8.950069348127602e-06, + "loss": 0.0925, + "step": 1524 + }, + { + "epoch": 1.0564599930723935, + "grad_norm": 0.8472358584403992, + "learning_rate": 8.949375866851596e-06, + "loss": 0.1258, + "step": 1525 + }, + { + "epoch": 1.0571527537235885, + "grad_norm": 0.8066132068634033, + "learning_rate": 8.94868238557559e-06, + "loss": 0.0856, + "step": 1526 + }, + { + "epoch": 1.0578455143747836, + "grad_norm": 0.8940655589103699, + "learning_rate": 8.947988904299585e-06, + "loss": 0.1053, + "step": 1527 + }, + { + "epoch": 1.0585382750259784, + "grad_norm": 0.8073523640632629, + "learning_rate": 8.947295423023578e-06, + "loss": 0.0955, + "step": 1528 + }, + { + "epoch": 1.0592310356771735, + "grad_norm": 0.853069007396698, + "learning_rate": 8.946601941747573e-06, + "loss": 0.0928, + "step": 1529 + }, + { + "epoch": 1.0599237963283685, + "grad_norm": 0.8059660196304321, + "learning_rate": 8.945908460471568e-06, + "loss": 0.101, + "step": 1530 + }, + { + "epoch": 1.0606165569795636, + "grad_norm": 0.8703159689903259, + "learning_rate": 8.945214979195563e-06, + "loss": 0.1047, + "step": 1531 + }, + { + "epoch": 1.0613093176307586, + "grad_norm": 0.7334396243095398, + "learning_rate": 8.944521497919558e-06, + "loss": 0.0834, + "step": 1532 + }, + { + "epoch": 1.0620020782819535, + "grad_norm": 0.8599957823753357, + "learning_rate": 8.943828016643551e-06, + "loss": 0.107, + "step": 1533 + }, + { + "epoch": 1.0626948389331485, + "grad_norm": 0.7433030605316162, + "learning_rate": 8.943134535367546e-06, + "loss": 0.0931, + "step": 1534 + }, + { + "epoch": 1.0633875995843436, + "grad_norm": 1.012636423110962, + "learning_rate": 8.94244105409154e-06, + "loss": 0.1233, + "step": 1535 + }, + { + "epoch": 1.0640803602355386, + "grad_norm": 0.7831315994262695, + "learning_rate": 8.941747572815534e-06, + "loss": 0.0896, + "step": 1536 + }, + { + "epoch": 1.0647731208867337, + "grad_norm": 0.8660885691642761, + "learning_rate": 8.941054091539529e-06, + "loss": 0.1041, + "step": 1537 + }, + { + "epoch": 1.0654658815379285, + "grad_norm": 0.6929388046264648, + "learning_rate": 8.940360610263524e-06, + "loss": 0.0682, + "step": 1538 + }, + { + "epoch": 1.0661586421891236, + "grad_norm": 0.750199556350708, + "learning_rate": 8.939667128987519e-06, + "loss": 0.0919, + "step": 1539 + }, + { + "epoch": 1.0668514028403187, + "grad_norm": 0.7908627390861511, + "learning_rate": 8.938973647711514e-06, + "loss": 0.0901, + "step": 1540 + }, + { + "epoch": 1.0675441634915137, + "grad_norm": 0.874664306640625, + "learning_rate": 8.938280166435507e-06, + "loss": 0.1096, + "step": 1541 + }, + { + "epoch": 1.0682369241427088, + "grad_norm": 0.8638962507247925, + "learning_rate": 8.937586685159502e-06, + "loss": 0.0977, + "step": 1542 + }, + { + "epoch": 1.0689296847939036, + "grad_norm": 0.7941625118255615, + "learning_rate": 8.936893203883495e-06, + "loss": 0.0892, + "step": 1543 + }, + { + "epoch": 1.0696224454450987, + "grad_norm": 0.7659174799919128, + "learning_rate": 8.93619972260749e-06, + "loss": 0.0772, + "step": 1544 + }, + { + "epoch": 1.0703152060962937, + "grad_norm": 0.778826117515564, + "learning_rate": 8.935506241331485e-06, + "loss": 0.0953, + "step": 1545 + }, + { + "epoch": 1.0710079667474888, + "grad_norm": 0.8357170224189758, + "learning_rate": 8.934812760055478e-06, + "loss": 0.1054, + "step": 1546 + }, + { + "epoch": 1.0717007273986838, + "grad_norm": 0.8270371556282043, + "learning_rate": 8.934119278779473e-06, + "loss": 0.0939, + "step": 1547 + }, + { + "epoch": 1.0723934880498787, + "grad_norm": 0.8877065777778625, + "learning_rate": 8.933425797503468e-06, + "loss": 0.0991, + "step": 1548 + }, + { + "epoch": 1.0730862487010737, + "grad_norm": 0.9467287063598633, + "learning_rate": 8.932732316227463e-06, + "loss": 0.0973, + "step": 1549 + }, + { + "epoch": 1.0737790093522688, + "grad_norm": 0.8550165891647339, + "learning_rate": 8.932038834951458e-06, + "loss": 0.1029, + "step": 1550 + }, + { + "epoch": 1.0744717700034638, + "grad_norm": 0.7602420449256897, + "learning_rate": 8.931345353675451e-06, + "loss": 0.1005, + "step": 1551 + }, + { + "epoch": 1.075164530654659, + "grad_norm": 0.9009706974029541, + "learning_rate": 8.930651872399446e-06, + "loss": 0.1016, + "step": 1552 + }, + { + "epoch": 1.0758572913058537, + "grad_norm": 0.8762458562850952, + "learning_rate": 8.92995839112344e-06, + "loss": 0.0928, + "step": 1553 + }, + { + "epoch": 1.0765500519570488, + "grad_norm": 0.8292327523231506, + "learning_rate": 8.929264909847434e-06, + "loss": 0.1012, + "step": 1554 + }, + { + "epoch": 1.0772428126082438, + "grad_norm": 0.8724935054779053, + "learning_rate": 8.92857142857143e-06, + "loss": 0.1072, + "step": 1555 + }, + { + "epoch": 1.077935573259439, + "grad_norm": 0.8319444060325623, + "learning_rate": 8.927877947295424e-06, + "loss": 0.0948, + "step": 1556 + }, + { + "epoch": 1.078628333910634, + "grad_norm": 0.7967873215675354, + "learning_rate": 8.927184466019419e-06, + "loss": 0.0887, + "step": 1557 + }, + { + "epoch": 1.0793210945618288, + "grad_norm": 0.8298776745796204, + "learning_rate": 8.926490984743412e-06, + "loss": 0.0836, + "step": 1558 + }, + { + "epoch": 1.0800138552130238, + "grad_norm": 0.890299916267395, + "learning_rate": 8.925797503467407e-06, + "loss": 0.1127, + "step": 1559 + }, + { + "epoch": 1.080706615864219, + "grad_norm": 0.9299351572990417, + "learning_rate": 8.925104022191402e-06, + "loss": 0.1148, + "step": 1560 + }, + { + "epoch": 1.081399376515414, + "grad_norm": 0.7534828186035156, + "learning_rate": 8.924410540915395e-06, + "loss": 0.0912, + "step": 1561 + }, + { + "epoch": 1.082092137166609, + "grad_norm": 0.952809751033783, + "learning_rate": 8.92371705963939e-06, + "loss": 0.1056, + "step": 1562 + }, + { + "epoch": 1.0827848978178038, + "grad_norm": 0.8834134936332703, + "learning_rate": 8.923023578363385e-06, + "loss": 0.1075, + "step": 1563 + }, + { + "epoch": 1.083477658468999, + "grad_norm": 0.7658727765083313, + "learning_rate": 8.922330097087379e-06, + "loss": 0.0875, + "step": 1564 + }, + { + "epoch": 1.084170419120194, + "grad_norm": 0.767088770866394, + "learning_rate": 8.921636615811373e-06, + "loss": 0.1049, + "step": 1565 + }, + { + "epoch": 1.084863179771389, + "grad_norm": 0.7463728189468384, + "learning_rate": 8.920943134535368e-06, + "loss": 0.0856, + "step": 1566 + }, + { + "epoch": 1.085555940422584, + "grad_norm": 0.9736886024475098, + "learning_rate": 8.920249653259363e-06, + "loss": 0.1124, + "step": 1567 + }, + { + "epoch": 1.086248701073779, + "grad_norm": 0.847335696220398, + "learning_rate": 8.919556171983357e-06, + "loss": 0.0988, + "step": 1568 + }, + { + "epoch": 1.086941461724974, + "grad_norm": 0.696940541267395, + "learning_rate": 8.918862690707352e-06, + "loss": 0.0836, + "step": 1569 + }, + { + "epoch": 1.087634222376169, + "grad_norm": 0.7993536591529846, + "learning_rate": 8.918169209431346e-06, + "loss": 0.0952, + "step": 1570 + }, + { + "epoch": 1.088326983027364, + "grad_norm": 0.8469255566596985, + "learning_rate": 8.91747572815534e-06, + "loss": 0.0812, + "step": 1571 + }, + { + "epoch": 1.0890197436785591, + "grad_norm": 0.9231520295143127, + "learning_rate": 8.916782246879335e-06, + "loss": 0.0992, + "step": 1572 + }, + { + "epoch": 1.089712504329754, + "grad_norm": 0.8420063853263855, + "learning_rate": 8.91608876560333e-06, + "loss": 0.0858, + "step": 1573 + }, + { + "epoch": 1.090405264980949, + "grad_norm": 1.1117023229599, + "learning_rate": 8.915395284327325e-06, + "loss": 0.0928, + "step": 1574 + }, + { + "epoch": 1.091098025632144, + "grad_norm": 0.7346429824829102, + "learning_rate": 8.91470180305132e-06, + "loss": 0.0895, + "step": 1575 + }, + { + "epoch": 1.0917907862833391, + "grad_norm": 0.8749075531959534, + "learning_rate": 8.914008321775313e-06, + "loss": 0.1101, + "step": 1576 + }, + { + "epoch": 1.0924835469345342, + "grad_norm": 0.840613067150116, + "learning_rate": 8.913314840499308e-06, + "loss": 0.1001, + "step": 1577 + }, + { + "epoch": 1.093176307585729, + "grad_norm": 0.89544278383255, + "learning_rate": 8.912621359223301e-06, + "loss": 0.1043, + "step": 1578 + }, + { + "epoch": 1.093869068236924, + "grad_norm": 0.9560450315475464, + "learning_rate": 8.911927877947296e-06, + "loss": 0.1288, + "step": 1579 + }, + { + "epoch": 1.0945618288881191, + "grad_norm": 0.9753036499023438, + "learning_rate": 8.91123439667129e-06, + "loss": 0.1171, + "step": 1580 + }, + { + "epoch": 1.0952545895393142, + "grad_norm": 0.7583956122398376, + "learning_rate": 8.910540915395286e-06, + "loss": 0.0955, + "step": 1581 + }, + { + "epoch": 1.0959473501905093, + "grad_norm": 0.7719088196754456, + "learning_rate": 8.90984743411928e-06, + "loss": 0.1065, + "step": 1582 + }, + { + "epoch": 1.096640110841704, + "grad_norm": 0.6964689493179321, + "learning_rate": 8.909153952843274e-06, + "loss": 0.0667, + "step": 1583 + }, + { + "epoch": 1.0973328714928992, + "grad_norm": 0.8248798847198486, + "learning_rate": 8.908460471567269e-06, + "loss": 0.0833, + "step": 1584 + }, + { + "epoch": 1.0980256321440942, + "grad_norm": 0.7117317914962769, + "learning_rate": 8.907766990291264e-06, + "loss": 0.0902, + "step": 1585 + }, + { + "epoch": 1.0987183927952893, + "grad_norm": 0.8334735631942749, + "learning_rate": 8.907073509015257e-06, + "loss": 0.1074, + "step": 1586 + }, + { + "epoch": 1.0994111534464843, + "grad_norm": 0.8360122442245483, + "learning_rate": 8.906380027739252e-06, + "loss": 0.1058, + "step": 1587 + }, + { + "epoch": 1.1001039140976792, + "grad_norm": 0.8382623791694641, + "learning_rate": 8.905686546463245e-06, + "loss": 0.1137, + "step": 1588 + }, + { + "epoch": 1.1007966747488742, + "grad_norm": 0.8029754161834717, + "learning_rate": 8.90499306518724e-06, + "loss": 0.1055, + "step": 1589 + }, + { + "epoch": 1.1014894354000693, + "grad_norm": 0.778613805770874, + "learning_rate": 8.904299583911235e-06, + "loss": 0.0887, + "step": 1590 + }, + { + "epoch": 1.1021821960512643, + "grad_norm": 0.8317983746528625, + "learning_rate": 8.90360610263523e-06, + "loss": 0.0856, + "step": 1591 + }, + { + "epoch": 1.1028749567024594, + "grad_norm": 0.8260476589202881, + "learning_rate": 8.902912621359225e-06, + "loss": 0.1022, + "step": 1592 + }, + { + "epoch": 1.1035677173536542, + "grad_norm": 0.8466697335243225, + "learning_rate": 8.902219140083218e-06, + "loss": 0.0942, + "step": 1593 + }, + { + "epoch": 1.1042604780048493, + "grad_norm": 0.8946962952613831, + "learning_rate": 8.901525658807213e-06, + "loss": 0.1236, + "step": 1594 + }, + { + "epoch": 1.1049532386560443, + "grad_norm": 0.8465958833694458, + "learning_rate": 8.900832177531208e-06, + "loss": 0.1087, + "step": 1595 + }, + { + "epoch": 1.1056459993072394, + "grad_norm": 0.8597657680511475, + "learning_rate": 8.900138696255201e-06, + "loss": 0.0858, + "step": 1596 + }, + { + "epoch": 1.1063387599584344, + "grad_norm": 0.9134311676025391, + "learning_rate": 8.899445214979196e-06, + "loss": 0.1294, + "step": 1597 + }, + { + "epoch": 1.1070315206096293, + "grad_norm": 0.7299925684928894, + "learning_rate": 8.898751733703191e-06, + "loss": 0.0828, + "step": 1598 + }, + { + "epoch": 1.1077242812608243, + "grad_norm": 0.8390952944755554, + "learning_rate": 8.898058252427186e-06, + "loss": 0.1073, + "step": 1599 + }, + { + "epoch": 1.1084170419120194, + "grad_norm": 0.877514660358429, + "learning_rate": 8.897364771151181e-06, + "loss": 0.0876, + "step": 1600 + }, + { + "epoch": 1.1091098025632145, + "grad_norm": 0.8509402871131897, + "learning_rate": 8.896671289875174e-06, + "loss": 0.087, + "step": 1601 + }, + { + "epoch": 1.1098025632144095, + "grad_norm": 0.8124068975448608, + "learning_rate": 8.895977808599169e-06, + "loss": 0.0997, + "step": 1602 + }, + { + "epoch": 1.1104953238656043, + "grad_norm": 0.8765443563461304, + "learning_rate": 8.895284327323162e-06, + "loss": 0.0911, + "step": 1603 + }, + { + "epoch": 1.1111880845167994, + "grad_norm": 0.8992905020713806, + "learning_rate": 8.894590846047157e-06, + "loss": 0.112, + "step": 1604 + }, + { + "epoch": 1.1118808451679945, + "grad_norm": 0.7098968625068665, + "learning_rate": 8.893897364771152e-06, + "loss": 0.0778, + "step": 1605 + }, + { + "epoch": 1.1125736058191895, + "grad_norm": 0.8729726076126099, + "learning_rate": 8.893203883495145e-06, + "loss": 0.109, + "step": 1606 + }, + { + "epoch": 1.1132663664703846, + "grad_norm": 0.8122535943984985, + "learning_rate": 8.89251040221914e-06, + "loss": 0.0805, + "step": 1607 + }, + { + "epoch": 1.1139591271215794, + "grad_norm": 0.9429517388343811, + "learning_rate": 8.891816920943135e-06, + "loss": 0.0881, + "step": 1608 + }, + { + "epoch": 1.1146518877727745, + "grad_norm": 0.8913496136665344, + "learning_rate": 8.89112343966713e-06, + "loss": 0.097, + "step": 1609 + }, + { + "epoch": 1.1153446484239695, + "grad_norm": 0.9648658633232117, + "learning_rate": 8.890429958391125e-06, + "loss": 0.1072, + "step": 1610 + }, + { + "epoch": 1.1160374090751646, + "grad_norm": 0.7665410041809082, + "learning_rate": 8.889736477115118e-06, + "loss": 0.0836, + "step": 1611 + }, + { + "epoch": 1.1167301697263596, + "grad_norm": 0.7544222474098206, + "learning_rate": 8.889042995839113e-06, + "loss": 0.0799, + "step": 1612 + }, + { + "epoch": 1.1174229303775545, + "grad_norm": 0.844616711139679, + "learning_rate": 8.888349514563107e-06, + "loss": 0.0927, + "step": 1613 + }, + { + "epoch": 1.1181156910287495, + "grad_norm": 0.8389930725097656, + "learning_rate": 8.887656033287102e-06, + "loss": 0.1, + "step": 1614 + }, + { + "epoch": 1.1188084516799446, + "grad_norm": 0.8759860396385193, + "learning_rate": 8.886962552011096e-06, + "loss": 0.095, + "step": 1615 + }, + { + "epoch": 1.1195012123311396, + "grad_norm": 0.8822001218795776, + "learning_rate": 8.886269070735091e-06, + "loss": 0.0838, + "step": 1616 + }, + { + "epoch": 1.1201939729823347, + "grad_norm": 0.7945178747177124, + "learning_rate": 8.885575589459086e-06, + "loss": 0.1027, + "step": 1617 + }, + { + "epoch": 1.1208867336335295, + "grad_norm": 0.8539460897445679, + "learning_rate": 8.88488210818308e-06, + "loss": 0.0953, + "step": 1618 + }, + { + "epoch": 1.1215794942847246, + "grad_norm": 0.8433850407600403, + "learning_rate": 8.884188626907074e-06, + "loss": 0.0942, + "step": 1619 + }, + { + "epoch": 1.1222722549359196, + "grad_norm": 0.964124321937561, + "learning_rate": 8.88349514563107e-06, + "loss": 0.1065, + "step": 1620 + }, + { + "epoch": 1.1229650155871147, + "grad_norm": 0.9237002730369568, + "learning_rate": 8.882801664355063e-06, + "loss": 0.1102, + "step": 1621 + }, + { + "epoch": 1.1236577762383098, + "grad_norm": 0.7764036655426025, + "learning_rate": 8.882108183079058e-06, + "loss": 0.0906, + "step": 1622 + }, + { + "epoch": 1.1243505368895046, + "grad_norm": 0.8339813351631165, + "learning_rate": 8.88141470180305e-06, + "loss": 0.0985, + "step": 1623 + }, + { + "epoch": 1.1250432975406996, + "grad_norm": 0.9000864028930664, + "learning_rate": 8.880721220527046e-06, + "loss": 0.1039, + "step": 1624 + }, + { + "epoch": 1.1257360581918947, + "grad_norm": 0.728771448135376, + "learning_rate": 8.88002773925104e-06, + "loss": 0.0855, + "step": 1625 + }, + { + "epoch": 1.1264288188430898, + "grad_norm": 0.8056138753890991, + "learning_rate": 8.879334257975036e-06, + "loss": 0.0926, + "step": 1626 + }, + { + "epoch": 1.1271215794942848, + "grad_norm": 0.8006919026374817, + "learning_rate": 8.87864077669903e-06, + "loss": 0.1135, + "step": 1627 + }, + { + "epoch": 1.1278143401454797, + "grad_norm": 0.7326475977897644, + "learning_rate": 8.877947295423024e-06, + "loss": 0.0942, + "step": 1628 + }, + { + "epoch": 1.1285071007966747, + "grad_norm": 0.7217555046081543, + "learning_rate": 8.877253814147019e-06, + "loss": 0.0806, + "step": 1629 + }, + { + "epoch": 1.1291998614478698, + "grad_norm": 0.8380305171012878, + "learning_rate": 8.876560332871014e-06, + "loss": 0.1011, + "step": 1630 + }, + { + "epoch": 1.1298926220990648, + "grad_norm": 0.7469961643218994, + "learning_rate": 8.875866851595007e-06, + "loss": 0.0755, + "step": 1631 + }, + { + "epoch": 1.1305853827502599, + "grad_norm": 0.8756946921348572, + "learning_rate": 8.875173370319002e-06, + "loss": 0.1159, + "step": 1632 + }, + { + "epoch": 1.1312781434014547, + "grad_norm": 0.7768692970275879, + "learning_rate": 8.874479889042997e-06, + "loss": 0.0601, + "step": 1633 + }, + { + "epoch": 1.1319709040526498, + "grad_norm": 0.7950308322906494, + "learning_rate": 8.873786407766992e-06, + "loss": 0.093, + "step": 1634 + }, + { + "epoch": 1.1326636647038448, + "grad_norm": 0.7664488554000854, + "learning_rate": 8.873092926490987e-06, + "loss": 0.0738, + "step": 1635 + }, + { + "epoch": 1.1333564253550399, + "grad_norm": 0.8197282552719116, + "learning_rate": 8.87239944521498e-06, + "loss": 0.1013, + "step": 1636 + }, + { + "epoch": 1.134049186006235, + "grad_norm": 0.9681476950645447, + "learning_rate": 8.871705963938975e-06, + "loss": 0.107, + "step": 1637 + }, + { + "epoch": 1.1347419466574298, + "grad_norm": 0.7873309850692749, + "learning_rate": 8.871012482662968e-06, + "loss": 0.0967, + "step": 1638 + }, + { + "epoch": 1.1354347073086248, + "grad_norm": 1.0039669275283813, + "learning_rate": 8.870319001386963e-06, + "loss": 0.1273, + "step": 1639 + }, + { + "epoch": 1.13612746795982, + "grad_norm": 0.7824607491493225, + "learning_rate": 8.869625520110958e-06, + "loss": 0.088, + "step": 1640 + }, + { + "epoch": 1.136820228611015, + "grad_norm": 0.7642351388931274, + "learning_rate": 8.868932038834953e-06, + "loss": 0.0859, + "step": 1641 + }, + { + "epoch": 1.13751298926221, + "grad_norm": 0.8798947334289551, + "learning_rate": 8.868238557558946e-06, + "loss": 0.0877, + "step": 1642 + }, + { + "epoch": 1.1382057499134048, + "grad_norm": 0.906349778175354, + "learning_rate": 8.867545076282941e-06, + "loss": 0.1013, + "step": 1643 + }, + { + "epoch": 1.1388985105646, + "grad_norm": 0.689426600933075, + "learning_rate": 8.866851595006936e-06, + "loss": 0.0748, + "step": 1644 + }, + { + "epoch": 1.139591271215795, + "grad_norm": 0.7668915390968323, + "learning_rate": 8.866158113730931e-06, + "loss": 0.0757, + "step": 1645 + }, + { + "epoch": 1.14028403186699, + "grad_norm": 0.9249112606048584, + "learning_rate": 8.865464632454924e-06, + "loss": 0.0987, + "step": 1646 + }, + { + "epoch": 1.140976792518185, + "grad_norm": 0.7976338267326355, + "learning_rate": 8.864771151178919e-06, + "loss": 0.0937, + "step": 1647 + }, + { + "epoch": 1.14166955316938, + "grad_norm": 1.0105061531066895, + "learning_rate": 8.864077669902912e-06, + "loss": 0.1253, + "step": 1648 + }, + { + "epoch": 1.142362313820575, + "grad_norm": 0.7926379442214966, + "learning_rate": 8.863384188626907e-06, + "loss": 0.0799, + "step": 1649 + }, + { + "epoch": 1.14305507447177, + "grad_norm": 0.7894308567047119, + "learning_rate": 8.862690707350902e-06, + "loss": 0.0848, + "step": 1650 + }, + { + "epoch": 1.143747835122965, + "grad_norm": 0.8707621097564697, + "learning_rate": 8.861997226074897e-06, + "loss": 0.0878, + "step": 1651 + }, + { + "epoch": 1.1444405957741601, + "grad_norm": 0.8926756978034973, + "learning_rate": 8.861303744798892e-06, + "loss": 0.0936, + "step": 1652 + }, + { + "epoch": 1.145133356425355, + "grad_norm": 0.957144021987915, + "learning_rate": 8.860610263522885e-06, + "loss": 0.0933, + "step": 1653 + }, + { + "epoch": 1.14582611707655, + "grad_norm": 0.9077473878860474, + "learning_rate": 8.85991678224688e-06, + "loss": 0.1079, + "step": 1654 + }, + { + "epoch": 1.146518877727745, + "grad_norm": 0.8787539601325989, + "learning_rate": 8.859223300970875e-06, + "loss": 0.1079, + "step": 1655 + }, + { + "epoch": 1.1472116383789401, + "grad_norm": 0.8024150133132935, + "learning_rate": 8.858529819694868e-06, + "loss": 0.0865, + "step": 1656 + }, + { + "epoch": 1.1479043990301352, + "grad_norm": 0.8101352453231812, + "learning_rate": 8.857836338418863e-06, + "loss": 0.0855, + "step": 1657 + }, + { + "epoch": 1.14859715968133, + "grad_norm": 0.7889779806137085, + "learning_rate": 8.857142857142858e-06, + "loss": 0.0801, + "step": 1658 + }, + { + "epoch": 1.149289920332525, + "grad_norm": 0.8348006010055542, + "learning_rate": 8.856449375866853e-06, + "loss": 0.0987, + "step": 1659 + }, + { + "epoch": 1.1499826809837201, + "grad_norm": 0.8121004700660706, + "learning_rate": 8.855755894590848e-06, + "loss": 0.0918, + "step": 1660 + }, + { + "epoch": 1.1506754416349152, + "grad_norm": 0.8209229111671448, + "learning_rate": 8.855062413314841e-06, + "loss": 0.0872, + "step": 1661 + }, + { + "epoch": 1.1513682022861103, + "grad_norm": 0.8757693767547607, + "learning_rate": 8.854368932038836e-06, + "loss": 0.1056, + "step": 1662 + }, + { + "epoch": 1.152060962937305, + "grad_norm": 0.7186292409896851, + "learning_rate": 8.85367545076283e-06, + "loss": 0.081, + "step": 1663 + }, + { + "epoch": 1.1527537235885001, + "grad_norm": 0.8405160307884216, + "learning_rate": 8.852981969486824e-06, + "loss": 0.1121, + "step": 1664 + }, + { + "epoch": 1.1534464842396952, + "grad_norm": 0.8855977058410645, + "learning_rate": 8.85228848821082e-06, + "loss": 0.0884, + "step": 1665 + }, + { + "epoch": 1.1541392448908903, + "grad_norm": 0.9416713118553162, + "learning_rate": 8.851595006934813e-06, + "loss": 0.0909, + "step": 1666 + }, + { + "epoch": 1.1548320055420853, + "grad_norm": 0.889167308807373, + "learning_rate": 8.850901525658808e-06, + "loss": 0.1148, + "step": 1667 + }, + { + "epoch": 1.1555247661932802, + "grad_norm": 0.8325088024139404, + "learning_rate": 8.850208044382803e-06, + "loss": 0.1016, + "step": 1668 + }, + { + "epoch": 1.1562175268444752, + "grad_norm": 0.8529247641563416, + "learning_rate": 8.849514563106797e-06, + "loss": 0.0905, + "step": 1669 + }, + { + "epoch": 1.1569102874956703, + "grad_norm": 0.8866456747055054, + "learning_rate": 8.848821081830792e-06, + "loss": 0.1206, + "step": 1670 + }, + { + "epoch": 1.1576030481468653, + "grad_norm": 0.8046126365661621, + "learning_rate": 8.848127600554786e-06, + "loss": 0.1144, + "step": 1671 + }, + { + "epoch": 1.1582958087980604, + "grad_norm": 0.7605208158493042, + "learning_rate": 8.84743411927878e-06, + "loss": 0.0903, + "step": 1672 + }, + { + "epoch": 1.1589885694492552, + "grad_norm": 0.829639732837677, + "learning_rate": 8.846740638002774e-06, + "loss": 0.0975, + "step": 1673 + }, + { + "epoch": 1.1596813301004503, + "grad_norm": 0.8354451656341553, + "learning_rate": 8.846047156726769e-06, + "loss": 0.0984, + "step": 1674 + }, + { + "epoch": 1.1603740907516453, + "grad_norm": 0.8832699656486511, + "learning_rate": 8.845353675450764e-06, + "loss": 0.1112, + "step": 1675 + }, + { + "epoch": 1.1610668514028404, + "grad_norm": 0.831802248954773, + "learning_rate": 8.844660194174759e-06, + "loss": 0.1032, + "step": 1676 + }, + { + "epoch": 1.1617596120540354, + "grad_norm": 0.96922367811203, + "learning_rate": 8.843966712898754e-06, + "loss": 0.0947, + "step": 1677 + }, + { + "epoch": 1.1624523727052303, + "grad_norm": 0.8046149015426636, + "learning_rate": 8.843273231622747e-06, + "loss": 0.092, + "step": 1678 + }, + { + "epoch": 1.1631451333564253, + "grad_norm": 0.8756119012832642, + "learning_rate": 8.842579750346742e-06, + "loss": 0.0863, + "step": 1679 + }, + { + "epoch": 1.1638378940076204, + "grad_norm": 0.7364367246627808, + "learning_rate": 8.841886269070737e-06, + "loss": 0.0884, + "step": 1680 + }, + { + "epoch": 1.1645306546588154, + "grad_norm": 0.7794440984725952, + "learning_rate": 8.84119278779473e-06, + "loss": 0.0883, + "step": 1681 + }, + { + "epoch": 1.1652234153100105, + "grad_norm": 0.8520834445953369, + "learning_rate": 8.840499306518725e-06, + "loss": 0.0972, + "step": 1682 + }, + { + "epoch": 1.1659161759612053, + "grad_norm": 0.8838192820549011, + "learning_rate": 8.839805825242718e-06, + "loss": 0.1158, + "step": 1683 + }, + { + "epoch": 1.1666089366124004, + "grad_norm": 0.6713674068450928, + "learning_rate": 8.839112343966713e-06, + "loss": 0.0729, + "step": 1684 + }, + { + "epoch": 1.1673016972635955, + "grad_norm": 0.8523823618888855, + "learning_rate": 8.838418862690708e-06, + "loss": 0.0849, + "step": 1685 + }, + { + "epoch": 1.1679944579147905, + "grad_norm": 0.8470174670219421, + "learning_rate": 8.837725381414703e-06, + "loss": 0.096, + "step": 1686 + }, + { + "epoch": 1.1686872185659856, + "grad_norm": 0.902706503868103, + "learning_rate": 8.837031900138698e-06, + "loss": 0.1046, + "step": 1687 + }, + { + "epoch": 1.1693799792171804, + "grad_norm": 0.7811160087585449, + "learning_rate": 8.836338418862691e-06, + "loss": 0.0935, + "step": 1688 + }, + { + "epoch": 1.1700727398683755, + "grad_norm": 0.8322646617889404, + "learning_rate": 8.835644937586686e-06, + "loss": 0.1052, + "step": 1689 + }, + { + "epoch": 1.1707655005195705, + "grad_norm": 0.8243199586868286, + "learning_rate": 8.834951456310681e-06, + "loss": 0.1135, + "step": 1690 + }, + { + "epoch": 1.1714582611707656, + "grad_norm": 0.8629660606384277, + "learning_rate": 8.834257975034674e-06, + "loss": 0.0952, + "step": 1691 + }, + { + "epoch": 1.1721510218219606, + "grad_norm": 1.0917249917984009, + "learning_rate": 8.833564493758669e-06, + "loss": 0.1336, + "step": 1692 + }, + { + "epoch": 1.1728437824731555, + "grad_norm": 0.9731141328811646, + "learning_rate": 8.832871012482664e-06, + "loss": 0.1084, + "step": 1693 + }, + { + "epoch": 1.1735365431243505, + "grad_norm": 0.8693202137947083, + "learning_rate": 8.832177531206659e-06, + "loss": 0.1009, + "step": 1694 + }, + { + "epoch": 1.1742293037755456, + "grad_norm": 0.8254092335700989, + "learning_rate": 8.831484049930654e-06, + "loss": 0.1141, + "step": 1695 + }, + { + "epoch": 1.1749220644267406, + "grad_norm": 0.8278314471244812, + "learning_rate": 8.830790568654647e-06, + "loss": 0.1116, + "step": 1696 + }, + { + "epoch": 1.1756148250779357, + "grad_norm": 0.8282114267349243, + "learning_rate": 8.830097087378642e-06, + "loss": 0.0848, + "step": 1697 + }, + { + "epoch": 1.1763075857291305, + "grad_norm": 0.8601030707359314, + "learning_rate": 8.829403606102635e-06, + "loss": 0.0974, + "step": 1698 + }, + { + "epoch": 1.1770003463803256, + "grad_norm": 1.0528501272201538, + "learning_rate": 8.82871012482663e-06, + "loss": 0.0954, + "step": 1699 + }, + { + "epoch": 1.1776931070315206, + "grad_norm": 0.8671848773956299, + "learning_rate": 8.828016643550625e-06, + "loss": 0.0896, + "step": 1700 + }, + { + "epoch": 1.1783858676827157, + "grad_norm": 1.3647528886795044, + "learning_rate": 8.827323162274618e-06, + "loss": 0.1201, + "step": 1701 + }, + { + "epoch": 1.1790786283339107, + "grad_norm": 0.7462443113327026, + "learning_rate": 8.826629680998613e-06, + "loss": 0.0791, + "step": 1702 + }, + { + "epoch": 1.1797713889851056, + "grad_norm": 0.8084028959274292, + "learning_rate": 8.825936199722608e-06, + "loss": 0.0817, + "step": 1703 + }, + { + "epoch": 1.1804641496363006, + "grad_norm": 0.763846218585968, + "learning_rate": 8.825242718446603e-06, + "loss": 0.0867, + "step": 1704 + }, + { + "epoch": 1.1811569102874957, + "grad_norm": 0.826973557472229, + "learning_rate": 8.824549237170598e-06, + "loss": 0.0959, + "step": 1705 + }, + { + "epoch": 1.1818496709386908, + "grad_norm": 0.8394191861152649, + "learning_rate": 8.823855755894591e-06, + "loss": 0.1102, + "step": 1706 + }, + { + "epoch": 1.1825424315898858, + "grad_norm": 0.9438965916633606, + "learning_rate": 8.823162274618586e-06, + "loss": 0.1097, + "step": 1707 + }, + { + "epoch": 1.1832351922410806, + "grad_norm": 0.8429800868034363, + "learning_rate": 8.82246879334258e-06, + "loss": 0.115, + "step": 1708 + }, + { + "epoch": 1.1839279528922757, + "grad_norm": 0.7064415216445923, + "learning_rate": 8.821775312066574e-06, + "loss": 0.0801, + "step": 1709 + }, + { + "epoch": 1.1846207135434708, + "grad_norm": 0.739035964012146, + "learning_rate": 8.82108183079057e-06, + "loss": 0.0824, + "step": 1710 + }, + { + "epoch": 1.1853134741946658, + "grad_norm": 0.7586328387260437, + "learning_rate": 8.820388349514564e-06, + "loss": 0.0789, + "step": 1711 + }, + { + "epoch": 1.1860062348458609, + "grad_norm": 0.8423470854759216, + "learning_rate": 8.81969486823856e-06, + "loss": 0.1163, + "step": 1712 + }, + { + "epoch": 1.1866989954970557, + "grad_norm": 0.8857660293579102, + "learning_rate": 8.819001386962552e-06, + "loss": 0.1072, + "step": 1713 + }, + { + "epoch": 1.1873917561482508, + "grad_norm": 0.9075539112091064, + "learning_rate": 8.818307905686547e-06, + "loss": 0.0941, + "step": 1714 + }, + { + "epoch": 1.1880845167994458, + "grad_norm": 0.831018328666687, + "learning_rate": 8.817614424410542e-06, + "loss": 0.0802, + "step": 1715 + }, + { + "epoch": 1.1887772774506409, + "grad_norm": 0.774970531463623, + "learning_rate": 8.816920943134536e-06, + "loss": 0.0829, + "step": 1716 + }, + { + "epoch": 1.189470038101836, + "grad_norm": 0.8437609672546387, + "learning_rate": 8.81622746185853e-06, + "loss": 0.0952, + "step": 1717 + }, + { + "epoch": 1.1901627987530308, + "grad_norm": 0.9007271528244019, + "learning_rate": 8.815533980582525e-06, + "loss": 0.0861, + "step": 1718 + }, + { + "epoch": 1.1908555594042258, + "grad_norm": 0.9031360745429993, + "learning_rate": 8.814840499306519e-06, + "loss": 0.1107, + "step": 1719 + }, + { + "epoch": 1.1915483200554209, + "grad_norm": 0.8673595190048218, + "learning_rate": 8.814147018030514e-06, + "loss": 0.1065, + "step": 1720 + }, + { + "epoch": 1.192241080706616, + "grad_norm": 0.7581225037574768, + "learning_rate": 8.813453536754509e-06, + "loss": 0.0901, + "step": 1721 + }, + { + "epoch": 1.192933841357811, + "grad_norm": 0.7888699173927307, + "learning_rate": 8.812760055478503e-06, + "loss": 0.0818, + "step": 1722 + }, + { + "epoch": 1.1936266020090058, + "grad_norm": 0.8125446438789368, + "learning_rate": 8.812066574202497e-06, + "loss": 0.0909, + "step": 1723 + }, + { + "epoch": 1.194319362660201, + "grad_norm": 0.8669037818908691, + "learning_rate": 8.811373092926492e-06, + "loss": 0.0983, + "step": 1724 + }, + { + "epoch": 1.195012123311396, + "grad_norm": 0.753842294216156, + "learning_rate": 8.810679611650487e-06, + "loss": 0.0855, + "step": 1725 + }, + { + "epoch": 1.195704883962591, + "grad_norm": 0.8449932336807251, + "learning_rate": 8.80998613037448e-06, + "loss": 0.0948, + "step": 1726 + }, + { + "epoch": 1.196397644613786, + "grad_norm": 0.7542533278465271, + "learning_rate": 8.809292649098475e-06, + "loss": 0.083, + "step": 1727 + }, + { + "epoch": 1.197090405264981, + "grad_norm": 0.9120563864707947, + "learning_rate": 8.80859916782247e-06, + "loss": 0.1076, + "step": 1728 + }, + { + "epoch": 1.197783165916176, + "grad_norm": 0.9268695712089539, + "learning_rate": 8.807905686546465e-06, + "loss": 0.0992, + "step": 1729 + }, + { + "epoch": 1.198475926567371, + "grad_norm": 0.8432742953300476, + "learning_rate": 8.80721220527046e-06, + "loss": 0.0937, + "step": 1730 + }, + { + "epoch": 1.199168687218566, + "grad_norm": 0.8582000732421875, + "learning_rate": 8.806518723994453e-06, + "loss": 0.1055, + "step": 1731 + }, + { + "epoch": 1.1998614478697611, + "grad_norm": 0.8746100664138794, + "learning_rate": 8.805825242718448e-06, + "loss": 0.0996, + "step": 1732 + }, + { + "epoch": 1.200554208520956, + "grad_norm": 0.7897584438323975, + "learning_rate": 8.805131761442441e-06, + "loss": 0.0952, + "step": 1733 + }, + { + "epoch": 1.201246969172151, + "grad_norm": 0.7623755931854248, + "learning_rate": 8.804438280166436e-06, + "loss": 0.0806, + "step": 1734 + }, + { + "epoch": 1.201939729823346, + "grad_norm": 0.8046366572380066, + "learning_rate": 8.803744798890431e-06, + "loss": 0.0665, + "step": 1735 + }, + { + "epoch": 1.2026324904745411, + "grad_norm": 0.8854013681411743, + "learning_rate": 8.803051317614426e-06, + "loss": 0.0926, + "step": 1736 + }, + { + "epoch": 1.203325251125736, + "grad_norm": 0.8266016244888306, + "learning_rate": 8.80235783633842e-06, + "loss": 0.086, + "step": 1737 + }, + { + "epoch": 1.204018011776931, + "grad_norm": 0.8264349699020386, + "learning_rate": 8.801664355062414e-06, + "loss": 0.0977, + "step": 1738 + }, + { + "epoch": 1.204710772428126, + "grad_norm": 0.7382345795631409, + "learning_rate": 8.800970873786409e-06, + "loss": 0.0804, + "step": 1739 + }, + { + "epoch": 1.2054035330793211, + "grad_norm": 0.7804838418960571, + "learning_rate": 8.800277392510404e-06, + "loss": 0.0832, + "step": 1740 + }, + { + "epoch": 1.2060962937305162, + "grad_norm": 0.7389581203460693, + "learning_rate": 8.799583911234397e-06, + "loss": 0.0901, + "step": 1741 + }, + { + "epoch": 1.206789054381711, + "grad_norm": 0.8908270001411438, + "learning_rate": 8.798890429958392e-06, + "loss": 0.0893, + "step": 1742 + }, + { + "epoch": 1.207481815032906, + "grad_norm": 0.9740622639656067, + "learning_rate": 8.798196948682385e-06, + "loss": 0.1231, + "step": 1743 + }, + { + "epoch": 1.2081745756841011, + "grad_norm": 0.813880205154419, + "learning_rate": 8.79750346740638e-06, + "loss": 0.1011, + "step": 1744 + }, + { + "epoch": 1.2088673363352962, + "grad_norm": 0.9801640510559082, + "learning_rate": 8.796809986130375e-06, + "loss": 0.0832, + "step": 1745 + }, + { + "epoch": 1.2095600969864913, + "grad_norm": 0.8074042201042175, + "learning_rate": 8.79611650485437e-06, + "loss": 0.0888, + "step": 1746 + }, + { + "epoch": 1.210252857637686, + "grad_norm": 0.7691810727119446, + "learning_rate": 8.795423023578365e-06, + "loss": 0.0989, + "step": 1747 + }, + { + "epoch": 1.2109456182888811, + "grad_norm": 0.8661003112792969, + "learning_rate": 8.794729542302358e-06, + "loss": 0.1081, + "step": 1748 + }, + { + "epoch": 1.2116383789400762, + "grad_norm": 0.9518867135047913, + "learning_rate": 8.794036061026353e-06, + "loss": 0.1168, + "step": 1749 + }, + { + "epoch": 1.2123311395912713, + "grad_norm": 0.955470085144043, + "learning_rate": 8.793342579750348e-06, + "loss": 0.1213, + "step": 1750 + }, + { + "epoch": 1.2130239002424663, + "grad_norm": 0.7917088866233826, + "learning_rate": 8.792649098474341e-06, + "loss": 0.0994, + "step": 1751 + }, + { + "epoch": 1.2137166608936611, + "grad_norm": 0.8858113884925842, + "learning_rate": 8.791955617198336e-06, + "loss": 0.1199, + "step": 1752 + }, + { + "epoch": 1.2144094215448562, + "grad_norm": 0.8482376337051392, + "learning_rate": 8.791262135922331e-06, + "loss": 0.0927, + "step": 1753 + }, + { + "epoch": 1.2151021821960513, + "grad_norm": 0.8622453212738037, + "learning_rate": 8.790568654646326e-06, + "loss": 0.0942, + "step": 1754 + }, + { + "epoch": 1.2157949428472463, + "grad_norm": 0.9274044632911682, + "learning_rate": 8.789875173370321e-06, + "loss": 0.0932, + "step": 1755 + }, + { + "epoch": 1.2164877034984414, + "grad_norm": 0.802811861038208, + "learning_rate": 8.789181692094314e-06, + "loss": 0.0765, + "step": 1756 + }, + { + "epoch": 1.2171804641496362, + "grad_norm": 0.7712103724479675, + "learning_rate": 8.78848821081831e-06, + "loss": 0.0786, + "step": 1757 + }, + { + "epoch": 1.2178732248008313, + "grad_norm": 1.124839186668396, + "learning_rate": 8.787794729542302e-06, + "loss": 0.1003, + "step": 1758 + }, + { + "epoch": 1.2185659854520263, + "grad_norm": 0.7888901233673096, + "learning_rate": 8.787101248266297e-06, + "loss": 0.0813, + "step": 1759 + }, + { + "epoch": 1.2192587461032214, + "grad_norm": 0.9014065265655518, + "learning_rate": 8.786407766990292e-06, + "loss": 0.1011, + "step": 1760 + }, + { + "epoch": 1.2199515067544164, + "grad_norm": 0.859272837638855, + "learning_rate": 8.785714285714286e-06, + "loss": 0.1052, + "step": 1761 + }, + { + "epoch": 1.2206442674056113, + "grad_norm": 0.7922794222831726, + "learning_rate": 8.78502080443828e-06, + "loss": 0.0974, + "step": 1762 + }, + { + "epoch": 1.2213370280568063, + "grad_norm": 0.8734545111656189, + "learning_rate": 8.784327323162275e-06, + "loss": 0.1074, + "step": 1763 + }, + { + "epoch": 1.2220297887080014, + "grad_norm": 0.8003416657447815, + "learning_rate": 8.78363384188627e-06, + "loss": 0.0943, + "step": 1764 + }, + { + "epoch": 1.2227225493591964, + "grad_norm": 0.7421976327896118, + "learning_rate": 8.782940360610265e-06, + "loss": 0.1014, + "step": 1765 + }, + { + "epoch": 1.2234153100103915, + "grad_norm": 0.8193487524986267, + "learning_rate": 8.782246879334259e-06, + "loss": 0.0969, + "step": 1766 + }, + { + "epoch": 1.2241080706615863, + "grad_norm": 0.9062775373458862, + "learning_rate": 8.781553398058253e-06, + "loss": 0.1, + "step": 1767 + }, + { + "epoch": 1.2248008313127814, + "grad_norm": 0.913105309009552, + "learning_rate": 8.780859916782247e-06, + "loss": 0.1151, + "step": 1768 + }, + { + "epoch": 1.2254935919639764, + "grad_norm": 0.7597215175628662, + "learning_rate": 8.780166435506242e-06, + "loss": 0.0942, + "step": 1769 + }, + { + "epoch": 1.2261863526151715, + "grad_norm": 0.7712662220001221, + "learning_rate": 8.779472954230237e-06, + "loss": 0.0864, + "step": 1770 + }, + { + "epoch": 1.2268791132663666, + "grad_norm": 0.7437800765037537, + "learning_rate": 8.778779472954232e-06, + "loss": 0.0853, + "step": 1771 + }, + { + "epoch": 1.2275718739175614, + "grad_norm": 0.8142057061195374, + "learning_rate": 8.778085991678226e-06, + "loss": 0.1051, + "step": 1772 + }, + { + "epoch": 1.2282646345687565, + "grad_norm": 0.9158908128738403, + "learning_rate": 8.77739251040222e-06, + "loss": 0.1038, + "step": 1773 + }, + { + "epoch": 1.2289573952199515, + "grad_norm": 0.9019196629524231, + "learning_rate": 8.776699029126215e-06, + "loss": 0.0864, + "step": 1774 + }, + { + "epoch": 1.2296501558711466, + "grad_norm": 0.8636556267738342, + "learning_rate": 8.77600554785021e-06, + "loss": 0.1178, + "step": 1775 + }, + { + "epoch": 1.2303429165223416, + "grad_norm": 0.8372964262962341, + "learning_rate": 8.775312066574203e-06, + "loss": 0.0662, + "step": 1776 + }, + { + "epoch": 1.2310356771735365, + "grad_norm": 0.8746647834777832, + "learning_rate": 8.774618585298198e-06, + "loss": 0.0968, + "step": 1777 + }, + { + "epoch": 1.2317284378247315, + "grad_norm": 0.9964105486869812, + "learning_rate": 8.773925104022191e-06, + "loss": 0.1123, + "step": 1778 + }, + { + "epoch": 1.2324211984759266, + "grad_norm": 0.756689727306366, + "learning_rate": 8.773231622746186e-06, + "loss": 0.0721, + "step": 1779 + }, + { + "epoch": 1.2331139591271216, + "grad_norm": 0.8758362531661987, + "learning_rate": 8.77253814147018e-06, + "loss": 0.1014, + "step": 1780 + }, + { + "epoch": 1.2338067197783167, + "grad_norm": 0.7719059586524963, + "learning_rate": 8.771844660194176e-06, + "loss": 0.0949, + "step": 1781 + }, + { + "epoch": 1.2344994804295115, + "grad_norm": 0.8182286620140076, + "learning_rate": 8.77115117891817e-06, + "loss": 0.0966, + "step": 1782 + }, + { + "epoch": 1.2351922410807066, + "grad_norm": 0.8806812167167664, + "learning_rate": 8.770457697642164e-06, + "loss": 0.1119, + "step": 1783 + }, + { + "epoch": 1.2358850017319016, + "grad_norm": 0.9253789782524109, + "learning_rate": 8.769764216366159e-06, + "loss": 0.1036, + "step": 1784 + }, + { + "epoch": 1.2365777623830967, + "grad_norm": 0.8416339755058289, + "learning_rate": 8.769070735090154e-06, + "loss": 0.0917, + "step": 1785 + }, + { + "epoch": 1.2372705230342917, + "grad_norm": 0.8205117583274841, + "learning_rate": 8.768377253814147e-06, + "loss": 0.098, + "step": 1786 + }, + { + "epoch": 1.2379632836854866, + "grad_norm": 0.9483963847160339, + "learning_rate": 8.767683772538142e-06, + "loss": 0.0984, + "step": 1787 + }, + { + "epoch": 1.2386560443366816, + "grad_norm": 0.9278393387794495, + "learning_rate": 8.766990291262137e-06, + "loss": 0.1076, + "step": 1788 + }, + { + "epoch": 1.2393488049878767, + "grad_norm": 0.8407341837882996, + "learning_rate": 8.766296809986132e-06, + "loss": 0.0859, + "step": 1789 + }, + { + "epoch": 1.2400415656390718, + "grad_norm": 0.9124855995178223, + "learning_rate": 8.765603328710127e-06, + "loss": 0.12, + "step": 1790 + }, + { + "epoch": 1.2407343262902668, + "grad_norm": 0.8021404147148132, + "learning_rate": 8.76490984743412e-06, + "loss": 0.072, + "step": 1791 + }, + { + "epoch": 1.2414270869414616, + "grad_norm": 0.8200904130935669, + "learning_rate": 8.764216366158115e-06, + "loss": 0.0762, + "step": 1792 + }, + { + "epoch": 1.2421198475926567, + "grad_norm": 0.7664557695388794, + "learning_rate": 8.763522884882108e-06, + "loss": 0.0912, + "step": 1793 + }, + { + "epoch": 1.2428126082438518, + "grad_norm": 0.9423385858535767, + "learning_rate": 8.762829403606103e-06, + "loss": 0.1182, + "step": 1794 + }, + { + "epoch": 1.2435053688950468, + "grad_norm": 0.8365580439567566, + "learning_rate": 8.762135922330098e-06, + "loss": 0.0936, + "step": 1795 + }, + { + "epoch": 1.2441981295462416, + "grad_norm": 0.93754643201828, + "learning_rate": 8.761442441054091e-06, + "loss": 0.124, + "step": 1796 + }, + { + "epoch": 1.2448908901974367, + "grad_norm": 0.7680178284645081, + "learning_rate": 8.760748959778086e-06, + "loss": 0.1028, + "step": 1797 + }, + { + "epoch": 1.2455836508486318, + "grad_norm": 0.8152678608894348, + "learning_rate": 8.760055478502081e-06, + "loss": 0.0865, + "step": 1798 + }, + { + "epoch": 1.2462764114998268, + "grad_norm": 0.7605410218238831, + "learning_rate": 8.759361997226076e-06, + "loss": 0.0768, + "step": 1799 + }, + { + "epoch": 1.2469691721510219, + "grad_norm": 0.8376767039299011, + "learning_rate": 8.758668515950071e-06, + "loss": 0.0966, + "step": 1800 + }, + { + "epoch": 1.2476619328022167, + "grad_norm": 0.886950671672821, + "learning_rate": 8.757975034674064e-06, + "loss": 0.1082, + "step": 1801 + }, + { + "epoch": 1.2483546934534118, + "grad_norm": 0.8822245597839355, + "learning_rate": 8.75728155339806e-06, + "loss": 0.1032, + "step": 1802 + }, + { + "epoch": 1.2490474541046068, + "grad_norm": 0.8090283274650574, + "learning_rate": 8.756588072122052e-06, + "loss": 0.0874, + "step": 1803 + }, + { + "epoch": 1.2497402147558019, + "grad_norm": 0.8402153253555298, + "learning_rate": 8.755894590846047e-06, + "loss": 0.1002, + "step": 1804 + }, + { + "epoch": 1.250432975406997, + "grad_norm": 0.7970601916313171, + "learning_rate": 8.755201109570042e-06, + "loss": 0.0831, + "step": 1805 + }, + { + "epoch": 1.2511257360581918, + "grad_norm": 0.7803841233253479, + "learning_rate": 8.754507628294037e-06, + "loss": 0.0835, + "step": 1806 + }, + { + "epoch": 1.2518184967093868, + "grad_norm": 0.7126593589782715, + "learning_rate": 8.753814147018032e-06, + "loss": 0.0867, + "step": 1807 + }, + { + "epoch": 1.2525112573605819, + "grad_norm": 0.8743826746940613, + "learning_rate": 8.753120665742025e-06, + "loss": 0.1001, + "step": 1808 + }, + { + "epoch": 1.253204018011777, + "grad_norm": 0.9614912867546082, + "learning_rate": 8.75242718446602e-06, + "loss": 0.1229, + "step": 1809 + }, + { + "epoch": 1.253896778662972, + "grad_norm": 0.8666008710861206, + "learning_rate": 8.751733703190015e-06, + "loss": 0.0781, + "step": 1810 + }, + { + "epoch": 1.2545895393141668, + "grad_norm": 0.9381266236305237, + "learning_rate": 8.751040221914009e-06, + "loss": 0.1314, + "step": 1811 + }, + { + "epoch": 1.255282299965362, + "grad_norm": 0.9971676468849182, + "learning_rate": 8.750346740638003e-06, + "loss": 0.1007, + "step": 1812 + }, + { + "epoch": 1.255975060616557, + "grad_norm": 0.7789368033409119, + "learning_rate": 8.749653259361998e-06, + "loss": 0.082, + "step": 1813 + }, + { + "epoch": 1.256667821267752, + "grad_norm": 0.8247684836387634, + "learning_rate": 8.748959778085993e-06, + "loss": 0.1143, + "step": 1814 + }, + { + "epoch": 1.257360581918947, + "grad_norm": 0.7430122494697571, + "learning_rate": 8.748266296809987e-06, + "loss": 0.0772, + "step": 1815 + }, + { + "epoch": 1.258053342570142, + "grad_norm": 0.8330528140068054, + "learning_rate": 8.747572815533982e-06, + "loss": 0.1069, + "step": 1816 + }, + { + "epoch": 1.258746103221337, + "grad_norm": 0.8116832971572876, + "learning_rate": 8.746879334257976e-06, + "loss": 0.0745, + "step": 1817 + }, + { + "epoch": 1.259438863872532, + "grad_norm": 0.7854447364807129, + "learning_rate": 8.74618585298197e-06, + "loss": 0.0876, + "step": 1818 + }, + { + "epoch": 1.260131624523727, + "grad_norm": 0.8094685673713684, + "learning_rate": 8.745492371705965e-06, + "loss": 0.0961, + "step": 1819 + }, + { + "epoch": 1.2608243851749221, + "grad_norm": 0.8866847157478333, + "learning_rate": 8.744798890429958e-06, + "loss": 0.1133, + "step": 1820 + }, + { + "epoch": 1.261517145826117, + "grad_norm": 0.7459259033203125, + "learning_rate": 8.744105409153953e-06, + "loss": 0.0977, + "step": 1821 + }, + { + "epoch": 1.262209906477312, + "grad_norm": 0.7870854139328003, + "learning_rate": 8.743411927877948e-06, + "loss": 0.0828, + "step": 1822 + }, + { + "epoch": 1.262902667128507, + "grad_norm": 0.9214602112770081, + "learning_rate": 8.742718446601943e-06, + "loss": 0.092, + "step": 1823 + }, + { + "epoch": 1.2635954277797021, + "grad_norm": 0.7299473285675049, + "learning_rate": 8.742024965325938e-06, + "loss": 0.0865, + "step": 1824 + }, + { + "epoch": 1.2642881884308972, + "grad_norm": 0.8461336493492126, + "learning_rate": 8.74133148404993e-06, + "loss": 0.1011, + "step": 1825 + }, + { + "epoch": 1.264980949082092, + "grad_norm": 0.7238945960998535, + "learning_rate": 8.740638002773926e-06, + "loss": 0.0805, + "step": 1826 + }, + { + "epoch": 1.265673709733287, + "grad_norm": 0.8741490244865417, + "learning_rate": 8.73994452149792e-06, + "loss": 0.1077, + "step": 1827 + }, + { + "epoch": 1.2663664703844821, + "grad_norm": 0.7736914157867432, + "learning_rate": 8.739251040221914e-06, + "loss": 0.0881, + "step": 1828 + }, + { + "epoch": 1.2670592310356772, + "grad_norm": 0.7423211336135864, + "learning_rate": 8.738557558945909e-06, + "loss": 0.0844, + "step": 1829 + }, + { + "epoch": 1.2677519916868722, + "grad_norm": 0.7699253559112549, + "learning_rate": 8.737864077669904e-06, + "loss": 0.0832, + "step": 1830 + }, + { + "epoch": 1.268444752338067, + "grad_norm": 0.853696346282959, + "learning_rate": 8.737170596393899e-06, + "loss": 0.1042, + "step": 1831 + }, + { + "epoch": 1.2691375129892621, + "grad_norm": 0.7845718860626221, + "learning_rate": 8.736477115117894e-06, + "loss": 0.0804, + "step": 1832 + }, + { + "epoch": 1.2698302736404572, + "grad_norm": 0.763395369052887, + "learning_rate": 8.735783633841887e-06, + "loss": 0.0914, + "step": 1833 + }, + { + "epoch": 1.2705230342916523, + "grad_norm": 0.8438493013381958, + "learning_rate": 8.735090152565882e-06, + "loss": 0.0895, + "step": 1834 + }, + { + "epoch": 1.2712157949428473, + "grad_norm": 0.8871287107467651, + "learning_rate": 8.734396671289875e-06, + "loss": 0.1106, + "step": 1835 + }, + { + "epoch": 1.2719085555940421, + "grad_norm": 1.0052287578582764, + "learning_rate": 8.73370319001387e-06, + "loss": 0.1063, + "step": 1836 + }, + { + "epoch": 1.2726013162452372, + "grad_norm": 0.7611558437347412, + "learning_rate": 8.733009708737865e-06, + "loss": 0.09, + "step": 1837 + }, + { + "epoch": 1.2732940768964323, + "grad_norm": 0.79737788438797, + "learning_rate": 8.732316227461858e-06, + "loss": 0.0907, + "step": 1838 + }, + { + "epoch": 1.2739868375476273, + "grad_norm": 1.0511468648910522, + "learning_rate": 8.731622746185853e-06, + "loss": 0.1066, + "step": 1839 + }, + { + "epoch": 1.2746795981988224, + "grad_norm": 0.8577345609664917, + "learning_rate": 8.730929264909848e-06, + "loss": 0.0945, + "step": 1840 + }, + { + "epoch": 1.2753723588500172, + "grad_norm": 0.9274505972862244, + "learning_rate": 8.730235783633843e-06, + "loss": 0.0878, + "step": 1841 + }, + { + "epoch": 1.2760651195012123, + "grad_norm": 0.8447585105895996, + "learning_rate": 8.729542302357838e-06, + "loss": 0.0964, + "step": 1842 + }, + { + "epoch": 1.2767578801524073, + "grad_norm": 0.840035617351532, + "learning_rate": 8.728848821081831e-06, + "loss": 0.0912, + "step": 1843 + }, + { + "epoch": 1.2774506408036024, + "grad_norm": 0.8527553677558899, + "learning_rate": 8.728155339805826e-06, + "loss": 0.0734, + "step": 1844 + }, + { + "epoch": 1.2781434014547974, + "grad_norm": 0.9054072499275208, + "learning_rate": 8.72746185852982e-06, + "loss": 0.0971, + "step": 1845 + }, + { + "epoch": 1.2788361621059923, + "grad_norm": 0.8647048473358154, + "learning_rate": 8.726768377253814e-06, + "loss": 0.0958, + "step": 1846 + }, + { + "epoch": 1.2795289227571873, + "grad_norm": 0.7765918374061584, + "learning_rate": 8.72607489597781e-06, + "loss": 0.1003, + "step": 1847 + }, + { + "epoch": 1.2802216834083824, + "grad_norm": 0.8595507144927979, + "learning_rate": 8.725381414701804e-06, + "loss": 0.0995, + "step": 1848 + }, + { + "epoch": 1.2809144440595774, + "grad_norm": 0.9293921589851379, + "learning_rate": 8.724687933425799e-06, + "loss": 0.0875, + "step": 1849 + }, + { + "epoch": 1.2816072047107725, + "grad_norm": 0.9870502352714539, + "learning_rate": 8.723994452149792e-06, + "loss": 0.1334, + "step": 1850 + }, + { + "epoch": 1.2822999653619673, + "grad_norm": 0.8699167966842651, + "learning_rate": 8.723300970873787e-06, + "loss": 0.0938, + "step": 1851 + }, + { + "epoch": 1.2829927260131624, + "grad_norm": 0.7957127690315247, + "learning_rate": 8.722607489597782e-06, + "loss": 0.0862, + "step": 1852 + }, + { + "epoch": 1.2836854866643574, + "grad_norm": 0.8202260732650757, + "learning_rate": 8.721914008321775e-06, + "loss": 0.0869, + "step": 1853 + }, + { + "epoch": 1.2843782473155525, + "grad_norm": 2.299309015274048, + "learning_rate": 8.72122052704577e-06, + "loss": 0.1076, + "step": 1854 + }, + { + "epoch": 1.2850710079667476, + "grad_norm": 0.8096281290054321, + "learning_rate": 8.720527045769764e-06, + "loss": 0.0915, + "step": 1855 + }, + { + "epoch": 1.2857637686179424, + "grad_norm": 1.0309594869613647, + "learning_rate": 8.719833564493759e-06, + "loss": 0.1019, + "step": 1856 + }, + { + "epoch": 1.2864565292691374, + "grad_norm": 0.7998151183128357, + "learning_rate": 8.719140083217753e-06, + "loss": 0.1014, + "step": 1857 + }, + { + "epoch": 1.2871492899203325, + "grad_norm": 0.8655368089675903, + "learning_rate": 8.718446601941748e-06, + "loss": 0.0925, + "step": 1858 + }, + { + "epoch": 1.2878420505715276, + "grad_norm": 0.8410249948501587, + "learning_rate": 8.717753120665743e-06, + "loss": 0.0916, + "step": 1859 + }, + { + "epoch": 1.2885348112227226, + "grad_norm": 0.7467215657234192, + "learning_rate": 8.717059639389737e-06, + "loss": 0.0817, + "step": 1860 + }, + { + "epoch": 1.2892275718739175, + "grad_norm": 0.8425984978675842, + "learning_rate": 8.716366158113731e-06, + "loss": 0.07, + "step": 1861 + }, + { + "epoch": 1.2899203325251125, + "grad_norm": 0.8508981466293335, + "learning_rate": 8.715672676837726e-06, + "loss": 0.0921, + "step": 1862 + }, + { + "epoch": 1.2906130931763076, + "grad_norm": 0.7180240750312805, + "learning_rate": 8.71497919556172e-06, + "loss": 0.0877, + "step": 1863 + }, + { + "epoch": 1.2913058538275026, + "grad_norm": 0.7329303026199341, + "learning_rate": 8.714285714285715e-06, + "loss": 0.0739, + "step": 1864 + }, + { + "epoch": 1.2919986144786977, + "grad_norm": 0.8160582184791565, + "learning_rate": 8.71359223300971e-06, + "loss": 0.0893, + "step": 1865 + }, + { + "epoch": 1.2926913751298925, + "grad_norm": 0.8358718752861023, + "learning_rate": 8.712898751733704e-06, + "loss": 0.1096, + "step": 1866 + }, + { + "epoch": 1.2933841357810876, + "grad_norm": 0.7995262145996094, + "learning_rate": 8.7122052704577e-06, + "loss": 0.0938, + "step": 1867 + }, + { + "epoch": 1.2940768964322826, + "grad_norm": 0.7863608598709106, + "learning_rate": 8.711511789181693e-06, + "loss": 0.0927, + "step": 1868 + }, + { + "epoch": 1.2947696570834777, + "grad_norm": 0.8459818363189697, + "learning_rate": 8.710818307905688e-06, + "loss": 0.0969, + "step": 1869 + }, + { + "epoch": 1.2954624177346727, + "grad_norm": 0.9080449342727661, + "learning_rate": 8.71012482662968e-06, + "loss": 0.1302, + "step": 1870 + }, + { + "epoch": 1.2961551783858676, + "grad_norm": 0.7980386018753052, + "learning_rate": 8.709431345353676e-06, + "loss": 0.0839, + "step": 1871 + }, + { + "epoch": 1.2968479390370626, + "grad_norm": 0.8454411625862122, + "learning_rate": 8.70873786407767e-06, + "loss": 0.1082, + "step": 1872 + }, + { + "epoch": 1.2975406996882577, + "grad_norm": 0.8614441156387329, + "learning_rate": 8.708044382801666e-06, + "loss": 0.0903, + "step": 1873 + }, + { + "epoch": 1.2982334603394527, + "grad_norm": 0.8343150615692139, + "learning_rate": 8.707350901525659e-06, + "loss": 0.1102, + "step": 1874 + }, + { + "epoch": 1.2989262209906478, + "grad_norm": 0.8563399314880371, + "learning_rate": 8.706657420249654e-06, + "loss": 0.0959, + "step": 1875 + }, + { + "epoch": 1.2996189816418426, + "grad_norm": 0.9160473942756653, + "learning_rate": 8.705963938973649e-06, + "loss": 0.1008, + "step": 1876 + }, + { + "epoch": 1.3003117422930377, + "grad_norm": 0.8592052459716797, + "learning_rate": 8.705270457697644e-06, + "loss": 0.1026, + "step": 1877 + }, + { + "epoch": 1.3010045029442328, + "grad_norm": 0.8042607307434082, + "learning_rate": 8.704576976421637e-06, + "loss": 0.0802, + "step": 1878 + }, + { + "epoch": 1.3016972635954278, + "grad_norm": 0.939224362373352, + "learning_rate": 8.703883495145632e-06, + "loss": 0.1137, + "step": 1879 + }, + { + "epoch": 1.3023900242466229, + "grad_norm": 0.7538774013519287, + "learning_rate": 8.703190013869625e-06, + "loss": 0.081, + "step": 1880 + }, + { + "epoch": 1.3030827848978177, + "grad_norm": 0.8179598450660706, + "learning_rate": 8.70249653259362e-06, + "loss": 0.0962, + "step": 1881 + }, + { + "epoch": 1.3037755455490128, + "grad_norm": 0.8050620555877686, + "learning_rate": 8.701803051317615e-06, + "loss": 0.091, + "step": 1882 + }, + { + "epoch": 1.3044683062002078, + "grad_norm": 0.9653608798980713, + "learning_rate": 8.70110957004161e-06, + "loss": 0.125, + "step": 1883 + }, + { + "epoch": 1.3051610668514029, + "grad_norm": 0.849090039730072, + "learning_rate": 8.700416088765605e-06, + "loss": 0.0961, + "step": 1884 + }, + { + "epoch": 1.305853827502598, + "grad_norm": 0.8501788377761841, + "learning_rate": 8.699722607489598e-06, + "loss": 0.0986, + "step": 1885 + }, + { + "epoch": 1.3065465881537928, + "grad_norm": 0.9050464630126953, + "learning_rate": 8.699029126213593e-06, + "loss": 0.1099, + "step": 1886 + }, + { + "epoch": 1.3072393488049878, + "grad_norm": 0.9707657098770142, + "learning_rate": 8.698335644937588e-06, + "loss": 0.1288, + "step": 1887 + }, + { + "epoch": 1.3079321094561829, + "grad_norm": 0.8001958131790161, + "learning_rate": 8.697642163661581e-06, + "loss": 0.0854, + "step": 1888 + }, + { + "epoch": 1.308624870107378, + "grad_norm": 0.7052933573722839, + "learning_rate": 8.696948682385576e-06, + "loss": 0.0676, + "step": 1889 + }, + { + "epoch": 1.309317630758573, + "grad_norm": 0.9062489867210388, + "learning_rate": 8.696255201109571e-06, + "loss": 0.0956, + "step": 1890 + }, + { + "epoch": 1.3100103914097678, + "grad_norm": 0.8515408635139465, + "learning_rate": 8.695561719833566e-06, + "loss": 0.0821, + "step": 1891 + }, + { + "epoch": 1.3107031520609629, + "grad_norm": 0.9760435819625854, + "learning_rate": 8.69486823855756e-06, + "loss": 0.1251, + "step": 1892 + }, + { + "epoch": 1.311395912712158, + "grad_norm": 0.8155739307403564, + "learning_rate": 8.694174757281554e-06, + "loss": 0.0897, + "step": 1893 + }, + { + "epoch": 1.312088673363353, + "grad_norm": 0.7818350791931152, + "learning_rate": 8.693481276005549e-06, + "loss": 0.0825, + "step": 1894 + }, + { + "epoch": 1.312781434014548, + "grad_norm": 0.8299257159233093, + "learning_rate": 8.692787794729542e-06, + "loss": 0.0969, + "step": 1895 + }, + { + "epoch": 1.313474194665743, + "grad_norm": 0.8357214331626892, + "learning_rate": 8.692094313453537e-06, + "loss": 0.106, + "step": 1896 + }, + { + "epoch": 1.314166955316938, + "grad_norm": 0.8854610323905945, + "learning_rate": 8.691400832177532e-06, + "loss": 0.1114, + "step": 1897 + }, + { + "epoch": 1.314859715968133, + "grad_norm": 0.9018477201461792, + "learning_rate": 8.690707350901525e-06, + "loss": 0.0933, + "step": 1898 + }, + { + "epoch": 1.315552476619328, + "grad_norm": 0.7969275712966919, + "learning_rate": 8.69001386962552e-06, + "loss": 0.0951, + "step": 1899 + }, + { + "epoch": 1.3162452372705231, + "grad_norm": 0.8710771799087524, + "learning_rate": 8.689320388349515e-06, + "loss": 0.1001, + "step": 1900 + }, + { + "epoch": 1.316937997921718, + "grad_norm": 0.8591422438621521, + "learning_rate": 8.68862690707351e-06, + "loss": 0.1093, + "step": 1901 + }, + { + "epoch": 1.317630758572913, + "grad_norm": 0.8055676817893982, + "learning_rate": 8.687933425797505e-06, + "loss": 0.092, + "step": 1902 + }, + { + "epoch": 1.318323519224108, + "grad_norm": 0.7591870427131653, + "learning_rate": 8.687239944521498e-06, + "loss": 0.0896, + "step": 1903 + }, + { + "epoch": 1.3190162798753031, + "grad_norm": 0.856926679611206, + "learning_rate": 8.686546463245493e-06, + "loss": 0.0985, + "step": 1904 + }, + { + "epoch": 1.3197090405264982, + "grad_norm": 0.8678262829780579, + "learning_rate": 8.685852981969487e-06, + "loss": 0.1183, + "step": 1905 + }, + { + "epoch": 1.320401801177693, + "grad_norm": 0.7745727300643921, + "learning_rate": 8.685159500693481e-06, + "loss": 0.0929, + "step": 1906 + }, + { + "epoch": 1.321094561828888, + "grad_norm": 0.7514280676841736, + "learning_rate": 8.684466019417476e-06, + "loss": 0.0893, + "step": 1907 + }, + { + "epoch": 1.3217873224800831, + "grad_norm": 0.877655029296875, + "learning_rate": 8.683772538141471e-06, + "loss": 0.1119, + "step": 1908 + }, + { + "epoch": 1.3224800831312782, + "grad_norm": 0.8242598176002502, + "learning_rate": 8.683079056865466e-06, + "loss": 0.0996, + "step": 1909 + }, + { + "epoch": 1.3231728437824732, + "grad_norm": 0.8473266363143921, + "learning_rate": 8.68238557558946e-06, + "loss": 0.1044, + "step": 1910 + }, + { + "epoch": 1.323865604433668, + "grad_norm": 0.8577867746353149, + "learning_rate": 8.681692094313454e-06, + "loss": 0.108, + "step": 1911 + }, + { + "epoch": 1.3245583650848631, + "grad_norm": 0.9692781567573547, + "learning_rate": 8.68099861303745e-06, + "loss": 0.09, + "step": 1912 + }, + { + "epoch": 1.3252511257360582, + "grad_norm": 0.8671038746833801, + "learning_rate": 8.680305131761443e-06, + "loss": 0.1085, + "step": 1913 + }, + { + "epoch": 1.3259438863872532, + "grad_norm": 0.8092488050460815, + "learning_rate": 8.679611650485438e-06, + "loss": 0.1093, + "step": 1914 + }, + { + "epoch": 1.3266366470384483, + "grad_norm": 0.9651378989219666, + "learning_rate": 8.67891816920943e-06, + "loss": 0.1106, + "step": 1915 + }, + { + "epoch": 1.3273294076896431, + "grad_norm": 0.7887718081474304, + "learning_rate": 8.678224687933426e-06, + "loss": 0.0955, + "step": 1916 + }, + { + "epoch": 1.3280221683408382, + "grad_norm": 0.7525023818016052, + "learning_rate": 8.67753120665742e-06, + "loss": 0.0897, + "step": 1917 + }, + { + "epoch": 1.3287149289920333, + "grad_norm": 0.7951431274414062, + "learning_rate": 8.676837725381416e-06, + "loss": 0.0851, + "step": 1918 + }, + { + "epoch": 1.3294076896432283, + "grad_norm": 0.9328269362449646, + "learning_rate": 8.67614424410541e-06, + "loss": 0.1171, + "step": 1919 + }, + { + "epoch": 1.3301004502944234, + "grad_norm": 0.7530510425567627, + "learning_rate": 8.675450762829404e-06, + "loss": 0.1047, + "step": 1920 + }, + { + "epoch": 1.3307932109456182, + "grad_norm": 0.7813425660133362, + "learning_rate": 8.674757281553399e-06, + "loss": 0.0911, + "step": 1921 + }, + { + "epoch": 1.3314859715968133, + "grad_norm": 0.7479113936424255, + "learning_rate": 8.674063800277394e-06, + "loss": 0.0886, + "step": 1922 + }, + { + "epoch": 1.3321787322480083, + "grad_norm": 0.8434452414512634, + "learning_rate": 8.673370319001387e-06, + "loss": 0.0905, + "step": 1923 + }, + { + "epoch": 1.3328714928992034, + "grad_norm": 1.0284795761108398, + "learning_rate": 8.672676837725382e-06, + "loss": 0.1008, + "step": 1924 + }, + { + "epoch": 1.3335642535503984, + "grad_norm": 0.9190812706947327, + "learning_rate": 8.671983356449377e-06, + "loss": 0.1037, + "step": 1925 + }, + { + "epoch": 1.3342570142015933, + "grad_norm": 0.7740709185600281, + "learning_rate": 8.671289875173372e-06, + "loss": 0.0791, + "step": 1926 + }, + { + "epoch": 1.3349497748527883, + "grad_norm": 0.8993537425994873, + "learning_rate": 8.670596393897367e-06, + "loss": 0.1132, + "step": 1927 + }, + { + "epoch": 1.3356425355039834, + "grad_norm": 0.6645535826683044, + "learning_rate": 8.66990291262136e-06, + "loss": 0.0593, + "step": 1928 + }, + { + "epoch": 1.3363352961551784, + "grad_norm": 0.7269705533981323, + "learning_rate": 8.669209431345355e-06, + "loss": 0.0836, + "step": 1929 + }, + { + "epoch": 1.3370280568063735, + "grad_norm": 0.8108569979667664, + "learning_rate": 8.668515950069348e-06, + "loss": 0.0859, + "step": 1930 + }, + { + "epoch": 1.3377208174575683, + "grad_norm": 0.8316945433616638, + "learning_rate": 8.667822468793343e-06, + "loss": 0.0856, + "step": 1931 + }, + { + "epoch": 1.3384135781087634, + "grad_norm": 0.923198401927948, + "learning_rate": 8.667128987517338e-06, + "loss": 0.1228, + "step": 1932 + }, + { + "epoch": 1.3391063387599584, + "grad_norm": 0.8418177366256714, + "learning_rate": 8.666435506241331e-06, + "loss": 0.1091, + "step": 1933 + }, + { + "epoch": 1.3397990994111535, + "grad_norm": 0.9837353825569153, + "learning_rate": 8.665742024965326e-06, + "loss": 0.1016, + "step": 1934 + }, + { + "epoch": 1.3404918600623486, + "grad_norm": 0.9974249601364136, + "learning_rate": 8.665048543689321e-06, + "loss": 0.0982, + "step": 1935 + }, + { + "epoch": 1.3411846207135434, + "grad_norm": 0.9031357169151306, + "learning_rate": 8.664355062413316e-06, + "loss": 0.0735, + "step": 1936 + }, + { + "epoch": 1.3418773813647384, + "grad_norm": 0.9560807943344116, + "learning_rate": 8.66366158113731e-06, + "loss": 0.1073, + "step": 1937 + }, + { + "epoch": 1.3425701420159335, + "grad_norm": 0.9520257115364075, + "learning_rate": 8.662968099861304e-06, + "loss": 0.0958, + "step": 1938 + }, + { + "epoch": 1.3432629026671286, + "grad_norm": 0.8354464173316956, + "learning_rate": 8.662274618585299e-06, + "loss": 0.107, + "step": 1939 + }, + { + "epoch": 1.3439556633183236, + "grad_norm": 0.8196549415588379, + "learning_rate": 8.661581137309292e-06, + "loss": 0.0941, + "step": 1940 + }, + { + "epoch": 1.3446484239695184, + "grad_norm": 0.749197244644165, + "learning_rate": 8.660887656033287e-06, + "loss": 0.0886, + "step": 1941 + }, + { + "epoch": 1.3453411846207135, + "grad_norm": 0.8296682238578796, + "learning_rate": 8.660194174757282e-06, + "loss": 0.0855, + "step": 1942 + }, + { + "epoch": 1.3460339452719086, + "grad_norm": 1.4121226072311401, + "learning_rate": 8.659500693481277e-06, + "loss": 0.1187, + "step": 1943 + }, + { + "epoch": 1.3467267059231036, + "grad_norm": 0.8417717218399048, + "learning_rate": 8.658807212205272e-06, + "loss": 0.095, + "step": 1944 + }, + { + "epoch": 1.3474194665742987, + "grad_norm": 0.8696645498275757, + "learning_rate": 8.658113730929265e-06, + "loss": 0.0884, + "step": 1945 + }, + { + "epoch": 1.3481122272254935, + "grad_norm": 0.831957221031189, + "learning_rate": 8.65742024965326e-06, + "loss": 0.0978, + "step": 1946 + }, + { + "epoch": 1.3488049878766886, + "grad_norm": 0.8580504655838013, + "learning_rate": 8.656726768377255e-06, + "loss": 0.0918, + "step": 1947 + }, + { + "epoch": 1.3494977485278836, + "grad_norm": 0.7921102046966553, + "learning_rate": 8.656033287101248e-06, + "loss": 0.0804, + "step": 1948 + }, + { + "epoch": 1.3501905091790787, + "grad_norm": 0.9082995653152466, + "learning_rate": 8.655339805825243e-06, + "loss": 0.0988, + "step": 1949 + }, + { + "epoch": 1.3508832698302737, + "grad_norm": 0.8123347163200378, + "learning_rate": 8.654646324549238e-06, + "loss": 0.0859, + "step": 1950 + }, + { + "epoch": 1.3515760304814686, + "grad_norm": 0.8211560845375061, + "learning_rate": 8.653952843273231e-06, + "loss": 0.09, + "step": 1951 + }, + { + "epoch": 1.3522687911326636, + "grad_norm": 0.909602701663971, + "learning_rate": 8.653259361997226e-06, + "loss": 0.0996, + "step": 1952 + }, + { + "epoch": 1.3529615517838587, + "grad_norm": 0.8661556243896484, + "learning_rate": 8.652565880721221e-06, + "loss": 0.098, + "step": 1953 + }, + { + "epoch": 1.3536543124350537, + "grad_norm": 0.8467791080474854, + "learning_rate": 8.651872399445216e-06, + "loss": 0.0993, + "step": 1954 + }, + { + "epoch": 1.3543470730862488, + "grad_norm": 0.8431129455566406, + "learning_rate": 8.65117891816921e-06, + "loss": 0.1165, + "step": 1955 + }, + { + "epoch": 1.3550398337374436, + "grad_norm": 0.839606761932373, + "learning_rate": 8.650485436893204e-06, + "loss": 0.1, + "step": 1956 + }, + { + "epoch": 1.3557325943886387, + "grad_norm": 0.8434377908706665, + "learning_rate": 8.6497919556172e-06, + "loss": 0.103, + "step": 1957 + }, + { + "epoch": 1.3564253550398337, + "grad_norm": 0.9461360573768616, + "learning_rate": 8.649098474341193e-06, + "loss": 0.1179, + "step": 1958 + }, + { + "epoch": 1.3571181156910288, + "grad_norm": 0.9228819608688354, + "learning_rate": 8.648404993065188e-06, + "loss": 0.1052, + "step": 1959 + }, + { + "epoch": 1.3578108763422239, + "grad_norm": 0.8292375802993774, + "learning_rate": 8.647711511789182e-06, + "loss": 0.1027, + "step": 1960 + }, + { + "epoch": 1.3585036369934187, + "grad_norm": 0.8803178668022156, + "learning_rate": 8.647018030513177e-06, + "loss": 0.1017, + "step": 1961 + }, + { + "epoch": 1.3591963976446138, + "grad_norm": 0.810207724571228, + "learning_rate": 8.646324549237172e-06, + "loss": 0.0876, + "step": 1962 + }, + { + "epoch": 1.3598891582958088, + "grad_norm": 0.8565738201141357, + "learning_rate": 8.645631067961166e-06, + "loss": 0.0985, + "step": 1963 + }, + { + "epoch": 1.3605819189470039, + "grad_norm": 0.7794263362884521, + "learning_rate": 8.64493758668516e-06, + "loss": 0.0935, + "step": 1964 + }, + { + "epoch": 1.361274679598199, + "grad_norm": 0.8222514390945435, + "learning_rate": 8.644244105409154e-06, + "loss": 0.0927, + "step": 1965 + }, + { + "epoch": 1.3619674402493938, + "grad_norm": 0.8469381332397461, + "learning_rate": 8.643550624133149e-06, + "loss": 0.0836, + "step": 1966 + }, + { + "epoch": 1.3626602009005888, + "grad_norm": 0.7348058223724365, + "learning_rate": 8.642857142857144e-06, + "loss": 0.0902, + "step": 1967 + }, + { + "epoch": 1.3633529615517839, + "grad_norm": 0.8116043210029602, + "learning_rate": 8.642163661581139e-06, + "loss": 0.0962, + "step": 1968 + }, + { + "epoch": 1.364045722202979, + "grad_norm": 0.786858081817627, + "learning_rate": 8.641470180305133e-06, + "loss": 0.0861, + "step": 1969 + }, + { + "epoch": 1.364738482854174, + "grad_norm": 0.9200820326805115, + "learning_rate": 8.640776699029127e-06, + "loss": 0.1187, + "step": 1970 + }, + { + "epoch": 1.3654312435053688, + "grad_norm": 0.8893924951553345, + "learning_rate": 8.640083217753122e-06, + "loss": 0.1166, + "step": 1971 + }, + { + "epoch": 1.3661240041565639, + "grad_norm": 0.9585806727409363, + "learning_rate": 8.639389736477117e-06, + "loss": 0.1124, + "step": 1972 + }, + { + "epoch": 1.366816764807759, + "grad_norm": 0.9382540583610535, + "learning_rate": 8.63869625520111e-06, + "loss": 0.1135, + "step": 1973 + }, + { + "epoch": 1.367509525458954, + "grad_norm": 0.8802511096000671, + "learning_rate": 8.638002773925105e-06, + "loss": 0.089, + "step": 1974 + }, + { + "epoch": 1.368202286110149, + "grad_norm": 0.8175269365310669, + "learning_rate": 8.637309292649098e-06, + "loss": 0.116, + "step": 1975 + }, + { + "epoch": 1.3688950467613439, + "grad_norm": 0.9394797682762146, + "learning_rate": 8.636615811373093e-06, + "loss": 0.0898, + "step": 1976 + }, + { + "epoch": 1.369587807412539, + "grad_norm": 0.8105082511901855, + "learning_rate": 8.635922330097088e-06, + "loss": 0.0894, + "step": 1977 + }, + { + "epoch": 1.370280568063734, + "grad_norm": 0.9872980713844299, + "learning_rate": 8.635228848821083e-06, + "loss": 0.0954, + "step": 1978 + }, + { + "epoch": 1.370973328714929, + "grad_norm": 0.8237231373786926, + "learning_rate": 8.634535367545078e-06, + "loss": 0.1074, + "step": 1979 + }, + { + "epoch": 1.371666089366124, + "grad_norm": 0.9056423902511597, + "learning_rate": 8.633841886269071e-06, + "loss": 0.0867, + "step": 1980 + }, + { + "epoch": 1.372358850017319, + "grad_norm": 0.8608441948890686, + "learning_rate": 8.633148404993066e-06, + "loss": 0.089, + "step": 1981 + }, + { + "epoch": 1.373051610668514, + "grad_norm": 0.8657476902008057, + "learning_rate": 8.63245492371706e-06, + "loss": 0.0989, + "step": 1982 + }, + { + "epoch": 1.373744371319709, + "grad_norm": 0.8708510994911194, + "learning_rate": 8.631761442441054e-06, + "loss": 0.094, + "step": 1983 + }, + { + "epoch": 1.3744371319709041, + "grad_norm": 0.9043082594871521, + "learning_rate": 8.631067961165049e-06, + "loss": 0.104, + "step": 1984 + }, + { + "epoch": 1.3751298926220992, + "grad_norm": 0.8037663102149963, + "learning_rate": 8.630374479889044e-06, + "loss": 0.0807, + "step": 1985 + }, + { + "epoch": 1.375822653273294, + "grad_norm": 0.8416350483894348, + "learning_rate": 8.629680998613039e-06, + "loss": 0.0993, + "step": 1986 + }, + { + "epoch": 1.376515413924489, + "grad_norm": 0.7926865816116333, + "learning_rate": 8.628987517337034e-06, + "loss": 0.0933, + "step": 1987 + }, + { + "epoch": 1.3772081745756841, + "grad_norm": 0.8975657820701599, + "learning_rate": 8.628294036061027e-06, + "loss": 0.0985, + "step": 1988 + }, + { + "epoch": 1.3779009352268792, + "grad_norm": 0.9110676050186157, + "learning_rate": 8.627600554785022e-06, + "loss": 0.0944, + "step": 1989 + }, + { + "epoch": 1.3785936958780742, + "grad_norm": 0.9367595911026001, + "learning_rate": 8.626907073509015e-06, + "loss": 0.0975, + "step": 1990 + }, + { + "epoch": 1.379286456529269, + "grad_norm": 0.7948849201202393, + "learning_rate": 8.62621359223301e-06, + "loss": 0.0892, + "step": 1991 + }, + { + "epoch": 1.3799792171804641, + "grad_norm": 0.8453758358955383, + "learning_rate": 8.625520110957005e-06, + "loss": 0.1105, + "step": 1992 + }, + { + "epoch": 1.3806719778316592, + "grad_norm": 0.864433228969574, + "learning_rate": 8.624826629680998e-06, + "loss": 0.1162, + "step": 1993 + }, + { + "epoch": 1.3813647384828542, + "grad_norm": 0.8461838960647583, + "learning_rate": 8.624133148404993e-06, + "loss": 0.0974, + "step": 1994 + }, + { + "epoch": 1.3820574991340493, + "grad_norm": 0.9221845269203186, + "learning_rate": 8.623439667128988e-06, + "loss": 0.0949, + "step": 1995 + }, + { + "epoch": 1.3827502597852441, + "grad_norm": 0.7963080406188965, + "learning_rate": 8.622746185852983e-06, + "loss": 0.0912, + "step": 1996 + }, + { + "epoch": 1.3834430204364392, + "grad_norm": 0.7928188443183899, + "learning_rate": 8.622052704576978e-06, + "loss": 0.098, + "step": 1997 + }, + { + "epoch": 1.3841357810876342, + "grad_norm": 0.7823619842529297, + "learning_rate": 8.621359223300971e-06, + "loss": 0.0901, + "step": 1998 + }, + { + "epoch": 1.3848285417388293, + "grad_norm": 1.0484232902526855, + "learning_rate": 8.620665742024966e-06, + "loss": 0.1024, + "step": 1999 + }, + { + "epoch": 1.3855213023900244, + "grad_norm": 0.9590020775794983, + "learning_rate": 8.61997226074896e-06, + "loss": 0.1206, + "step": 2000 + }, + { + "epoch": 1.3862140630412192, + "grad_norm": 0.8306148648262024, + "learning_rate": 8.619278779472954e-06, + "loss": 0.0977, + "step": 2001 + }, + { + "epoch": 1.3869068236924142, + "grad_norm": 0.7876715064048767, + "learning_rate": 8.61858529819695e-06, + "loss": 0.0891, + "step": 2002 + }, + { + "epoch": 1.3875995843436093, + "grad_norm": 1.0013034343719482, + "learning_rate": 8.617891816920944e-06, + "loss": 0.1069, + "step": 2003 + }, + { + "epoch": 1.3882923449948044, + "grad_norm": 0.7736858129501343, + "learning_rate": 8.61719833564494e-06, + "loss": 0.0832, + "step": 2004 + }, + { + "epoch": 1.3889851056459994, + "grad_norm": 0.7988465428352356, + "learning_rate": 8.616504854368932e-06, + "loss": 0.0932, + "step": 2005 + }, + { + "epoch": 1.3896778662971943, + "grad_norm": 0.8956063389778137, + "learning_rate": 8.615811373092927e-06, + "loss": 0.1014, + "step": 2006 + }, + { + "epoch": 1.3903706269483893, + "grad_norm": 0.8395360112190247, + "learning_rate": 8.615117891816922e-06, + "loss": 0.1008, + "step": 2007 + }, + { + "epoch": 1.3910633875995844, + "grad_norm": 0.79246985912323, + "learning_rate": 8.614424410540916e-06, + "loss": 0.0706, + "step": 2008 + }, + { + "epoch": 1.3917561482507794, + "grad_norm": 0.8563838601112366, + "learning_rate": 8.61373092926491e-06, + "loss": 0.1176, + "step": 2009 + }, + { + "epoch": 1.3924489089019745, + "grad_norm": 1.1236284971237183, + "learning_rate": 8.613037447988904e-06, + "loss": 0.1097, + "step": 2010 + }, + { + "epoch": 1.3931416695531693, + "grad_norm": 0.8628478646278381, + "learning_rate": 8.612343966712899e-06, + "loss": 0.0891, + "step": 2011 + }, + { + "epoch": 1.3938344302043644, + "grad_norm": 0.8011003732681274, + "learning_rate": 8.611650485436894e-06, + "loss": 0.0916, + "step": 2012 + }, + { + "epoch": 1.3945271908555594, + "grad_norm": 0.8925850987434387, + "learning_rate": 8.610957004160889e-06, + "loss": 0.0998, + "step": 2013 + }, + { + "epoch": 1.3952199515067545, + "grad_norm": 0.8047410249710083, + "learning_rate": 8.610263522884883e-06, + "loss": 0.1053, + "step": 2014 + }, + { + "epoch": 1.3959127121579495, + "grad_norm": 0.7683031558990479, + "learning_rate": 8.609570041608877e-06, + "loss": 0.0925, + "step": 2015 + }, + { + "epoch": 1.3966054728091444, + "grad_norm": 0.7874264717102051, + "learning_rate": 8.608876560332872e-06, + "loss": 0.0862, + "step": 2016 + }, + { + "epoch": 1.3972982334603394, + "grad_norm": 0.7253385186195374, + "learning_rate": 8.608183079056867e-06, + "loss": 0.0706, + "step": 2017 + }, + { + "epoch": 1.3979909941115345, + "grad_norm": 0.7991751432418823, + "learning_rate": 8.60748959778086e-06, + "loss": 0.1014, + "step": 2018 + }, + { + "epoch": 1.3986837547627295, + "grad_norm": 0.7393742203712463, + "learning_rate": 8.606796116504855e-06, + "loss": 0.0831, + "step": 2019 + }, + { + "epoch": 1.3993765154139246, + "grad_norm": 0.8541518449783325, + "learning_rate": 8.60610263522885e-06, + "loss": 0.0957, + "step": 2020 + }, + { + "epoch": 1.4000692760651194, + "grad_norm": 0.8139171600341797, + "learning_rate": 8.605409153952845e-06, + "loss": 0.0888, + "step": 2021 + }, + { + "epoch": 1.4007620367163145, + "grad_norm": 0.8328967094421387, + "learning_rate": 8.60471567267684e-06, + "loss": 0.0974, + "step": 2022 + }, + { + "epoch": 1.4014547973675096, + "grad_norm": 0.7323856949806213, + "learning_rate": 8.604022191400833e-06, + "loss": 0.0915, + "step": 2023 + }, + { + "epoch": 1.4021475580187046, + "grad_norm": 0.8023518919944763, + "learning_rate": 8.603328710124828e-06, + "loss": 0.0882, + "step": 2024 + }, + { + "epoch": 1.4028403186698997, + "grad_norm": 0.793217122554779, + "learning_rate": 8.602635228848821e-06, + "loss": 0.0768, + "step": 2025 + }, + { + "epoch": 1.4035330793210945, + "grad_norm": 0.8914885520935059, + "learning_rate": 8.601941747572816e-06, + "loss": 0.0755, + "step": 2026 + }, + { + "epoch": 1.4042258399722896, + "grad_norm": 0.8099649548530579, + "learning_rate": 8.60124826629681e-06, + "loss": 0.0913, + "step": 2027 + }, + { + "epoch": 1.4049186006234846, + "grad_norm": 0.8921623229980469, + "learning_rate": 8.600554785020804e-06, + "loss": 0.0978, + "step": 2028 + }, + { + "epoch": 1.4056113612746797, + "grad_norm": 0.8204141855239868, + "learning_rate": 8.599861303744799e-06, + "loss": 0.094, + "step": 2029 + }, + { + "epoch": 1.4063041219258747, + "grad_norm": 0.9586772918701172, + "learning_rate": 8.599167822468794e-06, + "loss": 0.1301, + "step": 2030 + }, + { + "epoch": 1.4069968825770696, + "grad_norm": 0.9210357666015625, + "learning_rate": 8.598474341192789e-06, + "loss": 0.1078, + "step": 2031 + }, + { + "epoch": 1.4076896432282646, + "grad_norm": 0.8714752793312073, + "learning_rate": 8.597780859916784e-06, + "loss": 0.0954, + "step": 2032 + }, + { + "epoch": 1.4083824038794597, + "grad_norm": 0.75909423828125, + "learning_rate": 8.597087378640777e-06, + "loss": 0.0835, + "step": 2033 + }, + { + "epoch": 1.4090751645306547, + "grad_norm": 1.0207693576812744, + "learning_rate": 8.596393897364772e-06, + "loss": 0.1364, + "step": 2034 + }, + { + "epoch": 1.4097679251818498, + "grad_norm": 0.7902257442474365, + "learning_rate": 8.595700416088765e-06, + "loss": 0.0804, + "step": 2035 + }, + { + "epoch": 1.4104606858330446, + "grad_norm": 0.8312771916389465, + "learning_rate": 8.59500693481276e-06, + "loss": 0.1076, + "step": 2036 + }, + { + "epoch": 1.4111534464842397, + "grad_norm": 0.8642279505729675, + "learning_rate": 8.594313453536755e-06, + "loss": 0.087, + "step": 2037 + }, + { + "epoch": 1.4118462071354347, + "grad_norm": 3.088615894317627, + "learning_rate": 8.59361997226075e-06, + "loss": 0.1102, + "step": 2038 + }, + { + "epoch": 1.4125389677866298, + "grad_norm": 0.8367988467216492, + "learning_rate": 8.592926490984745e-06, + "loss": 0.1022, + "step": 2039 + }, + { + "epoch": 1.4132317284378249, + "grad_norm": 0.8424378633499146, + "learning_rate": 8.592233009708738e-06, + "loss": 0.1064, + "step": 2040 + }, + { + "epoch": 1.4139244890890197, + "grad_norm": 0.942853569984436, + "learning_rate": 8.591539528432733e-06, + "loss": 0.1177, + "step": 2041 + }, + { + "epoch": 1.4146172497402147, + "grad_norm": 0.8797265887260437, + "learning_rate": 8.590846047156728e-06, + "loss": 0.0967, + "step": 2042 + }, + { + "epoch": 1.4153100103914098, + "grad_norm": 0.7765311002731323, + "learning_rate": 8.590152565880721e-06, + "loss": 0.0707, + "step": 2043 + }, + { + "epoch": 1.4160027710426049, + "grad_norm": 0.873090922832489, + "learning_rate": 8.589459084604716e-06, + "loss": 0.0938, + "step": 2044 + }, + { + "epoch": 1.4166955316938, + "grad_norm": 0.9923801422119141, + "learning_rate": 8.588765603328711e-06, + "loss": 0.1312, + "step": 2045 + }, + { + "epoch": 1.4173882923449947, + "grad_norm": 0.9018128514289856, + "learning_rate": 8.588072122052706e-06, + "loss": 0.0949, + "step": 2046 + }, + { + "epoch": 1.4180810529961898, + "grad_norm": 0.799936056137085, + "learning_rate": 8.5873786407767e-06, + "loss": 0.0841, + "step": 2047 + }, + { + "epoch": 1.4187738136473849, + "grad_norm": 0.789219856262207, + "learning_rate": 8.586685159500694e-06, + "loss": 0.094, + "step": 2048 + }, + { + "epoch": 1.41946657429858, + "grad_norm": 0.8584476113319397, + "learning_rate": 8.58599167822469e-06, + "loss": 0.0955, + "step": 2049 + }, + { + "epoch": 1.420159334949775, + "grad_norm": 0.7601639032363892, + "learning_rate": 8.585298196948682e-06, + "loss": 0.0789, + "step": 2050 + }, + { + "epoch": 1.4208520956009698, + "grad_norm": 0.8748162984848022, + "learning_rate": 8.584604715672677e-06, + "loss": 0.0976, + "step": 2051 + }, + { + "epoch": 1.4215448562521649, + "grad_norm": 0.8095703125, + "learning_rate": 8.583911234396672e-06, + "loss": 0.1005, + "step": 2052 + }, + { + "epoch": 1.42223761690336, + "grad_norm": 0.8859317302703857, + "learning_rate": 8.583217753120666e-06, + "loss": 0.1006, + "step": 2053 + }, + { + "epoch": 1.422930377554555, + "grad_norm": 0.790725827217102, + "learning_rate": 8.58252427184466e-06, + "loss": 0.0951, + "step": 2054 + }, + { + "epoch": 1.42362313820575, + "grad_norm": 0.7737709879875183, + "learning_rate": 8.581830790568655e-06, + "loss": 0.0848, + "step": 2055 + }, + { + "epoch": 1.4243158988569449, + "grad_norm": 0.7824344038963318, + "learning_rate": 8.58113730929265e-06, + "loss": 0.0911, + "step": 2056 + }, + { + "epoch": 1.42500865950814, + "grad_norm": 0.8980814814567566, + "learning_rate": 8.580443828016645e-06, + "loss": 0.1004, + "step": 2057 + }, + { + "epoch": 1.425701420159335, + "grad_norm": 0.8582302927970886, + "learning_rate": 8.579750346740638e-06, + "loss": 0.1103, + "step": 2058 + }, + { + "epoch": 1.42639418081053, + "grad_norm": 0.9365726113319397, + "learning_rate": 8.579056865464633e-06, + "loss": 0.1141, + "step": 2059 + }, + { + "epoch": 1.427086941461725, + "grad_norm": 0.856505811214447, + "learning_rate": 8.578363384188627e-06, + "loss": 0.1085, + "step": 2060 + }, + { + "epoch": 1.42777970211292, + "grad_norm": 1.071853756904602, + "learning_rate": 8.577669902912622e-06, + "loss": 0.1094, + "step": 2061 + }, + { + "epoch": 1.428472462764115, + "grad_norm": 0.9276185035705566, + "learning_rate": 8.576976421636617e-06, + "loss": 0.116, + "step": 2062 + }, + { + "epoch": 1.42916522341531, + "grad_norm": 0.8712035417556763, + "learning_rate": 8.576282940360611e-06, + "loss": 0.097, + "step": 2063 + }, + { + "epoch": 1.429857984066505, + "grad_norm": 0.7674479484558105, + "learning_rate": 8.575589459084606e-06, + "loss": 0.093, + "step": 2064 + }, + { + "epoch": 1.4305507447177002, + "grad_norm": 1.0586395263671875, + "learning_rate": 8.5748959778086e-06, + "loss": 0.1034, + "step": 2065 + }, + { + "epoch": 1.431243505368895, + "grad_norm": 0.8353042602539062, + "learning_rate": 8.574202496532595e-06, + "loss": 0.1023, + "step": 2066 + }, + { + "epoch": 1.43193626602009, + "grad_norm": 0.8541135191917419, + "learning_rate": 8.57350901525659e-06, + "loss": 0.1047, + "step": 2067 + }, + { + "epoch": 1.432629026671285, + "grad_norm": 0.7746983766555786, + "learning_rate": 8.572815533980583e-06, + "loss": 0.0871, + "step": 2068 + }, + { + "epoch": 1.43332178732248, + "grad_norm": 0.854115903377533, + "learning_rate": 8.572122052704578e-06, + "loss": 0.0951, + "step": 2069 + }, + { + "epoch": 1.4340145479736752, + "grad_norm": 0.8405554890632629, + "learning_rate": 8.571428571428571e-06, + "loss": 0.0961, + "step": 2070 + }, + { + "epoch": 1.43470730862487, + "grad_norm": 0.8547260165214539, + "learning_rate": 8.570735090152566e-06, + "loss": 0.0731, + "step": 2071 + }, + { + "epoch": 1.4354000692760651, + "grad_norm": 0.930776834487915, + "learning_rate": 8.57004160887656e-06, + "loss": 0.0952, + "step": 2072 + }, + { + "epoch": 1.4360928299272602, + "grad_norm": 0.8687021136283875, + "learning_rate": 8.569348127600556e-06, + "loss": 0.084, + "step": 2073 + }, + { + "epoch": 1.436785590578455, + "grad_norm": 0.8948690891265869, + "learning_rate": 8.56865464632455e-06, + "loss": 0.0963, + "step": 2074 + }, + { + "epoch": 1.4374783512296503, + "grad_norm": 0.7903040051460266, + "learning_rate": 8.567961165048544e-06, + "loss": 0.0876, + "step": 2075 + }, + { + "epoch": 1.4381711118808451, + "grad_norm": 0.8380088210105896, + "learning_rate": 8.567267683772539e-06, + "loss": 0.1008, + "step": 2076 + }, + { + "epoch": 1.4388638725320402, + "grad_norm": 0.8036629557609558, + "learning_rate": 8.566574202496534e-06, + "loss": 0.0767, + "step": 2077 + }, + { + "epoch": 1.4395566331832352, + "grad_norm": 0.8765242695808411, + "learning_rate": 8.565880721220527e-06, + "loss": 0.0879, + "step": 2078 + }, + { + "epoch": 1.44024939383443, + "grad_norm": 0.9223285913467407, + "learning_rate": 8.565187239944522e-06, + "loss": 0.1053, + "step": 2079 + }, + { + "epoch": 1.4409421544856253, + "grad_norm": 0.8735775351524353, + "learning_rate": 8.564493758668517e-06, + "loss": 0.0936, + "step": 2080 + }, + { + "epoch": 1.4416349151368202, + "grad_norm": 0.9525173306465149, + "learning_rate": 8.563800277392512e-06, + "loss": 0.09, + "step": 2081 + }, + { + "epoch": 1.4423276757880152, + "grad_norm": 0.8726947903633118, + "learning_rate": 8.563106796116507e-06, + "loss": 0.096, + "step": 2082 + }, + { + "epoch": 1.4430204364392103, + "grad_norm": 0.9291168451309204, + "learning_rate": 8.5624133148405e-06, + "loss": 0.1142, + "step": 2083 + }, + { + "epoch": 1.4437131970904051, + "grad_norm": 0.8167150616645813, + "learning_rate": 8.561719833564495e-06, + "loss": 0.081, + "step": 2084 + }, + { + "epoch": 1.4444059577416004, + "grad_norm": 0.8544431328773499, + "learning_rate": 8.561026352288488e-06, + "loss": 0.0983, + "step": 2085 + }, + { + "epoch": 1.4450987183927952, + "grad_norm": 0.8344400525093079, + "learning_rate": 8.560332871012483e-06, + "loss": 0.0905, + "step": 2086 + }, + { + "epoch": 1.4457914790439903, + "grad_norm": 0.948971688747406, + "learning_rate": 8.559639389736478e-06, + "loss": 0.1133, + "step": 2087 + }, + { + "epoch": 1.4464842396951854, + "grad_norm": 0.9090603590011597, + "learning_rate": 8.558945908460471e-06, + "loss": 0.0923, + "step": 2088 + }, + { + "epoch": 1.4471770003463802, + "grad_norm": 0.9278730154037476, + "learning_rate": 8.558252427184466e-06, + "loss": 0.1105, + "step": 2089 + }, + { + "epoch": 1.4478697609975755, + "grad_norm": 1.03156316280365, + "learning_rate": 8.557558945908461e-06, + "loss": 0.1029, + "step": 2090 + }, + { + "epoch": 1.4485625216487703, + "grad_norm": 0.8852841258049011, + "learning_rate": 8.556865464632456e-06, + "loss": 0.0975, + "step": 2091 + }, + { + "epoch": 1.4492552822999654, + "grad_norm": 0.9188801646232605, + "learning_rate": 8.556171983356451e-06, + "loss": 0.1096, + "step": 2092 + }, + { + "epoch": 1.4499480429511604, + "grad_norm": 0.7110975980758667, + "learning_rate": 8.555478502080444e-06, + "loss": 0.0815, + "step": 2093 + }, + { + "epoch": 1.4506408036023553, + "grad_norm": 0.8940724730491638, + "learning_rate": 8.55478502080444e-06, + "loss": 0.099, + "step": 2094 + }, + { + "epoch": 1.4513335642535505, + "grad_norm": 0.8253915905952454, + "learning_rate": 8.554091539528432e-06, + "loss": 0.0921, + "step": 2095 + }, + { + "epoch": 1.4520263249047454, + "grad_norm": 0.7639085650444031, + "learning_rate": 8.553398058252427e-06, + "loss": 0.0818, + "step": 2096 + }, + { + "epoch": 1.4527190855559404, + "grad_norm": 0.915162205696106, + "learning_rate": 8.552704576976422e-06, + "loss": 0.1126, + "step": 2097 + }, + { + "epoch": 1.4534118462071355, + "grad_norm": 1.0026566982269287, + "learning_rate": 8.552011095700417e-06, + "loss": 0.1012, + "step": 2098 + }, + { + "epoch": 1.4541046068583303, + "grad_norm": 0.8221924304962158, + "learning_rate": 8.551317614424412e-06, + "loss": 0.1048, + "step": 2099 + }, + { + "epoch": 1.4547973675095256, + "grad_norm": 0.7636975646018982, + "learning_rate": 8.550624133148405e-06, + "loss": 0.0805, + "step": 2100 + }, + { + "epoch": 1.4554901281607204, + "grad_norm": 0.847546398639679, + "learning_rate": 8.5499306518724e-06, + "loss": 0.1076, + "step": 2101 + }, + { + "epoch": 1.4561828888119155, + "grad_norm": 0.8234090805053711, + "learning_rate": 8.549237170596395e-06, + "loss": 0.1032, + "step": 2102 + }, + { + "epoch": 1.4568756494631105, + "grad_norm": 0.7470714449882507, + "learning_rate": 8.548543689320388e-06, + "loss": 0.0882, + "step": 2103 + }, + { + "epoch": 1.4575684101143054, + "grad_norm": 0.9202353954315186, + "learning_rate": 8.547850208044383e-06, + "loss": 0.103, + "step": 2104 + }, + { + "epoch": 1.4582611707655007, + "grad_norm": 0.9301064014434814, + "learning_rate": 8.547156726768377e-06, + "loss": 0.1181, + "step": 2105 + }, + { + "epoch": 1.4589539314166955, + "grad_norm": 0.8707886934280396, + "learning_rate": 8.546463245492372e-06, + "loss": 0.0964, + "step": 2106 + }, + { + "epoch": 1.4596466920678905, + "grad_norm": 0.8569350242614746, + "learning_rate": 8.545769764216367e-06, + "loss": 0.0858, + "step": 2107 + }, + { + "epoch": 1.4603394527190856, + "grad_norm": 0.8189959526062012, + "learning_rate": 8.545076282940361e-06, + "loss": 0.0883, + "step": 2108 + }, + { + "epoch": 1.4610322133702804, + "grad_norm": 0.8569098114967346, + "learning_rate": 8.544382801664356e-06, + "loss": 0.0987, + "step": 2109 + }, + { + "epoch": 1.4617249740214755, + "grad_norm": 0.8092207908630371, + "learning_rate": 8.54368932038835e-06, + "loss": 0.094, + "step": 2110 + }, + { + "epoch": 1.4624177346726706, + "grad_norm": 0.8205234408378601, + "learning_rate": 8.542995839112345e-06, + "loss": 0.0856, + "step": 2111 + }, + { + "epoch": 1.4631104953238656, + "grad_norm": 0.9704433083534241, + "learning_rate": 8.54230235783634e-06, + "loss": 0.1348, + "step": 2112 + }, + { + "epoch": 1.4638032559750607, + "grad_norm": 0.8799313902854919, + "learning_rate": 8.541608876560333e-06, + "loss": 0.0922, + "step": 2113 + }, + { + "epoch": 1.4644960166262555, + "grad_norm": 1.072108507156372, + "learning_rate": 8.540915395284328e-06, + "loss": 0.0948, + "step": 2114 + }, + { + "epoch": 1.4651887772774506, + "grad_norm": 0.8269270658493042, + "learning_rate": 8.540221914008323e-06, + "loss": 0.0901, + "step": 2115 + }, + { + "epoch": 1.4658815379286456, + "grad_norm": 0.9558659195899963, + "learning_rate": 8.539528432732318e-06, + "loss": 0.0904, + "step": 2116 + }, + { + "epoch": 1.4665742985798407, + "grad_norm": 0.8863042593002319, + "learning_rate": 8.538834951456312e-06, + "loss": 0.0976, + "step": 2117 + }, + { + "epoch": 1.4672670592310357, + "grad_norm": 0.8317894339561462, + "learning_rate": 8.538141470180306e-06, + "loss": 0.0847, + "step": 2118 + }, + { + "epoch": 1.4679598198822306, + "grad_norm": 1.0261125564575195, + "learning_rate": 8.5374479889043e-06, + "loss": 0.1116, + "step": 2119 + }, + { + "epoch": 1.4686525805334256, + "grad_norm": 0.8076496720314026, + "learning_rate": 8.536754507628294e-06, + "loss": 0.0839, + "step": 2120 + }, + { + "epoch": 1.4693453411846207, + "grad_norm": 0.8292192220687866, + "learning_rate": 8.536061026352289e-06, + "loss": 0.0897, + "step": 2121 + }, + { + "epoch": 1.4700381018358157, + "grad_norm": 0.8142775893211365, + "learning_rate": 8.535367545076284e-06, + "loss": 0.0925, + "step": 2122 + }, + { + "epoch": 1.4707308624870108, + "grad_norm": 0.844031810760498, + "learning_rate": 8.534674063800279e-06, + "loss": 0.105, + "step": 2123 + }, + { + "epoch": 1.4714236231382056, + "grad_norm": 0.8712015748023987, + "learning_rate": 8.533980582524272e-06, + "loss": 0.1166, + "step": 2124 + }, + { + "epoch": 1.4721163837894007, + "grad_norm": 0.89336097240448, + "learning_rate": 8.533287101248267e-06, + "loss": 0.1108, + "step": 2125 + }, + { + "epoch": 1.4728091444405957, + "grad_norm": 0.8269360065460205, + "learning_rate": 8.532593619972262e-06, + "loss": 0.0826, + "step": 2126 + }, + { + "epoch": 1.4735019050917908, + "grad_norm": 0.8392295837402344, + "learning_rate": 8.531900138696257e-06, + "loss": 0.0989, + "step": 2127 + }, + { + "epoch": 1.4741946657429859, + "grad_norm": 0.8825324773788452, + "learning_rate": 8.53120665742025e-06, + "loss": 0.0998, + "step": 2128 + }, + { + "epoch": 1.4748874263941807, + "grad_norm": 0.8092551231384277, + "learning_rate": 8.530513176144245e-06, + "loss": 0.0932, + "step": 2129 + }, + { + "epoch": 1.4755801870453757, + "grad_norm": 0.8736479878425598, + "learning_rate": 8.529819694868238e-06, + "loss": 0.0914, + "step": 2130 + }, + { + "epoch": 1.4762729476965708, + "grad_norm": 0.8300082683563232, + "learning_rate": 8.529126213592233e-06, + "loss": 0.0916, + "step": 2131 + }, + { + "epoch": 1.4769657083477659, + "grad_norm": 0.8586478233337402, + "learning_rate": 8.528432732316228e-06, + "loss": 0.0913, + "step": 2132 + }, + { + "epoch": 1.477658468998961, + "grad_norm": 0.935139536857605, + "learning_rate": 8.527739251040223e-06, + "loss": 0.126, + "step": 2133 + }, + { + "epoch": 1.4783512296501558, + "grad_norm": 0.8662792444229126, + "learning_rate": 8.527045769764218e-06, + "loss": 0.094, + "step": 2134 + }, + { + "epoch": 1.4790439903013508, + "grad_norm": 0.8421633839607239, + "learning_rate": 8.526352288488211e-06, + "loss": 0.097, + "step": 2135 + }, + { + "epoch": 1.4797367509525459, + "grad_norm": 0.8011019229888916, + "learning_rate": 8.525658807212206e-06, + "loss": 0.1083, + "step": 2136 + }, + { + "epoch": 1.480429511603741, + "grad_norm": 0.7471836805343628, + "learning_rate": 8.524965325936201e-06, + "loss": 0.0688, + "step": 2137 + }, + { + "epoch": 1.481122272254936, + "grad_norm": 0.8931064009666443, + "learning_rate": 8.524271844660194e-06, + "loss": 0.101, + "step": 2138 + }, + { + "epoch": 1.4818150329061308, + "grad_norm": 0.8537425994873047, + "learning_rate": 8.523578363384189e-06, + "loss": 0.0788, + "step": 2139 + }, + { + "epoch": 1.4825077935573259, + "grad_norm": 0.8899325728416443, + "learning_rate": 8.522884882108184e-06, + "loss": 0.0986, + "step": 2140 + }, + { + "epoch": 1.483200554208521, + "grad_norm": 0.864824652671814, + "learning_rate": 8.522191400832179e-06, + "loss": 0.0884, + "step": 2141 + }, + { + "epoch": 1.483893314859716, + "grad_norm": 0.8555802702903748, + "learning_rate": 8.521497919556174e-06, + "loss": 0.1002, + "step": 2142 + }, + { + "epoch": 1.484586075510911, + "grad_norm": 0.8407396674156189, + "learning_rate": 8.520804438280167e-06, + "loss": 0.0942, + "step": 2143 + }, + { + "epoch": 1.4852788361621059, + "grad_norm": 0.8692257404327393, + "learning_rate": 8.520110957004162e-06, + "loss": 0.1022, + "step": 2144 + }, + { + "epoch": 1.485971596813301, + "grad_norm": 0.9186396598815918, + "learning_rate": 8.519417475728155e-06, + "loss": 0.0994, + "step": 2145 + }, + { + "epoch": 1.486664357464496, + "grad_norm": 0.8801345825195312, + "learning_rate": 8.51872399445215e-06, + "loss": 0.0945, + "step": 2146 + }, + { + "epoch": 1.487357118115691, + "grad_norm": 0.7477225065231323, + "learning_rate": 8.518030513176145e-06, + "loss": 0.0738, + "step": 2147 + }, + { + "epoch": 1.488049878766886, + "grad_norm": 1.0097402334213257, + "learning_rate": 8.517337031900138e-06, + "loss": 0.104, + "step": 2148 + }, + { + "epoch": 1.488742639418081, + "grad_norm": 0.927609384059906, + "learning_rate": 8.516643550624133e-06, + "loss": 0.0944, + "step": 2149 + }, + { + "epoch": 1.489435400069276, + "grad_norm": 0.7638230323791504, + "learning_rate": 8.515950069348128e-06, + "loss": 0.0806, + "step": 2150 + }, + { + "epoch": 1.490128160720471, + "grad_norm": 1.0538862943649292, + "learning_rate": 8.515256588072123e-06, + "loss": 0.1002, + "step": 2151 + }, + { + "epoch": 1.490820921371666, + "grad_norm": 0.807037889957428, + "learning_rate": 8.514563106796118e-06, + "loss": 0.0926, + "step": 2152 + }, + { + "epoch": 1.4915136820228612, + "grad_norm": 0.7541260123252869, + "learning_rate": 8.513869625520111e-06, + "loss": 0.0893, + "step": 2153 + }, + { + "epoch": 1.492206442674056, + "grad_norm": 0.783531904220581, + "learning_rate": 8.513176144244106e-06, + "loss": 0.0913, + "step": 2154 + }, + { + "epoch": 1.492899203325251, + "grad_norm": 0.8726778030395508, + "learning_rate": 8.5124826629681e-06, + "loss": 0.093, + "step": 2155 + }, + { + "epoch": 1.4935919639764461, + "grad_norm": 0.9442015290260315, + "learning_rate": 8.511789181692095e-06, + "loss": 0.1027, + "step": 2156 + }, + { + "epoch": 1.4942847246276412, + "grad_norm": 0.8207205533981323, + "learning_rate": 8.51109570041609e-06, + "loss": 0.0933, + "step": 2157 + }, + { + "epoch": 1.4949774852788362, + "grad_norm": 0.8412635922431946, + "learning_rate": 8.510402219140084e-06, + "loss": 0.0957, + "step": 2158 + }, + { + "epoch": 1.495670245930031, + "grad_norm": 0.9047576189041138, + "learning_rate": 8.50970873786408e-06, + "loss": 0.0998, + "step": 2159 + }, + { + "epoch": 1.4963630065812261, + "grad_norm": 0.8066442012786865, + "learning_rate": 8.509015256588073e-06, + "loss": 0.0849, + "step": 2160 + }, + { + "epoch": 1.4970557672324212, + "grad_norm": 0.8611944317817688, + "learning_rate": 8.508321775312068e-06, + "loss": 0.0895, + "step": 2161 + }, + { + "epoch": 1.4977485278836162, + "grad_norm": 0.9365400075912476, + "learning_rate": 8.507628294036062e-06, + "loss": 0.0908, + "step": 2162 + }, + { + "epoch": 1.4984412885348113, + "grad_norm": 0.8359665274620056, + "learning_rate": 8.506934812760056e-06, + "loss": 0.0953, + "step": 2163 + }, + { + "epoch": 1.4991340491860061, + "grad_norm": 0.806018054485321, + "learning_rate": 8.50624133148405e-06, + "loss": 0.0843, + "step": 2164 + }, + { + "epoch": 1.4998268098372012, + "grad_norm": 0.8166291117668152, + "learning_rate": 8.505547850208044e-06, + "loss": 0.0807, + "step": 2165 + }, + { + "epoch": 1.5005195704883962, + "grad_norm": 0.7994527220726013, + "learning_rate": 8.504854368932039e-06, + "loss": 0.0919, + "step": 2166 + }, + { + "epoch": 1.5012123311395913, + "grad_norm": 0.8588024377822876, + "learning_rate": 8.504160887656034e-06, + "loss": 0.1015, + "step": 2167 + }, + { + "epoch": 1.5019050917907864, + "grad_norm": 0.8852903842926025, + "learning_rate": 8.503467406380029e-06, + "loss": 0.1108, + "step": 2168 + }, + { + "epoch": 1.5025978524419812, + "grad_norm": 0.8280705213546753, + "learning_rate": 8.502773925104024e-06, + "loss": 0.0786, + "step": 2169 + }, + { + "epoch": 1.5032906130931765, + "grad_norm": 0.8698577880859375, + "learning_rate": 8.502080443828017e-06, + "loss": 0.1013, + "step": 2170 + }, + { + "epoch": 1.5039833737443713, + "grad_norm": 0.960726261138916, + "learning_rate": 8.501386962552012e-06, + "loss": 0.0949, + "step": 2171 + }, + { + "epoch": 1.5046761343955664, + "grad_norm": 0.8570393323898315, + "learning_rate": 8.500693481276007e-06, + "loss": 0.1036, + "step": 2172 + }, + { + "epoch": 1.5053688950467614, + "grad_norm": 0.8700308203697205, + "learning_rate": 8.5e-06, + "loss": 0.102, + "step": 2173 + }, + { + "epoch": 1.5060616556979562, + "grad_norm": 0.9036585092544556, + "learning_rate": 8.499306518723995e-06, + "loss": 0.0972, + "step": 2174 + }, + { + "epoch": 1.5067544163491515, + "grad_norm": 0.8921233415603638, + "learning_rate": 8.49861303744799e-06, + "loss": 0.1123, + "step": 2175 + }, + { + "epoch": 1.5074471770003464, + "grad_norm": 0.7798379063606262, + "learning_rate": 8.497919556171985e-06, + "loss": 0.0886, + "step": 2176 + }, + { + "epoch": 1.5081399376515414, + "grad_norm": 0.8370013236999512, + "learning_rate": 8.49722607489598e-06, + "loss": 0.0957, + "step": 2177 + }, + { + "epoch": 1.5088326983027365, + "grad_norm": 0.7789174318313599, + "learning_rate": 8.496532593619973e-06, + "loss": 0.0889, + "step": 2178 + }, + { + "epoch": 1.5095254589539313, + "grad_norm": 0.7475576400756836, + "learning_rate": 8.495839112343968e-06, + "loss": 0.0911, + "step": 2179 + }, + { + "epoch": 1.5102182196051266, + "grad_norm": 0.8233779072761536, + "learning_rate": 8.495145631067961e-06, + "loss": 0.0744, + "step": 2180 + }, + { + "epoch": 1.5109109802563214, + "grad_norm": 0.7661387324333191, + "learning_rate": 8.494452149791956e-06, + "loss": 0.0931, + "step": 2181 + }, + { + "epoch": 1.5116037409075165, + "grad_norm": 0.7867785096168518, + "learning_rate": 8.493758668515951e-06, + "loss": 0.0901, + "step": 2182 + }, + { + "epoch": 1.5122965015587115, + "grad_norm": 0.8294643759727478, + "learning_rate": 8.493065187239944e-06, + "loss": 0.1132, + "step": 2183 + }, + { + "epoch": 1.5129892622099064, + "grad_norm": 0.8717425465583801, + "learning_rate": 8.492371705963939e-06, + "loss": 0.0987, + "step": 2184 + }, + { + "epoch": 1.5136820228611017, + "grad_norm": 0.8166452646255493, + "learning_rate": 8.491678224687934e-06, + "loss": 0.0901, + "step": 2185 + }, + { + "epoch": 1.5143747835122965, + "grad_norm": 0.8640490770339966, + "learning_rate": 8.490984743411929e-06, + "loss": 0.1201, + "step": 2186 + }, + { + "epoch": 1.5150675441634915, + "grad_norm": 0.8363556861877441, + "learning_rate": 8.490291262135924e-06, + "loss": 0.1068, + "step": 2187 + }, + { + "epoch": 1.5157603048146866, + "grad_norm": 0.8939452171325684, + "learning_rate": 8.489597780859917e-06, + "loss": 0.0878, + "step": 2188 + }, + { + "epoch": 1.5164530654658814, + "grad_norm": 0.9297623634338379, + "learning_rate": 8.488904299583912e-06, + "loss": 0.0937, + "step": 2189 + }, + { + "epoch": 1.5171458261170767, + "grad_norm": 0.8661013841629028, + "learning_rate": 8.488210818307905e-06, + "loss": 0.0862, + "step": 2190 + }, + { + "epoch": 1.5178385867682715, + "grad_norm": 0.9533841013908386, + "learning_rate": 8.4875173370319e-06, + "loss": 0.1266, + "step": 2191 + }, + { + "epoch": 1.5185313474194666, + "grad_norm": 0.873518705368042, + "learning_rate": 8.486823855755895e-06, + "loss": 0.0928, + "step": 2192 + }, + { + "epoch": 1.5192241080706617, + "grad_norm": 0.8036083579063416, + "learning_rate": 8.48613037447989e-06, + "loss": 0.093, + "step": 2193 + }, + { + "epoch": 1.5199168687218565, + "grad_norm": 0.8228848576545715, + "learning_rate": 8.485436893203885e-06, + "loss": 0.0908, + "step": 2194 + }, + { + "epoch": 1.5206096293730518, + "grad_norm": 0.8991081118583679, + "learning_rate": 8.484743411927878e-06, + "loss": 0.1134, + "step": 2195 + }, + { + "epoch": 1.5213023900242466, + "grad_norm": 0.891331136226654, + "learning_rate": 8.484049930651873e-06, + "loss": 0.1266, + "step": 2196 + }, + { + "epoch": 1.5219951506754417, + "grad_norm": 0.7529364228248596, + "learning_rate": 8.483356449375868e-06, + "loss": 0.0813, + "step": 2197 + }, + { + "epoch": 1.5226879113266367, + "grad_norm": 0.774604320526123, + "learning_rate": 8.482662968099861e-06, + "loss": 0.1056, + "step": 2198 + }, + { + "epoch": 1.5233806719778316, + "grad_norm": 0.9285426139831543, + "learning_rate": 8.481969486823856e-06, + "loss": 0.0923, + "step": 2199 + }, + { + "epoch": 1.5240734326290268, + "grad_norm": 0.866604208946228, + "learning_rate": 8.481276005547851e-06, + "loss": 0.1077, + "step": 2200 + }, + { + "epoch": 1.5247661932802217, + "grad_norm": 0.8425531983375549, + "learning_rate": 8.480582524271845e-06, + "loss": 0.0961, + "step": 2201 + }, + { + "epoch": 1.5254589539314167, + "grad_norm": 0.7247449159622192, + "learning_rate": 8.47988904299584e-06, + "loss": 0.0833, + "step": 2202 + }, + { + "epoch": 1.5261517145826118, + "grad_norm": 0.7726675868034363, + "learning_rate": 8.479195561719834e-06, + "loss": 0.0886, + "step": 2203 + }, + { + "epoch": 1.5268444752338066, + "grad_norm": 0.7877854704856873, + "learning_rate": 8.47850208044383e-06, + "loss": 0.0782, + "step": 2204 + }, + { + "epoch": 1.527537235885002, + "grad_norm": 0.8992438912391663, + "learning_rate": 8.477808599167823e-06, + "loss": 0.0864, + "step": 2205 + }, + { + "epoch": 1.5282299965361967, + "grad_norm": 0.8800943493843079, + "learning_rate": 8.477115117891817e-06, + "loss": 0.0785, + "step": 2206 + }, + { + "epoch": 1.5289227571873918, + "grad_norm": 0.7755690813064575, + "learning_rate": 8.476421636615812e-06, + "loss": 0.0819, + "step": 2207 + }, + { + "epoch": 1.5296155178385868, + "grad_norm": 0.7317102551460266, + "learning_rate": 8.475728155339806e-06, + "loss": 0.0843, + "step": 2208 + }, + { + "epoch": 1.5303082784897817, + "grad_norm": 0.6747839450836182, + "learning_rate": 8.4750346740638e-06, + "loss": 0.0643, + "step": 2209 + }, + { + "epoch": 1.5310010391409767, + "grad_norm": 0.789362907409668, + "learning_rate": 8.474341192787796e-06, + "loss": 0.0965, + "step": 2210 + }, + { + "epoch": 1.5316937997921718, + "grad_norm": 0.8516563773155212, + "learning_rate": 8.47364771151179e-06, + "loss": 0.1064, + "step": 2211 + }, + { + "epoch": 1.5323865604433669, + "grad_norm": 1.3988782167434692, + "learning_rate": 8.472954230235785e-06, + "loss": 0.0986, + "step": 2212 + }, + { + "epoch": 1.533079321094562, + "grad_norm": 0.8765525817871094, + "learning_rate": 8.472260748959779e-06, + "loss": 0.1131, + "step": 2213 + }, + { + "epoch": 1.5337720817457567, + "grad_norm": 0.8611074686050415, + "learning_rate": 8.471567267683774e-06, + "loss": 0.0941, + "step": 2214 + }, + { + "epoch": 1.5344648423969518, + "grad_norm": 0.8504629135131836, + "learning_rate": 8.470873786407767e-06, + "loss": 0.1056, + "step": 2215 + }, + { + "epoch": 1.5351576030481469, + "grad_norm": 0.8550898432731628, + "learning_rate": 8.470180305131762e-06, + "loss": 0.1006, + "step": 2216 + }, + { + "epoch": 1.535850363699342, + "grad_norm": 0.8239811062812805, + "learning_rate": 8.469486823855757e-06, + "loss": 0.081, + "step": 2217 + }, + { + "epoch": 1.536543124350537, + "grad_norm": 0.8953526616096497, + "learning_rate": 8.468793342579752e-06, + "loss": 0.114, + "step": 2218 + }, + { + "epoch": 1.5372358850017318, + "grad_norm": 0.8694466948509216, + "learning_rate": 8.468099861303747e-06, + "loss": 0.0892, + "step": 2219 + }, + { + "epoch": 1.5379286456529269, + "grad_norm": 0.9421346783638, + "learning_rate": 8.46740638002774e-06, + "loss": 0.1106, + "step": 2220 + }, + { + "epoch": 1.538621406304122, + "grad_norm": 0.8330638408660889, + "learning_rate": 8.466712898751735e-06, + "loss": 0.1088, + "step": 2221 + }, + { + "epoch": 1.539314166955317, + "grad_norm": 0.7530385851860046, + "learning_rate": 8.46601941747573e-06, + "loss": 0.0777, + "step": 2222 + }, + { + "epoch": 1.540006927606512, + "grad_norm": 0.7912553548812866, + "learning_rate": 8.465325936199723e-06, + "loss": 0.0788, + "step": 2223 + }, + { + "epoch": 1.5406996882577069, + "grad_norm": 0.9006667733192444, + "learning_rate": 8.464632454923718e-06, + "loss": 0.115, + "step": 2224 + }, + { + "epoch": 1.541392448908902, + "grad_norm": 0.8519740104675293, + "learning_rate": 8.463938973647711e-06, + "loss": 0.0983, + "step": 2225 + }, + { + "epoch": 1.542085209560097, + "grad_norm": 0.9370810389518738, + "learning_rate": 8.463245492371706e-06, + "loss": 0.1081, + "step": 2226 + }, + { + "epoch": 1.542777970211292, + "grad_norm": 0.8160594701766968, + "learning_rate": 8.462552011095701e-06, + "loss": 0.0985, + "step": 2227 + }, + { + "epoch": 1.543470730862487, + "grad_norm": 0.8738390803337097, + "learning_rate": 8.461858529819696e-06, + "loss": 0.1103, + "step": 2228 + }, + { + "epoch": 1.544163491513682, + "grad_norm": 0.8068435192108154, + "learning_rate": 8.46116504854369e-06, + "loss": 0.0806, + "step": 2229 + }, + { + "epoch": 1.544856252164877, + "grad_norm": 0.9434190988540649, + "learning_rate": 8.460471567267684e-06, + "loss": 0.0955, + "step": 2230 + }, + { + "epoch": 1.545549012816072, + "grad_norm": 0.8715764284133911, + "learning_rate": 8.459778085991679e-06, + "loss": 0.0864, + "step": 2231 + }, + { + "epoch": 1.546241773467267, + "grad_norm": 0.7127259969711304, + "learning_rate": 8.459084604715674e-06, + "loss": 0.0829, + "step": 2232 + }, + { + "epoch": 1.5469345341184622, + "grad_norm": 0.798002302646637, + "learning_rate": 8.458391123439667e-06, + "loss": 0.0826, + "step": 2233 + }, + { + "epoch": 1.547627294769657, + "grad_norm": 0.7845522165298462, + "learning_rate": 8.457697642163662e-06, + "loss": 0.0838, + "step": 2234 + }, + { + "epoch": 1.548320055420852, + "grad_norm": 0.8004646301269531, + "learning_rate": 8.457004160887657e-06, + "loss": 0.0846, + "step": 2235 + }, + { + "epoch": 1.549012816072047, + "grad_norm": 0.8239243626594543, + "learning_rate": 8.456310679611652e-06, + "loss": 0.0949, + "step": 2236 + }, + { + "epoch": 1.5497055767232422, + "grad_norm": 0.785351037979126, + "learning_rate": 8.455617198335647e-06, + "loss": 0.0776, + "step": 2237 + }, + { + "epoch": 1.5503983373744372, + "grad_norm": 0.7641162276268005, + "learning_rate": 8.45492371705964e-06, + "loss": 0.0743, + "step": 2238 + }, + { + "epoch": 1.551091098025632, + "grad_norm": 0.8055779337882996, + "learning_rate": 8.454230235783635e-06, + "loss": 0.0779, + "step": 2239 + }, + { + "epoch": 1.551783858676827, + "grad_norm": 0.806489109992981, + "learning_rate": 8.453536754507628e-06, + "loss": 0.0813, + "step": 2240 + }, + { + "epoch": 1.5524766193280222, + "grad_norm": 0.8963631391525269, + "learning_rate": 8.452843273231623e-06, + "loss": 0.0991, + "step": 2241 + }, + { + "epoch": 1.5531693799792172, + "grad_norm": 0.8540571928024292, + "learning_rate": 8.452149791955618e-06, + "loss": 0.0984, + "step": 2242 + }, + { + "epoch": 1.5538621406304123, + "grad_norm": 0.7815813422203064, + "learning_rate": 8.451456310679611e-06, + "loss": 0.0863, + "step": 2243 + }, + { + "epoch": 1.5545549012816071, + "grad_norm": 0.8563543558120728, + "learning_rate": 8.450762829403606e-06, + "loss": 0.0894, + "step": 2244 + }, + { + "epoch": 1.5552476619328022, + "grad_norm": 0.7765095233917236, + "learning_rate": 8.450069348127601e-06, + "loss": 0.078, + "step": 2245 + }, + { + "epoch": 1.5559404225839972, + "grad_norm": 0.7816537022590637, + "learning_rate": 8.449375866851596e-06, + "loss": 0.0856, + "step": 2246 + }, + { + "epoch": 1.5566331832351923, + "grad_norm": 0.9687926173210144, + "learning_rate": 8.448682385575591e-06, + "loss": 0.085, + "step": 2247 + }, + { + "epoch": 1.5573259438863873, + "grad_norm": 0.9634583592414856, + "learning_rate": 8.447988904299584e-06, + "loss": 0.1021, + "step": 2248 + }, + { + "epoch": 1.5580187045375822, + "grad_norm": 0.8024788498878479, + "learning_rate": 8.44729542302358e-06, + "loss": 0.0829, + "step": 2249 + }, + { + "epoch": 1.5587114651887772, + "grad_norm": 0.77815181016922, + "learning_rate": 8.446601941747573e-06, + "loss": 0.0855, + "step": 2250 + }, + { + "epoch": 1.5594042258399723, + "grad_norm": 1.0114446878433228, + "learning_rate": 8.445908460471567e-06, + "loss": 0.113, + "step": 2251 + }, + { + "epoch": 1.5600969864911673, + "grad_norm": 0.8147749304771423, + "learning_rate": 8.445214979195562e-06, + "loss": 0.0855, + "step": 2252 + }, + { + "epoch": 1.5607897471423624, + "grad_norm": 0.8138399124145508, + "learning_rate": 8.444521497919557e-06, + "loss": 0.088, + "step": 2253 + }, + { + "epoch": 1.5614825077935572, + "grad_norm": 0.8528136610984802, + "learning_rate": 8.443828016643552e-06, + "loss": 0.0915, + "step": 2254 + }, + { + "epoch": 1.5621752684447523, + "grad_norm": 0.7786431312561035, + "learning_rate": 8.443134535367546e-06, + "loss": 0.085, + "step": 2255 + }, + { + "epoch": 1.5628680290959474, + "grad_norm": 0.8430132865905762, + "learning_rate": 8.44244105409154e-06, + "loss": 0.0956, + "step": 2256 + }, + { + "epoch": 1.5635607897471424, + "grad_norm": 0.8963088393211365, + "learning_rate": 8.441747572815535e-06, + "loss": 0.0866, + "step": 2257 + }, + { + "epoch": 1.5642535503983375, + "grad_norm": 0.7786991596221924, + "learning_rate": 8.441054091539529e-06, + "loss": 0.0851, + "step": 2258 + }, + { + "epoch": 1.5649463110495323, + "grad_norm": 0.939453125, + "learning_rate": 8.440360610263524e-06, + "loss": 0.1006, + "step": 2259 + }, + { + "epoch": 1.5656390717007274, + "grad_norm": 0.7245839834213257, + "learning_rate": 8.439667128987517e-06, + "loss": 0.085, + "step": 2260 + }, + { + "epoch": 1.5663318323519224, + "grad_norm": 0.9496403336524963, + "learning_rate": 8.438973647711512e-06, + "loss": 0.1069, + "step": 2261 + }, + { + "epoch": 1.5670245930031175, + "grad_norm": 0.8363348245620728, + "learning_rate": 8.438280166435507e-06, + "loss": 0.0969, + "step": 2262 + }, + { + "epoch": 1.5677173536543125, + "grad_norm": 0.8518447279930115, + "learning_rate": 8.437586685159502e-06, + "loss": 0.0822, + "step": 2263 + }, + { + "epoch": 1.5684101143055074, + "grad_norm": 0.8098133206367493, + "learning_rate": 8.436893203883497e-06, + "loss": 0.0888, + "step": 2264 + }, + { + "epoch": 1.5691028749567024, + "grad_norm": 0.8080687522888184, + "learning_rate": 8.43619972260749e-06, + "loss": 0.0876, + "step": 2265 + }, + { + "epoch": 1.5697956356078975, + "grad_norm": 1.146905779838562, + "learning_rate": 8.435506241331485e-06, + "loss": 0.1192, + "step": 2266 + }, + { + "epoch": 1.5704883962590925, + "grad_norm": 0.7518221735954285, + "learning_rate": 8.43481276005548e-06, + "loss": 0.0809, + "step": 2267 + }, + { + "epoch": 1.5711811569102876, + "grad_norm": 0.8670949935913086, + "learning_rate": 8.434119278779473e-06, + "loss": 0.0992, + "step": 2268 + }, + { + "epoch": 1.5718739175614824, + "grad_norm": 0.8235729336738586, + "learning_rate": 8.433425797503468e-06, + "loss": 0.0958, + "step": 2269 + }, + { + "epoch": 1.5725666782126775, + "grad_norm": 0.8344941735267639, + "learning_rate": 8.432732316227463e-06, + "loss": 0.101, + "step": 2270 + }, + { + "epoch": 1.5732594388638725, + "grad_norm": 0.8182787895202637, + "learning_rate": 8.432038834951458e-06, + "loss": 0.1032, + "step": 2271 + }, + { + "epoch": 1.5739521995150676, + "grad_norm": 0.9052494168281555, + "learning_rate": 8.431345353675453e-06, + "loss": 0.0842, + "step": 2272 + }, + { + "epoch": 1.5746449601662627, + "grad_norm": 0.8101468682289124, + "learning_rate": 8.430651872399446e-06, + "loss": 0.0897, + "step": 2273 + }, + { + "epoch": 1.5753377208174575, + "grad_norm": 0.7904663681983948, + "learning_rate": 8.42995839112344e-06, + "loss": 0.0918, + "step": 2274 + }, + { + "epoch": 1.5760304814686525, + "grad_norm": 0.9042665958404541, + "learning_rate": 8.429264909847434e-06, + "loss": 0.0922, + "step": 2275 + }, + { + "epoch": 1.5767232421198476, + "grad_norm": 0.8572434782981873, + "learning_rate": 8.428571428571429e-06, + "loss": 0.0952, + "step": 2276 + }, + { + "epoch": 1.5774160027710427, + "grad_norm": 0.7563155293464661, + "learning_rate": 8.427877947295424e-06, + "loss": 0.0838, + "step": 2277 + }, + { + "epoch": 1.5781087634222377, + "grad_norm": 0.7798501253128052, + "learning_rate": 8.427184466019419e-06, + "loss": 0.0838, + "step": 2278 + }, + { + "epoch": 1.5788015240734325, + "grad_norm": 0.901751697063446, + "learning_rate": 8.426490984743412e-06, + "loss": 0.117, + "step": 2279 + }, + { + "epoch": 1.5794942847246276, + "grad_norm": 0.9761192798614502, + "learning_rate": 8.425797503467407e-06, + "loss": 0.0993, + "step": 2280 + }, + { + "epoch": 1.5801870453758227, + "grad_norm": 1.0279780626296997, + "learning_rate": 8.425104022191402e-06, + "loss": 0.141, + "step": 2281 + }, + { + "epoch": 1.5808798060270177, + "grad_norm": 0.848374605178833, + "learning_rate": 8.424410540915397e-06, + "loss": 0.0901, + "step": 2282 + }, + { + "epoch": 1.5815725666782128, + "grad_norm": 0.7858184576034546, + "learning_rate": 8.42371705963939e-06, + "loss": 0.0867, + "step": 2283 + }, + { + "epoch": 1.5822653273294076, + "grad_norm": 0.8985626101493835, + "learning_rate": 8.423023578363385e-06, + "loss": 0.1012, + "step": 2284 + }, + { + "epoch": 1.5829580879806027, + "grad_norm": 0.9050174355506897, + "learning_rate": 8.422330097087378e-06, + "loss": 0.0774, + "step": 2285 + }, + { + "epoch": 1.5836508486317977, + "grad_norm": 0.7872288823127747, + "learning_rate": 8.421636615811373e-06, + "loss": 0.0825, + "step": 2286 + }, + { + "epoch": 1.5843436092829926, + "grad_norm": 0.862546980381012, + "learning_rate": 8.420943134535368e-06, + "loss": 0.1098, + "step": 2287 + }, + { + "epoch": 1.5850363699341878, + "grad_norm": 0.8242610096931458, + "learning_rate": 8.420249653259363e-06, + "loss": 0.0962, + "step": 2288 + }, + { + "epoch": 1.5857291305853827, + "grad_norm": 0.9755899310112, + "learning_rate": 8.419556171983358e-06, + "loss": 0.1015, + "step": 2289 + }, + { + "epoch": 1.5864218912365777, + "grad_norm": 1.0072556734085083, + "learning_rate": 8.418862690707351e-06, + "loss": 0.1444, + "step": 2290 + }, + { + "epoch": 1.5871146518877728, + "grad_norm": 0.980300784111023, + "learning_rate": 8.418169209431346e-06, + "loss": 0.1126, + "step": 2291 + }, + { + "epoch": 1.5878074125389676, + "grad_norm": 0.770821750164032, + "learning_rate": 8.417475728155341e-06, + "loss": 0.0851, + "step": 2292 + }, + { + "epoch": 1.588500173190163, + "grad_norm": 0.8465372323989868, + "learning_rate": 8.416782246879334e-06, + "loss": 0.0993, + "step": 2293 + }, + { + "epoch": 1.5891929338413577, + "grad_norm": 0.7881616353988647, + "learning_rate": 8.41608876560333e-06, + "loss": 0.0852, + "step": 2294 + }, + { + "epoch": 1.5898856944925528, + "grad_norm": 0.8990263342857361, + "learning_rate": 8.415395284327324e-06, + "loss": 0.1093, + "step": 2295 + }, + { + "epoch": 1.5905784551437478, + "grad_norm": 0.8176400065422058, + "learning_rate": 8.414701803051319e-06, + "loss": 0.0919, + "step": 2296 + }, + { + "epoch": 1.5912712157949427, + "grad_norm": 0.831199049949646, + "learning_rate": 8.414008321775314e-06, + "loss": 0.0935, + "step": 2297 + }, + { + "epoch": 1.591963976446138, + "grad_norm": 0.9600399732589722, + "learning_rate": 8.413314840499307e-06, + "loss": 0.1248, + "step": 2298 + }, + { + "epoch": 1.5926567370973328, + "grad_norm": 0.8325619697570801, + "learning_rate": 8.412621359223302e-06, + "loss": 0.0868, + "step": 2299 + }, + { + "epoch": 1.5933494977485279, + "grad_norm": 0.7649892568588257, + "learning_rate": 8.411927877947295e-06, + "loss": 0.0913, + "step": 2300 + }, + { + "epoch": 1.594042258399723, + "grad_norm": 1.0146855115890503, + "learning_rate": 8.41123439667129e-06, + "loss": 0.0886, + "step": 2301 + }, + { + "epoch": 1.5947350190509177, + "grad_norm": 0.775775134563446, + "learning_rate": 8.410540915395285e-06, + "loss": 0.0824, + "step": 2302 + }, + { + "epoch": 1.595427779702113, + "grad_norm": 0.830560564994812, + "learning_rate": 8.409847434119279e-06, + "loss": 0.0918, + "step": 2303 + }, + { + "epoch": 1.5961205403533079, + "grad_norm": 0.7563770413398743, + "learning_rate": 8.409153952843274e-06, + "loss": 0.083, + "step": 2304 + }, + { + "epoch": 1.596813301004503, + "grad_norm": 0.8576757311820984, + "learning_rate": 8.408460471567268e-06, + "loss": 0.0931, + "step": 2305 + }, + { + "epoch": 1.597506061655698, + "grad_norm": 0.8318626880645752, + "learning_rate": 8.407766990291263e-06, + "loss": 0.0818, + "step": 2306 + }, + { + "epoch": 1.5981988223068928, + "grad_norm": 0.8872581124305725, + "learning_rate": 8.407073509015258e-06, + "loss": 0.117, + "step": 2307 + }, + { + "epoch": 1.598891582958088, + "grad_norm": 0.8937380313873291, + "learning_rate": 8.406380027739252e-06, + "loss": 0.1128, + "step": 2308 + }, + { + "epoch": 1.599584343609283, + "grad_norm": 0.7964592576026917, + "learning_rate": 8.405686546463247e-06, + "loss": 0.0863, + "step": 2309 + }, + { + "epoch": 1.600277104260478, + "grad_norm": 0.7582256197929382, + "learning_rate": 8.40499306518724e-06, + "loss": 0.0784, + "step": 2310 + }, + { + "epoch": 1.600969864911673, + "grad_norm": 0.7886786460876465, + "learning_rate": 8.404299583911235e-06, + "loss": 0.0917, + "step": 2311 + }, + { + "epoch": 1.6016626255628679, + "grad_norm": 0.9329751133918762, + "learning_rate": 8.40360610263523e-06, + "loss": 0.1231, + "step": 2312 + }, + { + "epoch": 1.6023553862140631, + "grad_norm": 0.8119995594024658, + "learning_rate": 8.402912621359225e-06, + "loss": 0.0927, + "step": 2313 + }, + { + "epoch": 1.603048146865258, + "grad_norm": 0.8827449679374695, + "learning_rate": 8.40221914008322e-06, + "loss": 0.096, + "step": 2314 + }, + { + "epoch": 1.603740907516453, + "grad_norm": 0.8343936204910278, + "learning_rate": 8.401525658807213e-06, + "loss": 0.1083, + "step": 2315 + }, + { + "epoch": 1.604433668167648, + "grad_norm": 0.769190788269043, + "learning_rate": 8.400832177531208e-06, + "loss": 0.0731, + "step": 2316 + }, + { + "epoch": 1.605126428818843, + "grad_norm": 0.8587493896484375, + "learning_rate": 8.400138696255203e-06, + "loss": 0.1032, + "step": 2317 + }, + { + "epoch": 1.6058191894700382, + "grad_norm": 0.9102193117141724, + "learning_rate": 8.399445214979196e-06, + "loss": 0.0979, + "step": 2318 + }, + { + "epoch": 1.606511950121233, + "grad_norm": 0.7680855989456177, + "learning_rate": 8.39875173370319e-06, + "loss": 0.0859, + "step": 2319 + }, + { + "epoch": 1.607204710772428, + "grad_norm": 0.8719523549079895, + "learning_rate": 8.398058252427184e-06, + "loss": 0.1093, + "step": 2320 + }, + { + "epoch": 1.6078974714236232, + "grad_norm": 0.8795385956764221, + "learning_rate": 8.397364771151179e-06, + "loss": 0.0992, + "step": 2321 + }, + { + "epoch": 1.608590232074818, + "grad_norm": 0.7637278437614441, + "learning_rate": 8.396671289875174e-06, + "loss": 0.0921, + "step": 2322 + }, + { + "epoch": 1.6092829927260133, + "grad_norm": 0.7343862652778625, + "learning_rate": 8.395977808599169e-06, + "loss": 0.087, + "step": 2323 + }, + { + "epoch": 1.609975753377208, + "grad_norm": 0.899121880531311, + "learning_rate": 8.395284327323164e-06, + "loss": 0.0987, + "step": 2324 + }, + { + "epoch": 1.6106685140284032, + "grad_norm": 0.7771249413490295, + "learning_rate": 8.394590846047157e-06, + "loss": 0.0823, + "step": 2325 + }, + { + "epoch": 1.6113612746795982, + "grad_norm": 0.8386172652244568, + "learning_rate": 8.393897364771152e-06, + "loss": 0.0924, + "step": 2326 + }, + { + "epoch": 1.612054035330793, + "grad_norm": 0.8147438764572144, + "learning_rate": 8.393203883495147e-06, + "loss": 0.0955, + "step": 2327 + }, + { + "epoch": 1.6127467959819883, + "grad_norm": 0.95125412940979, + "learning_rate": 8.39251040221914e-06, + "loss": 0.1013, + "step": 2328 + }, + { + "epoch": 1.6134395566331832, + "grad_norm": 0.8926140069961548, + "learning_rate": 8.391816920943135e-06, + "loss": 0.1077, + "step": 2329 + }, + { + "epoch": 1.6141323172843782, + "grad_norm": 0.916304349899292, + "learning_rate": 8.39112343966713e-06, + "loss": 0.1121, + "step": 2330 + }, + { + "epoch": 1.6148250779355733, + "grad_norm": 0.8023694753646851, + "learning_rate": 8.390429958391125e-06, + "loss": 0.0877, + "step": 2331 + }, + { + "epoch": 1.6155178385867681, + "grad_norm": 0.8459504246711731, + "learning_rate": 8.38973647711512e-06, + "loss": 0.0984, + "step": 2332 + }, + { + "epoch": 1.6162105992379634, + "grad_norm": 0.9580438733100891, + "learning_rate": 8.389042995839113e-06, + "loss": 0.1066, + "step": 2333 + }, + { + "epoch": 1.6169033598891582, + "grad_norm": 0.9596865773200989, + "learning_rate": 8.388349514563108e-06, + "loss": 0.1227, + "step": 2334 + }, + { + "epoch": 1.6175961205403533, + "grad_norm": 0.86414635181427, + "learning_rate": 8.387656033287101e-06, + "loss": 0.1046, + "step": 2335 + }, + { + "epoch": 1.6182888811915483, + "grad_norm": 0.8180360198020935, + "learning_rate": 8.386962552011096e-06, + "loss": 0.0839, + "step": 2336 + }, + { + "epoch": 1.6189816418427432, + "grad_norm": 0.8903512358665466, + "learning_rate": 8.386269070735091e-06, + "loss": 0.0875, + "step": 2337 + }, + { + "epoch": 1.6196744024939385, + "grad_norm": 0.7703673839569092, + "learning_rate": 8.385575589459084e-06, + "loss": 0.0887, + "step": 2338 + }, + { + "epoch": 1.6203671631451333, + "grad_norm": 0.8291234374046326, + "learning_rate": 8.38488210818308e-06, + "loss": 0.0738, + "step": 2339 + }, + { + "epoch": 1.6210599237963284, + "grad_norm": 0.850165069103241, + "learning_rate": 8.384188626907074e-06, + "loss": 0.1036, + "step": 2340 + }, + { + "epoch": 1.6217526844475234, + "grad_norm": 0.7883290648460388, + "learning_rate": 8.383495145631069e-06, + "loss": 0.0974, + "step": 2341 + }, + { + "epoch": 1.6224454450987182, + "grad_norm": 0.9966535568237305, + "learning_rate": 8.382801664355064e-06, + "loss": 0.1101, + "step": 2342 + }, + { + "epoch": 1.6231382057499135, + "grad_norm": 0.8514889478683472, + "learning_rate": 8.382108183079057e-06, + "loss": 0.0961, + "step": 2343 + }, + { + "epoch": 1.6238309664011084, + "grad_norm": 0.8843449950218201, + "learning_rate": 8.381414701803052e-06, + "loss": 0.0921, + "step": 2344 + }, + { + "epoch": 1.6245237270523034, + "grad_norm": 0.8890202641487122, + "learning_rate": 8.380721220527045e-06, + "loss": 0.1057, + "step": 2345 + }, + { + "epoch": 1.6252164877034985, + "grad_norm": 0.8342781066894531, + "learning_rate": 8.38002773925104e-06, + "loss": 0.1014, + "step": 2346 + }, + { + "epoch": 1.6259092483546933, + "grad_norm": 0.9303826689720154, + "learning_rate": 8.379334257975035e-06, + "loss": 0.1357, + "step": 2347 + }, + { + "epoch": 1.6266020090058886, + "grad_norm": 0.8655639290809631, + "learning_rate": 8.37864077669903e-06, + "loss": 0.0977, + "step": 2348 + }, + { + "epoch": 1.6272947696570834, + "grad_norm": 0.9540505409240723, + "learning_rate": 8.377947295423025e-06, + "loss": 0.0954, + "step": 2349 + }, + { + "epoch": 1.6279875303082785, + "grad_norm": 0.8128141164779663, + "learning_rate": 8.377253814147018e-06, + "loss": 0.0744, + "step": 2350 + }, + { + "epoch": 1.6286802909594735, + "grad_norm": 0.8301307559013367, + "learning_rate": 8.376560332871013e-06, + "loss": 0.1025, + "step": 2351 + }, + { + "epoch": 1.6293730516106684, + "grad_norm": 0.9831981658935547, + "learning_rate": 8.375866851595008e-06, + "loss": 0.1147, + "step": 2352 + }, + { + "epoch": 1.6300658122618636, + "grad_norm": 0.8499257564544678, + "learning_rate": 8.375173370319002e-06, + "loss": 0.0936, + "step": 2353 + }, + { + "epoch": 1.6307585729130585, + "grad_norm": 0.8565851449966431, + "learning_rate": 8.374479889042996e-06, + "loss": 0.0939, + "step": 2354 + }, + { + "epoch": 1.6314513335642535, + "grad_norm": 0.9829320907592773, + "learning_rate": 8.373786407766991e-06, + "loss": 0.0887, + "step": 2355 + }, + { + "epoch": 1.6321440942154486, + "grad_norm": 0.8618051409721375, + "learning_rate": 8.373092926490985e-06, + "loss": 0.095, + "step": 2356 + }, + { + "epoch": 1.6328368548666434, + "grad_norm": 0.8647840023040771, + "learning_rate": 8.37239944521498e-06, + "loss": 0.0929, + "step": 2357 + }, + { + "epoch": 1.6335296155178387, + "grad_norm": 0.8255338072776794, + "learning_rate": 8.371705963938975e-06, + "loss": 0.0896, + "step": 2358 + }, + { + "epoch": 1.6342223761690335, + "grad_norm": 0.772359311580658, + "learning_rate": 8.37101248266297e-06, + "loss": 0.0771, + "step": 2359 + }, + { + "epoch": 1.6349151368202286, + "grad_norm": 0.7591848373413086, + "learning_rate": 8.370319001386963e-06, + "loss": 0.086, + "step": 2360 + }, + { + "epoch": 1.6356078974714237, + "grad_norm": 0.8855777978897095, + "learning_rate": 8.369625520110958e-06, + "loss": 0.081, + "step": 2361 + }, + { + "epoch": 1.6363006581226185, + "grad_norm": 0.8855263590812683, + "learning_rate": 8.368932038834953e-06, + "loss": 0.0948, + "step": 2362 + }, + { + "epoch": 1.6369934187738138, + "grad_norm": 0.8888722658157349, + "learning_rate": 8.368238557558946e-06, + "loss": 0.0886, + "step": 2363 + }, + { + "epoch": 1.6376861794250086, + "grad_norm": 0.9112776517868042, + "learning_rate": 8.36754507628294e-06, + "loss": 0.1144, + "step": 2364 + }, + { + "epoch": 1.6383789400762037, + "grad_norm": 0.8456845283508301, + "learning_rate": 8.366851595006936e-06, + "loss": 0.0932, + "step": 2365 + }, + { + "epoch": 1.6390717007273987, + "grad_norm": 0.9201662540435791, + "learning_rate": 8.36615811373093e-06, + "loss": 0.0937, + "step": 2366 + }, + { + "epoch": 1.6397644613785936, + "grad_norm": 0.9050766825675964, + "learning_rate": 8.365464632454926e-06, + "loss": 0.1163, + "step": 2367 + }, + { + "epoch": 1.6404572220297888, + "grad_norm": 0.9505478739738464, + "learning_rate": 8.364771151178919e-06, + "loss": 0.1182, + "step": 2368 + }, + { + "epoch": 1.6411499826809837, + "grad_norm": 0.8112539052963257, + "learning_rate": 8.364077669902914e-06, + "loss": 0.0842, + "step": 2369 + }, + { + "epoch": 1.6418427433321787, + "grad_norm": 0.8253764510154724, + "learning_rate": 8.363384188626907e-06, + "loss": 0.0877, + "step": 2370 + }, + { + "epoch": 1.6425355039833738, + "grad_norm": 0.9781371355056763, + "learning_rate": 8.362690707350902e-06, + "loss": 0.1425, + "step": 2371 + }, + { + "epoch": 1.6432282646345686, + "grad_norm": 0.917624831199646, + "learning_rate": 8.361997226074897e-06, + "loss": 0.0885, + "step": 2372 + }, + { + "epoch": 1.643921025285764, + "grad_norm": 0.817848801612854, + "learning_rate": 8.361303744798892e-06, + "loss": 0.1102, + "step": 2373 + }, + { + "epoch": 1.6446137859369587, + "grad_norm": 0.9392564296722412, + "learning_rate": 8.360610263522887e-06, + "loss": 0.1327, + "step": 2374 + }, + { + "epoch": 1.6453065465881538, + "grad_norm": 0.9744406342506409, + "learning_rate": 8.35991678224688e-06, + "loss": 0.1217, + "step": 2375 + }, + { + "epoch": 1.6459993072393488, + "grad_norm": 0.8594071865081787, + "learning_rate": 8.359223300970875e-06, + "loss": 0.1105, + "step": 2376 + }, + { + "epoch": 1.6466920678905437, + "grad_norm": 0.7380807399749756, + "learning_rate": 8.35852981969487e-06, + "loss": 0.0929, + "step": 2377 + }, + { + "epoch": 1.647384828541739, + "grad_norm": 0.8635945320129395, + "learning_rate": 8.357836338418863e-06, + "loss": 0.1011, + "step": 2378 + }, + { + "epoch": 1.6480775891929338, + "grad_norm": 0.9151350259780884, + "learning_rate": 8.357142857142858e-06, + "loss": 0.1154, + "step": 2379 + }, + { + "epoch": 1.6487703498441288, + "grad_norm": 0.8731752038002014, + "learning_rate": 8.356449375866851e-06, + "loss": 0.1034, + "step": 2380 + }, + { + "epoch": 1.649463110495324, + "grad_norm": 0.9005547165870667, + "learning_rate": 8.355755894590846e-06, + "loss": 0.0866, + "step": 2381 + }, + { + "epoch": 1.6501558711465187, + "grad_norm": 0.8569620251655579, + "learning_rate": 8.355062413314841e-06, + "loss": 0.0912, + "step": 2382 + }, + { + "epoch": 1.650848631797714, + "grad_norm": 0.8079420924186707, + "learning_rate": 8.354368932038836e-06, + "loss": 0.0894, + "step": 2383 + }, + { + "epoch": 1.6515413924489089, + "grad_norm": 0.9063421487808228, + "learning_rate": 8.353675450762831e-06, + "loss": 0.0893, + "step": 2384 + }, + { + "epoch": 1.652234153100104, + "grad_norm": 0.8574304580688477, + "learning_rate": 8.352981969486824e-06, + "loss": 0.0971, + "step": 2385 + }, + { + "epoch": 1.652926913751299, + "grad_norm": 0.8941143751144409, + "learning_rate": 8.352288488210819e-06, + "loss": 0.1124, + "step": 2386 + }, + { + "epoch": 1.6536196744024938, + "grad_norm": 0.8861686587333679, + "learning_rate": 8.351595006934814e-06, + "loss": 0.0916, + "step": 2387 + }, + { + "epoch": 1.654312435053689, + "grad_norm": 0.8929340243339539, + "learning_rate": 8.350901525658807e-06, + "loss": 0.1187, + "step": 2388 + }, + { + "epoch": 1.655005195704884, + "grad_norm": 0.8803771138191223, + "learning_rate": 8.350208044382802e-06, + "loss": 0.0893, + "step": 2389 + }, + { + "epoch": 1.655697956356079, + "grad_norm": 0.8754448294639587, + "learning_rate": 8.349514563106797e-06, + "loss": 0.0779, + "step": 2390 + }, + { + "epoch": 1.656390717007274, + "grad_norm": 0.7831154465675354, + "learning_rate": 8.348821081830792e-06, + "loss": 0.0974, + "step": 2391 + }, + { + "epoch": 1.6570834776584689, + "grad_norm": 0.8255621790885925, + "learning_rate": 8.348127600554787e-06, + "loss": 0.0894, + "step": 2392 + }, + { + "epoch": 1.6577762383096641, + "grad_norm": 0.8432981371879578, + "learning_rate": 8.34743411927878e-06, + "loss": 0.0946, + "step": 2393 + }, + { + "epoch": 1.658468998960859, + "grad_norm": 0.8284807801246643, + "learning_rate": 8.346740638002775e-06, + "loss": 0.0788, + "step": 2394 + }, + { + "epoch": 1.659161759612054, + "grad_norm": 0.8855732679367065, + "learning_rate": 8.346047156726768e-06, + "loss": 0.0872, + "step": 2395 + }, + { + "epoch": 1.659854520263249, + "grad_norm": 0.8978323936462402, + "learning_rate": 8.345353675450763e-06, + "loss": 0.1052, + "step": 2396 + }, + { + "epoch": 1.660547280914444, + "grad_norm": 0.8796271681785583, + "learning_rate": 8.344660194174758e-06, + "loss": 0.1155, + "step": 2397 + }, + { + "epoch": 1.6612400415656392, + "grad_norm": 0.7829756736755371, + "learning_rate": 8.343966712898752e-06, + "loss": 0.0842, + "step": 2398 + }, + { + "epoch": 1.661932802216834, + "grad_norm": 0.8052772879600525, + "learning_rate": 8.343273231622746e-06, + "loss": 0.0867, + "step": 2399 + }, + { + "epoch": 1.662625562868029, + "grad_norm": 0.7163562774658203, + "learning_rate": 8.342579750346741e-06, + "loss": 0.0817, + "step": 2400 + }, + { + "epoch": 1.6633183235192242, + "grad_norm": 0.8346788287162781, + "learning_rate": 8.341886269070736e-06, + "loss": 0.1039, + "step": 2401 + }, + { + "epoch": 1.664011084170419, + "grad_norm": 0.8440046310424805, + "learning_rate": 8.341192787794731e-06, + "loss": 0.0859, + "step": 2402 + }, + { + "epoch": 1.6647038448216143, + "grad_norm": 0.8900326490402222, + "learning_rate": 8.340499306518725e-06, + "loss": 0.1011, + "step": 2403 + }, + { + "epoch": 1.665396605472809, + "grad_norm": 0.7580977082252502, + "learning_rate": 8.33980582524272e-06, + "loss": 0.0772, + "step": 2404 + }, + { + "epoch": 1.6660893661240042, + "grad_norm": 0.8428370356559753, + "learning_rate": 8.339112343966713e-06, + "loss": 0.09, + "step": 2405 + }, + { + "epoch": 1.6667821267751992, + "grad_norm": 0.7755301594734192, + "learning_rate": 8.338418862690708e-06, + "loss": 0.095, + "step": 2406 + }, + { + "epoch": 1.667474887426394, + "grad_norm": 0.7946776747703552, + "learning_rate": 8.337725381414703e-06, + "loss": 0.0922, + "step": 2407 + }, + { + "epoch": 1.6681676480775893, + "grad_norm": 0.7756094932556152, + "learning_rate": 8.337031900138697e-06, + "loss": 0.0895, + "step": 2408 + }, + { + "epoch": 1.6688604087287842, + "grad_norm": 0.8539913892745972, + "learning_rate": 8.336338418862692e-06, + "loss": 0.0982, + "step": 2409 + }, + { + "epoch": 1.6695531693799792, + "grad_norm": 0.7312172651290894, + "learning_rate": 8.335644937586686e-06, + "loss": 0.0801, + "step": 2410 + }, + { + "epoch": 1.6702459300311743, + "grad_norm": 0.8065546751022339, + "learning_rate": 8.33495145631068e-06, + "loss": 0.0941, + "step": 2411 + }, + { + "epoch": 1.670938690682369, + "grad_norm": 0.7429964542388916, + "learning_rate": 8.334257975034676e-06, + "loss": 0.0828, + "step": 2412 + }, + { + "epoch": 1.6716314513335644, + "grad_norm": 0.8563544154167175, + "learning_rate": 8.333564493758669e-06, + "loss": 0.095, + "step": 2413 + }, + { + "epoch": 1.6723242119847592, + "grad_norm": 0.8839187026023865, + "learning_rate": 8.332871012482664e-06, + "loss": 0.1261, + "step": 2414 + }, + { + "epoch": 1.6730169726359543, + "grad_norm": 0.9011917114257812, + "learning_rate": 8.332177531206657e-06, + "loss": 0.0873, + "step": 2415 + }, + { + "epoch": 1.6737097332871493, + "grad_norm": 0.8669494390487671, + "learning_rate": 8.331484049930652e-06, + "loss": 0.113, + "step": 2416 + }, + { + "epoch": 1.6744024939383442, + "grad_norm": 0.8910118341445923, + "learning_rate": 8.330790568654647e-06, + "loss": 0.0887, + "step": 2417 + }, + { + "epoch": 1.6750952545895395, + "grad_norm": 0.9729399085044861, + "learning_rate": 8.330097087378642e-06, + "loss": 0.0961, + "step": 2418 + }, + { + "epoch": 1.6757880152407343, + "grad_norm": 0.954855740070343, + "learning_rate": 8.329403606102637e-06, + "loss": 0.1121, + "step": 2419 + }, + { + "epoch": 1.6764807758919293, + "grad_norm": 0.8312874436378479, + "learning_rate": 8.32871012482663e-06, + "loss": 0.097, + "step": 2420 + }, + { + "epoch": 1.6771735365431244, + "grad_norm": 0.8347145915031433, + "learning_rate": 8.328016643550625e-06, + "loss": 0.0895, + "step": 2421 + }, + { + "epoch": 1.6778662971943192, + "grad_norm": 0.8984479904174805, + "learning_rate": 8.32732316227462e-06, + "loss": 0.0936, + "step": 2422 + }, + { + "epoch": 1.6785590578455145, + "grad_norm": 0.7961512804031372, + "learning_rate": 8.326629680998613e-06, + "loss": 0.0836, + "step": 2423 + }, + { + "epoch": 1.6792518184967093, + "grad_norm": 0.9486488699913025, + "learning_rate": 8.325936199722608e-06, + "loss": 0.0976, + "step": 2424 + }, + { + "epoch": 1.6799445791479044, + "grad_norm": 0.8769304156303406, + "learning_rate": 8.325242718446603e-06, + "loss": 0.1022, + "step": 2425 + }, + { + "epoch": 1.6806373397990995, + "grad_norm": 0.8644242882728577, + "learning_rate": 8.324549237170598e-06, + "loss": 0.0875, + "step": 2426 + }, + { + "epoch": 1.6813301004502943, + "grad_norm": 0.8403621912002563, + "learning_rate": 8.323855755894593e-06, + "loss": 0.0991, + "step": 2427 + }, + { + "epoch": 1.6820228611014896, + "grad_norm": 0.8801661729812622, + "learning_rate": 8.323162274618586e-06, + "loss": 0.104, + "step": 2428 + }, + { + "epoch": 1.6827156217526844, + "grad_norm": 0.7376656532287598, + "learning_rate": 8.322468793342581e-06, + "loss": 0.0849, + "step": 2429 + }, + { + "epoch": 1.6834083824038795, + "grad_norm": 0.7886162996292114, + "learning_rate": 8.321775312066574e-06, + "loss": 0.0817, + "step": 2430 + }, + { + "epoch": 1.6841011430550745, + "grad_norm": 0.7826414704322815, + "learning_rate": 8.321081830790569e-06, + "loss": 0.0939, + "step": 2431 + }, + { + "epoch": 1.6847939037062694, + "grad_norm": 0.9302990436553955, + "learning_rate": 8.320388349514564e-06, + "loss": 0.1199, + "step": 2432 + }, + { + "epoch": 1.6854866643574646, + "grad_norm": 0.87852942943573, + "learning_rate": 8.319694868238557e-06, + "loss": 0.0981, + "step": 2433 + }, + { + "epoch": 1.6861794250086595, + "grad_norm": 0.8910174369812012, + "learning_rate": 8.319001386962552e-06, + "loss": 0.0973, + "step": 2434 + }, + { + "epoch": 1.6868721856598545, + "grad_norm": 0.8218281269073486, + "learning_rate": 8.318307905686547e-06, + "loss": 0.0882, + "step": 2435 + }, + { + "epoch": 1.6875649463110496, + "grad_norm": 0.8782800436019897, + "learning_rate": 8.317614424410542e-06, + "loss": 0.1105, + "step": 2436 + }, + { + "epoch": 1.6882577069622444, + "grad_norm": 0.7855113744735718, + "learning_rate": 8.316920943134537e-06, + "loss": 0.1052, + "step": 2437 + }, + { + "epoch": 1.6889504676134397, + "grad_norm": 0.8524637222290039, + "learning_rate": 8.31622746185853e-06, + "loss": 0.1091, + "step": 2438 + }, + { + "epoch": 1.6896432282646345, + "grad_norm": 0.7963051199913025, + "learning_rate": 8.315533980582525e-06, + "loss": 0.0945, + "step": 2439 + }, + { + "epoch": 1.6903359889158296, + "grad_norm": 0.8323280811309814, + "learning_rate": 8.314840499306518e-06, + "loss": 0.0946, + "step": 2440 + }, + { + "epoch": 1.6910287495670246, + "grad_norm": 0.8163182139396667, + "learning_rate": 8.314147018030513e-06, + "loss": 0.0867, + "step": 2441 + }, + { + "epoch": 1.6917215102182195, + "grad_norm": 0.8384972810745239, + "learning_rate": 8.313453536754508e-06, + "loss": 0.0982, + "step": 2442 + }, + { + "epoch": 1.6924142708694148, + "grad_norm": 0.8221649527549744, + "learning_rate": 8.312760055478503e-06, + "loss": 0.0985, + "step": 2443 + }, + { + "epoch": 1.6931070315206096, + "grad_norm": 0.832737386226654, + "learning_rate": 8.312066574202498e-06, + "loss": 0.0946, + "step": 2444 + }, + { + "epoch": 1.6937997921718047, + "grad_norm": 0.8320939540863037, + "learning_rate": 8.311373092926491e-06, + "loss": 0.073, + "step": 2445 + }, + { + "epoch": 1.6944925528229997, + "grad_norm": 0.7923811078071594, + "learning_rate": 8.310679611650486e-06, + "loss": 0.0871, + "step": 2446 + }, + { + "epoch": 1.6951853134741945, + "grad_norm": 0.8732138276100159, + "learning_rate": 8.309986130374481e-06, + "loss": 0.1052, + "step": 2447 + }, + { + "epoch": 1.6958780741253898, + "grad_norm": 0.8823708891868591, + "learning_rate": 8.309292649098474e-06, + "loss": 0.0871, + "step": 2448 + }, + { + "epoch": 1.6965708347765847, + "grad_norm": 0.8793323636054993, + "learning_rate": 8.30859916782247e-06, + "loss": 0.0879, + "step": 2449 + }, + { + "epoch": 1.6972635954277797, + "grad_norm": 0.9887531399726868, + "learning_rate": 8.307905686546464e-06, + "loss": 0.1195, + "step": 2450 + }, + { + "epoch": 1.6979563560789748, + "grad_norm": 0.7778171896934509, + "learning_rate": 8.30721220527046e-06, + "loss": 0.0912, + "step": 2451 + }, + { + "epoch": 1.6986491167301696, + "grad_norm": 0.937286913394928, + "learning_rate": 8.306518723994453e-06, + "loss": 0.0915, + "step": 2452 + }, + { + "epoch": 1.6993418773813649, + "grad_norm": 0.8203182220458984, + "learning_rate": 8.305825242718447e-06, + "loss": 0.0967, + "step": 2453 + }, + { + "epoch": 1.7000346380325597, + "grad_norm": 0.8339923620223999, + "learning_rate": 8.305131761442442e-06, + "loss": 0.0997, + "step": 2454 + }, + { + "epoch": 1.7007273986837548, + "grad_norm": 0.8548272252082825, + "learning_rate": 8.304438280166436e-06, + "loss": 0.0978, + "step": 2455 + }, + { + "epoch": 1.7014201593349498, + "grad_norm": 0.907862663269043, + "learning_rate": 8.30374479889043e-06, + "loss": 0.0938, + "step": 2456 + }, + { + "epoch": 1.7021129199861447, + "grad_norm": 0.819660484790802, + "learning_rate": 8.303051317614425e-06, + "loss": 0.0763, + "step": 2457 + }, + { + "epoch": 1.70280568063734, + "grad_norm": 0.7305322885513306, + "learning_rate": 8.302357836338419e-06, + "loss": 0.0891, + "step": 2458 + }, + { + "epoch": 1.7034984412885348, + "grad_norm": 0.8794694542884827, + "learning_rate": 8.301664355062414e-06, + "loss": 0.1, + "step": 2459 + }, + { + "epoch": 1.7041912019397298, + "grad_norm": 0.9221211075782776, + "learning_rate": 8.300970873786409e-06, + "loss": 0.1073, + "step": 2460 + }, + { + "epoch": 1.704883962590925, + "grad_norm": 0.8608057498931885, + "learning_rate": 8.300277392510404e-06, + "loss": 0.0943, + "step": 2461 + }, + { + "epoch": 1.7055767232421197, + "grad_norm": 0.767064094543457, + "learning_rate": 8.299583911234398e-06, + "loss": 0.083, + "step": 2462 + }, + { + "epoch": 1.706269483893315, + "grad_norm": 0.8210756778717041, + "learning_rate": 8.298890429958392e-06, + "loss": 0.0907, + "step": 2463 + }, + { + "epoch": 1.7069622445445098, + "grad_norm": 0.8764961361885071, + "learning_rate": 8.298196948682387e-06, + "loss": 0.0933, + "step": 2464 + }, + { + "epoch": 1.707655005195705, + "grad_norm": 0.8870079517364502, + "learning_rate": 8.29750346740638e-06, + "loss": 0.1225, + "step": 2465 + }, + { + "epoch": 1.7083477658469, + "grad_norm": 0.8085984587669373, + "learning_rate": 8.296809986130375e-06, + "loss": 0.093, + "step": 2466 + }, + { + "epoch": 1.7090405264980948, + "grad_norm": 0.7712332606315613, + "learning_rate": 8.29611650485437e-06, + "loss": 0.0869, + "step": 2467 + }, + { + "epoch": 1.70973328714929, + "grad_norm": 0.9115517139434814, + "learning_rate": 8.295423023578365e-06, + "loss": 0.102, + "step": 2468 + }, + { + "epoch": 1.710426047800485, + "grad_norm": 0.8816676735877991, + "learning_rate": 8.29472954230236e-06, + "loss": 0.0975, + "step": 2469 + }, + { + "epoch": 1.71111880845168, + "grad_norm": 0.821628212928772, + "learning_rate": 8.294036061026353e-06, + "loss": 0.1004, + "step": 2470 + }, + { + "epoch": 1.711811569102875, + "grad_norm": 0.8490061163902283, + "learning_rate": 8.293342579750348e-06, + "loss": 0.0797, + "step": 2471 + }, + { + "epoch": 1.7125043297540699, + "grad_norm": 0.8417266607284546, + "learning_rate": 8.292649098474343e-06, + "loss": 0.0939, + "step": 2472 + }, + { + "epoch": 1.7131970904052651, + "grad_norm": 0.7965371608734131, + "learning_rate": 8.291955617198336e-06, + "loss": 0.0999, + "step": 2473 + }, + { + "epoch": 1.71388985105646, + "grad_norm": 0.886691689491272, + "learning_rate": 8.291262135922331e-06, + "loss": 0.1093, + "step": 2474 + }, + { + "epoch": 1.714582611707655, + "grad_norm": 0.8134506940841675, + "learning_rate": 8.290568654646324e-06, + "loss": 0.0912, + "step": 2475 + }, + { + "epoch": 1.71527537235885, + "grad_norm": 0.8399380445480347, + "learning_rate": 8.289875173370319e-06, + "loss": 0.0968, + "step": 2476 + }, + { + "epoch": 1.715968133010045, + "grad_norm": 1.010575771331787, + "learning_rate": 8.289181692094314e-06, + "loss": 0.1254, + "step": 2477 + }, + { + "epoch": 1.7166608936612402, + "grad_norm": 0.8292006254196167, + "learning_rate": 8.288488210818309e-06, + "loss": 0.0889, + "step": 2478 + }, + { + "epoch": 1.717353654312435, + "grad_norm": 0.8920153379440308, + "learning_rate": 8.287794729542304e-06, + "loss": 0.0835, + "step": 2479 + }, + { + "epoch": 1.71804641496363, + "grad_norm": 0.7608556151390076, + "learning_rate": 8.287101248266297e-06, + "loss": 0.08, + "step": 2480 + }, + { + "epoch": 1.7187391756148251, + "grad_norm": 0.7679067254066467, + "learning_rate": 8.286407766990292e-06, + "loss": 0.0807, + "step": 2481 + }, + { + "epoch": 1.71943193626602, + "grad_norm": 0.9358129501342773, + "learning_rate": 8.285714285714287e-06, + "loss": 0.101, + "step": 2482 + }, + { + "epoch": 1.7201246969172153, + "grad_norm": 0.8521108627319336, + "learning_rate": 8.28502080443828e-06, + "loss": 0.0766, + "step": 2483 + }, + { + "epoch": 1.72081745756841, + "grad_norm": 0.7279655933380127, + "learning_rate": 8.284327323162275e-06, + "loss": 0.0747, + "step": 2484 + }, + { + "epoch": 1.7215102182196051, + "grad_norm": 0.8134982585906982, + "learning_rate": 8.28363384188627e-06, + "loss": 0.0778, + "step": 2485 + }, + { + "epoch": 1.7222029788708002, + "grad_norm": 0.8102089166641235, + "learning_rate": 8.282940360610265e-06, + "loss": 0.0872, + "step": 2486 + }, + { + "epoch": 1.722895739521995, + "grad_norm": 0.9148496985435486, + "learning_rate": 8.28224687933426e-06, + "loss": 0.0999, + "step": 2487 + }, + { + "epoch": 1.7235885001731903, + "grad_norm": 0.9564724564552307, + "learning_rate": 8.281553398058253e-06, + "loss": 0.0738, + "step": 2488 + }, + { + "epoch": 1.7242812608243852, + "grad_norm": 0.8015518188476562, + "learning_rate": 8.280859916782248e-06, + "loss": 0.1031, + "step": 2489 + }, + { + "epoch": 1.7249740214755802, + "grad_norm": 0.7892102599143982, + "learning_rate": 8.280166435506241e-06, + "loss": 0.074, + "step": 2490 + }, + { + "epoch": 1.7256667821267753, + "grad_norm": 0.811720609664917, + "learning_rate": 8.279472954230236e-06, + "loss": 0.0913, + "step": 2491 + }, + { + "epoch": 1.72635954277797, + "grad_norm": 0.8779986500740051, + "learning_rate": 8.278779472954231e-06, + "loss": 0.1088, + "step": 2492 + }, + { + "epoch": 1.7270523034291654, + "grad_norm": 0.7302936911582947, + "learning_rate": 8.278085991678224e-06, + "loss": 0.0724, + "step": 2493 + }, + { + "epoch": 1.7277450640803602, + "grad_norm": 0.9817513227462769, + "learning_rate": 8.27739251040222e-06, + "loss": 0.0948, + "step": 2494 + }, + { + "epoch": 1.7284378247315553, + "grad_norm": 0.8882198929786682, + "learning_rate": 8.276699029126214e-06, + "loss": 0.1112, + "step": 2495 + }, + { + "epoch": 1.7291305853827503, + "grad_norm": 0.911967933177948, + "learning_rate": 8.27600554785021e-06, + "loss": 0.1198, + "step": 2496 + }, + { + "epoch": 1.7298233460339452, + "grad_norm": 0.7240428328514099, + "learning_rate": 8.275312066574204e-06, + "loss": 0.0806, + "step": 2497 + }, + { + "epoch": 1.7305161066851404, + "grad_norm": 0.8238398432731628, + "learning_rate": 8.274618585298197e-06, + "loss": 0.0913, + "step": 2498 + }, + { + "epoch": 1.7312088673363353, + "grad_norm": 0.784412145614624, + "learning_rate": 8.273925104022192e-06, + "loss": 0.0912, + "step": 2499 + }, + { + "epoch": 1.7319016279875303, + "grad_norm": 0.8094228506088257, + "learning_rate": 8.273231622746186e-06, + "loss": 0.0973, + "step": 2500 + }, + { + "epoch": 1.7325943886387254, + "grad_norm": 0.8541341423988342, + "learning_rate": 8.27253814147018e-06, + "loss": 0.0976, + "step": 2501 + }, + { + "epoch": 1.7332871492899202, + "grad_norm": 0.7942793965339661, + "learning_rate": 8.271844660194175e-06, + "loss": 0.078, + "step": 2502 + }, + { + "epoch": 1.7339799099411155, + "grad_norm": 0.787823498249054, + "learning_rate": 8.27115117891817e-06, + "loss": 0.0909, + "step": 2503 + }, + { + "epoch": 1.7346726705923103, + "grad_norm": 0.8610603213310242, + "learning_rate": 8.270457697642165e-06, + "loss": 0.0981, + "step": 2504 + }, + { + "epoch": 1.7353654312435054, + "grad_norm": 0.9186752438545227, + "learning_rate": 8.269764216366159e-06, + "loss": 0.0874, + "step": 2505 + }, + { + "epoch": 1.7360581918947005, + "grad_norm": 0.7902175188064575, + "learning_rate": 8.269070735090154e-06, + "loss": 0.0847, + "step": 2506 + }, + { + "epoch": 1.7367509525458953, + "grad_norm": 0.8692470788955688, + "learning_rate": 8.268377253814148e-06, + "loss": 0.095, + "step": 2507 + }, + { + "epoch": 1.7374437131970906, + "grad_norm": 0.8168033361434937, + "learning_rate": 8.267683772538142e-06, + "loss": 0.1165, + "step": 2508 + }, + { + "epoch": 1.7381364738482854, + "grad_norm": 0.8191872835159302, + "learning_rate": 8.266990291262137e-06, + "loss": 0.0948, + "step": 2509 + }, + { + "epoch": 1.7388292344994805, + "grad_norm": 0.9185828566551208, + "learning_rate": 8.266296809986132e-06, + "loss": 0.0879, + "step": 2510 + }, + { + "epoch": 1.7395219951506755, + "grad_norm": 0.8484848737716675, + "learning_rate": 8.265603328710125e-06, + "loss": 0.116, + "step": 2511 + }, + { + "epoch": 1.7402147558018703, + "grad_norm": 0.8429151773452759, + "learning_rate": 8.26490984743412e-06, + "loss": 0.0897, + "step": 2512 + }, + { + "epoch": 1.7409075164530656, + "grad_norm": 0.8342225551605225, + "learning_rate": 8.264216366158115e-06, + "loss": 0.1034, + "step": 2513 + }, + { + "epoch": 1.7416002771042605, + "grad_norm": 0.8275694251060486, + "learning_rate": 8.26352288488211e-06, + "loss": 0.0925, + "step": 2514 + }, + { + "epoch": 1.7422930377554555, + "grad_norm": 0.8445605039596558, + "learning_rate": 8.262829403606103e-06, + "loss": 0.0778, + "step": 2515 + }, + { + "epoch": 1.7429857984066506, + "grad_norm": 0.8089935183525085, + "learning_rate": 8.262135922330098e-06, + "loss": 0.0901, + "step": 2516 + }, + { + "epoch": 1.7436785590578454, + "grad_norm": 0.8095570206642151, + "learning_rate": 8.261442441054093e-06, + "loss": 0.0897, + "step": 2517 + }, + { + "epoch": 1.7443713197090407, + "grad_norm": 0.7870088815689087, + "learning_rate": 8.260748959778086e-06, + "loss": 0.0717, + "step": 2518 + }, + { + "epoch": 1.7450640803602355, + "grad_norm": 0.8679880499839783, + "learning_rate": 8.260055478502081e-06, + "loss": 0.1004, + "step": 2519 + }, + { + "epoch": 1.7457568410114306, + "grad_norm": 0.7995391488075256, + "learning_rate": 8.259361997226076e-06, + "loss": 0.0896, + "step": 2520 + }, + { + "epoch": 1.7464496016626256, + "grad_norm": 0.6763067841529846, + "learning_rate": 8.25866851595007e-06, + "loss": 0.0682, + "step": 2521 + }, + { + "epoch": 1.7471423623138205, + "grad_norm": 0.8070166707038879, + "learning_rate": 8.257975034674066e-06, + "loss": 0.0926, + "step": 2522 + }, + { + "epoch": 1.7478351229650158, + "grad_norm": 1.0157724618911743, + "learning_rate": 8.257281553398059e-06, + "loss": 0.0986, + "step": 2523 + }, + { + "epoch": 1.7485278836162106, + "grad_norm": 0.844245970249176, + "learning_rate": 8.256588072122054e-06, + "loss": 0.0841, + "step": 2524 + }, + { + "epoch": 1.7492206442674056, + "grad_norm": 0.9986148476600647, + "learning_rate": 8.255894590846047e-06, + "loss": 0.1219, + "step": 2525 + }, + { + "epoch": 1.7499134049186007, + "grad_norm": 0.9110881090164185, + "learning_rate": 8.255201109570042e-06, + "loss": 0.095, + "step": 2526 + }, + { + "epoch": 1.7506061655697955, + "grad_norm": 0.8442522883415222, + "learning_rate": 8.254507628294037e-06, + "loss": 0.0901, + "step": 2527 + }, + { + "epoch": 1.7512989262209906, + "grad_norm": 0.7971848845481873, + "learning_rate": 8.253814147018032e-06, + "loss": 0.0819, + "step": 2528 + }, + { + "epoch": 1.7519916868721856, + "grad_norm": 0.7700503468513489, + "learning_rate": 8.253120665742025e-06, + "loss": 0.0788, + "step": 2529 + }, + { + "epoch": 1.7526844475233807, + "grad_norm": 0.8246543407440186, + "learning_rate": 8.25242718446602e-06, + "loss": 0.0915, + "step": 2530 + }, + { + "epoch": 1.7533772081745758, + "grad_norm": 0.96554034948349, + "learning_rate": 8.251733703190015e-06, + "loss": 0.1027, + "step": 2531 + }, + { + "epoch": 1.7540699688257706, + "grad_norm": 0.882771372795105, + "learning_rate": 8.25104022191401e-06, + "loss": 0.1009, + "step": 2532 + }, + { + "epoch": 1.7547627294769657, + "grad_norm": 0.8232872486114502, + "learning_rate": 8.250346740638003e-06, + "loss": 0.1193, + "step": 2533 + }, + { + "epoch": 1.7554554901281607, + "grad_norm": 0.7570979595184326, + "learning_rate": 8.249653259361998e-06, + "loss": 0.086, + "step": 2534 + }, + { + "epoch": 1.7561482507793558, + "grad_norm": 0.8680220246315002, + "learning_rate": 8.248959778085991e-06, + "loss": 0.0922, + "step": 2535 + }, + { + "epoch": 1.7568410114305508, + "grad_norm": 0.7393411993980408, + "learning_rate": 8.248266296809986e-06, + "loss": 0.0701, + "step": 2536 + }, + { + "epoch": 1.7575337720817457, + "grad_norm": 0.8233626484870911, + "learning_rate": 8.247572815533981e-06, + "loss": 0.0867, + "step": 2537 + }, + { + "epoch": 1.7582265327329407, + "grad_norm": 0.927104115486145, + "learning_rate": 8.246879334257976e-06, + "loss": 0.1022, + "step": 2538 + }, + { + "epoch": 1.7589192933841358, + "grad_norm": 0.7783875465393066, + "learning_rate": 8.246185852981971e-06, + "loss": 0.0857, + "step": 2539 + }, + { + "epoch": 1.7596120540353308, + "grad_norm": 0.9003371596336365, + "learning_rate": 8.245492371705964e-06, + "loss": 0.1037, + "step": 2540 + }, + { + "epoch": 1.7603048146865259, + "grad_norm": 0.8483417630195618, + "learning_rate": 8.24479889042996e-06, + "loss": 0.0833, + "step": 2541 + }, + { + "epoch": 1.7609975753377207, + "grad_norm": 0.806175172328949, + "learning_rate": 8.244105409153952e-06, + "loss": 0.0896, + "step": 2542 + }, + { + "epoch": 1.7616903359889158, + "grad_norm": 0.8583565950393677, + "learning_rate": 8.243411927877947e-06, + "loss": 0.093, + "step": 2543 + }, + { + "epoch": 1.7623830966401108, + "grad_norm": 0.8102852702140808, + "learning_rate": 8.242718446601942e-06, + "loss": 0.0804, + "step": 2544 + }, + { + "epoch": 1.763075857291306, + "grad_norm": 0.9145060777664185, + "learning_rate": 8.242024965325937e-06, + "loss": 0.1106, + "step": 2545 + }, + { + "epoch": 1.763768617942501, + "grad_norm": 0.9196067452430725, + "learning_rate": 8.241331484049932e-06, + "loss": 0.0947, + "step": 2546 + }, + { + "epoch": 1.7644613785936958, + "grad_norm": 0.9018850922584534, + "learning_rate": 8.240638002773925e-06, + "loss": 0.1057, + "step": 2547 + }, + { + "epoch": 1.7651541392448908, + "grad_norm": 0.8649345636367798, + "learning_rate": 8.23994452149792e-06, + "loss": 0.0845, + "step": 2548 + }, + { + "epoch": 1.765846899896086, + "grad_norm": 0.7476176619529724, + "learning_rate": 8.239251040221915e-06, + "loss": 0.0898, + "step": 2549 + }, + { + "epoch": 1.766539660547281, + "grad_norm": 0.8914130330085754, + "learning_rate": 8.238557558945909e-06, + "loss": 0.1024, + "step": 2550 + }, + { + "epoch": 1.767232421198476, + "grad_norm": 0.8068163990974426, + "learning_rate": 8.237864077669903e-06, + "loss": 0.0938, + "step": 2551 + }, + { + "epoch": 1.7679251818496708, + "grad_norm": 0.8041585087776184, + "learning_rate": 8.237170596393897e-06, + "loss": 0.0971, + "step": 2552 + }, + { + "epoch": 1.768617942500866, + "grad_norm": 2.1894426345825195, + "learning_rate": 8.236477115117892e-06, + "loss": 0.0931, + "step": 2553 + }, + { + "epoch": 1.769310703152061, + "grad_norm": 0.8765089511871338, + "learning_rate": 8.235783633841887e-06, + "loss": 0.079, + "step": 2554 + }, + { + "epoch": 1.770003463803256, + "grad_norm": 0.8140201568603516, + "learning_rate": 8.235090152565882e-06, + "loss": 0.0785, + "step": 2555 + }, + { + "epoch": 1.770696224454451, + "grad_norm": 0.7975987792015076, + "learning_rate": 8.234396671289876e-06, + "loss": 0.0819, + "step": 2556 + }, + { + "epoch": 1.771388985105646, + "grad_norm": 0.9499161839485168, + "learning_rate": 8.23370319001387e-06, + "loss": 0.0903, + "step": 2557 + }, + { + "epoch": 1.772081745756841, + "grad_norm": 0.8398935198783875, + "learning_rate": 8.233009708737865e-06, + "loss": 0.1053, + "step": 2558 + }, + { + "epoch": 1.772774506408036, + "grad_norm": 0.8754689693450928, + "learning_rate": 8.23231622746186e-06, + "loss": 0.0989, + "step": 2559 + }, + { + "epoch": 1.773467267059231, + "grad_norm": 0.8374683856964111, + "learning_rate": 8.231622746185853e-06, + "loss": 0.0741, + "step": 2560 + }, + { + "epoch": 1.7741600277104261, + "grad_norm": 0.9576423764228821, + "learning_rate": 8.230929264909848e-06, + "loss": 0.1182, + "step": 2561 + }, + { + "epoch": 1.774852788361621, + "grad_norm": 0.9090636372566223, + "learning_rate": 8.230235783633843e-06, + "loss": 0.1065, + "step": 2562 + }, + { + "epoch": 1.775545549012816, + "grad_norm": 0.8778228759765625, + "learning_rate": 8.229542302357838e-06, + "loss": 0.0844, + "step": 2563 + }, + { + "epoch": 1.776238309664011, + "grad_norm": 0.8534209728240967, + "learning_rate": 8.228848821081833e-06, + "loss": 0.0941, + "step": 2564 + }, + { + "epoch": 1.7769310703152061, + "grad_norm": 0.8219755291938782, + "learning_rate": 8.228155339805826e-06, + "loss": 0.0951, + "step": 2565 + }, + { + "epoch": 1.7776238309664012, + "grad_norm": 0.8871533274650574, + "learning_rate": 8.22746185852982e-06, + "loss": 0.1154, + "step": 2566 + }, + { + "epoch": 1.778316591617596, + "grad_norm": 0.9020233750343323, + "learning_rate": 8.226768377253814e-06, + "loss": 0.0926, + "step": 2567 + }, + { + "epoch": 1.779009352268791, + "grad_norm": 0.9474078416824341, + "learning_rate": 8.226074895977809e-06, + "loss": 0.1006, + "step": 2568 + }, + { + "epoch": 1.7797021129199861, + "grad_norm": 0.8097711205482483, + "learning_rate": 8.225381414701804e-06, + "loss": 0.1019, + "step": 2569 + }, + { + "epoch": 1.7803948735711812, + "grad_norm": 0.9169875979423523, + "learning_rate": 8.224687933425797e-06, + "loss": 0.0968, + "step": 2570 + }, + { + "epoch": 1.7810876342223763, + "grad_norm": 0.7902358174324036, + "learning_rate": 8.223994452149792e-06, + "loss": 0.0715, + "step": 2571 + }, + { + "epoch": 1.781780394873571, + "grad_norm": 0.842688262462616, + "learning_rate": 8.223300970873787e-06, + "loss": 0.0991, + "step": 2572 + }, + { + "epoch": 1.7824731555247662, + "grad_norm": 1.0050466060638428, + "learning_rate": 8.222607489597782e-06, + "loss": 0.1143, + "step": 2573 + }, + { + "epoch": 1.7831659161759612, + "grad_norm": 0.8606399297714233, + "learning_rate": 8.221914008321777e-06, + "loss": 0.0982, + "step": 2574 + }, + { + "epoch": 1.7838586768271563, + "grad_norm": 0.7917369604110718, + "learning_rate": 8.22122052704577e-06, + "loss": 0.1021, + "step": 2575 + }, + { + "epoch": 1.7845514374783513, + "grad_norm": 0.8177589178085327, + "learning_rate": 8.220527045769765e-06, + "loss": 0.1045, + "step": 2576 + }, + { + "epoch": 1.7852441981295462, + "grad_norm": 0.7824551463127136, + "learning_rate": 8.219833564493758e-06, + "loss": 0.0909, + "step": 2577 + }, + { + "epoch": 1.7859369587807412, + "grad_norm": 0.776489794254303, + "learning_rate": 8.219140083217753e-06, + "loss": 0.0915, + "step": 2578 + }, + { + "epoch": 1.7866297194319363, + "grad_norm": 0.9233396649360657, + "learning_rate": 8.218446601941748e-06, + "loss": 0.0987, + "step": 2579 + }, + { + "epoch": 1.7873224800831313, + "grad_norm": 0.7486032247543335, + "learning_rate": 8.217753120665743e-06, + "loss": 0.0673, + "step": 2580 + }, + { + "epoch": 1.7880152407343264, + "grad_norm": 0.9192646741867065, + "learning_rate": 8.217059639389738e-06, + "loss": 0.1029, + "step": 2581 + }, + { + "epoch": 1.7887080013855212, + "grad_norm": 0.9108365774154663, + "learning_rate": 8.216366158113731e-06, + "loss": 0.0936, + "step": 2582 + }, + { + "epoch": 1.7894007620367163, + "grad_norm": 0.8713735342025757, + "learning_rate": 8.215672676837726e-06, + "loss": 0.0923, + "step": 2583 + }, + { + "epoch": 1.7900935226879113, + "grad_norm": 0.8167701363563538, + "learning_rate": 8.214979195561721e-06, + "loss": 0.0987, + "step": 2584 + }, + { + "epoch": 1.7907862833391064, + "grad_norm": 0.8005883693695068, + "learning_rate": 8.214285714285714e-06, + "loss": 0.0821, + "step": 2585 + }, + { + "epoch": 1.7914790439903014, + "grad_norm": 0.8728950023651123, + "learning_rate": 8.21359223300971e-06, + "loss": 0.0888, + "step": 2586 + }, + { + "epoch": 1.7921718046414963, + "grad_norm": 0.9217014908790588, + "learning_rate": 8.212898751733704e-06, + "loss": 0.092, + "step": 2587 + }, + { + "epoch": 1.7928645652926913, + "grad_norm": 0.8786712288856506, + "learning_rate": 8.212205270457697e-06, + "loss": 0.0922, + "step": 2588 + }, + { + "epoch": 1.7935573259438864, + "grad_norm": 0.8297293782234192, + "learning_rate": 8.211511789181692e-06, + "loss": 0.087, + "step": 2589 + }, + { + "epoch": 1.7942500865950815, + "grad_norm": 0.8937293291091919, + "learning_rate": 8.210818307905687e-06, + "loss": 0.1082, + "step": 2590 + }, + { + "epoch": 1.7949428472462765, + "grad_norm": 0.9118964672088623, + "learning_rate": 8.210124826629682e-06, + "loss": 0.1229, + "step": 2591 + }, + { + "epoch": 1.7956356078974713, + "grad_norm": 0.8846879005432129, + "learning_rate": 8.209431345353675e-06, + "loss": 0.1107, + "step": 2592 + }, + { + "epoch": 1.7963283685486664, + "grad_norm": 0.9043500423431396, + "learning_rate": 8.20873786407767e-06, + "loss": 0.1271, + "step": 2593 + }, + { + "epoch": 1.7970211291998615, + "grad_norm": 0.9226686954498291, + "learning_rate": 8.208044382801665e-06, + "loss": 0.1101, + "step": 2594 + }, + { + "epoch": 1.7977138898510565, + "grad_norm": 0.9043811559677124, + "learning_rate": 8.207350901525659e-06, + "loss": 0.0766, + "step": 2595 + }, + { + "epoch": 1.7984066505022516, + "grad_norm": 0.8195856213569641, + "learning_rate": 8.206657420249653e-06, + "loss": 0.0896, + "step": 2596 + }, + { + "epoch": 1.7990994111534464, + "grad_norm": 0.8749009370803833, + "learning_rate": 8.205963938973648e-06, + "loss": 0.1003, + "step": 2597 + }, + { + "epoch": 1.7997921718046415, + "grad_norm": 0.9396862387657166, + "learning_rate": 8.205270457697643e-06, + "loss": 0.1147, + "step": 2598 + }, + { + "epoch": 1.8004849324558365, + "grad_norm": 0.8920246362686157, + "learning_rate": 8.204576976421638e-06, + "loss": 0.1166, + "step": 2599 + }, + { + "epoch": 1.8011776931070316, + "grad_norm": 0.9035167694091797, + "learning_rate": 8.203883495145632e-06, + "loss": 0.08, + "step": 2600 + }, + { + "epoch": 1.8018704537582266, + "grad_norm": 0.7884556651115417, + "learning_rate": 8.203190013869626e-06, + "loss": 0.0887, + "step": 2601 + }, + { + "epoch": 1.8025632144094215, + "grad_norm": 0.9172216653823853, + "learning_rate": 8.20249653259362e-06, + "loss": 0.0902, + "step": 2602 + }, + { + "epoch": 1.8032559750606165, + "grad_norm": 0.7486192584037781, + "learning_rate": 8.201803051317615e-06, + "loss": 0.0867, + "step": 2603 + }, + { + "epoch": 1.8039487357118116, + "grad_norm": 0.9297676086425781, + "learning_rate": 8.20110957004161e-06, + "loss": 0.0926, + "step": 2604 + }, + { + "epoch": 1.8046414963630066, + "grad_norm": 0.9983718395233154, + "learning_rate": 8.200416088765604e-06, + "loss": 0.0912, + "step": 2605 + }, + { + "epoch": 1.8053342570142017, + "grad_norm": 0.7971341609954834, + "learning_rate": 8.1997226074896e-06, + "loss": 0.0905, + "step": 2606 + }, + { + "epoch": 1.8060270176653965, + "grad_norm": 0.7908374071121216, + "learning_rate": 8.199029126213593e-06, + "loss": 0.0728, + "step": 2607 + }, + { + "epoch": 1.8067197783165916, + "grad_norm": 0.8686855435371399, + "learning_rate": 8.198335644937588e-06, + "loss": 0.0931, + "step": 2608 + }, + { + "epoch": 1.8074125389677866, + "grad_norm": 1.0262415409088135, + "learning_rate": 8.197642163661583e-06, + "loss": 0.107, + "step": 2609 + }, + { + "epoch": 1.8081052996189815, + "grad_norm": 0.8252395987510681, + "learning_rate": 8.196948682385576e-06, + "loss": 0.0926, + "step": 2610 + }, + { + "epoch": 1.8087980602701768, + "grad_norm": 0.8116162419319153, + "learning_rate": 8.19625520110957e-06, + "loss": 0.0894, + "step": 2611 + }, + { + "epoch": 1.8094908209213716, + "grad_norm": 0.8530290722846985, + "learning_rate": 8.195561719833564e-06, + "loss": 0.096, + "step": 2612 + }, + { + "epoch": 1.8101835815725666, + "grad_norm": 0.8475037813186646, + "learning_rate": 8.194868238557559e-06, + "loss": 0.0859, + "step": 2613 + }, + { + "epoch": 1.8108763422237617, + "grad_norm": 0.8325061202049255, + "learning_rate": 8.194174757281554e-06, + "loss": 0.0987, + "step": 2614 + }, + { + "epoch": 1.8115691028749565, + "grad_norm": 0.9618691205978394, + "learning_rate": 8.193481276005549e-06, + "loss": 0.126, + "step": 2615 + }, + { + "epoch": 1.8122618635261518, + "grad_norm": 0.838097333908081, + "learning_rate": 8.192787794729544e-06, + "loss": 0.0698, + "step": 2616 + }, + { + "epoch": 1.8129546241773467, + "grad_norm": 0.8407849669456482, + "learning_rate": 8.192094313453537e-06, + "loss": 0.0812, + "step": 2617 + }, + { + "epoch": 1.8136473848285417, + "grad_norm": 0.8925338387489319, + "learning_rate": 8.191400832177532e-06, + "loss": 0.0942, + "step": 2618 + }, + { + "epoch": 1.8143401454797368, + "grad_norm": 0.8784374594688416, + "learning_rate": 8.190707350901527e-06, + "loss": 0.0964, + "step": 2619 + }, + { + "epoch": 1.8150329061309316, + "grad_norm": 0.9722921848297119, + "learning_rate": 8.19001386962552e-06, + "loss": 0.0971, + "step": 2620 + }, + { + "epoch": 1.8157256667821269, + "grad_norm": 0.9016208648681641, + "learning_rate": 8.189320388349515e-06, + "loss": 0.1139, + "step": 2621 + }, + { + "epoch": 1.8164184274333217, + "grad_norm": 0.8219439387321472, + "learning_rate": 8.18862690707351e-06, + "loss": 0.0819, + "step": 2622 + }, + { + "epoch": 1.8171111880845168, + "grad_norm": 0.903080940246582, + "learning_rate": 8.187933425797505e-06, + "loss": 0.0943, + "step": 2623 + }, + { + "epoch": 1.8178039487357118, + "grad_norm": 0.804472029209137, + "learning_rate": 8.1872399445215e-06, + "loss": 0.0887, + "step": 2624 + }, + { + "epoch": 1.8184967093869067, + "grad_norm": 0.8361225128173828, + "learning_rate": 8.186546463245493e-06, + "loss": 0.0814, + "step": 2625 + }, + { + "epoch": 1.819189470038102, + "grad_norm": 0.9732152819633484, + "learning_rate": 8.185852981969488e-06, + "loss": 0.0811, + "step": 2626 + }, + { + "epoch": 1.8198822306892968, + "grad_norm": 0.8066356182098389, + "learning_rate": 8.185159500693481e-06, + "loss": 0.0863, + "step": 2627 + }, + { + "epoch": 1.8205749913404918, + "grad_norm": 0.8024638891220093, + "learning_rate": 8.184466019417476e-06, + "loss": 0.099, + "step": 2628 + }, + { + "epoch": 1.821267751991687, + "grad_norm": 1.0059717893600464, + "learning_rate": 8.183772538141471e-06, + "loss": 0.1005, + "step": 2629 + }, + { + "epoch": 1.8219605126428817, + "grad_norm": 0.8454689383506775, + "learning_rate": 8.183079056865464e-06, + "loss": 0.0842, + "step": 2630 + }, + { + "epoch": 1.822653273294077, + "grad_norm": 1.0026649236679077, + "learning_rate": 8.18238557558946e-06, + "loss": 0.095, + "step": 2631 + }, + { + "epoch": 1.8233460339452718, + "grad_norm": 0.8365281820297241, + "learning_rate": 8.181692094313454e-06, + "loss": 0.0915, + "step": 2632 + }, + { + "epoch": 1.824038794596467, + "grad_norm": 0.9412815570831299, + "learning_rate": 8.180998613037449e-06, + "loss": 0.0919, + "step": 2633 + }, + { + "epoch": 1.824731555247662, + "grad_norm": 0.7574265003204346, + "learning_rate": 8.180305131761444e-06, + "loss": 0.0759, + "step": 2634 + }, + { + "epoch": 1.8254243158988568, + "grad_norm": 0.9514365792274475, + "learning_rate": 8.179611650485437e-06, + "loss": 0.104, + "step": 2635 + }, + { + "epoch": 1.826117076550052, + "grad_norm": 0.9478408694267273, + "learning_rate": 8.178918169209432e-06, + "loss": 0.1164, + "step": 2636 + }, + { + "epoch": 1.826809837201247, + "grad_norm": 0.8486576676368713, + "learning_rate": 8.178224687933425e-06, + "loss": 0.0999, + "step": 2637 + }, + { + "epoch": 1.827502597852442, + "grad_norm": 0.8942376375198364, + "learning_rate": 8.17753120665742e-06, + "loss": 0.0972, + "step": 2638 + }, + { + "epoch": 1.828195358503637, + "grad_norm": 0.9121793508529663, + "learning_rate": 8.176837725381415e-06, + "loss": 0.12, + "step": 2639 + }, + { + "epoch": 1.8288881191548318, + "grad_norm": 0.8649296760559082, + "learning_rate": 8.17614424410541e-06, + "loss": 0.0963, + "step": 2640 + }, + { + "epoch": 1.8295808798060271, + "grad_norm": 0.9005740880966187, + "learning_rate": 8.175450762829405e-06, + "loss": 0.1168, + "step": 2641 + }, + { + "epoch": 1.830273640457222, + "grad_norm": 1.1905404329299927, + "learning_rate": 8.174757281553398e-06, + "loss": 0.111, + "step": 2642 + }, + { + "epoch": 1.830966401108417, + "grad_norm": 0.8033245801925659, + "learning_rate": 8.174063800277393e-06, + "loss": 0.0846, + "step": 2643 + }, + { + "epoch": 1.831659161759612, + "grad_norm": 0.8120441436767578, + "learning_rate": 8.173370319001388e-06, + "loss": 0.0788, + "step": 2644 + }, + { + "epoch": 1.832351922410807, + "grad_norm": 0.9285821318626404, + "learning_rate": 8.172676837725381e-06, + "loss": 0.0834, + "step": 2645 + }, + { + "epoch": 1.8330446830620022, + "grad_norm": 0.9043736457824707, + "learning_rate": 8.171983356449376e-06, + "loss": 0.1068, + "step": 2646 + }, + { + "epoch": 1.833737443713197, + "grad_norm": 0.9138674139976501, + "learning_rate": 8.17128987517337e-06, + "loss": 0.1053, + "step": 2647 + }, + { + "epoch": 1.834430204364392, + "grad_norm": 0.8692159056663513, + "learning_rate": 8.170596393897365e-06, + "loss": 0.1023, + "step": 2648 + }, + { + "epoch": 1.8351229650155871, + "grad_norm": 0.9652042388916016, + "learning_rate": 8.16990291262136e-06, + "loss": 0.0959, + "step": 2649 + }, + { + "epoch": 1.835815725666782, + "grad_norm": 0.8220810890197754, + "learning_rate": 8.169209431345354e-06, + "loss": 0.0821, + "step": 2650 + }, + { + "epoch": 1.8365084863179773, + "grad_norm": 0.9395898580551147, + "learning_rate": 8.16851595006935e-06, + "loss": 0.1217, + "step": 2651 + }, + { + "epoch": 1.837201246969172, + "grad_norm": 0.7368136048316956, + "learning_rate": 8.167822468793343e-06, + "loss": 0.0816, + "step": 2652 + }, + { + "epoch": 1.8378940076203671, + "grad_norm": 0.8244364857673645, + "learning_rate": 8.167128987517338e-06, + "loss": 0.0854, + "step": 2653 + }, + { + "epoch": 1.8385867682715622, + "grad_norm": 0.931873083114624, + "learning_rate": 8.166435506241333e-06, + "loss": 0.12, + "step": 2654 + }, + { + "epoch": 1.839279528922757, + "grad_norm": 0.7082671523094177, + "learning_rate": 8.165742024965326e-06, + "loss": 0.0607, + "step": 2655 + }, + { + "epoch": 1.8399722895739523, + "grad_norm": 0.7946502566337585, + "learning_rate": 8.16504854368932e-06, + "loss": 0.0846, + "step": 2656 + }, + { + "epoch": 1.8406650502251471, + "grad_norm": 0.8827128410339355, + "learning_rate": 8.164355062413316e-06, + "loss": 0.1049, + "step": 2657 + }, + { + "epoch": 1.8413578108763422, + "grad_norm": 0.7891724705696106, + "learning_rate": 8.16366158113731e-06, + "loss": 0.0894, + "step": 2658 + }, + { + "epoch": 1.8420505715275373, + "grad_norm": 0.7655206322669983, + "learning_rate": 8.162968099861305e-06, + "loss": 0.0945, + "step": 2659 + }, + { + "epoch": 1.842743332178732, + "grad_norm": 0.7659879326820374, + "learning_rate": 8.162274618585299e-06, + "loss": 0.0864, + "step": 2660 + }, + { + "epoch": 1.8434360928299274, + "grad_norm": 0.9626227617263794, + "learning_rate": 8.161581137309294e-06, + "loss": 0.1046, + "step": 2661 + }, + { + "epoch": 1.8441288534811222, + "grad_norm": 0.8859971761703491, + "learning_rate": 8.160887656033287e-06, + "loss": 0.1298, + "step": 2662 + }, + { + "epoch": 1.8448216141323173, + "grad_norm": 0.7885403037071228, + "learning_rate": 8.160194174757282e-06, + "loss": 0.0879, + "step": 2663 + }, + { + "epoch": 1.8455143747835123, + "grad_norm": 0.9495970606803894, + "learning_rate": 8.159500693481277e-06, + "loss": 0.11, + "step": 2664 + }, + { + "epoch": 1.8462071354347072, + "grad_norm": 0.7970353364944458, + "learning_rate": 8.15880721220527e-06, + "loss": 0.0833, + "step": 2665 + }, + { + "epoch": 1.8468998960859024, + "grad_norm": 0.796225368976593, + "learning_rate": 8.158113730929265e-06, + "loss": 0.083, + "step": 2666 + }, + { + "epoch": 1.8475926567370973, + "grad_norm": 0.9057906270027161, + "learning_rate": 8.15742024965326e-06, + "loss": 0.1051, + "step": 2667 + }, + { + "epoch": 1.8482854173882923, + "grad_norm": 0.8341637849807739, + "learning_rate": 8.156726768377255e-06, + "loss": 0.1086, + "step": 2668 + }, + { + "epoch": 1.8489781780394874, + "grad_norm": 0.7643277049064636, + "learning_rate": 8.15603328710125e-06, + "loss": 0.0747, + "step": 2669 + }, + { + "epoch": 1.8496709386906822, + "grad_norm": 0.7797998189926147, + "learning_rate": 8.155339805825243e-06, + "loss": 0.0743, + "step": 2670 + }, + { + "epoch": 1.8503636993418775, + "grad_norm": 0.896926999092102, + "learning_rate": 8.154646324549238e-06, + "loss": 0.097, + "step": 2671 + }, + { + "epoch": 1.8510564599930723, + "grad_norm": 0.7922436594963074, + "learning_rate": 8.153952843273231e-06, + "loss": 0.0913, + "step": 2672 + }, + { + "epoch": 1.8517492206442674, + "grad_norm": 0.8700946569442749, + "learning_rate": 8.153259361997226e-06, + "loss": 0.0848, + "step": 2673 + }, + { + "epoch": 1.8524419812954624, + "grad_norm": 0.953936755657196, + "learning_rate": 8.152565880721221e-06, + "loss": 0.1209, + "step": 2674 + }, + { + "epoch": 1.8531347419466573, + "grad_norm": 0.8067466020584106, + "learning_rate": 8.151872399445216e-06, + "loss": 0.1108, + "step": 2675 + }, + { + "epoch": 1.8538275025978526, + "grad_norm": 0.8336835503578186, + "learning_rate": 8.151178918169211e-06, + "loss": 0.1026, + "step": 2676 + }, + { + "epoch": 1.8545202632490474, + "grad_norm": 0.8462893962860107, + "learning_rate": 8.150485436893204e-06, + "loss": 0.0775, + "step": 2677 + }, + { + "epoch": 1.8552130239002425, + "grad_norm": 0.8883715867996216, + "learning_rate": 8.149791955617199e-06, + "loss": 0.1208, + "step": 2678 + }, + { + "epoch": 1.8559057845514375, + "grad_norm": 0.7880957722663879, + "learning_rate": 8.149098474341194e-06, + "loss": 0.0932, + "step": 2679 + }, + { + "epoch": 1.8565985452026323, + "grad_norm": 0.6992374062538147, + "learning_rate": 8.148404993065187e-06, + "loss": 0.0695, + "step": 2680 + }, + { + "epoch": 1.8572913058538276, + "grad_norm": 0.8892483115196228, + "learning_rate": 8.147711511789182e-06, + "loss": 0.1047, + "step": 2681 + }, + { + "epoch": 1.8579840665050225, + "grad_norm": 0.7660226225852966, + "learning_rate": 8.147018030513177e-06, + "loss": 0.0902, + "step": 2682 + }, + { + "epoch": 1.8586768271562175, + "grad_norm": 0.929363489151001, + "learning_rate": 8.146324549237172e-06, + "loss": 0.1113, + "step": 2683 + }, + { + "epoch": 1.8593695878074126, + "grad_norm": 0.9377325177192688, + "learning_rate": 8.145631067961165e-06, + "loss": 0.0966, + "step": 2684 + }, + { + "epoch": 1.8600623484586074, + "grad_norm": 0.8253400325775146, + "learning_rate": 8.14493758668516e-06, + "loss": 0.0892, + "step": 2685 + }, + { + "epoch": 1.8607551091098027, + "grad_norm": 0.816107451915741, + "learning_rate": 8.144244105409155e-06, + "loss": 0.0805, + "step": 2686 + }, + { + "epoch": 1.8614478697609975, + "grad_norm": 0.8043034672737122, + "learning_rate": 8.143550624133148e-06, + "loss": 0.0924, + "step": 2687 + }, + { + "epoch": 1.8621406304121926, + "grad_norm": 0.8767082691192627, + "learning_rate": 8.142857142857143e-06, + "loss": 0.1195, + "step": 2688 + }, + { + "epoch": 1.8628333910633876, + "grad_norm": 0.8499141931533813, + "learning_rate": 8.142163661581138e-06, + "loss": 0.1162, + "step": 2689 + }, + { + "epoch": 1.8635261517145825, + "grad_norm": 0.8079686164855957, + "learning_rate": 8.141470180305131e-06, + "loss": 0.0958, + "step": 2690 + }, + { + "epoch": 1.8642189123657777, + "grad_norm": 0.897999107837677, + "learning_rate": 8.140776699029126e-06, + "loss": 0.1013, + "step": 2691 + }, + { + "epoch": 1.8649116730169726, + "grad_norm": 0.8873015642166138, + "learning_rate": 8.140083217753121e-06, + "loss": 0.0851, + "step": 2692 + }, + { + "epoch": 1.8656044336681676, + "grad_norm": 0.8154669404029846, + "learning_rate": 8.139389736477116e-06, + "loss": 0.0878, + "step": 2693 + }, + { + "epoch": 1.8662971943193627, + "grad_norm": 0.8138689994812012, + "learning_rate": 8.138696255201111e-06, + "loss": 0.0907, + "step": 2694 + }, + { + "epoch": 1.8669899549705575, + "grad_norm": 0.9204639792442322, + "learning_rate": 8.138002773925104e-06, + "loss": 0.1067, + "step": 2695 + }, + { + "epoch": 1.8676827156217528, + "grad_norm": 0.9914817214012146, + "learning_rate": 8.1373092926491e-06, + "loss": 0.0782, + "step": 2696 + }, + { + "epoch": 1.8683754762729476, + "grad_norm": 0.89031982421875, + "learning_rate": 8.136615811373093e-06, + "loss": 0.111, + "step": 2697 + }, + { + "epoch": 1.8690682369241427, + "grad_norm": 0.8497005105018616, + "learning_rate": 8.135922330097088e-06, + "loss": 0.0887, + "step": 2698 + }, + { + "epoch": 1.8697609975753378, + "grad_norm": 0.8609863519668579, + "learning_rate": 8.135228848821082e-06, + "loss": 0.0899, + "step": 2699 + }, + { + "epoch": 1.8704537582265326, + "grad_norm": 0.8168871402740479, + "learning_rate": 8.134535367545077e-06, + "loss": 0.0957, + "step": 2700 + }, + { + "epoch": 1.8711465188777279, + "grad_norm": 0.878494143486023, + "learning_rate": 8.133841886269072e-06, + "loss": 0.0834, + "step": 2701 + }, + { + "epoch": 1.8718392795289227, + "grad_norm": 0.9112176299095154, + "learning_rate": 8.133148404993066e-06, + "loss": 0.1281, + "step": 2702 + }, + { + "epoch": 1.8725320401801178, + "grad_norm": 0.8315407633781433, + "learning_rate": 8.13245492371706e-06, + "loss": 0.0787, + "step": 2703 + }, + { + "epoch": 1.8732248008313128, + "grad_norm": 0.6737753748893738, + "learning_rate": 8.131761442441055e-06, + "loss": 0.0684, + "step": 2704 + }, + { + "epoch": 1.8739175614825077, + "grad_norm": 0.9361903667449951, + "learning_rate": 8.131067961165049e-06, + "loss": 0.1022, + "step": 2705 + }, + { + "epoch": 1.874610322133703, + "grad_norm": 0.8183360695838928, + "learning_rate": 8.130374479889044e-06, + "loss": 0.082, + "step": 2706 + }, + { + "epoch": 1.8753030827848978, + "grad_norm": 0.8900784850120544, + "learning_rate": 8.129680998613037e-06, + "loss": 0.118, + "step": 2707 + }, + { + "epoch": 1.8759958434360928, + "grad_norm": 0.8236899375915527, + "learning_rate": 8.128987517337032e-06, + "loss": 0.0755, + "step": 2708 + }, + { + "epoch": 1.8766886040872879, + "grad_norm": 0.8137473464012146, + "learning_rate": 8.128294036061027e-06, + "loss": 0.1, + "step": 2709 + }, + { + "epoch": 1.8773813647384827, + "grad_norm": 0.8047782778739929, + "learning_rate": 8.127600554785022e-06, + "loss": 0.076, + "step": 2710 + }, + { + "epoch": 1.878074125389678, + "grad_norm": 0.8028963804244995, + "learning_rate": 8.126907073509017e-06, + "loss": 0.0753, + "step": 2711 + }, + { + "epoch": 1.8787668860408728, + "grad_norm": 0.8761164546012878, + "learning_rate": 8.12621359223301e-06, + "loss": 0.092, + "step": 2712 + }, + { + "epoch": 1.8794596466920679, + "grad_norm": 0.9060432314872742, + "learning_rate": 8.125520110957005e-06, + "loss": 0.101, + "step": 2713 + }, + { + "epoch": 1.880152407343263, + "grad_norm": 0.8995921611785889, + "learning_rate": 8.124826629681e-06, + "loss": 0.1149, + "step": 2714 + }, + { + "epoch": 1.8808451679944578, + "grad_norm": 0.7580709457397461, + "learning_rate": 8.124133148404993e-06, + "loss": 0.1024, + "step": 2715 + }, + { + "epoch": 1.881537928645653, + "grad_norm": 0.8204407095909119, + "learning_rate": 8.123439667128988e-06, + "loss": 0.0936, + "step": 2716 + }, + { + "epoch": 1.882230689296848, + "grad_norm": 0.7477433681488037, + "learning_rate": 8.122746185852983e-06, + "loss": 0.0801, + "step": 2717 + }, + { + "epoch": 1.882923449948043, + "grad_norm": 0.7480289936065674, + "learning_rate": 8.122052704576978e-06, + "loss": 0.079, + "step": 2718 + }, + { + "epoch": 1.883616210599238, + "grad_norm": 0.8686448931694031, + "learning_rate": 8.121359223300973e-06, + "loss": 0.0977, + "step": 2719 + }, + { + "epoch": 1.8843089712504328, + "grad_norm": 0.8237332701683044, + "learning_rate": 8.120665742024966e-06, + "loss": 0.0895, + "step": 2720 + }, + { + "epoch": 1.8850017319016281, + "grad_norm": 0.9018293619155884, + "learning_rate": 8.119972260748961e-06, + "loss": 0.1073, + "step": 2721 + }, + { + "epoch": 1.885694492552823, + "grad_norm": 0.8091353178024292, + "learning_rate": 8.119278779472954e-06, + "loss": 0.0947, + "step": 2722 + }, + { + "epoch": 1.886387253204018, + "grad_norm": 0.7719805836677551, + "learning_rate": 8.118585298196949e-06, + "loss": 0.0858, + "step": 2723 + }, + { + "epoch": 1.887080013855213, + "grad_norm": 0.8000248670578003, + "learning_rate": 8.117891816920944e-06, + "loss": 0.0843, + "step": 2724 + }, + { + "epoch": 1.887772774506408, + "grad_norm": 0.737173855304718, + "learning_rate": 8.117198335644937e-06, + "loss": 0.0811, + "step": 2725 + }, + { + "epoch": 1.8884655351576032, + "grad_norm": 0.9293944239616394, + "learning_rate": 8.116504854368932e-06, + "loss": 0.0915, + "step": 2726 + }, + { + "epoch": 1.889158295808798, + "grad_norm": 0.8899128437042236, + "learning_rate": 8.115811373092927e-06, + "loss": 0.1179, + "step": 2727 + }, + { + "epoch": 1.889851056459993, + "grad_norm": 1.142066478729248, + "learning_rate": 8.115117891816922e-06, + "loss": 0.1177, + "step": 2728 + }, + { + "epoch": 1.8905438171111881, + "grad_norm": 0.7394709587097168, + "learning_rate": 8.114424410540917e-06, + "loss": 0.0812, + "step": 2729 + }, + { + "epoch": 1.891236577762383, + "grad_norm": 0.7760245203971863, + "learning_rate": 8.11373092926491e-06, + "loss": 0.0899, + "step": 2730 + }, + { + "epoch": 1.8919293384135782, + "grad_norm": 0.7233203053474426, + "learning_rate": 8.113037447988905e-06, + "loss": 0.0859, + "step": 2731 + }, + { + "epoch": 1.892622099064773, + "grad_norm": 0.8594115376472473, + "learning_rate": 8.112343966712898e-06, + "loss": 0.0965, + "step": 2732 + }, + { + "epoch": 1.8933148597159681, + "grad_norm": 0.8868102431297302, + "learning_rate": 8.111650485436893e-06, + "loss": 0.0973, + "step": 2733 + }, + { + "epoch": 1.8940076203671632, + "grad_norm": 0.9329622387886047, + "learning_rate": 8.110957004160888e-06, + "loss": 0.1014, + "step": 2734 + }, + { + "epoch": 1.894700381018358, + "grad_norm": 0.8387184143066406, + "learning_rate": 8.110263522884883e-06, + "loss": 0.0808, + "step": 2735 + }, + { + "epoch": 1.8953931416695533, + "grad_norm": 0.7633192539215088, + "learning_rate": 8.109570041608878e-06, + "loss": 0.0951, + "step": 2736 + }, + { + "epoch": 1.8960859023207481, + "grad_norm": 0.8906262516975403, + "learning_rate": 8.108876560332871e-06, + "loss": 0.0958, + "step": 2737 + }, + { + "epoch": 1.8967786629719432, + "grad_norm": 0.8757254481315613, + "learning_rate": 8.108183079056866e-06, + "loss": 0.0963, + "step": 2738 + }, + { + "epoch": 1.8974714236231383, + "grad_norm": 0.9309480786323547, + "learning_rate": 8.107489597780861e-06, + "loss": 0.1078, + "step": 2739 + }, + { + "epoch": 1.898164184274333, + "grad_norm": 0.8245842456817627, + "learning_rate": 8.106796116504854e-06, + "loss": 0.1053, + "step": 2740 + }, + { + "epoch": 1.8988569449255284, + "grad_norm": 0.657714307308197, + "learning_rate": 8.10610263522885e-06, + "loss": 0.062, + "step": 2741 + }, + { + "epoch": 1.8995497055767232, + "grad_norm": 0.8045280575752258, + "learning_rate": 8.105409153952843e-06, + "loss": 0.0874, + "step": 2742 + }, + { + "epoch": 1.9002424662279183, + "grad_norm": 0.728067934513092, + "learning_rate": 8.104715672676838e-06, + "loss": 0.0624, + "step": 2743 + }, + { + "epoch": 1.9009352268791133, + "grad_norm": 0.9533117413520813, + "learning_rate": 8.104022191400832e-06, + "loss": 0.11, + "step": 2744 + }, + { + "epoch": 1.9016279875303082, + "grad_norm": 0.7989661693572998, + "learning_rate": 8.103328710124827e-06, + "loss": 0.0838, + "step": 2745 + }, + { + "epoch": 1.9023207481815034, + "grad_norm": 0.8091473579406738, + "learning_rate": 8.102635228848822e-06, + "loss": 0.0826, + "step": 2746 + }, + { + "epoch": 1.9030135088326983, + "grad_norm": 0.9147849082946777, + "learning_rate": 8.101941747572816e-06, + "loss": 0.1009, + "step": 2747 + }, + { + "epoch": 1.9037062694838933, + "grad_norm": 0.8592344522476196, + "learning_rate": 8.10124826629681e-06, + "loss": 0.0824, + "step": 2748 + }, + { + "epoch": 1.9043990301350884, + "grad_norm": 0.8707457780838013, + "learning_rate": 8.100554785020805e-06, + "loss": 0.1079, + "step": 2749 + }, + { + "epoch": 1.9050917907862832, + "grad_norm": 0.8529501557350159, + "learning_rate": 8.099861303744799e-06, + "loss": 0.0983, + "step": 2750 + }, + { + "epoch": 1.9057845514374785, + "grad_norm": 0.842431366443634, + "learning_rate": 8.099167822468794e-06, + "loss": 0.1047, + "step": 2751 + }, + { + "epoch": 1.9064773120886733, + "grad_norm": 0.8026554584503174, + "learning_rate": 8.098474341192789e-06, + "loss": 0.0753, + "step": 2752 + }, + { + "epoch": 1.9071700727398684, + "grad_norm": 0.8910338878631592, + "learning_rate": 8.097780859916783e-06, + "loss": 0.1004, + "step": 2753 + }, + { + "epoch": 1.9078628333910634, + "grad_norm": 0.6836819648742676, + "learning_rate": 8.097087378640778e-06, + "loss": 0.0693, + "step": 2754 + }, + { + "epoch": 1.9085555940422583, + "grad_norm": 0.7922234535217285, + "learning_rate": 8.096393897364772e-06, + "loss": 0.0893, + "step": 2755 + }, + { + "epoch": 1.9092483546934536, + "grad_norm": 0.799467921257019, + "learning_rate": 8.095700416088767e-06, + "loss": 0.0862, + "step": 2756 + }, + { + "epoch": 1.9099411153446484, + "grad_norm": 0.9862185120582581, + "learning_rate": 8.09500693481276e-06, + "loss": 0.0984, + "step": 2757 + }, + { + "epoch": 1.9106338759958434, + "grad_norm": 1.1314369440078735, + "learning_rate": 8.094313453536755e-06, + "loss": 0.1002, + "step": 2758 + }, + { + "epoch": 1.9113266366470385, + "grad_norm": 0.8351694941520691, + "learning_rate": 8.09361997226075e-06, + "loss": 0.1064, + "step": 2759 + }, + { + "epoch": 1.9120193972982333, + "grad_norm": 0.9159316420555115, + "learning_rate": 8.092926490984745e-06, + "loss": 0.0971, + "step": 2760 + }, + { + "epoch": 1.9127121579494286, + "grad_norm": 0.8018559813499451, + "learning_rate": 8.092233009708738e-06, + "loss": 0.0847, + "step": 2761 + }, + { + "epoch": 1.9134049186006234, + "grad_norm": 0.9117119312286377, + "learning_rate": 8.091539528432733e-06, + "loss": 0.1019, + "step": 2762 + }, + { + "epoch": 1.9140976792518185, + "grad_norm": 0.7862089276313782, + "learning_rate": 8.090846047156728e-06, + "loss": 0.0838, + "step": 2763 + }, + { + "epoch": 1.9147904399030136, + "grad_norm": 0.8904909491539001, + "learning_rate": 8.090152565880723e-06, + "loss": 0.1102, + "step": 2764 + }, + { + "epoch": 1.9154832005542084, + "grad_norm": 0.7712727189064026, + "learning_rate": 8.089459084604716e-06, + "loss": 0.0716, + "step": 2765 + }, + { + "epoch": 1.9161759612054037, + "grad_norm": 0.7882879376411438, + "learning_rate": 8.08876560332871e-06, + "loss": 0.092, + "step": 2766 + }, + { + "epoch": 1.9168687218565985, + "grad_norm": 0.7251750230789185, + "learning_rate": 8.088072122052704e-06, + "loss": 0.0823, + "step": 2767 + }, + { + "epoch": 1.9175614825077936, + "grad_norm": 0.8687427639961243, + "learning_rate": 8.087378640776699e-06, + "loss": 0.09, + "step": 2768 + }, + { + "epoch": 1.9182542431589886, + "grad_norm": 0.7910054326057434, + "learning_rate": 8.086685159500694e-06, + "loss": 0.0961, + "step": 2769 + }, + { + "epoch": 1.9189470038101835, + "grad_norm": 0.7606614232063293, + "learning_rate": 8.085991678224689e-06, + "loss": 0.0841, + "step": 2770 + }, + { + "epoch": 1.9196397644613787, + "grad_norm": 0.9103227257728577, + "learning_rate": 8.085298196948684e-06, + "loss": 0.0889, + "step": 2771 + }, + { + "epoch": 1.9203325251125736, + "grad_norm": 0.8335845470428467, + "learning_rate": 8.084604715672677e-06, + "loss": 0.0883, + "step": 2772 + }, + { + "epoch": 1.9210252857637686, + "grad_norm": 0.8396230340003967, + "learning_rate": 8.083911234396672e-06, + "loss": 0.0909, + "step": 2773 + }, + { + "epoch": 1.9217180464149637, + "grad_norm": 0.9027969241142273, + "learning_rate": 8.083217753120667e-06, + "loss": 0.1057, + "step": 2774 + }, + { + "epoch": 1.9224108070661585, + "grad_norm": 0.8963081240653992, + "learning_rate": 8.08252427184466e-06, + "loss": 0.0933, + "step": 2775 + }, + { + "epoch": 1.9231035677173538, + "grad_norm": 0.8285951614379883, + "learning_rate": 8.081830790568655e-06, + "loss": 0.0748, + "step": 2776 + }, + { + "epoch": 1.9237963283685486, + "grad_norm": 0.959260880947113, + "learning_rate": 8.08113730929265e-06, + "loss": 0.0959, + "step": 2777 + }, + { + "epoch": 1.9244890890197437, + "grad_norm": 0.7964215278625488, + "learning_rate": 8.080443828016645e-06, + "loss": 0.0846, + "step": 2778 + }, + { + "epoch": 1.9251818496709387, + "grad_norm": 0.7677169442176819, + "learning_rate": 8.07975034674064e-06, + "loss": 0.0955, + "step": 2779 + }, + { + "epoch": 1.9258746103221336, + "grad_norm": 0.892074704170227, + "learning_rate": 8.079056865464633e-06, + "loss": 0.0905, + "step": 2780 + }, + { + "epoch": 1.9265673709733289, + "grad_norm": 0.884797215461731, + "learning_rate": 8.078363384188628e-06, + "loss": 0.1175, + "step": 2781 + }, + { + "epoch": 1.9272601316245237, + "grad_norm": 0.9015825986862183, + "learning_rate": 8.077669902912621e-06, + "loss": 0.1072, + "step": 2782 + }, + { + "epoch": 1.9279528922757188, + "grad_norm": 0.8209898471832275, + "learning_rate": 8.076976421636616e-06, + "loss": 0.0833, + "step": 2783 + }, + { + "epoch": 1.9286456529269138, + "grad_norm": 0.9417157173156738, + "learning_rate": 8.076282940360611e-06, + "loss": 0.0949, + "step": 2784 + }, + { + "epoch": 1.9293384135781086, + "grad_norm": 0.8368494510650635, + "learning_rate": 8.075589459084604e-06, + "loss": 0.1081, + "step": 2785 + }, + { + "epoch": 1.930031174229304, + "grad_norm": 0.9572821855545044, + "learning_rate": 8.0748959778086e-06, + "loss": 0.0917, + "step": 2786 + }, + { + "epoch": 1.9307239348804988, + "grad_norm": 0.8616113662719727, + "learning_rate": 8.074202496532594e-06, + "loss": 0.1063, + "step": 2787 + }, + { + "epoch": 1.9314166955316938, + "grad_norm": 0.8733940124511719, + "learning_rate": 8.07350901525659e-06, + "loss": 0.087, + "step": 2788 + }, + { + "epoch": 1.9321094561828889, + "grad_norm": 0.9132512807846069, + "learning_rate": 8.072815533980584e-06, + "loss": 0.0978, + "step": 2789 + }, + { + "epoch": 1.9328022168340837, + "grad_norm": 0.8157616853713989, + "learning_rate": 8.072122052704577e-06, + "loss": 0.0748, + "step": 2790 + }, + { + "epoch": 1.933494977485279, + "grad_norm": 0.7877839207649231, + "learning_rate": 8.071428571428572e-06, + "loss": 0.0913, + "step": 2791 + }, + { + "epoch": 1.9341877381364738, + "grad_norm": 0.8950355052947998, + "learning_rate": 8.070735090152566e-06, + "loss": 0.1013, + "step": 2792 + }, + { + "epoch": 1.9348804987876689, + "grad_norm": 0.8914209604263306, + "learning_rate": 8.07004160887656e-06, + "loss": 0.1147, + "step": 2793 + }, + { + "epoch": 1.935573259438864, + "grad_norm": 0.7586885690689087, + "learning_rate": 8.069348127600555e-06, + "loss": 0.075, + "step": 2794 + }, + { + "epoch": 1.9362660200900588, + "grad_norm": 0.9337842464447021, + "learning_rate": 8.06865464632455e-06, + "loss": 0.1157, + "step": 2795 + }, + { + "epoch": 1.936958780741254, + "grad_norm": 0.7821459174156189, + "learning_rate": 8.067961165048545e-06, + "loss": 0.0995, + "step": 2796 + }, + { + "epoch": 1.9376515413924489, + "grad_norm": 0.8888229131698608, + "learning_rate": 8.067267683772539e-06, + "loss": 0.1026, + "step": 2797 + }, + { + "epoch": 1.938344302043644, + "grad_norm": 0.7771438360214233, + "learning_rate": 8.066574202496533e-06, + "loss": 0.0836, + "step": 2798 + }, + { + "epoch": 1.939037062694839, + "grad_norm": 0.7105121612548828, + "learning_rate": 8.065880721220528e-06, + "loss": 0.0706, + "step": 2799 + }, + { + "epoch": 1.9397298233460338, + "grad_norm": 1.0043034553527832, + "learning_rate": 8.065187239944522e-06, + "loss": 0.119, + "step": 2800 + }, + { + "epoch": 1.9404225839972291, + "grad_norm": 0.8070681095123291, + "learning_rate": 8.064493758668517e-06, + "loss": 0.0992, + "step": 2801 + }, + { + "epoch": 1.941115344648424, + "grad_norm": 0.86124587059021, + "learning_rate": 8.06380027739251e-06, + "loss": 0.0968, + "step": 2802 + }, + { + "epoch": 1.941808105299619, + "grad_norm": 0.8489442467689514, + "learning_rate": 8.063106796116505e-06, + "loss": 0.1085, + "step": 2803 + }, + { + "epoch": 1.942500865950814, + "grad_norm": 0.8105494379997253, + "learning_rate": 8.0624133148405e-06, + "loss": 0.0775, + "step": 2804 + }, + { + "epoch": 1.943193626602009, + "grad_norm": 0.6798321604728699, + "learning_rate": 8.061719833564495e-06, + "loss": 0.0694, + "step": 2805 + }, + { + "epoch": 1.9438863872532042, + "grad_norm": 0.8851845264434814, + "learning_rate": 8.06102635228849e-06, + "loss": 0.1133, + "step": 2806 + }, + { + "epoch": 1.944579147904399, + "grad_norm": 0.8672387003898621, + "learning_rate": 8.060332871012483e-06, + "loss": 0.1009, + "step": 2807 + }, + { + "epoch": 1.945271908555594, + "grad_norm": 0.8221784234046936, + "learning_rate": 8.059639389736478e-06, + "loss": 0.0868, + "step": 2808 + }, + { + "epoch": 1.9459646692067891, + "grad_norm": 0.8535941243171692, + "learning_rate": 8.058945908460473e-06, + "loss": 0.0892, + "step": 2809 + }, + { + "epoch": 1.946657429857984, + "grad_norm": 0.8289077877998352, + "learning_rate": 8.058252427184466e-06, + "loss": 0.0948, + "step": 2810 + }, + { + "epoch": 1.9473501905091792, + "grad_norm": 0.794834315776825, + "learning_rate": 8.05755894590846e-06, + "loss": 0.0764, + "step": 2811 + }, + { + "epoch": 1.948042951160374, + "grad_norm": 0.797599732875824, + "learning_rate": 8.056865464632456e-06, + "loss": 0.0855, + "step": 2812 + }, + { + "epoch": 1.9487357118115691, + "grad_norm": 0.8061396479606628, + "learning_rate": 8.05617198335645e-06, + "loss": 0.0924, + "step": 2813 + }, + { + "epoch": 1.9494284724627642, + "grad_norm": 0.7848390936851501, + "learning_rate": 8.055478502080446e-06, + "loss": 0.0924, + "step": 2814 + }, + { + "epoch": 1.950121233113959, + "grad_norm": 0.982895016670227, + "learning_rate": 8.054785020804439e-06, + "loss": 0.1266, + "step": 2815 + }, + { + "epoch": 1.9508139937651543, + "grad_norm": 0.8684736490249634, + "learning_rate": 8.054091539528434e-06, + "loss": 0.1049, + "step": 2816 + }, + { + "epoch": 1.9515067544163491, + "grad_norm": 0.8869232535362244, + "learning_rate": 8.053398058252427e-06, + "loss": 0.1149, + "step": 2817 + }, + { + "epoch": 1.9521995150675442, + "grad_norm": 0.8731580376625061, + "learning_rate": 8.052704576976422e-06, + "loss": 0.1045, + "step": 2818 + }, + { + "epoch": 1.9528922757187392, + "grad_norm": 0.9235742688179016, + "learning_rate": 8.052011095700417e-06, + "loss": 0.1248, + "step": 2819 + }, + { + "epoch": 1.953585036369934, + "grad_norm": 0.943651556968689, + "learning_rate": 8.05131761442441e-06, + "loss": 0.0933, + "step": 2820 + }, + { + "epoch": 1.9542777970211294, + "grad_norm": 0.7307215929031372, + "learning_rate": 8.050624133148405e-06, + "loss": 0.0651, + "step": 2821 + }, + { + "epoch": 1.9549705576723242, + "grad_norm": 0.7968959212303162, + "learning_rate": 8.0499306518724e-06, + "loss": 0.0993, + "step": 2822 + }, + { + "epoch": 1.9556633183235193, + "grad_norm": 0.8350526094436646, + "learning_rate": 8.049237170596395e-06, + "loss": 0.08, + "step": 2823 + }, + { + "epoch": 1.9563560789747143, + "grad_norm": 0.7916170358657837, + "learning_rate": 8.04854368932039e-06, + "loss": 0.0848, + "step": 2824 + }, + { + "epoch": 1.9570488396259091, + "grad_norm": 0.7483957409858704, + "learning_rate": 8.047850208044383e-06, + "loss": 0.0716, + "step": 2825 + }, + { + "epoch": 1.9577416002771044, + "grad_norm": 0.7722062468528748, + "learning_rate": 8.047156726768378e-06, + "loss": 0.0952, + "step": 2826 + }, + { + "epoch": 1.9584343609282993, + "grad_norm": 0.8419215083122253, + "learning_rate": 8.046463245492371e-06, + "loss": 0.0846, + "step": 2827 + }, + { + "epoch": 1.9591271215794943, + "grad_norm": 0.8864947557449341, + "learning_rate": 8.045769764216366e-06, + "loss": 0.0883, + "step": 2828 + }, + { + "epoch": 1.9598198822306894, + "grad_norm": 0.865912914276123, + "learning_rate": 8.045076282940361e-06, + "loss": 0.0983, + "step": 2829 + }, + { + "epoch": 1.9605126428818842, + "grad_norm": 0.8244338631629944, + "learning_rate": 8.044382801664356e-06, + "loss": 0.0973, + "step": 2830 + }, + { + "epoch": 1.9612054035330795, + "grad_norm": 0.762330949306488, + "learning_rate": 8.043689320388351e-06, + "loss": 0.0802, + "step": 2831 + }, + { + "epoch": 1.9618981641842743, + "grad_norm": 0.8400306105613708, + "learning_rate": 8.042995839112344e-06, + "loss": 0.0925, + "step": 2832 + }, + { + "epoch": 1.9625909248354694, + "grad_norm": 0.8090659976005554, + "learning_rate": 8.04230235783634e-06, + "loss": 0.0959, + "step": 2833 + }, + { + "epoch": 1.9632836854866644, + "grad_norm": 0.9807546734809875, + "learning_rate": 8.041608876560334e-06, + "loss": 0.11, + "step": 2834 + }, + { + "epoch": 1.9639764461378593, + "grad_norm": 0.7383775115013123, + "learning_rate": 8.040915395284327e-06, + "loss": 0.0813, + "step": 2835 + }, + { + "epoch": 1.9646692067890545, + "grad_norm": 1.0186407566070557, + "learning_rate": 8.040221914008322e-06, + "loss": 0.128, + "step": 2836 + }, + { + "epoch": 1.9653619674402494, + "grad_norm": 0.7591642737388611, + "learning_rate": 8.039528432732317e-06, + "loss": 0.0876, + "step": 2837 + }, + { + "epoch": 1.9660547280914444, + "grad_norm": 0.9109426140785217, + "learning_rate": 8.038834951456312e-06, + "loss": 0.0865, + "step": 2838 + }, + { + "epoch": 1.9667474887426395, + "grad_norm": 0.9108000993728638, + "learning_rate": 8.038141470180305e-06, + "loss": 0.1074, + "step": 2839 + }, + { + "epoch": 1.9674402493938343, + "grad_norm": 0.9961658120155334, + "learning_rate": 8.0374479889043e-06, + "loss": 0.1089, + "step": 2840 + }, + { + "epoch": 1.9681330100450296, + "grad_norm": 1.0040456056594849, + "learning_rate": 8.036754507628295e-06, + "loss": 0.0915, + "step": 2841 + }, + { + "epoch": 1.9688257706962244, + "grad_norm": 0.7719624042510986, + "learning_rate": 8.036061026352289e-06, + "loss": 0.0971, + "step": 2842 + }, + { + "epoch": 1.9695185313474195, + "grad_norm": 0.8600178360939026, + "learning_rate": 8.035367545076283e-06, + "loss": 0.0887, + "step": 2843 + }, + { + "epoch": 1.9702112919986146, + "grad_norm": 0.8480808138847351, + "learning_rate": 8.034674063800278e-06, + "loss": 0.0853, + "step": 2844 + }, + { + "epoch": 1.9709040526498094, + "grad_norm": 0.765336275100708, + "learning_rate": 8.033980582524272e-06, + "loss": 0.0842, + "step": 2845 + }, + { + "epoch": 1.9715968133010047, + "grad_norm": 0.895420491695404, + "learning_rate": 8.033287101248267e-06, + "loss": 0.0858, + "step": 2846 + }, + { + "epoch": 1.9722895739521995, + "grad_norm": 0.9333667159080505, + "learning_rate": 8.032593619972261e-06, + "loss": 0.088, + "step": 2847 + }, + { + "epoch": 1.9729823346033946, + "grad_norm": 0.815650463104248, + "learning_rate": 8.031900138696256e-06, + "loss": 0.0834, + "step": 2848 + }, + { + "epoch": 1.9736750952545896, + "grad_norm": 0.8807889819145203, + "learning_rate": 8.031206657420251e-06, + "loss": 0.1003, + "step": 2849 + }, + { + "epoch": 1.9743678559057845, + "grad_norm": 0.9676668643951416, + "learning_rate": 8.030513176144245e-06, + "loss": 0.1208, + "step": 2850 + }, + { + "epoch": 1.9750606165569795, + "grad_norm": 0.8601505756378174, + "learning_rate": 8.02981969486824e-06, + "loss": 0.1093, + "step": 2851 + }, + { + "epoch": 1.9757533772081746, + "grad_norm": 0.8118881583213806, + "learning_rate": 8.029126213592233e-06, + "loss": 0.0895, + "step": 2852 + }, + { + "epoch": 1.9764461378593696, + "grad_norm": 0.8941214084625244, + "learning_rate": 8.028432732316228e-06, + "loss": 0.0917, + "step": 2853 + }, + { + "epoch": 1.9771388985105647, + "grad_norm": 0.9647018909454346, + "learning_rate": 8.027739251040223e-06, + "loss": 0.1209, + "step": 2854 + }, + { + "epoch": 1.9778316591617595, + "grad_norm": 0.9033962488174438, + "learning_rate": 8.027045769764218e-06, + "loss": 0.0974, + "step": 2855 + }, + { + "epoch": 1.9785244198129546, + "grad_norm": 1.0619659423828125, + "learning_rate": 8.026352288488212e-06, + "loss": 0.1026, + "step": 2856 + }, + { + "epoch": 1.9792171804641496, + "grad_norm": 0.7518752813339233, + "learning_rate": 8.025658807212206e-06, + "loss": 0.0845, + "step": 2857 + }, + { + "epoch": 1.9799099411153447, + "grad_norm": 0.8235559463500977, + "learning_rate": 8.0249653259362e-06, + "loss": 0.0884, + "step": 2858 + }, + { + "epoch": 1.9806027017665397, + "grad_norm": 0.8043603897094727, + "learning_rate": 8.024271844660196e-06, + "loss": 0.0921, + "step": 2859 + }, + { + "epoch": 1.9812954624177346, + "grad_norm": 0.8355159759521484, + "learning_rate": 8.023578363384189e-06, + "loss": 0.0895, + "step": 2860 + }, + { + "epoch": 1.9819882230689296, + "grad_norm": 0.9968959093093872, + "learning_rate": 8.022884882108184e-06, + "loss": 0.109, + "step": 2861 + }, + { + "epoch": 1.9826809837201247, + "grad_norm": 0.8901378512382507, + "learning_rate": 8.022191400832177e-06, + "loss": 0.1086, + "step": 2862 + }, + { + "epoch": 1.9833737443713197, + "grad_norm": 0.8336589336395264, + "learning_rate": 8.021497919556172e-06, + "loss": 0.0877, + "step": 2863 + }, + { + "epoch": 1.9840665050225148, + "grad_norm": 0.8249452710151672, + "learning_rate": 8.020804438280167e-06, + "loss": 0.0773, + "step": 2864 + }, + { + "epoch": 1.9847592656737096, + "grad_norm": 0.8235527276992798, + "learning_rate": 8.020110957004162e-06, + "loss": 0.0897, + "step": 2865 + }, + { + "epoch": 1.9854520263249047, + "grad_norm": 0.9498945474624634, + "learning_rate": 8.019417475728157e-06, + "loss": 0.1136, + "step": 2866 + }, + { + "epoch": 1.9861447869760998, + "grad_norm": 0.7912014722824097, + "learning_rate": 8.01872399445215e-06, + "loss": 0.0935, + "step": 2867 + }, + { + "epoch": 1.9868375476272948, + "grad_norm": 0.7760205268859863, + "learning_rate": 8.018030513176145e-06, + "loss": 0.0817, + "step": 2868 + }, + { + "epoch": 1.9875303082784899, + "grad_norm": 0.8328711986541748, + "learning_rate": 8.01733703190014e-06, + "loss": 0.0877, + "step": 2869 + }, + { + "epoch": 1.9882230689296847, + "grad_norm": 0.8077870607376099, + "learning_rate": 8.016643550624133e-06, + "loss": 0.0796, + "step": 2870 + }, + { + "epoch": 1.9889158295808798, + "grad_norm": 1.1094268560409546, + "learning_rate": 8.015950069348128e-06, + "loss": 0.0997, + "step": 2871 + }, + { + "epoch": 1.9896085902320748, + "grad_norm": 0.8952378630638123, + "learning_rate": 8.015256588072123e-06, + "loss": 0.0955, + "step": 2872 + }, + { + "epoch": 1.9903013508832699, + "grad_norm": 0.7212051153182983, + "learning_rate": 8.014563106796118e-06, + "loss": 0.0755, + "step": 2873 + }, + { + "epoch": 1.990994111534465, + "grad_norm": 0.7911790609359741, + "learning_rate": 8.013869625520113e-06, + "loss": 0.1015, + "step": 2874 + }, + { + "epoch": 1.9916868721856598, + "grad_norm": 0.807488203048706, + "learning_rate": 8.013176144244106e-06, + "loss": 0.1066, + "step": 2875 + }, + { + "epoch": 1.9923796328368548, + "grad_norm": 0.8579085469245911, + "learning_rate": 8.012482662968101e-06, + "loss": 0.0918, + "step": 2876 + }, + { + "epoch": 1.9930723934880499, + "grad_norm": 0.8495670557022095, + "learning_rate": 8.011789181692094e-06, + "loss": 0.0944, + "step": 2877 + }, + { + "epoch": 1.993765154139245, + "grad_norm": 0.7934786081314087, + "learning_rate": 8.01109570041609e-06, + "loss": 0.0792, + "step": 2878 + }, + { + "epoch": 1.99445791479044, + "grad_norm": 0.9561198353767395, + "learning_rate": 8.010402219140084e-06, + "loss": 0.1, + "step": 2879 + }, + { + "epoch": 1.9951506754416348, + "grad_norm": 0.8843298554420471, + "learning_rate": 8.009708737864077e-06, + "loss": 0.1082, + "step": 2880 + }, + { + "epoch": 1.9958434360928299, + "grad_norm": 0.9929296374320984, + "learning_rate": 8.009015256588072e-06, + "loss": 0.1353, + "step": 2881 + }, + { + "epoch": 1.996536196744025, + "grad_norm": 0.9166862368583679, + "learning_rate": 8.008321775312067e-06, + "loss": 0.0744, + "step": 2882 + }, + { + "epoch": 1.99722895739522, + "grad_norm": 0.7657050490379333, + "learning_rate": 8.007628294036062e-06, + "loss": 0.0863, + "step": 2883 + }, + { + "epoch": 1.997921718046415, + "grad_norm": 1.0217808485031128, + "learning_rate": 8.006934812760057e-06, + "loss": 0.1173, + "step": 2884 + }, + { + "epoch": 1.9986144786976099, + "grad_norm": 1.1051387786865234, + "learning_rate": 8.00624133148405e-06, + "loss": 0.0966, + "step": 2885 + }, + { + "epoch": 1.999307239348805, + "grad_norm": 0.908137321472168, + "learning_rate": 8.005547850208045e-06, + "loss": 0.1077, + "step": 2886 + }, + { + "epoch": 2.0, + "grad_norm": 1.0401666164398193, + "learning_rate": 8.004854368932038e-06, + "loss": 0.105, + "step": 2887 + }, + { + "epoch": 2.0, + "eval_loss": 0.2396479994058609, + "eval_runtime": 7675.0388, + "eval_samples_per_second": 1.042, + "eval_steps_per_second": 0.033, + "eval_wer": 13.347775953038191, + "step": 2887 + }, + { + "epoch": 2.000692760651195, + "grad_norm": 0.5223402976989746, + "learning_rate": 8.004160887656033e-06, + "loss": 0.0599, + "step": 2888 + }, + { + "epoch": 2.00138552130239, + "grad_norm": 0.557817816734314, + "learning_rate": 8.003467406380028e-06, + "loss": 0.0408, + "step": 2889 + }, + { + "epoch": 2.002078281953585, + "grad_norm": 0.5676665902137756, + "learning_rate": 8.002773925104023e-06, + "loss": 0.0476, + "step": 2890 + }, + { + "epoch": 2.0027710426047802, + "grad_norm": 0.6631727814674377, + "learning_rate": 8.002080443828018e-06, + "loss": 0.0633, + "step": 2891 + }, + { + "epoch": 2.003463803255975, + "grad_norm": 0.6056278347969055, + "learning_rate": 8.001386962552011e-06, + "loss": 0.0596, + "step": 2892 + }, + { + "epoch": 2.00415656390717, + "grad_norm": 0.573501706123352, + "learning_rate": 8.000693481276006e-06, + "loss": 0.0525, + "step": 2893 + }, + { + "epoch": 2.004849324558365, + "grad_norm": 0.6473743319511414, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0541, + "step": 2894 + }, + { + "epoch": 2.00554208520956, + "grad_norm": 0.6857211589813232, + "learning_rate": 7.999306518723995e-06, + "loss": 0.0602, + "step": 2895 + }, + { + "epoch": 2.0062348458607553, + "grad_norm": 0.7229959964752197, + "learning_rate": 7.99861303744799e-06, + "loss": 0.066, + "step": 2896 + }, + { + "epoch": 2.00692760651195, + "grad_norm": 0.5958266854286194, + "learning_rate": 7.997919556171983e-06, + "loss": 0.0621, + "step": 2897 + }, + { + "epoch": 2.007620367163145, + "grad_norm": 0.6253556609153748, + "learning_rate": 7.997226074895978e-06, + "loss": 0.0554, + "step": 2898 + }, + { + "epoch": 2.0083131278143402, + "grad_norm": 0.6457939147949219, + "learning_rate": 7.996532593619973e-06, + "loss": 0.0652, + "step": 2899 + }, + { + "epoch": 2.009005888465535, + "grad_norm": 0.7323454022407532, + "learning_rate": 7.995839112343968e-06, + "loss": 0.0561, + "step": 2900 + }, + { + "epoch": 2.0096986491167304, + "grad_norm": 0.6677737236022949, + "learning_rate": 7.995145631067962e-06, + "loss": 0.0748, + "step": 2901 + }, + { + "epoch": 2.010391409767925, + "grad_norm": 0.7082302570343018, + "learning_rate": 7.994452149791956e-06, + "loss": 0.0503, + "step": 2902 + }, + { + "epoch": 2.01108417041912, + "grad_norm": 0.6542059183120728, + "learning_rate": 7.99375866851595e-06, + "loss": 0.0498, + "step": 2903 + }, + { + "epoch": 2.0117769310703153, + "grad_norm": 0.6658546328544617, + "learning_rate": 7.993065187239946e-06, + "loss": 0.0586, + "step": 2904 + }, + { + "epoch": 2.01246969172151, + "grad_norm": 0.8179011940956116, + "learning_rate": 7.992371705963939e-06, + "loss": 0.06, + "step": 2905 + }, + { + "epoch": 2.0131624523727054, + "grad_norm": 0.6736352443695068, + "learning_rate": 7.991678224687934e-06, + "loss": 0.0544, + "step": 2906 + }, + { + "epoch": 2.0138552130239002, + "grad_norm": 0.6913522481918335, + "learning_rate": 7.990984743411929e-06, + "loss": 0.0524, + "step": 2907 + }, + { + "epoch": 2.014547973675095, + "grad_norm": 0.7478219270706177, + "learning_rate": 7.990291262135924e-06, + "loss": 0.0601, + "step": 2908 + }, + { + "epoch": 2.0152407343262904, + "grad_norm": 0.7340624332427979, + "learning_rate": 7.989597780859919e-06, + "loss": 0.0632, + "step": 2909 + }, + { + "epoch": 2.015933494977485, + "grad_norm": 0.6508964896202087, + "learning_rate": 7.988904299583912e-06, + "loss": 0.0575, + "step": 2910 + }, + { + "epoch": 2.0166262556286805, + "grad_norm": 0.6782339811325073, + "learning_rate": 7.988210818307907e-06, + "loss": 0.0526, + "step": 2911 + }, + { + "epoch": 2.0173190162798753, + "grad_norm": 0.6231827735900879, + "learning_rate": 7.9875173370319e-06, + "loss": 0.0515, + "step": 2912 + }, + { + "epoch": 2.01801177693107, + "grad_norm": 0.6707166433334351, + "learning_rate": 7.986823855755895e-06, + "loss": 0.0553, + "step": 2913 + }, + { + "epoch": 2.0187045375822654, + "grad_norm": 0.6236500144004822, + "learning_rate": 7.98613037447989e-06, + "loss": 0.0512, + "step": 2914 + }, + { + "epoch": 2.0193972982334603, + "grad_norm": 0.7530456781387329, + "learning_rate": 7.985436893203885e-06, + "loss": 0.0567, + "step": 2915 + }, + { + "epoch": 2.0200900588846555, + "grad_norm": 0.7120798230171204, + "learning_rate": 7.984743411927878e-06, + "loss": 0.0667, + "step": 2916 + }, + { + "epoch": 2.0207828195358504, + "grad_norm": 0.6329599618911743, + "learning_rate": 7.984049930651873e-06, + "loss": 0.0603, + "step": 2917 + }, + { + "epoch": 2.021475580187045, + "grad_norm": 0.6339494585990906, + "learning_rate": 7.983356449375868e-06, + "loss": 0.0584, + "step": 2918 + }, + { + "epoch": 2.0221683408382405, + "grad_norm": 0.6025832891464233, + "learning_rate": 7.982662968099863e-06, + "loss": 0.0537, + "step": 2919 + }, + { + "epoch": 2.0228611014894353, + "grad_norm": 0.6312150359153748, + "learning_rate": 7.981969486823856e-06, + "loss": 0.0551, + "step": 2920 + }, + { + "epoch": 2.0235538621406306, + "grad_norm": 0.5520080327987671, + "learning_rate": 7.981276005547851e-06, + "loss": 0.0366, + "step": 2921 + }, + { + "epoch": 2.0242466227918254, + "grad_norm": 0.648896336555481, + "learning_rate": 7.980582524271844e-06, + "loss": 0.0456, + "step": 2922 + }, + { + "epoch": 2.0249393834430203, + "grad_norm": 0.6456888914108276, + "learning_rate": 7.97988904299584e-06, + "loss": 0.0576, + "step": 2923 + }, + { + "epoch": 2.0256321440942155, + "grad_norm": 0.6049957871437073, + "learning_rate": 7.979195561719834e-06, + "loss": 0.0524, + "step": 2924 + }, + { + "epoch": 2.0263249047454104, + "grad_norm": 0.6657125353813171, + "learning_rate": 7.978502080443829e-06, + "loss": 0.0623, + "step": 2925 + }, + { + "epoch": 2.0270176653966057, + "grad_norm": 0.6422529816627502, + "learning_rate": 7.977808599167824e-06, + "loss": 0.0627, + "step": 2926 + }, + { + "epoch": 2.0277104260478005, + "grad_norm": 0.5862202048301697, + "learning_rate": 7.977115117891817e-06, + "loss": 0.0368, + "step": 2927 + }, + { + "epoch": 2.0284031866989953, + "grad_norm": 0.6602012515068054, + "learning_rate": 7.976421636615812e-06, + "loss": 0.0663, + "step": 2928 + }, + { + "epoch": 2.0290959473501906, + "grad_norm": 0.6749635338783264, + "learning_rate": 7.975728155339807e-06, + "loss": 0.0566, + "step": 2929 + }, + { + "epoch": 2.0297887080013854, + "grad_norm": 0.6463996767997742, + "learning_rate": 7.9750346740638e-06, + "loss": 0.0528, + "step": 2930 + }, + { + "epoch": 2.0304814686525807, + "grad_norm": 0.6901179552078247, + "learning_rate": 7.974341192787795e-06, + "loss": 0.0525, + "step": 2931 + }, + { + "epoch": 2.0311742293037756, + "grad_norm": 0.6191695332527161, + "learning_rate": 7.97364771151179e-06, + "loss": 0.0449, + "step": 2932 + }, + { + "epoch": 2.0318669899549704, + "grad_norm": 0.6079990267753601, + "learning_rate": 7.972954230235785e-06, + "loss": 0.049, + "step": 2933 + }, + { + "epoch": 2.0325597506061657, + "grad_norm": 0.7164596319198608, + "learning_rate": 7.97226074895978e-06, + "loss": 0.0598, + "step": 2934 + }, + { + "epoch": 2.0332525112573605, + "grad_norm": 0.6479280591011047, + "learning_rate": 7.971567267683773e-06, + "loss": 0.0536, + "step": 2935 + }, + { + "epoch": 2.033945271908556, + "grad_norm": 0.7407905459403992, + "learning_rate": 7.970873786407768e-06, + "loss": 0.0657, + "step": 2936 + }, + { + "epoch": 2.0346380325597506, + "grad_norm": 0.6542361378669739, + "learning_rate": 7.970180305131761e-06, + "loss": 0.0552, + "step": 2937 + }, + { + "epoch": 2.0353307932109455, + "grad_norm": 0.6269006729125977, + "learning_rate": 7.969486823855756e-06, + "loss": 0.0668, + "step": 2938 + }, + { + "epoch": 2.0360235538621407, + "grad_norm": 0.6172609329223633, + "learning_rate": 7.968793342579751e-06, + "loss": 0.0465, + "step": 2939 + }, + { + "epoch": 2.0367163145133356, + "grad_norm": 0.7666977643966675, + "learning_rate": 7.968099861303745e-06, + "loss": 0.0518, + "step": 2940 + }, + { + "epoch": 2.037409075164531, + "grad_norm": 0.6957048177719116, + "learning_rate": 7.96740638002774e-06, + "loss": 0.0612, + "step": 2941 + }, + { + "epoch": 2.0381018358157257, + "grad_norm": 0.5479961633682251, + "learning_rate": 7.966712898751734e-06, + "loss": 0.039, + "step": 2942 + }, + { + "epoch": 2.0387945964669205, + "grad_norm": 0.7017296552658081, + "learning_rate": 7.96601941747573e-06, + "loss": 0.0548, + "step": 2943 + }, + { + "epoch": 2.039487357118116, + "grad_norm": 0.6820141077041626, + "learning_rate": 7.965325936199724e-06, + "loss": 0.054, + "step": 2944 + }, + { + "epoch": 2.0401801177693106, + "grad_norm": 0.7078639268875122, + "learning_rate": 7.964632454923718e-06, + "loss": 0.064, + "step": 2945 + }, + { + "epoch": 2.040872878420506, + "grad_norm": 0.6844906210899353, + "learning_rate": 7.963938973647712e-06, + "loss": 0.0557, + "step": 2946 + }, + { + "epoch": 2.0415656390717007, + "grad_norm": 0.75017911195755, + "learning_rate": 7.963245492371706e-06, + "loss": 0.0641, + "step": 2947 + }, + { + "epoch": 2.0422583997228956, + "grad_norm": 0.6246193647384644, + "learning_rate": 7.9625520110957e-06, + "loss": 0.0493, + "step": 2948 + }, + { + "epoch": 2.042951160374091, + "grad_norm": 0.6279093027114868, + "learning_rate": 7.961858529819696e-06, + "loss": 0.0502, + "step": 2949 + }, + { + "epoch": 2.0436439210252857, + "grad_norm": 0.6237124800682068, + "learning_rate": 7.96116504854369e-06, + "loss": 0.0503, + "step": 2950 + }, + { + "epoch": 2.044336681676481, + "grad_norm": 0.5340720415115356, + "learning_rate": 7.960471567267685e-06, + "loss": 0.0405, + "step": 2951 + }, + { + "epoch": 2.045029442327676, + "grad_norm": 0.565596342086792, + "learning_rate": 7.959778085991679e-06, + "loss": 0.0513, + "step": 2952 + }, + { + "epoch": 2.0457222029788706, + "grad_norm": 0.6254692077636719, + "learning_rate": 7.959084604715674e-06, + "loss": 0.0516, + "step": 2953 + }, + { + "epoch": 2.046414963630066, + "grad_norm": 0.5610790848731995, + "learning_rate": 7.958391123439669e-06, + "loss": 0.0503, + "step": 2954 + }, + { + "epoch": 2.0471077242812608, + "grad_norm": 0.7071074843406677, + "learning_rate": 7.957697642163662e-06, + "loss": 0.0609, + "step": 2955 + }, + { + "epoch": 2.047800484932456, + "grad_norm": 0.6902168989181519, + "learning_rate": 7.957004160887657e-06, + "loss": 0.0543, + "step": 2956 + }, + { + "epoch": 2.048493245583651, + "grad_norm": 0.6811742186546326, + "learning_rate": 7.95631067961165e-06, + "loss": 0.0508, + "step": 2957 + }, + { + "epoch": 2.0491860062348457, + "grad_norm": 0.5946432948112488, + "learning_rate": 7.955617198335645e-06, + "loss": 0.0415, + "step": 2958 + }, + { + "epoch": 2.049878766886041, + "grad_norm": 0.7139899134635925, + "learning_rate": 7.95492371705964e-06, + "loss": 0.0523, + "step": 2959 + }, + { + "epoch": 2.050571527537236, + "grad_norm": 0.5924705266952515, + "learning_rate": 7.954230235783635e-06, + "loss": 0.054, + "step": 2960 + }, + { + "epoch": 2.051264288188431, + "grad_norm": 0.6712675094604492, + "learning_rate": 7.95353675450763e-06, + "loss": 0.0535, + "step": 2961 + }, + { + "epoch": 2.051957048839626, + "grad_norm": 0.6940580010414124, + "learning_rate": 7.952843273231623e-06, + "loss": 0.0554, + "step": 2962 + }, + { + "epoch": 2.0526498094908208, + "grad_norm": 0.6344907879829407, + "learning_rate": 7.952149791955618e-06, + "loss": 0.043, + "step": 2963 + }, + { + "epoch": 2.053342570142016, + "grad_norm": 0.6311092376708984, + "learning_rate": 7.951456310679613e-06, + "loss": 0.0504, + "step": 2964 + }, + { + "epoch": 2.054035330793211, + "grad_norm": 0.6497060656547546, + "learning_rate": 7.950762829403606e-06, + "loss": 0.0402, + "step": 2965 + }, + { + "epoch": 2.054728091444406, + "grad_norm": 0.5674020051956177, + "learning_rate": 7.950069348127601e-06, + "loss": 0.0505, + "step": 2966 + }, + { + "epoch": 2.055420852095601, + "grad_norm": 0.5772608518600464, + "learning_rate": 7.949375866851596e-06, + "loss": 0.0501, + "step": 2967 + }, + { + "epoch": 2.056113612746796, + "grad_norm": 0.7331957221031189, + "learning_rate": 7.94868238557559e-06, + "loss": 0.0426, + "step": 2968 + }, + { + "epoch": 2.056806373397991, + "grad_norm": 0.6528723835945129, + "learning_rate": 7.947988904299586e-06, + "loss": 0.0516, + "step": 2969 + }, + { + "epoch": 2.057499134049186, + "grad_norm": 0.5991925597190857, + "learning_rate": 7.947295423023579e-06, + "loss": 0.0489, + "step": 2970 + }, + { + "epoch": 2.058191894700381, + "grad_norm": 0.539746105670929, + "learning_rate": 7.946601941747574e-06, + "loss": 0.0419, + "step": 2971 + }, + { + "epoch": 2.058884655351576, + "grad_norm": 0.7843754887580872, + "learning_rate": 7.945908460471567e-06, + "loss": 0.0545, + "step": 2972 + }, + { + "epoch": 2.059577416002771, + "grad_norm": 0.7631123065948486, + "learning_rate": 7.945214979195562e-06, + "loss": 0.0374, + "step": 2973 + }, + { + "epoch": 2.060270176653966, + "grad_norm": 0.7931402325630188, + "learning_rate": 7.944521497919557e-06, + "loss": 0.0609, + "step": 2974 + }, + { + "epoch": 2.060962937305161, + "grad_norm": 0.5307839512825012, + "learning_rate": 7.94382801664355e-06, + "loss": 0.0422, + "step": 2975 + }, + { + "epoch": 2.0616556979563563, + "grad_norm": 0.6274649500846863, + "learning_rate": 7.943134535367545e-06, + "loss": 0.05, + "step": 2976 + }, + { + "epoch": 2.062348458607551, + "grad_norm": 0.6522165536880493, + "learning_rate": 7.94244105409154e-06, + "loss": 0.0474, + "step": 2977 + }, + { + "epoch": 2.063041219258746, + "grad_norm": 0.677781343460083, + "learning_rate": 7.941747572815535e-06, + "loss": 0.0453, + "step": 2978 + }, + { + "epoch": 2.0637339799099412, + "grad_norm": 0.6549713015556335, + "learning_rate": 7.94105409153953e-06, + "loss": 0.0605, + "step": 2979 + }, + { + "epoch": 2.064426740561136, + "grad_norm": 0.6024202108383179, + "learning_rate": 7.940360610263523e-06, + "loss": 0.0567, + "step": 2980 + }, + { + "epoch": 2.0651195012123313, + "grad_norm": 0.6366188526153564, + "learning_rate": 7.939667128987518e-06, + "loss": 0.0496, + "step": 2981 + }, + { + "epoch": 2.065812261863526, + "grad_norm": 0.7096099853515625, + "learning_rate": 7.938973647711511e-06, + "loss": 0.062, + "step": 2982 + }, + { + "epoch": 2.066505022514721, + "grad_norm": 0.7041839957237244, + "learning_rate": 7.938280166435506e-06, + "loss": 0.0547, + "step": 2983 + }, + { + "epoch": 2.0671977831659163, + "grad_norm": 0.6971117258071899, + "learning_rate": 7.937586685159501e-06, + "loss": 0.0442, + "step": 2984 + }, + { + "epoch": 2.067890543817111, + "grad_norm": 0.6415303349494934, + "learning_rate": 7.936893203883496e-06, + "loss": 0.0538, + "step": 2985 + }, + { + "epoch": 2.0685833044683064, + "grad_norm": 0.6057801246643066, + "learning_rate": 7.936199722607491e-06, + "loss": 0.0425, + "step": 2986 + }, + { + "epoch": 2.0692760651195012, + "grad_norm": 0.7372746467590332, + "learning_rate": 7.935506241331484e-06, + "loss": 0.0514, + "step": 2987 + }, + { + "epoch": 2.069968825770696, + "grad_norm": 0.6791513562202454, + "learning_rate": 7.93481276005548e-06, + "loss": 0.0531, + "step": 2988 + }, + { + "epoch": 2.0706615864218914, + "grad_norm": 0.6759649515151978, + "learning_rate": 7.934119278779474e-06, + "loss": 0.0537, + "step": 2989 + }, + { + "epoch": 2.071354347073086, + "grad_norm": 0.64107346534729, + "learning_rate": 7.933425797503468e-06, + "loss": 0.0504, + "step": 2990 + }, + { + "epoch": 2.0720471077242815, + "grad_norm": 0.6264382600784302, + "learning_rate": 7.932732316227462e-06, + "loss": 0.0472, + "step": 2991 + }, + { + "epoch": 2.0727398683754763, + "grad_norm": 0.6337283253669739, + "learning_rate": 7.932038834951457e-06, + "loss": 0.0542, + "step": 2992 + }, + { + "epoch": 2.073432629026671, + "grad_norm": 0.7504245638847351, + "learning_rate": 7.93134535367545e-06, + "loss": 0.0552, + "step": 2993 + }, + { + "epoch": 2.0741253896778664, + "grad_norm": 0.8058987855911255, + "learning_rate": 7.930651872399446e-06, + "loss": 0.07, + "step": 2994 + }, + { + "epoch": 2.0748181503290613, + "grad_norm": 0.6072273254394531, + "learning_rate": 7.92995839112344e-06, + "loss": 0.0563, + "step": 2995 + }, + { + "epoch": 2.0755109109802565, + "grad_norm": 0.6442796587944031, + "learning_rate": 7.929264909847435e-06, + "loss": 0.0563, + "step": 2996 + }, + { + "epoch": 2.0762036716314514, + "grad_norm": 0.6914668083190918, + "learning_rate": 7.928571428571429e-06, + "loss": 0.0561, + "step": 2997 + }, + { + "epoch": 2.076896432282646, + "grad_norm": 0.6183263659477234, + "learning_rate": 7.927877947295424e-06, + "loss": 0.0436, + "step": 2998 + }, + { + "epoch": 2.0775891929338415, + "grad_norm": 0.5945209264755249, + "learning_rate": 7.927184466019419e-06, + "loss": 0.0491, + "step": 2999 + }, + { + "epoch": 2.0782819535850363, + "grad_norm": 0.7157323956489563, + "learning_rate": 7.926490984743412e-06, + "loss": 0.0639, + "step": 3000 + }, + { + "epoch": 2.0789747142362316, + "grad_norm": 0.7220014929771423, + "learning_rate": 7.925797503467407e-06, + "loss": 0.0724, + "step": 3001 + }, + { + "epoch": 2.0796674748874264, + "grad_norm": 0.6445550918579102, + "learning_rate": 7.925104022191402e-06, + "loss": 0.0525, + "step": 3002 + }, + { + "epoch": 2.0803602355386213, + "grad_norm": 0.6667922139167786, + "learning_rate": 7.924410540915397e-06, + "loss": 0.0553, + "step": 3003 + }, + { + "epoch": 2.0810529961898165, + "grad_norm": 0.7367938160896301, + "learning_rate": 7.923717059639391e-06, + "loss": 0.0459, + "step": 3004 + }, + { + "epoch": 2.0817457568410114, + "grad_norm": 0.5869571566581726, + "learning_rate": 7.923023578363385e-06, + "loss": 0.0465, + "step": 3005 + }, + { + "epoch": 2.0824385174922067, + "grad_norm": 0.6422292590141296, + "learning_rate": 7.92233009708738e-06, + "loss": 0.0455, + "step": 3006 + }, + { + "epoch": 2.0831312781434015, + "grad_norm": 0.7092993855476379, + "learning_rate": 7.921636615811373e-06, + "loss": 0.0544, + "step": 3007 + }, + { + "epoch": 2.0838240387945963, + "grad_norm": 0.6405553817749023, + "learning_rate": 7.920943134535368e-06, + "loss": 0.0619, + "step": 3008 + }, + { + "epoch": 2.0845167994457916, + "grad_norm": 0.7614275217056274, + "learning_rate": 7.920249653259363e-06, + "loss": 0.0612, + "step": 3009 + }, + { + "epoch": 2.0852095600969864, + "grad_norm": 0.6826539635658264, + "learning_rate": 7.919556171983358e-06, + "loss": 0.0641, + "step": 3010 + }, + { + "epoch": 2.0859023207481817, + "grad_norm": 0.7215129137039185, + "learning_rate": 7.918862690707353e-06, + "loss": 0.0538, + "step": 3011 + }, + { + "epoch": 2.0865950813993766, + "grad_norm": 0.5781185030937195, + "learning_rate": 7.918169209431346e-06, + "loss": 0.0503, + "step": 3012 + }, + { + "epoch": 2.0872878420505714, + "grad_norm": 0.7786000967025757, + "learning_rate": 7.91747572815534e-06, + "loss": 0.0675, + "step": 3013 + }, + { + "epoch": 2.0879806027017667, + "grad_norm": 0.6735415458679199, + "learning_rate": 7.916782246879336e-06, + "loss": 0.0517, + "step": 3014 + }, + { + "epoch": 2.0886733633529615, + "grad_norm": 0.6100610494613647, + "learning_rate": 7.916088765603329e-06, + "loss": 0.055, + "step": 3015 + }, + { + "epoch": 2.0893661240041568, + "grad_norm": 0.6115071773529053, + "learning_rate": 7.915395284327324e-06, + "loss": 0.0486, + "step": 3016 + }, + { + "epoch": 2.0900588846553516, + "grad_norm": 0.6331568360328674, + "learning_rate": 7.914701803051317e-06, + "loss": 0.0397, + "step": 3017 + }, + { + "epoch": 2.0907516453065464, + "grad_norm": 0.6336967945098877, + "learning_rate": 7.914008321775312e-06, + "loss": 0.0497, + "step": 3018 + }, + { + "epoch": 2.0914444059577417, + "grad_norm": 0.7088524699211121, + "learning_rate": 7.913314840499307e-06, + "loss": 0.0571, + "step": 3019 + }, + { + "epoch": 2.0921371666089366, + "grad_norm": 0.8578203320503235, + "learning_rate": 7.912621359223302e-06, + "loss": 0.0577, + "step": 3020 + }, + { + "epoch": 2.092829927260132, + "grad_norm": 0.665698230266571, + "learning_rate": 7.911927877947297e-06, + "loss": 0.0519, + "step": 3021 + }, + { + "epoch": 2.0935226879113267, + "grad_norm": 0.625718355178833, + "learning_rate": 7.91123439667129e-06, + "loss": 0.0571, + "step": 3022 + }, + { + "epoch": 2.0942154485625215, + "grad_norm": 0.6121425032615662, + "learning_rate": 7.910540915395285e-06, + "loss": 0.0483, + "step": 3023 + }, + { + "epoch": 2.094908209213717, + "grad_norm": 0.6906800866127014, + "learning_rate": 7.90984743411928e-06, + "loss": 0.0535, + "step": 3024 + }, + { + "epoch": 2.0956009698649116, + "grad_norm": 0.688827395439148, + "learning_rate": 7.909153952843273e-06, + "loss": 0.0678, + "step": 3025 + }, + { + "epoch": 2.0962937305161065, + "grad_norm": 0.6596013903617859, + "learning_rate": 7.908460471567268e-06, + "loss": 0.0537, + "step": 3026 + }, + { + "epoch": 2.0969864911673017, + "grad_norm": 0.6849279999732971, + "learning_rate": 7.907766990291263e-06, + "loss": 0.0545, + "step": 3027 + }, + { + "epoch": 2.0976792518184966, + "grad_norm": 0.5821280479431152, + "learning_rate": 7.907073509015258e-06, + "loss": 0.0462, + "step": 3028 + }, + { + "epoch": 2.098372012469692, + "grad_norm": 0.6135004162788391, + "learning_rate": 7.906380027739253e-06, + "loss": 0.0525, + "step": 3029 + }, + { + "epoch": 2.0990647731208867, + "grad_norm": 0.5608628988265991, + "learning_rate": 7.905686546463246e-06, + "loss": 0.0425, + "step": 3030 + }, + { + "epoch": 2.099757533772082, + "grad_norm": 0.9066096544265747, + "learning_rate": 7.904993065187241e-06, + "loss": 0.0461, + "step": 3031 + }, + { + "epoch": 2.100450294423277, + "grad_norm": 0.6573208570480347, + "learning_rate": 7.904299583911234e-06, + "loss": 0.0557, + "step": 3032 + }, + { + "epoch": 2.1011430550744716, + "grad_norm": 0.5651605725288391, + "learning_rate": 7.90360610263523e-06, + "loss": 0.0431, + "step": 3033 + }, + { + "epoch": 2.101835815725667, + "grad_norm": 0.547998309135437, + "learning_rate": 7.902912621359224e-06, + "loss": 0.0417, + "step": 3034 + }, + { + "epoch": 2.1025285763768617, + "grad_norm": 0.5699440240859985, + "learning_rate": 7.902219140083217e-06, + "loss": 0.0514, + "step": 3035 + }, + { + "epoch": 2.1032213370280566, + "grad_norm": 0.7615641951560974, + "learning_rate": 7.901525658807212e-06, + "loss": 0.0602, + "step": 3036 + }, + { + "epoch": 2.103914097679252, + "grad_norm": 0.638684868812561, + "learning_rate": 7.900832177531207e-06, + "loss": 0.0525, + "step": 3037 + }, + { + "epoch": 2.1046068583304467, + "grad_norm": 0.663013756275177, + "learning_rate": 7.900138696255202e-06, + "loss": 0.0334, + "step": 3038 + }, + { + "epoch": 2.105299618981642, + "grad_norm": 0.7992621660232544, + "learning_rate": 7.899445214979197e-06, + "loss": 0.054, + "step": 3039 + }, + { + "epoch": 2.105992379632837, + "grad_norm": 0.6039307713508606, + "learning_rate": 7.89875173370319e-06, + "loss": 0.0525, + "step": 3040 + }, + { + "epoch": 2.106685140284032, + "grad_norm": 0.607890248298645, + "learning_rate": 7.898058252427185e-06, + "loss": 0.047, + "step": 3041 + }, + { + "epoch": 2.107377900935227, + "grad_norm": 0.7209950089454651, + "learning_rate": 7.897364771151179e-06, + "loss": 0.0704, + "step": 3042 + }, + { + "epoch": 2.1080706615864218, + "grad_norm": 0.6088247299194336, + "learning_rate": 7.896671289875174e-06, + "loss": 0.0398, + "step": 3043 + }, + { + "epoch": 2.108763422237617, + "grad_norm": 0.6290914416313171, + "learning_rate": 7.895977808599168e-06, + "loss": 0.0493, + "step": 3044 + }, + { + "epoch": 2.109456182888812, + "grad_norm": 0.61188143491745, + "learning_rate": 7.895284327323163e-06, + "loss": 0.0491, + "step": 3045 + }, + { + "epoch": 2.1101489435400067, + "grad_norm": 0.6168179512023926, + "learning_rate": 7.894590846047158e-06, + "loss": 0.0423, + "step": 3046 + }, + { + "epoch": 2.110841704191202, + "grad_norm": 0.7018704414367676, + "learning_rate": 7.893897364771152e-06, + "loss": 0.0543, + "step": 3047 + }, + { + "epoch": 2.111534464842397, + "grad_norm": 0.5940234065055847, + "learning_rate": 7.893203883495147e-06, + "loss": 0.0445, + "step": 3048 + }, + { + "epoch": 2.112227225493592, + "grad_norm": 0.6289170384407043, + "learning_rate": 7.892510402219141e-06, + "loss": 0.0479, + "step": 3049 + }, + { + "epoch": 2.112919986144787, + "grad_norm": 0.7200828194618225, + "learning_rate": 7.891816920943135e-06, + "loss": 0.057, + "step": 3050 + }, + { + "epoch": 2.113612746795982, + "grad_norm": 0.6054809093475342, + "learning_rate": 7.89112343966713e-06, + "loss": 0.0474, + "step": 3051 + }, + { + "epoch": 2.114305507447177, + "grad_norm": 0.7207465171813965, + "learning_rate": 7.890429958391123e-06, + "loss": 0.0751, + "step": 3052 + }, + { + "epoch": 2.114998268098372, + "grad_norm": 0.6288327574729919, + "learning_rate": 7.889736477115118e-06, + "loss": 0.0515, + "step": 3053 + }, + { + "epoch": 2.115691028749567, + "grad_norm": 0.6983470916748047, + "learning_rate": 7.889042995839113e-06, + "loss": 0.0491, + "step": 3054 + }, + { + "epoch": 2.116383789400762, + "grad_norm": 0.6435803771018982, + "learning_rate": 7.888349514563108e-06, + "loss": 0.0434, + "step": 3055 + }, + { + "epoch": 2.117076550051957, + "grad_norm": 0.6716493368148804, + "learning_rate": 7.887656033287103e-06, + "loss": 0.0558, + "step": 3056 + }, + { + "epoch": 2.117769310703152, + "grad_norm": 0.6133092641830444, + "learning_rate": 7.886962552011096e-06, + "loss": 0.0534, + "step": 3057 + }, + { + "epoch": 2.118462071354347, + "grad_norm": 0.5793967247009277, + "learning_rate": 7.88626907073509e-06, + "loss": 0.0401, + "step": 3058 + }, + { + "epoch": 2.119154832005542, + "grad_norm": 0.6967165470123291, + "learning_rate": 7.885575589459086e-06, + "loss": 0.0532, + "step": 3059 + }, + { + "epoch": 2.119847592656737, + "grad_norm": 0.5157840251922607, + "learning_rate": 7.884882108183079e-06, + "loss": 0.0377, + "step": 3060 + }, + { + "epoch": 2.1205403533079323, + "grad_norm": 0.6262407302856445, + "learning_rate": 7.884188626907074e-06, + "loss": 0.0458, + "step": 3061 + }, + { + "epoch": 2.121233113959127, + "grad_norm": 0.6209738254547119, + "learning_rate": 7.883495145631069e-06, + "loss": 0.0511, + "step": 3062 + }, + { + "epoch": 2.121925874610322, + "grad_norm": 0.6587721705436707, + "learning_rate": 7.882801664355064e-06, + "loss": 0.0579, + "step": 3063 + }, + { + "epoch": 2.1226186352615173, + "grad_norm": 0.6426872611045837, + "learning_rate": 7.882108183079059e-06, + "loss": 0.0541, + "step": 3064 + }, + { + "epoch": 2.123311395912712, + "grad_norm": 0.714944064617157, + "learning_rate": 7.881414701803052e-06, + "loss": 0.0582, + "step": 3065 + }, + { + "epoch": 2.124004156563907, + "grad_norm": 0.6246589422225952, + "learning_rate": 7.880721220527047e-06, + "loss": 0.0563, + "step": 3066 + }, + { + "epoch": 2.1246969172151022, + "grad_norm": 0.8801479339599609, + "learning_rate": 7.88002773925104e-06, + "loss": 0.0638, + "step": 3067 + }, + { + "epoch": 2.125389677866297, + "grad_norm": 0.6280035376548767, + "learning_rate": 7.879334257975035e-06, + "loss": 0.0483, + "step": 3068 + }, + { + "epoch": 2.1260824385174923, + "grad_norm": 0.670604407787323, + "learning_rate": 7.87864077669903e-06, + "loss": 0.0577, + "step": 3069 + }, + { + "epoch": 2.126775199168687, + "grad_norm": 0.6534485220909119, + "learning_rate": 7.877947295423023e-06, + "loss": 0.0465, + "step": 3070 + }, + { + "epoch": 2.1274679598198825, + "grad_norm": 0.7170853614807129, + "learning_rate": 7.877253814147018e-06, + "loss": 0.0411, + "step": 3071 + }, + { + "epoch": 2.1281607204710773, + "grad_norm": 0.6874311566352844, + "learning_rate": 7.876560332871013e-06, + "loss": 0.0568, + "step": 3072 + }, + { + "epoch": 2.128853481122272, + "grad_norm": 0.5714280605316162, + "learning_rate": 7.875866851595008e-06, + "loss": 0.0437, + "step": 3073 + }, + { + "epoch": 2.1295462417734674, + "grad_norm": 0.5944762229919434, + "learning_rate": 7.875173370319003e-06, + "loss": 0.0467, + "step": 3074 + }, + { + "epoch": 2.1302390024246622, + "grad_norm": 0.696190357208252, + "learning_rate": 7.874479889042996e-06, + "loss": 0.056, + "step": 3075 + }, + { + "epoch": 2.130931763075857, + "grad_norm": 0.576362669467926, + "learning_rate": 7.873786407766991e-06, + "loss": 0.0495, + "step": 3076 + }, + { + "epoch": 2.1316245237270524, + "grad_norm": 0.7446839213371277, + "learning_rate": 7.873092926490984e-06, + "loss": 0.0591, + "step": 3077 + }, + { + "epoch": 2.132317284378247, + "grad_norm": 0.5943527221679688, + "learning_rate": 7.87239944521498e-06, + "loss": 0.0486, + "step": 3078 + }, + { + "epoch": 2.1330100450294425, + "grad_norm": 0.5902846455574036, + "learning_rate": 7.871705963938974e-06, + "loss": 0.0446, + "step": 3079 + }, + { + "epoch": 2.1337028056806373, + "grad_norm": 0.6252435445785522, + "learning_rate": 7.87101248266297e-06, + "loss": 0.0438, + "step": 3080 + }, + { + "epoch": 2.1343955663318326, + "grad_norm": 0.6630060076713562, + "learning_rate": 7.870319001386964e-06, + "loss": 0.0614, + "step": 3081 + }, + { + "epoch": 2.1350883269830274, + "grad_norm": 0.5988588929176331, + "learning_rate": 7.869625520110957e-06, + "loss": 0.0431, + "step": 3082 + }, + { + "epoch": 2.1357810876342223, + "grad_norm": 0.8282397985458374, + "learning_rate": 7.868932038834952e-06, + "loss": 0.0673, + "step": 3083 + }, + { + "epoch": 2.1364738482854175, + "grad_norm": 0.8640880584716797, + "learning_rate": 7.868238557558947e-06, + "loss": 0.0504, + "step": 3084 + }, + { + "epoch": 2.1371666089366124, + "grad_norm": 0.6471401453018188, + "learning_rate": 7.86754507628294e-06, + "loss": 0.0553, + "step": 3085 + }, + { + "epoch": 2.137859369587807, + "grad_norm": 0.6992062330245972, + "learning_rate": 7.866851595006935e-06, + "loss": 0.0531, + "step": 3086 + }, + { + "epoch": 2.1385521302390025, + "grad_norm": 0.6501442790031433, + "learning_rate": 7.86615811373093e-06, + "loss": 0.0482, + "step": 3087 + }, + { + "epoch": 2.1392448908901973, + "grad_norm": 0.6028953790664673, + "learning_rate": 7.865464632454925e-06, + "loss": 0.0434, + "step": 3088 + }, + { + "epoch": 2.1399376515413926, + "grad_norm": 0.7869958281517029, + "learning_rate": 7.864771151178918e-06, + "loss": 0.0579, + "step": 3089 + }, + { + "epoch": 2.1406304121925874, + "grad_norm": 0.8090862035751343, + "learning_rate": 7.864077669902913e-06, + "loss": 0.0658, + "step": 3090 + }, + { + "epoch": 2.1413231728437827, + "grad_norm": 0.6419026851654053, + "learning_rate": 7.863384188626908e-06, + "loss": 0.0521, + "step": 3091 + }, + { + "epoch": 2.1420159334949775, + "grad_norm": 0.6343168616294861, + "learning_rate": 7.862690707350902e-06, + "loss": 0.0484, + "step": 3092 + }, + { + "epoch": 2.1427086941461724, + "grad_norm": 0.6734219789505005, + "learning_rate": 7.861997226074897e-06, + "loss": 0.0437, + "step": 3093 + }, + { + "epoch": 2.1434014547973677, + "grad_norm": 0.7136476635932922, + "learning_rate": 7.861303744798891e-06, + "loss": 0.057, + "step": 3094 + }, + { + "epoch": 2.1440942154485625, + "grad_norm": 0.8247693777084351, + "learning_rate": 7.860610263522885e-06, + "loss": 0.071, + "step": 3095 + }, + { + "epoch": 2.1447869760997573, + "grad_norm": 0.6431103348731995, + "learning_rate": 7.85991678224688e-06, + "loss": 0.0611, + "step": 3096 + }, + { + "epoch": 2.1454797367509526, + "grad_norm": 0.6054585576057434, + "learning_rate": 7.859223300970875e-06, + "loss": 0.047, + "step": 3097 + }, + { + "epoch": 2.1461724974021474, + "grad_norm": 0.7017614245414734, + "learning_rate": 7.85852981969487e-06, + "loss": 0.059, + "step": 3098 + }, + { + "epoch": 2.1468652580533427, + "grad_norm": 0.5154880881309509, + "learning_rate": 7.857836338418864e-06, + "loss": 0.0323, + "step": 3099 + }, + { + "epoch": 2.1475580187045376, + "grad_norm": 0.6577040553092957, + "learning_rate": 7.857142857142858e-06, + "loss": 0.0544, + "step": 3100 + }, + { + "epoch": 2.1482507793557324, + "grad_norm": 0.5519864559173584, + "learning_rate": 7.856449375866853e-06, + "loss": 0.0403, + "step": 3101 + }, + { + "epoch": 2.1489435400069277, + "grad_norm": 0.6759754419326782, + "learning_rate": 7.855755894590846e-06, + "loss": 0.0597, + "step": 3102 + }, + { + "epoch": 2.1496363006581225, + "grad_norm": 0.6048867106437683, + "learning_rate": 7.85506241331484e-06, + "loss": 0.0487, + "step": 3103 + }, + { + "epoch": 2.150329061309318, + "grad_norm": 0.6813057065010071, + "learning_rate": 7.854368932038836e-06, + "loss": 0.0518, + "step": 3104 + }, + { + "epoch": 2.1510218219605126, + "grad_norm": 0.7083479166030884, + "learning_rate": 7.85367545076283e-06, + "loss": 0.0405, + "step": 3105 + }, + { + "epoch": 2.1517145826117074, + "grad_norm": 0.5903550982475281, + "learning_rate": 7.852981969486826e-06, + "loss": 0.0595, + "step": 3106 + }, + { + "epoch": 2.1524073432629027, + "grad_norm": 0.700406551361084, + "learning_rate": 7.852288488210819e-06, + "loss": 0.056, + "step": 3107 + }, + { + "epoch": 2.1531001039140976, + "grad_norm": 0.7421799302101135, + "learning_rate": 7.851595006934814e-06, + "loss": 0.0625, + "step": 3108 + }, + { + "epoch": 2.153792864565293, + "grad_norm": 0.6779900193214417, + "learning_rate": 7.850901525658809e-06, + "loss": 0.0537, + "step": 3109 + }, + { + "epoch": 2.1544856252164877, + "grad_norm": 0.7126687169075012, + "learning_rate": 7.850208044382802e-06, + "loss": 0.0538, + "step": 3110 + }, + { + "epoch": 2.1551783858676825, + "grad_norm": 0.6870993971824646, + "learning_rate": 7.849514563106797e-06, + "loss": 0.0593, + "step": 3111 + }, + { + "epoch": 2.155871146518878, + "grad_norm": 0.9907678365707397, + "learning_rate": 7.84882108183079e-06, + "loss": 0.0746, + "step": 3112 + }, + { + "epoch": 2.1565639071700726, + "grad_norm": 0.5799802541732788, + "learning_rate": 7.848127600554785e-06, + "loss": 0.0422, + "step": 3113 + }, + { + "epoch": 2.157256667821268, + "grad_norm": 0.7612532377243042, + "learning_rate": 7.84743411927878e-06, + "loss": 0.0542, + "step": 3114 + }, + { + "epoch": 2.1579494284724627, + "grad_norm": 0.6533823013305664, + "learning_rate": 7.846740638002775e-06, + "loss": 0.0518, + "step": 3115 + }, + { + "epoch": 2.1586421891236576, + "grad_norm": 0.7348969578742981, + "learning_rate": 7.84604715672677e-06, + "loss": 0.0606, + "step": 3116 + }, + { + "epoch": 2.159334949774853, + "grad_norm": 0.6741594076156616, + "learning_rate": 7.845353675450763e-06, + "loss": 0.0544, + "step": 3117 + }, + { + "epoch": 2.1600277104260477, + "grad_norm": 0.6544450521469116, + "learning_rate": 7.844660194174758e-06, + "loss": 0.0548, + "step": 3118 + }, + { + "epoch": 2.160720471077243, + "grad_norm": 0.7590257525444031, + "learning_rate": 7.843966712898753e-06, + "loss": 0.051, + "step": 3119 + }, + { + "epoch": 2.161413231728438, + "grad_norm": 0.7456057071685791, + "learning_rate": 7.843273231622746e-06, + "loss": 0.0483, + "step": 3120 + }, + { + "epoch": 2.1621059923796326, + "grad_norm": 0.6765349507331848, + "learning_rate": 7.842579750346741e-06, + "loss": 0.0671, + "step": 3121 + }, + { + "epoch": 2.162798753030828, + "grad_norm": 0.6293055415153503, + "learning_rate": 7.841886269070736e-06, + "loss": 0.0461, + "step": 3122 + }, + { + "epoch": 2.1634915136820227, + "grad_norm": 0.627202570438385, + "learning_rate": 7.841192787794731e-06, + "loss": 0.0599, + "step": 3123 + }, + { + "epoch": 2.164184274333218, + "grad_norm": 0.5979534387588501, + "learning_rate": 7.840499306518726e-06, + "loss": 0.047, + "step": 3124 + }, + { + "epoch": 2.164877034984413, + "grad_norm": 0.6458120346069336, + "learning_rate": 7.839805825242719e-06, + "loss": 0.0471, + "step": 3125 + }, + { + "epoch": 2.1655697956356077, + "grad_norm": 0.6617605090141296, + "learning_rate": 7.839112343966714e-06, + "loss": 0.0631, + "step": 3126 + }, + { + "epoch": 2.166262556286803, + "grad_norm": 0.7290152907371521, + "learning_rate": 7.838418862690707e-06, + "loss": 0.0715, + "step": 3127 + }, + { + "epoch": 2.166955316937998, + "grad_norm": 0.6518305540084839, + "learning_rate": 7.837725381414702e-06, + "loss": 0.056, + "step": 3128 + }, + { + "epoch": 2.167648077589193, + "grad_norm": 0.6306636333465576, + "learning_rate": 7.837031900138697e-06, + "loss": 0.0495, + "step": 3129 + }, + { + "epoch": 2.168340838240388, + "grad_norm": 0.6336146593093872, + "learning_rate": 7.83633841886269e-06, + "loss": 0.0545, + "step": 3130 + }, + { + "epoch": 2.1690335988915828, + "grad_norm": 0.6405359506607056, + "learning_rate": 7.835644937586685e-06, + "loss": 0.0542, + "step": 3131 + }, + { + "epoch": 2.169726359542778, + "grad_norm": 0.5968299508094788, + "learning_rate": 7.83495145631068e-06, + "loss": 0.0408, + "step": 3132 + }, + { + "epoch": 2.170419120193973, + "grad_norm": 0.6285966634750366, + "learning_rate": 7.834257975034675e-06, + "loss": 0.0465, + "step": 3133 + }, + { + "epoch": 2.171111880845168, + "grad_norm": 0.9761276841163635, + "learning_rate": 7.83356449375867e-06, + "loss": 0.0615, + "step": 3134 + }, + { + "epoch": 2.171804641496363, + "grad_norm": 0.686771035194397, + "learning_rate": 7.832871012482663e-06, + "loss": 0.0439, + "step": 3135 + }, + { + "epoch": 2.172497402147558, + "grad_norm": 0.6235953569412231, + "learning_rate": 7.832177531206658e-06, + "loss": 0.0471, + "step": 3136 + }, + { + "epoch": 2.173190162798753, + "grad_norm": 0.695323646068573, + "learning_rate": 7.831484049930652e-06, + "loss": 0.0584, + "step": 3137 + }, + { + "epoch": 2.173882923449948, + "grad_norm": 0.6751148104667664, + "learning_rate": 7.830790568654646e-06, + "loss": 0.0522, + "step": 3138 + }, + { + "epoch": 2.174575684101143, + "grad_norm": 0.6266228556632996, + "learning_rate": 7.830097087378641e-06, + "loss": 0.0501, + "step": 3139 + }, + { + "epoch": 2.175268444752338, + "grad_norm": 0.6762698888778687, + "learning_rate": 7.829403606102636e-06, + "loss": 0.0557, + "step": 3140 + }, + { + "epoch": 2.175961205403533, + "grad_norm": 0.6563423275947571, + "learning_rate": 7.828710124826631e-06, + "loss": 0.0479, + "step": 3141 + }, + { + "epoch": 2.176653966054728, + "grad_norm": 0.6651897430419922, + "learning_rate": 7.828016643550625e-06, + "loss": 0.055, + "step": 3142 + }, + { + "epoch": 2.177346726705923, + "grad_norm": 0.6874797344207764, + "learning_rate": 7.82732316227462e-06, + "loss": 0.0541, + "step": 3143 + }, + { + "epoch": 2.1780394873571183, + "grad_norm": 0.7515442371368408, + "learning_rate": 7.826629680998614e-06, + "loss": 0.0678, + "step": 3144 + }, + { + "epoch": 2.178732248008313, + "grad_norm": 0.7127979397773743, + "learning_rate": 7.825936199722608e-06, + "loss": 0.0633, + "step": 3145 + }, + { + "epoch": 2.179425008659508, + "grad_norm": 0.6155861020088196, + "learning_rate": 7.825242718446603e-06, + "loss": 0.0355, + "step": 3146 + }, + { + "epoch": 2.1801177693107032, + "grad_norm": 0.6179302930831909, + "learning_rate": 7.824549237170598e-06, + "loss": 0.0474, + "step": 3147 + }, + { + "epoch": 2.180810529961898, + "grad_norm": 0.6917999386787415, + "learning_rate": 7.82385575589459e-06, + "loss": 0.0581, + "step": 3148 + }, + { + "epoch": 2.1815032906130933, + "grad_norm": 0.7743158340454102, + "learning_rate": 7.823162274618586e-06, + "loss": 0.0614, + "step": 3149 + }, + { + "epoch": 2.182196051264288, + "grad_norm": 0.6892214417457581, + "learning_rate": 7.82246879334258e-06, + "loss": 0.0494, + "step": 3150 + }, + { + "epoch": 2.182888811915483, + "grad_norm": 0.6922516226768494, + "learning_rate": 7.821775312066576e-06, + "loss": 0.063, + "step": 3151 + }, + { + "epoch": 2.1835815725666783, + "grad_norm": 0.5864610075950623, + "learning_rate": 7.821081830790569e-06, + "loss": 0.0435, + "step": 3152 + }, + { + "epoch": 2.184274333217873, + "grad_norm": 0.6471571922302246, + "learning_rate": 7.820388349514564e-06, + "loss": 0.0538, + "step": 3153 + }, + { + "epoch": 2.1849670938690684, + "grad_norm": 0.707469642162323, + "learning_rate": 7.819694868238559e-06, + "loss": 0.0814, + "step": 3154 + }, + { + "epoch": 2.1856598545202632, + "grad_norm": 0.656551718711853, + "learning_rate": 7.819001386962552e-06, + "loss": 0.0591, + "step": 3155 + }, + { + "epoch": 2.186352615171458, + "grad_norm": 0.6638685464859009, + "learning_rate": 7.818307905686547e-06, + "loss": 0.0529, + "step": 3156 + }, + { + "epoch": 2.1870453758226533, + "grad_norm": 0.682731568813324, + "learning_rate": 7.817614424410542e-06, + "loss": 0.0583, + "step": 3157 + }, + { + "epoch": 2.187738136473848, + "grad_norm": 0.6498706340789795, + "learning_rate": 7.816920943134537e-06, + "loss": 0.0528, + "step": 3158 + }, + { + "epoch": 2.1884308971250435, + "grad_norm": 0.6912710666656494, + "learning_rate": 7.816227461858532e-06, + "loss": 0.0558, + "step": 3159 + }, + { + "epoch": 2.1891236577762383, + "grad_norm": 0.7095940709114075, + "learning_rate": 7.815533980582525e-06, + "loss": 0.0612, + "step": 3160 + }, + { + "epoch": 2.189816418427433, + "grad_norm": 0.7736082673072815, + "learning_rate": 7.81484049930652e-06, + "loss": 0.0512, + "step": 3161 + }, + { + "epoch": 2.1905091790786284, + "grad_norm": 0.6644687056541443, + "learning_rate": 7.814147018030513e-06, + "loss": 0.0533, + "step": 3162 + }, + { + "epoch": 2.1912019397298232, + "grad_norm": 0.7022423148155212, + "learning_rate": 7.813453536754508e-06, + "loss": 0.0583, + "step": 3163 + }, + { + "epoch": 2.1918947003810185, + "grad_norm": 0.7000126838684082, + "learning_rate": 7.812760055478503e-06, + "loss": 0.0467, + "step": 3164 + }, + { + "epoch": 2.1925874610322134, + "grad_norm": 0.6936020851135254, + "learning_rate": 7.812066574202498e-06, + "loss": 0.0631, + "step": 3165 + }, + { + "epoch": 2.193280221683408, + "grad_norm": 0.64175945520401, + "learning_rate": 7.811373092926493e-06, + "loss": 0.0412, + "step": 3166 + }, + { + "epoch": 2.1939729823346035, + "grad_norm": 0.7573792338371277, + "learning_rate": 7.810679611650486e-06, + "loss": 0.0539, + "step": 3167 + }, + { + "epoch": 2.1946657429857983, + "grad_norm": 0.6865546703338623, + "learning_rate": 7.809986130374481e-06, + "loss": 0.0783, + "step": 3168 + }, + { + "epoch": 2.1953585036369936, + "grad_norm": 0.6688504219055176, + "learning_rate": 7.809292649098476e-06, + "loss": 0.0454, + "step": 3169 + }, + { + "epoch": 2.1960512642881884, + "grad_norm": 0.5821261405944824, + "learning_rate": 7.808599167822469e-06, + "loss": 0.0464, + "step": 3170 + }, + { + "epoch": 2.1967440249393833, + "grad_norm": 0.6367770433425903, + "learning_rate": 7.807905686546464e-06, + "loss": 0.0474, + "step": 3171 + }, + { + "epoch": 2.1974367855905785, + "grad_norm": 0.6454951763153076, + "learning_rate": 7.807212205270457e-06, + "loss": 0.0518, + "step": 3172 + }, + { + "epoch": 2.1981295462417734, + "grad_norm": 0.6782212257385254, + "learning_rate": 7.806518723994452e-06, + "loss": 0.0538, + "step": 3173 + }, + { + "epoch": 2.1988223068929686, + "grad_norm": 0.7437390685081482, + "learning_rate": 7.805825242718447e-06, + "loss": 0.0503, + "step": 3174 + }, + { + "epoch": 2.1995150675441635, + "grad_norm": 0.6945703029632568, + "learning_rate": 7.805131761442442e-06, + "loss": 0.0631, + "step": 3175 + }, + { + "epoch": 2.2002078281953583, + "grad_norm": 0.654305636882782, + "learning_rate": 7.804438280166437e-06, + "loss": 0.0502, + "step": 3176 + }, + { + "epoch": 2.2009005888465536, + "grad_norm": 0.6783000230789185, + "learning_rate": 7.80374479889043e-06, + "loss": 0.0527, + "step": 3177 + }, + { + "epoch": 2.2015933494977484, + "grad_norm": 0.6859548687934875, + "learning_rate": 7.803051317614425e-06, + "loss": 0.0615, + "step": 3178 + }, + { + "epoch": 2.2022861101489437, + "grad_norm": 0.6832144260406494, + "learning_rate": 7.80235783633842e-06, + "loss": 0.0493, + "step": 3179 + }, + { + "epoch": 2.2029788708001385, + "grad_norm": 0.6961016654968262, + "learning_rate": 7.801664355062413e-06, + "loss": 0.0651, + "step": 3180 + }, + { + "epoch": 2.2036716314513334, + "grad_norm": 0.5712021589279175, + "learning_rate": 7.800970873786408e-06, + "loss": 0.0347, + "step": 3181 + }, + { + "epoch": 2.2043643921025287, + "grad_norm": 0.6744644045829773, + "learning_rate": 7.800277392510403e-06, + "loss": 0.0432, + "step": 3182 + }, + { + "epoch": 2.2050571527537235, + "grad_norm": 0.6067906618118286, + "learning_rate": 7.799583911234398e-06, + "loss": 0.0512, + "step": 3183 + }, + { + "epoch": 2.2057499134049188, + "grad_norm": 0.6815451383590698, + "learning_rate": 7.798890429958393e-06, + "loss": 0.0593, + "step": 3184 + }, + { + "epoch": 2.2064426740561136, + "grad_norm": 0.7582570910453796, + "learning_rate": 7.798196948682386e-06, + "loss": 0.0436, + "step": 3185 + }, + { + "epoch": 2.2071354347073084, + "grad_norm": 0.7396038770675659, + "learning_rate": 7.797503467406381e-06, + "loss": 0.0617, + "step": 3186 + }, + { + "epoch": 2.2078281953585037, + "grad_norm": 0.700670599937439, + "learning_rate": 7.796809986130375e-06, + "loss": 0.0437, + "step": 3187 + }, + { + "epoch": 2.2085209560096986, + "grad_norm": 0.7770943641662598, + "learning_rate": 7.79611650485437e-06, + "loss": 0.0491, + "step": 3188 + }, + { + "epoch": 2.209213716660894, + "grad_norm": 0.6324613094329834, + "learning_rate": 7.795423023578364e-06, + "loss": 0.0495, + "step": 3189 + }, + { + "epoch": 2.2099064773120887, + "grad_norm": 0.6112574934959412, + "learning_rate": 7.794729542302358e-06, + "loss": 0.0442, + "step": 3190 + }, + { + "epoch": 2.2105992379632835, + "grad_norm": 0.7486513257026672, + "learning_rate": 7.794036061026353e-06, + "loss": 0.0557, + "step": 3191 + }, + { + "epoch": 2.211291998614479, + "grad_norm": 0.7533448934555054, + "learning_rate": 7.793342579750347e-06, + "loss": 0.0555, + "step": 3192 + }, + { + "epoch": 2.2119847592656736, + "grad_norm": 0.7420341968536377, + "learning_rate": 7.792649098474342e-06, + "loss": 0.0565, + "step": 3193 + }, + { + "epoch": 2.212677519916869, + "grad_norm": 0.645088255405426, + "learning_rate": 7.791955617198337e-06, + "loss": 0.0545, + "step": 3194 + }, + { + "epoch": 2.2133702805680637, + "grad_norm": 0.6115416884422302, + "learning_rate": 7.79126213592233e-06, + "loss": 0.0562, + "step": 3195 + }, + { + "epoch": 2.2140630412192586, + "grad_norm": 0.6269606351852417, + "learning_rate": 7.790568654646326e-06, + "loss": 0.0527, + "step": 3196 + }, + { + "epoch": 2.214755801870454, + "grad_norm": 0.6584285497665405, + "learning_rate": 7.789875173370319e-06, + "loss": 0.0485, + "step": 3197 + }, + { + "epoch": 2.2154485625216487, + "grad_norm": 0.6327019333839417, + "learning_rate": 7.789181692094314e-06, + "loss": 0.0445, + "step": 3198 + }, + { + "epoch": 2.216141323172844, + "grad_norm": 0.6777721643447876, + "learning_rate": 7.788488210818309e-06, + "loss": 0.0549, + "step": 3199 + }, + { + "epoch": 2.216834083824039, + "grad_norm": 0.6763327121734619, + "learning_rate": 7.787794729542304e-06, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 2.2175268444752336, + "grad_norm": 0.6816011071205139, + "learning_rate": 7.787101248266299e-06, + "loss": 0.0655, + "step": 3201 + }, + { + "epoch": 2.218219605126429, + "grad_norm": 0.611720621585846, + "learning_rate": 7.786407766990292e-06, + "loss": 0.0466, + "step": 3202 + }, + { + "epoch": 2.2189123657776237, + "grad_norm": 0.630315899848938, + "learning_rate": 7.785714285714287e-06, + "loss": 0.0566, + "step": 3203 + }, + { + "epoch": 2.219605126428819, + "grad_norm": 0.7050280570983887, + "learning_rate": 7.785020804438282e-06, + "loss": 0.0589, + "step": 3204 + }, + { + "epoch": 2.220297887080014, + "grad_norm": 0.7381419539451599, + "learning_rate": 7.784327323162275e-06, + "loss": 0.0646, + "step": 3205 + }, + { + "epoch": 2.2209906477312087, + "grad_norm": 0.6707060933113098, + "learning_rate": 7.78363384188627e-06, + "loss": 0.056, + "step": 3206 + }, + { + "epoch": 2.221683408382404, + "grad_norm": 0.5947423577308655, + "learning_rate": 7.782940360610263e-06, + "loss": 0.0509, + "step": 3207 + }, + { + "epoch": 2.222376169033599, + "grad_norm": 0.6475197672843933, + "learning_rate": 7.782246879334258e-06, + "loss": 0.0503, + "step": 3208 + }, + { + "epoch": 2.223068929684794, + "grad_norm": 0.6982625722885132, + "learning_rate": 7.781553398058253e-06, + "loss": 0.0631, + "step": 3209 + }, + { + "epoch": 2.223761690335989, + "grad_norm": 0.7394846677780151, + "learning_rate": 7.780859916782248e-06, + "loss": 0.0618, + "step": 3210 + }, + { + "epoch": 2.2244544509871838, + "grad_norm": 0.7193068265914917, + "learning_rate": 7.780166435506243e-06, + "loss": 0.0491, + "step": 3211 + }, + { + "epoch": 2.225147211638379, + "grad_norm": 0.5738288760185242, + "learning_rate": 7.779472954230236e-06, + "loss": 0.0443, + "step": 3212 + }, + { + "epoch": 2.225839972289574, + "grad_norm": 0.7070469856262207, + "learning_rate": 7.778779472954231e-06, + "loss": 0.0439, + "step": 3213 + }, + { + "epoch": 2.226532732940769, + "grad_norm": 0.6029460430145264, + "learning_rate": 7.778085991678226e-06, + "loss": 0.0482, + "step": 3214 + }, + { + "epoch": 2.227225493591964, + "grad_norm": 0.6252948045730591, + "learning_rate": 7.777392510402219e-06, + "loss": 0.0447, + "step": 3215 + }, + { + "epoch": 2.227918254243159, + "grad_norm": 0.6637809872627258, + "learning_rate": 7.776699029126214e-06, + "loss": 0.0524, + "step": 3216 + }, + { + "epoch": 2.228611014894354, + "grad_norm": 0.7449789643287659, + "learning_rate": 7.776005547850209e-06, + "loss": 0.0714, + "step": 3217 + }, + { + "epoch": 2.229303775545549, + "grad_norm": 0.6193034648895264, + "learning_rate": 7.775312066574204e-06, + "loss": 0.0385, + "step": 3218 + }, + { + "epoch": 2.229996536196744, + "grad_norm": 0.702811598777771, + "learning_rate": 7.774618585298199e-06, + "loss": 0.0538, + "step": 3219 + }, + { + "epoch": 2.230689296847939, + "grad_norm": 0.746298611164093, + "learning_rate": 7.773925104022192e-06, + "loss": 0.0549, + "step": 3220 + }, + { + "epoch": 2.231382057499134, + "grad_norm": 0.6173125505447388, + "learning_rate": 7.773231622746187e-06, + "loss": 0.0422, + "step": 3221 + }, + { + "epoch": 2.232074818150329, + "grad_norm": 0.7027741074562073, + "learning_rate": 7.77253814147018e-06, + "loss": 0.0492, + "step": 3222 + }, + { + "epoch": 2.232767578801524, + "grad_norm": 0.7909572720527649, + "learning_rate": 7.771844660194175e-06, + "loss": 0.0701, + "step": 3223 + }, + { + "epoch": 2.2334603394527193, + "grad_norm": 0.6326325535774231, + "learning_rate": 7.77115117891817e-06, + "loss": 0.0649, + "step": 3224 + }, + { + "epoch": 2.234153100103914, + "grad_norm": 0.6022499203681946, + "learning_rate": 7.770457697642163e-06, + "loss": 0.0497, + "step": 3225 + }, + { + "epoch": 2.234845860755109, + "grad_norm": 0.7148244976997375, + "learning_rate": 7.769764216366158e-06, + "loss": 0.0548, + "step": 3226 + }, + { + "epoch": 2.235538621406304, + "grad_norm": 0.7357353568077087, + "learning_rate": 7.769070735090153e-06, + "loss": 0.0527, + "step": 3227 + }, + { + "epoch": 2.236231382057499, + "grad_norm": 0.6361616253852844, + "learning_rate": 7.768377253814148e-06, + "loss": 0.0506, + "step": 3228 + }, + { + "epoch": 2.2369241427086943, + "grad_norm": 0.6656021475791931, + "learning_rate": 7.767683772538143e-06, + "loss": 0.0611, + "step": 3229 + }, + { + "epoch": 2.237616903359889, + "grad_norm": 0.7853916883468628, + "learning_rate": 7.766990291262136e-06, + "loss": 0.05, + "step": 3230 + }, + { + "epoch": 2.238309664011084, + "grad_norm": 0.7821096181869507, + "learning_rate": 7.766296809986131e-06, + "loss": 0.0473, + "step": 3231 + }, + { + "epoch": 2.2390024246622793, + "grad_norm": 0.7403864860534668, + "learning_rate": 7.765603328710125e-06, + "loss": 0.0601, + "step": 3232 + }, + { + "epoch": 2.239695185313474, + "grad_norm": 0.8113395571708679, + "learning_rate": 7.76490984743412e-06, + "loss": 0.0598, + "step": 3233 + }, + { + "epoch": 2.2403879459646694, + "grad_norm": 0.6732940077781677, + "learning_rate": 7.764216366158114e-06, + "loss": 0.0508, + "step": 3234 + }, + { + "epoch": 2.2410807066158642, + "grad_norm": 0.735000729560852, + "learning_rate": 7.76352288488211e-06, + "loss": 0.069, + "step": 3235 + }, + { + "epoch": 2.241773467267059, + "grad_norm": 0.6428692936897278, + "learning_rate": 7.762829403606104e-06, + "loss": 0.0511, + "step": 3236 + }, + { + "epoch": 2.2424662279182543, + "grad_norm": 0.7943806648254395, + "learning_rate": 7.762135922330097e-06, + "loss": 0.0669, + "step": 3237 + }, + { + "epoch": 2.243158988569449, + "grad_norm": 0.6358294486999512, + "learning_rate": 7.761442441054092e-06, + "loss": 0.0482, + "step": 3238 + }, + { + "epoch": 2.2438517492206445, + "grad_norm": 0.6320732831954956, + "learning_rate": 7.760748959778087e-06, + "loss": 0.0478, + "step": 3239 + }, + { + "epoch": 2.2445445098718393, + "grad_norm": 0.8697232604026794, + "learning_rate": 7.76005547850208e-06, + "loss": 0.0598, + "step": 3240 + }, + { + "epoch": 2.245237270523034, + "grad_norm": 0.5765998959541321, + "learning_rate": 7.759361997226076e-06, + "loss": 0.0458, + "step": 3241 + }, + { + "epoch": 2.2459300311742294, + "grad_norm": 0.6806704998016357, + "learning_rate": 7.75866851595007e-06, + "loss": 0.0681, + "step": 3242 + }, + { + "epoch": 2.2466227918254242, + "grad_norm": 0.5612189173698425, + "learning_rate": 7.757975034674065e-06, + "loss": 0.0433, + "step": 3243 + }, + { + "epoch": 2.2473155524766195, + "grad_norm": 0.6763518452644348, + "learning_rate": 7.757281553398059e-06, + "loss": 0.0591, + "step": 3244 + }, + { + "epoch": 2.2480083131278144, + "grad_norm": 0.7218267917633057, + "learning_rate": 7.756588072122054e-06, + "loss": 0.0678, + "step": 3245 + }, + { + "epoch": 2.248701073779009, + "grad_norm": 0.5798293352127075, + "learning_rate": 7.755894590846048e-06, + "loss": 0.0378, + "step": 3246 + }, + { + "epoch": 2.2493938344302045, + "grad_norm": 0.6329561471939087, + "learning_rate": 7.755201109570042e-06, + "loss": 0.0493, + "step": 3247 + }, + { + "epoch": 2.2500865950813993, + "grad_norm": 0.8438361287117004, + "learning_rate": 7.754507628294037e-06, + "loss": 0.0567, + "step": 3248 + }, + { + "epoch": 2.2507793557325946, + "grad_norm": 0.6507179737091064, + "learning_rate": 7.753814147018032e-06, + "loss": 0.0466, + "step": 3249 + }, + { + "epoch": 2.2514721163837894, + "grad_norm": 0.6499149799346924, + "learning_rate": 7.753120665742025e-06, + "loss": 0.0614, + "step": 3250 + }, + { + "epoch": 2.2521648770349842, + "grad_norm": 0.7082823514938354, + "learning_rate": 7.75242718446602e-06, + "loss": 0.0567, + "step": 3251 + }, + { + "epoch": 2.2528576376861795, + "grad_norm": 0.8208842873573303, + "learning_rate": 7.751733703190015e-06, + "loss": 0.0416, + "step": 3252 + }, + { + "epoch": 2.2535503983373744, + "grad_norm": 0.7298017144203186, + "learning_rate": 7.75104022191401e-06, + "loss": 0.0558, + "step": 3253 + }, + { + "epoch": 2.2542431589885696, + "grad_norm": 0.6573927402496338, + "learning_rate": 7.750346740638005e-06, + "loss": 0.0467, + "step": 3254 + }, + { + "epoch": 2.2549359196397645, + "grad_norm": 0.7816340923309326, + "learning_rate": 7.749653259361998e-06, + "loss": 0.0587, + "step": 3255 + }, + { + "epoch": 2.2556286802909593, + "grad_norm": 0.9797429442405701, + "learning_rate": 7.748959778085993e-06, + "loss": 0.0526, + "step": 3256 + }, + { + "epoch": 2.2563214409421546, + "grad_norm": 0.8206029534339905, + "learning_rate": 7.748266296809986e-06, + "loss": 0.0782, + "step": 3257 + }, + { + "epoch": 2.2570142015933494, + "grad_norm": 0.6237604022026062, + "learning_rate": 7.747572815533981e-06, + "loss": 0.0525, + "step": 3258 + }, + { + "epoch": 2.2577069622445447, + "grad_norm": 0.5808584094047546, + "learning_rate": 7.746879334257976e-06, + "loss": 0.051, + "step": 3259 + }, + { + "epoch": 2.2583997228957395, + "grad_norm": 0.6812832355499268, + "learning_rate": 7.74618585298197e-06, + "loss": 0.05, + "step": 3260 + }, + { + "epoch": 2.2590924835469344, + "grad_norm": 0.6276348829269409, + "learning_rate": 7.745492371705966e-06, + "loss": 0.0453, + "step": 3261 + }, + { + "epoch": 2.2597852441981297, + "grad_norm": 0.6594873070716858, + "learning_rate": 7.744798890429959e-06, + "loss": 0.0522, + "step": 3262 + }, + { + "epoch": 2.2604780048493245, + "grad_norm": 0.6078917980194092, + "learning_rate": 7.744105409153954e-06, + "loss": 0.0524, + "step": 3263 + }, + { + "epoch": 2.2611707655005198, + "grad_norm": 0.7057173252105713, + "learning_rate": 7.743411927877947e-06, + "loss": 0.0725, + "step": 3264 + }, + { + "epoch": 2.2618635261517146, + "grad_norm": 0.8082737922668457, + "learning_rate": 7.742718446601942e-06, + "loss": 0.0569, + "step": 3265 + }, + { + "epoch": 2.2625562868029094, + "grad_norm": 0.7110036015510559, + "learning_rate": 7.742024965325937e-06, + "loss": 0.0537, + "step": 3266 + }, + { + "epoch": 2.2632490474541047, + "grad_norm": 0.6635528206825256, + "learning_rate": 7.74133148404993e-06, + "loss": 0.0534, + "step": 3267 + }, + { + "epoch": 2.2639418081052995, + "grad_norm": 0.7297467589378357, + "learning_rate": 7.740638002773925e-06, + "loss": 0.0476, + "step": 3268 + }, + { + "epoch": 2.2646345687564944, + "grad_norm": 0.629737377166748, + "learning_rate": 7.73994452149792e-06, + "loss": 0.0504, + "step": 3269 + }, + { + "epoch": 2.2653273294076897, + "grad_norm": 0.7582229971885681, + "learning_rate": 7.739251040221915e-06, + "loss": 0.0601, + "step": 3270 + }, + { + "epoch": 2.2660200900588845, + "grad_norm": 0.8479446172714233, + "learning_rate": 7.73855755894591e-06, + "loss": 0.0457, + "step": 3271 + }, + { + "epoch": 2.2667128507100798, + "grad_norm": 0.7234289050102234, + "learning_rate": 7.737864077669903e-06, + "loss": 0.0508, + "step": 3272 + }, + { + "epoch": 2.2674056113612746, + "grad_norm": 0.653535008430481, + "learning_rate": 7.737170596393898e-06, + "loss": 0.061, + "step": 3273 + }, + { + "epoch": 2.26809837201247, + "grad_norm": 0.6727892160415649, + "learning_rate": 7.736477115117891e-06, + "loss": 0.043, + "step": 3274 + }, + { + "epoch": 2.2687911326636647, + "grad_norm": 0.7123434543609619, + "learning_rate": 7.735783633841886e-06, + "loss": 0.0571, + "step": 3275 + }, + { + "epoch": 2.2694838933148596, + "grad_norm": 0.6722378134727478, + "learning_rate": 7.735090152565881e-06, + "loss": 0.0449, + "step": 3276 + }, + { + "epoch": 2.270176653966055, + "grad_norm": 0.8674989342689514, + "learning_rate": 7.734396671289876e-06, + "loss": 0.0588, + "step": 3277 + }, + { + "epoch": 2.2708694146172497, + "grad_norm": 0.6619544625282288, + "learning_rate": 7.733703190013871e-06, + "loss": 0.0478, + "step": 3278 + }, + { + "epoch": 2.2715621752684445, + "grad_norm": 0.7146316766738892, + "learning_rate": 7.733009708737864e-06, + "loss": 0.0578, + "step": 3279 + }, + { + "epoch": 2.27225493591964, + "grad_norm": 0.6067398190498352, + "learning_rate": 7.73231622746186e-06, + "loss": 0.0427, + "step": 3280 + }, + { + "epoch": 2.2729476965708346, + "grad_norm": 0.7777449488639832, + "learning_rate": 7.731622746185854e-06, + "loss": 0.0513, + "step": 3281 + }, + { + "epoch": 2.27364045722203, + "grad_norm": 0.6652003526687622, + "learning_rate": 7.730929264909847e-06, + "loss": 0.0685, + "step": 3282 + }, + { + "epoch": 2.2743332178732247, + "grad_norm": 0.7848238348960876, + "learning_rate": 7.730235783633842e-06, + "loss": 0.0771, + "step": 3283 + }, + { + "epoch": 2.27502597852442, + "grad_norm": 0.6617704033851624, + "learning_rate": 7.729542302357836e-06, + "loss": 0.0468, + "step": 3284 + }, + { + "epoch": 2.275718739175615, + "grad_norm": 0.712853729724884, + "learning_rate": 7.72884882108183e-06, + "loss": 0.0644, + "step": 3285 + }, + { + "epoch": 2.2764114998268097, + "grad_norm": 0.6999899744987488, + "learning_rate": 7.728155339805825e-06, + "loss": 0.0533, + "step": 3286 + }, + { + "epoch": 2.277104260478005, + "grad_norm": 0.790968120098114, + "learning_rate": 7.72746185852982e-06, + "loss": 0.0718, + "step": 3287 + }, + { + "epoch": 2.2777970211292, + "grad_norm": 0.6867518424987793, + "learning_rate": 7.726768377253815e-06, + "loss": 0.0668, + "step": 3288 + }, + { + "epoch": 2.2784897817803946, + "grad_norm": 0.6289021968841553, + "learning_rate": 7.726074895977809e-06, + "loss": 0.0573, + "step": 3289 + }, + { + "epoch": 2.27918254243159, + "grad_norm": 0.7025508880615234, + "learning_rate": 7.725381414701804e-06, + "loss": 0.0459, + "step": 3290 + }, + { + "epoch": 2.2798753030827847, + "grad_norm": 0.6779745817184448, + "learning_rate": 7.724687933425798e-06, + "loss": 0.0444, + "step": 3291 + }, + { + "epoch": 2.28056806373398, + "grad_norm": 0.6166813969612122, + "learning_rate": 7.723994452149792e-06, + "loss": 0.0617, + "step": 3292 + }, + { + "epoch": 2.281260824385175, + "grad_norm": 0.766677975654602, + "learning_rate": 7.723300970873787e-06, + "loss": 0.0506, + "step": 3293 + }, + { + "epoch": 2.28195358503637, + "grad_norm": 0.9004775285720825, + "learning_rate": 7.722607489597782e-06, + "loss": 0.063, + "step": 3294 + }, + { + "epoch": 2.282646345687565, + "grad_norm": 0.7935314774513245, + "learning_rate": 7.721914008321777e-06, + "loss": 0.0546, + "step": 3295 + }, + { + "epoch": 2.28333910633876, + "grad_norm": 0.7481679916381836, + "learning_rate": 7.721220527045771e-06, + "loss": 0.0589, + "step": 3296 + }, + { + "epoch": 2.284031866989955, + "grad_norm": 0.7239437103271484, + "learning_rate": 7.720527045769765e-06, + "loss": 0.0427, + "step": 3297 + }, + { + "epoch": 2.28472462764115, + "grad_norm": 0.7293107509613037, + "learning_rate": 7.71983356449376e-06, + "loss": 0.0555, + "step": 3298 + }, + { + "epoch": 2.2854173882923448, + "grad_norm": 0.6999469995498657, + "learning_rate": 7.719140083217753e-06, + "loss": 0.0713, + "step": 3299 + }, + { + "epoch": 2.28611014894354, + "grad_norm": 0.6839814186096191, + "learning_rate": 7.718446601941748e-06, + "loss": 0.049, + "step": 3300 + }, + { + "epoch": 2.286802909594735, + "grad_norm": 0.6535493731498718, + "learning_rate": 7.717753120665743e-06, + "loss": 0.0533, + "step": 3301 + }, + { + "epoch": 2.28749567024593, + "grad_norm": 0.6733627319335938, + "learning_rate": 7.717059639389736e-06, + "loss": 0.0628, + "step": 3302 + }, + { + "epoch": 2.288188430897125, + "grad_norm": 0.7033507823944092, + "learning_rate": 7.716366158113731e-06, + "loss": 0.0473, + "step": 3303 + }, + { + "epoch": 2.2888811915483203, + "grad_norm": 0.8091992139816284, + "learning_rate": 7.715672676837726e-06, + "loss": 0.0715, + "step": 3304 + }, + { + "epoch": 2.289573952199515, + "grad_norm": 0.6329368352890015, + "learning_rate": 7.71497919556172e-06, + "loss": 0.0407, + "step": 3305 + }, + { + "epoch": 2.29026671285071, + "grad_norm": 0.6877524852752686, + "learning_rate": 7.714285714285716e-06, + "loss": 0.0586, + "step": 3306 + }, + { + "epoch": 2.290959473501905, + "grad_norm": 0.7619491815567017, + "learning_rate": 7.713592233009709e-06, + "loss": 0.0565, + "step": 3307 + }, + { + "epoch": 2.2916522341531, + "grad_norm": 0.6912320256233215, + "learning_rate": 7.712898751733704e-06, + "loss": 0.0632, + "step": 3308 + }, + { + "epoch": 2.292344994804295, + "grad_norm": 0.7955142259597778, + "learning_rate": 7.712205270457697e-06, + "loss": 0.0456, + "step": 3309 + }, + { + "epoch": 2.29303775545549, + "grad_norm": 0.8076238036155701, + "learning_rate": 7.711511789181692e-06, + "loss": 0.066, + "step": 3310 + }, + { + "epoch": 2.293730516106685, + "grad_norm": 0.6438044905662537, + "learning_rate": 7.710818307905687e-06, + "loss": 0.0508, + "step": 3311 + }, + { + "epoch": 2.2944232767578803, + "grad_norm": 0.6734865307807922, + "learning_rate": 7.710124826629682e-06, + "loss": 0.0561, + "step": 3312 + }, + { + "epoch": 2.295116037409075, + "grad_norm": 0.6374542117118835, + "learning_rate": 7.709431345353677e-06, + "loss": 0.0511, + "step": 3313 + }, + { + "epoch": 2.2958087980602704, + "grad_norm": 0.7424620985984802, + "learning_rate": 7.70873786407767e-06, + "loss": 0.0665, + "step": 3314 + }, + { + "epoch": 2.296501558711465, + "grad_norm": 0.6437535881996155, + "learning_rate": 7.708044382801665e-06, + "loss": 0.0526, + "step": 3315 + }, + { + "epoch": 2.29719431936266, + "grad_norm": 0.674333930015564, + "learning_rate": 7.70735090152566e-06, + "loss": 0.061, + "step": 3316 + }, + { + "epoch": 2.2978870800138553, + "grad_norm": 0.759538471698761, + "learning_rate": 7.706657420249653e-06, + "loss": 0.0749, + "step": 3317 + }, + { + "epoch": 2.29857984066505, + "grad_norm": 0.7620114088058472, + "learning_rate": 7.705963938973648e-06, + "loss": 0.072, + "step": 3318 + }, + { + "epoch": 2.299272601316245, + "grad_norm": 0.7666256427764893, + "learning_rate": 7.705270457697643e-06, + "loss": 0.0544, + "step": 3319 + }, + { + "epoch": 2.2999653619674403, + "grad_norm": 0.703120768070221, + "learning_rate": 7.704576976421638e-06, + "loss": 0.05, + "step": 3320 + }, + { + "epoch": 2.300658122618635, + "grad_norm": 0.6174482107162476, + "learning_rate": 7.703883495145631e-06, + "loss": 0.0411, + "step": 3321 + }, + { + "epoch": 2.3013508832698304, + "grad_norm": 0.7337712049484253, + "learning_rate": 7.703190013869626e-06, + "loss": 0.0749, + "step": 3322 + }, + { + "epoch": 2.3020436439210252, + "grad_norm": 0.754616379737854, + "learning_rate": 7.702496532593621e-06, + "loss": 0.0542, + "step": 3323 + }, + { + "epoch": 2.3027364045722205, + "grad_norm": 0.5999162793159485, + "learning_rate": 7.701803051317614e-06, + "loss": 0.0453, + "step": 3324 + }, + { + "epoch": 2.3034291652234153, + "grad_norm": 0.5953550338745117, + "learning_rate": 7.70110957004161e-06, + "loss": 0.0388, + "step": 3325 + }, + { + "epoch": 2.30412192587461, + "grad_norm": 0.6107983589172363, + "learning_rate": 7.700416088765604e-06, + "loss": 0.0504, + "step": 3326 + }, + { + "epoch": 2.3048146865258055, + "grad_norm": 0.625367283821106, + "learning_rate": 7.699722607489597e-06, + "loss": 0.0414, + "step": 3327 + }, + { + "epoch": 2.3055074471770003, + "grad_norm": 0.6652399301528931, + "learning_rate": 7.699029126213592e-06, + "loss": 0.0479, + "step": 3328 + }, + { + "epoch": 2.306200207828195, + "grad_norm": 0.6237362623214722, + "learning_rate": 7.698335644937587e-06, + "loss": 0.0556, + "step": 3329 + }, + { + "epoch": 2.3068929684793904, + "grad_norm": 0.707344114780426, + "learning_rate": 7.697642163661582e-06, + "loss": 0.0503, + "step": 3330 + }, + { + "epoch": 2.3075857291305852, + "grad_norm": 0.5936099290847778, + "learning_rate": 7.696948682385577e-06, + "loss": 0.0484, + "step": 3331 + }, + { + "epoch": 2.3082784897817805, + "grad_norm": 0.6967319250106812, + "learning_rate": 7.69625520110957e-06, + "loss": 0.0482, + "step": 3332 + }, + { + "epoch": 2.3089712504329754, + "grad_norm": 0.6179273128509521, + "learning_rate": 7.695561719833565e-06, + "loss": 0.0512, + "step": 3333 + }, + { + "epoch": 2.3096640110841706, + "grad_norm": 0.6576906442642212, + "learning_rate": 7.694868238557559e-06, + "loss": 0.0524, + "step": 3334 + }, + { + "epoch": 2.3103567717353655, + "grad_norm": 0.6755362749099731, + "learning_rate": 7.694174757281554e-06, + "loss": 0.059, + "step": 3335 + }, + { + "epoch": 2.3110495323865603, + "grad_norm": 0.6898190379142761, + "learning_rate": 7.693481276005548e-06, + "loss": 0.0511, + "step": 3336 + }, + { + "epoch": 2.3117422930377556, + "grad_norm": 0.7592430114746094, + "learning_rate": 7.692787794729543e-06, + "loss": 0.0711, + "step": 3337 + }, + { + "epoch": 2.3124350536889504, + "grad_norm": 0.7317067384719849, + "learning_rate": 7.692094313453538e-06, + "loss": 0.0535, + "step": 3338 + }, + { + "epoch": 2.3131278143401452, + "grad_norm": 0.6594251990318298, + "learning_rate": 7.691400832177532e-06, + "loss": 0.0502, + "step": 3339 + }, + { + "epoch": 2.3138205749913405, + "grad_norm": 0.6752980351448059, + "learning_rate": 7.690707350901526e-06, + "loss": 0.0561, + "step": 3340 + }, + { + "epoch": 2.3145133356425354, + "grad_norm": 0.7381166815757751, + "learning_rate": 7.690013869625521e-06, + "loss": 0.059, + "step": 3341 + }, + { + "epoch": 2.3152060962937306, + "grad_norm": 0.8149734139442444, + "learning_rate": 7.689320388349515e-06, + "loss": 0.07, + "step": 3342 + }, + { + "epoch": 2.3158988569449255, + "grad_norm": 0.6074904203414917, + "learning_rate": 7.68862690707351e-06, + "loss": 0.0454, + "step": 3343 + }, + { + "epoch": 2.3165916175961208, + "grad_norm": 0.7600086331367493, + "learning_rate": 7.687933425797503e-06, + "loss": 0.0617, + "step": 3344 + }, + { + "epoch": 2.3172843782473156, + "grad_norm": 0.6978055238723755, + "learning_rate": 7.687239944521498e-06, + "loss": 0.0532, + "step": 3345 + }, + { + "epoch": 2.3179771388985104, + "grad_norm": 0.5998455882072449, + "learning_rate": 7.686546463245493e-06, + "loss": 0.0547, + "step": 3346 + }, + { + "epoch": 2.3186698995497057, + "grad_norm": 0.7415463328361511, + "learning_rate": 7.685852981969488e-06, + "loss": 0.0546, + "step": 3347 + }, + { + "epoch": 2.3193626602009005, + "grad_norm": 0.6985805034637451, + "learning_rate": 7.685159500693483e-06, + "loss": 0.0598, + "step": 3348 + }, + { + "epoch": 2.3200554208520954, + "grad_norm": 0.7012605667114258, + "learning_rate": 7.684466019417476e-06, + "loss": 0.0643, + "step": 3349 + }, + { + "epoch": 2.3207481815032907, + "grad_norm": 0.7899414896965027, + "learning_rate": 7.68377253814147e-06, + "loss": 0.0494, + "step": 3350 + }, + { + "epoch": 2.3214409421544855, + "grad_norm": 0.7159631848335266, + "learning_rate": 7.683079056865466e-06, + "loss": 0.0545, + "step": 3351 + }, + { + "epoch": 2.3221337028056808, + "grad_norm": 0.6470344066619873, + "learning_rate": 7.682385575589459e-06, + "loss": 0.0501, + "step": 3352 + }, + { + "epoch": 2.3228264634568756, + "grad_norm": 0.6960127353668213, + "learning_rate": 7.681692094313454e-06, + "loss": 0.0595, + "step": 3353 + }, + { + "epoch": 2.323519224108071, + "grad_norm": 0.6139276027679443, + "learning_rate": 7.680998613037449e-06, + "loss": 0.0559, + "step": 3354 + }, + { + "epoch": 2.3242119847592657, + "grad_norm": 0.6615197062492371, + "learning_rate": 7.680305131761444e-06, + "loss": 0.0578, + "step": 3355 + }, + { + "epoch": 2.3249047454104605, + "grad_norm": 0.6286178231239319, + "learning_rate": 7.679611650485439e-06, + "loss": 0.0565, + "step": 3356 + }, + { + "epoch": 2.325597506061656, + "grad_norm": 0.7141321301460266, + "learning_rate": 7.678918169209432e-06, + "loss": 0.044, + "step": 3357 + }, + { + "epoch": 2.3262902667128507, + "grad_norm": 0.7097179889678955, + "learning_rate": 7.678224687933427e-06, + "loss": 0.0537, + "step": 3358 + }, + { + "epoch": 2.3269830273640455, + "grad_norm": 0.7265419363975525, + "learning_rate": 7.67753120665742e-06, + "loss": 0.0714, + "step": 3359 + }, + { + "epoch": 2.3276757880152408, + "grad_norm": 0.5918153524398804, + "learning_rate": 7.676837725381415e-06, + "loss": 0.0411, + "step": 3360 + }, + { + "epoch": 2.3283685486664356, + "grad_norm": 0.7092829942703247, + "learning_rate": 7.67614424410541e-06, + "loss": 0.0545, + "step": 3361 + }, + { + "epoch": 2.329061309317631, + "grad_norm": 0.6691579222679138, + "learning_rate": 7.675450762829403e-06, + "loss": 0.0616, + "step": 3362 + }, + { + "epoch": 2.3297540699688257, + "grad_norm": 0.7718387842178345, + "learning_rate": 7.674757281553398e-06, + "loss": 0.0662, + "step": 3363 + }, + { + "epoch": 2.330446830620021, + "grad_norm": 0.7115030884742737, + "learning_rate": 7.674063800277393e-06, + "loss": 0.0429, + "step": 3364 + }, + { + "epoch": 2.331139591271216, + "grad_norm": 0.6774365901947021, + "learning_rate": 7.673370319001388e-06, + "loss": 0.0498, + "step": 3365 + }, + { + "epoch": 2.3318323519224107, + "grad_norm": 0.7079840302467346, + "learning_rate": 7.672676837725383e-06, + "loss": 0.0617, + "step": 3366 + }, + { + "epoch": 2.332525112573606, + "grad_norm": 0.5688597559928894, + "learning_rate": 7.671983356449376e-06, + "loss": 0.0462, + "step": 3367 + }, + { + "epoch": 2.333217873224801, + "grad_norm": 0.7698050737380981, + "learning_rate": 7.671289875173371e-06, + "loss": 0.0598, + "step": 3368 + }, + { + "epoch": 2.3339106338759956, + "grad_norm": 0.8596594333648682, + "learning_rate": 7.670596393897364e-06, + "loss": 0.054, + "step": 3369 + }, + { + "epoch": 2.334603394527191, + "grad_norm": 0.6817486882209778, + "learning_rate": 7.66990291262136e-06, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 2.3352961551783857, + "grad_norm": 0.6705496907234192, + "learning_rate": 7.669209431345354e-06, + "loss": 0.0507, + "step": 3371 + }, + { + "epoch": 2.335988915829581, + "grad_norm": 0.665134608745575, + "learning_rate": 7.668515950069349e-06, + "loss": 0.0535, + "step": 3372 + }, + { + "epoch": 2.336681676480776, + "grad_norm": 0.5743928551673889, + "learning_rate": 7.667822468793344e-06, + "loss": 0.0388, + "step": 3373 + }, + { + "epoch": 2.337374437131971, + "grad_norm": 0.6930652260780334, + "learning_rate": 7.667128987517337e-06, + "loss": 0.062, + "step": 3374 + }, + { + "epoch": 2.338067197783166, + "grad_norm": 0.6121317744255066, + "learning_rate": 7.666435506241332e-06, + "loss": 0.0415, + "step": 3375 + }, + { + "epoch": 2.338759958434361, + "grad_norm": 0.7009512782096863, + "learning_rate": 7.665742024965327e-06, + "loss": 0.0519, + "step": 3376 + }, + { + "epoch": 2.339452719085556, + "grad_norm": 0.7098727822303772, + "learning_rate": 7.66504854368932e-06, + "loss": 0.0469, + "step": 3377 + }, + { + "epoch": 2.340145479736751, + "grad_norm": 0.6412307620048523, + "learning_rate": 7.664355062413315e-06, + "loss": 0.051, + "step": 3378 + }, + { + "epoch": 2.3408382403879457, + "grad_norm": 0.8032299876213074, + "learning_rate": 7.663661581137309e-06, + "loss": 0.0665, + "step": 3379 + }, + { + "epoch": 2.341531001039141, + "grad_norm": 0.7739721536636353, + "learning_rate": 7.662968099861303e-06, + "loss": 0.0553, + "step": 3380 + }, + { + "epoch": 2.342223761690336, + "grad_norm": 0.5881069302558899, + "learning_rate": 7.662274618585298e-06, + "loss": 0.0424, + "step": 3381 + }, + { + "epoch": 2.342916522341531, + "grad_norm": 0.7282366156578064, + "learning_rate": 7.661581137309293e-06, + "loss": 0.0489, + "step": 3382 + }, + { + "epoch": 2.343609282992726, + "grad_norm": 0.6442456245422363, + "learning_rate": 7.660887656033288e-06, + "loss": 0.0536, + "step": 3383 + }, + { + "epoch": 2.3443020436439213, + "grad_norm": 0.6761475801467896, + "learning_rate": 7.660194174757282e-06, + "loss": 0.0523, + "step": 3384 + }, + { + "epoch": 2.344994804295116, + "grad_norm": 0.6950258016586304, + "learning_rate": 7.659500693481276e-06, + "loss": 0.0572, + "step": 3385 + }, + { + "epoch": 2.345687564946311, + "grad_norm": 0.6278981566429138, + "learning_rate": 7.658807212205271e-06, + "loss": 0.0509, + "step": 3386 + }, + { + "epoch": 2.346380325597506, + "grad_norm": 0.6928952932357788, + "learning_rate": 7.658113730929265e-06, + "loss": 0.0524, + "step": 3387 + }, + { + "epoch": 2.347073086248701, + "grad_norm": 0.7491379380226135, + "learning_rate": 7.65742024965326e-06, + "loss": 0.0461, + "step": 3388 + }, + { + "epoch": 2.347765846899896, + "grad_norm": 0.7919186353683472, + "learning_rate": 7.656726768377255e-06, + "loss": 0.061, + "step": 3389 + }, + { + "epoch": 2.348458607551091, + "grad_norm": 0.7522119283676147, + "learning_rate": 7.65603328710125e-06, + "loss": 0.0685, + "step": 3390 + }, + { + "epoch": 2.349151368202286, + "grad_norm": 0.6858955025672913, + "learning_rate": 7.655339805825244e-06, + "loss": 0.0695, + "step": 3391 + }, + { + "epoch": 2.3498441288534813, + "grad_norm": 0.7146169543266296, + "learning_rate": 7.654646324549238e-06, + "loss": 0.0649, + "step": 3392 + }, + { + "epoch": 2.350536889504676, + "grad_norm": 0.7618662714958191, + "learning_rate": 7.653952843273233e-06, + "loss": 0.0639, + "step": 3393 + }, + { + "epoch": 2.3512296501558714, + "grad_norm": 0.7013067007064819, + "learning_rate": 7.653259361997226e-06, + "loss": 0.051, + "step": 3394 + }, + { + "epoch": 2.351922410807066, + "grad_norm": 0.7284305095672607, + "learning_rate": 7.65256588072122e-06, + "loss": 0.0575, + "step": 3395 + }, + { + "epoch": 2.352615171458261, + "grad_norm": 0.951767086982727, + "learning_rate": 7.651872399445216e-06, + "loss": 0.0742, + "step": 3396 + }, + { + "epoch": 2.3533079321094563, + "grad_norm": 0.6954596638679504, + "learning_rate": 7.65117891816921e-06, + "loss": 0.064, + "step": 3397 + }, + { + "epoch": 2.354000692760651, + "grad_norm": 0.7585980892181396, + "learning_rate": 7.650485436893204e-06, + "loss": 0.0561, + "step": 3398 + }, + { + "epoch": 2.354693453411846, + "grad_norm": 0.5955365300178528, + "learning_rate": 7.649791955617199e-06, + "loss": 0.05, + "step": 3399 + }, + { + "epoch": 2.3553862140630413, + "grad_norm": 0.7831491827964783, + "learning_rate": 7.649098474341194e-06, + "loss": 0.0649, + "step": 3400 + }, + { + "epoch": 2.356078974714236, + "grad_norm": 0.7788838744163513, + "learning_rate": 7.648404993065189e-06, + "loss": 0.0615, + "step": 3401 + }, + { + "epoch": 2.3567717353654314, + "grad_norm": 0.5785791277885437, + "learning_rate": 7.647711511789182e-06, + "loss": 0.0473, + "step": 3402 + }, + { + "epoch": 2.357464496016626, + "grad_norm": 0.6168627738952637, + "learning_rate": 7.647018030513177e-06, + "loss": 0.0543, + "step": 3403 + }, + { + "epoch": 2.3581572566678215, + "grad_norm": 0.7591379284858704, + "learning_rate": 7.64632454923717e-06, + "loss": 0.0655, + "step": 3404 + }, + { + "epoch": 2.3588500173190163, + "grad_norm": 0.601206362247467, + "learning_rate": 7.645631067961165e-06, + "loss": 0.0425, + "step": 3405 + }, + { + "epoch": 2.359542777970211, + "grad_norm": 0.6227657198905945, + "learning_rate": 7.64493758668516e-06, + "loss": 0.0481, + "step": 3406 + }, + { + "epoch": 2.3602355386214064, + "grad_norm": 0.6924575567245483, + "learning_rate": 7.644244105409155e-06, + "loss": 0.0645, + "step": 3407 + }, + { + "epoch": 2.3609282992726013, + "grad_norm": 0.6208140254020691, + "learning_rate": 7.64355062413315e-06, + "loss": 0.0464, + "step": 3408 + }, + { + "epoch": 2.361621059923796, + "grad_norm": 0.6188228130340576, + "learning_rate": 7.642857142857143e-06, + "loss": 0.0444, + "step": 3409 + }, + { + "epoch": 2.3623138205749914, + "grad_norm": 0.6708952188491821, + "learning_rate": 7.642163661581138e-06, + "loss": 0.0551, + "step": 3410 + }, + { + "epoch": 2.3630065812261862, + "grad_norm": 0.6148442029953003, + "learning_rate": 7.641470180305133e-06, + "loss": 0.0457, + "step": 3411 + }, + { + "epoch": 2.3636993418773815, + "grad_norm": 0.6684728860855103, + "learning_rate": 7.640776699029126e-06, + "loss": 0.0536, + "step": 3412 + }, + { + "epoch": 2.3643921025285763, + "grad_norm": 0.729336678981781, + "learning_rate": 7.640083217753121e-06, + "loss": 0.047, + "step": 3413 + }, + { + "epoch": 2.3650848631797716, + "grad_norm": 0.5910308957099915, + "learning_rate": 7.639389736477116e-06, + "loss": 0.0452, + "step": 3414 + }, + { + "epoch": 2.3657776238309665, + "grad_norm": 0.621016800403595, + "learning_rate": 7.638696255201111e-06, + "loss": 0.0474, + "step": 3415 + }, + { + "epoch": 2.3664703844821613, + "grad_norm": 0.7773977518081665, + "learning_rate": 7.638002773925106e-06, + "loss": 0.0418, + "step": 3416 + }, + { + "epoch": 2.3671631451333566, + "grad_norm": 0.8472411036491394, + "learning_rate": 7.637309292649099e-06, + "loss": 0.081, + "step": 3417 + }, + { + "epoch": 2.3678559057845514, + "grad_norm": 0.6587639451026917, + "learning_rate": 7.636615811373094e-06, + "loss": 0.0467, + "step": 3418 + }, + { + "epoch": 2.3685486664357462, + "grad_norm": 0.811945378780365, + "learning_rate": 7.635922330097087e-06, + "loss": 0.0623, + "step": 3419 + }, + { + "epoch": 2.3692414270869415, + "grad_norm": 0.6511952877044678, + "learning_rate": 7.635228848821082e-06, + "loss": 0.0497, + "step": 3420 + }, + { + "epoch": 2.3699341877381364, + "grad_norm": 0.6955153346061707, + "learning_rate": 7.634535367545077e-06, + "loss": 0.0551, + "step": 3421 + }, + { + "epoch": 2.3706269483893316, + "grad_norm": 0.7224912643432617, + "learning_rate": 7.63384188626907e-06, + "loss": 0.0559, + "step": 3422 + }, + { + "epoch": 2.3713197090405265, + "grad_norm": 0.6983798742294312, + "learning_rate": 7.633148404993065e-06, + "loss": 0.049, + "step": 3423 + }, + { + "epoch": 2.3720124696917217, + "grad_norm": 0.696856677532196, + "learning_rate": 7.63245492371706e-06, + "loss": 0.0517, + "step": 3424 + }, + { + "epoch": 2.3727052303429166, + "grad_norm": 0.7483749389648438, + "learning_rate": 7.631761442441055e-06, + "loss": 0.0525, + "step": 3425 + }, + { + "epoch": 2.3733979909941114, + "grad_norm": 0.8678938150405884, + "learning_rate": 7.63106796116505e-06, + "loss": 0.0616, + "step": 3426 + }, + { + "epoch": 2.3740907516453067, + "grad_norm": 0.7085834741592407, + "learning_rate": 7.630374479889043e-06, + "loss": 0.0672, + "step": 3427 + }, + { + "epoch": 2.3747835122965015, + "grad_norm": 0.6799753904342651, + "learning_rate": 7.629680998613038e-06, + "loss": 0.0479, + "step": 3428 + }, + { + "epoch": 2.3754762729476964, + "grad_norm": 0.654399037361145, + "learning_rate": 7.628987517337032e-06, + "loss": 0.0506, + "step": 3429 + }, + { + "epoch": 2.3761690335988916, + "grad_norm": 0.7557405829429626, + "learning_rate": 7.628294036061027e-06, + "loss": 0.0622, + "step": 3430 + }, + { + "epoch": 2.3768617942500865, + "grad_norm": 0.6945838928222656, + "learning_rate": 7.627600554785022e-06, + "loss": 0.0597, + "step": 3431 + }, + { + "epoch": 2.3775545549012818, + "grad_norm": 0.7190044522285461, + "learning_rate": 7.6269070735090155e-06, + "loss": 0.0458, + "step": 3432 + }, + { + "epoch": 2.3782473155524766, + "grad_norm": 0.7875450849533081, + "learning_rate": 7.62621359223301e-06, + "loss": 0.0587, + "step": 3433 + }, + { + "epoch": 2.378940076203672, + "grad_norm": 0.7193741798400879, + "learning_rate": 7.6255201109570045e-06, + "loss": 0.0612, + "step": 3434 + }, + { + "epoch": 2.3796328368548667, + "grad_norm": 0.5874949097633362, + "learning_rate": 7.624826629680999e-06, + "loss": 0.047, + "step": 3435 + }, + { + "epoch": 2.3803255975060615, + "grad_norm": 0.6685798764228821, + "learning_rate": 7.624133148404994e-06, + "loss": 0.0498, + "step": 3436 + }, + { + "epoch": 2.381018358157257, + "grad_norm": 0.6457735896110535, + "learning_rate": 7.623439667128988e-06, + "loss": 0.0602, + "step": 3437 + }, + { + "epoch": 2.3817111188084517, + "grad_norm": 0.7490965127944946, + "learning_rate": 7.6227461858529825e-06, + "loss": 0.061, + "step": 3438 + }, + { + "epoch": 2.3824038794596465, + "grad_norm": 0.9154457449913025, + "learning_rate": 7.622052704576977e-06, + "loss": 0.0629, + "step": 3439 + }, + { + "epoch": 2.3830966401108418, + "grad_norm": 0.6912528276443481, + "learning_rate": 7.6213592233009715e-06, + "loss": 0.0523, + "step": 3440 + }, + { + "epoch": 2.3837894007620366, + "grad_norm": 0.7128589153289795, + "learning_rate": 7.6206657420249665e-06, + "loss": 0.0487, + "step": 3441 + }, + { + "epoch": 2.384482161413232, + "grad_norm": 0.7874253392219543, + "learning_rate": 7.61997226074896e-06, + "loss": 0.0524, + "step": 3442 + }, + { + "epoch": 2.3851749220644267, + "grad_norm": 0.7684004306793213, + "learning_rate": 7.619278779472955e-06, + "loss": 0.0701, + "step": 3443 + }, + { + "epoch": 2.385867682715622, + "grad_norm": 0.8227635622024536, + "learning_rate": 7.618585298196949e-06, + "loss": 0.0557, + "step": 3444 + }, + { + "epoch": 2.386560443366817, + "grad_norm": 0.6978826522827148, + "learning_rate": 7.617891816920944e-06, + "loss": 0.0556, + "step": 3445 + }, + { + "epoch": 2.3872532040180117, + "grad_norm": 0.7304977774620056, + "learning_rate": 7.617198335644939e-06, + "loss": 0.0483, + "step": 3446 + }, + { + "epoch": 2.387945964669207, + "grad_norm": 0.6925829648971558, + "learning_rate": 7.616504854368933e-06, + "loss": 0.0586, + "step": 3447 + }, + { + "epoch": 2.388638725320402, + "grad_norm": 0.7023626565933228, + "learning_rate": 7.615811373092928e-06, + "loss": 0.068, + "step": 3448 + }, + { + "epoch": 2.3893314859715966, + "grad_norm": 0.6978974938392639, + "learning_rate": 7.615117891816921e-06, + "loss": 0.0533, + "step": 3449 + }, + { + "epoch": 2.390024246622792, + "grad_norm": 0.6212133169174194, + "learning_rate": 7.614424410540916e-06, + "loss": 0.048, + "step": 3450 + }, + { + "epoch": 2.3907170072739867, + "grad_norm": 0.6057196259498596, + "learning_rate": 7.613730929264911e-06, + "loss": 0.0455, + "step": 3451 + }, + { + "epoch": 2.391409767925182, + "grad_norm": 0.7019253373146057, + "learning_rate": 7.613037447988905e-06, + "loss": 0.05, + "step": 3452 + }, + { + "epoch": 2.392102528576377, + "grad_norm": 0.6439147591590881, + "learning_rate": 7.6123439667129e-06, + "loss": 0.0482, + "step": 3453 + }, + { + "epoch": 2.392795289227572, + "grad_norm": 0.7034726142883301, + "learning_rate": 7.611650485436893e-06, + "loss": 0.0513, + "step": 3454 + }, + { + "epoch": 2.393488049878767, + "grad_norm": 0.576032280921936, + "learning_rate": 7.610957004160888e-06, + "loss": 0.0416, + "step": 3455 + }, + { + "epoch": 2.394180810529962, + "grad_norm": 0.6888068914413452, + "learning_rate": 7.610263522884883e-06, + "loss": 0.0572, + "step": 3456 + }, + { + "epoch": 2.394873571181157, + "grad_norm": 0.585434079170227, + "learning_rate": 7.609570041608877e-06, + "loss": 0.0412, + "step": 3457 + }, + { + "epoch": 2.395566331832352, + "grad_norm": 0.6731674075126648, + "learning_rate": 7.608876560332872e-06, + "loss": 0.0625, + "step": 3458 + }, + { + "epoch": 2.3962590924835467, + "grad_norm": 0.645865797996521, + "learning_rate": 7.608183079056866e-06, + "loss": 0.0574, + "step": 3459 + }, + { + "epoch": 2.396951853134742, + "grad_norm": 0.7278096079826355, + "learning_rate": 7.607489597780861e-06, + "loss": 0.0531, + "step": 3460 + }, + { + "epoch": 2.397644613785937, + "grad_norm": 0.692797064781189, + "learning_rate": 7.606796116504855e-06, + "loss": 0.0449, + "step": 3461 + }, + { + "epoch": 2.398337374437132, + "grad_norm": 0.6844128370285034, + "learning_rate": 7.606102635228849e-06, + "loss": 0.0637, + "step": 3462 + }, + { + "epoch": 2.399030135088327, + "grad_norm": 0.6277254223823547, + "learning_rate": 7.605409153952844e-06, + "loss": 0.0547, + "step": 3463 + }, + { + "epoch": 2.3997228957395222, + "grad_norm": 0.6868218183517456, + "learning_rate": 7.604715672676838e-06, + "loss": 0.0573, + "step": 3464 + }, + { + "epoch": 2.400415656390717, + "grad_norm": 0.9933450818061829, + "learning_rate": 7.604022191400833e-06, + "loss": 0.0793, + "step": 3465 + }, + { + "epoch": 2.401108417041912, + "grad_norm": 0.6962992548942566, + "learning_rate": 7.603328710124828e-06, + "loss": 0.0528, + "step": 3466 + }, + { + "epoch": 2.401801177693107, + "grad_norm": 0.770677387714386, + "learning_rate": 7.602635228848821e-06, + "loss": 0.0648, + "step": 3467 + }, + { + "epoch": 2.402493938344302, + "grad_norm": 0.7920851111412048, + "learning_rate": 7.601941747572816e-06, + "loss": 0.0569, + "step": 3468 + }, + { + "epoch": 2.403186698995497, + "grad_norm": 0.8551387786865234, + "learning_rate": 7.60124826629681e-06, + "loss": 0.0508, + "step": 3469 + }, + { + "epoch": 2.403879459646692, + "grad_norm": 0.6823190450668335, + "learning_rate": 7.600554785020805e-06, + "loss": 0.051, + "step": 3470 + }, + { + "epoch": 2.404572220297887, + "grad_norm": 0.7619594931602478, + "learning_rate": 7.5998613037448e-06, + "loss": 0.0685, + "step": 3471 + }, + { + "epoch": 2.4052649809490823, + "grad_norm": 0.5996909141540527, + "learning_rate": 7.599167822468793e-06, + "loss": 0.0423, + "step": 3472 + }, + { + "epoch": 2.405957741600277, + "grad_norm": 0.7393822073936462, + "learning_rate": 7.598474341192788e-06, + "loss": 0.0518, + "step": 3473 + }, + { + "epoch": 2.406650502251472, + "grad_norm": 0.7016523480415344, + "learning_rate": 7.597780859916782e-06, + "loss": 0.0538, + "step": 3474 + }, + { + "epoch": 2.407343262902667, + "grad_norm": 0.634170651435852, + "learning_rate": 7.597087378640777e-06, + "loss": 0.0527, + "step": 3475 + }, + { + "epoch": 2.408036023553862, + "grad_norm": 0.7694522142410278, + "learning_rate": 7.596393897364772e-06, + "loss": 0.0574, + "step": 3476 + }, + { + "epoch": 2.4087287842050573, + "grad_norm": 0.727978527545929, + "learning_rate": 7.595700416088766e-06, + "loss": 0.0494, + "step": 3477 + }, + { + "epoch": 2.409421544856252, + "grad_norm": 0.6250091195106506, + "learning_rate": 7.595006934812761e-06, + "loss": 0.0441, + "step": 3478 + }, + { + "epoch": 2.410114305507447, + "grad_norm": 0.7196205258369446, + "learning_rate": 7.5943134535367545e-06, + "loss": 0.0645, + "step": 3479 + }, + { + "epoch": 2.4108070661586423, + "grad_norm": 0.8385190963745117, + "learning_rate": 7.593619972260749e-06, + "loss": 0.0679, + "step": 3480 + }, + { + "epoch": 2.411499826809837, + "grad_norm": 1.0772920846939087, + "learning_rate": 7.592926490984744e-06, + "loss": 0.0838, + "step": 3481 + }, + { + "epoch": 2.4121925874610324, + "grad_norm": 0.7560489773750305, + "learning_rate": 7.592233009708738e-06, + "loss": 0.0641, + "step": 3482 + }, + { + "epoch": 2.412885348112227, + "grad_norm": 0.6982017159461975, + "learning_rate": 7.591539528432733e-06, + "loss": 0.0552, + "step": 3483 + }, + { + "epoch": 2.413578108763422, + "grad_norm": 0.6059712171554565, + "learning_rate": 7.590846047156727e-06, + "loss": 0.0476, + "step": 3484 + }, + { + "epoch": 2.4142708694146173, + "grad_norm": 0.7760432362556458, + "learning_rate": 7.5901525658807215e-06, + "loss": 0.0565, + "step": 3485 + }, + { + "epoch": 2.414963630065812, + "grad_norm": 0.6153656840324402, + "learning_rate": 7.5894590846047165e-06, + "loss": 0.0528, + "step": 3486 + }, + { + "epoch": 2.4156563907170074, + "grad_norm": 0.6630606055259705, + "learning_rate": 7.5887656033287105e-06, + "loss": 0.0459, + "step": 3487 + }, + { + "epoch": 2.4163491513682023, + "grad_norm": 0.5501273274421692, + "learning_rate": 7.5880721220527055e-06, + "loss": 0.0417, + "step": 3488 + }, + { + "epoch": 2.417041912019397, + "grad_norm": 0.6584319472312927, + "learning_rate": 7.5873786407766996e-06, + "loss": 0.0553, + "step": 3489 + }, + { + "epoch": 2.4177346726705924, + "grad_norm": 0.7364506125450134, + "learning_rate": 7.586685159500694e-06, + "loss": 0.0626, + "step": 3490 + }, + { + "epoch": 2.4184274333217872, + "grad_norm": 0.6536809802055359, + "learning_rate": 7.585991678224689e-06, + "loss": 0.0504, + "step": 3491 + }, + { + "epoch": 2.4191201939729825, + "grad_norm": 0.6651002168655396, + "learning_rate": 7.585298196948683e-06, + "loss": 0.0484, + "step": 3492 + }, + { + "epoch": 2.4198129546241773, + "grad_norm": 0.6081539988517761, + "learning_rate": 7.584604715672678e-06, + "loss": 0.0574, + "step": 3493 + }, + { + "epoch": 2.420505715275372, + "grad_norm": 0.7257691621780396, + "learning_rate": 7.583911234396672e-06, + "loss": 0.0518, + "step": 3494 + }, + { + "epoch": 2.4211984759265675, + "grad_norm": 0.742874026298523, + "learning_rate": 7.583217753120667e-06, + "loss": 0.0581, + "step": 3495 + }, + { + "epoch": 2.4218912365777623, + "grad_norm": 0.6096038818359375, + "learning_rate": 7.5825242718446616e-06, + "loss": 0.054, + "step": 3496 + }, + { + "epoch": 2.4225839972289576, + "grad_norm": 0.7022075057029724, + "learning_rate": 7.581830790568655e-06, + "loss": 0.0712, + "step": 3497 + }, + { + "epoch": 2.4232767578801524, + "grad_norm": 0.665908694267273, + "learning_rate": 7.58113730929265e-06, + "loss": 0.0562, + "step": 3498 + }, + { + "epoch": 2.4239695185313472, + "grad_norm": 0.8648689985275269, + "learning_rate": 7.580443828016644e-06, + "loss": 0.0747, + "step": 3499 + }, + { + "epoch": 2.4246622791825425, + "grad_norm": 0.6802393198013306, + "learning_rate": 7.579750346740639e-06, + "loss": 0.0584, + "step": 3500 + }, + { + "epoch": 2.4253550398337373, + "grad_norm": 0.732030987739563, + "learning_rate": 7.579056865464634e-06, + "loss": 0.0593, + "step": 3501 + }, + { + "epoch": 2.4260478004849326, + "grad_norm": 0.7286328673362732, + "learning_rate": 7.578363384188627e-06, + "loss": 0.0488, + "step": 3502 + }, + { + "epoch": 2.4267405611361275, + "grad_norm": 0.6514286994934082, + "learning_rate": 7.577669902912622e-06, + "loss": 0.046, + "step": 3503 + }, + { + "epoch": 2.4274333217873223, + "grad_norm": 0.6425239443778992, + "learning_rate": 7.576976421636616e-06, + "loss": 0.0567, + "step": 3504 + }, + { + "epoch": 2.4281260824385176, + "grad_norm": 0.621367871761322, + "learning_rate": 7.576282940360611e-06, + "loss": 0.0538, + "step": 3505 + }, + { + "epoch": 2.4288188430897124, + "grad_norm": 0.848767876625061, + "learning_rate": 7.575589459084606e-06, + "loss": 0.0611, + "step": 3506 + }, + { + "epoch": 2.4295116037409077, + "grad_norm": 0.7124624252319336, + "learning_rate": 7.5748959778086e-06, + "loss": 0.0577, + "step": 3507 + }, + { + "epoch": 2.4302043643921025, + "grad_norm": 0.6805168390274048, + "learning_rate": 7.574202496532595e-06, + "loss": 0.0585, + "step": 3508 + }, + { + "epoch": 2.4308971250432974, + "grad_norm": 0.6756210327148438, + "learning_rate": 7.573509015256588e-06, + "loss": 0.0473, + "step": 3509 + }, + { + "epoch": 2.4315898856944926, + "grad_norm": 0.6795691251754761, + "learning_rate": 7.572815533980583e-06, + "loss": 0.0518, + "step": 3510 + }, + { + "epoch": 2.4322826463456875, + "grad_norm": 0.6456634998321533, + "learning_rate": 7.572122052704578e-06, + "loss": 0.0489, + "step": 3511 + }, + { + "epoch": 2.4329754069968828, + "grad_norm": 0.6561073064804077, + "learning_rate": 7.571428571428572e-06, + "loss": 0.0519, + "step": 3512 + }, + { + "epoch": 2.4336681676480776, + "grad_norm": 0.8649124503135681, + "learning_rate": 7.570735090152567e-06, + "loss": 0.0494, + "step": 3513 + }, + { + "epoch": 2.4343609282992724, + "grad_norm": 0.684529185295105, + "learning_rate": 7.57004160887656e-06, + "loss": 0.0573, + "step": 3514 + }, + { + "epoch": 2.4350536889504677, + "grad_norm": 0.8203072547912598, + "learning_rate": 7.569348127600555e-06, + "loss": 0.0618, + "step": 3515 + }, + { + "epoch": 2.4357464496016625, + "grad_norm": 0.69808030128479, + "learning_rate": 7.56865464632455e-06, + "loss": 0.0465, + "step": 3516 + }, + { + "epoch": 2.436439210252858, + "grad_norm": 0.6848331093788147, + "learning_rate": 7.567961165048544e-06, + "loss": 0.0596, + "step": 3517 + }, + { + "epoch": 2.4371319709040526, + "grad_norm": 0.7920529246330261, + "learning_rate": 7.567267683772539e-06, + "loss": 0.0633, + "step": 3518 + }, + { + "epoch": 2.4378247315552475, + "grad_norm": 0.6909356117248535, + "learning_rate": 7.566574202496532e-06, + "loss": 0.0567, + "step": 3519 + }, + { + "epoch": 2.4385174922064428, + "grad_norm": 0.6651289463043213, + "learning_rate": 7.565880721220527e-06, + "loss": 0.0432, + "step": 3520 + }, + { + "epoch": 2.4392102528576376, + "grad_norm": 0.7031319737434387, + "learning_rate": 7.565187239944522e-06, + "loss": 0.0618, + "step": 3521 + }, + { + "epoch": 2.439903013508833, + "grad_norm": 0.7217929363250732, + "learning_rate": 7.564493758668516e-06, + "loss": 0.0582, + "step": 3522 + }, + { + "epoch": 2.4405957741600277, + "grad_norm": 0.7406728863716125, + "learning_rate": 7.563800277392511e-06, + "loss": 0.0606, + "step": 3523 + }, + { + "epoch": 2.4412885348112225, + "grad_norm": 0.6231144070625305, + "learning_rate": 7.563106796116505e-06, + "loss": 0.0342, + "step": 3524 + }, + { + "epoch": 2.441981295462418, + "grad_norm": 0.630927562713623, + "learning_rate": 7.5624133148405e-06, + "loss": 0.051, + "step": 3525 + }, + { + "epoch": 2.4426740561136127, + "grad_norm": 0.720359206199646, + "learning_rate": 7.561719833564495e-06, + "loss": 0.0493, + "step": 3526 + }, + { + "epoch": 2.443366816764808, + "grad_norm": 0.6691286563873291, + "learning_rate": 7.561026352288488e-06, + "loss": 0.058, + "step": 3527 + }, + { + "epoch": 2.4440595774160028, + "grad_norm": 0.6614663600921631, + "learning_rate": 7.560332871012483e-06, + "loss": 0.0497, + "step": 3528 + }, + { + "epoch": 2.4447523380671976, + "grad_norm": 0.7070098519325256, + "learning_rate": 7.559639389736477e-06, + "loss": 0.051, + "step": 3529 + }, + { + "epoch": 2.445445098718393, + "grad_norm": 0.5920138359069824, + "learning_rate": 7.558945908460472e-06, + "loss": 0.0399, + "step": 3530 + }, + { + "epoch": 2.4461378593695877, + "grad_norm": 0.659697413444519, + "learning_rate": 7.558252427184467e-06, + "loss": 0.0554, + "step": 3531 + }, + { + "epoch": 2.446830620020783, + "grad_norm": 0.6952112317085266, + "learning_rate": 7.5575589459084605e-06, + "loss": 0.0539, + "step": 3532 + }, + { + "epoch": 2.447523380671978, + "grad_norm": 0.7827281951904297, + "learning_rate": 7.5568654646324555e-06, + "loss": 0.0669, + "step": 3533 + }, + { + "epoch": 2.4482161413231727, + "grad_norm": 0.708702027797699, + "learning_rate": 7.5561719833564495e-06, + "loss": 0.0476, + "step": 3534 + }, + { + "epoch": 2.448908901974368, + "grad_norm": 0.8409815430641174, + "learning_rate": 7.5554785020804445e-06, + "loss": 0.0624, + "step": 3535 + }, + { + "epoch": 2.449601662625563, + "grad_norm": 0.7902380228042603, + "learning_rate": 7.554785020804439e-06, + "loss": 0.0595, + "step": 3536 + }, + { + "epoch": 2.450294423276758, + "grad_norm": 0.8560856580734253, + "learning_rate": 7.5540915395284335e-06, + "loss": 0.0623, + "step": 3537 + }, + { + "epoch": 2.450987183927953, + "grad_norm": 0.7568857669830322, + "learning_rate": 7.553398058252428e-06, + "loss": 0.0548, + "step": 3538 + }, + { + "epoch": 2.4516799445791477, + "grad_norm": 0.7374472618103027, + "learning_rate": 7.552704576976422e-06, + "loss": 0.0557, + "step": 3539 + }, + { + "epoch": 2.452372705230343, + "grad_norm": 0.6479283571243286, + "learning_rate": 7.552011095700417e-06, + "loss": 0.0406, + "step": 3540 + }, + { + "epoch": 2.453065465881538, + "grad_norm": 0.6632204055786133, + "learning_rate": 7.5513176144244115e-06, + "loss": 0.052, + "step": 3541 + }, + { + "epoch": 2.453758226532733, + "grad_norm": 0.9688203930854797, + "learning_rate": 7.550624133148406e-06, + "loss": 0.0607, + "step": 3542 + }, + { + "epoch": 2.454450987183928, + "grad_norm": 0.7763074040412903, + "learning_rate": 7.5499306518724006e-06, + "loss": 0.0615, + "step": 3543 + }, + { + "epoch": 2.455143747835123, + "grad_norm": 0.7124184370040894, + "learning_rate": 7.549237170596394e-06, + "loss": 0.0607, + "step": 3544 + }, + { + "epoch": 2.455836508486318, + "grad_norm": 0.669926106929779, + "learning_rate": 7.548543689320389e-06, + "loss": 0.0606, + "step": 3545 + }, + { + "epoch": 2.456529269137513, + "grad_norm": 0.7063635587692261, + "learning_rate": 7.547850208044384e-06, + "loss": 0.0639, + "step": 3546 + }, + { + "epoch": 2.457222029788708, + "grad_norm": 0.5827317833900452, + "learning_rate": 7.547156726768378e-06, + "loss": 0.0422, + "step": 3547 + }, + { + "epoch": 2.457914790439903, + "grad_norm": 0.6933485865592957, + "learning_rate": 7.546463245492373e-06, + "loss": 0.0615, + "step": 3548 + }, + { + "epoch": 2.458607551091098, + "grad_norm": 0.6758912205696106, + "learning_rate": 7.545769764216366e-06, + "loss": 0.0537, + "step": 3549 + }, + { + "epoch": 2.459300311742293, + "grad_norm": 0.6951971650123596, + "learning_rate": 7.545076282940361e-06, + "loss": 0.06, + "step": 3550 + }, + { + "epoch": 2.459993072393488, + "grad_norm": 0.8742147088050842, + "learning_rate": 7.544382801664356e-06, + "loss": 0.0611, + "step": 3551 + }, + { + "epoch": 2.4606858330446832, + "grad_norm": 0.7043006420135498, + "learning_rate": 7.54368932038835e-06, + "loss": 0.0603, + "step": 3552 + }, + { + "epoch": 2.461378593695878, + "grad_norm": 0.6522670984268188, + "learning_rate": 7.542995839112345e-06, + "loss": 0.0483, + "step": 3553 + }, + { + "epoch": 2.462071354347073, + "grad_norm": 0.589442253112793, + "learning_rate": 7.542302357836339e-06, + "loss": 0.0461, + "step": 3554 + }, + { + "epoch": 2.462764114998268, + "grad_norm": 0.7959421873092651, + "learning_rate": 7.541608876560334e-06, + "loss": 0.0654, + "step": 3555 + }, + { + "epoch": 2.463456875649463, + "grad_norm": 0.7357652187347412, + "learning_rate": 7.540915395284329e-06, + "loss": 0.0584, + "step": 3556 + }, + { + "epoch": 2.4641496363006583, + "grad_norm": 0.7577213644981384, + "learning_rate": 7.540221914008322e-06, + "loss": 0.0496, + "step": 3557 + }, + { + "epoch": 2.464842396951853, + "grad_norm": 1.2063661813735962, + "learning_rate": 7.539528432732317e-06, + "loss": 0.0624, + "step": 3558 + }, + { + "epoch": 2.465535157603048, + "grad_norm": 0.594376802444458, + "learning_rate": 7.538834951456311e-06, + "loss": 0.0562, + "step": 3559 + }, + { + "epoch": 2.4662279182542433, + "grad_norm": 0.6683982014656067, + "learning_rate": 7.538141470180306e-06, + "loss": 0.0545, + "step": 3560 + }, + { + "epoch": 2.466920678905438, + "grad_norm": 0.8373034000396729, + "learning_rate": 7.537447988904301e-06, + "loss": 0.0618, + "step": 3561 + }, + { + "epoch": 2.4676134395566334, + "grad_norm": 0.6339336037635803, + "learning_rate": 7.536754507628294e-06, + "loss": 0.0471, + "step": 3562 + }, + { + "epoch": 2.468306200207828, + "grad_norm": 0.6370022892951965, + "learning_rate": 7.536061026352289e-06, + "loss": 0.0495, + "step": 3563 + }, + { + "epoch": 2.468998960859023, + "grad_norm": 0.8200846314430237, + "learning_rate": 7.535367545076283e-06, + "loss": 0.069, + "step": 3564 + }, + { + "epoch": 2.4696917215102183, + "grad_norm": 0.7583621144294739, + "learning_rate": 7.534674063800278e-06, + "loss": 0.0568, + "step": 3565 + }, + { + "epoch": 2.470384482161413, + "grad_norm": 0.6182228922843933, + "learning_rate": 7.533980582524273e-06, + "loss": 0.0449, + "step": 3566 + }, + { + "epoch": 2.4710772428126084, + "grad_norm": 0.7657793760299683, + "learning_rate": 7.533287101248266e-06, + "loss": 0.0542, + "step": 3567 + }, + { + "epoch": 2.4717700034638033, + "grad_norm": 0.6714324951171875, + "learning_rate": 7.532593619972261e-06, + "loss": 0.0574, + "step": 3568 + }, + { + "epoch": 2.472462764114998, + "grad_norm": 0.6751946210861206, + "learning_rate": 7.531900138696255e-06, + "loss": 0.0423, + "step": 3569 + }, + { + "epoch": 2.4731555247661934, + "grad_norm": 0.7521834969520569, + "learning_rate": 7.53120665742025e-06, + "loss": 0.0585, + "step": 3570 + }, + { + "epoch": 2.473848285417388, + "grad_norm": 0.6951082348823547, + "learning_rate": 7.530513176144245e-06, + "loss": 0.052, + "step": 3571 + }, + { + "epoch": 2.4745410460685835, + "grad_norm": 0.7604899406433105, + "learning_rate": 7.529819694868239e-06, + "loss": 0.0554, + "step": 3572 + }, + { + "epoch": 2.4752338067197783, + "grad_norm": 0.5871224403381348, + "learning_rate": 7.529126213592234e-06, + "loss": 0.0429, + "step": 3573 + }, + { + "epoch": 2.475926567370973, + "grad_norm": 0.7135995626449585, + "learning_rate": 7.528432732316227e-06, + "loss": 0.0427, + "step": 3574 + }, + { + "epoch": 2.4766193280221684, + "grad_norm": 0.5908096432685852, + "learning_rate": 7.527739251040222e-06, + "loss": 0.0432, + "step": 3575 + }, + { + "epoch": 2.4773120886733633, + "grad_norm": 0.7182779908180237, + "learning_rate": 7.527045769764217e-06, + "loss": 0.0605, + "step": 3576 + }, + { + "epoch": 2.4780048493245586, + "grad_norm": 0.7429741621017456, + "learning_rate": 7.526352288488211e-06, + "loss": 0.0562, + "step": 3577 + }, + { + "epoch": 2.4786976099757534, + "grad_norm": 0.5671314001083374, + "learning_rate": 7.525658807212206e-06, + "loss": 0.0435, + "step": 3578 + }, + { + "epoch": 2.4793903706269482, + "grad_norm": 0.7295961380004883, + "learning_rate": 7.5249653259361995e-06, + "loss": 0.0536, + "step": 3579 + }, + { + "epoch": 2.4800831312781435, + "grad_norm": 0.7264970541000366, + "learning_rate": 7.5242718446601945e-06, + "loss": 0.0673, + "step": 3580 + }, + { + "epoch": 2.4807758919293383, + "grad_norm": 0.6332563757896423, + "learning_rate": 7.523578363384189e-06, + "loss": 0.0436, + "step": 3581 + }, + { + "epoch": 2.4814686525805336, + "grad_norm": 0.6721775531768799, + "learning_rate": 7.5228848821081835e-06, + "loss": 0.0566, + "step": 3582 + }, + { + "epoch": 2.4821614132317285, + "grad_norm": 0.7073386311531067, + "learning_rate": 7.522191400832178e-06, + "loss": 0.0516, + "step": 3583 + }, + { + "epoch": 2.4828541738829233, + "grad_norm": 0.6986554265022278, + "learning_rate": 7.5214979195561725e-06, + "loss": 0.0488, + "step": 3584 + }, + { + "epoch": 2.4835469345341186, + "grad_norm": 0.5938754677772522, + "learning_rate": 7.5208044382801674e-06, + "loss": 0.0517, + "step": 3585 + }, + { + "epoch": 2.4842396951853134, + "grad_norm": 0.8253476619720459, + "learning_rate": 7.5201109570041615e-06, + "loss": 0.0713, + "step": 3586 + }, + { + "epoch": 2.4849324558365087, + "grad_norm": 0.688849925994873, + "learning_rate": 7.519417475728156e-06, + "loss": 0.0493, + "step": 3587 + }, + { + "epoch": 2.4856252164877035, + "grad_norm": 0.8110700249671936, + "learning_rate": 7.5187239944521505e-06, + "loss": 0.078, + "step": 3588 + }, + { + "epoch": 2.4863179771388983, + "grad_norm": 0.7634989023208618, + "learning_rate": 7.518030513176145e-06, + "loss": 0.0565, + "step": 3589 + }, + { + "epoch": 2.4870107377900936, + "grad_norm": 0.7390944361686707, + "learning_rate": 7.5173370319001396e-06, + "loss": 0.0532, + "step": 3590 + }, + { + "epoch": 2.4877034984412885, + "grad_norm": 0.6951395869255066, + "learning_rate": 7.5166435506241345e-06, + "loss": 0.0501, + "step": 3591 + }, + { + "epoch": 2.4883962590924833, + "grad_norm": 0.600458025932312, + "learning_rate": 7.515950069348128e-06, + "loss": 0.0449, + "step": 3592 + }, + { + "epoch": 2.4890890197436786, + "grad_norm": 0.6846388578414917, + "learning_rate": 7.515256588072123e-06, + "loss": 0.0462, + "step": 3593 + }, + { + "epoch": 2.4897817803948734, + "grad_norm": 0.589938223361969, + "learning_rate": 7.514563106796117e-06, + "loss": 0.0467, + "step": 3594 + }, + { + "epoch": 2.4904745410460687, + "grad_norm": 0.6740636229515076, + "learning_rate": 7.513869625520112e-06, + "loss": 0.046, + "step": 3595 + }, + { + "epoch": 2.4911673016972635, + "grad_norm": 0.6663030982017517, + "learning_rate": 7.513176144244107e-06, + "loss": 0.042, + "step": 3596 + }, + { + "epoch": 2.491860062348459, + "grad_norm": 0.6415688991546631, + "learning_rate": 7.5124826629681e-06, + "loss": 0.0496, + "step": 3597 + }, + { + "epoch": 2.4925528229996536, + "grad_norm": 0.6303988099098206, + "learning_rate": 7.511789181692095e-06, + "loss": 0.0506, + "step": 3598 + }, + { + "epoch": 2.4932455836508485, + "grad_norm": 0.7231244444847107, + "learning_rate": 7.511095700416089e-06, + "loss": 0.0604, + "step": 3599 + }, + { + "epoch": 2.4939383443020438, + "grad_norm": 0.7289586663246155, + "learning_rate": 7.510402219140084e-06, + "loss": 0.055, + "step": 3600 + }, + { + "epoch": 2.4946311049532386, + "grad_norm": 0.8104788661003113, + "learning_rate": 7.509708737864079e-06, + "loss": 0.0504, + "step": 3601 + }, + { + "epoch": 2.4953238656044334, + "grad_norm": 0.6668550968170166, + "learning_rate": 7.509015256588073e-06, + "loss": 0.0458, + "step": 3602 + }, + { + "epoch": 2.4960166262556287, + "grad_norm": 0.7586086988449097, + "learning_rate": 7.508321775312068e-06, + "loss": 0.0694, + "step": 3603 + }, + { + "epoch": 2.4967093869068235, + "grad_norm": 0.6411376595497131, + "learning_rate": 7.507628294036061e-06, + "loss": 0.0539, + "step": 3604 + }, + { + "epoch": 2.497402147558019, + "grad_norm": 0.8673600554466248, + "learning_rate": 7.506934812760056e-06, + "loss": 0.0636, + "step": 3605 + }, + { + "epoch": 2.4980949082092136, + "grad_norm": 0.6442834734916687, + "learning_rate": 7.506241331484051e-06, + "loss": 0.0542, + "step": 3606 + }, + { + "epoch": 2.498787668860409, + "grad_norm": 0.702248215675354, + "learning_rate": 7.505547850208045e-06, + "loss": 0.0582, + "step": 3607 + }, + { + "epoch": 2.4994804295116038, + "grad_norm": 0.7120597958564758, + "learning_rate": 7.50485436893204e-06, + "loss": 0.0658, + "step": 3608 + }, + { + "epoch": 2.5001731901627986, + "grad_norm": 0.8212564587593079, + "learning_rate": 7.504160887656033e-06, + "loss": 0.0655, + "step": 3609 + }, + { + "epoch": 2.500865950813994, + "grad_norm": 0.6543794870376587, + "learning_rate": 7.503467406380028e-06, + "loss": 0.0594, + "step": 3610 + }, + { + "epoch": 2.5015587114651887, + "grad_norm": 0.6933072209358215, + "learning_rate": 7.502773925104023e-06, + "loss": 0.0632, + "step": 3611 + }, + { + "epoch": 2.5022514721163835, + "grad_norm": 0.6953290104866028, + "learning_rate": 7.502080443828017e-06, + "loss": 0.0547, + "step": 3612 + }, + { + "epoch": 2.502944232767579, + "grad_norm": 0.6507391929626465, + "learning_rate": 7.501386962552012e-06, + "loss": 0.0456, + "step": 3613 + }, + { + "epoch": 2.5036369934187737, + "grad_norm": 0.6569274067878723, + "learning_rate": 7.500693481276006e-06, + "loss": 0.0493, + "step": 3614 + }, + { + "epoch": 2.504329754069969, + "grad_norm": 0.6933046579360962, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0638, + "step": 3615 + }, + { + "epoch": 2.5050225147211638, + "grad_norm": 0.6412826776504517, + "learning_rate": 7.499306518723995e-06, + "loss": 0.0477, + "step": 3616 + }, + { + "epoch": 2.505715275372359, + "grad_norm": 0.7654886841773987, + "learning_rate": 7.498613037447989e-06, + "loss": 0.0509, + "step": 3617 + }, + { + "epoch": 2.506408036023554, + "grad_norm": 0.679672122001648, + "learning_rate": 7.497919556171984e-06, + "loss": 0.0534, + "step": 3618 + }, + { + "epoch": 2.5071007966747487, + "grad_norm": 0.6829155683517456, + "learning_rate": 7.497226074895978e-06, + "loss": 0.0584, + "step": 3619 + }, + { + "epoch": 2.507793557325944, + "grad_norm": 0.8345833420753479, + "learning_rate": 7.496532593619973e-06, + "loss": 0.0678, + "step": 3620 + }, + { + "epoch": 2.508486317977139, + "grad_norm": 0.6249558925628662, + "learning_rate": 7.495839112343968e-06, + "loss": 0.046, + "step": 3621 + }, + { + "epoch": 2.5091790786283337, + "grad_norm": 0.7350931763648987, + "learning_rate": 7.495145631067961e-06, + "loss": 0.0654, + "step": 3622 + }, + { + "epoch": 2.509871839279529, + "grad_norm": 0.6759945154190063, + "learning_rate": 7.494452149791956e-06, + "loss": 0.0645, + "step": 3623 + }, + { + "epoch": 2.510564599930724, + "grad_norm": 0.6918412446975708, + "learning_rate": 7.49375866851595e-06, + "loss": 0.0437, + "step": 3624 + }, + { + "epoch": 2.511257360581919, + "grad_norm": 0.7094055414199829, + "learning_rate": 7.493065187239945e-06, + "loss": 0.0638, + "step": 3625 + }, + { + "epoch": 2.511950121233114, + "grad_norm": 0.5930887460708618, + "learning_rate": 7.49237170596394e-06, + "loss": 0.0402, + "step": 3626 + }, + { + "epoch": 2.512642881884309, + "grad_norm": 0.912276566028595, + "learning_rate": 7.4916782246879335e-06, + "loss": 0.0678, + "step": 3627 + }, + { + "epoch": 2.513335642535504, + "grad_norm": 0.654692530632019, + "learning_rate": 7.490984743411928e-06, + "loss": 0.0532, + "step": 3628 + }, + { + "epoch": 2.514028403186699, + "grad_norm": 0.6371455788612366, + "learning_rate": 7.4902912621359225e-06, + "loss": 0.0547, + "step": 3629 + }, + { + "epoch": 2.514721163837894, + "grad_norm": 0.7303754091262817, + "learning_rate": 7.489597780859917e-06, + "loss": 0.0495, + "step": 3630 + }, + { + "epoch": 2.515413924489089, + "grad_norm": 0.8115444183349609, + "learning_rate": 7.488904299583912e-06, + "loss": 0.0718, + "step": 3631 + }, + { + "epoch": 2.516106685140284, + "grad_norm": 0.7716137766838074, + "learning_rate": 7.4882108183079064e-06, + "loss": 0.0735, + "step": 3632 + }, + { + "epoch": 2.516799445791479, + "grad_norm": 0.6851175427436829, + "learning_rate": 7.487517337031901e-06, + "loss": 0.0537, + "step": 3633 + }, + { + "epoch": 2.517492206442674, + "grad_norm": 0.6670947670936584, + "learning_rate": 7.486823855755895e-06, + "loss": 0.0482, + "step": 3634 + }, + { + "epoch": 2.518184967093869, + "grad_norm": 0.6720465421676636, + "learning_rate": 7.4861303744798895e-06, + "loss": 0.0443, + "step": 3635 + }, + { + "epoch": 2.518877727745064, + "grad_norm": 0.6166452169418335, + "learning_rate": 7.4854368932038845e-06, + "loss": 0.0526, + "step": 3636 + }, + { + "epoch": 2.5195704883962593, + "grad_norm": 0.6722489595413208, + "learning_rate": 7.4847434119278786e-06, + "loss": 0.0406, + "step": 3637 + }, + { + "epoch": 2.520263249047454, + "grad_norm": 0.7445875406265259, + "learning_rate": 7.4840499306518735e-06, + "loss": 0.0636, + "step": 3638 + }, + { + "epoch": 2.520956009698649, + "grad_norm": 0.7646205425262451, + "learning_rate": 7.483356449375867e-06, + "loss": 0.0542, + "step": 3639 + }, + { + "epoch": 2.5216487703498442, + "grad_norm": 0.6960675716400146, + "learning_rate": 7.482662968099862e-06, + "loss": 0.0592, + "step": 3640 + }, + { + "epoch": 2.522341531001039, + "grad_norm": 0.7878941893577576, + "learning_rate": 7.481969486823857e-06, + "loss": 0.0593, + "step": 3641 + }, + { + "epoch": 2.523034291652234, + "grad_norm": 0.8090975880622864, + "learning_rate": 7.481276005547851e-06, + "loss": 0.0661, + "step": 3642 + }, + { + "epoch": 2.523727052303429, + "grad_norm": 0.6913262009620667, + "learning_rate": 7.480582524271846e-06, + "loss": 0.0477, + "step": 3643 + }, + { + "epoch": 2.524419812954624, + "grad_norm": 0.6624709367752075, + "learning_rate": 7.47988904299584e-06, + "loss": 0.0413, + "step": 3644 + }, + { + "epoch": 2.5251125736058193, + "grad_norm": 0.863525927066803, + "learning_rate": 7.479195561719834e-06, + "loss": 0.0467, + "step": 3645 + }, + { + "epoch": 2.525805334257014, + "grad_norm": 0.6592664122581482, + "learning_rate": 7.478502080443829e-06, + "loss": 0.0459, + "step": 3646 + }, + { + "epoch": 2.5264980949082094, + "grad_norm": 0.6367437243461609, + "learning_rate": 7.477808599167823e-06, + "loss": 0.036, + "step": 3647 + }, + { + "epoch": 2.5271908555594043, + "grad_norm": 0.7011104822158813, + "learning_rate": 7.477115117891818e-06, + "loss": 0.0436, + "step": 3648 + }, + { + "epoch": 2.527883616210599, + "grad_norm": 0.7507541179656982, + "learning_rate": 7.476421636615812e-06, + "loss": 0.0488, + "step": 3649 + }, + { + "epoch": 2.5285763768617944, + "grad_norm": 0.7038018703460693, + "learning_rate": 7.475728155339807e-06, + "loss": 0.0369, + "step": 3650 + }, + { + "epoch": 2.529269137512989, + "grad_norm": 0.7123277187347412, + "learning_rate": 7.475034674063802e-06, + "loss": 0.052, + "step": 3651 + }, + { + "epoch": 2.529961898164184, + "grad_norm": 0.6427280306816101, + "learning_rate": 7.474341192787795e-06, + "loss": 0.0437, + "step": 3652 + }, + { + "epoch": 2.5306546588153793, + "grad_norm": 0.6419147849082947, + "learning_rate": 7.47364771151179e-06, + "loss": 0.048, + "step": 3653 + }, + { + "epoch": 2.531347419466574, + "grad_norm": 0.7547383308410645, + "learning_rate": 7.472954230235784e-06, + "loss": 0.0609, + "step": 3654 + }, + { + "epoch": 2.5320401801177694, + "grad_norm": 0.748364269733429, + "learning_rate": 7.472260748959779e-06, + "loss": 0.0601, + "step": 3655 + }, + { + "epoch": 2.5327329407689643, + "grad_norm": 0.7975109219551086, + "learning_rate": 7.471567267683774e-06, + "loss": 0.0582, + "step": 3656 + }, + { + "epoch": 2.5334257014201595, + "grad_norm": 0.7377816438674927, + "learning_rate": 7.470873786407767e-06, + "loss": 0.0607, + "step": 3657 + }, + { + "epoch": 2.5341184620713544, + "grad_norm": 0.7805709838867188, + "learning_rate": 7.470180305131762e-06, + "loss": 0.0727, + "step": 3658 + }, + { + "epoch": 2.534811222722549, + "grad_norm": 0.7987939119338989, + "learning_rate": 7.469486823855756e-06, + "loss": 0.0602, + "step": 3659 + }, + { + "epoch": 2.5355039833737445, + "grad_norm": 0.7342275381088257, + "learning_rate": 7.468793342579751e-06, + "loss": 0.0633, + "step": 3660 + }, + { + "epoch": 2.5361967440249393, + "grad_norm": 0.7222427129745483, + "learning_rate": 7.468099861303746e-06, + "loss": 0.0514, + "step": 3661 + }, + { + "epoch": 2.536889504676134, + "grad_norm": 0.7093556523323059, + "learning_rate": 7.46740638002774e-06, + "loss": 0.06, + "step": 3662 + }, + { + "epoch": 2.5375822653273294, + "grad_norm": 0.7948033809661865, + "learning_rate": 7.466712898751735e-06, + "loss": 0.0564, + "step": 3663 + }, + { + "epoch": 2.5382750259785243, + "grad_norm": 0.7399566173553467, + "learning_rate": 7.466019417475728e-06, + "loss": 0.0563, + "step": 3664 + }, + { + "epoch": 2.5389677866297196, + "grad_norm": 0.7637043595314026, + "learning_rate": 7.465325936199723e-06, + "loss": 0.0689, + "step": 3665 + }, + { + "epoch": 2.5396605472809144, + "grad_norm": 0.657764732837677, + "learning_rate": 7.464632454923718e-06, + "loss": 0.0432, + "step": 3666 + }, + { + "epoch": 2.5403533079321097, + "grad_norm": 0.6431865096092224, + "learning_rate": 7.463938973647712e-06, + "loss": 0.0391, + "step": 3667 + }, + { + "epoch": 2.5410460685833045, + "grad_norm": 0.6540981531143188, + "learning_rate": 7.463245492371707e-06, + "loss": 0.0447, + "step": 3668 + }, + { + "epoch": 2.5417388292344993, + "grad_norm": 0.6576544046401978, + "learning_rate": 7.4625520110957e-06, + "loss": 0.0508, + "step": 3669 + }, + { + "epoch": 2.5424315898856946, + "grad_norm": 0.8123604655265808, + "learning_rate": 7.461858529819695e-06, + "loss": 0.0706, + "step": 3670 + }, + { + "epoch": 2.5431243505368895, + "grad_norm": 0.8841568231582642, + "learning_rate": 7.46116504854369e-06, + "loss": 0.0467, + "step": 3671 + }, + { + "epoch": 2.5438171111880843, + "grad_norm": 0.7083867788314819, + "learning_rate": 7.460471567267684e-06, + "loss": 0.0654, + "step": 3672 + }, + { + "epoch": 2.5445098718392796, + "grad_norm": 0.6732116341590881, + "learning_rate": 7.459778085991679e-06, + "loss": 0.0502, + "step": 3673 + }, + { + "epoch": 2.5452026324904744, + "grad_norm": 0.6401447653770447, + "learning_rate": 7.4590846047156725e-06, + "loss": 0.04, + "step": 3674 + }, + { + "epoch": 2.5458953931416697, + "grad_norm": 0.6104236245155334, + "learning_rate": 7.458391123439667e-06, + "loss": 0.0427, + "step": 3675 + }, + { + "epoch": 2.5465881537928645, + "grad_norm": 0.9199113845825195, + "learning_rate": 7.457697642163662e-06, + "loss": 0.0601, + "step": 3676 + }, + { + "epoch": 2.54728091444406, + "grad_norm": 0.7299613952636719, + "learning_rate": 7.457004160887656e-06, + "loss": 0.0601, + "step": 3677 + }, + { + "epoch": 2.5479736750952546, + "grad_norm": 0.678725004196167, + "learning_rate": 7.456310679611651e-06, + "loss": 0.0492, + "step": 3678 + }, + { + "epoch": 2.5486664357464495, + "grad_norm": 0.7396339178085327, + "learning_rate": 7.4556171983356454e-06, + "loss": 0.0465, + "step": 3679 + }, + { + "epoch": 2.5493591963976447, + "grad_norm": 0.7016897797584534, + "learning_rate": 7.45492371705964e-06, + "loss": 0.0634, + "step": 3680 + }, + { + "epoch": 2.5500519570488396, + "grad_norm": 0.6942267417907715, + "learning_rate": 7.454230235783635e-06, + "loss": 0.0471, + "step": 3681 + }, + { + "epoch": 2.5507447177000344, + "grad_norm": 0.6844359040260315, + "learning_rate": 7.4535367545076285e-06, + "loss": 0.0564, + "step": 3682 + }, + { + "epoch": 2.5514374783512297, + "grad_norm": 0.6959248781204224, + "learning_rate": 7.4528432732316235e-06, + "loss": 0.0445, + "step": 3683 + }, + { + "epoch": 2.5521302390024245, + "grad_norm": 0.6269494891166687, + "learning_rate": 7.4521497919556176e-06, + "loss": 0.0393, + "step": 3684 + }, + { + "epoch": 2.55282299965362, + "grad_norm": 0.6538231372833252, + "learning_rate": 7.4514563106796125e-06, + "loss": 0.055, + "step": 3685 + }, + { + "epoch": 2.5535157603048146, + "grad_norm": 0.9394417405128479, + "learning_rate": 7.4507628294036074e-06, + "loss": 0.0826, + "step": 3686 + }, + { + "epoch": 2.55420852095601, + "grad_norm": 0.7278565168380737, + "learning_rate": 7.450069348127601e-06, + "loss": 0.0533, + "step": 3687 + }, + { + "epoch": 2.5549012816072048, + "grad_norm": 0.6613569259643555, + "learning_rate": 7.449375866851596e-06, + "loss": 0.0539, + "step": 3688 + }, + { + "epoch": 2.5555940422583996, + "grad_norm": 0.7118561863899231, + "learning_rate": 7.44868238557559e-06, + "loss": 0.0542, + "step": 3689 + }, + { + "epoch": 2.556286802909595, + "grad_norm": 0.61842280626297, + "learning_rate": 7.447988904299585e-06, + "loss": 0.0453, + "step": 3690 + }, + { + "epoch": 2.5569795635607897, + "grad_norm": 0.6063533425331116, + "learning_rate": 7.4472954230235796e-06, + "loss": 0.0535, + "step": 3691 + }, + { + "epoch": 2.5576723242119845, + "grad_norm": 0.7033942937850952, + "learning_rate": 7.446601941747574e-06, + "loss": 0.0534, + "step": 3692 + }, + { + "epoch": 2.55836508486318, + "grad_norm": 0.6714750528335571, + "learning_rate": 7.445908460471568e-06, + "loss": 0.0581, + "step": 3693 + }, + { + "epoch": 2.5590578455143747, + "grad_norm": 0.707249641418457, + "learning_rate": 7.445214979195562e-06, + "loss": 0.0593, + "step": 3694 + }, + { + "epoch": 2.55975060616557, + "grad_norm": 0.6404327750205994, + "learning_rate": 7.444521497919557e-06, + "loss": 0.0448, + "step": 3695 + }, + { + "epoch": 2.5604433668167648, + "grad_norm": 0.6509929299354553, + "learning_rate": 7.443828016643552e-06, + "loss": 0.0489, + "step": 3696 + }, + { + "epoch": 2.56113612746796, + "grad_norm": 0.6667312383651733, + "learning_rate": 7.443134535367546e-06, + "loss": 0.0476, + "step": 3697 + }, + { + "epoch": 2.561828888119155, + "grad_norm": 0.6964579224586487, + "learning_rate": 7.442441054091541e-06, + "loss": 0.0631, + "step": 3698 + }, + { + "epoch": 2.5625216487703497, + "grad_norm": 0.6812249422073364, + "learning_rate": 7.441747572815534e-06, + "loss": 0.0583, + "step": 3699 + }, + { + "epoch": 2.563214409421545, + "grad_norm": 0.6796555519104004, + "learning_rate": 7.441054091539529e-06, + "loss": 0.0618, + "step": 3700 + }, + { + "epoch": 2.56390717007274, + "grad_norm": 0.7967904210090637, + "learning_rate": 7.440360610263524e-06, + "loss": 0.0562, + "step": 3701 + }, + { + "epoch": 2.5645999307239347, + "grad_norm": 0.7128233909606934, + "learning_rate": 7.439667128987518e-06, + "loss": 0.0566, + "step": 3702 + }, + { + "epoch": 2.56529269137513, + "grad_norm": 0.7470800280570984, + "learning_rate": 7.438973647711513e-06, + "loss": 0.062, + "step": 3703 + }, + { + "epoch": 2.5659854520263248, + "grad_norm": 0.6518545150756836, + "learning_rate": 7.438280166435506e-06, + "loss": 0.051, + "step": 3704 + }, + { + "epoch": 2.56667821267752, + "grad_norm": 0.6771643757820129, + "learning_rate": 7.437586685159501e-06, + "loss": 0.0537, + "step": 3705 + }, + { + "epoch": 2.567370973328715, + "grad_norm": 0.6569119691848755, + "learning_rate": 7.436893203883496e-06, + "loss": 0.0452, + "step": 3706 + }, + { + "epoch": 2.56806373397991, + "grad_norm": 0.7679269909858704, + "learning_rate": 7.43619972260749e-06, + "loss": 0.0578, + "step": 3707 + }, + { + "epoch": 2.568756494631105, + "grad_norm": 0.7990003228187561, + "learning_rate": 7.435506241331485e-06, + "loss": 0.0688, + "step": 3708 + }, + { + "epoch": 2.5694492552823, + "grad_norm": 0.7669041156768799, + "learning_rate": 7.434812760055479e-06, + "loss": 0.0665, + "step": 3709 + }, + { + "epoch": 2.570142015933495, + "grad_norm": 0.6859337091445923, + "learning_rate": 7.434119278779474e-06, + "loss": 0.0571, + "step": 3710 + }, + { + "epoch": 2.57083477658469, + "grad_norm": 0.6683074831962585, + "learning_rate": 7.433425797503469e-06, + "loss": 0.0491, + "step": 3711 + }, + { + "epoch": 2.571527537235885, + "grad_norm": 0.6565935611724854, + "learning_rate": 7.432732316227462e-06, + "loss": 0.0554, + "step": 3712 + }, + { + "epoch": 2.57222029788708, + "grad_norm": 0.7693780660629272, + "learning_rate": 7.432038834951457e-06, + "loss": 0.0686, + "step": 3713 + }, + { + "epoch": 2.572913058538275, + "grad_norm": 0.7004335522651672, + "learning_rate": 7.431345353675451e-06, + "loss": 0.0581, + "step": 3714 + }, + { + "epoch": 2.57360581918947, + "grad_norm": 0.6624301075935364, + "learning_rate": 7.430651872399446e-06, + "loss": 0.0513, + "step": 3715 + }, + { + "epoch": 2.574298579840665, + "grad_norm": 0.6729522943496704, + "learning_rate": 7.429958391123441e-06, + "loss": 0.0477, + "step": 3716 + }, + { + "epoch": 2.5749913404918603, + "grad_norm": 0.7227064967155457, + "learning_rate": 7.429264909847434e-06, + "loss": 0.0543, + "step": 3717 + }, + { + "epoch": 2.575684101143055, + "grad_norm": 0.7444384694099426, + "learning_rate": 7.428571428571429e-06, + "loss": 0.0491, + "step": 3718 + }, + { + "epoch": 2.57637686179425, + "grad_norm": 0.7027382850646973, + "learning_rate": 7.427877947295423e-06, + "loss": 0.0582, + "step": 3719 + }, + { + "epoch": 2.5770696224454452, + "grad_norm": 0.7209217548370361, + "learning_rate": 7.427184466019418e-06, + "loss": 0.0618, + "step": 3720 + }, + { + "epoch": 2.57776238309664, + "grad_norm": 0.6173632740974426, + "learning_rate": 7.426490984743413e-06, + "loss": 0.0598, + "step": 3721 + }, + { + "epoch": 2.578455143747835, + "grad_norm": 0.8661364316940308, + "learning_rate": 7.425797503467406e-06, + "loss": 0.0627, + "step": 3722 + }, + { + "epoch": 2.57914790439903, + "grad_norm": 0.7888517379760742, + "learning_rate": 7.425104022191401e-06, + "loss": 0.074, + "step": 3723 + }, + { + "epoch": 2.579840665050225, + "grad_norm": 0.6954256296157837, + "learning_rate": 7.424410540915395e-06, + "loss": 0.0542, + "step": 3724 + }, + { + "epoch": 2.5805334257014203, + "grad_norm": 1.132676362991333, + "learning_rate": 7.42371705963939e-06, + "loss": 0.0602, + "step": 3725 + }, + { + "epoch": 2.581226186352615, + "grad_norm": 0.7591542601585388, + "learning_rate": 7.423023578363385e-06, + "loss": 0.0621, + "step": 3726 + }, + { + "epoch": 2.5819189470038104, + "grad_norm": 0.6864979267120361, + "learning_rate": 7.422330097087379e-06, + "loss": 0.0474, + "step": 3727 + }, + { + "epoch": 2.5826117076550053, + "grad_norm": 0.6854942440986633, + "learning_rate": 7.421636615811374e-06, + "loss": 0.0481, + "step": 3728 + }, + { + "epoch": 2.5833044683062, + "grad_norm": 0.8545223474502563, + "learning_rate": 7.4209431345353675e-06, + "loss": 0.0613, + "step": 3729 + }, + { + "epoch": 2.5839972289573954, + "grad_norm": 0.699947714805603, + "learning_rate": 7.4202496532593625e-06, + "loss": 0.0574, + "step": 3730 + }, + { + "epoch": 2.58468998960859, + "grad_norm": 0.7740626931190491, + "learning_rate": 7.419556171983357e-06, + "loss": 0.0513, + "step": 3731 + }, + { + "epoch": 2.585382750259785, + "grad_norm": 0.6306399703025818, + "learning_rate": 7.4188626907073515e-06, + "loss": 0.047, + "step": 3732 + }, + { + "epoch": 2.5860755109109803, + "grad_norm": 0.7971992492675781, + "learning_rate": 7.4181692094313464e-06, + "loss": 0.0618, + "step": 3733 + }, + { + "epoch": 2.586768271562175, + "grad_norm": 0.7833073735237122, + "learning_rate": 7.41747572815534e-06, + "loss": 0.0512, + "step": 3734 + }, + { + "epoch": 2.5874610322133704, + "grad_norm": 0.6905516386032104, + "learning_rate": 7.416782246879335e-06, + "loss": 0.0526, + "step": 3735 + }, + { + "epoch": 2.5881537928645653, + "grad_norm": 0.8069033622741699, + "learning_rate": 7.4160887656033295e-06, + "loss": 0.0674, + "step": 3736 + }, + { + "epoch": 2.5888465535157605, + "grad_norm": 0.7322914600372314, + "learning_rate": 7.415395284327324e-06, + "loss": 0.0582, + "step": 3737 + }, + { + "epoch": 2.5895393141669554, + "grad_norm": 0.6214407682418823, + "learning_rate": 7.4147018030513186e-06, + "loss": 0.0344, + "step": 3738 + }, + { + "epoch": 2.59023207481815, + "grad_norm": 0.7403656840324402, + "learning_rate": 7.414008321775313e-06, + "loss": 0.0589, + "step": 3739 + }, + { + "epoch": 2.5909248354693455, + "grad_norm": 0.6851581931114197, + "learning_rate": 7.4133148404993076e-06, + "loss": 0.061, + "step": 3740 + }, + { + "epoch": 2.5916175961205403, + "grad_norm": 0.6936320066452026, + "learning_rate": 7.412621359223302e-06, + "loss": 0.0521, + "step": 3741 + }, + { + "epoch": 2.592310356771735, + "grad_norm": 0.7396749258041382, + "learning_rate": 7.411927877947296e-06, + "loss": 0.0614, + "step": 3742 + }, + { + "epoch": 2.5930031174229304, + "grad_norm": 0.6849899291992188, + "learning_rate": 7.411234396671291e-06, + "loss": 0.05, + "step": 3743 + }, + { + "epoch": 2.5936958780741253, + "grad_norm": 0.8668308854103088, + "learning_rate": 7.410540915395285e-06, + "loss": 0.0688, + "step": 3744 + }, + { + "epoch": 2.5943886387253206, + "grad_norm": 0.7260075211524963, + "learning_rate": 7.40984743411928e-06, + "loss": 0.064, + "step": 3745 + }, + { + "epoch": 2.5950813993765154, + "grad_norm": 0.8617621660232544, + "learning_rate": 7.409153952843275e-06, + "loss": 0.0644, + "step": 3746 + }, + { + "epoch": 2.5957741600277107, + "grad_norm": 0.6431972980499268, + "learning_rate": 7.408460471567268e-06, + "loss": 0.0535, + "step": 3747 + }, + { + "epoch": 2.5964669206789055, + "grad_norm": 0.7590421438217163, + "learning_rate": 7.407766990291263e-06, + "loss": 0.0529, + "step": 3748 + }, + { + "epoch": 2.5971596813301003, + "grad_norm": 0.8376240134239197, + "learning_rate": 7.407073509015257e-06, + "loss": 0.0634, + "step": 3749 + }, + { + "epoch": 2.5978524419812956, + "grad_norm": 0.5832990407943726, + "learning_rate": 7.406380027739252e-06, + "loss": 0.0357, + "step": 3750 + }, + { + "epoch": 2.5985452026324904, + "grad_norm": 0.6788563132286072, + "learning_rate": 7.405686546463247e-06, + "loss": 0.0624, + "step": 3751 + }, + { + "epoch": 2.5992379632836853, + "grad_norm": 0.8296613097190857, + "learning_rate": 7.40499306518724e-06, + "loss": 0.0553, + "step": 3752 + }, + { + "epoch": 2.5999307239348806, + "grad_norm": 0.7007006406784058, + "learning_rate": 7.404299583911235e-06, + "loss": 0.0465, + "step": 3753 + }, + { + "epoch": 2.6006234845860754, + "grad_norm": 0.6576237082481384, + "learning_rate": 7.403606102635229e-06, + "loss": 0.0453, + "step": 3754 + }, + { + "epoch": 2.6013162452372707, + "grad_norm": 0.6901983022689819, + "learning_rate": 7.402912621359224e-06, + "loss": 0.0573, + "step": 3755 + }, + { + "epoch": 2.6020090058884655, + "grad_norm": 0.6307435631752014, + "learning_rate": 7.402219140083219e-06, + "loss": 0.0479, + "step": 3756 + }, + { + "epoch": 2.602701766539661, + "grad_norm": 0.5679149627685547, + "learning_rate": 7.401525658807213e-06, + "loss": 0.038, + "step": 3757 + }, + { + "epoch": 2.6033945271908556, + "grad_norm": 0.7377214431762695, + "learning_rate": 7.400832177531208e-06, + "loss": 0.0553, + "step": 3758 + }, + { + "epoch": 2.6040872878420505, + "grad_norm": 0.6187472939491272, + "learning_rate": 7.400138696255201e-06, + "loss": 0.041, + "step": 3759 + }, + { + "epoch": 2.6047800484932457, + "grad_norm": 1.044419527053833, + "learning_rate": 7.399445214979196e-06, + "loss": 0.0686, + "step": 3760 + }, + { + "epoch": 2.6054728091444406, + "grad_norm": 0.6558099389076233, + "learning_rate": 7.398751733703191e-06, + "loss": 0.0551, + "step": 3761 + }, + { + "epoch": 2.6061655697956354, + "grad_norm": 0.6940314769744873, + "learning_rate": 7.398058252427185e-06, + "loss": 0.0474, + "step": 3762 + }, + { + "epoch": 2.6068583304468307, + "grad_norm": 0.6689845323562622, + "learning_rate": 7.39736477115118e-06, + "loss": 0.0433, + "step": 3763 + }, + { + "epoch": 2.6075510910980255, + "grad_norm": 0.688528835773468, + "learning_rate": 7.396671289875173e-06, + "loss": 0.0686, + "step": 3764 + }, + { + "epoch": 2.608243851749221, + "grad_norm": 0.747090756893158, + "learning_rate": 7.395977808599168e-06, + "loss": 0.0538, + "step": 3765 + }, + { + "epoch": 2.6089366124004156, + "grad_norm": 0.6368103623390198, + "learning_rate": 7.395284327323163e-06, + "loss": 0.0449, + "step": 3766 + }, + { + "epoch": 2.609629373051611, + "grad_norm": 0.7641220092773438, + "learning_rate": 7.394590846047157e-06, + "loss": 0.0643, + "step": 3767 + }, + { + "epoch": 2.6103221337028057, + "grad_norm": 0.7362112402915955, + "learning_rate": 7.393897364771152e-06, + "loss": 0.0633, + "step": 3768 + }, + { + "epoch": 2.6110148943540006, + "grad_norm": 0.6080925464630127, + "learning_rate": 7.393203883495146e-06, + "loss": 0.0476, + "step": 3769 + }, + { + "epoch": 2.611707655005196, + "grad_norm": 0.6577709913253784, + "learning_rate": 7.39251040221914e-06, + "loss": 0.047, + "step": 3770 + }, + { + "epoch": 2.6124004156563907, + "grad_norm": 0.8339235186576843, + "learning_rate": 7.391816920943135e-06, + "loss": 0.0566, + "step": 3771 + }, + { + "epoch": 2.6130931763075855, + "grad_norm": 0.6208629012107849, + "learning_rate": 7.391123439667129e-06, + "loss": 0.0461, + "step": 3772 + }, + { + "epoch": 2.613785936958781, + "grad_norm": 0.6897596120834351, + "learning_rate": 7.390429958391124e-06, + "loss": 0.0494, + "step": 3773 + }, + { + "epoch": 2.6144786976099756, + "grad_norm": 0.6946137547492981, + "learning_rate": 7.389736477115118e-06, + "loss": 0.049, + "step": 3774 + }, + { + "epoch": 2.615171458261171, + "grad_norm": 0.6451416015625, + "learning_rate": 7.389042995839113e-06, + "loss": 0.0481, + "step": 3775 + }, + { + "epoch": 2.6158642189123658, + "grad_norm": 0.7888498306274414, + "learning_rate": 7.388349514563108e-06, + "loss": 0.0563, + "step": 3776 + }, + { + "epoch": 2.616556979563561, + "grad_norm": 0.6911794543266296, + "learning_rate": 7.3876560332871015e-06, + "loss": 0.0538, + "step": 3777 + }, + { + "epoch": 2.617249740214756, + "grad_norm": 0.6358610987663269, + "learning_rate": 7.386962552011096e-06, + "loss": 0.0589, + "step": 3778 + }, + { + "epoch": 2.6179425008659507, + "grad_norm": 0.6725031137466431, + "learning_rate": 7.3862690707350905e-06, + "loss": 0.0567, + "step": 3779 + }, + { + "epoch": 2.618635261517146, + "grad_norm": 0.6633414030075073, + "learning_rate": 7.3855755894590854e-06, + "loss": 0.0378, + "step": 3780 + }, + { + "epoch": 2.619328022168341, + "grad_norm": 0.8340132236480713, + "learning_rate": 7.38488210818308e-06, + "loss": 0.0588, + "step": 3781 + }, + { + "epoch": 2.6200207828195357, + "grad_norm": 0.6586992144584656, + "learning_rate": 7.384188626907074e-06, + "loss": 0.0527, + "step": 3782 + }, + { + "epoch": 2.620713543470731, + "grad_norm": 0.8827303051948547, + "learning_rate": 7.3834951456310685e-06, + "loss": 0.0596, + "step": 3783 + }, + { + "epoch": 2.6214063041219258, + "grad_norm": 0.6179441809654236, + "learning_rate": 7.382801664355063e-06, + "loss": 0.0474, + "step": 3784 + }, + { + "epoch": 2.622099064773121, + "grad_norm": 0.6691875457763672, + "learning_rate": 7.3821081830790576e-06, + "loss": 0.0564, + "step": 3785 + }, + { + "epoch": 2.622791825424316, + "grad_norm": 0.8572561740875244, + "learning_rate": 7.3814147018030525e-06, + "loss": 0.0599, + "step": 3786 + }, + { + "epoch": 2.623484586075511, + "grad_norm": 0.5666581392288208, + "learning_rate": 7.3807212205270466e-06, + "loss": 0.0433, + "step": 3787 + }, + { + "epoch": 2.624177346726706, + "grad_norm": 0.7028751373291016, + "learning_rate": 7.3800277392510415e-06, + "loss": 0.0548, + "step": 3788 + }, + { + "epoch": 2.624870107377901, + "grad_norm": 0.7753036618232727, + "learning_rate": 7.379334257975035e-06, + "loss": 0.0659, + "step": 3789 + }, + { + "epoch": 2.625562868029096, + "grad_norm": 0.6794808506965637, + "learning_rate": 7.37864077669903e-06, + "loss": 0.0572, + "step": 3790 + }, + { + "epoch": 2.626255628680291, + "grad_norm": 0.6950632333755493, + "learning_rate": 7.377947295423025e-06, + "loss": 0.0594, + "step": 3791 + }, + { + "epoch": 2.626948389331486, + "grad_norm": 0.8033772110939026, + "learning_rate": 7.377253814147019e-06, + "loss": 0.0565, + "step": 3792 + }, + { + "epoch": 2.627641149982681, + "grad_norm": 0.6774613857269287, + "learning_rate": 7.376560332871014e-06, + "loss": 0.057, + "step": 3793 + }, + { + "epoch": 2.628333910633876, + "grad_norm": 0.6674844026565552, + "learning_rate": 7.375866851595007e-06, + "loss": 0.0519, + "step": 3794 + }, + { + "epoch": 2.629026671285071, + "grad_norm": 0.603571891784668, + "learning_rate": 7.375173370319002e-06, + "loss": 0.0446, + "step": 3795 + }, + { + "epoch": 2.629719431936266, + "grad_norm": 0.7092214226722717, + "learning_rate": 7.374479889042997e-06, + "loss": 0.0502, + "step": 3796 + }, + { + "epoch": 2.6304121925874613, + "grad_norm": 0.6835474371910095, + "learning_rate": 7.373786407766991e-06, + "loss": 0.0511, + "step": 3797 + }, + { + "epoch": 2.631104953238656, + "grad_norm": 0.670049786567688, + "learning_rate": 7.373092926490986e-06, + "loss": 0.0556, + "step": 3798 + }, + { + "epoch": 2.631797713889851, + "grad_norm": 0.7796568274497986, + "learning_rate": 7.372399445214979e-06, + "loss": 0.071, + "step": 3799 + }, + { + "epoch": 2.6324904745410462, + "grad_norm": 0.8385136127471924, + "learning_rate": 7.371705963938974e-06, + "loss": 0.0555, + "step": 3800 + }, + { + "epoch": 2.633183235192241, + "grad_norm": 0.7091666460037231, + "learning_rate": 7.371012482662969e-06, + "loss": 0.0554, + "step": 3801 + }, + { + "epoch": 2.633875995843436, + "grad_norm": 0.7389783263206482, + "learning_rate": 7.370319001386963e-06, + "loss": 0.0551, + "step": 3802 + }, + { + "epoch": 2.634568756494631, + "grad_norm": 0.7565779089927673, + "learning_rate": 7.369625520110958e-06, + "loss": 0.0445, + "step": 3803 + }, + { + "epoch": 2.635261517145826, + "grad_norm": 0.6586689352989197, + "learning_rate": 7.368932038834952e-06, + "loss": 0.0493, + "step": 3804 + }, + { + "epoch": 2.6359542777970213, + "grad_norm": 0.7760875225067139, + "learning_rate": 7.368238557558947e-06, + "loss": 0.0735, + "step": 3805 + }, + { + "epoch": 2.636647038448216, + "grad_norm": 0.735569953918457, + "learning_rate": 7.367545076282942e-06, + "loss": 0.0523, + "step": 3806 + }, + { + "epoch": 2.6373397990994114, + "grad_norm": 0.6939058303833008, + "learning_rate": 7.366851595006935e-06, + "loss": 0.051, + "step": 3807 + }, + { + "epoch": 2.6380325597506062, + "grad_norm": 0.615863025188446, + "learning_rate": 7.36615811373093e-06, + "loss": 0.0416, + "step": 3808 + }, + { + "epoch": 2.638725320401801, + "grad_norm": 0.6464378833770752, + "learning_rate": 7.365464632454924e-06, + "loss": 0.0583, + "step": 3809 + }, + { + "epoch": 2.6394180810529964, + "grad_norm": 0.7069610357284546, + "learning_rate": 7.364771151178919e-06, + "loss": 0.0585, + "step": 3810 + }, + { + "epoch": 2.640110841704191, + "grad_norm": 0.7769689559936523, + "learning_rate": 7.364077669902914e-06, + "loss": 0.0625, + "step": 3811 + }, + { + "epoch": 2.640803602355386, + "grad_norm": 0.7485852837562561, + "learning_rate": 7.363384188626907e-06, + "loss": 0.0539, + "step": 3812 + }, + { + "epoch": 2.6414963630065813, + "grad_norm": 0.7916412353515625, + "learning_rate": 7.362690707350902e-06, + "loss": 0.0566, + "step": 3813 + }, + { + "epoch": 2.642189123657776, + "grad_norm": 0.5960851907730103, + "learning_rate": 7.361997226074896e-06, + "loss": 0.0473, + "step": 3814 + }, + { + "epoch": 2.642881884308971, + "grad_norm": 0.658810555934906, + "learning_rate": 7.361303744798891e-06, + "loss": 0.0491, + "step": 3815 + }, + { + "epoch": 2.6435746449601663, + "grad_norm": 0.7364745736122131, + "learning_rate": 7.360610263522886e-06, + "loss": 0.0609, + "step": 3816 + }, + { + "epoch": 2.6442674056113615, + "grad_norm": 0.688632071018219, + "learning_rate": 7.35991678224688e-06, + "loss": 0.0604, + "step": 3817 + }, + { + "epoch": 2.6449601662625564, + "grad_norm": 0.7310763001441956, + "learning_rate": 7.359223300970874e-06, + "loss": 0.0593, + "step": 3818 + }, + { + "epoch": 2.645652926913751, + "grad_norm": 0.6695185899734497, + "learning_rate": 7.358529819694868e-06, + "loss": 0.0491, + "step": 3819 + }, + { + "epoch": 2.6463456875649465, + "grad_norm": 0.6718903183937073, + "learning_rate": 7.357836338418863e-06, + "loss": 0.0671, + "step": 3820 + }, + { + "epoch": 2.6470384482161413, + "grad_norm": 0.6498568654060364, + "learning_rate": 7.357142857142858e-06, + "loss": 0.0485, + "step": 3821 + }, + { + "epoch": 2.647731208867336, + "grad_norm": 0.700734555721283, + "learning_rate": 7.356449375866852e-06, + "loss": 0.0522, + "step": 3822 + }, + { + "epoch": 2.6484239695185314, + "grad_norm": 0.7129335403442383, + "learning_rate": 7.355755894590847e-06, + "loss": 0.0512, + "step": 3823 + }, + { + "epoch": 2.6491167301697263, + "grad_norm": 0.9340489506721497, + "learning_rate": 7.3550624133148405e-06, + "loss": 0.0557, + "step": 3824 + }, + { + "epoch": 2.649809490820921, + "grad_norm": 0.6520780324935913, + "learning_rate": 7.354368932038835e-06, + "loss": 0.0494, + "step": 3825 + }, + { + "epoch": 2.6505022514721164, + "grad_norm": 0.6372048258781433, + "learning_rate": 7.35367545076283e-06, + "loss": 0.0467, + "step": 3826 + }, + { + "epoch": 2.6511950121233117, + "grad_norm": 0.6569356322288513, + "learning_rate": 7.3529819694868244e-06, + "loss": 0.0486, + "step": 3827 + }, + { + "epoch": 2.6518877727745065, + "grad_norm": 0.720349133014679, + "learning_rate": 7.352288488210819e-06, + "loss": 0.053, + "step": 3828 + }, + { + "epoch": 2.6525805334257013, + "grad_norm": 0.6570484042167664, + "learning_rate": 7.351595006934813e-06, + "loss": 0.0519, + "step": 3829 + }, + { + "epoch": 2.6532732940768966, + "grad_norm": 0.7745928168296814, + "learning_rate": 7.3509015256588075e-06, + "loss": 0.0681, + "step": 3830 + }, + { + "epoch": 2.6539660547280914, + "grad_norm": 0.6763771176338196, + "learning_rate": 7.3502080443828025e-06, + "loss": 0.0567, + "step": 3831 + }, + { + "epoch": 2.6546588153792863, + "grad_norm": 0.8067436814308167, + "learning_rate": 7.3495145631067966e-06, + "loss": 0.0542, + "step": 3832 + }, + { + "epoch": 2.6553515760304816, + "grad_norm": 0.7435930967330933, + "learning_rate": 7.3488210818307915e-06, + "loss": 0.0482, + "step": 3833 + }, + { + "epoch": 2.6560443366816764, + "grad_norm": 0.8211119174957275, + "learning_rate": 7.3481276005547856e-06, + "loss": 0.072, + "step": 3834 + }, + { + "epoch": 2.6567370973328712, + "grad_norm": 0.6172375082969666, + "learning_rate": 7.3474341192787805e-06, + "loss": 0.0484, + "step": 3835 + }, + { + "epoch": 2.6574298579840665, + "grad_norm": 0.789193332195282, + "learning_rate": 7.3467406380027754e-06, + "loss": 0.0478, + "step": 3836 + }, + { + "epoch": 2.658122618635262, + "grad_norm": 0.8191679120063782, + "learning_rate": 7.346047156726769e-06, + "loss": 0.0649, + "step": 3837 + }, + { + "epoch": 2.6588153792864566, + "grad_norm": 1.0866827964782715, + "learning_rate": 7.345353675450764e-06, + "loss": 0.0748, + "step": 3838 + }, + { + "epoch": 2.6595081399376514, + "grad_norm": 0.6912830471992493, + "learning_rate": 7.344660194174758e-06, + "loss": 0.0607, + "step": 3839 + }, + { + "epoch": 2.6602009005888467, + "grad_norm": 0.7192431688308716, + "learning_rate": 7.343966712898753e-06, + "loss": 0.042, + "step": 3840 + }, + { + "epoch": 2.6608936612400416, + "grad_norm": 0.7954387664794922, + "learning_rate": 7.3432732316227476e-06, + "loss": 0.0649, + "step": 3841 + }, + { + "epoch": 2.6615864218912364, + "grad_norm": 0.9418224096298218, + "learning_rate": 7.342579750346741e-06, + "loss": 0.0694, + "step": 3842 + }, + { + "epoch": 2.6622791825424317, + "grad_norm": 0.7117047309875488, + "learning_rate": 7.341886269070736e-06, + "loss": 0.0531, + "step": 3843 + }, + { + "epoch": 2.6629719431936265, + "grad_norm": 0.7617055773735046, + "learning_rate": 7.34119278779473e-06, + "loss": 0.0633, + "step": 3844 + }, + { + "epoch": 2.6636647038448213, + "grad_norm": 0.744536280632019, + "learning_rate": 7.340499306518725e-06, + "loss": 0.0692, + "step": 3845 + }, + { + "epoch": 2.6643574644960166, + "grad_norm": 0.7266493439674377, + "learning_rate": 7.33980582524272e-06, + "loss": 0.0625, + "step": 3846 + }, + { + "epoch": 2.665050225147212, + "grad_norm": 0.5693483948707581, + "learning_rate": 7.339112343966713e-06, + "loss": 0.0429, + "step": 3847 + }, + { + "epoch": 2.6657429857984067, + "grad_norm": 0.6484627723693848, + "learning_rate": 7.338418862690708e-06, + "loss": 0.0508, + "step": 3848 + }, + { + "epoch": 2.6664357464496016, + "grad_norm": 0.6556325554847717, + "learning_rate": 7.337725381414702e-06, + "loss": 0.0539, + "step": 3849 + }, + { + "epoch": 2.667128507100797, + "grad_norm": 0.749572217464447, + "learning_rate": 7.337031900138697e-06, + "loss": 0.0647, + "step": 3850 + }, + { + "epoch": 2.6678212677519917, + "grad_norm": 0.7320320010185242, + "learning_rate": 7.336338418862692e-06, + "loss": 0.0563, + "step": 3851 + }, + { + "epoch": 2.6685140284031865, + "grad_norm": 0.720440149307251, + "learning_rate": 7.335644937586686e-06, + "loss": 0.0627, + "step": 3852 + }, + { + "epoch": 2.669206789054382, + "grad_norm": 0.6776395440101624, + "learning_rate": 7.334951456310681e-06, + "loss": 0.0525, + "step": 3853 + }, + { + "epoch": 2.6698995497055766, + "grad_norm": 0.7585930824279785, + "learning_rate": 7.334257975034674e-06, + "loss": 0.0517, + "step": 3854 + }, + { + "epoch": 2.6705923103567715, + "grad_norm": 0.8265200853347778, + "learning_rate": 7.333564493758669e-06, + "loss": 0.0703, + "step": 3855 + }, + { + "epoch": 2.6712850710079667, + "grad_norm": 0.667201042175293, + "learning_rate": 7.332871012482664e-06, + "loss": 0.0548, + "step": 3856 + }, + { + "epoch": 2.671977831659162, + "grad_norm": 0.6452093720436096, + "learning_rate": 7.332177531206658e-06, + "loss": 0.0458, + "step": 3857 + }, + { + "epoch": 2.672670592310357, + "grad_norm": 0.7274707555770874, + "learning_rate": 7.331484049930653e-06, + "loss": 0.0637, + "step": 3858 + }, + { + "epoch": 2.6733633529615517, + "grad_norm": 0.7111242413520813, + "learning_rate": 7.330790568654646e-06, + "loss": 0.0634, + "step": 3859 + }, + { + "epoch": 2.674056113612747, + "grad_norm": 0.679294764995575, + "learning_rate": 7.330097087378641e-06, + "loss": 0.0484, + "step": 3860 + }, + { + "epoch": 2.674748874263942, + "grad_norm": 0.7324373722076416, + "learning_rate": 7.329403606102636e-06, + "loss": 0.0519, + "step": 3861 + }, + { + "epoch": 2.6754416349151366, + "grad_norm": 0.7865861058235168, + "learning_rate": 7.32871012482663e-06, + "loss": 0.0564, + "step": 3862 + }, + { + "epoch": 2.676134395566332, + "grad_norm": 0.6728985905647278, + "learning_rate": 7.328016643550625e-06, + "loss": 0.0534, + "step": 3863 + }, + { + "epoch": 2.6768271562175268, + "grad_norm": 0.7406061887741089, + "learning_rate": 7.327323162274619e-06, + "loss": 0.0635, + "step": 3864 + }, + { + "epoch": 2.6775199168687216, + "grad_norm": 0.6972936391830444, + "learning_rate": 7.326629680998614e-06, + "loss": 0.0606, + "step": 3865 + }, + { + "epoch": 2.678212677519917, + "grad_norm": 0.7474087476730347, + "learning_rate": 7.325936199722608e-06, + "loss": 0.0585, + "step": 3866 + }, + { + "epoch": 2.678905438171112, + "grad_norm": 0.618772566318512, + "learning_rate": 7.325242718446602e-06, + "loss": 0.0513, + "step": 3867 + }, + { + "epoch": 2.679598198822307, + "grad_norm": 0.6597331762313843, + "learning_rate": 7.324549237170597e-06, + "loss": 0.0567, + "step": 3868 + }, + { + "epoch": 2.680290959473502, + "grad_norm": 0.6849602460861206, + "learning_rate": 7.323855755894591e-06, + "loss": 0.0536, + "step": 3869 + }, + { + "epoch": 2.680983720124697, + "grad_norm": 0.6464720368385315, + "learning_rate": 7.323162274618586e-06, + "loss": 0.0486, + "step": 3870 + }, + { + "epoch": 2.681676480775892, + "grad_norm": 0.7262237071990967, + "learning_rate": 7.322468793342581e-06, + "loss": 0.053, + "step": 3871 + }, + { + "epoch": 2.6823692414270868, + "grad_norm": 0.7746634483337402, + "learning_rate": 7.321775312066574e-06, + "loss": 0.0627, + "step": 3872 + }, + { + "epoch": 2.683062002078282, + "grad_norm": 0.677756667137146, + "learning_rate": 7.321081830790569e-06, + "loss": 0.0496, + "step": 3873 + }, + { + "epoch": 2.683754762729477, + "grad_norm": 0.6235165596008301, + "learning_rate": 7.3203883495145634e-06, + "loss": 0.0518, + "step": 3874 + }, + { + "epoch": 2.6844475233806717, + "grad_norm": 0.6622484922409058, + "learning_rate": 7.319694868238558e-06, + "loss": 0.0535, + "step": 3875 + }, + { + "epoch": 2.685140284031867, + "grad_norm": 0.7261942028999329, + "learning_rate": 7.319001386962553e-06, + "loss": 0.0665, + "step": 3876 + }, + { + "epoch": 2.6858330446830623, + "grad_norm": 0.7278459072113037, + "learning_rate": 7.3183079056865465e-06, + "loss": 0.0589, + "step": 3877 + }, + { + "epoch": 2.686525805334257, + "grad_norm": 0.7369149923324585, + "learning_rate": 7.3176144244105415e-06, + "loss": 0.0549, + "step": 3878 + }, + { + "epoch": 2.687218565985452, + "grad_norm": 0.7115172147750854, + "learning_rate": 7.3169209431345356e-06, + "loss": 0.0568, + "step": 3879 + }, + { + "epoch": 2.6879113266366472, + "grad_norm": 0.8291954398155212, + "learning_rate": 7.3162274618585305e-06, + "loss": 0.0548, + "step": 3880 + }, + { + "epoch": 2.688604087287842, + "grad_norm": 0.789232611656189, + "learning_rate": 7.315533980582525e-06, + "loss": 0.0681, + "step": 3881 + }, + { + "epoch": 2.689296847939037, + "grad_norm": 0.7152739763259888, + "learning_rate": 7.3148404993065195e-06, + "loss": 0.0642, + "step": 3882 + }, + { + "epoch": 2.689989608590232, + "grad_norm": 0.7494675517082214, + "learning_rate": 7.3141470180305144e-06, + "loss": 0.0537, + "step": 3883 + }, + { + "epoch": 2.690682369241427, + "grad_norm": 0.6653921604156494, + "learning_rate": 7.313453536754508e-06, + "loss": 0.045, + "step": 3884 + }, + { + "epoch": 2.691375129892622, + "grad_norm": 0.6322457790374756, + "learning_rate": 7.312760055478503e-06, + "loss": 0.0514, + "step": 3885 + }, + { + "epoch": 2.692067890543817, + "grad_norm": 0.7568194270133972, + "learning_rate": 7.3120665742024975e-06, + "loss": 0.0674, + "step": 3886 + }, + { + "epoch": 2.6927606511950124, + "grad_norm": 0.6978754997253418, + "learning_rate": 7.311373092926492e-06, + "loss": 0.0548, + "step": 3887 + }, + { + "epoch": 2.6934534118462072, + "grad_norm": 0.7465512752532959, + "learning_rate": 7.3106796116504866e-06, + "loss": 0.0489, + "step": 3888 + }, + { + "epoch": 2.694146172497402, + "grad_norm": 0.595215380191803, + "learning_rate": 7.30998613037448e-06, + "loss": 0.0406, + "step": 3889 + }, + { + "epoch": 2.6948389331485973, + "grad_norm": 0.6685272455215454, + "learning_rate": 7.309292649098475e-06, + "loss": 0.0504, + "step": 3890 + }, + { + "epoch": 2.695531693799792, + "grad_norm": 0.6387081742286682, + "learning_rate": 7.30859916782247e-06, + "loss": 0.0525, + "step": 3891 + }, + { + "epoch": 2.696224454450987, + "grad_norm": 0.8346490859985352, + "learning_rate": 7.307905686546464e-06, + "loss": 0.0656, + "step": 3892 + }, + { + "epoch": 2.6969172151021823, + "grad_norm": 0.7743673324584961, + "learning_rate": 7.307212205270459e-06, + "loss": 0.0599, + "step": 3893 + }, + { + "epoch": 2.697609975753377, + "grad_norm": 0.7080375552177429, + "learning_rate": 7.306518723994453e-06, + "loss": 0.062, + "step": 3894 + }, + { + "epoch": 2.698302736404572, + "grad_norm": 0.6892623901367188, + "learning_rate": 7.305825242718447e-06, + "loss": 0.0543, + "step": 3895 + }, + { + "epoch": 2.6989954970557672, + "grad_norm": 0.778924822807312, + "learning_rate": 7.305131761442442e-06, + "loss": 0.0602, + "step": 3896 + }, + { + "epoch": 2.699688257706962, + "grad_norm": 0.8009116649627686, + "learning_rate": 7.304438280166436e-06, + "loss": 0.054, + "step": 3897 + }, + { + "epoch": 2.7003810183581574, + "grad_norm": 0.6834834218025208, + "learning_rate": 7.303744798890431e-06, + "loss": 0.0482, + "step": 3898 + }, + { + "epoch": 2.701073779009352, + "grad_norm": 0.7371225357055664, + "learning_rate": 7.303051317614425e-06, + "loss": 0.0596, + "step": 3899 + }, + { + "epoch": 2.7017665396605475, + "grad_norm": 0.7083562612533569, + "learning_rate": 7.30235783633842e-06, + "loss": 0.0626, + "step": 3900 + }, + { + "epoch": 2.7024593003117423, + "grad_norm": 0.8710654377937317, + "learning_rate": 7.301664355062415e-06, + "loss": 0.0658, + "step": 3901 + }, + { + "epoch": 2.703152060962937, + "grad_norm": 0.6961199045181274, + "learning_rate": 7.300970873786408e-06, + "loss": 0.0531, + "step": 3902 + }, + { + "epoch": 2.7038448216141324, + "grad_norm": 0.6109960079193115, + "learning_rate": 7.300277392510403e-06, + "loss": 0.0463, + "step": 3903 + }, + { + "epoch": 2.7045375822653273, + "grad_norm": 0.7572680115699768, + "learning_rate": 7.299583911234397e-06, + "loss": 0.0566, + "step": 3904 + }, + { + "epoch": 2.705230342916522, + "grad_norm": 0.7776378393173218, + "learning_rate": 7.298890429958392e-06, + "loss": 0.0616, + "step": 3905 + }, + { + "epoch": 2.7059231035677174, + "grad_norm": 0.7098103165626526, + "learning_rate": 7.298196948682387e-06, + "loss": 0.0512, + "step": 3906 + }, + { + "epoch": 2.706615864218912, + "grad_norm": 0.6883637309074402, + "learning_rate": 7.29750346740638e-06, + "loss": 0.0478, + "step": 3907 + }, + { + "epoch": 2.7073086248701075, + "grad_norm": 0.6861960291862488, + "learning_rate": 7.296809986130375e-06, + "loss": 0.0684, + "step": 3908 + }, + { + "epoch": 2.7080013855213023, + "grad_norm": 0.7326226234436035, + "learning_rate": 7.296116504854369e-06, + "loss": 0.0536, + "step": 3909 + }, + { + "epoch": 2.7086941461724976, + "grad_norm": 0.721124529838562, + "learning_rate": 7.295423023578364e-06, + "loss": 0.0402, + "step": 3910 + }, + { + "epoch": 2.7093869068236924, + "grad_norm": 0.6418149471282959, + "learning_rate": 7.294729542302359e-06, + "loss": 0.0554, + "step": 3911 + }, + { + "epoch": 2.7100796674748873, + "grad_norm": 0.7129888534545898, + "learning_rate": 7.294036061026353e-06, + "loss": 0.0496, + "step": 3912 + }, + { + "epoch": 2.7107724281260825, + "grad_norm": 0.6219625473022461, + "learning_rate": 7.293342579750348e-06, + "loss": 0.0419, + "step": 3913 + }, + { + "epoch": 2.7114651887772774, + "grad_norm": 0.6785973906517029, + "learning_rate": 7.292649098474341e-06, + "loss": 0.051, + "step": 3914 + }, + { + "epoch": 2.712157949428472, + "grad_norm": 0.7339017987251282, + "learning_rate": 7.291955617198336e-06, + "loss": 0.0567, + "step": 3915 + }, + { + "epoch": 2.7128507100796675, + "grad_norm": 0.7542696595191956, + "learning_rate": 7.291262135922331e-06, + "loss": 0.068, + "step": 3916 + }, + { + "epoch": 2.7135434707308623, + "grad_norm": 0.8791110515594482, + "learning_rate": 7.290568654646325e-06, + "loss": 0.0545, + "step": 3917 + }, + { + "epoch": 2.7142362313820576, + "grad_norm": 0.6041953563690186, + "learning_rate": 7.28987517337032e-06, + "loss": 0.0511, + "step": 3918 + }, + { + "epoch": 2.7149289920332524, + "grad_norm": 0.7498956322669983, + "learning_rate": 7.289181692094313e-06, + "loss": 0.0525, + "step": 3919 + }, + { + "epoch": 2.7156217526844477, + "grad_norm": 0.6190578937530518, + "learning_rate": 7.288488210818308e-06, + "loss": 0.038, + "step": 3920 + }, + { + "epoch": 2.7163145133356426, + "grad_norm": 0.7730886936187744, + "learning_rate": 7.287794729542303e-06, + "loss": 0.0508, + "step": 3921 + }, + { + "epoch": 2.7170072739868374, + "grad_norm": 0.7318882942199707, + "learning_rate": 7.287101248266297e-06, + "loss": 0.0568, + "step": 3922 + }, + { + "epoch": 2.7177000346380327, + "grad_norm": 0.834860622882843, + "learning_rate": 7.286407766990292e-06, + "loss": 0.0659, + "step": 3923 + }, + { + "epoch": 2.7183927952892275, + "grad_norm": 0.7603409886360168, + "learning_rate": 7.285714285714286e-06, + "loss": 0.065, + "step": 3924 + }, + { + "epoch": 2.7190855559404223, + "grad_norm": 0.6554049253463745, + "learning_rate": 7.2850208044382805e-06, + "loss": 0.058, + "step": 3925 + }, + { + "epoch": 2.7197783165916176, + "grad_norm": 1.1610909700393677, + "learning_rate": 7.284327323162275e-06, + "loss": 0.0726, + "step": 3926 + }, + { + "epoch": 2.7204710772428125, + "grad_norm": 0.6697677969932556, + "learning_rate": 7.2836338418862695e-06, + "loss": 0.0585, + "step": 3927 + }, + { + "epoch": 2.7211638378940077, + "grad_norm": 0.6267381906509399, + "learning_rate": 7.282940360610264e-06, + "loss": 0.0466, + "step": 3928 + }, + { + "epoch": 2.7218565985452026, + "grad_norm": 0.6425818800926208, + "learning_rate": 7.2822468793342585e-06, + "loss": 0.0478, + "step": 3929 + }, + { + "epoch": 2.722549359196398, + "grad_norm": 0.7394554018974304, + "learning_rate": 7.2815533980582534e-06, + "loss": 0.0587, + "step": 3930 + }, + { + "epoch": 2.7232421198475927, + "grad_norm": 0.7270798087120056, + "learning_rate": 7.280859916782248e-06, + "loss": 0.0564, + "step": 3931 + }, + { + "epoch": 2.7239348804987875, + "grad_norm": 0.7313693761825562, + "learning_rate": 7.280166435506242e-06, + "loss": 0.0574, + "step": 3932 + }, + { + "epoch": 2.724627641149983, + "grad_norm": 0.7176659107208252, + "learning_rate": 7.2794729542302365e-06, + "loss": 0.0565, + "step": 3933 + }, + { + "epoch": 2.7253204018011776, + "grad_norm": 0.6980408430099487, + "learning_rate": 7.278779472954231e-06, + "loss": 0.0502, + "step": 3934 + }, + { + "epoch": 2.7260131624523725, + "grad_norm": 0.7114086151123047, + "learning_rate": 7.2780859916782256e-06, + "loss": 0.0592, + "step": 3935 + }, + { + "epoch": 2.7267059231035677, + "grad_norm": 0.6872692108154297, + "learning_rate": 7.2773925104022205e-06, + "loss": 0.0474, + "step": 3936 + }, + { + "epoch": 2.7273986837547626, + "grad_norm": 0.6300058960914612, + "learning_rate": 7.276699029126214e-06, + "loss": 0.046, + "step": 3937 + }, + { + "epoch": 2.728091444405958, + "grad_norm": 0.6446839570999146, + "learning_rate": 7.276005547850209e-06, + "loss": 0.0503, + "step": 3938 + }, + { + "epoch": 2.7287842050571527, + "grad_norm": 0.5818908214569092, + "learning_rate": 7.275312066574203e-06, + "loss": 0.0428, + "step": 3939 + }, + { + "epoch": 2.729476965708348, + "grad_norm": 0.7795031070709229, + "learning_rate": 7.274618585298198e-06, + "loss": 0.0592, + "step": 3940 + }, + { + "epoch": 2.730169726359543, + "grad_norm": 0.6601243615150452, + "learning_rate": 7.273925104022193e-06, + "loss": 0.0529, + "step": 3941 + }, + { + "epoch": 2.7308624870107376, + "grad_norm": 0.7666286826133728, + "learning_rate": 7.273231622746187e-06, + "loss": 0.0637, + "step": 3942 + }, + { + "epoch": 2.731555247661933, + "grad_norm": 0.7480244040489197, + "learning_rate": 7.272538141470181e-06, + "loss": 0.062, + "step": 3943 + }, + { + "epoch": 2.7322480083131278, + "grad_norm": 0.7202300429344177, + "learning_rate": 7.271844660194175e-06, + "loss": 0.0521, + "step": 3944 + }, + { + "epoch": 2.7329407689643226, + "grad_norm": 0.7029700875282288, + "learning_rate": 7.27115117891817e-06, + "loss": 0.0648, + "step": 3945 + }, + { + "epoch": 2.733633529615518, + "grad_norm": 0.7759830951690674, + "learning_rate": 7.270457697642165e-06, + "loss": 0.0564, + "step": 3946 + }, + { + "epoch": 2.7343262902667127, + "grad_norm": 0.7026242017745972, + "learning_rate": 7.269764216366159e-06, + "loss": 0.0591, + "step": 3947 + }, + { + "epoch": 2.735019050917908, + "grad_norm": 0.7203380465507507, + "learning_rate": 7.269070735090154e-06, + "loss": 0.0581, + "step": 3948 + }, + { + "epoch": 2.735711811569103, + "grad_norm": 0.7329349517822266, + "learning_rate": 7.268377253814147e-06, + "loss": 0.0555, + "step": 3949 + }, + { + "epoch": 2.736404572220298, + "grad_norm": 0.8174529075622559, + "learning_rate": 7.267683772538142e-06, + "loss": 0.0607, + "step": 3950 + }, + { + "epoch": 2.737097332871493, + "grad_norm": 0.6682785153388977, + "learning_rate": 7.266990291262137e-06, + "loss": 0.0461, + "step": 3951 + }, + { + "epoch": 2.7377900935226878, + "grad_norm": 0.7733043432235718, + "learning_rate": 7.266296809986131e-06, + "loss": 0.048, + "step": 3952 + }, + { + "epoch": 2.738482854173883, + "grad_norm": 0.8042845726013184, + "learning_rate": 7.265603328710126e-06, + "loss": 0.0594, + "step": 3953 + }, + { + "epoch": 2.739175614825078, + "grad_norm": 0.688261091709137, + "learning_rate": 7.264909847434119e-06, + "loss": 0.0609, + "step": 3954 + }, + { + "epoch": 2.7398683754762727, + "grad_norm": 1.2057961225509644, + "learning_rate": 7.264216366158114e-06, + "loss": 0.0676, + "step": 3955 + }, + { + "epoch": 2.740561136127468, + "grad_norm": 0.7686060070991516, + "learning_rate": 7.263522884882109e-06, + "loss": 0.07, + "step": 3956 + }, + { + "epoch": 2.741253896778663, + "grad_norm": 0.76099693775177, + "learning_rate": 7.262829403606103e-06, + "loss": 0.0537, + "step": 3957 + }, + { + "epoch": 2.741946657429858, + "grad_norm": 0.7034679651260376, + "learning_rate": 7.262135922330098e-06, + "loss": 0.0568, + "step": 3958 + }, + { + "epoch": 2.742639418081053, + "grad_norm": 0.7430107593536377, + "learning_rate": 7.261442441054092e-06, + "loss": 0.0611, + "step": 3959 + }, + { + "epoch": 2.743332178732248, + "grad_norm": 0.5567717552185059, + "learning_rate": 7.260748959778087e-06, + "loss": 0.0499, + "step": 3960 + }, + { + "epoch": 2.744024939383443, + "grad_norm": 0.7748571038246155, + "learning_rate": 7.260055478502082e-06, + "loss": 0.0642, + "step": 3961 + }, + { + "epoch": 2.744717700034638, + "grad_norm": 0.8475006818771362, + "learning_rate": 7.259361997226075e-06, + "loss": 0.0697, + "step": 3962 + }, + { + "epoch": 2.745410460685833, + "grad_norm": 0.6778804659843445, + "learning_rate": 7.25866851595007e-06, + "loss": 0.0609, + "step": 3963 + }, + { + "epoch": 2.746103221337028, + "grad_norm": 0.6167569160461426, + "learning_rate": 7.257975034674064e-06, + "loss": 0.0534, + "step": 3964 + }, + { + "epoch": 2.746795981988223, + "grad_norm": 0.8046923875808716, + "learning_rate": 7.257281553398059e-06, + "loss": 0.053, + "step": 3965 + }, + { + "epoch": 2.747488742639418, + "grad_norm": 0.8193111419677734, + "learning_rate": 7.256588072122054e-06, + "loss": 0.0662, + "step": 3966 + }, + { + "epoch": 2.748181503290613, + "grad_norm": 0.8219735622406006, + "learning_rate": 7.255894590846047e-06, + "loss": 0.0715, + "step": 3967 + }, + { + "epoch": 2.7488742639418082, + "grad_norm": 0.7069618701934814, + "learning_rate": 7.255201109570042e-06, + "loss": 0.064, + "step": 3968 + }, + { + "epoch": 2.749567024593003, + "grad_norm": 0.7169185876846313, + "learning_rate": 7.254507628294036e-06, + "loss": 0.0652, + "step": 3969 + }, + { + "epoch": 2.7502597852441983, + "grad_norm": 0.7799632549285889, + "learning_rate": 7.253814147018031e-06, + "loss": 0.0532, + "step": 3970 + }, + { + "epoch": 2.750952545895393, + "grad_norm": 0.7554654479026794, + "learning_rate": 7.253120665742026e-06, + "loss": 0.0771, + "step": 3971 + }, + { + "epoch": 2.751645306546588, + "grad_norm": 0.8206004500389099, + "learning_rate": 7.25242718446602e-06, + "loss": 0.066, + "step": 3972 + }, + { + "epoch": 2.7523380671977833, + "grad_norm": 0.8165253400802612, + "learning_rate": 7.251733703190014e-06, + "loss": 0.0512, + "step": 3973 + }, + { + "epoch": 2.753030827848978, + "grad_norm": 0.7096630930900574, + "learning_rate": 7.2510402219140085e-06, + "loss": 0.0572, + "step": 3974 + }, + { + "epoch": 2.753723588500173, + "grad_norm": 0.6919059157371521, + "learning_rate": 7.250346740638003e-06, + "loss": 0.0524, + "step": 3975 + }, + { + "epoch": 2.7544163491513682, + "grad_norm": 0.6617465019226074, + "learning_rate": 7.2496532593619975e-06, + "loss": 0.0491, + "step": 3976 + }, + { + "epoch": 2.755109109802563, + "grad_norm": 0.6623074412345886, + "learning_rate": 7.2489597780859924e-06, + "loss": 0.0614, + "step": 3977 + }, + { + "epoch": 2.7558018704537584, + "grad_norm": 0.6906787157058716, + "learning_rate": 7.248266296809987e-06, + "loss": 0.061, + "step": 3978 + }, + { + "epoch": 2.756494631104953, + "grad_norm": 0.6674212217330933, + "learning_rate": 7.247572815533981e-06, + "loss": 0.0496, + "step": 3979 + }, + { + "epoch": 2.7571873917561485, + "grad_norm": 0.6485714316368103, + "learning_rate": 7.2468793342579755e-06, + "loss": 0.0574, + "step": 3980 + }, + { + "epoch": 2.7578801524073433, + "grad_norm": 0.693846583366394, + "learning_rate": 7.24618585298197e-06, + "loss": 0.0425, + "step": 3981 + }, + { + "epoch": 2.758572913058538, + "grad_norm": 0.8718501329421997, + "learning_rate": 7.2454923717059646e-06, + "loss": 0.0617, + "step": 3982 + }, + { + "epoch": 2.7592656737097334, + "grad_norm": 0.7123166918754578, + "learning_rate": 7.2447988904299595e-06, + "loss": 0.0587, + "step": 3983 + }, + { + "epoch": 2.7599584343609282, + "grad_norm": 0.6755354404449463, + "learning_rate": 7.244105409153953e-06, + "loss": 0.0509, + "step": 3984 + }, + { + "epoch": 2.760651195012123, + "grad_norm": 0.6938267350196838, + "learning_rate": 7.243411927877948e-06, + "loss": 0.0579, + "step": 3985 + }, + { + "epoch": 2.7613439556633184, + "grad_norm": 0.6104715466499329, + "learning_rate": 7.242718446601942e-06, + "loss": 0.0477, + "step": 3986 + }, + { + "epoch": 2.762036716314513, + "grad_norm": 0.760593831539154, + "learning_rate": 7.242024965325937e-06, + "loss": 0.0579, + "step": 3987 + }, + { + "epoch": 2.7627294769657085, + "grad_norm": 0.831852376461029, + "learning_rate": 7.241331484049932e-06, + "loss": 0.0718, + "step": 3988 + }, + { + "epoch": 2.7634222376169033, + "grad_norm": 0.7221866250038147, + "learning_rate": 7.240638002773926e-06, + "loss": 0.0539, + "step": 3989 + }, + { + "epoch": 2.7641149982680986, + "grad_norm": 0.6120632290840149, + "learning_rate": 7.239944521497921e-06, + "loss": 0.0601, + "step": 3990 + }, + { + "epoch": 2.7648077589192934, + "grad_norm": 0.8550262451171875, + "learning_rate": 7.239251040221914e-06, + "loss": 0.0688, + "step": 3991 + }, + { + "epoch": 2.7655005195704883, + "grad_norm": 0.7031256556510925, + "learning_rate": 7.238557558945909e-06, + "loss": 0.0623, + "step": 3992 + }, + { + "epoch": 2.7661932802216835, + "grad_norm": 0.677352786064148, + "learning_rate": 7.237864077669904e-06, + "loss": 0.051, + "step": 3993 + }, + { + "epoch": 2.7668860408728784, + "grad_norm": 0.6129022240638733, + "learning_rate": 7.237170596393898e-06, + "loss": 0.0472, + "step": 3994 + }, + { + "epoch": 2.767578801524073, + "grad_norm": 0.6582059264183044, + "learning_rate": 7.236477115117893e-06, + "loss": 0.0391, + "step": 3995 + }, + { + "epoch": 2.7682715621752685, + "grad_norm": 0.7357588410377502, + "learning_rate": 7.235783633841886e-06, + "loss": 0.0633, + "step": 3996 + }, + { + "epoch": 2.7689643228264633, + "grad_norm": 0.7520215511322021, + "learning_rate": 7.235090152565881e-06, + "loss": 0.0528, + "step": 3997 + }, + { + "epoch": 2.7696570834776586, + "grad_norm": 0.7143958806991577, + "learning_rate": 7.234396671289876e-06, + "loss": 0.0611, + "step": 3998 + }, + { + "epoch": 2.7703498441288534, + "grad_norm": 0.6682828068733215, + "learning_rate": 7.23370319001387e-06, + "loss": 0.0595, + "step": 3999 + }, + { + "epoch": 2.7710426047800487, + "grad_norm": 0.6994439959526062, + "learning_rate": 7.233009708737865e-06, + "loss": 0.0688, + "step": 4000 + }, + { + "epoch": 2.7717353654312435, + "grad_norm": 0.7368742227554321, + "learning_rate": 7.232316227461859e-06, + "loss": 0.0635, + "step": 4001 + }, + { + "epoch": 2.7724281260824384, + "grad_norm": 0.6693440675735474, + "learning_rate": 7.231622746185853e-06, + "loss": 0.0564, + "step": 4002 + }, + { + "epoch": 2.7731208867336337, + "grad_norm": 0.623653769493103, + "learning_rate": 7.230929264909848e-06, + "loss": 0.0419, + "step": 4003 + }, + { + "epoch": 2.7738136473848285, + "grad_norm": 0.7218508124351501, + "learning_rate": 7.230235783633842e-06, + "loss": 0.0616, + "step": 4004 + }, + { + "epoch": 2.7745064080360233, + "grad_norm": 0.8679817914962769, + "learning_rate": 7.229542302357837e-06, + "loss": 0.0567, + "step": 4005 + }, + { + "epoch": 2.7751991686872186, + "grad_norm": 0.7767743468284607, + "learning_rate": 7.228848821081831e-06, + "loss": 0.0615, + "step": 4006 + }, + { + "epoch": 2.7758919293384134, + "grad_norm": 0.6840187907218933, + "learning_rate": 7.228155339805826e-06, + "loss": 0.0525, + "step": 4007 + }, + { + "epoch": 2.7765846899896087, + "grad_norm": 0.6501522064208984, + "learning_rate": 7.227461858529821e-06, + "loss": 0.0478, + "step": 4008 + }, + { + "epoch": 2.7772774506408036, + "grad_norm": 0.6367305517196655, + "learning_rate": 7.226768377253814e-06, + "loss": 0.0496, + "step": 4009 + }, + { + "epoch": 2.777970211291999, + "grad_norm": 0.765961766242981, + "learning_rate": 7.226074895977809e-06, + "loss": 0.0549, + "step": 4010 + }, + { + "epoch": 2.7786629719431937, + "grad_norm": 0.7556353211402893, + "learning_rate": 7.225381414701803e-06, + "loss": 0.0597, + "step": 4011 + }, + { + "epoch": 2.7793557325943885, + "grad_norm": 0.674601137638092, + "learning_rate": 7.224687933425798e-06, + "loss": 0.0559, + "step": 4012 + }, + { + "epoch": 2.780048493245584, + "grad_norm": 0.7502955198287964, + "learning_rate": 7.223994452149793e-06, + "loss": 0.0544, + "step": 4013 + }, + { + "epoch": 2.7807412538967786, + "grad_norm": 0.7685797214508057, + "learning_rate": 7.223300970873786e-06, + "loss": 0.0496, + "step": 4014 + }, + { + "epoch": 2.7814340145479735, + "grad_norm": 0.6448591351509094, + "learning_rate": 7.222607489597781e-06, + "loss": 0.0434, + "step": 4015 + }, + { + "epoch": 2.7821267751991687, + "grad_norm": 0.5939489603042603, + "learning_rate": 7.221914008321775e-06, + "loss": 0.0425, + "step": 4016 + }, + { + "epoch": 2.7828195358503636, + "grad_norm": 0.6740111112594604, + "learning_rate": 7.22122052704577e-06, + "loss": 0.0593, + "step": 4017 + }, + { + "epoch": 2.783512296501559, + "grad_norm": 0.7750644087791443, + "learning_rate": 7.220527045769765e-06, + "loss": 0.0575, + "step": 4018 + }, + { + "epoch": 2.7842050571527537, + "grad_norm": 0.6008273363113403, + "learning_rate": 7.219833564493759e-06, + "loss": 0.0524, + "step": 4019 + }, + { + "epoch": 2.784897817803949, + "grad_norm": 0.6259392499923706, + "learning_rate": 7.219140083217754e-06, + "loss": 0.0407, + "step": 4020 + }, + { + "epoch": 2.785590578455144, + "grad_norm": 0.6633020043373108, + "learning_rate": 7.2184466019417475e-06, + "loss": 0.0535, + "step": 4021 + }, + { + "epoch": 2.7862833391063386, + "grad_norm": 0.7504967451095581, + "learning_rate": 7.217753120665742e-06, + "loss": 0.0554, + "step": 4022 + }, + { + "epoch": 2.786976099757534, + "grad_norm": 0.5940077304840088, + "learning_rate": 7.217059639389737e-06, + "loss": 0.0445, + "step": 4023 + }, + { + "epoch": 2.7876688604087287, + "grad_norm": 0.7825884819030762, + "learning_rate": 7.2163661581137314e-06, + "loss": 0.0592, + "step": 4024 + }, + { + "epoch": 2.7883616210599236, + "grad_norm": 0.6419134736061096, + "learning_rate": 7.215672676837726e-06, + "loss": 0.0541, + "step": 4025 + }, + { + "epoch": 2.789054381711119, + "grad_norm": 0.7044313549995422, + "learning_rate": 7.21497919556172e-06, + "loss": 0.0621, + "step": 4026 + }, + { + "epoch": 2.7897471423623137, + "grad_norm": 1.0729942321777344, + "learning_rate": 7.2142857142857145e-06, + "loss": 0.0581, + "step": 4027 + }, + { + "epoch": 2.790439903013509, + "grad_norm": 0.6810662150382996, + "learning_rate": 7.2135922330097095e-06, + "loss": 0.0599, + "step": 4028 + }, + { + "epoch": 2.791132663664704, + "grad_norm": 0.6379590034484863, + "learning_rate": 7.2128987517337036e-06, + "loss": 0.0476, + "step": 4029 + }, + { + "epoch": 2.791825424315899, + "grad_norm": 0.7135487794876099, + "learning_rate": 7.2122052704576985e-06, + "loss": 0.0505, + "step": 4030 + }, + { + "epoch": 2.792518184967094, + "grad_norm": 0.7583690285682678, + "learning_rate": 7.211511789181692e-06, + "loss": 0.0623, + "step": 4031 + }, + { + "epoch": 2.7932109456182888, + "grad_norm": 0.6712807416915894, + "learning_rate": 7.210818307905687e-06, + "loss": 0.0471, + "step": 4032 + }, + { + "epoch": 2.793903706269484, + "grad_norm": 0.7846777439117432, + "learning_rate": 7.210124826629682e-06, + "loss": 0.0534, + "step": 4033 + }, + { + "epoch": 2.794596466920679, + "grad_norm": 0.6510107517242432, + "learning_rate": 7.209431345353676e-06, + "loss": 0.0516, + "step": 4034 + }, + { + "epoch": 2.7952892275718737, + "grad_norm": 0.8907626271247864, + "learning_rate": 7.208737864077671e-06, + "loss": 0.0721, + "step": 4035 + }, + { + "epoch": 2.795981988223069, + "grad_norm": 0.8742437362670898, + "learning_rate": 7.208044382801665e-06, + "loss": 0.0606, + "step": 4036 + }, + { + "epoch": 2.796674748874264, + "grad_norm": 0.706000030040741, + "learning_rate": 7.20735090152566e-06, + "loss": 0.0421, + "step": 4037 + }, + { + "epoch": 2.797367509525459, + "grad_norm": 0.6947118043899536, + "learning_rate": 7.206657420249655e-06, + "loss": 0.0563, + "step": 4038 + }, + { + "epoch": 2.798060270176654, + "grad_norm": 0.7536004185676575, + "learning_rate": 7.205963938973648e-06, + "loss": 0.051, + "step": 4039 + }, + { + "epoch": 2.798753030827849, + "grad_norm": 0.6250074505805969, + "learning_rate": 7.205270457697643e-06, + "loss": 0.0512, + "step": 4040 + }, + { + "epoch": 2.799445791479044, + "grad_norm": 0.7176235914230347, + "learning_rate": 7.204576976421637e-06, + "loss": 0.0575, + "step": 4041 + }, + { + "epoch": 2.800138552130239, + "grad_norm": 0.7992220520973206, + "learning_rate": 7.203883495145632e-06, + "loss": 0.063, + "step": 4042 + }, + { + "epoch": 2.800831312781434, + "grad_norm": 0.732123613357544, + "learning_rate": 7.203190013869627e-06, + "loss": 0.0542, + "step": 4043 + }, + { + "epoch": 2.801524073432629, + "grad_norm": 0.6620665192604065, + "learning_rate": 7.20249653259362e-06, + "loss": 0.0523, + "step": 4044 + }, + { + "epoch": 2.802216834083824, + "grad_norm": 0.675926148891449, + "learning_rate": 7.201803051317615e-06, + "loss": 0.0545, + "step": 4045 + }, + { + "epoch": 2.802909594735019, + "grad_norm": 0.7969706654548645, + "learning_rate": 7.201109570041609e-06, + "loss": 0.0489, + "step": 4046 + }, + { + "epoch": 2.803602355386214, + "grad_norm": 0.6930369734764099, + "learning_rate": 7.200416088765604e-06, + "loss": 0.055, + "step": 4047 + }, + { + "epoch": 2.804295116037409, + "grad_norm": 0.6895542740821838, + "learning_rate": 7.199722607489599e-06, + "loss": 0.0571, + "step": 4048 + }, + { + "epoch": 2.804987876688604, + "grad_norm": 0.7721331715583801, + "learning_rate": 7.199029126213593e-06, + "loss": 0.0664, + "step": 4049 + }, + { + "epoch": 2.8056806373397993, + "grad_norm": 0.7242488265037537, + "learning_rate": 7.198335644937587e-06, + "loss": 0.0626, + "step": 4050 + }, + { + "epoch": 2.806373397990994, + "grad_norm": 0.6718471646308899, + "learning_rate": 7.197642163661581e-06, + "loss": 0.0486, + "step": 4051 + }, + { + "epoch": 2.807066158642189, + "grad_norm": 0.7310835719108582, + "learning_rate": 7.196948682385576e-06, + "loss": 0.0632, + "step": 4052 + }, + { + "epoch": 2.8077589192933843, + "grad_norm": 0.6099986433982849, + "learning_rate": 7.196255201109571e-06, + "loss": 0.046, + "step": 4053 + }, + { + "epoch": 2.808451679944579, + "grad_norm": 0.7324040532112122, + "learning_rate": 7.195561719833565e-06, + "loss": 0.0593, + "step": 4054 + }, + { + "epoch": 2.809144440595774, + "grad_norm": 0.8426403403282166, + "learning_rate": 7.19486823855756e-06, + "loss": 0.0699, + "step": 4055 + }, + { + "epoch": 2.8098372012469692, + "grad_norm": 0.7356042265892029, + "learning_rate": 7.194174757281553e-06, + "loss": 0.0489, + "step": 4056 + }, + { + "epoch": 2.810529961898164, + "grad_norm": 0.7246345281600952, + "learning_rate": 7.193481276005548e-06, + "loss": 0.0632, + "step": 4057 + }, + { + "epoch": 2.8112227225493593, + "grad_norm": 0.7151074409484863, + "learning_rate": 7.192787794729543e-06, + "loss": 0.0439, + "step": 4058 + }, + { + "epoch": 2.811915483200554, + "grad_norm": 0.7404351830482483, + "learning_rate": 7.192094313453537e-06, + "loss": 0.0549, + "step": 4059 + }, + { + "epoch": 2.8126082438517495, + "grad_norm": 0.8077854514122009, + "learning_rate": 7.191400832177532e-06, + "loss": 0.0478, + "step": 4060 + }, + { + "epoch": 2.8133010045029443, + "grad_norm": 0.6892199516296387, + "learning_rate": 7.190707350901525e-06, + "loss": 0.0573, + "step": 4061 + }, + { + "epoch": 2.813993765154139, + "grad_norm": 0.6757320761680603, + "learning_rate": 7.19001386962552e-06, + "loss": 0.0497, + "step": 4062 + }, + { + "epoch": 2.8146865258053344, + "grad_norm": 0.717888593673706, + "learning_rate": 7.189320388349515e-06, + "loss": 0.0614, + "step": 4063 + }, + { + "epoch": 2.8153792864565292, + "grad_norm": 0.6951326131820679, + "learning_rate": 7.188626907073509e-06, + "loss": 0.0524, + "step": 4064 + }, + { + "epoch": 2.816072047107724, + "grad_norm": 0.7384958267211914, + "learning_rate": 7.187933425797504e-06, + "loss": 0.0414, + "step": 4065 + }, + { + "epoch": 2.8167648077589194, + "grad_norm": 0.7302805781364441, + "learning_rate": 7.187239944521498e-06, + "loss": 0.0508, + "step": 4066 + }, + { + "epoch": 2.817457568410114, + "grad_norm": 0.7409061193466187, + "learning_rate": 7.186546463245493e-06, + "loss": 0.0581, + "step": 4067 + }, + { + "epoch": 2.8181503290613095, + "grad_norm": 0.6641678214073181, + "learning_rate": 7.185852981969488e-06, + "loss": 0.0523, + "step": 4068 + }, + { + "epoch": 2.8188430897125043, + "grad_norm": 0.6693853139877319, + "learning_rate": 7.1851595006934814e-06, + "loss": 0.0458, + "step": 4069 + }, + { + "epoch": 2.8195358503636996, + "grad_norm": 0.7115046381950378, + "learning_rate": 7.184466019417476e-06, + "loss": 0.0558, + "step": 4070 + }, + { + "epoch": 2.8202286110148944, + "grad_norm": 0.6776880025863647, + "learning_rate": 7.1837725381414704e-06, + "loss": 0.0532, + "step": 4071 + }, + { + "epoch": 2.8209213716660893, + "grad_norm": 0.72972172498703, + "learning_rate": 7.183079056865465e-06, + "loss": 0.0482, + "step": 4072 + }, + { + "epoch": 2.8216141323172845, + "grad_norm": 0.6619596481323242, + "learning_rate": 7.18238557558946e-06, + "loss": 0.0505, + "step": 4073 + }, + { + "epoch": 2.8223068929684794, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.1816920943134535e-06, + "loss": 0.0624, + "step": 4074 + }, + { + "epoch": 2.822999653619674, + "grad_norm": 0.7475978136062622, + "learning_rate": 7.1809986130374485e-06, + "loss": 0.0666, + "step": 4075 + }, + { + "epoch": 2.8236924142708695, + "grad_norm": 0.7838507294654846, + "learning_rate": 7.1803051317614426e-06, + "loss": 0.0568, + "step": 4076 + }, + { + "epoch": 2.8243851749220643, + "grad_norm": 0.7802300453186035, + "learning_rate": 7.1796116504854375e-06, + "loss": 0.0501, + "step": 4077 + }, + { + "epoch": 2.8250779355732596, + "grad_norm": 0.6380999088287354, + "learning_rate": 7.1789181692094324e-06, + "loss": 0.0526, + "step": 4078 + }, + { + "epoch": 2.8257706962244544, + "grad_norm": 0.7001136541366577, + "learning_rate": 7.178224687933426e-06, + "loss": 0.0609, + "step": 4079 + }, + { + "epoch": 2.8264634568756497, + "grad_norm": 0.8101133108139038, + "learning_rate": 7.177531206657421e-06, + "loss": 0.0673, + "step": 4080 + }, + { + "epoch": 2.8271562175268445, + "grad_norm": 0.757917046546936, + "learning_rate": 7.176837725381415e-06, + "loss": 0.0684, + "step": 4081 + }, + { + "epoch": 2.8278489781780394, + "grad_norm": 0.7094997763633728, + "learning_rate": 7.17614424410541e-06, + "loss": 0.0559, + "step": 4082 + }, + { + "epoch": 2.8285417388292347, + "grad_norm": 0.7780656814575195, + "learning_rate": 7.1754507628294046e-06, + "loss": 0.0693, + "step": 4083 + }, + { + "epoch": 2.8292344994804295, + "grad_norm": 0.8230788707733154, + "learning_rate": 7.174757281553399e-06, + "loss": 0.0599, + "step": 4084 + }, + { + "epoch": 2.8299272601316243, + "grad_norm": 0.8302596807479858, + "learning_rate": 7.174063800277394e-06, + "loss": 0.0583, + "step": 4085 + }, + { + "epoch": 2.8306200207828196, + "grad_norm": 0.6846452951431274, + "learning_rate": 7.173370319001387e-06, + "loss": 0.057, + "step": 4086 + }, + { + "epoch": 2.8313127814340144, + "grad_norm": 0.7271231412887573, + "learning_rate": 7.172676837725382e-06, + "loss": 0.0531, + "step": 4087 + }, + { + "epoch": 2.8320055420852097, + "grad_norm": 0.7599466443061829, + "learning_rate": 7.171983356449377e-06, + "loss": 0.0478, + "step": 4088 + }, + { + "epoch": 2.8326983027364045, + "grad_norm": 0.682426929473877, + "learning_rate": 7.171289875173371e-06, + "loss": 0.0432, + "step": 4089 + }, + { + "epoch": 2.8333910633876, + "grad_norm": 0.7017044425010681, + "learning_rate": 7.170596393897366e-06, + "loss": 0.0432, + "step": 4090 + }, + { + "epoch": 2.8340838240387947, + "grad_norm": 0.7831794023513794, + "learning_rate": 7.169902912621359e-06, + "loss": 0.0616, + "step": 4091 + }, + { + "epoch": 2.8347765846899895, + "grad_norm": 0.6838694214820862, + "learning_rate": 7.169209431345354e-06, + "loss": 0.0526, + "step": 4092 + }, + { + "epoch": 2.8354693453411848, + "grad_norm": 0.7550931572914124, + "learning_rate": 7.168515950069349e-06, + "loss": 0.0628, + "step": 4093 + }, + { + "epoch": 2.8361621059923796, + "grad_norm": 0.7070609331130981, + "learning_rate": 7.167822468793343e-06, + "loss": 0.0441, + "step": 4094 + }, + { + "epoch": 2.8368548666435744, + "grad_norm": 0.6957732439041138, + "learning_rate": 7.167128987517338e-06, + "loss": 0.0492, + "step": 4095 + }, + { + "epoch": 2.8375476272947697, + "grad_norm": 0.7564902305603027, + "learning_rate": 7.166435506241332e-06, + "loss": 0.0516, + "step": 4096 + }, + { + "epoch": 2.8382403879459646, + "grad_norm": 0.7061901688575745, + "learning_rate": 7.165742024965327e-06, + "loss": 0.0496, + "step": 4097 + }, + { + "epoch": 2.83893314859716, + "grad_norm": 0.6750578880310059, + "learning_rate": 7.165048543689321e-06, + "loss": 0.0515, + "step": 4098 + }, + { + "epoch": 2.8396259092483547, + "grad_norm": 0.5834484100341797, + "learning_rate": 7.164355062413315e-06, + "loss": 0.054, + "step": 4099 + }, + { + "epoch": 2.84031866989955, + "grad_norm": 0.700020432472229, + "learning_rate": 7.16366158113731e-06, + "loss": 0.0507, + "step": 4100 + }, + { + "epoch": 2.841011430550745, + "grad_norm": 0.8128746151924133, + "learning_rate": 7.162968099861304e-06, + "loss": 0.0525, + "step": 4101 + }, + { + "epoch": 2.8417041912019396, + "grad_norm": 0.7868772745132446, + "learning_rate": 7.162274618585299e-06, + "loss": 0.0482, + "step": 4102 + }, + { + "epoch": 2.842396951853135, + "grad_norm": 0.7407437562942505, + "learning_rate": 7.161581137309294e-06, + "loss": 0.0605, + "step": 4103 + }, + { + "epoch": 2.8430897125043297, + "grad_norm": 0.6782182455062866, + "learning_rate": 7.160887656033287e-06, + "loss": 0.05, + "step": 4104 + }, + { + "epoch": 2.8437824731555246, + "grad_norm": 0.6876909732818604, + "learning_rate": 7.160194174757282e-06, + "loss": 0.0498, + "step": 4105 + }, + { + "epoch": 2.84447523380672, + "grad_norm": 0.6600090861320496, + "learning_rate": 7.159500693481276e-06, + "loss": 0.0508, + "step": 4106 + }, + { + "epoch": 2.8451679944579147, + "grad_norm": 0.7737667560577393, + "learning_rate": 7.158807212205271e-06, + "loss": 0.0566, + "step": 4107 + }, + { + "epoch": 2.84586075510911, + "grad_norm": 0.734607994556427, + "learning_rate": 7.158113730929266e-06, + "loss": 0.0527, + "step": 4108 + }, + { + "epoch": 2.846553515760305, + "grad_norm": 0.6881810426712036, + "learning_rate": 7.157420249653259e-06, + "loss": 0.0548, + "step": 4109 + }, + { + "epoch": 2.8472462764115, + "grad_norm": 0.7521769404411316, + "learning_rate": 7.156726768377254e-06, + "loss": 0.0483, + "step": 4110 + }, + { + "epoch": 2.847939037062695, + "grad_norm": 0.7764769792556763, + "learning_rate": 7.156033287101248e-06, + "loss": 0.0694, + "step": 4111 + }, + { + "epoch": 2.8486317977138897, + "grad_norm": 0.9802010655403137, + "learning_rate": 7.155339805825243e-06, + "loss": 0.0616, + "step": 4112 + }, + { + "epoch": 2.849324558365085, + "grad_norm": 0.6978768706321716, + "learning_rate": 7.154646324549238e-06, + "loss": 0.0486, + "step": 4113 + }, + { + "epoch": 2.85001731901628, + "grad_norm": 0.6799197793006897, + "learning_rate": 7.153952843273232e-06, + "loss": 0.053, + "step": 4114 + }, + { + "epoch": 2.8507100796674747, + "grad_norm": 0.7332054972648621, + "learning_rate": 7.153259361997227e-06, + "loss": 0.0617, + "step": 4115 + }, + { + "epoch": 2.85140284031867, + "grad_norm": 0.829636812210083, + "learning_rate": 7.1525658807212204e-06, + "loss": 0.0674, + "step": 4116 + }, + { + "epoch": 2.852095600969865, + "grad_norm": 0.7661008238792419, + "learning_rate": 7.151872399445215e-06, + "loss": 0.0813, + "step": 4117 + }, + { + "epoch": 2.85278836162106, + "grad_norm": 0.616958498954773, + "learning_rate": 7.15117891816921e-06, + "loss": 0.0506, + "step": 4118 + }, + { + "epoch": 2.853481122272255, + "grad_norm": 0.7801319360733032, + "learning_rate": 7.150485436893204e-06, + "loss": 0.044, + "step": 4119 + }, + { + "epoch": 2.85417388292345, + "grad_norm": 0.8029889464378357, + "learning_rate": 7.149791955617199e-06, + "loss": 0.0722, + "step": 4120 + }, + { + "epoch": 2.854866643574645, + "grad_norm": 0.7517969012260437, + "learning_rate": 7.1490984743411925e-06, + "loss": 0.0736, + "step": 4121 + }, + { + "epoch": 2.85555940422584, + "grad_norm": 0.6682642698287964, + "learning_rate": 7.1484049930651875e-06, + "loss": 0.0401, + "step": 4122 + }, + { + "epoch": 2.856252164877035, + "grad_norm": 0.6198400855064392, + "learning_rate": 7.147711511789182e-06, + "loss": 0.0523, + "step": 4123 + }, + { + "epoch": 2.85694492552823, + "grad_norm": 0.7525218725204468, + "learning_rate": 7.1470180305131765e-06, + "loss": 0.0591, + "step": 4124 + }, + { + "epoch": 2.857637686179425, + "grad_norm": 0.6783974170684814, + "learning_rate": 7.1463245492371714e-06, + "loss": 0.0627, + "step": 4125 + }, + { + "epoch": 2.85833044683062, + "grad_norm": 0.813218891620636, + "learning_rate": 7.1456310679611655e-06, + "loss": 0.0694, + "step": 4126 + }, + { + "epoch": 2.859023207481815, + "grad_norm": 0.7259506583213806, + "learning_rate": 7.14493758668516e-06, + "loss": 0.0514, + "step": 4127 + }, + { + "epoch": 2.85971596813301, + "grad_norm": 0.7025173902511597, + "learning_rate": 7.1442441054091545e-06, + "loss": 0.0607, + "step": 4128 + }, + { + "epoch": 2.860408728784205, + "grad_norm": 0.9219744801521301, + "learning_rate": 7.143550624133149e-06, + "loss": 0.0581, + "step": 4129 + }, + { + "epoch": 2.8611014894354003, + "grad_norm": 0.6772065162658691, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 2.861794250086595, + "grad_norm": 0.6437684893608093, + "learning_rate": 7.142163661581138e-06, + "loss": 0.0587, + "step": 4131 + }, + { + "epoch": 2.86248701073779, + "grad_norm": 0.7225500345230103, + "learning_rate": 7.141470180305133e-06, + "loss": 0.053, + "step": 4132 + }, + { + "epoch": 2.8631797713889853, + "grad_norm": 0.6994206309318542, + "learning_rate": 7.1407766990291275e-06, + "loss": 0.054, + "step": 4133 + }, + { + "epoch": 2.86387253204018, + "grad_norm": 0.7558009624481201, + "learning_rate": 7.140083217753121e-06, + "loss": 0.0637, + "step": 4134 + }, + { + "epoch": 2.864565292691375, + "grad_norm": 0.7665509581565857, + "learning_rate": 7.139389736477116e-06, + "loss": 0.0653, + "step": 4135 + }, + { + "epoch": 2.86525805334257, + "grad_norm": 0.8576058745384216, + "learning_rate": 7.13869625520111e-06, + "loss": 0.0799, + "step": 4136 + }, + { + "epoch": 2.865950813993765, + "grad_norm": 0.681874692440033, + "learning_rate": 7.138002773925105e-06, + "loss": 0.0614, + "step": 4137 + }, + { + "epoch": 2.86664357464496, + "grad_norm": 0.7096306085586548, + "learning_rate": 7.1373092926491e-06, + "loss": 0.0563, + "step": 4138 + }, + { + "epoch": 2.867336335296155, + "grad_norm": 0.733961284160614, + "learning_rate": 7.136615811373093e-06, + "loss": 0.0609, + "step": 4139 + }, + { + "epoch": 2.8680290959473504, + "grad_norm": 0.7193312644958496, + "learning_rate": 7.135922330097088e-06, + "loss": 0.05, + "step": 4140 + }, + { + "epoch": 2.8687218565985453, + "grad_norm": 0.841863214969635, + "learning_rate": 7.135228848821082e-06, + "loss": 0.06, + "step": 4141 + }, + { + "epoch": 2.86941461724974, + "grad_norm": 0.7233185172080994, + "learning_rate": 7.134535367545077e-06, + "loss": 0.0513, + "step": 4142 + }, + { + "epoch": 2.8701073779009354, + "grad_norm": 0.7611878514289856, + "learning_rate": 7.133841886269072e-06, + "loss": 0.0614, + "step": 4143 + }, + { + "epoch": 2.8708001385521302, + "grad_norm": 0.6750400066375732, + "learning_rate": 7.133148404993066e-06, + "loss": 0.045, + "step": 4144 + }, + { + "epoch": 2.871492899203325, + "grad_norm": 0.6719549298286438, + "learning_rate": 7.132454923717061e-06, + "loss": 0.0612, + "step": 4145 + }, + { + "epoch": 2.8721856598545203, + "grad_norm": 0.6650805473327637, + "learning_rate": 7.131761442441054e-06, + "loss": 0.0481, + "step": 4146 + }, + { + "epoch": 2.872878420505715, + "grad_norm": 0.6743792295455933, + "learning_rate": 7.131067961165049e-06, + "loss": 0.0498, + "step": 4147 + }, + { + "epoch": 2.87357118115691, + "grad_norm": 0.7500253319740295, + "learning_rate": 7.130374479889044e-06, + "loss": 0.064, + "step": 4148 + }, + { + "epoch": 2.8742639418081053, + "grad_norm": 0.6445143818855286, + "learning_rate": 7.129680998613038e-06, + "loss": 0.0423, + "step": 4149 + }, + { + "epoch": 2.8749567024593006, + "grad_norm": 0.7735868096351624, + "learning_rate": 7.128987517337033e-06, + "loss": 0.0643, + "step": 4150 + }, + { + "epoch": 2.8756494631104954, + "grad_norm": 0.8251103758811951, + "learning_rate": 7.128294036061026e-06, + "loss": 0.0616, + "step": 4151 + }, + { + "epoch": 2.8763422237616902, + "grad_norm": 0.7739446759223938, + "learning_rate": 7.127600554785021e-06, + "loss": 0.0564, + "step": 4152 + }, + { + "epoch": 2.8770349844128855, + "grad_norm": 0.720413327217102, + "learning_rate": 7.126907073509016e-06, + "loss": 0.0544, + "step": 4153 + }, + { + "epoch": 2.8777277450640804, + "grad_norm": 0.7851538062095642, + "learning_rate": 7.12621359223301e-06, + "loss": 0.0584, + "step": 4154 + }, + { + "epoch": 2.878420505715275, + "grad_norm": 0.6678401231765747, + "learning_rate": 7.125520110957005e-06, + "loss": 0.0536, + "step": 4155 + }, + { + "epoch": 2.8791132663664705, + "grad_norm": 0.6614765524864197, + "learning_rate": 7.124826629680998e-06, + "loss": 0.0478, + "step": 4156 + }, + { + "epoch": 2.8798060270176653, + "grad_norm": 0.7225939631462097, + "learning_rate": 7.124133148404993e-06, + "loss": 0.053, + "step": 4157 + }, + { + "epoch": 2.88049878766886, + "grad_norm": 0.7855435609817505, + "learning_rate": 7.123439667128988e-06, + "loss": 0.0667, + "step": 4158 + }, + { + "epoch": 2.8811915483200554, + "grad_norm": 0.7119595408439636, + "learning_rate": 7.122746185852982e-06, + "loss": 0.061, + "step": 4159 + }, + { + "epoch": 2.8818843089712507, + "grad_norm": 0.8931176662445068, + "learning_rate": 7.122052704576977e-06, + "loss": 0.072, + "step": 4160 + }, + { + "epoch": 2.8825770696224455, + "grad_norm": 0.8780502080917358, + "learning_rate": 7.121359223300971e-06, + "loss": 0.0633, + "step": 4161 + }, + { + "epoch": 2.8832698302736404, + "grad_norm": 0.6471735835075378, + "learning_rate": 7.120665742024966e-06, + "loss": 0.0423, + "step": 4162 + }, + { + "epoch": 2.8839625909248356, + "grad_norm": 0.8132211565971375, + "learning_rate": 7.119972260748961e-06, + "loss": 0.0663, + "step": 4163 + }, + { + "epoch": 2.8846553515760305, + "grad_norm": 0.9457964301109314, + "learning_rate": 7.119278779472954e-06, + "loss": 0.0565, + "step": 4164 + }, + { + "epoch": 2.8853481122272253, + "grad_norm": 0.6737701892852783, + "learning_rate": 7.118585298196949e-06, + "loss": 0.0548, + "step": 4165 + }, + { + "epoch": 2.8860408728784206, + "grad_norm": 0.7230066061019897, + "learning_rate": 7.117891816920943e-06, + "loss": 0.0621, + "step": 4166 + }, + { + "epoch": 2.8867336335296154, + "grad_norm": 0.8144643306732178, + "learning_rate": 7.117198335644938e-06, + "loss": 0.0516, + "step": 4167 + }, + { + "epoch": 2.8874263941808103, + "grad_norm": 0.7155871391296387, + "learning_rate": 7.116504854368933e-06, + "loss": 0.0458, + "step": 4168 + }, + { + "epoch": 2.8881191548320055, + "grad_norm": 0.8399498462677002, + "learning_rate": 7.1158113730929265e-06, + "loss": 0.0661, + "step": 4169 + }, + { + "epoch": 2.888811915483201, + "grad_norm": 0.763512134552002, + "learning_rate": 7.115117891816921e-06, + "loss": 0.0485, + "step": 4170 + }, + { + "epoch": 2.8895046761343957, + "grad_norm": 0.7869910597801208, + "learning_rate": 7.1144244105409155e-06, + "loss": 0.067, + "step": 4171 + }, + { + "epoch": 2.8901974367855905, + "grad_norm": 0.7691038846969604, + "learning_rate": 7.1137309292649104e-06, + "loss": 0.0591, + "step": 4172 + }, + { + "epoch": 2.8908901974367858, + "grad_norm": 0.911909282207489, + "learning_rate": 7.113037447988905e-06, + "loss": 0.0581, + "step": 4173 + }, + { + "epoch": 2.8915829580879806, + "grad_norm": 0.8166438341140747, + "learning_rate": 7.1123439667128995e-06, + "loss": 0.0611, + "step": 4174 + }, + { + "epoch": 2.8922757187391754, + "grad_norm": 0.6859363913536072, + "learning_rate": 7.1116504854368935e-06, + "loss": 0.0539, + "step": 4175 + }, + { + "epoch": 2.8929684793903707, + "grad_norm": 0.7008057236671448, + "learning_rate": 7.110957004160888e-06, + "loss": 0.0566, + "step": 4176 + }, + { + "epoch": 2.8936612400415656, + "grad_norm": 0.760750949382782, + "learning_rate": 7.1102635228848826e-06, + "loss": 0.0567, + "step": 4177 + }, + { + "epoch": 2.8943540006927604, + "grad_norm": 0.782202422618866, + "learning_rate": 7.1095700416088775e-06, + "loss": 0.0566, + "step": 4178 + }, + { + "epoch": 2.8950467613439557, + "grad_norm": 0.6821407079696655, + "learning_rate": 7.108876560332872e-06, + "loss": 0.0602, + "step": 4179 + }, + { + "epoch": 2.895739521995151, + "grad_norm": 0.7670574188232422, + "learning_rate": 7.1081830790568665e-06, + "loss": 0.0573, + "step": 4180 + }, + { + "epoch": 2.896432282646346, + "grad_norm": 0.6521784067153931, + "learning_rate": 7.10748959778086e-06, + "loss": 0.0382, + "step": 4181 + }, + { + "epoch": 2.8971250432975406, + "grad_norm": 0.7804185748100281, + "learning_rate": 7.106796116504855e-06, + "loss": 0.0598, + "step": 4182 + }, + { + "epoch": 2.897817803948736, + "grad_norm": 0.7645688056945801, + "learning_rate": 7.10610263522885e-06, + "loss": 0.0585, + "step": 4183 + }, + { + "epoch": 2.8985105645999307, + "grad_norm": 0.7269492149353027, + "learning_rate": 7.105409153952844e-06, + "loss": 0.0608, + "step": 4184 + }, + { + "epoch": 2.8992033252511256, + "grad_norm": 0.719804048538208, + "learning_rate": 7.104715672676839e-06, + "loss": 0.0653, + "step": 4185 + }, + { + "epoch": 2.899896085902321, + "grad_norm": 0.7282113432884216, + "learning_rate": 7.104022191400832e-06, + "loss": 0.0591, + "step": 4186 + }, + { + "epoch": 2.9005888465535157, + "grad_norm": 0.729363739490509, + "learning_rate": 7.103328710124827e-06, + "loss": 0.054, + "step": 4187 + }, + { + "epoch": 2.9012816072047105, + "grad_norm": 0.7966277599334717, + "learning_rate": 7.102635228848822e-06, + "loss": 0.0688, + "step": 4188 + }, + { + "epoch": 2.901974367855906, + "grad_norm": 0.9300838112831116, + "learning_rate": 7.101941747572816e-06, + "loss": 0.0559, + "step": 4189 + }, + { + "epoch": 2.902667128507101, + "grad_norm": 0.777302086353302, + "learning_rate": 7.101248266296811e-06, + "loss": 0.0712, + "step": 4190 + }, + { + "epoch": 2.903359889158296, + "grad_norm": 0.785150945186615, + "learning_rate": 7.100554785020805e-06, + "loss": 0.0558, + "step": 4191 + }, + { + "epoch": 2.9040526498094907, + "grad_norm": 0.7144959568977356, + "learning_rate": 7.0998613037448e-06, + "loss": 0.0621, + "step": 4192 + }, + { + "epoch": 2.904745410460686, + "grad_norm": 0.5931468605995178, + "learning_rate": 7.099167822468795e-06, + "loss": 0.0442, + "step": 4193 + }, + { + "epoch": 2.905438171111881, + "grad_norm": 0.8034635186195374, + "learning_rate": 7.098474341192788e-06, + "loss": 0.061, + "step": 4194 + }, + { + "epoch": 2.9061309317630757, + "grad_norm": 0.701107382774353, + "learning_rate": 7.097780859916783e-06, + "loss": 0.0566, + "step": 4195 + }, + { + "epoch": 2.906823692414271, + "grad_norm": 0.6087248921394348, + "learning_rate": 7.097087378640777e-06, + "loss": 0.0516, + "step": 4196 + }, + { + "epoch": 2.907516453065466, + "grad_norm": 0.7192146182060242, + "learning_rate": 7.096393897364772e-06, + "loss": 0.0553, + "step": 4197 + }, + { + "epoch": 2.9082092137166606, + "grad_norm": 0.724388837814331, + "learning_rate": 7.095700416088767e-06, + "loss": 0.075, + "step": 4198 + }, + { + "epoch": 2.908901974367856, + "grad_norm": 0.6641338467597961, + "learning_rate": 7.09500693481276e-06, + "loss": 0.0557, + "step": 4199 + }, + { + "epoch": 2.909594735019051, + "grad_norm": 0.6138565540313721, + "learning_rate": 7.094313453536755e-06, + "loss": 0.0543, + "step": 4200 + }, + { + "epoch": 2.910287495670246, + "grad_norm": 0.605147123336792, + "learning_rate": 7.093619972260749e-06, + "loss": 0.0449, + "step": 4201 + }, + { + "epoch": 2.910980256321441, + "grad_norm": 0.6798510551452637, + "learning_rate": 7.092926490984744e-06, + "loss": 0.0477, + "step": 4202 + }, + { + "epoch": 2.911673016972636, + "grad_norm": 0.7753946781158447, + "learning_rate": 7.092233009708739e-06, + "loss": 0.0723, + "step": 4203 + }, + { + "epoch": 2.912365777623831, + "grad_norm": 0.6524571776390076, + "learning_rate": 7.091539528432732e-06, + "loss": 0.0574, + "step": 4204 + }, + { + "epoch": 2.913058538275026, + "grad_norm": 0.5583915114402771, + "learning_rate": 7.090846047156727e-06, + "loss": 0.0473, + "step": 4205 + }, + { + "epoch": 2.913751298926221, + "grad_norm": 0.8100289106369019, + "learning_rate": 7.090152565880721e-06, + "loss": 0.0736, + "step": 4206 + }, + { + "epoch": 2.914444059577416, + "grad_norm": 0.6637985706329346, + "learning_rate": 7.089459084604716e-06, + "loss": 0.0508, + "step": 4207 + }, + { + "epoch": 2.9151368202286108, + "grad_norm": 0.8062850832939148, + "learning_rate": 7.088765603328711e-06, + "loss": 0.0608, + "step": 4208 + }, + { + "epoch": 2.915829580879806, + "grad_norm": 0.71599280834198, + "learning_rate": 7.088072122052705e-06, + "loss": 0.0465, + "step": 4209 + }, + { + "epoch": 2.9165223415310013, + "grad_norm": 0.8217005729675293, + "learning_rate": 7.0873786407767e-06, + "loss": 0.0792, + "step": 4210 + }, + { + "epoch": 2.917215102182196, + "grad_norm": 0.6581671833992004, + "learning_rate": 7.086685159500693e-06, + "loss": 0.0519, + "step": 4211 + }, + { + "epoch": 2.917907862833391, + "grad_norm": 0.8184213638305664, + "learning_rate": 7.085991678224688e-06, + "loss": 0.0701, + "step": 4212 + }, + { + "epoch": 2.9186006234845863, + "grad_norm": 0.6626460552215576, + "learning_rate": 7.085298196948683e-06, + "loss": 0.0587, + "step": 4213 + }, + { + "epoch": 2.919293384135781, + "grad_norm": 0.8326358795166016, + "learning_rate": 7.084604715672677e-06, + "loss": 0.0696, + "step": 4214 + }, + { + "epoch": 2.919986144786976, + "grad_norm": 0.7991271615028381, + "learning_rate": 7.083911234396672e-06, + "loss": 0.0597, + "step": 4215 + }, + { + "epoch": 2.920678905438171, + "grad_norm": 0.6827394366264343, + "learning_rate": 7.0832177531206655e-06, + "loss": 0.0491, + "step": 4216 + }, + { + "epoch": 2.921371666089366, + "grad_norm": 0.6888494491577148, + "learning_rate": 7.08252427184466e-06, + "loss": 0.0546, + "step": 4217 + }, + { + "epoch": 2.922064426740561, + "grad_norm": 0.6674357056617737, + "learning_rate": 7.081830790568655e-06, + "loss": 0.0538, + "step": 4218 + }, + { + "epoch": 2.922757187391756, + "grad_norm": 0.6578598618507385, + "learning_rate": 7.0811373092926494e-06, + "loss": 0.0526, + "step": 4219 + }, + { + "epoch": 2.923449948042951, + "grad_norm": 0.7401788234710693, + "learning_rate": 7.080443828016644e-06, + "loss": 0.0546, + "step": 4220 + }, + { + "epoch": 2.9241427086941463, + "grad_norm": 0.6942839622497559, + "learning_rate": 7.0797503467406385e-06, + "loss": 0.0466, + "step": 4221 + }, + { + "epoch": 2.924835469345341, + "grad_norm": 0.7386929392814636, + "learning_rate": 7.079056865464633e-06, + "loss": 0.0653, + "step": 4222 + }, + { + "epoch": 2.9255282299965364, + "grad_norm": 0.9131103754043579, + "learning_rate": 7.0783633841886275e-06, + "loss": 0.0557, + "step": 4223 + }, + { + "epoch": 2.9262209906477312, + "grad_norm": 0.8071298003196716, + "learning_rate": 7.0776699029126216e-06, + "loss": 0.0556, + "step": 4224 + }, + { + "epoch": 2.926913751298926, + "grad_norm": 0.7347648739814758, + "learning_rate": 7.0769764216366165e-06, + "loss": 0.0572, + "step": 4225 + }, + { + "epoch": 2.9276065119501213, + "grad_norm": 0.8403554558753967, + "learning_rate": 7.076282940360611e-06, + "loss": 0.045, + "step": 4226 + }, + { + "epoch": 2.928299272601316, + "grad_norm": 0.7159295082092285, + "learning_rate": 7.0755894590846055e-06, + "loss": 0.0638, + "step": 4227 + }, + { + "epoch": 2.928992033252511, + "grad_norm": 0.7982239127159119, + "learning_rate": 7.0748959778086004e-06, + "loss": 0.0652, + "step": 4228 + }, + { + "epoch": 2.9296847939037063, + "grad_norm": 0.8985694050788879, + "learning_rate": 7.074202496532594e-06, + "loss": 0.0505, + "step": 4229 + }, + { + "epoch": 2.930377554554901, + "grad_norm": 0.5979345440864563, + "learning_rate": 7.073509015256589e-06, + "loss": 0.0401, + "step": 4230 + }, + { + "epoch": 2.9310703152060964, + "grad_norm": 0.7456455826759338, + "learning_rate": 7.072815533980583e-06, + "loss": 0.0493, + "step": 4231 + }, + { + "epoch": 2.9317630758572912, + "grad_norm": 0.6642611622810364, + "learning_rate": 7.072122052704578e-06, + "loss": 0.055, + "step": 4232 + }, + { + "epoch": 2.9324558365084865, + "grad_norm": 0.7627031207084656, + "learning_rate": 7.0714285714285726e-06, + "loss": 0.0686, + "step": 4233 + }, + { + "epoch": 2.9331485971596813, + "grad_norm": 0.6851139664649963, + "learning_rate": 7.070735090152566e-06, + "loss": 0.0542, + "step": 4234 + }, + { + "epoch": 2.933841357810876, + "grad_norm": 0.885277509689331, + "learning_rate": 7.070041608876561e-06, + "loss": 0.0937, + "step": 4235 + }, + { + "epoch": 2.9345341184620715, + "grad_norm": 0.8729960918426514, + "learning_rate": 7.069348127600555e-06, + "loss": 0.0649, + "step": 4236 + }, + { + "epoch": 2.9352268791132663, + "grad_norm": 0.7657375931739807, + "learning_rate": 7.06865464632455e-06, + "loss": 0.0663, + "step": 4237 + }, + { + "epoch": 2.935919639764461, + "grad_norm": 0.6353119611740112, + "learning_rate": 7.067961165048545e-06, + "loss": 0.0506, + "step": 4238 + }, + { + "epoch": 2.9366124004156564, + "grad_norm": 0.7135321497917175, + "learning_rate": 7.067267683772539e-06, + "loss": 0.055, + "step": 4239 + }, + { + "epoch": 2.9373051610668512, + "grad_norm": 0.7522273063659668, + "learning_rate": 7.066574202496534e-06, + "loss": 0.0634, + "step": 4240 + }, + { + "epoch": 2.9379979217180465, + "grad_norm": 0.7556070685386658, + "learning_rate": 7.065880721220527e-06, + "loss": 0.064, + "step": 4241 + }, + { + "epoch": 2.9386906823692414, + "grad_norm": 0.7151448726654053, + "learning_rate": 7.065187239944522e-06, + "loss": 0.0521, + "step": 4242 + }, + { + "epoch": 2.9393834430204366, + "grad_norm": 0.6853150129318237, + "learning_rate": 7.064493758668517e-06, + "loss": 0.0514, + "step": 4243 + }, + { + "epoch": 2.9400762036716315, + "grad_norm": 0.7302935719490051, + "learning_rate": 7.063800277392511e-06, + "loss": 0.051, + "step": 4244 + }, + { + "epoch": 2.9407689643228263, + "grad_norm": 0.6857462525367737, + "learning_rate": 7.063106796116506e-06, + "loss": 0.0495, + "step": 4245 + }, + { + "epoch": 2.9414617249740216, + "grad_norm": 0.6912993788719177, + "learning_rate": 7.062413314840499e-06, + "loss": 0.0646, + "step": 4246 + }, + { + "epoch": 2.9421544856252164, + "grad_norm": 0.6467717289924622, + "learning_rate": 7.061719833564494e-06, + "loss": 0.0495, + "step": 4247 + }, + { + "epoch": 2.9428472462764113, + "grad_norm": 0.7279195189476013, + "learning_rate": 7.061026352288489e-06, + "loss": 0.058, + "step": 4248 + }, + { + "epoch": 2.9435400069276065, + "grad_norm": 0.8106805682182312, + "learning_rate": 7.060332871012483e-06, + "loss": 0.0548, + "step": 4249 + }, + { + "epoch": 2.9442327675788014, + "grad_norm": 0.6478182673454285, + "learning_rate": 7.059639389736478e-06, + "loss": 0.0555, + "step": 4250 + }, + { + "epoch": 2.9449255282299966, + "grad_norm": 0.7866591811180115, + "learning_rate": 7.058945908460472e-06, + "loss": 0.0518, + "step": 4251 + }, + { + "epoch": 2.9456182888811915, + "grad_norm": 0.6769903302192688, + "learning_rate": 7.058252427184467e-06, + "loss": 0.0574, + "step": 4252 + }, + { + "epoch": 2.9463110495323868, + "grad_norm": 0.6291192770004272, + "learning_rate": 7.057558945908461e-06, + "loss": 0.0512, + "step": 4253 + }, + { + "epoch": 2.9470038101835816, + "grad_norm": 0.7872965335845947, + "learning_rate": 7.056865464632455e-06, + "loss": 0.0508, + "step": 4254 + }, + { + "epoch": 2.9476965708347764, + "grad_norm": 0.7260953783988953, + "learning_rate": 7.05617198335645e-06, + "loss": 0.0472, + "step": 4255 + }, + { + "epoch": 2.9483893314859717, + "grad_norm": 0.6566283702850342, + "learning_rate": 7.055478502080444e-06, + "loss": 0.0469, + "step": 4256 + }, + { + "epoch": 2.9490820921371665, + "grad_norm": 0.7882493734359741, + "learning_rate": 7.054785020804439e-06, + "loss": 0.0558, + "step": 4257 + }, + { + "epoch": 2.9497748527883614, + "grad_norm": 0.8718060255050659, + "learning_rate": 7.054091539528434e-06, + "loss": 0.0412, + "step": 4258 + }, + { + "epoch": 2.9504676134395567, + "grad_norm": 0.869996190071106, + "learning_rate": 7.053398058252427e-06, + "loss": 0.0661, + "step": 4259 + }, + { + "epoch": 2.9511603740907515, + "grad_norm": 0.7169980406761169, + "learning_rate": 7.052704576976422e-06, + "loss": 0.0556, + "step": 4260 + }, + { + "epoch": 2.9518531347419468, + "grad_norm": 0.7835674285888672, + "learning_rate": 7.052011095700416e-06, + "loss": 0.0589, + "step": 4261 + }, + { + "epoch": 2.9525458953931416, + "grad_norm": 0.7977503538131714, + "learning_rate": 7.051317614424411e-06, + "loss": 0.0537, + "step": 4262 + }, + { + "epoch": 2.953238656044337, + "grad_norm": 0.6363850831985474, + "learning_rate": 7.050624133148406e-06, + "loss": 0.0463, + "step": 4263 + }, + { + "epoch": 2.9539314166955317, + "grad_norm": 0.7391969561576843, + "learning_rate": 7.049930651872399e-06, + "loss": 0.0456, + "step": 4264 + }, + { + "epoch": 2.9546241773467266, + "grad_norm": 0.8899480700492859, + "learning_rate": 7.049237170596394e-06, + "loss": 0.051, + "step": 4265 + }, + { + "epoch": 2.955316937997922, + "grad_norm": 0.8108271360397339, + "learning_rate": 7.0485436893203884e-06, + "loss": 0.0513, + "step": 4266 + }, + { + "epoch": 2.9560096986491167, + "grad_norm": 0.6377550959587097, + "learning_rate": 7.047850208044383e-06, + "loss": 0.0419, + "step": 4267 + }, + { + "epoch": 2.9567024593003115, + "grad_norm": 0.7099804878234863, + "learning_rate": 7.047156726768378e-06, + "loss": 0.0541, + "step": 4268 + }, + { + "epoch": 2.957395219951507, + "grad_norm": 0.6533857583999634, + "learning_rate": 7.046463245492372e-06, + "loss": 0.044, + "step": 4269 + }, + { + "epoch": 2.9580879806027016, + "grad_norm": 0.6126710176467896, + "learning_rate": 7.045769764216367e-06, + "loss": 0.0416, + "step": 4270 + }, + { + "epoch": 2.958780741253897, + "grad_norm": 0.7803050875663757, + "learning_rate": 7.0450762829403606e-06, + "loss": 0.0504, + "step": 4271 + }, + { + "epoch": 2.9594735019050917, + "grad_norm": 0.7303830981254578, + "learning_rate": 7.0443828016643555e-06, + "loss": 0.0628, + "step": 4272 + }, + { + "epoch": 2.960166262556287, + "grad_norm": 0.7416102290153503, + "learning_rate": 7.0436893203883504e-06, + "loss": 0.0591, + "step": 4273 + }, + { + "epoch": 2.960859023207482, + "grad_norm": 0.9078226089477539, + "learning_rate": 7.0429958391123445e-06, + "loss": 0.0566, + "step": 4274 + }, + { + "epoch": 2.9615517838586767, + "grad_norm": 0.7663037776947021, + "learning_rate": 7.0423023578363395e-06, + "loss": 0.0631, + "step": 4275 + }, + { + "epoch": 2.962244544509872, + "grad_norm": 0.7635251879692078, + "learning_rate": 7.041608876560333e-06, + "loss": 0.0668, + "step": 4276 + }, + { + "epoch": 2.962937305161067, + "grad_norm": 0.7865013480186462, + "learning_rate": 7.040915395284328e-06, + "loss": 0.0686, + "step": 4277 + }, + { + "epoch": 2.9636300658122616, + "grad_norm": 0.7668929100036621, + "learning_rate": 7.0402219140083226e-06, + "loss": 0.0574, + "step": 4278 + }, + { + "epoch": 2.964322826463457, + "grad_norm": 0.7387571930885315, + "learning_rate": 7.039528432732317e-06, + "loss": 0.0532, + "step": 4279 + }, + { + "epoch": 2.9650155871146517, + "grad_norm": 0.71168053150177, + "learning_rate": 7.0388349514563116e-06, + "loss": 0.0455, + "step": 4280 + }, + { + "epoch": 2.965708347765847, + "grad_norm": 0.6919817328453064, + "learning_rate": 7.038141470180306e-06, + "loss": 0.0469, + "step": 4281 + }, + { + "epoch": 2.966401108417042, + "grad_norm": 0.672911524772644, + "learning_rate": 7.0374479889043e-06, + "loss": 0.0455, + "step": 4282 + }, + { + "epoch": 2.967093869068237, + "grad_norm": 0.6759064793586731, + "learning_rate": 7.036754507628295e-06, + "loss": 0.061, + "step": 4283 + }, + { + "epoch": 2.967786629719432, + "grad_norm": 0.7007350921630859, + "learning_rate": 7.036061026352289e-06, + "loss": 0.0607, + "step": 4284 + }, + { + "epoch": 2.968479390370627, + "grad_norm": 0.681125283241272, + "learning_rate": 7.035367545076284e-06, + "loss": 0.0529, + "step": 4285 + }, + { + "epoch": 2.969172151021822, + "grad_norm": 0.6818118095397949, + "learning_rate": 7.034674063800278e-06, + "loss": 0.0467, + "step": 4286 + }, + { + "epoch": 2.969864911673017, + "grad_norm": 0.7406013607978821, + "learning_rate": 7.033980582524273e-06, + "loss": 0.0401, + "step": 4287 + }, + { + "epoch": 2.9705576723242118, + "grad_norm": 0.5973888039588928, + "learning_rate": 7.033287101248268e-06, + "loss": 0.0494, + "step": 4288 + }, + { + "epoch": 2.971250432975407, + "grad_norm": 0.676389217376709, + "learning_rate": 7.032593619972261e-06, + "loss": 0.0416, + "step": 4289 + }, + { + "epoch": 2.971943193626602, + "grad_norm": 0.6153448224067688, + "learning_rate": 7.031900138696256e-06, + "loss": 0.0477, + "step": 4290 + }, + { + "epoch": 2.972635954277797, + "grad_norm": 0.8072543144226074, + "learning_rate": 7.03120665742025e-06, + "loss": 0.0766, + "step": 4291 + }, + { + "epoch": 2.973328714928992, + "grad_norm": 0.8234242796897888, + "learning_rate": 7.030513176144245e-06, + "loss": 0.0661, + "step": 4292 + }, + { + "epoch": 2.9740214755801873, + "grad_norm": 0.8944745659828186, + "learning_rate": 7.02981969486824e-06, + "loss": 0.0539, + "step": 4293 + }, + { + "epoch": 2.974714236231382, + "grad_norm": 0.7886906266212463, + "learning_rate": 7.029126213592233e-06, + "loss": 0.0597, + "step": 4294 + }, + { + "epoch": 2.975406996882577, + "grad_norm": 0.7113933563232422, + "learning_rate": 7.028432732316228e-06, + "loss": 0.0558, + "step": 4295 + }, + { + "epoch": 2.976099757533772, + "grad_norm": 0.6581159234046936, + "learning_rate": 7.027739251040222e-06, + "loss": 0.0441, + "step": 4296 + }, + { + "epoch": 2.976792518184967, + "grad_norm": 0.6525651216506958, + "learning_rate": 7.027045769764217e-06, + "loss": 0.0471, + "step": 4297 + }, + { + "epoch": 2.977485278836162, + "grad_norm": 0.8623197078704834, + "learning_rate": 7.026352288488212e-06, + "loss": 0.0497, + "step": 4298 + }, + { + "epoch": 2.978178039487357, + "grad_norm": 0.7325842380523682, + "learning_rate": 7.025658807212206e-06, + "loss": 0.0577, + "step": 4299 + }, + { + "epoch": 2.978870800138552, + "grad_norm": 0.7979859709739685, + "learning_rate": 7.024965325936201e-06, + "loss": 0.0654, + "step": 4300 + }, + { + "epoch": 2.9795635607897473, + "grad_norm": 0.7051188349723816, + "learning_rate": 7.024271844660194e-06, + "loss": 0.0598, + "step": 4301 + }, + { + "epoch": 2.980256321440942, + "grad_norm": 0.6145420670509338, + "learning_rate": 7.023578363384189e-06, + "loss": 0.0479, + "step": 4302 + }, + { + "epoch": 2.9809490820921374, + "grad_norm": 0.8106651306152344, + "learning_rate": 7.022884882108184e-06, + "loss": 0.0475, + "step": 4303 + }, + { + "epoch": 2.981641842743332, + "grad_norm": 0.6330602169036865, + "learning_rate": 7.022191400832178e-06, + "loss": 0.0455, + "step": 4304 + }, + { + "epoch": 2.982334603394527, + "grad_norm": 0.7322973608970642, + "learning_rate": 7.021497919556173e-06, + "loss": 0.0508, + "step": 4305 + }, + { + "epoch": 2.9830273640457223, + "grad_norm": 0.5652062892913818, + "learning_rate": 7.020804438280166e-06, + "loss": 0.0377, + "step": 4306 + }, + { + "epoch": 2.983720124696917, + "grad_norm": 0.6371831893920898, + "learning_rate": 7.020110957004161e-06, + "loss": 0.0544, + "step": 4307 + }, + { + "epoch": 2.984412885348112, + "grad_norm": 0.663821280002594, + "learning_rate": 7.019417475728156e-06, + "loss": 0.0479, + "step": 4308 + }, + { + "epoch": 2.9851056459993073, + "grad_norm": 0.7163864970207214, + "learning_rate": 7.01872399445215e-06, + "loss": 0.0485, + "step": 4309 + }, + { + "epoch": 2.985798406650502, + "grad_norm": 0.7398287653923035, + "learning_rate": 7.018030513176145e-06, + "loss": 0.0707, + "step": 4310 + }, + { + "epoch": 2.9864911673016974, + "grad_norm": 0.6587268710136414, + "learning_rate": 7.017337031900138e-06, + "loss": 0.0417, + "step": 4311 + }, + { + "epoch": 2.9871839279528922, + "grad_norm": 0.7377708554267883, + "learning_rate": 7.016643550624133e-06, + "loss": 0.0634, + "step": 4312 + }, + { + "epoch": 2.9878766886040875, + "grad_norm": 0.6966869235038757, + "learning_rate": 7.015950069348128e-06, + "loss": 0.0612, + "step": 4313 + }, + { + "epoch": 2.9885694492552823, + "grad_norm": 0.6966063380241394, + "learning_rate": 7.015256588072122e-06, + "loss": 0.0568, + "step": 4314 + }, + { + "epoch": 2.989262209906477, + "grad_norm": 0.6529871821403503, + "learning_rate": 7.014563106796117e-06, + "loss": 0.0551, + "step": 4315 + }, + { + "epoch": 2.9899549705576725, + "grad_norm": 0.6561982035636902, + "learning_rate": 7.013869625520111e-06, + "loss": 0.0434, + "step": 4316 + }, + { + "epoch": 2.9906477312088673, + "grad_norm": 0.7706881761550903, + "learning_rate": 7.013176144244106e-06, + "loss": 0.0445, + "step": 4317 + }, + { + "epoch": 2.991340491860062, + "grad_norm": 0.6570540070533752, + "learning_rate": 7.012482662968101e-06, + "loss": 0.0483, + "step": 4318 + }, + { + "epoch": 2.9920332525112574, + "grad_norm": 0.6899812817573547, + "learning_rate": 7.0117891816920945e-06, + "loss": 0.0501, + "step": 4319 + }, + { + "epoch": 2.9927260131624522, + "grad_norm": 0.6786242723464966, + "learning_rate": 7.0110957004160894e-06, + "loss": 0.0531, + "step": 4320 + }, + { + "epoch": 2.9934187738136475, + "grad_norm": 0.6994133591651917, + "learning_rate": 7.0104022191400835e-06, + "loss": 0.0551, + "step": 4321 + }, + { + "epoch": 2.9941115344648424, + "grad_norm": 0.7612938284873962, + "learning_rate": 7.0097087378640785e-06, + "loss": 0.0583, + "step": 4322 + }, + { + "epoch": 2.9948042951160376, + "grad_norm": 0.666746973991394, + "learning_rate": 7.009015256588073e-06, + "loss": 0.0502, + "step": 4323 + }, + { + "epoch": 2.9954970557672325, + "grad_norm": 0.7190625071525574, + "learning_rate": 7.008321775312067e-06, + "loss": 0.0613, + "step": 4324 + }, + { + "epoch": 2.9961898164184273, + "grad_norm": 0.6754704713821411, + "learning_rate": 7.0076282940360616e-06, + "loss": 0.0517, + "step": 4325 + }, + { + "epoch": 2.9968825770696226, + "grad_norm": 0.6130306720733643, + "learning_rate": 7.006934812760056e-06, + "loss": 0.0473, + "step": 4326 + }, + { + "epoch": 2.9975753377208174, + "grad_norm": 0.7010437846183777, + "learning_rate": 7.006241331484051e-06, + "loss": 0.05, + "step": 4327 + }, + { + "epoch": 2.9982680983720122, + "grad_norm": 0.731165885925293, + "learning_rate": 7.0055478502080455e-06, + "loss": 0.0523, + "step": 4328 + }, + { + "epoch": 2.9989608590232075, + "grad_norm": 0.876032292842865, + "learning_rate": 7.00485436893204e-06, + "loss": 0.0599, + "step": 4329 + }, + { + "epoch": 2.9996536196744024, + "grad_norm": 0.9039497375488281, + "learning_rate": 7.004160887656034e-06, + "loss": 0.0774, + "step": 4330 + }, + { + "epoch": 2.9996536196744024, + "eval_loss": 0.24455492198467255, + "eval_runtime": 7659.838, + "eval_samples_per_second": 1.044, + "eval_steps_per_second": 0.033, + "eval_wer": 12.86208462909256, + "step": 4330 + }, + { + "epoch": 3.0003463803255976, + "grad_norm": 0.6198810935020447, + "learning_rate": 7.003467406380028e-06, + "loss": 0.0422, + "step": 4331 + }, + { + "epoch": 3.0010391409767925, + "grad_norm": 0.4685623049736023, + "learning_rate": 7.002773925104023e-06, + "loss": 0.0315, + "step": 4332 + }, + { + "epoch": 3.0017319016279873, + "grad_norm": 0.9896730780601501, + "learning_rate": 7.002080443828018e-06, + "loss": 0.0373, + "step": 4333 + }, + { + "epoch": 3.0024246622791826, + "grad_norm": 0.6355042457580566, + "learning_rate": 7.001386962552012e-06, + "loss": 0.0314, + "step": 4334 + }, + { + "epoch": 3.0031174229303774, + "grad_norm": 0.45164135098457336, + "learning_rate": 7.000693481276007e-06, + "loss": 0.0297, + "step": 4335 + }, + { + "epoch": 3.0038101835815727, + "grad_norm": 0.5834810137748718, + "learning_rate": 7e-06, + "loss": 0.0303, + "step": 4336 + }, + { + "epoch": 3.0045029442327675, + "grad_norm": 0.458690345287323, + "learning_rate": 6.999306518723995e-06, + "loss": 0.032, + "step": 4337 + }, + { + "epoch": 3.0051957048839624, + "grad_norm": 0.5150337815284729, + "learning_rate": 6.99861303744799e-06, + "loss": 0.0338, + "step": 4338 + }, + { + "epoch": 3.0058884655351576, + "grad_norm": 0.4686882197856903, + "learning_rate": 6.997919556171984e-06, + "loss": 0.0246, + "step": 4339 + }, + { + "epoch": 3.0065812261863525, + "grad_norm": 0.48966941237449646, + "learning_rate": 6.997226074895979e-06, + "loss": 0.0341, + "step": 4340 + }, + { + "epoch": 3.0072739868375478, + "grad_norm": 0.5113352537155151, + "learning_rate": 6.996532593619972e-06, + "loss": 0.0282, + "step": 4341 + }, + { + "epoch": 3.0079667474887426, + "grad_norm": 0.623214602470398, + "learning_rate": 6.995839112343967e-06, + "loss": 0.0357, + "step": 4342 + }, + { + "epoch": 3.0086595081399374, + "grad_norm": 0.5896956920623779, + "learning_rate": 6.995145631067962e-06, + "loss": 0.0319, + "step": 4343 + }, + { + "epoch": 3.0093522687911327, + "grad_norm": 0.6594386696815491, + "learning_rate": 6.994452149791956e-06, + "loss": 0.0354, + "step": 4344 + }, + { + "epoch": 3.0100450294423275, + "grad_norm": 0.6911342144012451, + "learning_rate": 6.993758668515951e-06, + "loss": 0.0255, + "step": 4345 + }, + { + "epoch": 3.010737790093523, + "grad_norm": 0.5812859535217285, + "learning_rate": 6.993065187239945e-06, + "loss": 0.0361, + "step": 4346 + }, + { + "epoch": 3.0114305507447177, + "grad_norm": 0.5115344524383545, + "learning_rate": 6.99237170596394e-06, + "loss": 0.0247, + "step": 4347 + }, + { + "epoch": 3.0121233113959125, + "grad_norm": 0.5840116739273071, + "learning_rate": 6.991678224687935e-06, + "loss": 0.0287, + "step": 4348 + }, + { + "epoch": 3.0128160720471078, + "grad_norm": 0.5522916913032532, + "learning_rate": 6.990984743411928e-06, + "loss": 0.0258, + "step": 4349 + }, + { + "epoch": 3.0135088326983026, + "grad_norm": 0.8488218188285828, + "learning_rate": 6.990291262135923e-06, + "loss": 0.0387, + "step": 4350 + }, + { + "epoch": 3.014201593349498, + "grad_norm": 0.5356283187866211, + "learning_rate": 6.989597780859917e-06, + "loss": 0.0373, + "step": 4351 + }, + { + "epoch": 3.0148943540006927, + "grad_norm": 0.5400470495223999, + "learning_rate": 6.988904299583912e-06, + "loss": 0.0347, + "step": 4352 + }, + { + "epoch": 3.0155871146518876, + "grad_norm": 0.4884282648563385, + "learning_rate": 6.988210818307907e-06, + "loss": 0.0251, + "step": 4353 + }, + { + "epoch": 3.016279875303083, + "grad_norm": 0.5223973393440247, + "learning_rate": 6.9875173370319e-06, + "loss": 0.0333, + "step": 4354 + }, + { + "epoch": 3.0169726359542777, + "grad_norm": 0.6101812124252319, + "learning_rate": 6.986823855755895e-06, + "loss": 0.024, + "step": 4355 + }, + { + "epoch": 3.017665396605473, + "grad_norm": 0.604290783405304, + "learning_rate": 6.986130374479889e-06, + "loss": 0.0272, + "step": 4356 + }, + { + "epoch": 3.018358157256668, + "grad_norm": 0.5956676006317139, + "learning_rate": 6.985436893203884e-06, + "loss": 0.0316, + "step": 4357 + }, + { + "epoch": 3.0190509179078626, + "grad_norm": 0.5751438140869141, + "learning_rate": 6.984743411927879e-06, + "loss": 0.036, + "step": 4358 + }, + { + "epoch": 3.019743678559058, + "grad_norm": 0.6272618174552917, + "learning_rate": 6.984049930651872e-06, + "loss": 0.0336, + "step": 4359 + }, + { + "epoch": 3.0204364392102527, + "grad_norm": 0.5761919617652893, + "learning_rate": 6.983356449375867e-06, + "loss": 0.026, + "step": 4360 + }, + { + "epoch": 3.021129199861448, + "grad_norm": 0.5389761328697205, + "learning_rate": 6.982662968099861e-06, + "loss": 0.0265, + "step": 4361 + }, + { + "epoch": 3.021821960512643, + "grad_norm": 0.621613085269928, + "learning_rate": 6.981969486823856e-06, + "loss": 0.0333, + "step": 4362 + }, + { + "epoch": 3.0225147211638377, + "grad_norm": 0.44902804493904114, + "learning_rate": 6.981276005547851e-06, + "loss": 0.0278, + "step": 4363 + }, + { + "epoch": 3.023207481815033, + "grad_norm": 0.658017635345459, + "learning_rate": 6.980582524271845e-06, + "loss": 0.0367, + "step": 4364 + }, + { + "epoch": 3.023900242466228, + "grad_norm": 0.5380011200904846, + "learning_rate": 6.97988904299584e-06, + "loss": 0.0336, + "step": 4365 + }, + { + "epoch": 3.024593003117423, + "grad_norm": 0.6718837022781372, + "learning_rate": 6.9791955617198335e-06, + "loss": 0.0328, + "step": 4366 + }, + { + "epoch": 3.025285763768618, + "grad_norm": 0.6377173662185669, + "learning_rate": 6.9785020804438284e-06, + "loss": 0.0352, + "step": 4367 + }, + { + "epoch": 3.0259785244198127, + "grad_norm": 0.44522419571876526, + "learning_rate": 6.977808599167823e-06, + "loss": 0.023, + "step": 4368 + }, + { + "epoch": 3.026671285071008, + "grad_norm": 0.537746250629425, + "learning_rate": 6.9771151178918175e-06, + "loss": 0.0316, + "step": 4369 + }, + { + "epoch": 3.027364045722203, + "grad_norm": 0.5502179265022278, + "learning_rate": 6.976421636615812e-06, + "loss": 0.0331, + "step": 4370 + }, + { + "epoch": 3.028056806373398, + "grad_norm": 0.602715790271759, + "learning_rate": 6.975728155339806e-06, + "loss": 0.0254, + "step": 4371 + }, + { + "epoch": 3.028749567024593, + "grad_norm": 0.5520962476730347, + "learning_rate": 6.9750346740638006e-06, + "loss": 0.0328, + "step": 4372 + }, + { + "epoch": 3.029442327675788, + "grad_norm": 0.5209055542945862, + "learning_rate": 6.9743411927877955e-06, + "loss": 0.0252, + "step": 4373 + }, + { + "epoch": 3.030135088326983, + "grad_norm": 0.5743737816810608, + "learning_rate": 6.97364771151179e-06, + "loss": 0.035, + "step": 4374 + }, + { + "epoch": 3.030827848978178, + "grad_norm": 0.5154269933700562, + "learning_rate": 6.9729542302357845e-06, + "loss": 0.0238, + "step": 4375 + }, + { + "epoch": 3.031520609629373, + "grad_norm": 0.5269982218742371, + "learning_rate": 6.972260748959779e-06, + "loss": 0.027, + "step": 4376 + }, + { + "epoch": 3.032213370280568, + "grad_norm": 0.5954230427742004, + "learning_rate": 6.9715672676837735e-06, + "loss": 0.0336, + "step": 4377 + }, + { + "epoch": 3.032906130931763, + "grad_norm": 0.53389972448349, + "learning_rate": 6.970873786407768e-06, + "loss": 0.0257, + "step": 4378 + }, + { + "epoch": 3.033598891582958, + "grad_norm": 0.5607286095619202, + "learning_rate": 6.970180305131762e-06, + "loss": 0.0397, + "step": 4379 + }, + { + "epoch": 3.034291652234153, + "grad_norm": 0.5695968270301819, + "learning_rate": 6.969486823855757e-06, + "loss": 0.0369, + "step": 4380 + }, + { + "epoch": 3.0349844128853483, + "grad_norm": 0.45668014883995056, + "learning_rate": 6.968793342579751e-06, + "loss": 0.0265, + "step": 4381 + }, + { + "epoch": 3.035677173536543, + "grad_norm": 0.462616890668869, + "learning_rate": 6.968099861303746e-06, + "loss": 0.026, + "step": 4382 + }, + { + "epoch": 3.036369934187738, + "grad_norm": 0.5351200103759766, + "learning_rate": 6.967406380027741e-06, + "loss": 0.0243, + "step": 4383 + }, + { + "epoch": 3.037062694838933, + "grad_norm": 0.5747559070587158, + "learning_rate": 6.966712898751734e-06, + "loss": 0.0328, + "step": 4384 + }, + { + "epoch": 3.037755455490128, + "grad_norm": 0.3567298352718353, + "learning_rate": 6.966019417475729e-06, + "loss": 0.016, + "step": 4385 + }, + { + "epoch": 3.0384482161413233, + "grad_norm": 0.6422958970069885, + "learning_rate": 6.965325936199723e-06, + "loss": 0.0256, + "step": 4386 + }, + { + "epoch": 3.039140976792518, + "grad_norm": 0.7727379202842712, + "learning_rate": 6.964632454923718e-06, + "loss": 0.0203, + "step": 4387 + }, + { + "epoch": 3.039833737443713, + "grad_norm": 0.6507742404937744, + "learning_rate": 6.963938973647713e-06, + "loss": 0.0345, + "step": 4388 + }, + { + "epoch": 3.0405264980949083, + "grad_norm": 0.48902198672294617, + "learning_rate": 6.963245492371706e-06, + "loss": 0.025, + "step": 4389 + }, + { + "epoch": 3.041219258746103, + "grad_norm": 0.527797281742096, + "learning_rate": 6.962552011095701e-06, + "loss": 0.0286, + "step": 4390 + }, + { + "epoch": 3.0419120193972984, + "grad_norm": 0.4999096393585205, + "learning_rate": 6.961858529819695e-06, + "loss": 0.027, + "step": 4391 + }, + { + "epoch": 3.042604780048493, + "grad_norm": 0.5397186279296875, + "learning_rate": 6.96116504854369e-06, + "loss": 0.0305, + "step": 4392 + }, + { + "epoch": 3.043297540699688, + "grad_norm": 0.6479367017745972, + "learning_rate": 6.960471567267685e-06, + "loss": 0.0343, + "step": 4393 + }, + { + "epoch": 3.0439903013508833, + "grad_norm": 0.4883427619934082, + "learning_rate": 6.959778085991679e-06, + "loss": 0.0195, + "step": 4394 + }, + { + "epoch": 3.044683062002078, + "grad_norm": 0.5787862539291382, + "learning_rate": 6.959084604715674e-06, + "loss": 0.0311, + "step": 4395 + }, + { + "epoch": 3.0453758226532734, + "grad_norm": 0.484783411026001, + "learning_rate": 6.958391123439667e-06, + "loss": 0.0223, + "step": 4396 + }, + { + "epoch": 3.0460685833044683, + "grad_norm": 0.6791329383850098, + "learning_rate": 6.957697642163662e-06, + "loss": 0.029, + "step": 4397 + }, + { + "epoch": 3.046761343955663, + "grad_norm": 0.47099193930625916, + "learning_rate": 6.957004160887657e-06, + "loss": 0.025, + "step": 4398 + }, + { + "epoch": 3.0474541046068584, + "grad_norm": 0.5062325596809387, + "learning_rate": 6.956310679611651e-06, + "loss": 0.0274, + "step": 4399 + }, + { + "epoch": 3.0481468652580532, + "grad_norm": 0.5030776858329773, + "learning_rate": 6.955617198335646e-06, + "loss": 0.0308, + "step": 4400 + }, + { + "epoch": 3.0488396259092485, + "grad_norm": 0.5200594067573547, + "learning_rate": 6.954923717059639e-06, + "loss": 0.0301, + "step": 4401 + }, + { + "epoch": 3.0495323865604433, + "grad_norm": 0.4133424758911133, + "learning_rate": 6.954230235783634e-06, + "loss": 0.0194, + "step": 4402 + }, + { + "epoch": 3.050225147211638, + "grad_norm": 0.47432467341423035, + "learning_rate": 6.953536754507629e-06, + "loss": 0.0306, + "step": 4403 + }, + { + "epoch": 3.0509179078628335, + "grad_norm": 0.5050131678581238, + "learning_rate": 6.952843273231623e-06, + "loss": 0.0293, + "step": 4404 + }, + { + "epoch": 3.0516106685140283, + "grad_norm": 0.6067765355110168, + "learning_rate": 6.952149791955618e-06, + "loss": 0.0358, + "step": 4405 + }, + { + "epoch": 3.0523034291652236, + "grad_norm": 0.5132820010185242, + "learning_rate": 6.951456310679612e-06, + "loss": 0.0278, + "step": 4406 + }, + { + "epoch": 3.0529961898164184, + "grad_norm": 0.5128617286682129, + "learning_rate": 6.950762829403606e-06, + "loss": 0.0268, + "step": 4407 + }, + { + "epoch": 3.0536889504676132, + "grad_norm": 0.5275478363037109, + "learning_rate": 6.950069348127601e-06, + "loss": 0.0255, + "step": 4408 + }, + { + "epoch": 3.0543817111188085, + "grad_norm": 0.5116069912910461, + "learning_rate": 6.949375866851595e-06, + "loss": 0.0342, + "step": 4409 + }, + { + "epoch": 3.0550744717700034, + "grad_norm": 0.591738224029541, + "learning_rate": 6.94868238557559e-06, + "loss": 0.0293, + "step": 4410 + }, + { + "epoch": 3.0557672324211986, + "grad_norm": 0.8489149808883667, + "learning_rate": 6.947988904299584e-06, + "loss": 0.038, + "step": 4411 + }, + { + "epoch": 3.0564599930723935, + "grad_norm": 0.6078969836235046, + "learning_rate": 6.947295423023579e-06, + "loss": 0.0438, + "step": 4412 + }, + { + "epoch": 3.0571527537235883, + "grad_norm": 0.5629372000694275, + "learning_rate": 6.946601941747574e-06, + "loss": 0.0356, + "step": 4413 + }, + { + "epoch": 3.0578455143747836, + "grad_norm": 0.5793541669845581, + "learning_rate": 6.9459084604715674e-06, + "loss": 0.0274, + "step": 4414 + }, + { + "epoch": 3.0585382750259784, + "grad_norm": 0.4805000126361847, + "learning_rate": 6.945214979195562e-06, + "loss": 0.0235, + "step": 4415 + }, + { + "epoch": 3.0592310356771737, + "grad_norm": 0.4964812397956848, + "learning_rate": 6.9445214979195565e-06, + "loss": 0.0265, + "step": 4416 + }, + { + "epoch": 3.0599237963283685, + "grad_norm": 0.47828176617622375, + "learning_rate": 6.943828016643551e-06, + "loss": 0.0265, + "step": 4417 + }, + { + "epoch": 3.0606165569795634, + "grad_norm": 0.5604119300842285, + "learning_rate": 6.943134535367546e-06, + "loss": 0.0311, + "step": 4418 + }, + { + "epoch": 3.0613093176307586, + "grad_norm": 0.551511287689209, + "learning_rate": 6.9424410540915396e-06, + "loss": 0.0251, + "step": 4419 + }, + { + "epoch": 3.0620020782819535, + "grad_norm": 0.49019280076026917, + "learning_rate": 6.9417475728155345e-06, + "loss": 0.0231, + "step": 4420 + }, + { + "epoch": 3.0626948389331488, + "grad_norm": 0.545318067073822, + "learning_rate": 6.941054091539529e-06, + "loss": 0.0317, + "step": 4421 + }, + { + "epoch": 3.0633875995843436, + "grad_norm": 0.504234790802002, + "learning_rate": 6.9403606102635235e-06, + "loss": 0.0263, + "step": 4422 + }, + { + "epoch": 3.0640803602355384, + "grad_norm": 0.45859989523887634, + "learning_rate": 6.9396671289875184e-06, + "loss": 0.0224, + "step": 4423 + }, + { + "epoch": 3.0647731208867337, + "grad_norm": 0.4985523223876953, + "learning_rate": 6.9389736477115125e-06, + "loss": 0.0254, + "step": 4424 + }, + { + "epoch": 3.0654658815379285, + "grad_norm": 0.4899972677230835, + "learning_rate": 6.9382801664355075e-06, + "loss": 0.0196, + "step": 4425 + }, + { + "epoch": 3.066158642189124, + "grad_norm": 0.550105631351471, + "learning_rate": 6.937586685159501e-06, + "loss": 0.0257, + "step": 4426 + }, + { + "epoch": 3.0668514028403187, + "grad_norm": 0.5070916414260864, + "learning_rate": 6.936893203883496e-06, + "loss": 0.0209, + "step": 4427 + }, + { + "epoch": 3.0675441634915135, + "grad_norm": 0.5206437110900879, + "learning_rate": 6.9361997226074906e-06, + "loss": 0.0268, + "step": 4428 + }, + { + "epoch": 3.0682369241427088, + "grad_norm": 0.6033762097358704, + "learning_rate": 6.935506241331485e-06, + "loss": 0.0319, + "step": 4429 + }, + { + "epoch": 3.0689296847939036, + "grad_norm": 0.6515141725540161, + "learning_rate": 6.93481276005548e-06, + "loss": 0.0297, + "step": 4430 + }, + { + "epoch": 3.069622445445099, + "grad_norm": 0.5186117887496948, + "learning_rate": 6.934119278779473e-06, + "loss": 0.027, + "step": 4431 + }, + { + "epoch": 3.0703152060962937, + "grad_norm": 0.5309900641441345, + "learning_rate": 6.933425797503468e-06, + "loss": 0.0318, + "step": 4432 + }, + { + "epoch": 3.0710079667474885, + "grad_norm": 0.5771611928939819, + "learning_rate": 6.932732316227463e-06, + "loss": 0.029, + "step": 4433 + }, + { + "epoch": 3.071700727398684, + "grad_norm": 0.551252543926239, + "learning_rate": 6.932038834951457e-06, + "loss": 0.0239, + "step": 4434 + }, + { + "epoch": 3.0723934880498787, + "grad_norm": 0.6136420369148254, + "learning_rate": 6.931345353675452e-06, + "loss": 0.0407, + "step": 4435 + }, + { + "epoch": 3.073086248701074, + "grad_norm": 0.5737125277519226, + "learning_rate": 6.930651872399445e-06, + "loss": 0.0316, + "step": 4436 + }, + { + "epoch": 3.0737790093522688, + "grad_norm": 0.583035945892334, + "learning_rate": 6.92995839112344e-06, + "loss": 0.0329, + "step": 4437 + }, + { + "epoch": 3.0744717700034636, + "grad_norm": 0.5928806662559509, + "learning_rate": 6.929264909847435e-06, + "loss": 0.0296, + "step": 4438 + }, + { + "epoch": 3.075164530654659, + "grad_norm": 0.5638597011566162, + "learning_rate": 6.928571428571429e-06, + "loss": 0.0262, + "step": 4439 + }, + { + "epoch": 3.0758572913058537, + "grad_norm": 0.6342456936836243, + "learning_rate": 6.927877947295424e-06, + "loss": 0.0367, + "step": 4440 + }, + { + "epoch": 3.076550051957049, + "grad_norm": 0.5516890287399292, + "learning_rate": 6.927184466019418e-06, + "loss": 0.0303, + "step": 4441 + }, + { + "epoch": 3.077242812608244, + "grad_norm": 0.8059898018836975, + "learning_rate": 6.926490984743413e-06, + "loss": 0.0357, + "step": 4442 + }, + { + "epoch": 3.0779355732594387, + "grad_norm": 0.6928458213806152, + "learning_rate": 6.925797503467408e-06, + "loss": 0.0338, + "step": 4443 + }, + { + "epoch": 3.078628333910634, + "grad_norm": 0.6038082838058472, + "learning_rate": 6.925104022191401e-06, + "loss": 0.0225, + "step": 4444 + }, + { + "epoch": 3.079321094561829, + "grad_norm": 0.44677624106407166, + "learning_rate": 6.924410540915396e-06, + "loss": 0.0235, + "step": 4445 + }, + { + "epoch": 3.080013855213024, + "grad_norm": 0.5955592393875122, + "learning_rate": 6.92371705963939e-06, + "loss": 0.0393, + "step": 4446 + }, + { + "epoch": 3.080706615864219, + "grad_norm": 0.4668789207935333, + "learning_rate": 6.923023578363385e-06, + "loss": 0.0279, + "step": 4447 + }, + { + "epoch": 3.0813993765154137, + "grad_norm": 0.5402735471725464, + "learning_rate": 6.92233009708738e-06, + "loss": 0.0283, + "step": 4448 + }, + { + "epoch": 3.082092137166609, + "grad_norm": 0.6488705277442932, + "learning_rate": 6.921636615811373e-06, + "loss": 0.0216, + "step": 4449 + }, + { + "epoch": 3.082784897817804, + "grad_norm": 0.665450394153595, + "learning_rate": 6.920943134535368e-06, + "loss": 0.0311, + "step": 4450 + }, + { + "epoch": 3.083477658468999, + "grad_norm": 0.5854861736297607, + "learning_rate": 6.920249653259362e-06, + "loss": 0.0258, + "step": 4451 + }, + { + "epoch": 3.084170419120194, + "grad_norm": 0.8058608770370483, + "learning_rate": 6.919556171983357e-06, + "loss": 0.0344, + "step": 4452 + }, + { + "epoch": 3.084863179771389, + "grad_norm": 0.5175086259841919, + "learning_rate": 6.918862690707352e-06, + "loss": 0.0234, + "step": 4453 + }, + { + "epoch": 3.085555940422584, + "grad_norm": 0.4833068549633026, + "learning_rate": 6.918169209431346e-06, + "loss": 0.0235, + "step": 4454 + }, + { + "epoch": 3.086248701073779, + "grad_norm": 0.5273468494415283, + "learning_rate": 6.91747572815534e-06, + "loss": 0.0321, + "step": 4455 + }, + { + "epoch": 3.086941461724974, + "grad_norm": 0.44908297061920166, + "learning_rate": 6.916782246879334e-06, + "loss": 0.0273, + "step": 4456 + }, + { + "epoch": 3.087634222376169, + "grad_norm": 0.5509052276611328, + "learning_rate": 6.916088765603329e-06, + "loss": 0.035, + "step": 4457 + }, + { + "epoch": 3.088326983027364, + "grad_norm": 0.5545693635940552, + "learning_rate": 6.915395284327324e-06, + "loss": 0.0318, + "step": 4458 + }, + { + "epoch": 3.089019743678559, + "grad_norm": 0.5566913485527039, + "learning_rate": 6.914701803051318e-06, + "loss": 0.0297, + "step": 4459 + }, + { + "epoch": 3.089712504329754, + "grad_norm": 0.5710915923118591, + "learning_rate": 6.914008321775313e-06, + "loss": 0.0291, + "step": 4460 + }, + { + "epoch": 3.0904052649809493, + "grad_norm": 0.5165517926216125, + "learning_rate": 6.9133148404993064e-06, + "loss": 0.0253, + "step": 4461 + }, + { + "epoch": 3.091098025632144, + "grad_norm": 0.5543791651725769, + "learning_rate": 6.912621359223301e-06, + "loss": 0.0248, + "step": 4462 + }, + { + "epoch": 3.091790786283339, + "grad_norm": 0.5821418762207031, + "learning_rate": 6.911927877947296e-06, + "loss": 0.0365, + "step": 4463 + }, + { + "epoch": 3.092483546934534, + "grad_norm": 0.4341883063316345, + "learning_rate": 6.91123439667129e-06, + "loss": 0.0227, + "step": 4464 + }, + { + "epoch": 3.093176307585729, + "grad_norm": 0.5115289688110352, + "learning_rate": 6.910540915395285e-06, + "loss": 0.0237, + "step": 4465 + }, + { + "epoch": 3.0938690682369243, + "grad_norm": 0.5711712837219238, + "learning_rate": 6.9098474341192786e-06, + "loss": 0.0274, + "step": 4466 + }, + { + "epoch": 3.094561828888119, + "grad_norm": 0.48416972160339355, + "learning_rate": 6.9091539528432735e-06, + "loss": 0.0237, + "step": 4467 + }, + { + "epoch": 3.095254589539314, + "grad_norm": 0.6813589930534363, + "learning_rate": 6.9084604715672684e-06, + "loss": 0.032, + "step": 4468 + }, + { + "epoch": 3.0959473501905093, + "grad_norm": 0.39871472120285034, + "learning_rate": 6.9077669902912625e-06, + "loss": 0.0204, + "step": 4469 + }, + { + "epoch": 3.096640110841704, + "grad_norm": 0.6924740076065063, + "learning_rate": 6.9070735090152574e-06, + "loss": 0.0406, + "step": 4470 + }, + { + "epoch": 3.0973328714928994, + "grad_norm": 0.41659021377563477, + "learning_rate": 6.9063800277392515e-06, + "loss": 0.0195, + "step": 4471 + }, + { + "epoch": 3.098025632144094, + "grad_norm": 0.5689798593521118, + "learning_rate": 6.9056865464632465e-06, + "loss": 0.018, + "step": 4472 + }, + { + "epoch": 3.098718392795289, + "grad_norm": 0.6138285994529724, + "learning_rate": 6.904993065187241e-06, + "loss": 0.0303, + "step": 4473 + }, + { + "epoch": 3.0994111534464843, + "grad_norm": 0.48150208592414856, + "learning_rate": 6.904299583911235e-06, + "loss": 0.0228, + "step": 4474 + }, + { + "epoch": 3.100103914097679, + "grad_norm": 0.6282781362533569, + "learning_rate": 6.9036061026352296e-06, + "loss": 0.031, + "step": 4475 + }, + { + "epoch": 3.1007966747488744, + "grad_norm": 0.4962520897388458, + "learning_rate": 6.902912621359224e-06, + "loss": 0.0286, + "step": 4476 + }, + { + "epoch": 3.1014894354000693, + "grad_norm": 0.5997893214225769, + "learning_rate": 6.902219140083219e-06, + "loss": 0.0352, + "step": 4477 + }, + { + "epoch": 3.102182196051264, + "grad_norm": 0.5330485105514526, + "learning_rate": 6.9015256588072135e-06, + "loss": 0.0239, + "step": 4478 + }, + { + "epoch": 3.1028749567024594, + "grad_norm": 0.5510309934616089, + "learning_rate": 6.900832177531207e-06, + "loss": 0.0341, + "step": 4479 + }, + { + "epoch": 3.103567717353654, + "grad_norm": 0.5287234783172607, + "learning_rate": 6.900138696255202e-06, + "loss": 0.0275, + "step": 4480 + }, + { + "epoch": 3.1042604780048495, + "grad_norm": 0.473359614610672, + "learning_rate": 6.899445214979196e-06, + "loss": 0.0274, + "step": 4481 + }, + { + "epoch": 3.1049532386560443, + "grad_norm": 0.608839213848114, + "learning_rate": 6.898751733703191e-06, + "loss": 0.0332, + "step": 4482 + }, + { + "epoch": 3.105645999307239, + "grad_norm": 0.6796181797981262, + "learning_rate": 6.898058252427186e-06, + "loss": 0.0281, + "step": 4483 + }, + { + "epoch": 3.1063387599584344, + "grad_norm": 0.7151910066604614, + "learning_rate": 6.897364771151179e-06, + "loss": 0.0307, + "step": 4484 + }, + { + "epoch": 3.1070315206096293, + "grad_norm": 0.5276244878768921, + "learning_rate": 6.896671289875174e-06, + "loss": 0.0271, + "step": 4485 + }, + { + "epoch": 3.1077242812608246, + "grad_norm": 0.6411841511726379, + "learning_rate": 6.895977808599168e-06, + "loss": 0.0307, + "step": 4486 + }, + { + "epoch": 3.1084170419120194, + "grad_norm": 0.584810197353363, + "learning_rate": 6.895284327323163e-06, + "loss": 0.0366, + "step": 4487 + }, + { + "epoch": 3.1091098025632142, + "grad_norm": 0.4790900945663452, + "learning_rate": 6.894590846047158e-06, + "loss": 0.0319, + "step": 4488 + }, + { + "epoch": 3.1098025632144095, + "grad_norm": 0.5784599781036377, + "learning_rate": 6.893897364771152e-06, + "loss": 0.0331, + "step": 4489 + }, + { + "epoch": 3.1104953238656043, + "grad_norm": 0.4899098873138428, + "learning_rate": 6.893203883495147e-06, + "loss": 0.0244, + "step": 4490 + }, + { + "epoch": 3.1111880845167996, + "grad_norm": 0.522589921951294, + "learning_rate": 6.89251040221914e-06, + "loss": 0.03, + "step": 4491 + }, + { + "epoch": 3.1118808451679945, + "grad_norm": 0.47644123435020447, + "learning_rate": 6.891816920943135e-06, + "loss": 0.0286, + "step": 4492 + }, + { + "epoch": 3.1125736058191893, + "grad_norm": 0.49770569801330566, + "learning_rate": 6.89112343966713e-06, + "loss": 0.0242, + "step": 4493 + }, + { + "epoch": 3.1132663664703846, + "grad_norm": 0.5135511159896851, + "learning_rate": 6.890429958391124e-06, + "loss": 0.0295, + "step": 4494 + }, + { + "epoch": 3.1139591271215794, + "grad_norm": 0.6963201761245728, + "learning_rate": 6.889736477115119e-06, + "loss": 0.0338, + "step": 4495 + }, + { + "epoch": 3.1146518877727747, + "grad_norm": 0.4779321253299713, + "learning_rate": 6.889042995839112e-06, + "loss": 0.0287, + "step": 4496 + }, + { + "epoch": 3.1153446484239695, + "grad_norm": 0.595662534236908, + "learning_rate": 6.888349514563107e-06, + "loss": 0.0267, + "step": 4497 + }, + { + "epoch": 3.1160374090751644, + "grad_norm": 0.637178361415863, + "learning_rate": 6.887656033287102e-06, + "loss": 0.0332, + "step": 4498 + }, + { + "epoch": 3.1167301697263596, + "grad_norm": 0.5409026741981506, + "learning_rate": 6.886962552011096e-06, + "loss": 0.0241, + "step": 4499 + }, + { + "epoch": 3.1174229303775545, + "grad_norm": 0.8406730890274048, + "learning_rate": 6.886269070735091e-06, + "loss": 0.0372, + "step": 4500 + }, + { + "epoch": 3.1181156910287497, + "grad_norm": 0.5198965668678284, + "learning_rate": 6.885575589459085e-06, + "loss": 0.0321, + "step": 4501 + }, + { + "epoch": 3.1188084516799446, + "grad_norm": 0.4785172641277313, + "learning_rate": 6.88488210818308e-06, + "loss": 0.0223, + "step": 4502 + }, + { + "epoch": 3.1195012123311394, + "grad_norm": 0.5014827847480774, + "learning_rate": 6.884188626907074e-06, + "loss": 0.0249, + "step": 4503 + }, + { + "epoch": 3.1201939729823347, + "grad_norm": 0.5796141624450684, + "learning_rate": 6.883495145631068e-06, + "loss": 0.0307, + "step": 4504 + }, + { + "epoch": 3.1208867336335295, + "grad_norm": 0.6407247185707092, + "learning_rate": 6.882801664355063e-06, + "loss": 0.0307, + "step": 4505 + }, + { + "epoch": 3.121579494284725, + "grad_norm": 0.5700838565826416, + "learning_rate": 6.882108183079057e-06, + "loss": 0.0359, + "step": 4506 + }, + { + "epoch": 3.1222722549359196, + "grad_norm": 0.5296303033828735, + "learning_rate": 6.881414701803052e-06, + "loss": 0.0273, + "step": 4507 + }, + { + "epoch": 3.1229650155871145, + "grad_norm": 0.5510543584823608, + "learning_rate": 6.880721220527047e-06, + "loss": 0.0257, + "step": 4508 + }, + { + "epoch": 3.1236577762383098, + "grad_norm": 0.6245564818382263, + "learning_rate": 6.88002773925104e-06, + "loss": 0.0243, + "step": 4509 + }, + { + "epoch": 3.1243505368895046, + "grad_norm": 0.5439155101776123, + "learning_rate": 6.879334257975035e-06, + "loss": 0.0287, + "step": 4510 + }, + { + "epoch": 3.1250432975407, + "grad_norm": 0.5894782543182373, + "learning_rate": 6.878640776699029e-06, + "loss": 0.0334, + "step": 4511 + }, + { + "epoch": 3.1257360581918947, + "grad_norm": 0.4923941195011139, + "learning_rate": 6.877947295423024e-06, + "loss": 0.0241, + "step": 4512 + }, + { + "epoch": 3.1264288188430895, + "grad_norm": 0.4189883768558502, + "learning_rate": 6.877253814147019e-06, + "loss": 0.02, + "step": 4513 + }, + { + "epoch": 3.127121579494285, + "grad_norm": 0.5520276427268982, + "learning_rate": 6.8765603328710125e-06, + "loss": 0.0257, + "step": 4514 + }, + { + "epoch": 3.1278143401454797, + "grad_norm": 0.5777350664138794, + "learning_rate": 6.8758668515950074e-06, + "loss": 0.027, + "step": 4515 + }, + { + "epoch": 3.128507100796675, + "grad_norm": 0.5808957815170288, + "learning_rate": 6.8751733703190015e-06, + "loss": 0.0354, + "step": 4516 + }, + { + "epoch": 3.1291998614478698, + "grad_norm": 0.5078085660934448, + "learning_rate": 6.8744798890429964e-06, + "loss": 0.0323, + "step": 4517 + }, + { + "epoch": 3.1298926220990646, + "grad_norm": 0.6827059388160706, + "learning_rate": 6.873786407766991e-06, + "loss": 0.0396, + "step": 4518 + }, + { + "epoch": 3.13058538275026, + "grad_norm": 0.472192645072937, + "learning_rate": 6.8730929264909855e-06, + "loss": 0.0206, + "step": 4519 + }, + { + "epoch": 3.1312781434014547, + "grad_norm": 0.4870913326740265, + "learning_rate": 6.87239944521498e-06, + "loss": 0.0298, + "step": 4520 + }, + { + "epoch": 3.13197090405265, + "grad_norm": 0.4849689304828644, + "learning_rate": 6.871705963938974e-06, + "loss": 0.025, + "step": 4521 + }, + { + "epoch": 3.132663664703845, + "grad_norm": 0.5785959362983704, + "learning_rate": 6.8710124826629686e-06, + "loss": 0.0254, + "step": 4522 + }, + { + "epoch": 3.1333564253550397, + "grad_norm": 0.48383548855781555, + "learning_rate": 6.8703190013869635e-06, + "loss": 0.0242, + "step": 4523 + }, + { + "epoch": 3.134049186006235, + "grad_norm": 0.6626015901565552, + "learning_rate": 6.869625520110958e-06, + "loss": 0.0377, + "step": 4524 + }, + { + "epoch": 3.13474194665743, + "grad_norm": 0.5851209163665771, + "learning_rate": 6.8689320388349525e-06, + "loss": 0.0368, + "step": 4525 + }, + { + "epoch": 3.135434707308625, + "grad_norm": 0.5402666926383972, + "learning_rate": 6.868238557558946e-06, + "loss": 0.0266, + "step": 4526 + }, + { + "epoch": 3.13612746795982, + "grad_norm": 0.5914937257766724, + "learning_rate": 6.867545076282941e-06, + "loss": 0.0315, + "step": 4527 + }, + { + "epoch": 3.1368202286110147, + "grad_norm": 0.5798299312591553, + "learning_rate": 6.866851595006936e-06, + "loss": 0.0338, + "step": 4528 + }, + { + "epoch": 3.13751298926221, + "grad_norm": 0.6052075624465942, + "learning_rate": 6.86615811373093e-06, + "loss": 0.0291, + "step": 4529 + }, + { + "epoch": 3.138205749913405, + "grad_norm": 0.5369600653648376, + "learning_rate": 6.865464632454925e-06, + "loss": 0.0307, + "step": 4530 + }, + { + "epoch": 3.1388985105646, + "grad_norm": 0.4198368191719055, + "learning_rate": 6.864771151178919e-06, + "loss": 0.0227, + "step": 4531 + }, + { + "epoch": 3.139591271215795, + "grad_norm": 0.4274318516254425, + "learning_rate": 6.864077669902913e-06, + "loss": 0.0204, + "step": 4532 + }, + { + "epoch": 3.14028403186699, + "grad_norm": 0.5620263814926147, + "learning_rate": 6.863384188626908e-06, + "loss": 0.0242, + "step": 4533 + }, + { + "epoch": 3.140976792518185, + "grad_norm": 0.4838273227214813, + "learning_rate": 6.862690707350902e-06, + "loss": 0.0277, + "step": 4534 + }, + { + "epoch": 3.14166955316938, + "grad_norm": 0.5215245485305786, + "learning_rate": 6.861997226074897e-06, + "loss": 0.0322, + "step": 4535 + }, + { + "epoch": 3.142362313820575, + "grad_norm": 0.6789940595626831, + "learning_rate": 6.861303744798891e-06, + "loss": 0.028, + "step": 4536 + }, + { + "epoch": 3.14305507447177, + "grad_norm": 0.6261676549911499, + "learning_rate": 6.860610263522886e-06, + "loss": 0.0366, + "step": 4537 + }, + { + "epoch": 3.143747835122965, + "grad_norm": 0.5720306038856506, + "learning_rate": 6.859916782246881e-06, + "loss": 0.0319, + "step": 4538 + }, + { + "epoch": 3.14444059577416, + "grad_norm": 0.5291834473609924, + "learning_rate": 6.859223300970874e-06, + "loss": 0.028, + "step": 4539 + }, + { + "epoch": 3.145133356425355, + "grad_norm": 0.4897461235523224, + "learning_rate": 6.858529819694869e-06, + "loss": 0.0233, + "step": 4540 + }, + { + "epoch": 3.1458261170765502, + "grad_norm": 0.5566214323043823, + "learning_rate": 6.857836338418863e-06, + "loss": 0.0274, + "step": 4541 + }, + { + "epoch": 3.146518877727745, + "grad_norm": 0.6070666909217834, + "learning_rate": 6.857142857142858e-06, + "loss": 0.0309, + "step": 4542 + }, + { + "epoch": 3.14721163837894, + "grad_norm": 0.7450109720230103, + "learning_rate": 6.856449375866853e-06, + "loss": 0.0439, + "step": 4543 + }, + { + "epoch": 3.147904399030135, + "grad_norm": 0.9209780693054199, + "learning_rate": 6.855755894590846e-06, + "loss": 0.0311, + "step": 4544 + }, + { + "epoch": 3.14859715968133, + "grad_norm": 0.5648210048675537, + "learning_rate": 6.855062413314841e-06, + "loss": 0.0313, + "step": 4545 + }, + { + "epoch": 3.1492899203325253, + "grad_norm": 0.5119938850402832, + "learning_rate": 6.854368932038835e-06, + "loss": 0.0299, + "step": 4546 + }, + { + "epoch": 3.14998268098372, + "grad_norm": 0.5387217998504639, + "learning_rate": 6.85367545076283e-06, + "loss": 0.025, + "step": 4547 + }, + { + "epoch": 3.150675441634915, + "grad_norm": 0.5152221918106079, + "learning_rate": 6.852981969486825e-06, + "loss": 0.0281, + "step": 4548 + }, + { + "epoch": 3.1513682022861103, + "grad_norm": 0.5119185447692871, + "learning_rate": 6.852288488210819e-06, + "loss": 0.02, + "step": 4549 + }, + { + "epoch": 3.152060962937305, + "grad_norm": 0.512662947177887, + "learning_rate": 6.851595006934814e-06, + "loss": 0.0289, + "step": 4550 + }, + { + "epoch": 3.1527537235885004, + "grad_norm": 0.5649303793907166, + "learning_rate": 6.850901525658807e-06, + "loss": 0.03, + "step": 4551 + }, + { + "epoch": 3.153446484239695, + "grad_norm": 0.5543296337127686, + "learning_rate": 6.850208044382802e-06, + "loss": 0.0345, + "step": 4552 + }, + { + "epoch": 3.15413924489089, + "grad_norm": 0.6043996214866638, + "learning_rate": 6.849514563106797e-06, + "loss": 0.0278, + "step": 4553 + }, + { + "epoch": 3.1548320055420853, + "grad_norm": 0.8201333284378052, + "learning_rate": 6.848821081830791e-06, + "loss": 0.0336, + "step": 4554 + }, + { + "epoch": 3.15552476619328, + "grad_norm": 0.5810098052024841, + "learning_rate": 6.848127600554786e-06, + "loss": 0.0308, + "step": 4555 + }, + { + "epoch": 3.1562175268444754, + "grad_norm": 0.5499187111854553, + "learning_rate": 6.847434119278779e-06, + "loss": 0.0345, + "step": 4556 + }, + { + "epoch": 3.1569102874956703, + "grad_norm": 0.6160211563110352, + "learning_rate": 6.846740638002774e-06, + "loss": 0.0369, + "step": 4557 + }, + { + "epoch": 3.157603048146865, + "grad_norm": 0.6396363377571106, + "learning_rate": 6.846047156726769e-06, + "loss": 0.0375, + "step": 4558 + }, + { + "epoch": 3.1582958087980604, + "grad_norm": 0.6576462388038635, + "learning_rate": 6.845353675450763e-06, + "loss": 0.0342, + "step": 4559 + }, + { + "epoch": 3.158988569449255, + "grad_norm": 0.4900040030479431, + "learning_rate": 6.844660194174758e-06, + "loss": 0.0267, + "step": 4560 + }, + { + "epoch": 3.1596813301004505, + "grad_norm": 0.6411131620407104, + "learning_rate": 6.843966712898752e-06, + "loss": 0.031, + "step": 4561 + }, + { + "epoch": 3.1603740907516453, + "grad_norm": 0.5328938961029053, + "learning_rate": 6.8432732316227464e-06, + "loss": 0.0328, + "step": 4562 + }, + { + "epoch": 3.16106685140284, + "grad_norm": 0.5043810606002808, + "learning_rate": 6.842579750346741e-06, + "loss": 0.0324, + "step": 4563 + }, + { + "epoch": 3.1617596120540354, + "grad_norm": 0.5267139077186584, + "learning_rate": 6.8418862690707354e-06, + "loss": 0.0296, + "step": 4564 + }, + { + "epoch": 3.1624523727052303, + "grad_norm": 0.4793320596218109, + "learning_rate": 6.84119278779473e-06, + "loss": 0.0245, + "step": 4565 + }, + { + "epoch": 3.1631451333564256, + "grad_norm": 0.605859637260437, + "learning_rate": 6.8404993065187245e-06, + "loss": 0.034, + "step": 4566 + }, + { + "epoch": 3.1638378940076204, + "grad_norm": 0.5951684713363647, + "learning_rate": 6.839805825242719e-06, + "loss": 0.0316, + "step": 4567 + }, + { + "epoch": 3.1645306546588152, + "grad_norm": 0.6127306222915649, + "learning_rate": 6.839112343966714e-06, + "loss": 0.0357, + "step": 4568 + }, + { + "epoch": 3.1652234153100105, + "grad_norm": 0.6114452481269836, + "learning_rate": 6.8384188626907076e-06, + "loss": 0.0294, + "step": 4569 + }, + { + "epoch": 3.1659161759612053, + "grad_norm": 0.7887547612190247, + "learning_rate": 6.8377253814147025e-06, + "loss": 0.0286, + "step": 4570 + }, + { + "epoch": 3.1666089366124006, + "grad_norm": 0.6409960985183716, + "learning_rate": 6.837031900138697e-06, + "loss": 0.0284, + "step": 4571 + }, + { + "epoch": 3.1673016972635955, + "grad_norm": 0.4506374001502991, + "learning_rate": 6.8363384188626915e-06, + "loss": 0.0245, + "step": 4572 + }, + { + "epoch": 3.1679944579147903, + "grad_norm": 0.8309051990509033, + "learning_rate": 6.8356449375866865e-06, + "loss": 0.0283, + "step": 4573 + }, + { + "epoch": 3.1686872185659856, + "grad_norm": 0.5388268828392029, + "learning_rate": 6.83495145631068e-06, + "loss": 0.0251, + "step": 4574 + }, + { + "epoch": 3.1693799792171804, + "grad_norm": 0.5915524363517761, + "learning_rate": 6.834257975034675e-06, + "loss": 0.0277, + "step": 4575 + }, + { + "epoch": 3.1700727398683757, + "grad_norm": 0.6235132217407227, + "learning_rate": 6.833564493758669e-06, + "loss": 0.0298, + "step": 4576 + }, + { + "epoch": 3.1707655005195705, + "grad_norm": 0.5598345398902893, + "learning_rate": 6.832871012482664e-06, + "loss": 0.0271, + "step": 4577 + }, + { + "epoch": 3.1714582611707653, + "grad_norm": 0.6950753927230835, + "learning_rate": 6.832177531206659e-06, + "loss": 0.0384, + "step": 4578 + }, + { + "epoch": 3.1721510218219606, + "grad_norm": 0.593630850315094, + "learning_rate": 6.831484049930653e-06, + "loss": 0.0277, + "step": 4579 + }, + { + "epoch": 3.1728437824731555, + "grad_norm": 0.6155409216880798, + "learning_rate": 6.830790568654648e-06, + "loss": 0.03, + "step": 4580 + }, + { + "epoch": 3.1735365431243507, + "grad_norm": 0.6785510182380676, + "learning_rate": 6.830097087378641e-06, + "loss": 0.0219, + "step": 4581 + }, + { + "epoch": 3.1742293037755456, + "grad_norm": 0.5539804100990295, + "learning_rate": 6.829403606102636e-06, + "loss": 0.0251, + "step": 4582 + }, + { + "epoch": 3.1749220644267404, + "grad_norm": 0.6123777627944946, + "learning_rate": 6.828710124826631e-06, + "loss": 0.0361, + "step": 4583 + }, + { + "epoch": 3.1756148250779357, + "grad_norm": 0.5027550458908081, + "learning_rate": 6.828016643550625e-06, + "loss": 0.0272, + "step": 4584 + }, + { + "epoch": 3.1763075857291305, + "grad_norm": 0.6634909510612488, + "learning_rate": 6.82732316227462e-06, + "loss": 0.0361, + "step": 4585 + }, + { + "epoch": 3.177000346380326, + "grad_norm": 0.5998567342758179, + "learning_rate": 6.826629680998613e-06, + "loss": 0.0263, + "step": 4586 + }, + { + "epoch": 3.1776931070315206, + "grad_norm": 0.5641564130783081, + "learning_rate": 6.825936199722608e-06, + "loss": 0.0241, + "step": 4587 + }, + { + "epoch": 3.1783858676827155, + "grad_norm": 0.6491827368736267, + "learning_rate": 6.825242718446603e-06, + "loss": 0.0268, + "step": 4588 + }, + { + "epoch": 3.1790786283339107, + "grad_norm": 0.5773411393165588, + "learning_rate": 6.824549237170597e-06, + "loss": 0.0261, + "step": 4589 + }, + { + "epoch": 3.1797713889851056, + "grad_norm": 0.5505303740501404, + "learning_rate": 6.823855755894592e-06, + "loss": 0.0247, + "step": 4590 + }, + { + "epoch": 3.180464149636301, + "grad_norm": 0.49144992232322693, + "learning_rate": 6.823162274618585e-06, + "loss": 0.0233, + "step": 4591 + }, + { + "epoch": 3.1811569102874957, + "grad_norm": 0.5164524912834167, + "learning_rate": 6.82246879334258e-06, + "loss": 0.0316, + "step": 4592 + }, + { + "epoch": 3.1818496709386905, + "grad_norm": 0.6317377686500549, + "learning_rate": 6.821775312066575e-06, + "loss": 0.0286, + "step": 4593 + }, + { + "epoch": 3.182542431589886, + "grad_norm": 0.4366499185562134, + "learning_rate": 6.821081830790569e-06, + "loss": 0.0219, + "step": 4594 + }, + { + "epoch": 3.1832351922410806, + "grad_norm": 0.7562986612319946, + "learning_rate": 6.820388349514564e-06, + "loss": 0.0357, + "step": 4595 + }, + { + "epoch": 3.183927952892276, + "grad_norm": 0.5120450854301453, + "learning_rate": 6.819694868238558e-06, + "loss": 0.0321, + "step": 4596 + }, + { + "epoch": 3.1846207135434708, + "grad_norm": 0.6288981437683105, + "learning_rate": 6.819001386962553e-06, + "loss": 0.0307, + "step": 4597 + }, + { + "epoch": 3.1853134741946656, + "grad_norm": 0.5563024878501892, + "learning_rate": 6.818307905686548e-06, + "loss": 0.0269, + "step": 4598 + }, + { + "epoch": 3.186006234845861, + "grad_norm": 0.5937584638595581, + "learning_rate": 6.817614424410541e-06, + "loss": 0.0331, + "step": 4599 + }, + { + "epoch": 3.1866989954970557, + "grad_norm": 0.6336886286735535, + "learning_rate": 6.816920943134536e-06, + "loss": 0.0366, + "step": 4600 + }, + { + "epoch": 3.187391756148251, + "grad_norm": 0.5646620392799377, + "learning_rate": 6.81622746185853e-06, + "loss": 0.024, + "step": 4601 + }, + { + "epoch": 3.188084516799446, + "grad_norm": 0.4449456036090851, + "learning_rate": 6.815533980582525e-06, + "loss": 0.026, + "step": 4602 + }, + { + "epoch": 3.1887772774506407, + "grad_norm": 0.5936587452888489, + "learning_rate": 6.81484049930652e-06, + "loss": 0.0328, + "step": 4603 + }, + { + "epoch": 3.189470038101836, + "grad_norm": 0.5598872900009155, + "learning_rate": 6.814147018030513e-06, + "loss": 0.0308, + "step": 4604 + }, + { + "epoch": 3.1901627987530308, + "grad_norm": 0.6282536387443542, + "learning_rate": 6.813453536754508e-06, + "loss": 0.0311, + "step": 4605 + }, + { + "epoch": 3.190855559404226, + "grad_norm": 0.79432612657547, + "learning_rate": 6.812760055478502e-06, + "loss": 0.0305, + "step": 4606 + }, + { + "epoch": 3.191548320055421, + "grad_norm": 0.5502942800521851, + "learning_rate": 6.812066574202497e-06, + "loss": 0.0294, + "step": 4607 + }, + { + "epoch": 3.1922410807066157, + "grad_norm": 0.5728614330291748, + "learning_rate": 6.811373092926492e-06, + "loss": 0.0279, + "step": 4608 + }, + { + "epoch": 3.192933841357811, + "grad_norm": 0.5338517427444458, + "learning_rate": 6.810679611650486e-06, + "loss": 0.0212, + "step": 4609 + }, + { + "epoch": 3.193626602009006, + "grad_norm": 0.5098572969436646, + "learning_rate": 6.80998613037448e-06, + "loss": 0.0244, + "step": 4610 + }, + { + "epoch": 3.1943193626602007, + "grad_norm": 0.557035505771637, + "learning_rate": 6.8092926490984744e-06, + "loss": 0.0257, + "step": 4611 + }, + { + "epoch": 3.195012123311396, + "grad_norm": 0.6642405390739441, + "learning_rate": 6.808599167822469e-06, + "loss": 0.0361, + "step": 4612 + }, + { + "epoch": 3.195704883962591, + "grad_norm": 0.6308565735816956, + "learning_rate": 6.807905686546464e-06, + "loss": 0.0228, + "step": 4613 + }, + { + "epoch": 3.196397644613786, + "grad_norm": 0.6261695027351379, + "learning_rate": 6.807212205270458e-06, + "loss": 0.035, + "step": 4614 + }, + { + "epoch": 3.197090405264981, + "grad_norm": 0.5674648284912109, + "learning_rate": 6.806518723994453e-06, + "loss": 0.0314, + "step": 4615 + }, + { + "epoch": 3.197783165916176, + "grad_norm": 0.5257681012153625, + "learning_rate": 6.8058252427184466e-06, + "loss": 0.0287, + "step": 4616 + }, + { + "epoch": 3.198475926567371, + "grad_norm": 0.5438860058784485, + "learning_rate": 6.8051317614424415e-06, + "loss": 0.0233, + "step": 4617 + }, + { + "epoch": 3.199168687218566, + "grad_norm": 0.6246986389160156, + "learning_rate": 6.8044382801664364e-06, + "loss": 0.0307, + "step": 4618 + }, + { + "epoch": 3.199861447869761, + "grad_norm": 0.6030694842338562, + "learning_rate": 6.8037447988904305e-06, + "loss": 0.03, + "step": 4619 + }, + { + "epoch": 3.200554208520956, + "grad_norm": 0.6196433305740356, + "learning_rate": 6.8030513176144255e-06, + "loss": 0.0373, + "step": 4620 + }, + { + "epoch": 3.201246969172151, + "grad_norm": 0.5739027261734009, + "learning_rate": 6.802357836338419e-06, + "loss": 0.0322, + "step": 4621 + }, + { + "epoch": 3.201939729823346, + "grad_norm": 0.5053880214691162, + "learning_rate": 6.801664355062414e-06, + "loss": 0.0283, + "step": 4622 + }, + { + "epoch": 3.202632490474541, + "grad_norm": 0.46181535720825195, + "learning_rate": 6.8009708737864086e-06, + "loss": 0.0244, + "step": 4623 + }, + { + "epoch": 3.203325251125736, + "grad_norm": 0.6071468591690063, + "learning_rate": 6.800277392510403e-06, + "loss": 0.0298, + "step": 4624 + }, + { + "epoch": 3.204018011776931, + "grad_norm": 0.4515599012374878, + "learning_rate": 6.799583911234398e-06, + "loss": 0.0227, + "step": 4625 + }, + { + "epoch": 3.2047107724281263, + "grad_norm": 0.5076680183410645, + "learning_rate": 6.798890429958392e-06, + "loss": 0.0206, + "step": 4626 + }, + { + "epoch": 3.205403533079321, + "grad_norm": 0.5873421430587769, + "learning_rate": 6.798196948682387e-06, + "loss": 0.0339, + "step": 4627 + }, + { + "epoch": 3.206096293730516, + "grad_norm": 0.6761807203292847, + "learning_rate": 6.7975034674063815e-06, + "loss": 0.0279, + "step": 4628 + }, + { + "epoch": 3.2067890543817112, + "grad_norm": 0.4351724684238434, + "learning_rate": 6.796809986130375e-06, + "loss": 0.0194, + "step": 4629 + }, + { + "epoch": 3.207481815032906, + "grad_norm": 0.515922486782074, + "learning_rate": 6.79611650485437e-06, + "loss": 0.026, + "step": 4630 + }, + { + "epoch": 3.208174575684101, + "grad_norm": 0.5686444640159607, + "learning_rate": 6.795423023578364e-06, + "loss": 0.0347, + "step": 4631 + }, + { + "epoch": 3.208867336335296, + "grad_norm": 0.5975972414016724, + "learning_rate": 6.794729542302359e-06, + "loss": 0.0415, + "step": 4632 + }, + { + "epoch": 3.209560096986491, + "grad_norm": 0.5428729057312012, + "learning_rate": 6.794036061026354e-06, + "loss": 0.0239, + "step": 4633 + }, + { + "epoch": 3.2102528576376863, + "grad_norm": 0.6052834987640381, + "learning_rate": 6.793342579750347e-06, + "loss": 0.033, + "step": 4634 + }, + { + "epoch": 3.210945618288881, + "grad_norm": 0.5841012001037598, + "learning_rate": 6.792649098474342e-06, + "loss": 0.0341, + "step": 4635 + }, + { + "epoch": 3.2116383789400764, + "grad_norm": 0.48423075675964355, + "learning_rate": 6.791955617198336e-06, + "loss": 0.028, + "step": 4636 + }, + { + "epoch": 3.2123311395912713, + "grad_norm": 0.487452894449234, + "learning_rate": 6.791262135922331e-06, + "loss": 0.0281, + "step": 4637 + }, + { + "epoch": 3.213023900242466, + "grad_norm": 0.600516676902771, + "learning_rate": 6.790568654646326e-06, + "loss": 0.0323, + "step": 4638 + }, + { + "epoch": 3.2137166608936614, + "grad_norm": 0.6476401090621948, + "learning_rate": 6.789875173370319e-06, + "loss": 0.035, + "step": 4639 + }, + { + "epoch": 3.214409421544856, + "grad_norm": 0.5982544422149658, + "learning_rate": 6.789181692094314e-06, + "loss": 0.0345, + "step": 4640 + }, + { + "epoch": 3.215102182196051, + "grad_norm": 0.5159494876861572, + "learning_rate": 6.788488210818308e-06, + "loss": 0.0235, + "step": 4641 + }, + { + "epoch": 3.2157949428472463, + "grad_norm": 0.628759503364563, + "learning_rate": 6.787794729542303e-06, + "loss": 0.0303, + "step": 4642 + }, + { + "epoch": 3.216487703498441, + "grad_norm": 0.4892570376396179, + "learning_rate": 6.787101248266298e-06, + "loss": 0.0213, + "step": 4643 + }, + { + "epoch": 3.2171804641496364, + "grad_norm": 0.5421319603919983, + "learning_rate": 6.786407766990292e-06, + "loss": 0.0296, + "step": 4644 + }, + { + "epoch": 3.2178732248008313, + "grad_norm": 0.7370326519012451, + "learning_rate": 6.785714285714287e-06, + "loss": 0.029, + "step": 4645 + }, + { + "epoch": 3.2185659854520265, + "grad_norm": 0.5076510310173035, + "learning_rate": 6.78502080443828e-06, + "loss": 0.0331, + "step": 4646 + }, + { + "epoch": 3.2192587461032214, + "grad_norm": 0.5268062949180603, + "learning_rate": 6.784327323162275e-06, + "loss": 0.0288, + "step": 4647 + }, + { + "epoch": 3.219951506754416, + "grad_norm": 0.6819027662277222, + "learning_rate": 6.78363384188627e-06, + "loss": 0.0386, + "step": 4648 + }, + { + "epoch": 3.2206442674056115, + "grad_norm": 0.7199299931526184, + "learning_rate": 6.782940360610264e-06, + "loss": 0.0279, + "step": 4649 + }, + { + "epoch": 3.2213370280568063, + "grad_norm": 0.5368867516517639, + "learning_rate": 6.782246879334259e-06, + "loss": 0.0289, + "step": 4650 + }, + { + "epoch": 3.222029788708001, + "grad_norm": 0.4995492398738861, + "learning_rate": 6.781553398058252e-06, + "loss": 0.0284, + "step": 4651 + }, + { + "epoch": 3.2227225493591964, + "grad_norm": 0.5490744709968567, + "learning_rate": 6.780859916782247e-06, + "loss": 0.0305, + "step": 4652 + }, + { + "epoch": 3.2234153100103913, + "grad_norm": 0.49523964524269104, + "learning_rate": 6.780166435506242e-06, + "loss": 0.0312, + "step": 4653 + }, + { + "epoch": 3.2241080706615866, + "grad_norm": 0.5173907279968262, + "learning_rate": 6.779472954230236e-06, + "loss": 0.0333, + "step": 4654 + }, + { + "epoch": 3.2248008313127814, + "grad_norm": 0.521741509437561, + "learning_rate": 6.778779472954231e-06, + "loss": 0.0227, + "step": 4655 + }, + { + "epoch": 3.2254935919639767, + "grad_norm": 0.5782747864723206, + "learning_rate": 6.778085991678225e-06, + "loss": 0.0322, + "step": 4656 + }, + { + "epoch": 3.2261863526151715, + "grad_norm": 0.605554461479187, + "learning_rate": 6.77739251040222e-06, + "loss": 0.0284, + "step": 4657 + }, + { + "epoch": 3.2268791132663663, + "grad_norm": 0.573573112487793, + "learning_rate": 6.776699029126214e-06, + "loss": 0.0348, + "step": 4658 + }, + { + "epoch": 3.2275718739175616, + "grad_norm": 0.585656464099884, + "learning_rate": 6.776005547850208e-06, + "loss": 0.0324, + "step": 4659 + }, + { + "epoch": 3.2282646345687565, + "grad_norm": 0.48179271817207336, + "learning_rate": 6.775312066574203e-06, + "loss": 0.0246, + "step": 4660 + }, + { + "epoch": 3.2289573952199513, + "grad_norm": 0.6250631213188171, + "learning_rate": 6.774618585298197e-06, + "loss": 0.0369, + "step": 4661 + }, + { + "epoch": 3.2296501558711466, + "grad_norm": 0.5276439785957336, + "learning_rate": 6.773925104022192e-06, + "loss": 0.0285, + "step": 4662 + }, + { + "epoch": 3.2303429165223414, + "grad_norm": 0.5746296644210815, + "learning_rate": 6.773231622746187e-06, + "loss": 0.0218, + "step": 4663 + }, + { + "epoch": 3.2310356771735367, + "grad_norm": 0.5363883376121521, + "learning_rate": 6.7725381414701805e-06, + "loss": 0.0296, + "step": 4664 + }, + { + "epoch": 3.2317284378247315, + "grad_norm": 0.5687172412872314, + "learning_rate": 6.7718446601941754e-06, + "loss": 0.0298, + "step": 4665 + }, + { + "epoch": 3.232421198475927, + "grad_norm": 0.5608294010162354, + "learning_rate": 6.7711511789181695e-06, + "loss": 0.0245, + "step": 4666 + }, + { + "epoch": 3.2331139591271216, + "grad_norm": 0.6284404993057251, + "learning_rate": 6.7704576976421645e-06, + "loss": 0.0245, + "step": 4667 + }, + { + "epoch": 3.2338067197783165, + "grad_norm": 0.5148667693138123, + "learning_rate": 6.769764216366159e-06, + "loss": 0.0297, + "step": 4668 + }, + { + "epoch": 3.2344994804295117, + "grad_norm": 0.7157672047615051, + "learning_rate": 6.769070735090153e-06, + "loss": 0.0356, + "step": 4669 + }, + { + "epoch": 3.2351922410807066, + "grad_norm": 0.6483341455459595, + "learning_rate": 6.7683772538141476e-06, + "loss": 0.037, + "step": 4670 + }, + { + "epoch": 3.2358850017319014, + "grad_norm": 0.5658396482467651, + "learning_rate": 6.767683772538142e-06, + "loss": 0.0297, + "step": 4671 + }, + { + "epoch": 3.2365777623830967, + "grad_norm": 0.6210411190986633, + "learning_rate": 6.766990291262137e-06, + "loss": 0.0327, + "step": 4672 + }, + { + "epoch": 3.2372705230342915, + "grad_norm": 0.5706200003623962, + "learning_rate": 6.7662968099861315e-06, + "loss": 0.034, + "step": 4673 + }, + { + "epoch": 3.237963283685487, + "grad_norm": 0.6245120763778687, + "learning_rate": 6.765603328710126e-06, + "loss": 0.0323, + "step": 4674 + }, + { + "epoch": 3.2386560443366816, + "grad_norm": 0.6629225015640259, + "learning_rate": 6.7649098474341205e-06, + "loss": 0.0355, + "step": 4675 + }, + { + "epoch": 3.239348804987877, + "grad_norm": 0.5242277383804321, + "learning_rate": 6.764216366158114e-06, + "loss": 0.0259, + "step": 4676 + }, + { + "epoch": 3.2400415656390718, + "grad_norm": 0.5269795060157776, + "learning_rate": 6.763522884882109e-06, + "loss": 0.0242, + "step": 4677 + }, + { + "epoch": 3.2407343262902666, + "grad_norm": 0.63178950548172, + "learning_rate": 6.762829403606104e-06, + "loss": 0.036, + "step": 4678 + }, + { + "epoch": 3.241427086941462, + "grad_norm": 0.4579290449619293, + "learning_rate": 6.762135922330098e-06, + "loss": 0.0223, + "step": 4679 + }, + { + "epoch": 3.2421198475926567, + "grad_norm": 0.5542458891868591, + "learning_rate": 6.761442441054093e-06, + "loss": 0.0345, + "step": 4680 + }, + { + "epoch": 3.2428126082438515, + "grad_norm": 0.6242253184318542, + "learning_rate": 6.760748959778086e-06, + "loss": 0.0412, + "step": 4681 + }, + { + "epoch": 3.243505368895047, + "grad_norm": 0.4446013271808624, + "learning_rate": 6.760055478502081e-06, + "loss": 0.02, + "step": 4682 + }, + { + "epoch": 3.2441981295462416, + "grad_norm": 0.5094194412231445, + "learning_rate": 6.759361997226076e-06, + "loss": 0.025, + "step": 4683 + }, + { + "epoch": 3.244890890197437, + "grad_norm": 0.6685901880264282, + "learning_rate": 6.75866851595007e-06, + "loss": 0.032, + "step": 4684 + }, + { + "epoch": 3.2455836508486318, + "grad_norm": 0.4762212336063385, + "learning_rate": 6.757975034674065e-06, + "loss": 0.027, + "step": 4685 + }, + { + "epoch": 3.246276411499827, + "grad_norm": 0.5818709135055542, + "learning_rate": 6.757281553398059e-06, + "loss": 0.0304, + "step": 4686 + }, + { + "epoch": 3.246969172151022, + "grad_norm": 0.5958693623542786, + "learning_rate": 6.756588072122053e-06, + "loss": 0.0322, + "step": 4687 + }, + { + "epoch": 3.2476619328022167, + "grad_norm": 0.5693372488021851, + "learning_rate": 6.755894590846048e-06, + "loss": 0.0287, + "step": 4688 + }, + { + "epoch": 3.248354693453412, + "grad_norm": 0.5314691662788391, + "learning_rate": 6.755201109570042e-06, + "loss": 0.0319, + "step": 4689 + }, + { + "epoch": 3.249047454104607, + "grad_norm": 0.5831869840621948, + "learning_rate": 6.754507628294037e-06, + "loss": 0.0287, + "step": 4690 + }, + { + "epoch": 3.2497402147558017, + "grad_norm": 0.5545961856842041, + "learning_rate": 6.753814147018031e-06, + "loss": 0.0349, + "step": 4691 + }, + { + "epoch": 3.250432975406997, + "grad_norm": 0.5970706939697266, + "learning_rate": 6.753120665742026e-06, + "loss": 0.0257, + "step": 4692 + }, + { + "epoch": 3.2511257360581918, + "grad_norm": 0.558861255645752, + "learning_rate": 6.752427184466021e-06, + "loss": 0.0307, + "step": 4693 + }, + { + "epoch": 3.251818496709387, + "grad_norm": 0.5072506070137024, + "learning_rate": 6.751733703190014e-06, + "loss": 0.0298, + "step": 4694 + }, + { + "epoch": 3.252511257360582, + "grad_norm": 0.5481452345848083, + "learning_rate": 6.751040221914009e-06, + "loss": 0.0296, + "step": 4695 + }, + { + "epoch": 3.253204018011777, + "grad_norm": 0.5336576104164124, + "learning_rate": 6.750346740638003e-06, + "loss": 0.029, + "step": 4696 + }, + { + "epoch": 3.253896778662972, + "grad_norm": 0.6487609148025513, + "learning_rate": 6.749653259361998e-06, + "loss": 0.0276, + "step": 4697 + }, + { + "epoch": 3.254589539314167, + "grad_norm": 0.603401780128479, + "learning_rate": 6.748959778085991e-06, + "loss": 0.0268, + "step": 4698 + }, + { + "epoch": 3.255282299965362, + "grad_norm": 0.6136023998260498, + "learning_rate": 6.748266296809986e-06, + "loss": 0.0328, + "step": 4699 + }, + { + "epoch": 3.255975060616557, + "grad_norm": 0.5705479979515076, + "learning_rate": 6.747572815533981e-06, + "loss": 0.0315, + "step": 4700 + }, + { + "epoch": 3.256667821267752, + "grad_norm": 0.5006148815155029, + "learning_rate": 6.746879334257975e-06, + "loss": 0.0331, + "step": 4701 + }, + { + "epoch": 3.257360581918947, + "grad_norm": 0.7428138256072998, + "learning_rate": 6.74618585298197e-06, + "loss": 0.0409, + "step": 4702 + }, + { + "epoch": 3.258053342570142, + "grad_norm": 0.48673370480537415, + "learning_rate": 6.745492371705964e-06, + "loss": 0.0276, + "step": 4703 + }, + { + "epoch": 3.258746103221337, + "grad_norm": 0.6352970004081726, + "learning_rate": 6.744798890429959e-06, + "loss": 0.0273, + "step": 4704 + }, + { + "epoch": 3.259438863872532, + "grad_norm": 0.5196688175201416, + "learning_rate": 6.744105409153954e-06, + "loss": 0.0239, + "step": 4705 + }, + { + "epoch": 3.2601316245237273, + "grad_norm": 0.5542315244674683, + "learning_rate": 6.743411927877947e-06, + "loss": 0.0253, + "step": 4706 + }, + { + "epoch": 3.260824385174922, + "grad_norm": 0.6497322916984558, + "learning_rate": 6.742718446601942e-06, + "loss": 0.0372, + "step": 4707 + }, + { + "epoch": 3.261517145826117, + "grad_norm": 0.6121558547019958, + "learning_rate": 6.742024965325936e-06, + "loss": 0.0377, + "step": 4708 + }, + { + "epoch": 3.2622099064773122, + "grad_norm": 0.5766769051551819, + "learning_rate": 6.741331484049931e-06, + "loss": 0.0252, + "step": 4709 + }, + { + "epoch": 3.262902667128507, + "grad_norm": 0.5521363615989685, + "learning_rate": 6.740638002773926e-06, + "loss": 0.0315, + "step": 4710 + }, + { + "epoch": 3.263595427779702, + "grad_norm": 0.6215150356292725, + "learning_rate": 6.7399445214979195e-06, + "loss": 0.0363, + "step": 4711 + }, + { + "epoch": 3.264288188430897, + "grad_norm": 0.6115555763244629, + "learning_rate": 6.7392510402219144e-06, + "loss": 0.028, + "step": 4712 + }, + { + "epoch": 3.264980949082092, + "grad_norm": 0.4691926836967468, + "learning_rate": 6.7385575589459085e-06, + "loss": 0.0234, + "step": 4713 + }, + { + "epoch": 3.2656737097332873, + "grad_norm": 0.529046893119812, + "learning_rate": 6.7378640776699035e-06, + "loss": 0.0288, + "step": 4714 + }, + { + "epoch": 3.266366470384482, + "grad_norm": 0.5988357067108154, + "learning_rate": 6.737170596393898e-06, + "loss": 0.0283, + "step": 4715 + }, + { + "epoch": 3.2670592310356774, + "grad_norm": 0.5368452072143555, + "learning_rate": 6.736477115117892e-06, + "loss": 0.0253, + "step": 4716 + }, + { + "epoch": 3.2677519916868722, + "grad_norm": 0.5600354075431824, + "learning_rate": 6.7357836338418866e-06, + "loss": 0.0251, + "step": 4717 + }, + { + "epoch": 3.268444752338067, + "grad_norm": 0.5538088083267212, + "learning_rate": 6.735090152565881e-06, + "loss": 0.0296, + "step": 4718 + }, + { + "epoch": 3.2691375129892624, + "grad_norm": 0.5466650128364563, + "learning_rate": 6.734396671289876e-06, + "loss": 0.0261, + "step": 4719 + }, + { + "epoch": 3.269830273640457, + "grad_norm": 0.590917706489563, + "learning_rate": 6.7337031900138705e-06, + "loss": 0.0335, + "step": 4720 + }, + { + "epoch": 3.270523034291652, + "grad_norm": 0.6178110241889954, + "learning_rate": 6.733009708737865e-06, + "loss": 0.0315, + "step": 4721 + }, + { + "epoch": 3.2712157949428473, + "grad_norm": 0.564285933971405, + "learning_rate": 6.7323162274618595e-06, + "loss": 0.0333, + "step": 4722 + }, + { + "epoch": 3.271908555594042, + "grad_norm": 0.5403800010681152, + "learning_rate": 6.731622746185853e-06, + "loss": 0.0252, + "step": 4723 + }, + { + "epoch": 3.2726013162452374, + "grad_norm": 0.6292010545730591, + "learning_rate": 6.730929264909848e-06, + "loss": 0.0336, + "step": 4724 + }, + { + "epoch": 3.2732940768964323, + "grad_norm": 0.5613750219345093, + "learning_rate": 6.730235783633843e-06, + "loss": 0.0227, + "step": 4725 + }, + { + "epoch": 3.2739868375476275, + "grad_norm": 0.6914113759994507, + "learning_rate": 6.729542302357837e-06, + "loss": 0.0242, + "step": 4726 + }, + { + "epoch": 3.2746795981988224, + "grad_norm": 0.5726519823074341, + "learning_rate": 6.728848821081832e-06, + "loss": 0.0255, + "step": 4727 + }, + { + "epoch": 3.275372358850017, + "grad_norm": 0.6543517708778381, + "learning_rate": 6.728155339805825e-06, + "loss": 0.0421, + "step": 4728 + }, + { + "epoch": 3.2760651195012125, + "grad_norm": 0.5008887648582458, + "learning_rate": 6.72746185852982e-06, + "loss": 0.0322, + "step": 4729 + }, + { + "epoch": 3.2767578801524073, + "grad_norm": 0.5868710875511169, + "learning_rate": 6.726768377253815e-06, + "loss": 0.0334, + "step": 4730 + }, + { + "epoch": 3.277450640803602, + "grad_norm": 0.6493781208992004, + "learning_rate": 6.726074895977809e-06, + "loss": 0.0427, + "step": 4731 + }, + { + "epoch": 3.2781434014547974, + "grad_norm": 0.5291950702667236, + "learning_rate": 6.725381414701804e-06, + "loss": 0.0289, + "step": 4732 + }, + { + "epoch": 3.2788361621059923, + "grad_norm": 0.5385832786560059, + "learning_rate": 6.724687933425798e-06, + "loss": 0.0313, + "step": 4733 + }, + { + "epoch": 3.2795289227571875, + "grad_norm": 0.6078015565872192, + "learning_rate": 6.723994452149793e-06, + "loss": 0.0336, + "step": 4734 + }, + { + "epoch": 3.2802216834083824, + "grad_norm": 0.5908147692680359, + "learning_rate": 6.723300970873787e-06, + "loss": 0.0394, + "step": 4735 + }, + { + "epoch": 3.2809144440595777, + "grad_norm": 0.520795464515686, + "learning_rate": 6.722607489597781e-06, + "loss": 0.0242, + "step": 4736 + }, + { + "epoch": 3.2816072047107725, + "grad_norm": 0.6225765943527222, + "learning_rate": 6.721914008321776e-06, + "loss": 0.0323, + "step": 4737 + }, + { + "epoch": 3.2822999653619673, + "grad_norm": 0.5009455680847168, + "learning_rate": 6.72122052704577e-06, + "loss": 0.0344, + "step": 4738 + }, + { + "epoch": 3.2829927260131626, + "grad_norm": 0.4971337914466858, + "learning_rate": 6.720527045769765e-06, + "loss": 0.0326, + "step": 4739 + }, + { + "epoch": 3.2836854866643574, + "grad_norm": 0.6354343295097351, + "learning_rate": 6.71983356449376e-06, + "loss": 0.0327, + "step": 4740 + }, + { + "epoch": 3.2843782473155523, + "grad_norm": 0.5460329651832581, + "learning_rate": 6.719140083217753e-06, + "loss": 0.0251, + "step": 4741 + }, + { + "epoch": 3.2850710079667476, + "grad_norm": 0.5400301814079285, + "learning_rate": 6.718446601941748e-06, + "loss": 0.0338, + "step": 4742 + }, + { + "epoch": 3.2857637686179424, + "grad_norm": 0.5009092092514038, + "learning_rate": 6.717753120665742e-06, + "loss": 0.0286, + "step": 4743 + }, + { + "epoch": 3.2864565292691377, + "grad_norm": 0.564020037651062, + "learning_rate": 6.717059639389737e-06, + "loss": 0.0343, + "step": 4744 + }, + { + "epoch": 3.2871492899203325, + "grad_norm": 0.621777355670929, + "learning_rate": 6.716366158113732e-06, + "loss": 0.0373, + "step": 4745 + }, + { + "epoch": 3.287842050571528, + "grad_norm": 0.5231059193611145, + "learning_rate": 6.715672676837725e-06, + "loss": 0.0311, + "step": 4746 + }, + { + "epoch": 3.2885348112227226, + "grad_norm": 0.4751788079738617, + "learning_rate": 6.71497919556172e-06, + "loss": 0.024, + "step": 4747 + }, + { + "epoch": 3.2892275718739175, + "grad_norm": 0.5766786336898804, + "learning_rate": 6.714285714285714e-06, + "loss": 0.0367, + "step": 4748 + }, + { + "epoch": 3.2899203325251127, + "grad_norm": 0.5312018990516663, + "learning_rate": 6.713592233009709e-06, + "loss": 0.019, + "step": 4749 + }, + { + "epoch": 3.2906130931763076, + "grad_norm": 0.5392399430274963, + "learning_rate": 6.712898751733704e-06, + "loss": 0.0265, + "step": 4750 + }, + { + "epoch": 3.2913058538275024, + "grad_norm": 0.7252957224845886, + "learning_rate": 6.712205270457698e-06, + "loss": 0.0332, + "step": 4751 + }, + { + "epoch": 3.2919986144786977, + "grad_norm": 0.57252436876297, + "learning_rate": 6.711511789181693e-06, + "loss": 0.0282, + "step": 4752 + }, + { + "epoch": 3.2926913751298925, + "grad_norm": 1.0105564594268799, + "learning_rate": 6.710818307905686e-06, + "loss": 0.028, + "step": 4753 + }, + { + "epoch": 3.293384135781088, + "grad_norm": 0.4887373745441437, + "learning_rate": 6.710124826629681e-06, + "loss": 0.0264, + "step": 4754 + }, + { + "epoch": 3.2940768964322826, + "grad_norm": 0.5198420882225037, + "learning_rate": 6.709431345353676e-06, + "loss": 0.0259, + "step": 4755 + }, + { + "epoch": 3.2947696570834775, + "grad_norm": 0.6026989817619324, + "learning_rate": 6.70873786407767e-06, + "loss": 0.027, + "step": 4756 + }, + { + "epoch": 3.2954624177346727, + "grad_norm": 0.5375710129737854, + "learning_rate": 6.708044382801665e-06, + "loss": 0.0236, + "step": 4757 + }, + { + "epoch": 3.2961551783858676, + "grad_norm": 0.5420960783958435, + "learning_rate": 6.7073509015256585e-06, + "loss": 0.0328, + "step": 4758 + }, + { + "epoch": 3.296847939037063, + "grad_norm": 0.5559707880020142, + "learning_rate": 6.7066574202496534e-06, + "loss": 0.0301, + "step": 4759 + }, + { + "epoch": 3.2975406996882577, + "grad_norm": 0.4434460699558258, + "learning_rate": 6.705963938973648e-06, + "loss": 0.0242, + "step": 4760 + }, + { + "epoch": 3.2982334603394525, + "grad_norm": 0.5509433746337891, + "learning_rate": 6.7052704576976425e-06, + "loss": 0.029, + "step": 4761 + }, + { + "epoch": 3.298926220990648, + "grad_norm": 0.6564376950263977, + "learning_rate": 6.704576976421637e-06, + "loss": 0.0284, + "step": 4762 + }, + { + "epoch": 3.2996189816418426, + "grad_norm": 0.5901053547859192, + "learning_rate": 6.7038834951456315e-06, + "loss": 0.036, + "step": 4763 + }, + { + "epoch": 3.300311742293038, + "grad_norm": 0.619019091129303, + "learning_rate": 6.7031900138696256e-06, + "loss": 0.0348, + "step": 4764 + }, + { + "epoch": 3.3010045029442328, + "grad_norm": 0.6057151556015015, + "learning_rate": 6.7024965325936205e-06, + "loss": 0.0338, + "step": 4765 + }, + { + "epoch": 3.3016972635954276, + "grad_norm": 0.7371007800102234, + "learning_rate": 6.701803051317615e-06, + "loss": 0.045, + "step": 4766 + }, + { + "epoch": 3.302390024246623, + "grad_norm": 0.5042701959609985, + "learning_rate": 6.7011095700416095e-06, + "loss": 0.0231, + "step": 4767 + }, + { + "epoch": 3.3030827848978177, + "grad_norm": 0.4735625088214874, + "learning_rate": 6.700416088765604e-06, + "loss": 0.024, + "step": 4768 + }, + { + "epoch": 3.303775545549013, + "grad_norm": 0.669570803642273, + "learning_rate": 6.6997226074895985e-06, + "loss": 0.0325, + "step": 4769 + }, + { + "epoch": 3.304468306200208, + "grad_norm": 0.4947521388530731, + "learning_rate": 6.6990291262135935e-06, + "loss": 0.0261, + "step": 4770 + }, + { + "epoch": 3.3051610668514027, + "grad_norm": 0.5756155848503113, + "learning_rate": 6.698335644937587e-06, + "loss": 0.0366, + "step": 4771 + }, + { + "epoch": 3.305853827502598, + "grad_norm": 0.5192012786865234, + "learning_rate": 6.697642163661582e-06, + "loss": 0.0229, + "step": 4772 + }, + { + "epoch": 3.3065465881537928, + "grad_norm": 0.4495704174041748, + "learning_rate": 6.696948682385576e-06, + "loss": 0.0247, + "step": 4773 + }, + { + "epoch": 3.307239348804988, + "grad_norm": 1.2129278182983398, + "learning_rate": 6.696255201109571e-06, + "loss": 0.0288, + "step": 4774 + }, + { + "epoch": 3.307932109456183, + "grad_norm": 0.4990065097808838, + "learning_rate": 6.695561719833566e-06, + "loss": 0.0231, + "step": 4775 + }, + { + "epoch": 3.3086248701073777, + "grad_norm": 0.6738272905349731, + "learning_rate": 6.694868238557559e-06, + "loss": 0.0325, + "step": 4776 + }, + { + "epoch": 3.309317630758573, + "grad_norm": 0.4718707799911499, + "learning_rate": 6.694174757281554e-06, + "loss": 0.0213, + "step": 4777 + }, + { + "epoch": 3.310010391409768, + "grad_norm": 0.5815008878707886, + "learning_rate": 6.693481276005548e-06, + "loss": 0.0245, + "step": 4778 + }, + { + "epoch": 3.310703152060963, + "grad_norm": 0.6399415135383606, + "learning_rate": 6.692787794729543e-06, + "loss": 0.0285, + "step": 4779 + }, + { + "epoch": 3.311395912712158, + "grad_norm": 0.5460225939750671, + "learning_rate": 6.692094313453538e-06, + "loss": 0.0338, + "step": 4780 + }, + { + "epoch": 3.3120886733633528, + "grad_norm": 0.6218183040618896, + "learning_rate": 6.691400832177532e-06, + "loss": 0.0367, + "step": 4781 + }, + { + "epoch": 3.312781434014548, + "grad_norm": 0.5439281463623047, + "learning_rate": 6.690707350901527e-06, + "loss": 0.0318, + "step": 4782 + }, + { + "epoch": 3.313474194665743, + "grad_norm": 0.6438165307044983, + "learning_rate": 6.69001386962552e-06, + "loss": 0.0356, + "step": 4783 + }, + { + "epoch": 3.314166955316938, + "grad_norm": 0.759637713432312, + "learning_rate": 6.689320388349515e-06, + "loss": 0.0304, + "step": 4784 + }, + { + "epoch": 3.314859715968133, + "grad_norm": 0.6359278559684753, + "learning_rate": 6.68862690707351e-06, + "loss": 0.0277, + "step": 4785 + }, + { + "epoch": 3.315552476619328, + "grad_norm": 0.6943196654319763, + "learning_rate": 6.687933425797504e-06, + "loss": 0.0372, + "step": 4786 + }, + { + "epoch": 3.316245237270523, + "grad_norm": 0.6850655674934387, + "learning_rate": 6.687239944521499e-06, + "loss": 0.0316, + "step": 4787 + }, + { + "epoch": 3.316937997921718, + "grad_norm": 0.5700212121009827, + "learning_rate": 6.686546463245492e-06, + "loss": 0.0278, + "step": 4788 + }, + { + "epoch": 3.3176307585729132, + "grad_norm": 0.669863224029541, + "learning_rate": 6.685852981969487e-06, + "loss": 0.028, + "step": 4789 + }, + { + "epoch": 3.318323519224108, + "grad_norm": 0.5694512724876404, + "learning_rate": 6.685159500693482e-06, + "loss": 0.0292, + "step": 4790 + }, + { + "epoch": 3.319016279875303, + "grad_norm": 0.5469205975532532, + "learning_rate": 6.684466019417476e-06, + "loss": 0.0274, + "step": 4791 + }, + { + "epoch": 3.319709040526498, + "grad_norm": 0.5163033604621887, + "learning_rate": 6.683772538141471e-06, + "loss": 0.0247, + "step": 4792 + }, + { + "epoch": 3.320401801177693, + "grad_norm": 0.5916407704353333, + "learning_rate": 6.683079056865464e-06, + "loss": 0.0317, + "step": 4793 + }, + { + "epoch": 3.3210945618288883, + "grad_norm": 0.6355410218238831, + "learning_rate": 6.682385575589459e-06, + "loss": 0.0348, + "step": 4794 + }, + { + "epoch": 3.321787322480083, + "grad_norm": 0.6622012853622437, + "learning_rate": 6.681692094313454e-06, + "loss": 0.0294, + "step": 4795 + }, + { + "epoch": 3.322480083131278, + "grad_norm": 0.6014128923416138, + "learning_rate": 6.680998613037448e-06, + "loss": 0.0282, + "step": 4796 + }, + { + "epoch": 3.3231728437824732, + "grad_norm": 0.580251157283783, + "learning_rate": 6.680305131761443e-06, + "loss": 0.028, + "step": 4797 + }, + { + "epoch": 3.323865604433668, + "grad_norm": 0.5978350639343262, + "learning_rate": 6.679611650485437e-06, + "loss": 0.0304, + "step": 4798 + }, + { + "epoch": 3.3245583650848634, + "grad_norm": 0.6344513893127441, + "learning_rate": 6.678918169209432e-06, + "loss": 0.0349, + "step": 4799 + }, + { + "epoch": 3.325251125736058, + "grad_norm": 0.5302347540855408, + "learning_rate": 6.678224687933427e-06, + "loss": 0.0243, + "step": 4800 + }, + { + "epoch": 3.325943886387253, + "grad_norm": 0.8989347219467163, + "learning_rate": 6.67753120665742e-06, + "loss": 0.0412, + "step": 4801 + }, + { + "epoch": 3.3266366470384483, + "grad_norm": 0.5310397148132324, + "learning_rate": 6.676837725381415e-06, + "loss": 0.0233, + "step": 4802 + }, + { + "epoch": 3.327329407689643, + "grad_norm": 0.44594675302505493, + "learning_rate": 6.676144244105409e-06, + "loss": 0.0216, + "step": 4803 + }, + { + "epoch": 3.3280221683408384, + "grad_norm": 0.5148747563362122, + "learning_rate": 6.675450762829404e-06, + "loss": 0.0242, + "step": 4804 + }, + { + "epoch": 3.3287149289920333, + "grad_norm": 0.7338892817497253, + "learning_rate": 6.674757281553399e-06, + "loss": 0.0314, + "step": 4805 + }, + { + "epoch": 3.329407689643228, + "grad_norm": 0.511072039604187, + "learning_rate": 6.6740638002773924e-06, + "loss": 0.0206, + "step": 4806 + }, + { + "epoch": 3.3301004502944234, + "grad_norm": 0.5795922875404358, + "learning_rate": 6.673370319001387e-06, + "loss": 0.0273, + "step": 4807 + }, + { + "epoch": 3.330793210945618, + "grad_norm": 0.6510863900184631, + "learning_rate": 6.6726768377253815e-06, + "loss": 0.0233, + "step": 4808 + }, + { + "epoch": 3.3314859715968135, + "grad_norm": 0.6126107573509216, + "learning_rate": 6.671983356449376e-06, + "loss": 0.0284, + "step": 4809 + }, + { + "epoch": 3.3321787322480083, + "grad_norm": 0.5978538990020752, + "learning_rate": 6.671289875173371e-06, + "loss": 0.0308, + "step": 4810 + }, + { + "epoch": 3.332871492899203, + "grad_norm": 0.4925459623336792, + "learning_rate": 6.670596393897365e-06, + "loss": 0.0282, + "step": 4811 + }, + { + "epoch": 3.3335642535503984, + "grad_norm": 0.7590802311897278, + "learning_rate": 6.6699029126213595e-06, + "loss": 0.0356, + "step": 4812 + }, + { + "epoch": 3.3342570142015933, + "grad_norm": 0.7301770448684692, + "learning_rate": 6.669209431345354e-06, + "loss": 0.0281, + "step": 4813 + }, + { + "epoch": 3.3349497748527885, + "grad_norm": 0.44639384746551514, + "learning_rate": 6.6685159500693485e-06, + "loss": 0.0244, + "step": 4814 + }, + { + "epoch": 3.3356425355039834, + "grad_norm": 0.5887128114700317, + "learning_rate": 6.6678224687933435e-06, + "loss": 0.0294, + "step": 4815 + }, + { + "epoch": 3.336335296155178, + "grad_norm": 0.6999009847640991, + "learning_rate": 6.6671289875173375e-06, + "loss": 0.0261, + "step": 4816 + }, + { + "epoch": 3.3370280568063735, + "grad_norm": 0.6228030920028687, + "learning_rate": 6.6664355062413325e-06, + "loss": 0.039, + "step": 4817 + }, + { + "epoch": 3.3377208174575683, + "grad_norm": 0.6191381812095642, + "learning_rate": 6.665742024965326e-06, + "loss": 0.0323, + "step": 4818 + }, + { + "epoch": 3.3384135781087636, + "grad_norm": 0.6570302844047546, + "learning_rate": 6.665048543689321e-06, + "loss": 0.0326, + "step": 4819 + }, + { + "epoch": 3.3391063387599584, + "grad_norm": 0.5036888718605042, + "learning_rate": 6.664355062413316e-06, + "loss": 0.0225, + "step": 4820 + }, + { + "epoch": 3.3397990994111533, + "grad_norm": 0.5116766095161438, + "learning_rate": 6.66366158113731e-06, + "loss": 0.028, + "step": 4821 + }, + { + "epoch": 3.3404918600623486, + "grad_norm": 0.5185744166374207, + "learning_rate": 6.662968099861305e-06, + "loss": 0.0226, + "step": 4822 + }, + { + "epoch": 3.3411846207135434, + "grad_norm": 0.5043239593505859, + "learning_rate": 6.662274618585298e-06, + "loss": 0.0252, + "step": 4823 + }, + { + "epoch": 3.3418773813647387, + "grad_norm": 0.5223549008369446, + "learning_rate": 6.661581137309293e-06, + "loss": 0.0327, + "step": 4824 + }, + { + "epoch": 3.3425701420159335, + "grad_norm": 0.5926573872566223, + "learning_rate": 6.660887656033288e-06, + "loss": 0.0433, + "step": 4825 + }, + { + "epoch": 3.3432629026671283, + "grad_norm": 0.5235224962234497, + "learning_rate": 6.660194174757282e-06, + "loss": 0.0226, + "step": 4826 + }, + { + "epoch": 3.3439556633183236, + "grad_norm": 0.6044923663139343, + "learning_rate": 6.659500693481277e-06, + "loss": 0.0352, + "step": 4827 + }, + { + "epoch": 3.3446484239695184, + "grad_norm": 0.5524185299873352, + "learning_rate": 6.658807212205271e-06, + "loss": 0.0227, + "step": 4828 + }, + { + "epoch": 3.3453411846207137, + "grad_norm": 0.4465431869029999, + "learning_rate": 6.658113730929266e-06, + "loss": 0.0238, + "step": 4829 + }, + { + "epoch": 3.3460339452719086, + "grad_norm": 0.6737369894981384, + "learning_rate": 6.657420249653261e-06, + "loss": 0.0339, + "step": 4830 + }, + { + "epoch": 3.3467267059231034, + "grad_norm": 0.5326617360115051, + "learning_rate": 6.656726768377254e-06, + "loss": 0.029, + "step": 4831 + }, + { + "epoch": 3.3474194665742987, + "grad_norm": 0.5432512760162354, + "learning_rate": 6.656033287101249e-06, + "loss": 0.0248, + "step": 4832 + }, + { + "epoch": 3.3481122272254935, + "grad_norm": 0.6148502826690674, + "learning_rate": 6.655339805825243e-06, + "loss": 0.0311, + "step": 4833 + }, + { + "epoch": 3.348804987876689, + "grad_norm": 0.5826267004013062, + "learning_rate": 6.654646324549238e-06, + "loss": 0.0368, + "step": 4834 + }, + { + "epoch": 3.3494977485278836, + "grad_norm": 0.5656581521034241, + "learning_rate": 6.653952843273233e-06, + "loss": 0.0267, + "step": 4835 + }, + { + "epoch": 3.3501905091790785, + "grad_norm": 0.6797323226928711, + "learning_rate": 6.653259361997226e-06, + "loss": 0.0282, + "step": 4836 + }, + { + "epoch": 3.3508832698302737, + "grad_norm": 0.6435461044311523, + "learning_rate": 6.652565880721221e-06, + "loss": 0.035, + "step": 4837 + }, + { + "epoch": 3.3515760304814686, + "grad_norm": 0.5476511120796204, + "learning_rate": 6.651872399445215e-06, + "loss": 0.0285, + "step": 4838 + }, + { + "epoch": 3.352268791132664, + "grad_norm": 0.4512452483177185, + "learning_rate": 6.65117891816921e-06, + "loss": 0.0207, + "step": 4839 + }, + { + "epoch": 3.3529615517838587, + "grad_norm": 0.5661075711250305, + "learning_rate": 6.650485436893205e-06, + "loss": 0.026, + "step": 4840 + }, + { + "epoch": 3.3536543124350535, + "grad_norm": 0.5836403965950012, + "learning_rate": 6.649791955617198e-06, + "loss": 0.0269, + "step": 4841 + }, + { + "epoch": 3.354347073086249, + "grad_norm": 0.6065439581871033, + "learning_rate": 6.649098474341193e-06, + "loss": 0.0319, + "step": 4842 + }, + { + "epoch": 3.3550398337374436, + "grad_norm": 0.5517366528511047, + "learning_rate": 6.648404993065187e-06, + "loss": 0.0324, + "step": 4843 + }, + { + "epoch": 3.355732594388639, + "grad_norm": 0.4713611900806427, + "learning_rate": 6.647711511789182e-06, + "loss": 0.0256, + "step": 4844 + }, + { + "epoch": 3.3564253550398337, + "grad_norm": 0.6643800139427185, + "learning_rate": 6.647018030513177e-06, + "loss": 0.0294, + "step": 4845 + }, + { + "epoch": 3.3571181156910286, + "grad_norm": 0.5426997542381287, + "learning_rate": 6.646324549237171e-06, + "loss": 0.0305, + "step": 4846 + }, + { + "epoch": 3.357810876342224, + "grad_norm": 0.5457416772842407, + "learning_rate": 6.645631067961166e-06, + "loss": 0.0293, + "step": 4847 + }, + { + "epoch": 3.3585036369934187, + "grad_norm": 0.6430615186691284, + "learning_rate": 6.644937586685159e-06, + "loss": 0.0297, + "step": 4848 + }, + { + "epoch": 3.359196397644614, + "grad_norm": 0.6658140420913696, + "learning_rate": 6.644244105409154e-06, + "loss": 0.0296, + "step": 4849 + }, + { + "epoch": 3.359889158295809, + "grad_norm": 0.6753531098365784, + "learning_rate": 6.643550624133149e-06, + "loss": 0.0278, + "step": 4850 + }, + { + "epoch": 3.3605819189470036, + "grad_norm": 0.5653782486915588, + "learning_rate": 6.642857142857143e-06, + "loss": 0.0279, + "step": 4851 + }, + { + "epoch": 3.361274679598199, + "grad_norm": 0.7654158473014832, + "learning_rate": 6.642163661581138e-06, + "loss": 0.0312, + "step": 4852 + }, + { + "epoch": 3.3619674402493938, + "grad_norm": 0.495338499546051, + "learning_rate": 6.6414701803051314e-06, + "loss": 0.024, + "step": 4853 + }, + { + "epoch": 3.362660200900589, + "grad_norm": 0.7510350346565247, + "learning_rate": 6.640776699029126e-06, + "loss": 0.0263, + "step": 4854 + }, + { + "epoch": 3.363352961551784, + "grad_norm": 0.5413582921028137, + "learning_rate": 6.640083217753121e-06, + "loss": 0.0259, + "step": 4855 + }, + { + "epoch": 3.3640457222029787, + "grad_norm": 0.5555694699287415, + "learning_rate": 6.639389736477115e-06, + "loss": 0.0291, + "step": 4856 + }, + { + "epoch": 3.364738482854174, + "grad_norm": 0.6627641916275024, + "learning_rate": 6.63869625520111e-06, + "loss": 0.0326, + "step": 4857 + }, + { + "epoch": 3.365431243505369, + "grad_norm": 0.46388813853263855, + "learning_rate": 6.638002773925104e-06, + "loss": 0.0217, + "step": 4858 + }, + { + "epoch": 3.366124004156564, + "grad_norm": 0.603958010673523, + "learning_rate": 6.637309292649099e-06, + "loss": 0.03, + "step": 4859 + }, + { + "epoch": 3.366816764807759, + "grad_norm": 0.6201016306877136, + "learning_rate": 6.6366158113730934e-06, + "loss": 0.022, + "step": 4860 + }, + { + "epoch": 3.3675095254589538, + "grad_norm": 0.5489681363105774, + "learning_rate": 6.6359223300970875e-06, + "loss": 0.0275, + "step": 4861 + }, + { + "epoch": 3.368202286110149, + "grad_norm": 0.5528520345687866, + "learning_rate": 6.6352288488210825e-06, + "loss": 0.0229, + "step": 4862 + }, + { + "epoch": 3.368895046761344, + "grad_norm": 0.5541024804115295, + "learning_rate": 6.6345353675450765e-06, + "loss": 0.0317, + "step": 4863 + }, + { + "epoch": 3.369587807412539, + "grad_norm": 0.519614040851593, + "learning_rate": 6.6338418862690715e-06, + "loss": 0.0265, + "step": 4864 + }, + { + "epoch": 3.370280568063734, + "grad_norm": 0.6692903637886047, + "learning_rate": 6.633148404993066e-06, + "loss": 0.0298, + "step": 4865 + }, + { + "epoch": 3.370973328714929, + "grad_norm": 0.6233783960342407, + "learning_rate": 6.63245492371706e-06, + "loss": 0.0327, + "step": 4866 + }, + { + "epoch": 3.371666089366124, + "grad_norm": 0.6404414176940918, + "learning_rate": 6.631761442441055e-06, + "loss": 0.0327, + "step": 4867 + }, + { + "epoch": 3.372358850017319, + "grad_norm": 0.46966442465782166, + "learning_rate": 6.631067961165049e-06, + "loss": 0.0196, + "step": 4868 + }, + { + "epoch": 3.373051610668514, + "grad_norm": 0.6829743981361389, + "learning_rate": 6.630374479889044e-06, + "loss": 0.0379, + "step": 4869 + }, + { + "epoch": 3.373744371319709, + "grad_norm": 0.5777661800384521, + "learning_rate": 6.6296809986130385e-06, + "loss": 0.025, + "step": 4870 + }, + { + "epoch": 3.374437131970904, + "grad_norm": 0.7087007164955139, + "learning_rate": 6.628987517337032e-06, + "loss": 0.0283, + "step": 4871 + }, + { + "epoch": 3.375129892622099, + "grad_norm": 0.5589938163757324, + "learning_rate": 6.628294036061027e-06, + "loss": 0.029, + "step": 4872 + }, + { + "epoch": 3.375822653273294, + "grad_norm": 0.5609465837478638, + "learning_rate": 6.627600554785021e-06, + "loss": 0.0313, + "step": 4873 + }, + { + "epoch": 3.376515413924489, + "grad_norm": 0.5925002098083496, + "learning_rate": 6.626907073509016e-06, + "loss": 0.0396, + "step": 4874 + }, + { + "epoch": 3.377208174575684, + "grad_norm": 0.5495500564575195, + "learning_rate": 6.626213592233011e-06, + "loss": 0.0216, + "step": 4875 + }, + { + "epoch": 3.377900935226879, + "grad_norm": 0.564579963684082, + "learning_rate": 6.625520110957005e-06, + "loss": 0.0229, + "step": 4876 + }, + { + "epoch": 3.3785936958780742, + "grad_norm": 0.5010783076286316, + "learning_rate": 6.624826629681e-06, + "loss": 0.0222, + "step": 4877 + }, + { + "epoch": 3.379286456529269, + "grad_norm": 0.6719745993614197, + "learning_rate": 6.624133148404993e-06, + "loss": 0.0407, + "step": 4878 + }, + { + "epoch": 3.3799792171804643, + "grad_norm": 0.5503398776054382, + "learning_rate": 6.623439667128988e-06, + "loss": 0.0295, + "step": 4879 + }, + { + "epoch": 3.380671977831659, + "grad_norm": 0.5117623805999756, + "learning_rate": 6.622746185852983e-06, + "loss": 0.0266, + "step": 4880 + }, + { + "epoch": 3.381364738482854, + "grad_norm": 0.460734486579895, + "learning_rate": 6.622052704576977e-06, + "loss": 0.0252, + "step": 4881 + }, + { + "epoch": 3.3820574991340493, + "grad_norm": 0.5623500347137451, + "learning_rate": 6.621359223300972e-06, + "loss": 0.0277, + "step": 4882 + }, + { + "epoch": 3.382750259785244, + "grad_norm": 0.623411238193512, + "learning_rate": 6.620665742024965e-06, + "loss": 0.0263, + "step": 4883 + }, + { + "epoch": 3.383443020436439, + "grad_norm": 0.6305052638053894, + "learning_rate": 6.61997226074896e-06, + "loss": 0.0281, + "step": 4884 + }, + { + "epoch": 3.3841357810876342, + "grad_norm": 0.5371508598327637, + "learning_rate": 6.619278779472955e-06, + "loss": 0.0243, + "step": 4885 + }, + { + "epoch": 3.384828541738829, + "grad_norm": 0.5948777794837952, + "learning_rate": 6.618585298196949e-06, + "loss": 0.0332, + "step": 4886 + }, + { + "epoch": 3.3855213023900244, + "grad_norm": 0.6408388614654541, + "learning_rate": 6.617891816920944e-06, + "loss": 0.0306, + "step": 4887 + }, + { + "epoch": 3.386214063041219, + "grad_norm": 0.4946375787258148, + "learning_rate": 6.617198335644938e-06, + "loss": 0.0199, + "step": 4888 + }, + { + "epoch": 3.3869068236924145, + "grad_norm": 0.5868860483169556, + "learning_rate": 6.616504854368933e-06, + "loss": 0.0331, + "step": 4889 + }, + { + "epoch": 3.3875995843436093, + "grad_norm": 0.5085358619689941, + "learning_rate": 6.615811373092927e-06, + "loss": 0.0259, + "step": 4890 + }, + { + "epoch": 3.388292344994804, + "grad_norm": 0.7379584312438965, + "learning_rate": 6.615117891816921e-06, + "loss": 0.0324, + "step": 4891 + }, + { + "epoch": 3.3889851056459994, + "grad_norm": 0.7220065593719482, + "learning_rate": 6.614424410540916e-06, + "loss": 0.0345, + "step": 4892 + }, + { + "epoch": 3.3896778662971943, + "grad_norm": 0.646913468837738, + "learning_rate": 6.61373092926491e-06, + "loss": 0.0353, + "step": 4893 + }, + { + "epoch": 3.390370626948389, + "grad_norm": 0.9849861264228821, + "learning_rate": 6.613037447988905e-06, + "loss": 0.0388, + "step": 4894 + }, + { + "epoch": 3.3910633875995844, + "grad_norm": 0.5091375112533569, + "learning_rate": 6.6123439667129e-06, + "loss": 0.036, + "step": 4895 + }, + { + "epoch": 3.391756148250779, + "grad_norm": 0.5635720491409302, + "learning_rate": 6.611650485436893e-06, + "loss": 0.0294, + "step": 4896 + }, + { + "epoch": 3.3924489089019745, + "grad_norm": 0.546768069267273, + "learning_rate": 6.610957004160888e-06, + "loss": 0.029, + "step": 4897 + }, + { + "epoch": 3.3931416695531693, + "grad_norm": 0.44892817735671997, + "learning_rate": 6.610263522884882e-06, + "loss": 0.0202, + "step": 4898 + }, + { + "epoch": 3.3938344302043646, + "grad_norm": 0.6681339144706726, + "learning_rate": 6.609570041608877e-06, + "loss": 0.0221, + "step": 4899 + }, + { + "epoch": 3.3945271908555594, + "grad_norm": 0.5175942778587341, + "learning_rate": 6.608876560332872e-06, + "loss": 0.023, + "step": 4900 + }, + { + "epoch": 3.3952199515067543, + "grad_norm": 0.5441368818283081, + "learning_rate": 6.608183079056865e-06, + "loss": 0.0303, + "step": 4901 + }, + { + "epoch": 3.3959127121579495, + "grad_norm": 0.5968049168586731, + "learning_rate": 6.60748959778086e-06, + "loss": 0.0267, + "step": 4902 + }, + { + "epoch": 3.3966054728091444, + "grad_norm": 0.4440753757953644, + "learning_rate": 6.606796116504854e-06, + "loss": 0.0168, + "step": 4903 + }, + { + "epoch": 3.397298233460339, + "grad_norm": 0.5775256752967834, + "learning_rate": 6.606102635228849e-06, + "loss": 0.0328, + "step": 4904 + }, + { + "epoch": 3.3979909941115345, + "grad_norm": 0.6890320777893066, + "learning_rate": 6.605409153952844e-06, + "loss": 0.0307, + "step": 4905 + }, + { + "epoch": 3.3986837547627293, + "grad_norm": 0.504628598690033, + "learning_rate": 6.604715672676838e-06, + "loss": 0.0248, + "step": 4906 + }, + { + "epoch": 3.3993765154139246, + "grad_norm": 0.5967927575111389, + "learning_rate": 6.604022191400833e-06, + "loss": 0.0335, + "step": 4907 + }, + { + "epoch": 3.4000692760651194, + "grad_norm": 0.6110968589782715, + "learning_rate": 6.6033287101248265e-06, + "loss": 0.0357, + "step": 4908 + }, + { + "epoch": 3.4007620367163147, + "grad_norm": 0.5766600370407104, + "learning_rate": 6.6026352288488215e-06, + "loss": 0.0329, + "step": 4909 + }, + { + "epoch": 3.4014547973675096, + "grad_norm": 0.5956555008888245, + "learning_rate": 6.601941747572816e-06, + "loss": 0.0268, + "step": 4910 + }, + { + "epoch": 3.4021475580187044, + "grad_norm": 0.8311800360679626, + "learning_rate": 6.6012482662968105e-06, + "loss": 0.0311, + "step": 4911 + }, + { + "epoch": 3.4028403186698997, + "grad_norm": 0.5553978681564331, + "learning_rate": 6.600554785020805e-06, + "loss": 0.0338, + "step": 4912 + }, + { + "epoch": 3.4035330793210945, + "grad_norm": 0.6768361330032349, + "learning_rate": 6.599861303744799e-06, + "loss": 0.035, + "step": 4913 + }, + { + "epoch": 3.4042258399722893, + "grad_norm": 0.46058088541030884, + "learning_rate": 6.599167822468794e-06, + "loss": 0.0233, + "step": 4914 + }, + { + "epoch": 3.4049186006234846, + "grad_norm": 0.48217880725860596, + "learning_rate": 6.5984743411927885e-06, + "loss": 0.0184, + "step": 4915 + }, + { + "epoch": 3.4056113612746794, + "grad_norm": 0.5471254587173462, + "learning_rate": 6.597780859916783e-06, + "loss": 0.0304, + "step": 4916 + }, + { + "epoch": 3.4063041219258747, + "grad_norm": 0.5644373893737793, + "learning_rate": 6.5970873786407775e-06, + "loss": 0.0293, + "step": 4917 + }, + { + "epoch": 3.4069968825770696, + "grad_norm": 0.6265426874160767, + "learning_rate": 6.596393897364772e-06, + "loss": 0.0358, + "step": 4918 + }, + { + "epoch": 3.407689643228265, + "grad_norm": 0.5866125822067261, + "learning_rate": 6.595700416088766e-06, + "loss": 0.0266, + "step": 4919 + }, + { + "epoch": 3.4083824038794597, + "grad_norm": 0.5871446132659912, + "learning_rate": 6.595006934812761e-06, + "loss": 0.0265, + "step": 4920 + }, + { + "epoch": 3.4090751645306545, + "grad_norm": 0.5365444421768188, + "learning_rate": 6.594313453536755e-06, + "loss": 0.0221, + "step": 4921 + }, + { + "epoch": 3.40976792518185, + "grad_norm": 0.465727835893631, + "learning_rate": 6.59361997226075e-06, + "loss": 0.0243, + "step": 4922 + }, + { + "epoch": 3.4104606858330446, + "grad_norm": 0.5453728437423706, + "learning_rate": 6.592926490984744e-06, + "loss": 0.0279, + "step": 4923 + }, + { + "epoch": 3.4111534464842395, + "grad_norm": 0.5906558036804199, + "learning_rate": 6.592233009708739e-06, + "loss": 0.0275, + "step": 4924 + }, + { + "epoch": 3.4118462071354347, + "grad_norm": 0.6555768251419067, + "learning_rate": 6.591539528432734e-06, + "loss": 0.0237, + "step": 4925 + }, + { + "epoch": 3.4125389677866296, + "grad_norm": 0.6605638861656189, + "learning_rate": 6.590846047156727e-06, + "loss": 0.0325, + "step": 4926 + }, + { + "epoch": 3.413231728437825, + "grad_norm": 0.598610520362854, + "learning_rate": 6.590152565880722e-06, + "loss": 0.0376, + "step": 4927 + }, + { + "epoch": 3.4139244890890197, + "grad_norm": 0.6504144072532654, + "learning_rate": 6.589459084604716e-06, + "loss": 0.0331, + "step": 4928 + }, + { + "epoch": 3.414617249740215, + "grad_norm": 0.5567007064819336, + "learning_rate": 6.588765603328711e-06, + "loss": 0.0223, + "step": 4929 + }, + { + "epoch": 3.41531001039141, + "grad_norm": 0.5931421518325806, + "learning_rate": 6.588072122052706e-06, + "loss": 0.0338, + "step": 4930 + }, + { + "epoch": 3.4160027710426046, + "grad_norm": 0.6375455260276794, + "learning_rate": 6.587378640776699e-06, + "loss": 0.0291, + "step": 4931 + }, + { + "epoch": 3.4166955316938, + "grad_norm": 0.6157872080802917, + "learning_rate": 6.586685159500694e-06, + "loss": 0.0323, + "step": 4932 + }, + { + "epoch": 3.4173882923449947, + "grad_norm": 0.5205309391021729, + "learning_rate": 6.585991678224688e-06, + "loss": 0.0223, + "step": 4933 + }, + { + "epoch": 3.4180810529961896, + "grad_norm": 0.521050751209259, + "learning_rate": 6.585298196948683e-06, + "loss": 0.0283, + "step": 4934 + }, + { + "epoch": 3.418773813647385, + "grad_norm": 0.6215744614601135, + "learning_rate": 6.584604715672678e-06, + "loss": 0.0277, + "step": 4935 + }, + { + "epoch": 3.4194665742985797, + "grad_norm": 0.47959715127944946, + "learning_rate": 6.583911234396672e-06, + "loss": 0.02, + "step": 4936 + }, + { + "epoch": 3.420159334949775, + "grad_norm": 0.6303134560585022, + "learning_rate": 6.583217753120667e-06, + "loss": 0.0374, + "step": 4937 + }, + { + "epoch": 3.42085209560097, + "grad_norm": 0.5215510129928589, + "learning_rate": 6.58252427184466e-06, + "loss": 0.0281, + "step": 4938 + }, + { + "epoch": 3.421544856252165, + "grad_norm": 0.614483118057251, + "learning_rate": 6.581830790568655e-06, + "loss": 0.0232, + "step": 4939 + }, + { + "epoch": 3.42223761690336, + "grad_norm": 0.6388923525810242, + "learning_rate": 6.58113730929265e-06, + "loss": 0.0273, + "step": 4940 + }, + { + "epoch": 3.4229303775545548, + "grad_norm": 0.6062755584716797, + "learning_rate": 6.580443828016644e-06, + "loss": 0.0343, + "step": 4941 + }, + { + "epoch": 3.42362313820575, + "grad_norm": 0.6559286713600159, + "learning_rate": 6.579750346740639e-06, + "loss": 0.0336, + "step": 4942 + }, + { + "epoch": 3.424315898856945, + "grad_norm": 0.6256927251815796, + "learning_rate": 6.579056865464632e-06, + "loss": 0.0353, + "step": 4943 + }, + { + "epoch": 3.4250086595081397, + "grad_norm": 0.571751594543457, + "learning_rate": 6.578363384188627e-06, + "loss": 0.0292, + "step": 4944 + }, + { + "epoch": 3.425701420159335, + "grad_norm": 0.5033543705940247, + "learning_rate": 6.577669902912622e-06, + "loss": 0.024, + "step": 4945 + }, + { + "epoch": 3.42639418081053, + "grad_norm": 0.5530136227607727, + "learning_rate": 6.576976421636616e-06, + "loss": 0.0333, + "step": 4946 + }, + { + "epoch": 3.427086941461725, + "grad_norm": 0.6370141506195068, + "learning_rate": 6.576282940360611e-06, + "loss": 0.035, + "step": 4947 + }, + { + "epoch": 3.42777970211292, + "grad_norm": 0.5829577445983887, + "learning_rate": 6.575589459084604e-06, + "loss": 0.0289, + "step": 4948 + }, + { + "epoch": 3.428472462764115, + "grad_norm": 1.0694974660873413, + "learning_rate": 6.574895977808599e-06, + "loss": 0.0378, + "step": 4949 + }, + { + "epoch": 3.42916522341531, + "grad_norm": 0.49449989199638367, + "learning_rate": 6.574202496532594e-06, + "loss": 0.0259, + "step": 4950 + }, + { + "epoch": 3.429857984066505, + "grad_norm": 0.5746321678161621, + "learning_rate": 6.573509015256588e-06, + "loss": 0.0357, + "step": 4951 + }, + { + "epoch": 3.4305507447177, + "grad_norm": 0.5766549706459045, + "learning_rate": 6.572815533980583e-06, + "loss": 0.0369, + "step": 4952 + }, + { + "epoch": 3.431243505368895, + "grad_norm": 0.5804804563522339, + "learning_rate": 6.572122052704577e-06, + "loss": 0.0377, + "step": 4953 + }, + { + "epoch": 3.43193626602009, + "grad_norm": 0.5649195909500122, + "learning_rate": 6.571428571428572e-06, + "loss": 0.0225, + "step": 4954 + }, + { + "epoch": 3.432629026671285, + "grad_norm": 0.6376426815986633, + "learning_rate": 6.570735090152567e-06, + "loss": 0.0284, + "step": 4955 + }, + { + "epoch": 3.43332178732248, + "grad_norm": 0.5937965512275696, + "learning_rate": 6.5700416088765605e-06, + "loss": 0.0327, + "step": 4956 + }, + { + "epoch": 3.4340145479736752, + "grad_norm": 0.6171954870223999, + "learning_rate": 6.569348127600555e-06, + "loss": 0.0385, + "step": 4957 + }, + { + "epoch": 3.43470730862487, + "grad_norm": 0.7144383192062378, + "learning_rate": 6.5686546463245495e-06, + "loss": 0.0367, + "step": 4958 + }, + { + "epoch": 3.4354000692760653, + "grad_norm": 0.6109632849693298, + "learning_rate": 6.567961165048544e-06, + "loss": 0.0359, + "step": 4959 + }, + { + "epoch": 3.43609282992726, + "grad_norm": 0.6784688830375671, + "learning_rate": 6.567267683772539e-06, + "loss": 0.0275, + "step": 4960 + }, + { + "epoch": 3.436785590578455, + "grad_norm": 0.6274477243423462, + "learning_rate": 6.566574202496533e-06, + "loss": 0.0341, + "step": 4961 + }, + { + "epoch": 3.4374783512296503, + "grad_norm": 0.5536825656890869, + "learning_rate": 6.5658807212205275e-06, + "loss": 0.0248, + "step": 4962 + }, + { + "epoch": 3.438171111880845, + "grad_norm": 0.5654683113098145, + "learning_rate": 6.565187239944522e-06, + "loss": 0.0381, + "step": 4963 + }, + { + "epoch": 3.43886387253204, + "grad_norm": 0.6740880012512207, + "learning_rate": 6.5644937586685165e-06, + "loss": 0.0247, + "step": 4964 + }, + { + "epoch": 3.4395566331832352, + "grad_norm": 0.6297820210456848, + "learning_rate": 6.5638002773925115e-06, + "loss": 0.031, + "step": 4965 + }, + { + "epoch": 3.44024939383443, + "grad_norm": 0.5881513357162476, + "learning_rate": 6.5631067961165056e-06, + "loss": 0.0326, + "step": 4966 + }, + { + "epoch": 3.4409421544856253, + "grad_norm": 0.6150519847869873, + "learning_rate": 6.5624133148405e-06, + "loss": 0.0377, + "step": 4967 + }, + { + "epoch": 3.44163491513682, + "grad_norm": 0.5904532074928284, + "learning_rate": 6.561719833564494e-06, + "loss": 0.0304, + "step": 4968 + }, + { + "epoch": 3.4423276757880155, + "grad_norm": 0.5624169111251831, + "learning_rate": 6.561026352288489e-06, + "loss": 0.0307, + "step": 4969 + }, + { + "epoch": 3.4430204364392103, + "grad_norm": 0.7530590891838074, + "learning_rate": 6.560332871012484e-06, + "loss": 0.0328, + "step": 4970 + }, + { + "epoch": 3.443713197090405, + "grad_norm": 0.6149039268493652, + "learning_rate": 6.559639389736478e-06, + "loss": 0.0247, + "step": 4971 + }, + { + "epoch": 3.4444059577416004, + "grad_norm": 0.6508607864379883, + "learning_rate": 6.558945908460473e-06, + "loss": 0.0268, + "step": 4972 + }, + { + "epoch": 3.4450987183927952, + "grad_norm": 0.6199063062667847, + "learning_rate": 6.558252427184466e-06, + "loss": 0.0278, + "step": 4973 + }, + { + "epoch": 3.44579147904399, + "grad_norm": 0.695668637752533, + "learning_rate": 6.557558945908461e-06, + "loss": 0.0411, + "step": 4974 + }, + { + "epoch": 3.4464842396951854, + "grad_norm": 0.6539093852043152, + "learning_rate": 6.556865464632456e-06, + "loss": 0.0326, + "step": 4975 + }, + { + "epoch": 3.44717700034638, + "grad_norm": 0.5350441932678223, + "learning_rate": 6.55617198335645e-06, + "loss": 0.0255, + "step": 4976 + }, + { + "epoch": 3.4478697609975755, + "grad_norm": 0.6660082340240479, + "learning_rate": 6.555478502080445e-06, + "loss": 0.0379, + "step": 4977 + }, + { + "epoch": 3.4485625216487703, + "grad_norm": 0.6372496485710144, + "learning_rate": 6.554785020804438e-06, + "loss": 0.024, + "step": 4978 + }, + { + "epoch": 3.4492552822999656, + "grad_norm": 0.8061148524284363, + "learning_rate": 6.554091539528433e-06, + "loss": 0.0353, + "step": 4979 + }, + { + "epoch": 3.4499480429511604, + "grad_norm": 0.5602715611457825, + "learning_rate": 6.553398058252428e-06, + "loss": 0.0229, + "step": 4980 + }, + { + "epoch": 3.4506408036023553, + "grad_norm": 0.610578715801239, + "learning_rate": 6.552704576976422e-06, + "loss": 0.0353, + "step": 4981 + }, + { + "epoch": 3.4513335642535505, + "grad_norm": 0.585390031337738, + "learning_rate": 6.552011095700417e-06, + "loss": 0.031, + "step": 4982 + }, + { + "epoch": 3.4520263249047454, + "grad_norm": 0.5611028075218201, + "learning_rate": 6.551317614424411e-06, + "loss": 0.0278, + "step": 4983 + }, + { + "epoch": 3.45271908555594, + "grad_norm": 0.8584210872650146, + "learning_rate": 6.550624133148406e-06, + "loss": 0.0362, + "step": 4984 + }, + { + "epoch": 3.4534118462071355, + "grad_norm": 0.6460939049720764, + "learning_rate": 6.549930651872401e-06, + "loss": 0.0327, + "step": 4985 + }, + { + "epoch": 3.4541046068583303, + "grad_norm": 0.5460928082466125, + "learning_rate": 6.549237170596394e-06, + "loss": 0.0314, + "step": 4986 + }, + { + "epoch": 3.4547973675095256, + "grad_norm": 0.4444792568683624, + "learning_rate": 6.548543689320389e-06, + "loss": 0.0232, + "step": 4987 + }, + { + "epoch": 3.4554901281607204, + "grad_norm": 0.5863757729530334, + "learning_rate": 6.547850208044383e-06, + "loss": 0.0225, + "step": 4988 + }, + { + "epoch": 3.4561828888119157, + "grad_norm": 0.6400433778762817, + "learning_rate": 6.547156726768378e-06, + "loss": 0.0237, + "step": 4989 + }, + { + "epoch": 3.4568756494631105, + "grad_norm": 0.6727641224861145, + "learning_rate": 6.546463245492373e-06, + "loss": 0.0396, + "step": 4990 + }, + { + "epoch": 3.4575684101143054, + "grad_norm": 0.4800737500190735, + "learning_rate": 6.545769764216366e-06, + "loss": 0.0213, + "step": 4991 + }, + { + "epoch": 3.4582611707655007, + "grad_norm": 0.6022713780403137, + "learning_rate": 6.545076282940361e-06, + "loss": 0.0287, + "step": 4992 + }, + { + "epoch": 3.4589539314166955, + "grad_norm": 0.607678234577179, + "learning_rate": 6.544382801664355e-06, + "loss": 0.0314, + "step": 4993 + }, + { + "epoch": 3.4596466920678903, + "grad_norm": 0.6265416741371155, + "learning_rate": 6.54368932038835e-06, + "loss": 0.0311, + "step": 4994 + }, + { + "epoch": 3.4603394527190856, + "grad_norm": 0.7412983179092407, + "learning_rate": 6.542995839112345e-06, + "loss": 0.0375, + "step": 4995 + }, + { + "epoch": 3.4610322133702804, + "grad_norm": 0.6574375033378601, + "learning_rate": 6.542302357836338e-06, + "loss": 0.0373, + "step": 4996 + }, + { + "epoch": 3.4617249740214757, + "grad_norm": 0.550150990486145, + "learning_rate": 6.541608876560333e-06, + "loss": 0.0311, + "step": 4997 + }, + { + "epoch": 3.4624177346726706, + "grad_norm": 0.6278256773948669, + "learning_rate": 6.540915395284327e-06, + "loss": 0.0272, + "step": 4998 + }, + { + "epoch": 3.463110495323866, + "grad_norm": 0.621241569519043, + "learning_rate": 6.540221914008322e-06, + "loss": 0.0335, + "step": 4999 + }, + { + "epoch": 3.4638032559750607, + "grad_norm": 0.6454599499702454, + "learning_rate": 6.539528432732317e-06, + "loss": 0.029, + "step": 5000 + }, + { + "epoch": 3.4644960166262555, + "grad_norm": 0.5408979058265686, + "learning_rate": 6.538834951456311e-06, + "loss": 0.0277, + "step": 5001 + }, + { + "epoch": 3.465188777277451, + "grad_norm": 0.7437618374824524, + "learning_rate": 6.538141470180306e-06, + "loss": 0.031, + "step": 5002 + }, + { + "epoch": 3.4658815379286456, + "grad_norm": 0.5505980849266052, + "learning_rate": 6.5374479889042995e-06, + "loss": 0.03, + "step": 5003 + }, + { + "epoch": 3.4665742985798405, + "grad_norm": 0.5235286355018616, + "learning_rate": 6.536754507628294e-06, + "loss": 0.0236, + "step": 5004 + }, + { + "epoch": 3.4672670592310357, + "grad_norm": 0.5948695540428162, + "learning_rate": 6.536061026352289e-06, + "loss": 0.0313, + "step": 5005 + }, + { + "epoch": 3.4679598198822306, + "grad_norm": 0.5590401291847229, + "learning_rate": 6.535367545076283e-06, + "loss": 0.029, + "step": 5006 + }, + { + "epoch": 3.468652580533426, + "grad_norm": 0.5125772356987, + "learning_rate": 6.534674063800278e-06, + "loss": 0.0256, + "step": 5007 + }, + { + "epoch": 3.4693453411846207, + "grad_norm": 0.6227302551269531, + "learning_rate": 6.533980582524272e-06, + "loss": 0.0316, + "step": 5008 + }, + { + "epoch": 3.470038101835816, + "grad_norm": 0.5919042229652405, + "learning_rate": 6.5332871012482665e-06, + "loss": 0.0293, + "step": 5009 + }, + { + "epoch": 3.470730862487011, + "grad_norm": 0.5775042176246643, + "learning_rate": 6.5325936199722614e-06, + "loss": 0.0313, + "step": 5010 + }, + { + "epoch": 3.4714236231382056, + "grad_norm": 0.5381078124046326, + "learning_rate": 6.5319001386962555e-06, + "loss": 0.0297, + "step": 5011 + }, + { + "epoch": 3.472116383789401, + "grad_norm": 0.5170828700065613, + "learning_rate": 6.5312066574202505e-06, + "loss": 0.0318, + "step": 5012 + }, + { + "epoch": 3.4728091444405957, + "grad_norm": 0.6526956558227539, + "learning_rate": 6.5305131761442446e-06, + "loss": 0.0333, + "step": 5013 + }, + { + "epoch": 3.4735019050917906, + "grad_norm": 0.6767124533653259, + "learning_rate": 6.5298196948682395e-06, + "loss": 0.0355, + "step": 5014 + }, + { + "epoch": 3.474194665742986, + "grad_norm": 0.5965400338172913, + "learning_rate": 6.5291262135922336e-06, + "loss": 0.0322, + "step": 5015 + }, + { + "epoch": 3.4748874263941807, + "grad_norm": 0.5544615387916565, + "learning_rate": 6.528432732316228e-06, + "loss": 0.0251, + "step": 5016 + }, + { + "epoch": 3.475580187045376, + "grad_norm": 0.636720597743988, + "learning_rate": 6.527739251040223e-06, + "loss": 0.0326, + "step": 5017 + }, + { + "epoch": 3.476272947696571, + "grad_norm": 0.4987667500972748, + "learning_rate": 6.527045769764217e-06, + "loss": 0.0281, + "step": 5018 + }, + { + "epoch": 3.476965708347766, + "grad_norm": 0.5134775042533875, + "learning_rate": 6.526352288488212e-06, + "loss": 0.0272, + "step": 5019 + }, + { + "epoch": 3.477658468998961, + "grad_norm": 0.6622617840766907, + "learning_rate": 6.5256588072122065e-06, + "loss": 0.0332, + "step": 5020 + }, + { + "epoch": 3.4783512296501558, + "grad_norm": 0.6579943895339966, + "learning_rate": 6.5249653259362e-06, + "loss": 0.0304, + "step": 5021 + }, + { + "epoch": 3.479043990301351, + "grad_norm": 0.6182815432548523, + "learning_rate": 6.524271844660195e-06, + "loss": 0.0328, + "step": 5022 + }, + { + "epoch": 3.479736750952546, + "grad_norm": 0.5382652878761292, + "learning_rate": 6.523578363384189e-06, + "loss": 0.0253, + "step": 5023 + }, + { + "epoch": 3.4804295116037407, + "grad_norm": 0.7151105999946594, + "learning_rate": 6.522884882108184e-06, + "loss": 0.0303, + "step": 5024 + }, + { + "epoch": 3.481122272254936, + "grad_norm": 0.5923858284950256, + "learning_rate": 6.522191400832179e-06, + "loss": 0.0251, + "step": 5025 + }, + { + "epoch": 3.481815032906131, + "grad_norm": 0.5400237441062927, + "learning_rate": 6.521497919556172e-06, + "loss": 0.0305, + "step": 5026 + }, + { + "epoch": 3.482507793557326, + "grad_norm": 0.5522944331169128, + "learning_rate": 6.520804438280167e-06, + "loss": 0.0262, + "step": 5027 + }, + { + "epoch": 3.483200554208521, + "grad_norm": 0.7159655094146729, + "learning_rate": 6.520110957004161e-06, + "loss": 0.0268, + "step": 5028 + }, + { + "epoch": 3.483893314859716, + "grad_norm": 0.7188911437988281, + "learning_rate": 6.519417475728156e-06, + "loss": 0.0395, + "step": 5029 + }, + { + "epoch": 3.484586075510911, + "grad_norm": 0.5369138717651367, + "learning_rate": 6.518723994452151e-06, + "loss": 0.0319, + "step": 5030 + }, + { + "epoch": 3.485278836162106, + "grad_norm": 0.6051187515258789, + "learning_rate": 6.518030513176145e-06, + "loss": 0.038, + "step": 5031 + }, + { + "epoch": 3.485971596813301, + "grad_norm": 0.4840165376663208, + "learning_rate": 6.51733703190014e-06, + "loss": 0.022, + "step": 5032 + }, + { + "epoch": 3.486664357464496, + "grad_norm": 0.5075309872627258, + "learning_rate": 6.516643550624133e-06, + "loss": 0.026, + "step": 5033 + }, + { + "epoch": 3.487357118115691, + "grad_norm": 0.5605731010437012, + "learning_rate": 6.515950069348128e-06, + "loss": 0.0254, + "step": 5034 + }, + { + "epoch": 3.488049878766886, + "grad_norm": 0.5532118082046509, + "learning_rate": 6.515256588072123e-06, + "loss": 0.0245, + "step": 5035 + }, + { + "epoch": 3.488742639418081, + "grad_norm": 0.5283900499343872, + "learning_rate": 6.514563106796117e-06, + "loss": 0.0306, + "step": 5036 + }, + { + "epoch": 3.489435400069276, + "grad_norm": 0.5738806128501892, + "learning_rate": 6.513869625520112e-06, + "loss": 0.028, + "step": 5037 + }, + { + "epoch": 3.490128160720471, + "grad_norm": 0.45622918009757996, + "learning_rate": 6.513176144244105e-06, + "loss": 0.0211, + "step": 5038 + }, + { + "epoch": 3.4908209213716663, + "grad_norm": 0.5849683284759521, + "learning_rate": 6.5124826629681e-06, + "loss": 0.0345, + "step": 5039 + }, + { + "epoch": 3.491513682022861, + "grad_norm": 0.6652094721794128, + "learning_rate": 6.511789181692095e-06, + "loss": 0.0322, + "step": 5040 + }, + { + "epoch": 3.492206442674056, + "grad_norm": 0.5022942423820496, + "learning_rate": 6.511095700416089e-06, + "loss": 0.0213, + "step": 5041 + }, + { + "epoch": 3.4928992033252513, + "grad_norm": 0.5821471810340881, + "learning_rate": 6.510402219140084e-06, + "loss": 0.0325, + "step": 5042 + }, + { + "epoch": 3.493591963976446, + "grad_norm": 0.5777978897094727, + "learning_rate": 6.509708737864078e-06, + "loss": 0.0303, + "step": 5043 + }, + { + "epoch": 3.494284724627641, + "grad_norm": 0.5560289621353149, + "learning_rate": 6.509015256588072e-06, + "loss": 0.0324, + "step": 5044 + }, + { + "epoch": 3.4949774852788362, + "grad_norm": 0.5134693384170532, + "learning_rate": 6.508321775312067e-06, + "loss": 0.0213, + "step": 5045 + }, + { + "epoch": 3.495670245930031, + "grad_norm": 0.6379109025001526, + "learning_rate": 6.507628294036061e-06, + "loss": 0.0365, + "step": 5046 + }, + { + "epoch": 3.4963630065812263, + "grad_norm": 0.5841686725616455, + "learning_rate": 6.506934812760056e-06, + "loss": 0.0234, + "step": 5047 + }, + { + "epoch": 3.497055767232421, + "grad_norm": 0.5673247575759888, + "learning_rate": 6.50624133148405e-06, + "loss": 0.0202, + "step": 5048 + }, + { + "epoch": 3.4977485278836165, + "grad_norm": 0.5946432948112488, + "learning_rate": 6.505547850208045e-06, + "loss": 0.0318, + "step": 5049 + }, + { + "epoch": 3.4984412885348113, + "grad_norm": 0.5511200428009033, + "learning_rate": 6.50485436893204e-06, + "loss": 0.0251, + "step": 5050 + }, + { + "epoch": 3.499134049186006, + "grad_norm": 0.5466020703315735, + "learning_rate": 6.504160887656033e-06, + "loss": 0.0239, + "step": 5051 + }, + { + "epoch": 3.4998268098372014, + "grad_norm": 0.6413298845291138, + "learning_rate": 6.503467406380028e-06, + "loss": 0.0375, + "step": 5052 + }, + { + "epoch": 3.5005195704883962, + "grad_norm": 0.6071054935455322, + "learning_rate": 6.502773925104022e-06, + "loss": 0.029, + "step": 5053 + }, + { + "epoch": 3.501212331139591, + "grad_norm": 0.7107934355735779, + "learning_rate": 6.502080443828017e-06, + "loss": 0.0305, + "step": 5054 + }, + { + "epoch": 3.5019050917907864, + "grad_norm": 0.6063798666000366, + "learning_rate": 6.501386962552012e-06, + "loss": 0.0284, + "step": 5055 + }, + { + "epoch": 3.502597852441981, + "grad_norm": 0.6412544250488281, + "learning_rate": 6.5006934812760055e-06, + "loss": 0.0352, + "step": 5056 + }, + { + "epoch": 3.5032906130931765, + "grad_norm": 0.6058118343353271, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.0317, + "step": 5057 + }, + { + "epoch": 3.5039833737443713, + "grad_norm": 0.6159127950668335, + "learning_rate": 6.4993065187239945e-06, + "loss": 0.0363, + "step": 5058 + }, + { + "epoch": 3.5046761343955666, + "grad_norm": 0.5549063682556152, + "learning_rate": 6.4986130374479895e-06, + "loss": 0.031, + "step": 5059 + }, + { + "epoch": 3.5053688950467614, + "grad_norm": 0.7032778859138489, + "learning_rate": 6.497919556171984e-06, + "loss": 0.0319, + "step": 5060 + }, + { + "epoch": 3.5060616556979562, + "grad_norm": 0.6714448928833008, + "learning_rate": 6.4972260748959785e-06, + "loss": 0.0345, + "step": 5061 + }, + { + "epoch": 3.5067544163491515, + "grad_norm": 0.46812206506729126, + "learning_rate": 6.496532593619973e-06, + "loss": 0.0268, + "step": 5062 + }, + { + "epoch": 3.5074471770003464, + "grad_norm": 0.5488625764846802, + "learning_rate": 6.495839112343967e-06, + "loss": 0.0276, + "step": 5063 + }, + { + "epoch": 3.508139937651541, + "grad_norm": 0.5239285826683044, + "learning_rate": 6.495145631067962e-06, + "loss": 0.0251, + "step": 5064 + }, + { + "epoch": 3.5088326983027365, + "grad_norm": 0.5092358589172363, + "learning_rate": 6.4944521497919565e-06, + "loss": 0.0253, + "step": 5065 + }, + { + "epoch": 3.5095254589539313, + "grad_norm": 0.7008700370788574, + "learning_rate": 6.493758668515951e-06, + "loss": 0.0242, + "step": 5066 + }, + { + "epoch": 3.5102182196051266, + "grad_norm": 0.5645884871482849, + "learning_rate": 6.4930651872399455e-06, + "loss": 0.0311, + "step": 5067 + }, + { + "epoch": 3.5109109802563214, + "grad_norm": 0.7841275930404663, + "learning_rate": 6.492371705963939e-06, + "loss": 0.0437, + "step": 5068 + }, + { + "epoch": 3.5116037409075167, + "grad_norm": 0.5667563676834106, + "learning_rate": 6.491678224687934e-06, + "loss": 0.026, + "step": 5069 + }, + { + "epoch": 3.5122965015587115, + "grad_norm": 0.6524698734283447, + "learning_rate": 6.490984743411929e-06, + "loss": 0.0303, + "step": 5070 + }, + { + "epoch": 3.5129892622099064, + "grad_norm": 0.6123387217521667, + "learning_rate": 6.490291262135923e-06, + "loss": 0.036, + "step": 5071 + }, + { + "epoch": 3.5136820228611017, + "grad_norm": 0.6954811811447144, + "learning_rate": 6.489597780859918e-06, + "loss": 0.0358, + "step": 5072 + }, + { + "epoch": 3.5143747835122965, + "grad_norm": 0.5746607780456543, + "learning_rate": 6.488904299583911e-06, + "loss": 0.0387, + "step": 5073 + }, + { + "epoch": 3.5150675441634913, + "grad_norm": 0.6515728831291199, + "learning_rate": 6.488210818307906e-06, + "loss": 0.0353, + "step": 5074 + }, + { + "epoch": 3.5157603048146866, + "grad_norm": 0.6279569864273071, + "learning_rate": 6.487517337031901e-06, + "loss": 0.0329, + "step": 5075 + }, + { + "epoch": 3.5164530654658814, + "grad_norm": 0.5640904307365417, + "learning_rate": 6.486823855755895e-06, + "loss": 0.0209, + "step": 5076 + }, + { + "epoch": 3.5171458261170767, + "grad_norm": 0.5993984341621399, + "learning_rate": 6.48613037447989e-06, + "loss": 0.0286, + "step": 5077 + }, + { + "epoch": 3.5178385867682715, + "grad_norm": 0.6607270240783691, + "learning_rate": 6.485436893203884e-06, + "loss": 0.0301, + "step": 5078 + }, + { + "epoch": 3.518531347419467, + "grad_norm": 0.5850106477737427, + "learning_rate": 6.484743411927879e-06, + "loss": 0.0322, + "step": 5079 + }, + { + "epoch": 3.5192241080706617, + "grad_norm": 0.6616483330726624, + "learning_rate": 6.484049930651874e-06, + "loss": 0.0288, + "step": 5080 + }, + { + "epoch": 3.5199168687218565, + "grad_norm": 0.6212733387947083, + "learning_rate": 6.483356449375867e-06, + "loss": 0.029, + "step": 5081 + }, + { + "epoch": 3.5206096293730518, + "grad_norm": 0.6463752388954163, + "learning_rate": 6.482662968099862e-06, + "loss": 0.0245, + "step": 5082 + }, + { + "epoch": 3.5213023900242466, + "grad_norm": 0.6329712271690369, + "learning_rate": 6.481969486823856e-06, + "loss": 0.0407, + "step": 5083 + }, + { + "epoch": 3.5219951506754414, + "grad_norm": 0.5336253643035889, + "learning_rate": 6.481276005547851e-06, + "loss": 0.0256, + "step": 5084 + }, + { + "epoch": 3.5226879113266367, + "grad_norm": 0.6275019645690918, + "learning_rate": 6.480582524271846e-06, + "loss": 0.0322, + "step": 5085 + }, + { + "epoch": 3.5233806719778316, + "grad_norm": 0.6023123860359192, + "learning_rate": 6.479889042995839e-06, + "loss": 0.0383, + "step": 5086 + }, + { + "epoch": 3.524073432629027, + "grad_norm": 1.0357578992843628, + "learning_rate": 6.479195561719834e-06, + "loss": 0.0265, + "step": 5087 + }, + { + "epoch": 3.5247661932802217, + "grad_norm": 0.7056729197502136, + "learning_rate": 6.478502080443828e-06, + "loss": 0.0429, + "step": 5088 + }, + { + "epoch": 3.525458953931417, + "grad_norm": 0.56638503074646, + "learning_rate": 6.477808599167823e-06, + "loss": 0.0285, + "step": 5089 + }, + { + "epoch": 3.526151714582612, + "grad_norm": 0.5246169567108154, + "learning_rate": 6.477115117891818e-06, + "loss": 0.028, + "step": 5090 + }, + { + "epoch": 3.5268444752338066, + "grad_norm": 0.6372687220573425, + "learning_rate": 6.476421636615812e-06, + "loss": 0.0353, + "step": 5091 + }, + { + "epoch": 3.527537235885002, + "grad_norm": 0.5155999064445496, + "learning_rate": 6.475728155339806e-06, + "loss": 0.0248, + "step": 5092 + }, + { + "epoch": 3.5282299965361967, + "grad_norm": 0.544079601764679, + "learning_rate": 6.4750346740638e-06, + "loss": 0.0296, + "step": 5093 + }, + { + "epoch": 3.5289227571873916, + "grad_norm": 0.5598334074020386, + "learning_rate": 6.474341192787795e-06, + "loss": 0.0277, + "step": 5094 + }, + { + "epoch": 3.529615517838587, + "grad_norm": 0.7250769138336182, + "learning_rate": 6.47364771151179e-06, + "loss": 0.0437, + "step": 5095 + }, + { + "epoch": 3.5303082784897817, + "grad_norm": 0.6074924468994141, + "learning_rate": 6.472954230235784e-06, + "loss": 0.0318, + "step": 5096 + }, + { + "epoch": 3.5310010391409765, + "grad_norm": 0.5366249084472656, + "learning_rate": 6.472260748959779e-06, + "loss": 0.0277, + "step": 5097 + }, + { + "epoch": 3.531693799792172, + "grad_norm": 0.4772739112377167, + "learning_rate": 6.471567267683772e-06, + "loss": 0.0208, + "step": 5098 + }, + { + "epoch": 3.532386560443367, + "grad_norm": 0.6919244527816772, + "learning_rate": 6.470873786407767e-06, + "loss": 0.0311, + "step": 5099 + }, + { + "epoch": 3.533079321094562, + "grad_norm": 0.62872713804245, + "learning_rate": 6.470180305131762e-06, + "loss": 0.0392, + "step": 5100 + }, + { + "epoch": 3.5337720817457567, + "grad_norm": 0.5211807489395142, + "learning_rate": 6.469486823855756e-06, + "loss": 0.0223, + "step": 5101 + }, + { + "epoch": 3.534464842396952, + "grad_norm": 0.647637665271759, + "learning_rate": 6.468793342579751e-06, + "loss": 0.0378, + "step": 5102 + }, + { + "epoch": 3.535157603048147, + "grad_norm": 0.6397925019264221, + "learning_rate": 6.4680998613037445e-06, + "loss": 0.0343, + "step": 5103 + }, + { + "epoch": 3.5358503636993417, + "grad_norm": 0.487280935049057, + "learning_rate": 6.4674063800277394e-06, + "loss": 0.026, + "step": 5104 + }, + { + "epoch": 3.536543124350537, + "grad_norm": 0.5970995426177979, + "learning_rate": 6.466712898751734e-06, + "loss": 0.0341, + "step": 5105 + }, + { + "epoch": 3.537235885001732, + "grad_norm": 0.6427799463272095, + "learning_rate": 6.4660194174757285e-06, + "loss": 0.0416, + "step": 5106 + }, + { + "epoch": 3.5379286456529266, + "grad_norm": 0.8776400685310364, + "learning_rate": 6.465325936199723e-06, + "loss": 0.0409, + "step": 5107 + }, + { + "epoch": 3.538621406304122, + "grad_norm": 0.5709391236305237, + "learning_rate": 6.4646324549237175e-06, + "loss": 0.0311, + "step": 5108 + }, + { + "epoch": 3.539314166955317, + "grad_norm": 0.5883795022964478, + "learning_rate": 6.463938973647712e-06, + "loss": 0.0415, + "step": 5109 + }, + { + "epoch": 3.540006927606512, + "grad_norm": 0.6908285021781921, + "learning_rate": 6.463245492371707e-06, + "loss": 0.0381, + "step": 5110 + }, + { + "epoch": 3.540699688257707, + "grad_norm": 0.5694625973701477, + "learning_rate": 6.462552011095701e-06, + "loss": 0.0315, + "step": 5111 + }, + { + "epoch": 3.541392448908902, + "grad_norm": 0.7304494976997375, + "learning_rate": 6.4618585298196955e-06, + "loss": 0.0406, + "step": 5112 + }, + { + "epoch": 3.542085209560097, + "grad_norm": 0.5786203145980835, + "learning_rate": 6.46116504854369e-06, + "loss": 0.0264, + "step": 5113 + }, + { + "epoch": 3.542777970211292, + "grad_norm": 0.5530243515968323, + "learning_rate": 6.4604715672676845e-06, + "loss": 0.0256, + "step": 5114 + }, + { + "epoch": 3.543470730862487, + "grad_norm": 0.6862016916275024, + "learning_rate": 6.4597780859916795e-06, + "loss": 0.0346, + "step": 5115 + }, + { + "epoch": 3.544163491513682, + "grad_norm": 0.6406251192092896, + "learning_rate": 6.459084604715673e-06, + "loss": 0.0357, + "step": 5116 + }, + { + "epoch": 3.5448562521648768, + "grad_norm": 0.6029278039932251, + "learning_rate": 6.458391123439668e-06, + "loss": 0.0303, + "step": 5117 + }, + { + "epoch": 3.545549012816072, + "grad_norm": 0.49682673811912537, + "learning_rate": 6.457697642163662e-06, + "loss": 0.0281, + "step": 5118 + }, + { + "epoch": 3.5462417734672673, + "grad_norm": 0.8219375610351562, + "learning_rate": 6.457004160887657e-06, + "loss": 0.0295, + "step": 5119 + }, + { + "epoch": 3.546934534118462, + "grad_norm": 0.6140983700752258, + "learning_rate": 6.456310679611652e-06, + "loss": 0.0266, + "step": 5120 + }, + { + "epoch": 3.547627294769657, + "grad_norm": 0.6019349098205566, + "learning_rate": 6.455617198335645e-06, + "loss": 0.0299, + "step": 5121 + }, + { + "epoch": 3.5483200554208523, + "grad_norm": 0.5619654059410095, + "learning_rate": 6.45492371705964e-06, + "loss": 0.0308, + "step": 5122 + }, + { + "epoch": 3.549012816072047, + "grad_norm": 0.4877817928791046, + "learning_rate": 6.454230235783634e-06, + "loss": 0.0246, + "step": 5123 + }, + { + "epoch": 3.549705576723242, + "grad_norm": 0.9568673372268677, + "learning_rate": 6.453536754507629e-06, + "loss": 0.0333, + "step": 5124 + }, + { + "epoch": 3.550398337374437, + "grad_norm": 0.6768813133239746, + "learning_rate": 6.452843273231624e-06, + "loss": 0.0353, + "step": 5125 + }, + { + "epoch": 3.551091098025632, + "grad_norm": 0.5718116760253906, + "learning_rate": 6.452149791955618e-06, + "loss": 0.0321, + "step": 5126 + }, + { + "epoch": 3.551783858676827, + "grad_norm": 0.5333515405654907, + "learning_rate": 6.451456310679613e-06, + "loss": 0.0286, + "step": 5127 + }, + { + "epoch": 3.552476619328022, + "grad_norm": 0.6497219204902649, + "learning_rate": 6.450762829403606e-06, + "loss": 0.0338, + "step": 5128 + }, + { + "epoch": 3.5531693799792174, + "grad_norm": 0.6667235493659973, + "learning_rate": 6.450069348127601e-06, + "loss": 0.0318, + "step": 5129 + }, + { + "epoch": 3.5538621406304123, + "grad_norm": 0.6585575342178345, + "learning_rate": 6.449375866851596e-06, + "loss": 0.0321, + "step": 5130 + }, + { + "epoch": 3.554554901281607, + "grad_norm": 0.6151941418647766, + "learning_rate": 6.44868238557559e-06, + "loss": 0.0326, + "step": 5131 + }, + { + "epoch": 3.5552476619328024, + "grad_norm": 0.5822476744651794, + "learning_rate": 6.447988904299585e-06, + "loss": 0.0228, + "step": 5132 + }, + { + "epoch": 3.5559404225839972, + "grad_norm": 0.6461667418479919, + "learning_rate": 6.447295423023578e-06, + "loss": 0.035, + "step": 5133 + }, + { + "epoch": 3.556633183235192, + "grad_norm": 0.6769730448722839, + "learning_rate": 6.446601941747573e-06, + "loss": 0.0365, + "step": 5134 + }, + { + "epoch": 3.5573259438863873, + "grad_norm": 0.6281021237373352, + "learning_rate": 6.445908460471568e-06, + "loss": 0.0318, + "step": 5135 + }, + { + "epoch": 3.558018704537582, + "grad_norm": 0.5587422847747803, + "learning_rate": 6.445214979195562e-06, + "loss": 0.0346, + "step": 5136 + }, + { + "epoch": 3.558711465188777, + "grad_norm": 0.537922203540802, + "learning_rate": 6.444521497919557e-06, + "loss": 0.0313, + "step": 5137 + }, + { + "epoch": 3.5594042258399723, + "grad_norm": 0.7479827404022217, + "learning_rate": 6.443828016643551e-06, + "loss": 0.037, + "step": 5138 + }, + { + "epoch": 3.5600969864911676, + "grad_norm": 0.6676491498947144, + "learning_rate": 6.443134535367546e-06, + "loss": 0.0299, + "step": 5139 + }, + { + "epoch": 3.5607897471423624, + "grad_norm": 0.5506122708320618, + "learning_rate": 6.44244105409154e-06, + "loss": 0.0364, + "step": 5140 + }, + { + "epoch": 3.5614825077935572, + "grad_norm": 0.5065435767173767, + "learning_rate": 6.441747572815534e-06, + "loss": 0.027, + "step": 5141 + }, + { + "epoch": 3.5621752684447525, + "grad_norm": 0.6939265131950378, + "learning_rate": 6.441054091539529e-06, + "loss": 0.0326, + "step": 5142 + }, + { + "epoch": 3.5628680290959474, + "grad_norm": 0.7979405522346497, + "learning_rate": 6.440360610263523e-06, + "loss": 0.0424, + "step": 5143 + }, + { + "epoch": 3.563560789747142, + "grad_norm": 0.6120886206626892, + "learning_rate": 6.439667128987518e-06, + "loss": 0.0376, + "step": 5144 + }, + { + "epoch": 3.5642535503983375, + "grad_norm": 0.6197006106376648, + "learning_rate": 6.438973647711513e-06, + "loss": 0.0264, + "step": 5145 + }, + { + "epoch": 3.5649463110495323, + "grad_norm": 0.6210958957672119, + "learning_rate": 6.438280166435506e-06, + "loss": 0.0305, + "step": 5146 + }, + { + "epoch": 3.565639071700727, + "grad_norm": 0.6047160029411316, + "learning_rate": 6.437586685159501e-06, + "loss": 0.0295, + "step": 5147 + }, + { + "epoch": 3.5663318323519224, + "grad_norm": 0.5486659407615662, + "learning_rate": 6.436893203883495e-06, + "loss": 0.0276, + "step": 5148 + }, + { + "epoch": 3.5670245930031177, + "grad_norm": 0.6126028895378113, + "learning_rate": 6.43619972260749e-06, + "loss": 0.0346, + "step": 5149 + }, + { + "epoch": 3.5677173536543125, + "grad_norm": 0.6695546507835388, + "learning_rate": 6.435506241331485e-06, + "loss": 0.0379, + "step": 5150 + }, + { + "epoch": 3.5684101143055074, + "grad_norm": 0.5959687232971191, + "learning_rate": 6.4348127600554784e-06, + "loss": 0.0286, + "step": 5151 + }, + { + "epoch": 3.5691028749567026, + "grad_norm": 0.6917016506195068, + "learning_rate": 6.434119278779473e-06, + "loss": 0.0338, + "step": 5152 + }, + { + "epoch": 3.5697956356078975, + "grad_norm": 0.604941725730896, + "learning_rate": 6.4334257975034675e-06, + "loss": 0.0283, + "step": 5153 + }, + { + "epoch": 3.5704883962590923, + "grad_norm": 0.4962579011917114, + "learning_rate": 6.432732316227462e-06, + "loss": 0.0212, + "step": 5154 + }, + { + "epoch": 3.5711811569102876, + "grad_norm": 0.6287131309509277, + "learning_rate": 6.432038834951457e-06, + "loss": 0.0331, + "step": 5155 + }, + { + "epoch": 3.5718739175614824, + "grad_norm": 0.5796381831169128, + "learning_rate": 6.431345353675451e-06, + "loss": 0.0317, + "step": 5156 + }, + { + "epoch": 3.5725666782126773, + "grad_norm": 0.7629371881484985, + "learning_rate": 6.430651872399446e-06, + "loss": 0.0376, + "step": 5157 + }, + { + "epoch": 3.5732594388638725, + "grad_norm": 0.6308226585388184, + "learning_rate": 6.42995839112344e-06, + "loss": 0.0373, + "step": 5158 + }, + { + "epoch": 3.573952199515068, + "grad_norm": 0.5209526419639587, + "learning_rate": 6.4292649098474345e-06, + "loss": 0.0302, + "step": 5159 + }, + { + "epoch": 3.5746449601662627, + "grad_norm": 0.6444221138954163, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.0292, + "step": 5160 + }, + { + "epoch": 3.5753377208174575, + "grad_norm": 0.4564352035522461, + "learning_rate": 6.4278779472954235e-06, + "loss": 0.0208, + "step": 5161 + }, + { + "epoch": 3.5760304814686528, + "grad_norm": 0.5314273834228516, + "learning_rate": 6.4271844660194185e-06, + "loss": 0.0292, + "step": 5162 + }, + { + "epoch": 3.5767232421198476, + "grad_norm": 0.5213531851768494, + "learning_rate": 6.426490984743412e-06, + "loss": 0.0261, + "step": 5163 + }, + { + "epoch": 3.5774160027710424, + "grad_norm": 0.5768083333969116, + "learning_rate": 6.425797503467407e-06, + "loss": 0.028, + "step": 5164 + }, + { + "epoch": 3.5781087634222377, + "grad_norm": 0.5932965278625488, + "learning_rate": 6.425104022191402e-06, + "loss": 0.026, + "step": 5165 + }, + { + "epoch": 3.5788015240734325, + "grad_norm": 0.7689990401268005, + "learning_rate": 6.424410540915396e-06, + "loss": 0.0205, + "step": 5166 + }, + { + "epoch": 3.5794942847246274, + "grad_norm": 0.5870887637138367, + "learning_rate": 6.423717059639391e-06, + "loss": 0.0384, + "step": 5167 + }, + { + "epoch": 3.5801870453758227, + "grad_norm": 0.5823937058448792, + "learning_rate": 6.423023578363385e-06, + "loss": 0.0352, + "step": 5168 + }, + { + "epoch": 3.580879806027018, + "grad_norm": 0.7018173933029175, + "learning_rate": 6.422330097087379e-06, + "loss": 0.0423, + "step": 5169 + }, + { + "epoch": 3.5815725666782128, + "grad_norm": 0.545872151851654, + "learning_rate": 6.421636615811374e-06, + "loss": 0.0307, + "step": 5170 + }, + { + "epoch": 3.5822653273294076, + "grad_norm": 0.5199849009513855, + "learning_rate": 6.420943134535368e-06, + "loss": 0.0292, + "step": 5171 + }, + { + "epoch": 3.582958087980603, + "grad_norm": 0.41827356815338135, + "learning_rate": 6.420249653259363e-06, + "loss": 0.02, + "step": 5172 + }, + { + "epoch": 3.5836508486317977, + "grad_norm": 0.6563012599945068, + "learning_rate": 6.419556171983357e-06, + "loss": 0.0389, + "step": 5173 + }, + { + "epoch": 3.5843436092829926, + "grad_norm": 0.6314850449562073, + "learning_rate": 6.418862690707352e-06, + "loss": 0.0326, + "step": 5174 + }, + { + "epoch": 3.585036369934188, + "grad_norm": 0.475079745054245, + "learning_rate": 6.418169209431347e-06, + "loss": 0.0208, + "step": 5175 + }, + { + "epoch": 3.5857291305853827, + "grad_norm": 0.6002709269523621, + "learning_rate": 6.41747572815534e-06, + "loss": 0.0298, + "step": 5176 + }, + { + "epoch": 3.5864218912365775, + "grad_norm": 0.5082271099090576, + "learning_rate": 6.416782246879335e-06, + "loss": 0.0263, + "step": 5177 + }, + { + "epoch": 3.587114651887773, + "grad_norm": 0.584649384021759, + "learning_rate": 6.416088765603329e-06, + "loss": 0.0319, + "step": 5178 + }, + { + "epoch": 3.5878074125389676, + "grad_norm": 0.6434041857719421, + "learning_rate": 6.415395284327324e-06, + "loss": 0.0393, + "step": 5179 + }, + { + "epoch": 3.588500173190163, + "grad_norm": 0.6173291802406311, + "learning_rate": 6.414701803051319e-06, + "loss": 0.0365, + "step": 5180 + }, + { + "epoch": 3.5891929338413577, + "grad_norm": 0.6330900192260742, + "learning_rate": 6.414008321775312e-06, + "loss": 0.0324, + "step": 5181 + }, + { + "epoch": 3.589885694492553, + "grad_norm": 0.5519096255302429, + "learning_rate": 6.413314840499307e-06, + "loss": 0.0266, + "step": 5182 + }, + { + "epoch": 3.590578455143748, + "grad_norm": 0.5743852853775024, + "learning_rate": 6.412621359223301e-06, + "loss": 0.0264, + "step": 5183 + }, + { + "epoch": 3.5912712157949427, + "grad_norm": 0.6207646727561951, + "learning_rate": 6.411927877947296e-06, + "loss": 0.0263, + "step": 5184 + }, + { + "epoch": 3.591963976446138, + "grad_norm": 0.5601102709770203, + "learning_rate": 6.411234396671291e-06, + "loss": 0.026, + "step": 5185 + }, + { + "epoch": 3.592656737097333, + "grad_norm": 0.596768319606781, + "learning_rate": 6.410540915395285e-06, + "loss": 0.0319, + "step": 5186 + }, + { + "epoch": 3.5933494977485276, + "grad_norm": 0.6190519332885742, + "learning_rate": 6.40984743411928e-06, + "loss": 0.0253, + "step": 5187 + }, + { + "epoch": 3.594042258399723, + "grad_norm": 0.5937433838844299, + "learning_rate": 6.409153952843273e-06, + "loss": 0.0319, + "step": 5188 + }, + { + "epoch": 3.5947350190509177, + "grad_norm": 0.5507180094718933, + "learning_rate": 6.408460471567268e-06, + "loss": 0.025, + "step": 5189 + }, + { + "epoch": 3.595427779702113, + "grad_norm": 0.6053184866905212, + "learning_rate": 6.407766990291263e-06, + "loss": 0.0396, + "step": 5190 + }, + { + "epoch": 3.596120540353308, + "grad_norm": 0.6637827157974243, + "learning_rate": 6.407073509015257e-06, + "loss": 0.0302, + "step": 5191 + }, + { + "epoch": 3.596813301004503, + "grad_norm": 0.5112492442131042, + "learning_rate": 6.406380027739252e-06, + "loss": 0.0257, + "step": 5192 + }, + { + "epoch": 3.597506061655698, + "grad_norm": 0.5539286136627197, + "learning_rate": 6.405686546463245e-06, + "loss": 0.0286, + "step": 5193 + }, + { + "epoch": 3.598198822306893, + "grad_norm": 0.5003301501274109, + "learning_rate": 6.40499306518724e-06, + "loss": 0.0284, + "step": 5194 + }, + { + "epoch": 3.598891582958088, + "grad_norm": 0.5797738432884216, + "learning_rate": 6.404299583911235e-06, + "loss": 0.0317, + "step": 5195 + }, + { + "epoch": 3.599584343609283, + "grad_norm": 0.6324701905250549, + "learning_rate": 6.403606102635229e-06, + "loss": 0.0317, + "step": 5196 + }, + { + "epoch": 3.6002771042604778, + "grad_norm": 0.7718259692192078, + "learning_rate": 6.402912621359224e-06, + "loss": 0.0436, + "step": 5197 + }, + { + "epoch": 3.600969864911673, + "grad_norm": 0.5175343751907349, + "learning_rate": 6.402219140083218e-06, + "loss": 0.03, + "step": 5198 + }, + { + "epoch": 3.601662625562868, + "grad_norm": 0.688150942325592, + "learning_rate": 6.401525658807212e-06, + "loss": 0.0352, + "step": 5199 + }, + { + "epoch": 3.602355386214063, + "grad_norm": 0.4672299027442932, + "learning_rate": 6.400832177531207e-06, + "loss": 0.023, + "step": 5200 + }, + { + "epoch": 3.603048146865258, + "grad_norm": 0.49436065554618835, + "learning_rate": 6.400138696255201e-06, + "loss": 0.022, + "step": 5201 + }, + { + "epoch": 3.6037409075164533, + "grad_norm": 0.6367473006248474, + "learning_rate": 6.399445214979196e-06, + "loss": 0.03, + "step": 5202 + }, + { + "epoch": 3.604433668167648, + "grad_norm": 0.5535686612129211, + "learning_rate": 6.39875173370319e-06, + "loss": 0.0346, + "step": 5203 + }, + { + "epoch": 3.605126428818843, + "grad_norm": 0.5234419107437134, + "learning_rate": 6.398058252427185e-06, + "loss": 0.0279, + "step": 5204 + }, + { + "epoch": 3.605819189470038, + "grad_norm": 0.5995169281959534, + "learning_rate": 6.39736477115118e-06, + "loss": 0.028, + "step": 5205 + }, + { + "epoch": 3.606511950121233, + "grad_norm": 0.5009410381317139, + "learning_rate": 6.3966712898751735e-06, + "loss": 0.0253, + "step": 5206 + }, + { + "epoch": 3.607204710772428, + "grad_norm": 0.6058676838874817, + "learning_rate": 6.3959778085991685e-06, + "loss": 0.0297, + "step": 5207 + }, + { + "epoch": 3.607897471423623, + "grad_norm": 0.5171012282371521, + "learning_rate": 6.3952843273231625e-06, + "loss": 0.0264, + "step": 5208 + }, + { + "epoch": 3.608590232074818, + "grad_norm": 0.7421143054962158, + "learning_rate": 6.3945908460471575e-06, + "loss": 0.0274, + "step": 5209 + }, + { + "epoch": 3.6092829927260133, + "grad_norm": 0.5940254926681519, + "learning_rate": 6.393897364771152e-06, + "loss": 0.0492, + "step": 5210 + }, + { + "epoch": 3.609975753377208, + "grad_norm": 0.5492020845413208, + "learning_rate": 6.393203883495146e-06, + "loss": 0.0324, + "step": 5211 + }, + { + "epoch": 3.6106685140284034, + "grad_norm": 0.5443360209465027, + "learning_rate": 6.392510402219141e-06, + "loss": 0.0399, + "step": 5212 + }, + { + "epoch": 3.611361274679598, + "grad_norm": 0.752368688583374, + "learning_rate": 6.391816920943135e-06, + "loss": 0.0441, + "step": 5213 + }, + { + "epoch": 3.612054035330793, + "grad_norm": 0.6221478581428528, + "learning_rate": 6.39112343966713e-06, + "loss": 0.0338, + "step": 5214 + }, + { + "epoch": 3.6127467959819883, + "grad_norm": 0.487557590007782, + "learning_rate": 6.3904299583911245e-06, + "loss": 0.0269, + "step": 5215 + }, + { + "epoch": 3.613439556633183, + "grad_norm": 0.5465534329414368, + "learning_rate": 6.389736477115119e-06, + "loss": 0.0324, + "step": 5216 + }, + { + "epoch": 3.614132317284378, + "grad_norm": 0.5025929808616638, + "learning_rate": 6.3890429958391136e-06, + "loss": 0.0267, + "step": 5217 + }, + { + "epoch": 3.6148250779355733, + "grad_norm": 0.5270314812660217, + "learning_rate": 6.388349514563107e-06, + "loss": 0.0267, + "step": 5218 + }, + { + "epoch": 3.615517838586768, + "grad_norm": 0.5299199223518372, + "learning_rate": 6.387656033287102e-06, + "loss": 0.025, + "step": 5219 + }, + { + "epoch": 3.6162105992379634, + "grad_norm": 0.6264725923538208, + "learning_rate": 6.386962552011097e-06, + "loss": 0.0242, + "step": 5220 + }, + { + "epoch": 3.6169033598891582, + "grad_norm": 0.5444220304489136, + "learning_rate": 6.386269070735091e-06, + "loss": 0.0275, + "step": 5221 + }, + { + "epoch": 3.6175961205403535, + "grad_norm": 0.654484212398529, + "learning_rate": 6.385575589459086e-06, + "loss": 0.0392, + "step": 5222 + }, + { + "epoch": 3.6182888811915483, + "grad_norm": 0.5375171303749084, + "learning_rate": 6.384882108183079e-06, + "loss": 0.0301, + "step": 5223 + }, + { + "epoch": 3.618981641842743, + "grad_norm": 0.6169750690460205, + "learning_rate": 6.384188626907074e-06, + "loss": 0.0255, + "step": 5224 + }, + { + "epoch": 3.6196744024939385, + "grad_norm": 0.579149067401886, + "learning_rate": 6.383495145631069e-06, + "loss": 0.0351, + "step": 5225 + }, + { + "epoch": 3.6203671631451333, + "grad_norm": 0.8136575222015381, + "learning_rate": 6.382801664355063e-06, + "loss": 0.036, + "step": 5226 + }, + { + "epoch": 3.621059923796328, + "grad_norm": 0.7540092468261719, + "learning_rate": 6.382108183079058e-06, + "loss": 0.0391, + "step": 5227 + }, + { + "epoch": 3.6217526844475234, + "grad_norm": 0.5661916732788086, + "learning_rate": 6.381414701803051e-06, + "loss": 0.0262, + "step": 5228 + }, + { + "epoch": 3.6224454450987182, + "grad_norm": 0.6469407081604004, + "learning_rate": 6.380721220527046e-06, + "loss": 0.0331, + "step": 5229 + }, + { + "epoch": 3.6231382057499135, + "grad_norm": 0.6229873299598694, + "learning_rate": 6.380027739251041e-06, + "loss": 0.0283, + "step": 5230 + }, + { + "epoch": 3.6238309664011084, + "grad_norm": 0.575975239276886, + "learning_rate": 6.379334257975035e-06, + "loss": 0.0346, + "step": 5231 + }, + { + "epoch": 3.6245237270523036, + "grad_norm": 0.5156070590019226, + "learning_rate": 6.37864077669903e-06, + "loss": 0.0218, + "step": 5232 + }, + { + "epoch": 3.6252164877034985, + "grad_norm": 0.6007578372955322, + "learning_rate": 6.377947295423024e-06, + "loss": 0.0266, + "step": 5233 + }, + { + "epoch": 3.6259092483546933, + "grad_norm": 0.6745254397392273, + "learning_rate": 6.377253814147019e-06, + "loss": 0.0343, + "step": 5234 + }, + { + "epoch": 3.6266020090058886, + "grad_norm": 0.601026177406311, + "learning_rate": 6.376560332871014e-06, + "loss": 0.0364, + "step": 5235 + }, + { + "epoch": 3.6272947696570834, + "grad_norm": 0.5416752099990845, + "learning_rate": 6.375866851595007e-06, + "loss": 0.0275, + "step": 5236 + }, + { + "epoch": 3.6279875303082783, + "grad_norm": 0.5874926447868347, + "learning_rate": 6.375173370319002e-06, + "loss": 0.0304, + "step": 5237 + }, + { + "epoch": 3.6286802909594735, + "grad_norm": 0.5943888425827026, + "learning_rate": 6.374479889042996e-06, + "loss": 0.0278, + "step": 5238 + }, + { + "epoch": 3.6293730516106684, + "grad_norm": 0.5624053478240967, + "learning_rate": 6.373786407766991e-06, + "loss": 0.028, + "step": 5239 + }, + { + "epoch": 3.6300658122618636, + "grad_norm": 0.5878169536590576, + "learning_rate": 6.373092926490986e-06, + "loss": 0.0347, + "step": 5240 + }, + { + "epoch": 3.6307585729130585, + "grad_norm": 0.5780265927314758, + "learning_rate": 6.372399445214979e-06, + "loss": 0.0352, + "step": 5241 + }, + { + "epoch": 3.6314513335642538, + "grad_norm": 0.7032192349433899, + "learning_rate": 6.371705963938974e-06, + "loss": 0.0295, + "step": 5242 + }, + { + "epoch": 3.6321440942154486, + "grad_norm": 0.6351820230484009, + "learning_rate": 6.371012482662968e-06, + "loss": 0.0336, + "step": 5243 + }, + { + "epoch": 3.6328368548666434, + "grad_norm": 0.5924007296562195, + "learning_rate": 6.370319001386963e-06, + "loss": 0.0312, + "step": 5244 + }, + { + "epoch": 3.6335296155178387, + "grad_norm": 0.6216744780540466, + "learning_rate": 6.369625520110958e-06, + "loss": 0.0327, + "step": 5245 + }, + { + "epoch": 3.6342223761690335, + "grad_norm": 0.5291363596916199, + "learning_rate": 6.368932038834952e-06, + "loss": 0.0266, + "step": 5246 + }, + { + "epoch": 3.6349151368202284, + "grad_norm": 0.628518283367157, + "learning_rate": 6.368238557558946e-06, + "loss": 0.0397, + "step": 5247 + }, + { + "epoch": 3.6356078974714237, + "grad_norm": 0.5681959986686707, + "learning_rate": 6.36754507628294e-06, + "loss": 0.0274, + "step": 5248 + }, + { + "epoch": 3.6363006581226185, + "grad_norm": 0.5964863896369934, + "learning_rate": 6.366851595006935e-06, + "loss": 0.0301, + "step": 5249 + }, + { + "epoch": 3.6369934187738138, + "grad_norm": 0.5186253786087036, + "learning_rate": 6.36615811373093e-06, + "loss": 0.0273, + "step": 5250 + }, + { + "epoch": 3.6376861794250086, + "grad_norm": 0.5899990797042847, + "learning_rate": 6.365464632454924e-06, + "loss": 0.0408, + "step": 5251 + }, + { + "epoch": 3.638378940076204, + "grad_norm": 0.5952342748641968, + "learning_rate": 6.364771151178919e-06, + "loss": 0.0309, + "step": 5252 + }, + { + "epoch": 3.6390717007273987, + "grad_norm": 0.5763419270515442, + "learning_rate": 6.3640776699029125e-06, + "loss": 0.0288, + "step": 5253 + }, + { + "epoch": 3.6397644613785936, + "grad_norm": 0.6522453427314758, + "learning_rate": 6.3633841886269075e-06, + "loss": 0.0329, + "step": 5254 + }, + { + "epoch": 3.640457222029789, + "grad_norm": 0.5152024626731873, + "learning_rate": 6.362690707350902e-06, + "loss": 0.0246, + "step": 5255 + }, + { + "epoch": 3.6411499826809837, + "grad_norm": 0.6586542725563049, + "learning_rate": 6.3619972260748965e-06, + "loss": 0.034, + "step": 5256 + }, + { + "epoch": 3.6418427433321785, + "grad_norm": 0.5849273800849915, + "learning_rate": 6.361303744798891e-06, + "loss": 0.033, + "step": 5257 + }, + { + "epoch": 3.642535503983374, + "grad_norm": 0.6569483876228333, + "learning_rate": 6.360610263522885e-06, + "loss": 0.0356, + "step": 5258 + }, + { + "epoch": 3.6432282646345686, + "grad_norm": 0.5631612539291382, + "learning_rate": 6.35991678224688e-06, + "loss": 0.0317, + "step": 5259 + }, + { + "epoch": 3.643921025285764, + "grad_norm": 0.5431450009346008, + "learning_rate": 6.3592233009708745e-06, + "loss": 0.0265, + "step": 5260 + }, + { + "epoch": 3.6446137859369587, + "grad_norm": 0.7423073649406433, + "learning_rate": 6.358529819694869e-06, + "loss": 0.0394, + "step": 5261 + }, + { + "epoch": 3.645306546588154, + "grad_norm": 0.7692938446998596, + "learning_rate": 6.3578363384188635e-06, + "loss": 0.0282, + "step": 5262 + }, + { + "epoch": 3.645999307239349, + "grad_norm": 0.6432677507400513, + "learning_rate": 6.357142857142858e-06, + "loss": 0.0289, + "step": 5263 + }, + { + "epoch": 3.6466920678905437, + "grad_norm": 0.7043903470039368, + "learning_rate": 6.3564493758668526e-06, + "loss": 0.0382, + "step": 5264 + }, + { + "epoch": 3.647384828541739, + "grad_norm": 0.5756590962409973, + "learning_rate": 6.3557558945908475e-06, + "loss": 0.0293, + "step": 5265 + }, + { + "epoch": 3.648077589192934, + "grad_norm": 0.47525158524513245, + "learning_rate": 6.355062413314841e-06, + "loss": 0.0251, + "step": 5266 + }, + { + "epoch": 3.6487703498441286, + "grad_norm": 0.6053153276443481, + "learning_rate": 6.354368932038836e-06, + "loss": 0.0301, + "step": 5267 + }, + { + "epoch": 3.649463110495324, + "grad_norm": 0.7235785126686096, + "learning_rate": 6.35367545076283e-06, + "loss": 0.0298, + "step": 5268 + }, + { + "epoch": 3.6501558711465187, + "grad_norm": 0.6563252210617065, + "learning_rate": 6.352981969486825e-06, + "loss": 0.0248, + "step": 5269 + }, + { + "epoch": 3.650848631797714, + "grad_norm": 0.5949928164482117, + "learning_rate": 6.35228848821082e-06, + "loss": 0.0343, + "step": 5270 + }, + { + "epoch": 3.651541392448909, + "grad_norm": 0.5927355289459229, + "learning_rate": 6.351595006934813e-06, + "loss": 0.024, + "step": 5271 + }, + { + "epoch": 3.652234153100104, + "grad_norm": 0.7582287788391113, + "learning_rate": 6.350901525658808e-06, + "loss": 0.0479, + "step": 5272 + }, + { + "epoch": 3.652926913751299, + "grad_norm": 0.5354639887809753, + "learning_rate": 6.350208044382802e-06, + "loss": 0.0247, + "step": 5273 + }, + { + "epoch": 3.653619674402494, + "grad_norm": 0.548228919506073, + "learning_rate": 6.349514563106797e-06, + "loss": 0.0236, + "step": 5274 + }, + { + "epoch": 3.654312435053689, + "grad_norm": 0.5116567015647888, + "learning_rate": 6.348821081830792e-06, + "loss": 0.0194, + "step": 5275 + }, + { + "epoch": 3.655005195704884, + "grad_norm": 0.4989963471889496, + "learning_rate": 6.348127600554785e-06, + "loss": 0.0246, + "step": 5276 + }, + { + "epoch": 3.6556979563560787, + "grad_norm": 0.6131188869476318, + "learning_rate": 6.34743411927878e-06, + "loss": 0.0275, + "step": 5277 + }, + { + "epoch": 3.656390717007274, + "grad_norm": 0.5656293034553528, + "learning_rate": 6.346740638002774e-06, + "loss": 0.0253, + "step": 5278 + }, + { + "epoch": 3.657083477658469, + "grad_norm": 0.6491434574127197, + "learning_rate": 6.346047156726769e-06, + "loss": 0.0323, + "step": 5279 + }, + { + "epoch": 3.657776238309664, + "grad_norm": 0.6328729391098022, + "learning_rate": 6.345353675450764e-06, + "loss": 0.0336, + "step": 5280 + }, + { + "epoch": 3.658468998960859, + "grad_norm": 0.5684226751327515, + "learning_rate": 6.344660194174758e-06, + "loss": 0.0307, + "step": 5281 + }, + { + "epoch": 3.6591617596120543, + "grad_norm": 0.8439211249351501, + "learning_rate": 6.343966712898753e-06, + "loss": 0.0269, + "step": 5282 + }, + { + "epoch": 3.659854520263249, + "grad_norm": 0.7445356249809265, + "learning_rate": 6.343273231622746e-06, + "loss": 0.022, + "step": 5283 + }, + { + "epoch": 3.660547280914444, + "grad_norm": 0.6294088959693909, + "learning_rate": 6.342579750346741e-06, + "loss": 0.0424, + "step": 5284 + }, + { + "epoch": 3.661240041565639, + "grad_norm": 0.6228764057159424, + "learning_rate": 6.341886269070736e-06, + "loss": 0.0316, + "step": 5285 + }, + { + "epoch": 3.661932802216834, + "grad_norm": 0.5894211530685425, + "learning_rate": 6.34119278779473e-06, + "loss": 0.0377, + "step": 5286 + }, + { + "epoch": 3.662625562868029, + "grad_norm": 0.6125890612602234, + "learning_rate": 6.340499306518725e-06, + "loss": 0.0321, + "step": 5287 + }, + { + "epoch": 3.663318323519224, + "grad_norm": 0.5297964215278625, + "learning_rate": 6.339805825242718e-06, + "loss": 0.0252, + "step": 5288 + }, + { + "epoch": 3.664011084170419, + "grad_norm": 0.5186530947685242, + "learning_rate": 6.339112343966713e-06, + "loss": 0.0262, + "step": 5289 + }, + { + "epoch": 3.6647038448216143, + "grad_norm": 0.5094587206840515, + "learning_rate": 6.338418862690708e-06, + "loss": 0.0237, + "step": 5290 + }, + { + "epoch": 3.665396605472809, + "grad_norm": 0.6024801731109619, + "learning_rate": 6.337725381414702e-06, + "loss": 0.0303, + "step": 5291 + }, + { + "epoch": 3.6660893661240044, + "grad_norm": 0.5105265378952026, + "learning_rate": 6.337031900138697e-06, + "loss": 0.023, + "step": 5292 + }, + { + "epoch": 3.666782126775199, + "grad_norm": 0.5699858665466309, + "learning_rate": 6.336338418862691e-06, + "loss": 0.0275, + "step": 5293 + }, + { + "epoch": 3.667474887426394, + "grad_norm": 0.474332332611084, + "learning_rate": 6.335644937586686e-06, + "loss": 0.0217, + "step": 5294 + }, + { + "epoch": 3.6681676480775893, + "grad_norm": 0.5792379379272461, + "learning_rate": 6.33495145631068e-06, + "loss": 0.0297, + "step": 5295 + }, + { + "epoch": 3.668860408728784, + "grad_norm": 0.6319828629493713, + "learning_rate": 6.334257975034674e-06, + "loss": 0.0402, + "step": 5296 + }, + { + "epoch": 3.669553169379979, + "grad_norm": 0.5288411378860474, + "learning_rate": 6.333564493758669e-06, + "loss": 0.0344, + "step": 5297 + }, + { + "epoch": 3.6702459300311743, + "grad_norm": 0.48269525170326233, + "learning_rate": 6.332871012482663e-06, + "loss": 0.0209, + "step": 5298 + }, + { + "epoch": 3.670938690682369, + "grad_norm": 0.6102100014686584, + "learning_rate": 6.332177531206658e-06, + "loss": 0.0266, + "step": 5299 + }, + { + "epoch": 3.6716314513335644, + "grad_norm": 0.532383918762207, + "learning_rate": 6.331484049930653e-06, + "loss": 0.0235, + "step": 5300 + }, + { + "epoch": 3.6723242119847592, + "grad_norm": 0.48872363567352295, + "learning_rate": 6.3307905686546465e-06, + "loss": 0.0203, + "step": 5301 + }, + { + "epoch": 3.6730169726359545, + "grad_norm": 0.5926398634910583, + "learning_rate": 6.330097087378641e-06, + "loss": 0.0295, + "step": 5302 + }, + { + "epoch": 3.6737097332871493, + "grad_norm": 0.525722324848175, + "learning_rate": 6.3294036061026355e-06, + "loss": 0.023, + "step": 5303 + }, + { + "epoch": 3.674402493938344, + "grad_norm": 0.5855699777603149, + "learning_rate": 6.32871012482663e-06, + "loss": 0.0382, + "step": 5304 + }, + { + "epoch": 3.6750952545895395, + "grad_norm": 0.6651132106781006, + "learning_rate": 6.328016643550625e-06, + "loss": 0.0303, + "step": 5305 + }, + { + "epoch": 3.6757880152407343, + "grad_norm": 0.617127001285553, + "learning_rate": 6.327323162274619e-06, + "loss": 0.0302, + "step": 5306 + }, + { + "epoch": 3.676480775891929, + "grad_norm": 0.5854039192199707, + "learning_rate": 6.3266296809986135e-06, + "loss": 0.0241, + "step": 5307 + }, + { + "epoch": 3.6771735365431244, + "grad_norm": 0.559038519859314, + "learning_rate": 6.325936199722608e-06, + "loss": 0.028, + "step": 5308 + }, + { + "epoch": 3.6778662971943192, + "grad_norm": 0.6301884055137634, + "learning_rate": 6.3252427184466025e-06, + "loss": 0.0325, + "step": 5309 + }, + { + "epoch": 3.6785590578455145, + "grad_norm": 0.6372119784355164, + "learning_rate": 6.3245492371705975e-06, + "loss": 0.0301, + "step": 5310 + }, + { + "epoch": 3.6792518184967093, + "grad_norm": 0.6511542201042175, + "learning_rate": 6.3238557558945916e-06, + "loss": 0.026, + "step": 5311 + }, + { + "epoch": 3.6799445791479046, + "grad_norm": 0.67556232213974, + "learning_rate": 6.3231622746185865e-06, + "loss": 0.0337, + "step": 5312 + }, + { + "epoch": 3.6806373397990995, + "grad_norm": 0.605854868888855, + "learning_rate": 6.32246879334258e-06, + "loss": 0.0328, + "step": 5313 + }, + { + "epoch": 3.6813301004502943, + "grad_norm": 0.5693877935409546, + "learning_rate": 6.321775312066575e-06, + "loss": 0.0285, + "step": 5314 + }, + { + "epoch": 3.6820228611014896, + "grad_norm": 0.6283695697784424, + "learning_rate": 6.32108183079057e-06, + "loss": 0.0318, + "step": 5315 + }, + { + "epoch": 3.6827156217526844, + "grad_norm": 0.6185287833213806, + "learning_rate": 6.320388349514564e-06, + "loss": 0.0278, + "step": 5316 + }, + { + "epoch": 3.6834083824038792, + "grad_norm": 0.5208211541175842, + "learning_rate": 6.319694868238559e-06, + "loss": 0.026, + "step": 5317 + }, + { + "epoch": 3.6841011430550745, + "grad_norm": 0.5192283987998962, + "learning_rate": 6.319001386962552e-06, + "loss": 0.0299, + "step": 5318 + }, + { + "epoch": 3.6847939037062694, + "grad_norm": 0.6171731352806091, + "learning_rate": 6.318307905686547e-06, + "loss": 0.031, + "step": 5319 + }, + { + "epoch": 3.6854866643574646, + "grad_norm": 0.5296388864517212, + "learning_rate": 6.317614424410542e-06, + "loss": 0.0316, + "step": 5320 + }, + { + "epoch": 3.6861794250086595, + "grad_norm": 0.6835871338844299, + "learning_rate": 6.316920943134536e-06, + "loss": 0.0355, + "step": 5321 + }, + { + "epoch": 3.6868721856598548, + "grad_norm": 0.5253806710243225, + "learning_rate": 6.316227461858531e-06, + "loss": 0.0294, + "step": 5322 + }, + { + "epoch": 3.6875649463110496, + "grad_norm": 0.5869537591934204, + "learning_rate": 6.315533980582525e-06, + "loss": 0.0274, + "step": 5323 + }, + { + "epoch": 3.6882577069622444, + "grad_norm": 0.5435597896575928, + "learning_rate": 6.314840499306519e-06, + "loss": 0.0321, + "step": 5324 + }, + { + "epoch": 3.6889504676134397, + "grad_norm": 0.5530980825424194, + "learning_rate": 6.314147018030514e-06, + "loss": 0.027, + "step": 5325 + }, + { + "epoch": 3.6896432282646345, + "grad_norm": 0.5563802123069763, + "learning_rate": 6.313453536754508e-06, + "loss": 0.0292, + "step": 5326 + }, + { + "epoch": 3.6903359889158294, + "grad_norm": 0.4938030540943146, + "learning_rate": 6.312760055478503e-06, + "loss": 0.0224, + "step": 5327 + }, + { + "epoch": 3.6910287495670246, + "grad_norm": 0.5866602659225464, + "learning_rate": 6.312066574202497e-06, + "loss": 0.0284, + "step": 5328 + }, + { + "epoch": 3.6917215102182195, + "grad_norm": 0.6909619569778442, + "learning_rate": 6.311373092926492e-06, + "loss": 0.0453, + "step": 5329 + }, + { + "epoch": 3.6924142708694148, + "grad_norm": 0.6352144479751587, + "learning_rate": 6.310679611650487e-06, + "loss": 0.0344, + "step": 5330 + }, + { + "epoch": 3.6931070315206096, + "grad_norm": 0.6577898859977722, + "learning_rate": 6.30998613037448e-06, + "loss": 0.0373, + "step": 5331 + }, + { + "epoch": 3.693799792171805, + "grad_norm": 0.5675220489501953, + "learning_rate": 6.309292649098475e-06, + "loss": 0.0272, + "step": 5332 + }, + { + "epoch": 3.6944925528229997, + "grad_norm": 0.702862560749054, + "learning_rate": 6.308599167822469e-06, + "loss": 0.0369, + "step": 5333 + }, + { + "epoch": 3.6951853134741945, + "grad_norm": 0.7315636873245239, + "learning_rate": 6.307905686546464e-06, + "loss": 0.0381, + "step": 5334 + }, + { + "epoch": 3.69587807412539, + "grad_norm": 0.6276235580444336, + "learning_rate": 6.307212205270459e-06, + "loss": 0.0262, + "step": 5335 + }, + { + "epoch": 3.6965708347765847, + "grad_norm": 0.4969485402107239, + "learning_rate": 6.306518723994452e-06, + "loss": 0.0351, + "step": 5336 + }, + { + "epoch": 3.6972635954277795, + "grad_norm": 0.6115286946296692, + "learning_rate": 6.305825242718447e-06, + "loss": 0.0372, + "step": 5337 + }, + { + "epoch": 3.6979563560789748, + "grad_norm": 0.6383223533630371, + "learning_rate": 6.305131761442441e-06, + "loss": 0.0347, + "step": 5338 + }, + { + "epoch": 3.6986491167301696, + "grad_norm": 0.6415778994560242, + "learning_rate": 6.304438280166436e-06, + "loss": 0.0291, + "step": 5339 + }, + { + "epoch": 3.699341877381365, + "grad_norm": 0.593380331993103, + "learning_rate": 6.303744798890431e-06, + "loss": 0.033, + "step": 5340 + }, + { + "epoch": 3.7000346380325597, + "grad_norm": 0.5095431208610535, + "learning_rate": 6.303051317614425e-06, + "loss": 0.0277, + "step": 5341 + }, + { + "epoch": 3.700727398683755, + "grad_norm": 0.6358219981193542, + "learning_rate": 6.30235783633842e-06, + "loss": 0.0368, + "step": 5342 + }, + { + "epoch": 3.70142015933495, + "grad_norm": 0.4579196572303772, + "learning_rate": 6.301664355062413e-06, + "loss": 0.0236, + "step": 5343 + }, + { + "epoch": 3.7021129199861447, + "grad_norm": 0.6052901744842529, + "learning_rate": 6.300970873786408e-06, + "loss": 0.0378, + "step": 5344 + }, + { + "epoch": 3.70280568063734, + "grad_norm": 0.5675000548362732, + "learning_rate": 6.300277392510403e-06, + "loss": 0.0292, + "step": 5345 + }, + { + "epoch": 3.703498441288535, + "grad_norm": 0.5974627137184143, + "learning_rate": 6.299583911234397e-06, + "loss": 0.0304, + "step": 5346 + }, + { + "epoch": 3.7041912019397296, + "grad_norm": 0.4792574942111969, + "learning_rate": 6.298890429958392e-06, + "loss": 0.0201, + "step": 5347 + }, + { + "epoch": 3.704883962590925, + "grad_norm": 0.7189779877662659, + "learning_rate": 6.2981969486823855e-06, + "loss": 0.0322, + "step": 5348 + }, + { + "epoch": 3.7055767232421197, + "grad_norm": 0.459058940410614, + "learning_rate": 6.29750346740638e-06, + "loss": 0.0204, + "step": 5349 + }, + { + "epoch": 3.706269483893315, + "grad_norm": 0.633417546749115, + "learning_rate": 6.296809986130375e-06, + "loss": 0.0332, + "step": 5350 + }, + { + "epoch": 3.70696224454451, + "grad_norm": 0.6857876181602478, + "learning_rate": 6.296116504854369e-06, + "loss": 0.0356, + "step": 5351 + }, + { + "epoch": 3.707655005195705, + "grad_norm": 0.6905134916305542, + "learning_rate": 6.295423023578364e-06, + "loss": 0.027, + "step": 5352 + }, + { + "epoch": 3.7083477658469, + "grad_norm": 0.7333101034164429, + "learning_rate": 6.294729542302358e-06, + "loss": 0.0356, + "step": 5353 + }, + { + "epoch": 3.709040526498095, + "grad_norm": 0.6089735627174377, + "learning_rate": 6.2940360610263525e-06, + "loss": 0.0354, + "step": 5354 + }, + { + "epoch": 3.70973328714929, + "grad_norm": 0.6550477743148804, + "learning_rate": 6.2933425797503475e-06, + "loss": 0.0431, + "step": 5355 + }, + { + "epoch": 3.710426047800485, + "grad_norm": 0.5756028890609741, + "learning_rate": 6.2926490984743415e-06, + "loss": 0.0302, + "step": 5356 + }, + { + "epoch": 3.7111188084516797, + "grad_norm": 0.5778351426124573, + "learning_rate": 6.2919556171983365e-06, + "loss": 0.0277, + "step": 5357 + }, + { + "epoch": 3.711811569102875, + "grad_norm": 0.5539336204528809, + "learning_rate": 6.2912621359223306e-06, + "loss": 0.0241, + "step": 5358 + }, + { + "epoch": 3.71250432975407, + "grad_norm": 0.7275177240371704, + "learning_rate": 6.2905686546463255e-06, + "loss": 0.0313, + "step": 5359 + }, + { + "epoch": 3.713197090405265, + "grad_norm": 0.5739328861236572, + "learning_rate": 6.2898751733703204e-06, + "loss": 0.0349, + "step": 5360 + }, + { + "epoch": 3.71388985105646, + "grad_norm": 0.7920843958854675, + "learning_rate": 6.289181692094314e-06, + "loss": 0.0356, + "step": 5361 + }, + { + "epoch": 3.7145826117076552, + "grad_norm": 0.5790738463401794, + "learning_rate": 6.288488210818309e-06, + "loss": 0.0309, + "step": 5362 + }, + { + "epoch": 3.71527537235885, + "grad_norm": 0.592981219291687, + "learning_rate": 6.287794729542303e-06, + "loss": 0.0317, + "step": 5363 + }, + { + "epoch": 3.715968133010045, + "grad_norm": 0.5291422605514526, + "learning_rate": 6.287101248266298e-06, + "loss": 0.0258, + "step": 5364 + }, + { + "epoch": 3.71666089366124, + "grad_norm": 0.6955481767654419, + "learning_rate": 6.2864077669902926e-06, + "loss": 0.0306, + "step": 5365 + }, + { + "epoch": 3.717353654312435, + "grad_norm": 0.7937054634094238, + "learning_rate": 6.285714285714286e-06, + "loss": 0.0359, + "step": 5366 + }, + { + "epoch": 3.71804641496363, + "grad_norm": 0.7813820838928223, + "learning_rate": 6.285020804438281e-06, + "loss": 0.0306, + "step": 5367 + }, + { + "epoch": 3.718739175614825, + "grad_norm": 0.5447640419006348, + "learning_rate": 6.284327323162275e-06, + "loss": 0.0304, + "step": 5368 + }, + { + "epoch": 3.71943193626602, + "grad_norm": 0.6013126373291016, + "learning_rate": 6.28363384188627e-06, + "loss": 0.0231, + "step": 5369 + }, + { + "epoch": 3.7201246969172153, + "grad_norm": 0.6309574842453003, + "learning_rate": 6.282940360610265e-06, + "loss": 0.0348, + "step": 5370 + }, + { + "epoch": 3.72081745756841, + "grad_norm": 0.5237972140312195, + "learning_rate": 6.282246879334259e-06, + "loss": 0.0301, + "step": 5371 + }, + { + "epoch": 3.7215102182196054, + "grad_norm": 0.680164098739624, + "learning_rate": 6.281553398058253e-06, + "loss": 0.0352, + "step": 5372 + }, + { + "epoch": 3.7222029788708, + "grad_norm": 0.5138393640518188, + "learning_rate": 6.280859916782247e-06, + "loss": 0.0282, + "step": 5373 + }, + { + "epoch": 3.722895739521995, + "grad_norm": 0.6719475984573364, + "learning_rate": 6.280166435506242e-06, + "loss": 0.0426, + "step": 5374 + }, + { + "epoch": 3.7235885001731903, + "grad_norm": 0.4255213737487793, + "learning_rate": 6.279472954230237e-06, + "loss": 0.0243, + "step": 5375 + }, + { + "epoch": 3.724281260824385, + "grad_norm": 0.6353483200073242, + "learning_rate": 6.278779472954231e-06, + "loss": 0.0375, + "step": 5376 + }, + { + "epoch": 3.72497402147558, + "grad_norm": 0.6046718955039978, + "learning_rate": 6.278085991678226e-06, + "loss": 0.0277, + "step": 5377 + }, + { + "epoch": 3.7256667821267753, + "grad_norm": 0.6534799933433533, + "learning_rate": 6.277392510402219e-06, + "loss": 0.0332, + "step": 5378 + }, + { + "epoch": 3.72635954277797, + "grad_norm": 0.5492955446243286, + "learning_rate": 6.276699029126214e-06, + "loss": 0.029, + "step": 5379 + }, + { + "epoch": 3.7270523034291654, + "grad_norm": 0.4888821840286255, + "learning_rate": 6.276005547850209e-06, + "loss": 0.033, + "step": 5380 + }, + { + "epoch": 3.72774506408036, + "grad_norm": 0.6340741515159607, + "learning_rate": 6.275312066574203e-06, + "loss": 0.0322, + "step": 5381 + }, + { + "epoch": 3.7284378247315555, + "grad_norm": 0.5831493735313416, + "learning_rate": 6.274618585298198e-06, + "loss": 0.028, + "step": 5382 + }, + { + "epoch": 3.7291305853827503, + "grad_norm": 0.6150385141372681, + "learning_rate": 6.273925104022191e-06, + "loss": 0.0385, + "step": 5383 + }, + { + "epoch": 3.729823346033945, + "grad_norm": 0.5997127890586853, + "learning_rate": 6.273231622746186e-06, + "loss": 0.0287, + "step": 5384 + }, + { + "epoch": 3.7305161066851404, + "grad_norm": 0.4969633221626282, + "learning_rate": 6.272538141470181e-06, + "loss": 0.0303, + "step": 5385 + }, + { + "epoch": 3.7312088673363353, + "grad_norm": 0.6962485313415527, + "learning_rate": 6.271844660194175e-06, + "loss": 0.0288, + "step": 5386 + }, + { + "epoch": 3.73190162798753, + "grad_norm": 0.708992600440979, + "learning_rate": 6.27115117891817e-06, + "loss": 0.052, + "step": 5387 + }, + { + "epoch": 3.7325943886387254, + "grad_norm": 0.5180173516273499, + "learning_rate": 6.270457697642164e-06, + "loss": 0.0242, + "step": 5388 + }, + { + "epoch": 3.7332871492899202, + "grad_norm": 0.6726405024528503, + "learning_rate": 6.269764216366159e-06, + "loss": 0.038, + "step": 5389 + }, + { + "epoch": 3.7339799099411155, + "grad_norm": 0.6243935823440552, + "learning_rate": 6.269070735090154e-06, + "loss": 0.0417, + "step": 5390 + }, + { + "epoch": 3.7346726705923103, + "grad_norm": 0.5270748734474182, + "learning_rate": 6.268377253814147e-06, + "loss": 0.0264, + "step": 5391 + }, + { + "epoch": 3.7353654312435056, + "grad_norm": 0.674968421459198, + "learning_rate": 6.267683772538142e-06, + "loss": 0.0388, + "step": 5392 + }, + { + "epoch": 3.7360581918947005, + "grad_norm": 0.6683635711669922, + "learning_rate": 6.266990291262136e-06, + "loss": 0.038, + "step": 5393 + }, + { + "epoch": 3.7367509525458953, + "grad_norm": 0.5001316070556641, + "learning_rate": 6.266296809986131e-06, + "loss": 0.0227, + "step": 5394 + }, + { + "epoch": 3.7374437131970906, + "grad_norm": 0.6227713227272034, + "learning_rate": 6.265603328710126e-06, + "loss": 0.0339, + "step": 5395 + }, + { + "epoch": 3.7381364738482854, + "grad_norm": 0.5813007950782776, + "learning_rate": 6.264909847434119e-06, + "loss": 0.0345, + "step": 5396 + }, + { + "epoch": 3.7388292344994802, + "grad_norm": 0.5505149960517883, + "learning_rate": 6.264216366158114e-06, + "loss": 0.0266, + "step": 5397 + }, + { + "epoch": 3.7395219951506755, + "grad_norm": 0.5475566983222961, + "learning_rate": 6.263522884882108e-06, + "loss": 0.0258, + "step": 5398 + }, + { + "epoch": 3.7402147558018703, + "grad_norm": 0.5670726895332336, + "learning_rate": 6.262829403606103e-06, + "loss": 0.0259, + "step": 5399 + }, + { + "epoch": 3.7409075164530656, + "grad_norm": 0.6908154487609863, + "learning_rate": 6.262135922330098e-06, + "loss": 0.035, + "step": 5400 + }, + { + "epoch": 3.7416002771042605, + "grad_norm": 0.630292534828186, + "learning_rate": 6.2614424410540915e-06, + "loss": 0.033, + "step": 5401 + }, + { + "epoch": 3.7422930377554557, + "grad_norm": 0.7584119439125061, + "learning_rate": 6.2607489597780865e-06, + "loss": 0.0386, + "step": 5402 + }, + { + "epoch": 3.7429857984066506, + "grad_norm": 0.7879133224487305, + "learning_rate": 6.2600554785020805e-06, + "loss": 0.0407, + "step": 5403 + }, + { + "epoch": 3.7436785590578454, + "grad_norm": 0.6253929734230042, + "learning_rate": 6.2593619972260755e-06, + "loss": 0.028, + "step": 5404 + }, + { + "epoch": 3.7443713197090407, + "grad_norm": 0.6191489696502686, + "learning_rate": 6.25866851595007e-06, + "loss": 0.0308, + "step": 5405 + }, + { + "epoch": 3.7450640803602355, + "grad_norm": 0.5253886580467224, + "learning_rate": 6.2579750346740645e-06, + "loss": 0.0262, + "step": 5406 + }, + { + "epoch": 3.7457568410114304, + "grad_norm": 0.4761843681335449, + "learning_rate": 6.2572815533980594e-06, + "loss": 0.0253, + "step": 5407 + }, + { + "epoch": 3.7464496016626256, + "grad_norm": 0.6056466698646545, + "learning_rate": 6.256588072122053e-06, + "loss": 0.0271, + "step": 5408 + }, + { + "epoch": 3.7471423623138205, + "grad_norm": 0.6499962210655212, + "learning_rate": 6.255894590846048e-06, + "loss": 0.0358, + "step": 5409 + }, + { + "epoch": 3.7478351229650158, + "grad_norm": 0.5838296413421631, + "learning_rate": 6.2552011095700425e-06, + "loss": 0.0257, + "step": 5410 + }, + { + "epoch": 3.7485278836162106, + "grad_norm": 0.4950014352798462, + "learning_rate": 6.254507628294037e-06, + "loss": 0.0266, + "step": 5411 + }, + { + "epoch": 3.749220644267406, + "grad_norm": 0.8059121370315552, + "learning_rate": 6.2538141470180316e-06, + "loss": 0.0445, + "step": 5412 + }, + { + "epoch": 3.7499134049186007, + "grad_norm": 0.5892294645309448, + "learning_rate": 6.253120665742025e-06, + "loss": 0.035, + "step": 5413 + }, + { + "epoch": 3.7506061655697955, + "grad_norm": 0.5557952523231506, + "learning_rate": 6.25242718446602e-06, + "loss": 0.0301, + "step": 5414 + }, + { + "epoch": 3.751298926220991, + "grad_norm": 0.515476644039154, + "learning_rate": 6.251733703190015e-06, + "loss": 0.0289, + "step": 5415 + }, + { + "epoch": 3.7519916868721856, + "grad_norm": 0.7727710604667664, + "learning_rate": 6.251040221914009e-06, + "loss": 0.0333, + "step": 5416 + }, + { + "epoch": 3.7526844475233805, + "grad_norm": 0.6355763077735901, + "learning_rate": 6.250346740638004e-06, + "loss": 0.0342, + "step": 5417 + }, + { + "epoch": 3.7533772081745758, + "grad_norm": 0.5792461037635803, + "learning_rate": 6.249653259361998e-06, + "loss": 0.0362, + "step": 5418 + }, + { + "epoch": 3.7540699688257706, + "grad_norm": 0.7207496166229248, + "learning_rate": 6.248959778085993e-06, + "loss": 0.0407, + "step": 5419 + }, + { + "epoch": 3.7547627294769654, + "grad_norm": 0.7004422545433044, + "learning_rate": 6.248266296809986e-06, + "loss": 0.0352, + "step": 5420 + }, + { + "epoch": 3.7554554901281607, + "grad_norm": 0.7354230284690857, + "learning_rate": 6.247572815533981e-06, + "loss": 0.0365, + "step": 5421 + }, + { + "epoch": 3.756148250779356, + "grad_norm": 0.5657079815864563, + "learning_rate": 6.246879334257976e-06, + "loss": 0.0211, + "step": 5422 + }, + { + "epoch": 3.756841011430551, + "grad_norm": 0.5205053687095642, + "learning_rate": 6.24618585298197e-06, + "loss": 0.0214, + "step": 5423 + }, + { + "epoch": 3.7575337720817457, + "grad_norm": 0.5499959588050842, + "learning_rate": 6.245492371705965e-06, + "loss": 0.0276, + "step": 5424 + }, + { + "epoch": 3.758226532732941, + "grad_norm": 0.7697461843490601, + "learning_rate": 6.244798890429958e-06, + "loss": 0.0379, + "step": 5425 + }, + { + "epoch": 3.7589192933841358, + "grad_norm": 0.6891419291496277, + "learning_rate": 6.244105409153953e-06, + "loss": 0.0324, + "step": 5426 + }, + { + "epoch": 3.7596120540353306, + "grad_norm": 0.5204426050186157, + "learning_rate": 6.243411927877948e-06, + "loss": 0.0235, + "step": 5427 + }, + { + "epoch": 3.760304814686526, + "grad_norm": 0.6464238166809082, + "learning_rate": 6.242718446601942e-06, + "loss": 0.0334, + "step": 5428 + }, + { + "epoch": 3.7609975753377207, + "grad_norm": 0.6387354731559753, + "learning_rate": 6.242024965325937e-06, + "loss": 0.0402, + "step": 5429 + }, + { + "epoch": 3.7616903359889156, + "grad_norm": 0.5376962423324585, + "learning_rate": 6.24133148404993e-06, + "loss": 0.0261, + "step": 5430 + }, + { + "epoch": 3.762383096640111, + "grad_norm": 0.6329233050346375, + "learning_rate": 6.240638002773925e-06, + "loss": 0.0304, + "step": 5431 + }, + { + "epoch": 3.763075857291306, + "grad_norm": 0.751514196395874, + "learning_rate": 6.23994452149792e-06, + "loss": 0.0323, + "step": 5432 + }, + { + "epoch": 3.763768617942501, + "grad_norm": 0.6077172756195068, + "learning_rate": 6.239251040221914e-06, + "loss": 0.0276, + "step": 5433 + }, + { + "epoch": 3.764461378593696, + "grad_norm": 0.5049221515655518, + "learning_rate": 6.238557558945909e-06, + "loss": 0.0251, + "step": 5434 + }, + { + "epoch": 3.765154139244891, + "grad_norm": 0.6007904410362244, + "learning_rate": 6.237864077669903e-06, + "loss": 0.0291, + "step": 5435 + }, + { + "epoch": 3.765846899896086, + "grad_norm": 0.628989040851593, + "learning_rate": 6.237170596393898e-06, + "loss": 0.0315, + "step": 5436 + }, + { + "epoch": 3.7665396605472807, + "grad_norm": 0.6191542744636536, + "learning_rate": 6.236477115117893e-06, + "loss": 0.0352, + "step": 5437 + }, + { + "epoch": 3.767232421198476, + "grad_norm": 0.5907944440841675, + "learning_rate": 6.235783633841886e-06, + "loss": 0.0325, + "step": 5438 + }, + { + "epoch": 3.767925181849671, + "grad_norm": 0.6437923908233643, + "learning_rate": 6.235090152565881e-06, + "loss": 0.031, + "step": 5439 + }, + { + "epoch": 3.7686179425008657, + "grad_norm": 0.6748537421226501, + "learning_rate": 6.234396671289875e-06, + "loss": 0.0264, + "step": 5440 + }, + { + "epoch": 3.769310703152061, + "grad_norm": 0.7189391851425171, + "learning_rate": 6.23370319001387e-06, + "loss": 0.032, + "step": 5441 + }, + { + "epoch": 3.7700034638032562, + "grad_norm": 0.6138607263565063, + "learning_rate": 6.233009708737865e-06, + "loss": 0.0305, + "step": 5442 + }, + { + "epoch": 3.770696224454451, + "grad_norm": 0.546404242515564, + "learning_rate": 6.232316227461858e-06, + "loss": 0.0293, + "step": 5443 + }, + { + "epoch": 3.771388985105646, + "grad_norm": 0.6265897750854492, + "learning_rate": 6.231622746185853e-06, + "loss": 0.0261, + "step": 5444 + }, + { + "epoch": 3.772081745756841, + "grad_norm": 0.667362630367279, + "learning_rate": 6.230929264909847e-06, + "loss": 0.0363, + "step": 5445 + }, + { + "epoch": 3.772774506408036, + "grad_norm": 1.0492887496948242, + "learning_rate": 6.230235783633842e-06, + "loss": 0.0289, + "step": 5446 + }, + { + "epoch": 3.773467267059231, + "grad_norm": 0.585996687412262, + "learning_rate": 6.229542302357837e-06, + "loss": 0.0281, + "step": 5447 + }, + { + "epoch": 3.774160027710426, + "grad_norm": 0.6105867624282837, + "learning_rate": 6.228848821081831e-06, + "loss": 0.0269, + "step": 5448 + }, + { + "epoch": 3.774852788361621, + "grad_norm": 0.5522729158401489, + "learning_rate": 6.2281553398058255e-06, + "loss": 0.0272, + "step": 5449 + }, + { + "epoch": 3.775545549012816, + "grad_norm": 0.6558374762535095, + "learning_rate": 6.2274618585298195e-06, + "loss": 0.0423, + "step": 5450 + }, + { + "epoch": 3.776238309664011, + "grad_norm": 0.7256389856338501, + "learning_rate": 6.2267683772538145e-06, + "loss": 0.0315, + "step": 5451 + }, + { + "epoch": 3.7769310703152064, + "grad_norm": 0.5794988870620728, + "learning_rate": 6.226074895977809e-06, + "loss": 0.022, + "step": 5452 + }, + { + "epoch": 3.777623830966401, + "grad_norm": 0.8709037899971008, + "learning_rate": 6.2253814147018035e-06, + "loss": 0.0367, + "step": 5453 + }, + { + "epoch": 3.778316591617596, + "grad_norm": 0.5693874955177307, + "learning_rate": 6.2246879334257984e-06, + "loss": 0.0297, + "step": 5454 + }, + { + "epoch": 3.7790093522687913, + "grad_norm": 0.7096821665763855, + "learning_rate": 6.223994452149792e-06, + "loss": 0.0353, + "step": 5455 + }, + { + "epoch": 3.779702112919986, + "grad_norm": 0.6339651942253113, + "learning_rate": 6.223300970873787e-06, + "loss": 0.0359, + "step": 5456 + }, + { + "epoch": 3.780394873571181, + "grad_norm": 0.6035611629486084, + "learning_rate": 6.2226074895977815e-06, + "loss": 0.0243, + "step": 5457 + }, + { + "epoch": 3.7810876342223763, + "grad_norm": 0.6057316660881042, + "learning_rate": 6.221914008321776e-06, + "loss": 0.0312, + "step": 5458 + }, + { + "epoch": 3.781780394873571, + "grad_norm": 0.6637762784957886, + "learning_rate": 6.2212205270457706e-06, + "loss": 0.0245, + "step": 5459 + }, + { + "epoch": 3.782473155524766, + "grad_norm": 0.630608081817627, + "learning_rate": 6.220527045769764e-06, + "loss": 0.0315, + "step": 5460 + }, + { + "epoch": 3.783165916175961, + "grad_norm": 0.8728220462799072, + "learning_rate": 6.219833564493759e-06, + "loss": 0.0419, + "step": 5461 + }, + { + "epoch": 3.7838586768271565, + "grad_norm": 0.6907286643981934, + "learning_rate": 6.219140083217754e-06, + "loss": 0.0333, + "step": 5462 + }, + { + "epoch": 3.7845514374783513, + "grad_norm": 0.6949223875999451, + "learning_rate": 6.218446601941748e-06, + "loss": 0.0442, + "step": 5463 + }, + { + "epoch": 3.785244198129546, + "grad_norm": 0.5685586333274841, + "learning_rate": 6.217753120665743e-06, + "loss": 0.0272, + "step": 5464 + }, + { + "epoch": 3.7859369587807414, + "grad_norm": 0.6025075316429138, + "learning_rate": 6.217059639389737e-06, + "loss": 0.0313, + "step": 5465 + }, + { + "epoch": 3.7866297194319363, + "grad_norm": 0.5943772792816162, + "learning_rate": 6.216366158113732e-06, + "loss": 0.0309, + "step": 5466 + }, + { + "epoch": 3.787322480083131, + "grad_norm": 0.6488465666770935, + "learning_rate": 6.215672676837727e-06, + "loss": 0.0302, + "step": 5467 + }, + { + "epoch": 3.7880152407343264, + "grad_norm": 0.6524391770362854, + "learning_rate": 6.21497919556172e-06, + "loss": 0.0335, + "step": 5468 + }, + { + "epoch": 3.788708001385521, + "grad_norm": 0.5891224145889282, + "learning_rate": 6.214285714285715e-06, + "loss": 0.0315, + "step": 5469 + }, + { + "epoch": 3.789400762036716, + "grad_norm": 0.5956622958183289, + "learning_rate": 6.213592233009709e-06, + "loss": 0.0355, + "step": 5470 + }, + { + "epoch": 3.7900935226879113, + "grad_norm": 0.587727427482605, + "learning_rate": 6.212898751733704e-06, + "loss": 0.0316, + "step": 5471 + }, + { + "epoch": 3.7907862833391066, + "grad_norm": 0.6151914000511169, + "learning_rate": 6.212205270457699e-06, + "loss": 0.0283, + "step": 5472 + }, + { + "epoch": 3.7914790439903014, + "grad_norm": 0.6343703866004944, + "learning_rate": 6.211511789181692e-06, + "loss": 0.037, + "step": 5473 + }, + { + "epoch": 3.7921718046414963, + "grad_norm": 0.6419991254806519, + "learning_rate": 6.210818307905687e-06, + "loss": 0.0381, + "step": 5474 + }, + { + "epoch": 3.7928645652926916, + "grad_norm": 0.5683888792991638, + "learning_rate": 6.210124826629681e-06, + "loss": 0.0248, + "step": 5475 + }, + { + "epoch": 3.7935573259438864, + "grad_norm": 0.5710171461105347, + "learning_rate": 6.209431345353676e-06, + "loss": 0.0286, + "step": 5476 + }, + { + "epoch": 3.7942500865950812, + "grad_norm": 0.7465535402297974, + "learning_rate": 6.208737864077671e-06, + "loss": 0.0306, + "step": 5477 + }, + { + "epoch": 3.7949428472462765, + "grad_norm": 0.5855188965797424, + "learning_rate": 6.208044382801665e-06, + "loss": 0.0278, + "step": 5478 + }, + { + "epoch": 3.7956356078974713, + "grad_norm": 0.6260008215904236, + "learning_rate": 6.207350901525659e-06, + "loss": 0.0298, + "step": 5479 + }, + { + "epoch": 3.796328368548666, + "grad_norm": 0.6134366393089294, + "learning_rate": 6.206657420249653e-06, + "loss": 0.0353, + "step": 5480 + }, + { + "epoch": 3.7970211291998615, + "grad_norm": 0.884404182434082, + "learning_rate": 6.205963938973648e-06, + "loss": 0.0332, + "step": 5481 + }, + { + "epoch": 3.7977138898510567, + "grad_norm": 0.659879744052887, + "learning_rate": 6.205270457697643e-06, + "loss": 0.0392, + "step": 5482 + }, + { + "epoch": 3.7984066505022516, + "grad_norm": 0.6756446361541748, + "learning_rate": 6.204576976421637e-06, + "loss": 0.0393, + "step": 5483 + }, + { + "epoch": 3.7990994111534464, + "grad_norm": 0.6037367582321167, + "learning_rate": 6.203883495145632e-06, + "loss": 0.0301, + "step": 5484 + }, + { + "epoch": 3.7997921718046417, + "grad_norm": 0.6975623965263367, + "learning_rate": 6.203190013869625e-06, + "loss": 0.0311, + "step": 5485 + }, + { + "epoch": 3.8004849324558365, + "grad_norm": 0.5865778923034668, + "learning_rate": 6.20249653259362e-06, + "loss": 0.0378, + "step": 5486 + }, + { + "epoch": 3.8011776931070314, + "grad_norm": 0.6444191336631775, + "learning_rate": 6.201803051317615e-06, + "loss": 0.0267, + "step": 5487 + }, + { + "epoch": 3.8018704537582266, + "grad_norm": 0.602146327495575, + "learning_rate": 6.201109570041609e-06, + "loss": 0.0319, + "step": 5488 + }, + { + "epoch": 3.8025632144094215, + "grad_norm": 0.5784410238265991, + "learning_rate": 6.200416088765604e-06, + "loss": 0.0253, + "step": 5489 + }, + { + "epoch": 3.8032559750606163, + "grad_norm": 0.5727723836898804, + "learning_rate": 6.199722607489597e-06, + "loss": 0.029, + "step": 5490 + }, + { + "epoch": 3.8039487357118116, + "grad_norm": 0.635518491268158, + "learning_rate": 6.199029126213592e-06, + "loss": 0.0314, + "step": 5491 + }, + { + "epoch": 3.804641496363007, + "grad_norm": 0.6199983954429626, + "learning_rate": 6.198335644937587e-06, + "loss": 0.0227, + "step": 5492 + }, + { + "epoch": 3.8053342570142017, + "grad_norm": 0.6068674921989441, + "learning_rate": 6.197642163661581e-06, + "loss": 0.0287, + "step": 5493 + }, + { + "epoch": 3.8060270176653965, + "grad_norm": 0.5692973732948303, + "learning_rate": 6.196948682385576e-06, + "loss": 0.029, + "step": 5494 + }, + { + "epoch": 3.806719778316592, + "grad_norm": 0.553572416305542, + "learning_rate": 6.19625520110957e-06, + "loss": 0.0331, + "step": 5495 + }, + { + "epoch": 3.8074125389677866, + "grad_norm": 0.6230151653289795, + "learning_rate": 6.195561719833565e-06, + "loss": 0.0354, + "step": 5496 + }, + { + "epoch": 3.8081052996189815, + "grad_norm": 0.6320762634277344, + "learning_rate": 6.194868238557559e-06, + "loss": 0.0334, + "step": 5497 + }, + { + "epoch": 3.8087980602701768, + "grad_norm": 0.6541516184806824, + "learning_rate": 6.1941747572815535e-06, + "loss": 0.029, + "step": 5498 + }, + { + "epoch": 3.8094908209213716, + "grad_norm": 0.6015684604644775, + "learning_rate": 6.193481276005548e-06, + "loss": 0.0341, + "step": 5499 + }, + { + "epoch": 3.8101835815725664, + "grad_norm": 0.6116700768470764, + "learning_rate": 6.1927877947295425e-06, + "loss": 0.0248, + "step": 5500 + }, + { + "epoch": 3.8108763422237617, + "grad_norm": 0.6509401798248291, + "learning_rate": 6.1920943134535374e-06, + "loss": 0.0274, + "step": 5501 + }, + { + "epoch": 3.8115691028749565, + "grad_norm": 0.4841687083244324, + "learning_rate": 6.191400832177532e-06, + "loss": 0.0245, + "step": 5502 + }, + { + "epoch": 3.812261863526152, + "grad_norm": 0.44500479102134705, + "learning_rate": 6.190707350901526e-06, + "loss": 0.02, + "step": 5503 + }, + { + "epoch": 3.8129546241773467, + "grad_norm": 0.5654506087303162, + "learning_rate": 6.1900138696255205e-06, + "loss": 0.0316, + "step": 5504 + }, + { + "epoch": 3.813647384828542, + "grad_norm": 0.7147808074951172, + "learning_rate": 6.189320388349515e-06, + "loss": 0.0391, + "step": 5505 + }, + { + "epoch": 3.8143401454797368, + "grad_norm": 0.6720602512359619, + "learning_rate": 6.1886269070735096e-06, + "loss": 0.0313, + "step": 5506 + }, + { + "epoch": 3.8150329061309316, + "grad_norm": 0.7915608882904053, + "learning_rate": 6.1879334257975045e-06, + "loss": 0.0349, + "step": 5507 + }, + { + "epoch": 3.815725666782127, + "grad_norm": 0.6050640940666199, + "learning_rate": 6.187239944521498e-06, + "loss": 0.0361, + "step": 5508 + }, + { + "epoch": 3.8164184274333217, + "grad_norm": 0.7607973217964172, + "learning_rate": 6.186546463245493e-06, + "loss": 0.0362, + "step": 5509 + }, + { + "epoch": 3.8171111880845165, + "grad_norm": 0.6928087472915649, + "learning_rate": 6.185852981969487e-06, + "loss": 0.0351, + "step": 5510 + }, + { + "epoch": 3.817803948735712, + "grad_norm": 0.570793867111206, + "learning_rate": 6.185159500693482e-06, + "loss": 0.0358, + "step": 5511 + }, + { + "epoch": 3.8184967093869067, + "grad_norm": 0.6071538329124451, + "learning_rate": 6.184466019417477e-06, + "loss": 0.0263, + "step": 5512 + }, + { + "epoch": 3.819189470038102, + "grad_norm": 0.6508318781852722, + "learning_rate": 6.183772538141471e-06, + "loss": 0.0336, + "step": 5513 + }, + { + "epoch": 3.8198822306892968, + "grad_norm": 0.6919434070587158, + "learning_rate": 6.183079056865466e-06, + "loss": 0.0317, + "step": 5514 + }, + { + "epoch": 3.820574991340492, + "grad_norm": 0.7077426314353943, + "learning_rate": 6.182385575589459e-06, + "loss": 0.0391, + "step": 5515 + }, + { + "epoch": 3.821267751991687, + "grad_norm": 0.6071457266807556, + "learning_rate": 6.181692094313454e-06, + "loss": 0.0428, + "step": 5516 + }, + { + "epoch": 3.8219605126428817, + "grad_norm": 0.5197275876998901, + "learning_rate": 6.180998613037449e-06, + "loss": 0.0275, + "step": 5517 + }, + { + "epoch": 3.822653273294077, + "grad_norm": 0.6566160917282104, + "learning_rate": 6.180305131761443e-06, + "loss": 0.0426, + "step": 5518 + }, + { + "epoch": 3.823346033945272, + "grad_norm": 0.5395252704620361, + "learning_rate": 6.179611650485438e-06, + "loss": 0.026, + "step": 5519 + }, + { + "epoch": 3.8240387945964667, + "grad_norm": 0.8134343028068542, + "learning_rate": 6.178918169209431e-06, + "loss": 0.0335, + "step": 5520 + }, + { + "epoch": 3.824731555247662, + "grad_norm": 0.5744146108627319, + "learning_rate": 6.178224687933426e-06, + "loss": 0.03, + "step": 5521 + }, + { + "epoch": 3.825424315898857, + "grad_norm": 0.6129289269447327, + "learning_rate": 6.177531206657421e-06, + "loss": 0.0476, + "step": 5522 + }, + { + "epoch": 3.826117076550052, + "grad_norm": 0.6244030594825745, + "learning_rate": 6.176837725381415e-06, + "loss": 0.0328, + "step": 5523 + }, + { + "epoch": 3.826809837201247, + "grad_norm": 0.5601057410240173, + "learning_rate": 6.17614424410541e-06, + "loss": 0.0297, + "step": 5524 + }, + { + "epoch": 3.827502597852442, + "grad_norm": 0.6210319995880127, + "learning_rate": 6.175450762829404e-06, + "loss": 0.036, + "step": 5525 + }, + { + "epoch": 3.828195358503637, + "grad_norm": 0.5982537865638733, + "learning_rate": 6.174757281553399e-06, + "loss": 0.0273, + "step": 5526 + }, + { + "epoch": 3.828888119154832, + "grad_norm": 0.5864853858947754, + "learning_rate": 6.174063800277393e-06, + "loss": 0.025, + "step": 5527 + }, + { + "epoch": 3.829580879806027, + "grad_norm": 0.5640624165534973, + "learning_rate": 6.173370319001387e-06, + "loss": 0.032, + "step": 5528 + }, + { + "epoch": 3.830273640457222, + "grad_norm": 0.8922573924064636, + "learning_rate": 6.172676837725382e-06, + "loss": 0.0304, + "step": 5529 + }, + { + "epoch": 3.830966401108417, + "grad_norm": 0.5165643692016602, + "learning_rate": 6.171983356449376e-06, + "loss": 0.0207, + "step": 5530 + }, + { + "epoch": 3.831659161759612, + "grad_norm": 0.5760807991027832, + "learning_rate": 6.171289875173371e-06, + "loss": 0.0248, + "step": 5531 + }, + { + "epoch": 3.832351922410807, + "grad_norm": 0.6061486005783081, + "learning_rate": 6.170596393897366e-06, + "loss": 0.0361, + "step": 5532 + }, + { + "epoch": 3.833044683062002, + "grad_norm": 0.5424039363861084, + "learning_rate": 6.169902912621359e-06, + "loss": 0.0251, + "step": 5533 + }, + { + "epoch": 3.833737443713197, + "grad_norm": 0.6765320897102356, + "learning_rate": 6.169209431345354e-06, + "loss": 0.0338, + "step": 5534 + }, + { + "epoch": 3.8344302043643923, + "grad_norm": 0.6279667019844055, + "learning_rate": 6.168515950069348e-06, + "loss": 0.0308, + "step": 5535 + }, + { + "epoch": 3.835122965015587, + "grad_norm": 0.5763036608695984, + "learning_rate": 6.167822468793343e-06, + "loss": 0.0279, + "step": 5536 + }, + { + "epoch": 3.835815725666782, + "grad_norm": 0.5578659772872925, + "learning_rate": 6.167128987517338e-06, + "loss": 0.0281, + "step": 5537 + }, + { + "epoch": 3.8365084863179773, + "grad_norm": 0.5194475054740906, + "learning_rate": 6.166435506241331e-06, + "loss": 0.0294, + "step": 5538 + }, + { + "epoch": 3.837201246969172, + "grad_norm": 0.6886339783668518, + "learning_rate": 6.165742024965326e-06, + "loss": 0.032, + "step": 5539 + }, + { + "epoch": 3.837894007620367, + "grad_norm": 0.8126854300498962, + "learning_rate": 6.16504854368932e-06, + "loss": 0.0259, + "step": 5540 + }, + { + "epoch": 3.838586768271562, + "grad_norm": 0.6386008262634277, + "learning_rate": 6.164355062413315e-06, + "loss": 0.031, + "step": 5541 + }, + { + "epoch": 3.839279528922757, + "grad_norm": 0.5712670683860779, + "learning_rate": 6.16366158113731e-06, + "loss": 0.0239, + "step": 5542 + }, + { + "epoch": 3.8399722895739523, + "grad_norm": 0.5645532608032227, + "learning_rate": 6.162968099861304e-06, + "loss": 0.0222, + "step": 5543 + }, + { + "epoch": 3.840665050225147, + "grad_norm": 0.7109053730964661, + "learning_rate": 6.162274618585299e-06, + "loss": 0.0337, + "step": 5544 + }, + { + "epoch": 3.8413578108763424, + "grad_norm": 0.6322464942932129, + "learning_rate": 6.1615811373092925e-06, + "loss": 0.0391, + "step": 5545 + }, + { + "epoch": 3.8420505715275373, + "grad_norm": 0.5483359694480896, + "learning_rate": 6.160887656033287e-06, + "loss": 0.0256, + "step": 5546 + }, + { + "epoch": 3.842743332178732, + "grad_norm": 0.6358020305633545, + "learning_rate": 6.160194174757282e-06, + "loss": 0.0296, + "step": 5547 + }, + { + "epoch": 3.8434360928299274, + "grad_norm": 0.7286901473999023, + "learning_rate": 6.1595006934812764e-06, + "loss": 0.0361, + "step": 5548 + }, + { + "epoch": 3.844128853481122, + "grad_norm": 0.603725254535675, + "learning_rate": 6.158807212205271e-06, + "loss": 0.0387, + "step": 5549 + }, + { + "epoch": 3.844821614132317, + "grad_norm": 0.5984010696411133, + "learning_rate": 6.158113730929265e-06, + "loss": 0.0358, + "step": 5550 + }, + { + "epoch": 3.8455143747835123, + "grad_norm": 0.5777190327644348, + "learning_rate": 6.1574202496532595e-06, + "loss": 0.026, + "step": 5551 + }, + { + "epoch": 3.846207135434707, + "grad_norm": 0.6447054743766785, + "learning_rate": 6.1567267683772545e-06, + "loss": 0.0343, + "step": 5552 + }, + { + "epoch": 3.8468998960859024, + "grad_norm": 0.6139410138130188, + "learning_rate": 6.1560332871012486e-06, + "loss": 0.0285, + "step": 5553 + }, + { + "epoch": 3.8475926567370973, + "grad_norm": 0.5288937091827393, + "learning_rate": 6.1553398058252435e-06, + "loss": 0.0235, + "step": 5554 + }, + { + "epoch": 3.8482854173882926, + "grad_norm": 0.6244781613349915, + "learning_rate": 6.1546463245492376e-06, + "loss": 0.0272, + "step": 5555 + }, + { + "epoch": 3.8489781780394874, + "grad_norm": 0.6281181573867798, + "learning_rate": 6.153952843273232e-06, + "loss": 0.0322, + "step": 5556 + }, + { + "epoch": 3.849670938690682, + "grad_norm": 0.9273653626441956, + "learning_rate": 6.153259361997227e-06, + "loss": 0.0321, + "step": 5557 + }, + { + "epoch": 3.8503636993418775, + "grad_norm": 0.5947158336639404, + "learning_rate": 6.152565880721221e-06, + "loss": 0.0349, + "step": 5558 + }, + { + "epoch": 3.8510564599930723, + "grad_norm": 0.5656794905662537, + "learning_rate": 6.151872399445216e-06, + "loss": 0.0291, + "step": 5559 + }, + { + "epoch": 3.851749220644267, + "grad_norm": 0.6337341070175171, + "learning_rate": 6.15117891816921e-06, + "loss": 0.0348, + "step": 5560 + }, + { + "epoch": 3.8524419812954624, + "grad_norm": 0.6344237923622131, + "learning_rate": 6.150485436893205e-06, + "loss": 0.0389, + "step": 5561 + }, + { + "epoch": 3.8531347419466573, + "grad_norm": 0.5519279837608337, + "learning_rate": 6.1497919556171996e-06, + "loss": 0.0321, + "step": 5562 + }, + { + "epoch": 3.8538275025978526, + "grad_norm": 0.6443216800689697, + "learning_rate": 6.149098474341193e-06, + "loss": 0.0361, + "step": 5563 + }, + { + "epoch": 3.8545202632490474, + "grad_norm": 0.6036908626556396, + "learning_rate": 6.148404993065188e-06, + "loss": 0.033, + "step": 5564 + }, + { + "epoch": 3.8552130239002427, + "grad_norm": 0.7718948125839233, + "learning_rate": 6.147711511789182e-06, + "loss": 0.041, + "step": 5565 + }, + { + "epoch": 3.8559057845514375, + "grad_norm": 0.6507557034492493, + "learning_rate": 6.147018030513177e-06, + "loss": 0.0376, + "step": 5566 + }, + { + "epoch": 3.8565985452026323, + "grad_norm": 0.6405223608016968, + "learning_rate": 6.146324549237172e-06, + "loss": 0.0363, + "step": 5567 + }, + { + "epoch": 3.8572913058538276, + "grad_norm": 0.6134037375450134, + "learning_rate": 6.145631067961165e-06, + "loss": 0.03, + "step": 5568 + }, + { + "epoch": 3.8579840665050225, + "grad_norm": 0.7090088129043579, + "learning_rate": 6.14493758668516e-06, + "loss": 0.0369, + "step": 5569 + }, + { + "epoch": 3.8586768271562173, + "grad_norm": 0.49887773394584656, + "learning_rate": 6.144244105409154e-06, + "loss": 0.0282, + "step": 5570 + }, + { + "epoch": 3.8593695878074126, + "grad_norm": 0.5553210377693176, + "learning_rate": 6.143550624133149e-06, + "loss": 0.0249, + "step": 5571 + }, + { + "epoch": 3.8600623484586074, + "grad_norm": 0.6809664964675903, + "learning_rate": 6.142857142857144e-06, + "loss": 0.0339, + "step": 5572 + }, + { + "epoch": 3.8607551091098027, + "grad_norm": 1.0848239660263062, + "learning_rate": 6.142163661581138e-06, + "loss": 0.0411, + "step": 5573 + }, + { + "epoch": 3.8614478697609975, + "grad_norm": 0.6416944265365601, + "learning_rate": 6.141470180305133e-06, + "loss": 0.0316, + "step": 5574 + }, + { + "epoch": 3.862140630412193, + "grad_norm": 0.5461816787719727, + "learning_rate": 6.140776699029126e-06, + "loss": 0.0297, + "step": 5575 + }, + { + "epoch": 3.8628333910633876, + "grad_norm": 0.5792284607887268, + "learning_rate": 6.140083217753121e-06, + "loss": 0.038, + "step": 5576 + }, + { + "epoch": 3.8635261517145825, + "grad_norm": 0.6209005117416382, + "learning_rate": 6.139389736477116e-06, + "loss": 0.0453, + "step": 5577 + }, + { + "epoch": 3.8642189123657777, + "grad_norm": 0.5361753702163696, + "learning_rate": 6.13869625520111e-06, + "loss": 0.0219, + "step": 5578 + }, + { + "epoch": 3.8649116730169726, + "grad_norm": 0.5698778629302979, + "learning_rate": 6.138002773925105e-06, + "loss": 0.0281, + "step": 5579 + }, + { + "epoch": 3.8656044336681674, + "grad_norm": 0.5724604725837708, + "learning_rate": 6.137309292649098e-06, + "loss": 0.0327, + "step": 5580 + }, + { + "epoch": 3.8662971943193627, + "grad_norm": 0.6027019619941711, + "learning_rate": 6.136615811373093e-06, + "loss": 0.0302, + "step": 5581 + }, + { + "epoch": 3.8669899549705575, + "grad_norm": 0.6003559827804565, + "learning_rate": 6.135922330097088e-06, + "loss": 0.0274, + "step": 5582 + }, + { + "epoch": 3.867682715621753, + "grad_norm": 0.5582703948020935, + "learning_rate": 6.135228848821082e-06, + "loss": 0.0216, + "step": 5583 + }, + { + "epoch": 3.8683754762729476, + "grad_norm": 0.5991309881210327, + "learning_rate": 6.134535367545077e-06, + "loss": 0.0356, + "step": 5584 + }, + { + "epoch": 3.869068236924143, + "grad_norm": 0.6197720766067505, + "learning_rate": 6.13384188626907e-06, + "loss": 0.034, + "step": 5585 + }, + { + "epoch": 3.8697609975753378, + "grad_norm": 0.5650931000709534, + "learning_rate": 6.133148404993065e-06, + "loss": 0.0314, + "step": 5586 + }, + { + "epoch": 3.8704537582265326, + "grad_norm": 0.6536426544189453, + "learning_rate": 6.13245492371706e-06, + "loss": 0.0334, + "step": 5587 + }, + { + "epoch": 3.871146518877728, + "grad_norm": 0.5091869831085205, + "learning_rate": 6.131761442441054e-06, + "loss": 0.0251, + "step": 5588 + }, + { + "epoch": 3.8718392795289227, + "grad_norm": 0.6284472942352295, + "learning_rate": 6.131067961165049e-06, + "loss": 0.0304, + "step": 5589 + }, + { + "epoch": 3.8725320401801175, + "grad_norm": 0.660051167011261, + "learning_rate": 6.130374479889043e-06, + "loss": 0.0312, + "step": 5590 + }, + { + "epoch": 3.873224800831313, + "grad_norm": 0.6200466752052307, + "learning_rate": 6.129680998613038e-06, + "loss": 0.0322, + "step": 5591 + }, + { + "epoch": 3.8739175614825077, + "grad_norm": 0.587002158164978, + "learning_rate": 6.128987517337033e-06, + "loss": 0.0382, + "step": 5592 + }, + { + "epoch": 3.874610322133703, + "grad_norm": 0.622517466545105, + "learning_rate": 6.128294036061026e-06, + "loss": 0.0258, + "step": 5593 + }, + { + "epoch": 3.8753030827848978, + "grad_norm": 0.6990046501159668, + "learning_rate": 6.127600554785021e-06, + "loss": 0.0259, + "step": 5594 + }, + { + "epoch": 3.875995843436093, + "grad_norm": 0.4947289824485779, + "learning_rate": 6.1269070735090154e-06, + "loss": 0.0234, + "step": 5595 + }, + { + "epoch": 3.876688604087288, + "grad_norm": 0.6375725865364075, + "learning_rate": 6.12621359223301e-06, + "loss": 0.0319, + "step": 5596 + }, + { + "epoch": 3.8773813647384827, + "grad_norm": 0.6191786527633667, + "learning_rate": 6.125520110957005e-06, + "loss": 0.0389, + "step": 5597 + }, + { + "epoch": 3.878074125389678, + "grad_norm": 0.5996862649917603, + "learning_rate": 6.1248266296809985e-06, + "loss": 0.0288, + "step": 5598 + }, + { + "epoch": 3.878766886040873, + "grad_norm": 0.6783562898635864, + "learning_rate": 6.1241331484049935e-06, + "loss": 0.0346, + "step": 5599 + }, + { + "epoch": 3.8794596466920677, + "grad_norm": 0.6701961755752563, + "learning_rate": 6.1234396671289876e-06, + "loss": 0.0368, + "step": 5600 + }, + { + "epoch": 3.880152407343263, + "grad_norm": 0.55217444896698, + "learning_rate": 6.1227461858529825e-06, + "loss": 0.0272, + "step": 5601 + }, + { + "epoch": 3.880845167994458, + "grad_norm": 0.5103873610496521, + "learning_rate": 6.122052704576977e-06, + "loss": 0.0248, + "step": 5602 + }, + { + "epoch": 3.881537928645653, + "grad_norm": 0.7307687401771545, + "learning_rate": 6.1213592233009715e-06, + "loss": 0.0315, + "step": 5603 + }, + { + "epoch": 3.882230689296848, + "grad_norm": 0.5511279106140137, + "learning_rate": 6.120665742024966e-06, + "loss": 0.0389, + "step": 5604 + }, + { + "epoch": 3.882923449948043, + "grad_norm": 0.5821829438209534, + "learning_rate": 6.11997226074896e-06, + "loss": 0.0252, + "step": 5605 + }, + { + "epoch": 3.883616210599238, + "grad_norm": 0.689548134803772, + "learning_rate": 6.119278779472955e-06, + "loss": 0.0344, + "step": 5606 + }, + { + "epoch": 3.884308971250433, + "grad_norm": 0.6860352158546448, + "learning_rate": 6.1185852981969495e-06, + "loss": 0.0332, + "step": 5607 + }, + { + "epoch": 3.885001731901628, + "grad_norm": 0.652440071105957, + "learning_rate": 6.117891816920944e-06, + "loss": 0.0286, + "step": 5608 + }, + { + "epoch": 3.885694492552823, + "grad_norm": 0.6276047825813293, + "learning_rate": 6.1171983356449386e-06, + "loss": 0.0315, + "step": 5609 + }, + { + "epoch": 3.886387253204018, + "grad_norm": 0.8977293968200684, + "learning_rate": 6.116504854368932e-06, + "loss": 0.0303, + "step": 5610 + }, + { + "epoch": 3.887080013855213, + "grad_norm": 0.6617765426635742, + "learning_rate": 6.115811373092927e-06, + "loss": 0.0344, + "step": 5611 + }, + { + "epoch": 3.887772774506408, + "grad_norm": 0.6797682642936707, + "learning_rate": 6.115117891816922e-06, + "loss": 0.0377, + "step": 5612 + }, + { + "epoch": 3.888465535157603, + "grad_norm": 0.5454661250114441, + "learning_rate": 6.114424410540916e-06, + "loss": 0.0304, + "step": 5613 + }, + { + "epoch": 3.889158295808798, + "grad_norm": 0.6362570524215698, + "learning_rate": 6.113730929264911e-06, + "loss": 0.0283, + "step": 5614 + }, + { + "epoch": 3.8898510564599933, + "grad_norm": 0.6150993704795837, + "learning_rate": 6.113037447988904e-06, + "loss": 0.0342, + "step": 5615 + }, + { + "epoch": 3.890543817111188, + "grad_norm": 0.4676876366138458, + "learning_rate": 6.112343966712899e-06, + "loss": 0.0208, + "step": 5616 + }, + { + "epoch": 3.891236577762383, + "grad_norm": 0.6185954213142395, + "learning_rate": 6.111650485436894e-06, + "loss": 0.0335, + "step": 5617 + }, + { + "epoch": 3.8919293384135782, + "grad_norm": 0.7884261608123779, + "learning_rate": 6.110957004160888e-06, + "loss": 0.0414, + "step": 5618 + }, + { + "epoch": 3.892622099064773, + "grad_norm": 0.6566745042800903, + "learning_rate": 6.110263522884883e-06, + "loss": 0.0309, + "step": 5619 + }, + { + "epoch": 3.893314859715968, + "grad_norm": 0.5411038994789124, + "learning_rate": 6.109570041608877e-06, + "loss": 0.0267, + "step": 5620 + }, + { + "epoch": 3.894007620367163, + "grad_norm": 0.5566913485527039, + "learning_rate": 6.108876560332872e-06, + "loss": 0.031, + "step": 5621 + }, + { + "epoch": 3.894700381018358, + "grad_norm": 0.49798959493637085, + "learning_rate": 6.108183079056867e-06, + "loss": 0.0319, + "step": 5622 + }, + { + "epoch": 3.8953931416695533, + "grad_norm": 0.6349419951438904, + "learning_rate": 6.10748959778086e-06, + "loss": 0.0277, + "step": 5623 + }, + { + "epoch": 3.896085902320748, + "grad_norm": 0.6099962592124939, + "learning_rate": 6.106796116504855e-06, + "loss": 0.0303, + "step": 5624 + }, + { + "epoch": 3.8967786629719434, + "grad_norm": 0.6855891346931458, + "learning_rate": 6.106102635228849e-06, + "loss": 0.041, + "step": 5625 + }, + { + "epoch": 3.8974714236231383, + "grad_norm": 0.6275585293769836, + "learning_rate": 6.105409153952844e-06, + "loss": 0.0332, + "step": 5626 + }, + { + "epoch": 3.898164184274333, + "grad_norm": 0.5623182654380798, + "learning_rate": 6.104715672676839e-06, + "loss": 0.0284, + "step": 5627 + }, + { + "epoch": 3.8988569449255284, + "grad_norm": 0.5225732922554016, + "learning_rate": 6.104022191400832e-06, + "loss": 0.0332, + "step": 5628 + }, + { + "epoch": 3.899549705576723, + "grad_norm": 0.7272273302078247, + "learning_rate": 6.103328710124827e-06, + "loss": 0.0357, + "step": 5629 + }, + { + "epoch": 3.900242466227918, + "grad_norm": 0.571682870388031, + "learning_rate": 6.102635228848821e-06, + "loss": 0.0286, + "step": 5630 + }, + { + "epoch": 3.9009352268791133, + "grad_norm": 0.5897750854492188, + "learning_rate": 6.101941747572816e-06, + "loss": 0.0313, + "step": 5631 + }, + { + "epoch": 3.901627987530308, + "grad_norm": 0.839536726474762, + "learning_rate": 6.101248266296811e-06, + "loss": 0.0341, + "step": 5632 + }, + { + "epoch": 3.9023207481815034, + "grad_norm": 0.780576765537262, + "learning_rate": 6.100554785020804e-06, + "loss": 0.0318, + "step": 5633 + }, + { + "epoch": 3.9030135088326983, + "grad_norm": 0.6373487710952759, + "learning_rate": 6.099861303744799e-06, + "loss": 0.027, + "step": 5634 + }, + { + "epoch": 3.9037062694838935, + "grad_norm": 0.584993839263916, + "learning_rate": 6.099167822468793e-06, + "loss": 0.0344, + "step": 5635 + }, + { + "epoch": 3.9043990301350884, + "grad_norm": 0.6027787327766418, + "learning_rate": 6.098474341192788e-06, + "loss": 0.0319, + "step": 5636 + }, + { + "epoch": 3.905091790786283, + "grad_norm": 0.7014255523681641, + "learning_rate": 6.097780859916783e-06, + "loss": 0.0351, + "step": 5637 + }, + { + "epoch": 3.9057845514374785, + "grad_norm": 0.5559835433959961, + "learning_rate": 6.097087378640777e-06, + "loss": 0.0258, + "step": 5638 + }, + { + "epoch": 3.9064773120886733, + "grad_norm": 0.6171395778656006, + "learning_rate": 6.096393897364772e-06, + "loss": 0.0357, + "step": 5639 + }, + { + "epoch": 3.907170072739868, + "grad_norm": 0.6097577810287476, + "learning_rate": 6.095700416088765e-06, + "loss": 0.0272, + "step": 5640 + }, + { + "epoch": 3.9078628333910634, + "grad_norm": 0.7907153367996216, + "learning_rate": 6.09500693481276e-06, + "loss": 0.0379, + "step": 5641 + }, + { + "epoch": 3.9085555940422583, + "grad_norm": 0.5487450361251831, + "learning_rate": 6.094313453536755e-06, + "loss": 0.0334, + "step": 5642 + }, + { + "epoch": 3.9092483546934536, + "grad_norm": 0.638064980506897, + "learning_rate": 6.093619972260749e-06, + "loss": 0.0355, + "step": 5643 + }, + { + "epoch": 3.9099411153446484, + "grad_norm": 0.5636724233627319, + "learning_rate": 6.092926490984744e-06, + "loss": 0.0203, + "step": 5644 + }, + { + "epoch": 3.9106338759958437, + "grad_norm": 0.6418699026107788, + "learning_rate": 6.0922330097087375e-06, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 3.9113266366470385, + "grad_norm": 0.6871115565299988, + "learning_rate": 6.0915395284327325e-06, + "loss": 0.034, + "step": 5646 + }, + { + "epoch": 3.9120193972982333, + "grad_norm": 0.5053454041481018, + "learning_rate": 6.090846047156727e-06, + "loss": 0.0275, + "step": 5647 + }, + { + "epoch": 3.9127121579494286, + "grad_norm": 0.5848159193992615, + "learning_rate": 6.0901525658807215e-06, + "loss": 0.0295, + "step": 5648 + }, + { + "epoch": 3.9134049186006234, + "grad_norm": 0.7460579872131348, + "learning_rate": 6.089459084604716e-06, + "loss": 0.0449, + "step": 5649 + }, + { + "epoch": 3.9140976792518183, + "grad_norm": 0.5983049273490906, + "learning_rate": 6.0887656033287105e-06, + "loss": 0.0366, + "step": 5650 + }, + { + "epoch": 3.9147904399030136, + "grad_norm": 0.6734099984169006, + "learning_rate": 6.0880721220527054e-06, + "loss": 0.0285, + "step": 5651 + }, + { + "epoch": 3.9154832005542084, + "grad_norm": 0.6277661323547363, + "learning_rate": 6.0873786407766995e-06, + "loss": 0.0344, + "step": 5652 + }, + { + "epoch": 3.9161759612054037, + "grad_norm": 0.6390184164047241, + "learning_rate": 6.086685159500694e-06, + "loss": 0.0317, + "step": 5653 + }, + { + "epoch": 3.9168687218565985, + "grad_norm": 0.6505480408668518, + "learning_rate": 6.0859916782246885e-06, + "loss": 0.0344, + "step": 5654 + }, + { + "epoch": 3.917561482507794, + "grad_norm": 0.5460807085037231, + "learning_rate": 6.085298196948683e-06, + "loss": 0.0275, + "step": 5655 + }, + { + "epoch": 3.9182542431589886, + "grad_norm": 0.5947510004043579, + "learning_rate": 6.0846047156726776e-06, + "loss": 0.0348, + "step": 5656 + }, + { + "epoch": 3.9189470038101835, + "grad_norm": 0.8682061433792114, + "learning_rate": 6.0839112343966725e-06, + "loss": 0.0359, + "step": 5657 + }, + { + "epoch": 3.9196397644613787, + "grad_norm": 0.5251128077507019, + "learning_rate": 6.083217753120666e-06, + "loss": 0.026, + "step": 5658 + }, + { + "epoch": 3.9203325251125736, + "grad_norm": 0.7046948671340942, + "learning_rate": 6.082524271844661e-06, + "loss": 0.0352, + "step": 5659 + }, + { + "epoch": 3.9210252857637684, + "grad_norm": 0.5649242997169495, + "learning_rate": 6.081830790568655e-06, + "loss": 0.0302, + "step": 5660 + }, + { + "epoch": 3.9217180464149637, + "grad_norm": 0.6032993197441101, + "learning_rate": 6.08113730929265e-06, + "loss": 0.0334, + "step": 5661 + }, + { + "epoch": 3.9224108070661585, + "grad_norm": 0.5204776525497437, + "learning_rate": 6.080443828016645e-06, + "loss": 0.0209, + "step": 5662 + }, + { + "epoch": 3.923103567717354, + "grad_norm": 0.7001051902770996, + "learning_rate": 6.079750346740638e-06, + "loss": 0.0299, + "step": 5663 + }, + { + "epoch": 3.9237963283685486, + "grad_norm": 0.6807113885879517, + "learning_rate": 6.079056865464633e-06, + "loss": 0.025, + "step": 5664 + }, + { + "epoch": 3.924489089019744, + "grad_norm": 0.6066899299621582, + "learning_rate": 6.078363384188627e-06, + "loss": 0.0269, + "step": 5665 + }, + { + "epoch": 3.9251818496709387, + "grad_norm": 0.843798816204071, + "learning_rate": 6.077669902912622e-06, + "loss": 0.0266, + "step": 5666 + }, + { + "epoch": 3.9258746103221336, + "grad_norm": 0.6966928243637085, + "learning_rate": 6.076976421636617e-06, + "loss": 0.0336, + "step": 5667 + }, + { + "epoch": 3.926567370973329, + "grad_norm": 0.6036196351051331, + "learning_rate": 6.076282940360611e-06, + "loss": 0.0247, + "step": 5668 + }, + { + "epoch": 3.9272601316245237, + "grad_norm": 0.6237078309059143, + "learning_rate": 6.075589459084606e-06, + "loss": 0.0276, + "step": 5669 + }, + { + "epoch": 3.9279528922757185, + "grad_norm": 0.6260361671447754, + "learning_rate": 6.074895977808599e-06, + "loss": 0.0231, + "step": 5670 + }, + { + "epoch": 3.928645652926914, + "grad_norm": 0.7189197540283203, + "learning_rate": 6.074202496532594e-06, + "loss": 0.0404, + "step": 5671 + }, + { + "epoch": 3.9293384135781086, + "grad_norm": 0.585728645324707, + "learning_rate": 6.073509015256589e-06, + "loss": 0.0312, + "step": 5672 + }, + { + "epoch": 3.930031174229304, + "grad_norm": 0.5816975831985474, + "learning_rate": 6.072815533980583e-06, + "loss": 0.0254, + "step": 5673 + }, + { + "epoch": 3.9307239348804988, + "grad_norm": 0.5944647192955017, + "learning_rate": 6.072122052704578e-06, + "loss": 0.028, + "step": 5674 + }, + { + "epoch": 3.931416695531694, + "grad_norm": 0.5816596150398254, + "learning_rate": 6.071428571428571e-06, + "loss": 0.0274, + "step": 5675 + }, + { + "epoch": 3.932109456182889, + "grad_norm": 0.6381893157958984, + "learning_rate": 6.070735090152566e-06, + "loss": 0.0284, + "step": 5676 + }, + { + "epoch": 3.9328022168340837, + "grad_norm": 0.643214762210846, + "learning_rate": 6.070041608876561e-06, + "loss": 0.033, + "step": 5677 + }, + { + "epoch": 3.933494977485279, + "grad_norm": 0.5460039377212524, + "learning_rate": 6.069348127600555e-06, + "loss": 0.0303, + "step": 5678 + }, + { + "epoch": 3.934187738136474, + "grad_norm": 0.6501333117485046, + "learning_rate": 6.06865464632455e-06, + "loss": 0.032, + "step": 5679 + }, + { + "epoch": 3.9348804987876687, + "grad_norm": 0.7149091362953186, + "learning_rate": 6.067961165048544e-06, + "loss": 0.0412, + "step": 5680 + }, + { + "epoch": 3.935573259438864, + "grad_norm": 0.6299799680709839, + "learning_rate": 6.067267683772538e-06, + "loss": 0.0367, + "step": 5681 + }, + { + "epoch": 3.9362660200900588, + "grad_norm": 0.6541127562522888, + "learning_rate": 6.066574202496533e-06, + "loss": 0.033, + "step": 5682 + }, + { + "epoch": 3.936958780741254, + "grad_norm": 0.8621496558189392, + "learning_rate": 6.065880721220527e-06, + "loss": 0.0462, + "step": 5683 + }, + { + "epoch": 3.937651541392449, + "grad_norm": 0.6117715835571289, + "learning_rate": 6.065187239944522e-06, + "loss": 0.0273, + "step": 5684 + }, + { + "epoch": 3.938344302043644, + "grad_norm": 0.6450139284133911, + "learning_rate": 6.064493758668516e-06, + "loss": 0.035, + "step": 5685 + }, + { + "epoch": 3.939037062694839, + "grad_norm": 0.686332643032074, + "learning_rate": 6.063800277392511e-06, + "loss": 0.0334, + "step": 5686 + }, + { + "epoch": 3.939729823346034, + "grad_norm": 0.7327834963798523, + "learning_rate": 6.063106796116506e-06, + "loss": 0.0314, + "step": 5687 + }, + { + "epoch": 3.940422583997229, + "grad_norm": 0.5229851007461548, + "learning_rate": 6.062413314840499e-06, + "loss": 0.0308, + "step": 5688 + }, + { + "epoch": 3.941115344648424, + "grad_norm": 0.5099225044250488, + "learning_rate": 6.061719833564494e-06, + "loss": 0.0271, + "step": 5689 + }, + { + "epoch": 3.941808105299619, + "grad_norm": 0.6219456195831299, + "learning_rate": 6.061026352288488e-06, + "loss": 0.0411, + "step": 5690 + }, + { + "epoch": 3.942500865950814, + "grad_norm": 0.6136319637298584, + "learning_rate": 6.060332871012483e-06, + "loss": 0.0283, + "step": 5691 + }, + { + "epoch": 3.943193626602009, + "grad_norm": 0.7287399172782898, + "learning_rate": 6.059639389736478e-06, + "loss": 0.0325, + "step": 5692 + }, + { + "epoch": 3.943886387253204, + "grad_norm": 0.6488358378410339, + "learning_rate": 6.0589459084604715e-06, + "loss": 0.0352, + "step": 5693 + }, + { + "epoch": 3.944579147904399, + "grad_norm": 0.5832087993621826, + "learning_rate": 6.058252427184466e-06, + "loss": 0.0335, + "step": 5694 + }, + { + "epoch": 3.9452719085555943, + "grad_norm": 0.6404885053634644, + "learning_rate": 6.0575589459084605e-06, + "loss": 0.0391, + "step": 5695 + }, + { + "epoch": 3.945964669206789, + "grad_norm": 0.4916565418243408, + "learning_rate": 6.056865464632455e-06, + "loss": 0.023, + "step": 5696 + }, + { + "epoch": 3.946657429857984, + "grad_norm": 0.9438236355781555, + "learning_rate": 6.05617198335645e-06, + "loss": 0.0406, + "step": 5697 + }, + { + "epoch": 3.9473501905091792, + "grad_norm": 0.5329188704490662, + "learning_rate": 6.0554785020804444e-06, + "loss": 0.0197, + "step": 5698 + }, + { + "epoch": 3.948042951160374, + "grad_norm": 0.6993545889854431, + "learning_rate": 6.054785020804439e-06, + "loss": 0.0333, + "step": 5699 + }, + { + "epoch": 3.948735711811569, + "grad_norm": 0.5379102826118469, + "learning_rate": 6.054091539528433e-06, + "loss": 0.0275, + "step": 5700 + }, + { + "epoch": 3.949428472462764, + "grad_norm": 0.6155920028686523, + "learning_rate": 6.0533980582524275e-06, + "loss": 0.039, + "step": 5701 + }, + { + "epoch": 3.950121233113959, + "grad_norm": 0.6053874492645264, + "learning_rate": 6.0527045769764225e-06, + "loss": 0.0257, + "step": 5702 + }, + { + "epoch": 3.9508139937651543, + "grad_norm": 0.5987370610237122, + "learning_rate": 6.0520110957004166e-06, + "loss": 0.0425, + "step": 5703 + }, + { + "epoch": 3.951506754416349, + "grad_norm": 0.5557329654693604, + "learning_rate": 6.0513176144244115e-06, + "loss": 0.0315, + "step": 5704 + }, + { + "epoch": 3.9521995150675444, + "grad_norm": 0.5767523646354675, + "learning_rate": 6.050624133148405e-06, + "loss": 0.033, + "step": 5705 + }, + { + "epoch": 3.9528922757187392, + "grad_norm": 0.7918155789375305, + "learning_rate": 6.0499306518724e-06, + "loss": 0.0368, + "step": 5706 + }, + { + "epoch": 3.953585036369934, + "grad_norm": 0.7695801258087158, + "learning_rate": 6.049237170596395e-06, + "loss": 0.0321, + "step": 5707 + }, + { + "epoch": 3.9542777970211294, + "grad_norm": 0.8224954009056091, + "learning_rate": 6.048543689320389e-06, + "loss": 0.0368, + "step": 5708 + }, + { + "epoch": 3.954970557672324, + "grad_norm": 0.593601405620575, + "learning_rate": 6.047850208044384e-06, + "loss": 0.0353, + "step": 5709 + }, + { + "epoch": 3.955663318323519, + "grad_norm": 0.74740070104599, + "learning_rate": 6.047156726768377e-06, + "loss": 0.0408, + "step": 5710 + }, + { + "epoch": 3.9563560789747143, + "grad_norm": 0.5545331239700317, + "learning_rate": 6.046463245492372e-06, + "loss": 0.0298, + "step": 5711 + }, + { + "epoch": 3.957048839625909, + "grad_norm": 0.6210556626319885, + "learning_rate": 6.045769764216367e-06, + "loss": 0.0269, + "step": 5712 + }, + { + "epoch": 3.9577416002771044, + "grad_norm": 0.6635833978652954, + "learning_rate": 6.045076282940361e-06, + "loss": 0.0398, + "step": 5713 + }, + { + "epoch": 3.9584343609282993, + "grad_norm": 0.6100125312805176, + "learning_rate": 6.044382801664356e-06, + "loss": 0.0277, + "step": 5714 + }, + { + "epoch": 3.9591271215794945, + "grad_norm": 0.5405066013336182, + "learning_rate": 6.04368932038835e-06, + "loss": 0.0279, + "step": 5715 + }, + { + "epoch": 3.9598198822306894, + "grad_norm": 0.578748881816864, + "learning_rate": 6.042995839112345e-06, + "loss": 0.0274, + "step": 5716 + }, + { + "epoch": 3.960512642881884, + "grad_norm": 0.5417850017547607, + "learning_rate": 6.04230235783634e-06, + "loss": 0.0238, + "step": 5717 + }, + { + "epoch": 3.9612054035330795, + "grad_norm": 0.7624323964118958, + "learning_rate": 6.041608876560333e-06, + "loss": 0.0381, + "step": 5718 + }, + { + "epoch": 3.9618981641842743, + "grad_norm": 0.7578420042991638, + "learning_rate": 6.040915395284328e-06, + "loss": 0.0424, + "step": 5719 + }, + { + "epoch": 3.962590924835469, + "grad_norm": 0.6090701818466187, + "learning_rate": 6.040221914008322e-06, + "loss": 0.0318, + "step": 5720 + }, + { + "epoch": 3.9632836854866644, + "grad_norm": 0.5844273567199707, + "learning_rate": 6.039528432732317e-06, + "loss": 0.0302, + "step": 5721 + }, + { + "epoch": 3.9639764461378593, + "grad_norm": 0.5725448131561279, + "learning_rate": 6.038834951456312e-06, + "loss": 0.0262, + "step": 5722 + }, + { + "epoch": 3.9646692067890545, + "grad_norm": 0.617495596408844, + "learning_rate": 6.038141470180305e-06, + "loss": 0.0311, + "step": 5723 + }, + { + "epoch": 3.9653619674402494, + "grad_norm": 0.5846660733222961, + "learning_rate": 6.0374479889043e-06, + "loss": 0.0287, + "step": 5724 + }, + { + "epoch": 3.9660547280914447, + "grad_norm": 0.5877677202224731, + "learning_rate": 6.036754507628294e-06, + "loss": 0.0356, + "step": 5725 + }, + { + "epoch": 3.9667474887426395, + "grad_norm": 0.5529595613479614, + "learning_rate": 6.036061026352289e-06, + "loss": 0.0244, + "step": 5726 + }, + { + "epoch": 3.9674402493938343, + "grad_norm": 0.7487414479255676, + "learning_rate": 6.035367545076284e-06, + "loss": 0.0382, + "step": 5727 + }, + { + "epoch": 3.9681330100450296, + "grad_norm": 0.5108417868614197, + "learning_rate": 6.034674063800278e-06, + "loss": 0.0224, + "step": 5728 + }, + { + "epoch": 3.9688257706962244, + "grad_norm": 0.5159150958061218, + "learning_rate": 6.033980582524272e-06, + "loss": 0.0208, + "step": 5729 + }, + { + "epoch": 3.9695185313474193, + "grad_norm": 0.6342235803604126, + "learning_rate": 6.033287101248266e-06, + "loss": 0.0402, + "step": 5730 + }, + { + "epoch": 3.9702112919986146, + "grad_norm": 0.6006144881248474, + "learning_rate": 6.032593619972261e-06, + "loss": 0.0293, + "step": 5731 + }, + { + "epoch": 3.9709040526498094, + "grad_norm": 0.5523648858070374, + "learning_rate": 6.031900138696256e-06, + "loss": 0.0206, + "step": 5732 + }, + { + "epoch": 3.9715968133010047, + "grad_norm": 0.5949568748474121, + "learning_rate": 6.03120665742025e-06, + "loss": 0.0271, + "step": 5733 + }, + { + "epoch": 3.9722895739521995, + "grad_norm": 0.5630077123641968, + "learning_rate": 6.030513176144245e-06, + "loss": 0.0252, + "step": 5734 + }, + { + "epoch": 3.972982334603395, + "grad_norm": 0.5368571281433105, + "learning_rate": 6.029819694868238e-06, + "loss": 0.0248, + "step": 5735 + }, + { + "epoch": 3.9736750952545896, + "grad_norm": 0.6003128886222839, + "learning_rate": 6.029126213592233e-06, + "loss": 0.0242, + "step": 5736 + }, + { + "epoch": 3.9743678559057845, + "grad_norm": 0.5548897981643677, + "learning_rate": 6.028432732316228e-06, + "loss": 0.0258, + "step": 5737 + }, + { + "epoch": 3.9750606165569797, + "grad_norm": 0.5407117605209351, + "learning_rate": 6.027739251040222e-06, + "loss": 0.0242, + "step": 5738 + }, + { + "epoch": 3.9757533772081746, + "grad_norm": 0.5861427783966064, + "learning_rate": 6.027045769764217e-06, + "loss": 0.0276, + "step": 5739 + }, + { + "epoch": 3.9764461378593694, + "grad_norm": 0.6128274202346802, + "learning_rate": 6.0263522884882105e-06, + "loss": 0.0321, + "step": 5740 + }, + { + "epoch": 3.9771388985105647, + "grad_norm": 0.7110495567321777, + "learning_rate": 6.025658807212205e-06, + "loss": 0.0382, + "step": 5741 + }, + { + "epoch": 3.9778316591617595, + "grad_norm": 0.634889543056488, + "learning_rate": 6.0249653259362e-06, + "loss": 0.0311, + "step": 5742 + }, + { + "epoch": 3.9785244198129543, + "grad_norm": 0.5484490990638733, + "learning_rate": 6.024271844660194e-06, + "loss": 0.0258, + "step": 5743 + }, + { + "epoch": 3.9792171804641496, + "grad_norm": 0.6471605896949768, + "learning_rate": 6.023578363384189e-06, + "loss": 0.031, + "step": 5744 + }, + { + "epoch": 3.979909941115345, + "grad_norm": 0.6700403094291687, + "learning_rate": 6.0228848821081834e-06, + "loss": 0.0423, + "step": 5745 + }, + { + "epoch": 3.9806027017665397, + "grad_norm": 0.7532503008842468, + "learning_rate": 6.022191400832178e-06, + "loss": 0.029, + "step": 5746 + }, + { + "epoch": 3.9812954624177346, + "grad_norm": 0.49918434023857117, + "learning_rate": 6.021497919556173e-06, + "loss": 0.0241, + "step": 5747 + }, + { + "epoch": 3.98198822306893, + "grad_norm": 0.4753774404525757, + "learning_rate": 6.0208044382801665e-06, + "loss": 0.0238, + "step": 5748 + }, + { + "epoch": 3.9826809837201247, + "grad_norm": 0.6657820343971252, + "learning_rate": 6.0201109570041615e-06, + "loss": 0.0301, + "step": 5749 + }, + { + "epoch": 3.9833737443713195, + "grad_norm": 0.6276349425315857, + "learning_rate": 6.0194174757281556e-06, + "loss": 0.0283, + "step": 5750 + }, + { + "epoch": 3.984066505022515, + "grad_norm": 0.570073127746582, + "learning_rate": 6.0187239944521505e-06, + "loss": 0.0251, + "step": 5751 + }, + { + "epoch": 3.9847592656737096, + "grad_norm": 0.7095314860343933, + "learning_rate": 6.0180305131761454e-06, + "loss": 0.0321, + "step": 5752 + }, + { + "epoch": 3.9854520263249045, + "grad_norm": 0.6330137848854065, + "learning_rate": 6.017337031900139e-06, + "loss": 0.024, + "step": 5753 + }, + { + "epoch": 3.9861447869760998, + "grad_norm": 0.82314133644104, + "learning_rate": 6.016643550624134e-06, + "loss": 0.0357, + "step": 5754 + }, + { + "epoch": 3.986837547627295, + "grad_norm": 0.6464589834213257, + "learning_rate": 6.015950069348128e-06, + "loss": 0.0337, + "step": 5755 + }, + { + "epoch": 3.98753030827849, + "grad_norm": 0.626295268535614, + "learning_rate": 6.015256588072123e-06, + "loss": 0.0375, + "step": 5756 + }, + { + "epoch": 3.9882230689296847, + "grad_norm": 0.6050504446029663, + "learning_rate": 6.0145631067961176e-06, + "loss": 0.0233, + "step": 5757 + }, + { + "epoch": 3.98891582958088, + "grad_norm": 0.7396339774131775, + "learning_rate": 6.013869625520111e-06, + "loss": 0.0584, + "step": 5758 + }, + { + "epoch": 3.989608590232075, + "grad_norm": 0.6037498116493225, + "learning_rate": 6.013176144244106e-06, + "loss": 0.0354, + "step": 5759 + }, + { + "epoch": 3.9903013508832696, + "grad_norm": 0.5753601789474487, + "learning_rate": 6.0124826629681e-06, + "loss": 0.0278, + "step": 5760 + }, + { + "epoch": 3.990994111534465, + "grad_norm": 0.6056987643241882, + "learning_rate": 6.011789181692095e-06, + "loss": 0.0333, + "step": 5761 + }, + { + "epoch": 3.9916868721856598, + "grad_norm": 0.6861843466758728, + "learning_rate": 6.01109570041609e-06, + "loss": 0.041, + "step": 5762 + }, + { + "epoch": 3.9923796328368546, + "grad_norm": 0.6862473487854004, + "learning_rate": 6.010402219140084e-06, + "loss": 0.0395, + "step": 5763 + }, + { + "epoch": 3.99307239348805, + "grad_norm": 0.5396139621734619, + "learning_rate": 6.009708737864079e-06, + "loss": 0.028, + "step": 5764 + }, + { + "epoch": 3.993765154139245, + "grad_norm": 0.6474992632865906, + "learning_rate": 6.009015256588072e-06, + "loss": 0.0335, + "step": 5765 + }, + { + "epoch": 3.99445791479044, + "grad_norm": 0.5092523694038391, + "learning_rate": 6.008321775312067e-06, + "loss": 0.0252, + "step": 5766 + }, + { + "epoch": 3.995150675441635, + "grad_norm": 0.6200748682022095, + "learning_rate": 6.007628294036062e-06, + "loss": 0.0296, + "step": 5767 + }, + { + "epoch": 3.99584343609283, + "grad_norm": 0.6921845078468323, + "learning_rate": 6.006934812760056e-06, + "loss": 0.0336, + "step": 5768 + }, + { + "epoch": 3.996536196744025, + "grad_norm": 0.5785850286483765, + "learning_rate": 6.006241331484051e-06, + "loss": 0.0332, + "step": 5769 + }, + { + "epoch": 3.9972289573952198, + "grad_norm": 0.5929657816886902, + "learning_rate": 6.005547850208044e-06, + "loss": 0.031, + "step": 5770 + }, + { + "epoch": 3.997921718046415, + "grad_norm": 0.6023411750793457, + "learning_rate": 6.004854368932039e-06, + "loss": 0.0233, + "step": 5771 + }, + { + "epoch": 3.99861447869761, + "grad_norm": 0.5986213684082031, + "learning_rate": 6.004160887656034e-06, + "loss": 0.0284, + "step": 5772 + }, + { + "epoch": 3.9993072393488047, + "grad_norm": 0.5923529863357544, + "learning_rate": 6.003467406380028e-06, + "loss": 0.0292, + "step": 5773 + }, + { + "epoch": 4.0, + "grad_norm": 0.5920475721359253, + "learning_rate": 6.002773925104023e-06, + "loss": 0.0331, + "step": 5774 + }, + { + "epoch": 4.0, + "eval_loss": 0.2548733055591583, + "eval_runtime": 7633.7469, + "eval_samples_per_second": 1.048, + "eval_steps_per_second": 0.033, + "eval_wer": 12.593032600719804, + "step": 5774 + }, + { + "epoch": 4.000692760651195, + "grad_norm": 0.44014424085617065, + "learning_rate": 6.002080443828017e-06, + "loss": 0.0137, + "step": 5775 + }, + { + "epoch": 4.00138552130239, + "grad_norm": 0.39017313718795776, + "learning_rate": 6.001386962552012e-06, + "loss": 0.0152, + "step": 5776 + }, + { + "epoch": 4.002078281953585, + "grad_norm": 0.4326973557472229, + "learning_rate": 6.000693481276006e-06, + "loss": 0.0173, + "step": 5777 + }, + { + "epoch": 4.00277104260478, + "grad_norm": 0.36008891463279724, + "learning_rate": 6e-06, + "loss": 0.0132, + "step": 5778 + }, + { + "epoch": 4.003463803255975, + "grad_norm": 0.3663184344768524, + "learning_rate": 5.999306518723995e-06, + "loss": 0.016, + "step": 5779 + }, + { + "epoch": 4.00415656390717, + "grad_norm": 0.6453715562820435, + "learning_rate": 5.998613037447989e-06, + "loss": 0.0268, + "step": 5780 + }, + { + "epoch": 4.004849324558365, + "grad_norm": 0.38540932536125183, + "learning_rate": 5.997919556171984e-06, + "loss": 0.0135, + "step": 5781 + }, + { + "epoch": 4.0055420852095605, + "grad_norm": 0.45824089646339417, + "learning_rate": 5.997226074895979e-06, + "loss": 0.0163, + "step": 5782 + }, + { + "epoch": 4.006234845860755, + "grad_norm": 0.3920636475086212, + "learning_rate": 5.996532593619972e-06, + "loss": 0.0166, + "step": 5783 + }, + { + "epoch": 4.00692760651195, + "grad_norm": 0.5281590223312378, + "learning_rate": 5.995839112343967e-06, + "loss": 0.0178, + "step": 5784 + }, + { + "epoch": 4.007620367163145, + "grad_norm": 0.4874959886074066, + "learning_rate": 5.995145631067961e-06, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 4.00831312781434, + "grad_norm": 0.46124112606048584, + "learning_rate": 5.994452149791956e-06, + "loss": 0.019, + "step": 5786 + }, + { + "epoch": 4.009005888465535, + "grad_norm": 0.5415635108947754, + "learning_rate": 5.993758668515951e-06, + "loss": 0.0197, + "step": 5787 + }, + { + "epoch": 4.00969864911673, + "grad_norm": 0.4696653187274933, + "learning_rate": 5.993065187239944e-06, + "loss": 0.0208, + "step": 5788 + }, + { + "epoch": 4.010391409767925, + "grad_norm": 0.43915730714797974, + "learning_rate": 5.992371705963939e-06, + "loss": 0.0139, + "step": 5789 + }, + { + "epoch": 4.01108417041912, + "grad_norm": 0.5245053768157959, + "learning_rate": 5.9916782246879334e-06, + "loss": 0.0188, + "step": 5790 + }, + { + "epoch": 4.011776931070315, + "grad_norm": 0.4111827313899994, + "learning_rate": 5.990984743411928e-06, + "loss": 0.012, + "step": 5791 + }, + { + "epoch": 4.012469691721511, + "grad_norm": 0.47101888060569763, + "learning_rate": 5.990291262135923e-06, + "loss": 0.022, + "step": 5792 + }, + { + "epoch": 4.013162452372705, + "grad_norm": 0.666670024394989, + "learning_rate": 5.989597780859917e-06, + "loss": 0.0148, + "step": 5793 + }, + { + "epoch": 4.0138552130239, + "grad_norm": 0.3970896899700165, + "learning_rate": 5.988904299583912e-06, + "loss": 0.0146, + "step": 5794 + }, + { + "epoch": 4.0145479736750955, + "grad_norm": 0.5917661190032959, + "learning_rate": 5.9882108183079055e-06, + "loss": 0.0098, + "step": 5795 + }, + { + "epoch": 4.01524073432629, + "grad_norm": 0.5205355286598206, + "learning_rate": 5.9875173370319005e-06, + "loss": 0.0146, + "step": 5796 + }, + { + "epoch": 4.015933494977485, + "grad_norm": 0.4315672218799591, + "learning_rate": 5.986823855755895e-06, + "loss": 0.0136, + "step": 5797 + }, + { + "epoch": 4.0166262556286805, + "grad_norm": 0.47964319586753845, + "learning_rate": 5.9861303744798895e-06, + "loss": 0.0163, + "step": 5798 + }, + { + "epoch": 4.017319016279875, + "grad_norm": 0.35959741473197937, + "learning_rate": 5.9854368932038844e-06, + "loss": 0.0127, + "step": 5799 + }, + { + "epoch": 4.01801177693107, + "grad_norm": 0.43572506308555603, + "learning_rate": 5.984743411927878e-06, + "loss": 0.0159, + "step": 5800 + }, + { + "epoch": 4.018704537582265, + "grad_norm": 0.3837737441062927, + "learning_rate": 5.984049930651873e-06, + "loss": 0.0132, + "step": 5801 + }, + { + "epoch": 4.019397298233461, + "grad_norm": 0.4511842727661133, + "learning_rate": 5.9833564493758675e-06, + "loss": 0.0134, + "step": 5802 + }, + { + "epoch": 4.020090058884655, + "grad_norm": 0.40162840485572815, + "learning_rate": 5.982662968099862e-06, + "loss": 0.0171, + "step": 5803 + }, + { + "epoch": 4.02078281953585, + "grad_norm": 0.4260197579860687, + "learning_rate": 5.9819694868238566e-06, + "loss": 0.0169, + "step": 5804 + }, + { + "epoch": 4.021475580187046, + "grad_norm": 0.4432424008846283, + "learning_rate": 5.981276005547851e-06, + "loss": 0.0156, + "step": 5805 + }, + { + "epoch": 4.02216834083824, + "grad_norm": 0.5178232789039612, + "learning_rate": 5.980582524271845e-06, + "loss": 0.0177, + "step": 5806 + }, + { + "epoch": 4.022861101489435, + "grad_norm": 0.3381485044956207, + "learning_rate": 5.97988904299584e-06, + "loss": 0.0147, + "step": 5807 + }, + { + "epoch": 4.023553862140631, + "grad_norm": 0.54864102602005, + "learning_rate": 5.979195561719834e-06, + "loss": 0.018, + "step": 5808 + }, + { + "epoch": 4.024246622791825, + "grad_norm": 0.36553627252578735, + "learning_rate": 5.978502080443829e-06, + "loss": 0.0157, + "step": 5809 + }, + { + "epoch": 4.02493938344302, + "grad_norm": 0.45925456285476685, + "learning_rate": 5.977808599167823e-06, + "loss": 0.018, + "step": 5810 + }, + { + "epoch": 4.0256321440942155, + "grad_norm": 0.5858983993530273, + "learning_rate": 5.977115117891818e-06, + "loss": 0.0176, + "step": 5811 + }, + { + "epoch": 4.026324904745411, + "grad_norm": 0.40633654594421387, + "learning_rate": 5.976421636615813e-06, + "loss": 0.0163, + "step": 5812 + }, + { + "epoch": 4.027017665396605, + "grad_norm": 0.3767582178115845, + "learning_rate": 5.975728155339806e-06, + "loss": 0.013, + "step": 5813 + }, + { + "epoch": 4.0277104260478005, + "grad_norm": 0.410575807094574, + "learning_rate": 5.975034674063801e-06, + "loss": 0.0146, + "step": 5814 + }, + { + "epoch": 4.028403186698996, + "grad_norm": 0.48538967967033386, + "learning_rate": 5.974341192787795e-06, + "loss": 0.0128, + "step": 5815 + }, + { + "epoch": 4.02909594735019, + "grad_norm": 0.5136532783508301, + "learning_rate": 5.97364771151179e-06, + "loss": 0.0182, + "step": 5816 + }, + { + "epoch": 4.0297887080013854, + "grad_norm": 0.6098598837852478, + "learning_rate": 5.972954230235785e-06, + "loss": 0.0155, + "step": 5817 + }, + { + "epoch": 4.030481468652581, + "grad_norm": 0.46291500329971313, + "learning_rate": 5.972260748959778e-06, + "loss": 0.0182, + "step": 5818 + }, + { + "epoch": 4.031174229303775, + "grad_norm": 0.3335826098918915, + "learning_rate": 5.971567267683773e-06, + "loss": 0.01, + "step": 5819 + }, + { + "epoch": 4.03186698995497, + "grad_norm": 0.44913986325263977, + "learning_rate": 5.970873786407767e-06, + "loss": 0.0137, + "step": 5820 + }, + { + "epoch": 4.032559750606166, + "grad_norm": 0.5580949187278748, + "learning_rate": 5.970180305131762e-06, + "loss": 0.0193, + "step": 5821 + }, + { + "epoch": 4.033252511257361, + "grad_norm": 0.6049192547798157, + "learning_rate": 5.969486823855757e-06, + "loss": 0.0186, + "step": 5822 + }, + { + "epoch": 4.033945271908555, + "grad_norm": 0.46268901228904724, + "learning_rate": 5.968793342579751e-06, + "loss": 0.0119, + "step": 5823 + }, + { + "epoch": 4.034638032559751, + "grad_norm": 0.5266537666320801, + "learning_rate": 5.968099861303746e-06, + "loss": 0.015, + "step": 5824 + }, + { + "epoch": 4.035330793210946, + "grad_norm": 0.5469845533370972, + "learning_rate": 5.967406380027739e-06, + "loss": 0.0236, + "step": 5825 + }, + { + "epoch": 4.03602355386214, + "grad_norm": 0.38151949644088745, + "learning_rate": 5.966712898751734e-06, + "loss": 0.0137, + "step": 5826 + }, + { + "epoch": 4.036716314513336, + "grad_norm": 0.41305768489837646, + "learning_rate": 5.966019417475729e-06, + "loss": 0.0113, + "step": 5827 + }, + { + "epoch": 4.037409075164531, + "grad_norm": 0.41235440969467163, + "learning_rate": 5.965325936199723e-06, + "loss": 0.0156, + "step": 5828 + }, + { + "epoch": 4.038101835815725, + "grad_norm": 0.713265597820282, + "learning_rate": 5.964632454923718e-06, + "loss": 0.0226, + "step": 5829 + }, + { + "epoch": 4.0387945964669205, + "grad_norm": 0.6062613129615784, + "learning_rate": 5.963938973647711e-06, + "loss": 0.025, + "step": 5830 + }, + { + "epoch": 4.039487357118116, + "grad_norm": 0.7014955282211304, + "learning_rate": 5.963245492371706e-06, + "loss": 0.0182, + "step": 5831 + }, + { + "epoch": 4.040180117769311, + "grad_norm": 0.534320592880249, + "learning_rate": 5.962552011095701e-06, + "loss": 0.0194, + "step": 5832 + }, + { + "epoch": 4.0408728784205055, + "grad_norm": 0.3863247036933899, + "learning_rate": 5.961858529819695e-06, + "loss": 0.0105, + "step": 5833 + }, + { + "epoch": 4.041565639071701, + "grad_norm": 0.5409172177314758, + "learning_rate": 5.96116504854369e-06, + "loss": 0.0148, + "step": 5834 + }, + { + "epoch": 4.042258399722896, + "grad_norm": 0.4131470024585724, + "learning_rate": 5.960471567267684e-06, + "loss": 0.0147, + "step": 5835 + }, + { + "epoch": 4.04295116037409, + "grad_norm": 0.46925875544548035, + "learning_rate": 5.959778085991678e-06, + "loss": 0.0175, + "step": 5836 + }, + { + "epoch": 4.043643921025286, + "grad_norm": 0.49055975675582886, + "learning_rate": 5.959084604715673e-06, + "loss": 0.0113, + "step": 5837 + }, + { + "epoch": 4.044336681676481, + "grad_norm": 0.4608907103538513, + "learning_rate": 5.958391123439667e-06, + "loss": 0.0121, + "step": 5838 + }, + { + "epoch": 4.045029442327675, + "grad_norm": 0.41653043031692505, + "learning_rate": 5.957697642163662e-06, + "loss": 0.0171, + "step": 5839 + }, + { + "epoch": 4.045722202978871, + "grad_norm": 0.45659032464027405, + "learning_rate": 5.957004160887656e-06, + "loss": 0.0184, + "step": 5840 + }, + { + "epoch": 4.046414963630066, + "grad_norm": 0.43281108140945435, + "learning_rate": 5.956310679611651e-06, + "loss": 0.0146, + "step": 5841 + }, + { + "epoch": 4.047107724281261, + "grad_norm": 0.3644174039363861, + "learning_rate": 5.955617198335646e-06, + "loss": 0.0115, + "step": 5842 + }, + { + "epoch": 4.047800484932456, + "grad_norm": 0.6167822480201721, + "learning_rate": 5.9549237170596395e-06, + "loss": 0.0189, + "step": 5843 + }, + { + "epoch": 4.048493245583651, + "grad_norm": 0.48649120330810547, + "learning_rate": 5.954230235783634e-06, + "loss": 0.0189, + "step": 5844 + }, + { + "epoch": 4.049186006234846, + "grad_norm": 0.5015866756439209, + "learning_rate": 5.9535367545076285e-06, + "loss": 0.0223, + "step": 5845 + }, + { + "epoch": 4.0498787668860405, + "grad_norm": 0.44310736656188965, + "learning_rate": 5.9528432732316234e-06, + "loss": 0.0129, + "step": 5846 + }, + { + "epoch": 4.050571527537236, + "grad_norm": 0.3944130837917328, + "learning_rate": 5.952149791955618e-06, + "loss": 0.0159, + "step": 5847 + }, + { + "epoch": 4.051264288188431, + "grad_norm": 0.6501338481903076, + "learning_rate": 5.951456310679612e-06, + "loss": 0.0177, + "step": 5848 + }, + { + "epoch": 4.0519570488396255, + "grad_norm": 0.44322100281715393, + "learning_rate": 5.9507628294036065e-06, + "loss": 0.0107, + "step": 5849 + }, + { + "epoch": 4.052649809490821, + "grad_norm": 0.4589598774909973, + "learning_rate": 5.950069348127601e-06, + "loss": 0.015, + "step": 5850 + }, + { + "epoch": 4.053342570142016, + "grad_norm": 0.5213820934295654, + "learning_rate": 5.9493758668515956e-06, + "loss": 0.0134, + "step": 5851 + }, + { + "epoch": 4.054035330793211, + "grad_norm": 0.4292052388191223, + "learning_rate": 5.9486823855755905e-06, + "loss": 0.0139, + "step": 5852 + }, + { + "epoch": 4.054728091444406, + "grad_norm": 0.3794039189815521, + "learning_rate": 5.947988904299585e-06, + "loss": 0.0163, + "step": 5853 + }, + { + "epoch": 4.055420852095601, + "grad_norm": 0.3504757285118103, + "learning_rate": 5.9472954230235795e-06, + "loss": 0.0111, + "step": 5854 + }, + { + "epoch": 4.056113612746796, + "grad_norm": 0.29840224981307983, + "learning_rate": 5.946601941747573e-06, + "loss": 0.0098, + "step": 5855 + }, + { + "epoch": 4.056806373397991, + "grad_norm": 0.565812885761261, + "learning_rate": 5.945908460471568e-06, + "loss": 0.0189, + "step": 5856 + }, + { + "epoch": 4.057499134049186, + "grad_norm": 0.3221192955970764, + "learning_rate": 5.945214979195563e-06, + "loss": 0.0116, + "step": 5857 + }, + { + "epoch": 4.058191894700381, + "grad_norm": 0.5720316767692566, + "learning_rate": 5.944521497919557e-06, + "loss": 0.0142, + "step": 5858 + }, + { + "epoch": 4.058884655351576, + "grad_norm": 0.5223294496536255, + "learning_rate": 5.943828016643552e-06, + "loss": 0.0197, + "step": 5859 + }, + { + "epoch": 4.059577416002771, + "grad_norm": 0.3791462779045105, + "learning_rate": 5.943134535367545e-06, + "loss": 0.0153, + "step": 5860 + }, + { + "epoch": 4.060270176653966, + "grad_norm": 0.3745465576648712, + "learning_rate": 5.94244105409154e-06, + "loss": 0.0129, + "step": 5861 + }, + { + "epoch": 4.0609629373051614, + "grad_norm": 0.46460044384002686, + "learning_rate": 5.941747572815535e-06, + "loss": 0.0129, + "step": 5862 + }, + { + "epoch": 4.061655697956356, + "grad_norm": 0.40892907977104187, + "learning_rate": 5.941054091539529e-06, + "loss": 0.0159, + "step": 5863 + }, + { + "epoch": 4.062348458607551, + "grad_norm": 0.41356760263442993, + "learning_rate": 5.940360610263524e-06, + "loss": 0.0145, + "step": 5864 + }, + { + "epoch": 4.063041219258746, + "grad_norm": 0.43089690804481506, + "learning_rate": 5.939667128987517e-06, + "loss": 0.0185, + "step": 5865 + }, + { + "epoch": 4.063733979909941, + "grad_norm": 0.36338141560554504, + "learning_rate": 5.938973647711512e-06, + "loss": 0.0134, + "step": 5866 + }, + { + "epoch": 4.064426740561136, + "grad_norm": 0.5414767265319824, + "learning_rate": 5.938280166435507e-06, + "loss": 0.0176, + "step": 5867 + }, + { + "epoch": 4.065119501212331, + "grad_norm": 0.6014214158058167, + "learning_rate": 5.937586685159501e-06, + "loss": 0.018, + "step": 5868 + }, + { + "epoch": 4.065812261863526, + "grad_norm": 0.4352724254131317, + "learning_rate": 5.936893203883496e-06, + "loss": 0.0147, + "step": 5869 + }, + { + "epoch": 4.066505022514721, + "grad_norm": 0.45042163133621216, + "learning_rate": 5.93619972260749e-06, + "loss": 0.0153, + "step": 5870 + }, + { + "epoch": 4.067197783165916, + "grad_norm": 0.39651691913604736, + "learning_rate": 5.935506241331485e-06, + "loss": 0.0122, + "step": 5871 + }, + { + "epoch": 4.067890543817112, + "grad_norm": 0.5998177528381348, + "learning_rate": 5.93481276005548e-06, + "loss": 0.0136, + "step": 5872 + }, + { + "epoch": 4.068583304468306, + "grad_norm": 0.46614235639572144, + "learning_rate": 5.934119278779473e-06, + "loss": 0.0132, + "step": 5873 + }, + { + "epoch": 4.069276065119501, + "grad_norm": 0.3535293638706207, + "learning_rate": 5.933425797503468e-06, + "loss": 0.0116, + "step": 5874 + }, + { + "epoch": 4.0699688257706965, + "grad_norm": 0.45569026470184326, + "learning_rate": 5.932732316227462e-06, + "loss": 0.0139, + "step": 5875 + }, + { + "epoch": 4.070661586421891, + "grad_norm": 0.4665956497192383, + "learning_rate": 5.932038834951457e-06, + "loss": 0.0161, + "step": 5876 + }, + { + "epoch": 4.071354347073086, + "grad_norm": 0.4282681345939636, + "learning_rate": 5.931345353675452e-06, + "loss": 0.0152, + "step": 5877 + }, + { + "epoch": 4.0720471077242815, + "grad_norm": 0.48855772614479065, + "learning_rate": 5.930651872399445e-06, + "loss": 0.0133, + "step": 5878 + }, + { + "epoch": 4.072739868375476, + "grad_norm": 0.44515877962112427, + "learning_rate": 5.92995839112344e-06, + "loss": 0.0164, + "step": 5879 + }, + { + "epoch": 4.073432629026671, + "grad_norm": 0.5303168892860413, + "learning_rate": 5.929264909847434e-06, + "loss": 0.0168, + "step": 5880 + }, + { + "epoch": 4.074125389677866, + "grad_norm": 0.3862502872943878, + "learning_rate": 5.928571428571429e-06, + "loss": 0.0105, + "step": 5881 + }, + { + "epoch": 4.074818150329062, + "grad_norm": 0.3936847150325775, + "learning_rate": 5.927877947295424e-06, + "loss": 0.0157, + "step": 5882 + }, + { + "epoch": 4.075510910980256, + "grad_norm": 0.4762272238731384, + "learning_rate": 5.927184466019418e-06, + "loss": 0.0137, + "step": 5883 + }, + { + "epoch": 4.076203671631451, + "grad_norm": 0.5180473923683167, + "learning_rate": 5.926490984743412e-06, + "loss": 0.0229, + "step": 5884 + }, + { + "epoch": 4.076896432282647, + "grad_norm": 0.40103399753570557, + "learning_rate": 5.925797503467406e-06, + "loss": 0.0099, + "step": 5885 + }, + { + "epoch": 4.077589192933841, + "grad_norm": 0.5800018310546875, + "learning_rate": 5.925104022191401e-06, + "loss": 0.0117, + "step": 5886 + }, + { + "epoch": 4.078281953585036, + "grad_norm": 0.39674970507621765, + "learning_rate": 5.924410540915396e-06, + "loss": 0.0156, + "step": 5887 + }, + { + "epoch": 4.078974714236232, + "grad_norm": 0.43166518211364746, + "learning_rate": 5.92371705963939e-06, + "loss": 0.0113, + "step": 5888 + }, + { + "epoch": 4.079667474887426, + "grad_norm": 0.5330915451049805, + "learning_rate": 5.923023578363385e-06, + "loss": 0.0142, + "step": 5889 + }, + { + "epoch": 4.080360235538621, + "grad_norm": 0.40920907258987427, + "learning_rate": 5.9223300970873785e-06, + "loss": 0.012, + "step": 5890 + }, + { + "epoch": 4.0810529961898165, + "grad_norm": 0.3786559998989105, + "learning_rate": 5.921636615811373e-06, + "loss": 0.0141, + "step": 5891 + }, + { + "epoch": 4.081745756841012, + "grad_norm": 0.4738898277282715, + "learning_rate": 5.920943134535368e-06, + "loss": 0.013, + "step": 5892 + }, + { + "epoch": 4.082438517492206, + "grad_norm": 0.3698556125164032, + "learning_rate": 5.9202496532593624e-06, + "loss": 0.0139, + "step": 5893 + }, + { + "epoch": 4.0831312781434015, + "grad_norm": 0.4169827401638031, + "learning_rate": 5.919556171983357e-06, + "loss": 0.0156, + "step": 5894 + }, + { + "epoch": 4.083824038794597, + "grad_norm": 0.3827657401561737, + "learning_rate": 5.918862690707351e-06, + "loss": 0.0101, + "step": 5895 + }, + { + "epoch": 4.084516799445791, + "grad_norm": 0.4681031107902527, + "learning_rate": 5.9181692094313455e-06, + "loss": 0.0175, + "step": 5896 + }, + { + "epoch": 4.085209560096986, + "grad_norm": 0.6041281819343567, + "learning_rate": 5.9174757281553405e-06, + "loss": 0.0163, + "step": 5897 + }, + { + "epoch": 4.085902320748182, + "grad_norm": 0.44216060638427734, + "learning_rate": 5.9167822468793346e-06, + "loss": 0.0158, + "step": 5898 + }, + { + "epoch": 4.086595081399376, + "grad_norm": 0.3780629634857178, + "learning_rate": 5.9160887656033295e-06, + "loss": 0.0109, + "step": 5899 + }, + { + "epoch": 4.087287842050571, + "grad_norm": 0.3870496153831482, + "learning_rate": 5.915395284327324e-06, + "loss": 0.0169, + "step": 5900 + }, + { + "epoch": 4.087980602701767, + "grad_norm": 0.4418138563632965, + "learning_rate": 5.9147018030513185e-06, + "loss": 0.0129, + "step": 5901 + }, + { + "epoch": 4.088673363352962, + "grad_norm": 0.3675566017627716, + "learning_rate": 5.9140083217753134e-06, + "loss": 0.0138, + "step": 5902 + }, + { + "epoch": 4.089366124004156, + "grad_norm": 0.691497266292572, + "learning_rate": 5.913314840499307e-06, + "loss": 0.016, + "step": 5903 + }, + { + "epoch": 4.090058884655352, + "grad_norm": 0.4711749255657196, + "learning_rate": 5.912621359223302e-06, + "loss": 0.0209, + "step": 5904 + }, + { + "epoch": 4.090751645306547, + "grad_norm": 0.3874192535877228, + "learning_rate": 5.911927877947296e-06, + "loss": 0.0104, + "step": 5905 + }, + { + "epoch": 4.091444405957741, + "grad_norm": 0.42109841108322144, + "learning_rate": 5.911234396671291e-06, + "loss": 0.0173, + "step": 5906 + }, + { + "epoch": 4.092137166608937, + "grad_norm": 0.3346777558326721, + "learning_rate": 5.9105409153952856e-06, + "loss": 0.0113, + "step": 5907 + }, + { + "epoch": 4.092829927260132, + "grad_norm": 0.41182124614715576, + "learning_rate": 5.909847434119279e-06, + "loss": 0.0149, + "step": 5908 + }, + { + "epoch": 4.093522687911326, + "grad_norm": 0.36244627833366394, + "learning_rate": 5.909153952843274e-06, + "loss": 0.0123, + "step": 5909 + }, + { + "epoch": 4.0942154485625215, + "grad_norm": 0.4372624158859253, + "learning_rate": 5.908460471567268e-06, + "loss": 0.0126, + "step": 5910 + }, + { + "epoch": 4.094908209213717, + "grad_norm": 0.3828391432762146, + "learning_rate": 5.907766990291263e-06, + "loss": 0.0143, + "step": 5911 + }, + { + "epoch": 4.095600969864912, + "grad_norm": 0.45144957304000854, + "learning_rate": 5.907073509015258e-06, + "loss": 0.0115, + "step": 5912 + }, + { + "epoch": 4.0962937305161065, + "grad_norm": 0.5366044044494629, + "learning_rate": 5.906380027739251e-06, + "loss": 0.0178, + "step": 5913 + }, + { + "epoch": 4.096986491167302, + "grad_norm": 0.37477147579193115, + "learning_rate": 5.905686546463246e-06, + "loss": 0.0153, + "step": 5914 + }, + { + "epoch": 4.097679251818497, + "grad_norm": 0.42522063851356506, + "learning_rate": 5.90499306518724e-06, + "loss": 0.0128, + "step": 5915 + }, + { + "epoch": 4.098372012469691, + "grad_norm": 0.41694512963294983, + "learning_rate": 5.904299583911235e-06, + "loss": 0.0111, + "step": 5916 + }, + { + "epoch": 4.099064773120887, + "grad_norm": 0.5273397564888, + "learning_rate": 5.90360610263523e-06, + "loss": 0.0165, + "step": 5917 + }, + { + "epoch": 4.099757533772082, + "grad_norm": 0.5179102420806885, + "learning_rate": 5.902912621359224e-06, + "loss": 0.0193, + "step": 5918 + }, + { + "epoch": 4.100450294423276, + "grad_norm": 0.5258399844169617, + "learning_rate": 5.902219140083219e-06, + "loss": 0.0214, + "step": 5919 + }, + { + "epoch": 4.101143055074472, + "grad_norm": 0.43828997015953064, + "learning_rate": 5.901525658807212e-06, + "loss": 0.0153, + "step": 5920 + }, + { + "epoch": 4.101835815725667, + "grad_norm": 0.35232213139533997, + "learning_rate": 5.900832177531207e-06, + "loss": 0.011, + "step": 5921 + }, + { + "epoch": 4.102528576376862, + "grad_norm": 0.43267562985420227, + "learning_rate": 5.900138696255202e-06, + "loss": 0.0159, + "step": 5922 + }, + { + "epoch": 4.103221337028057, + "grad_norm": 0.4062022566795349, + "learning_rate": 5.899445214979196e-06, + "loss": 0.0197, + "step": 5923 + }, + { + "epoch": 4.103914097679252, + "grad_norm": 0.3769342005252838, + "learning_rate": 5.898751733703191e-06, + "loss": 0.0165, + "step": 5924 + }, + { + "epoch": 4.104606858330447, + "grad_norm": 0.37129485607147217, + "learning_rate": 5.898058252427184e-06, + "loss": 0.0132, + "step": 5925 + }, + { + "epoch": 4.1052996189816415, + "grad_norm": 0.48331621289253235, + "learning_rate": 5.897364771151179e-06, + "loss": 0.0231, + "step": 5926 + }, + { + "epoch": 4.105992379632837, + "grad_norm": 0.4839484393596649, + "learning_rate": 5.896671289875174e-06, + "loss": 0.0174, + "step": 5927 + }, + { + "epoch": 4.106685140284032, + "grad_norm": 0.47911614179611206, + "learning_rate": 5.895977808599168e-06, + "loss": 0.0145, + "step": 5928 + }, + { + "epoch": 4.1073779009352265, + "grad_norm": 0.5056118369102478, + "learning_rate": 5.895284327323163e-06, + "loss": 0.0174, + "step": 5929 + }, + { + "epoch": 4.108070661586422, + "grad_norm": 0.6311521530151367, + "learning_rate": 5.894590846047157e-06, + "loss": 0.0153, + "step": 5930 + }, + { + "epoch": 4.108763422237617, + "grad_norm": 0.4103589653968811, + "learning_rate": 5.893897364771152e-06, + "loss": 0.015, + "step": 5931 + }, + { + "epoch": 4.109456182888812, + "grad_norm": 0.3977261781692505, + "learning_rate": 5.893203883495146e-06, + "loss": 0.0172, + "step": 5932 + }, + { + "epoch": 4.110148943540007, + "grad_norm": 0.6271304488182068, + "learning_rate": 5.89251040221914e-06, + "loss": 0.0177, + "step": 5933 + }, + { + "epoch": 4.110841704191202, + "grad_norm": 0.3656662106513977, + "learning_rate": 5.891816920943135e-06, + "loss": 0.0122, + "step": 5934 + }, + { + "epoch": 4.111534464842397, + "grad_norm": 0.4223005473613739, + "learning_rate": 5.891123439667129e-06, + "loss": 0.0159, + "step": 5935 + }, + { + "epoch": 4.112227225493592, + "grad_norm": 0.38988810777664185, + "learning_rate": 5.890429958391124e-06, + "loss": 0.0112, + "step": 5936 + }, + { + "epoch": 4.112919986144787, + "grad_norm": 0.4237985908985138, + "learning_rate": 5.889736477115119e-06, + "loss": 0.0155, + "step": 5937 + }, + { + "epoch": 4.113612746795982, + "grad_norm": 0.4080210030078888, + "learning_rate": 5.889042995839112e-06, + "loss": 0.0137, + "step": 5938 + }, + { + "epoch": 4.114305507447177, + "grad_norm": 0.5066666007041931, + "learning_rate": 5.888349514563107e-06, + "loss": 0.0231, + "step": 5939 + }, + { + "epoch": 4.114998268098372, + "grad_norm": 0.3873167932033539, + "learning_rate": 5.8876560332871014e-06, + "loss": 0.0169, + "step": 5940 + }, + { + "epoch": 4.115691028749567, + "grad_norm": 0.3531154692173004, + "learning_rate": 5.886962552011096e-06, + "loss": 0.0123, + "step": 5941 + }, + { + "epoch": 4.116383789400762, + "grad_norm": 0.4736908972263336, + "learning_rate": 5.886269070735091e-06, + "loss": 0.0173, + "step": 5942 + }, + { + "epoch": 4.117076550051957, + "grad_norm": 0.500149667263031, + "learning_rate": 5.8855755894590845e-06, + "loss": 0.0138, + "step": 5943 + }, + { + "epoch": 4.117769310703152, + "grad_norm": 0.43999993801116943, + "learning_rate": 5.8848821081830795e-06, + "loss": 0.0157, + "step": 5944 + }, + { + "epoch": 4.118462071354347, + "grad_norm": 0.4465121626853943, + "learning_rate": 5.8841886269070736e-06, + "loss": 0.0197, + "step": 5945 + }, + { + "epoch": 4.119154832005542, + "grad_norm": 0.36826783418655396, + "learning_rate": 5.8834951456310685e-06, + "loss": 0.0118, + "step": 5946 + }, + { + "epoch": 4.119847592656737, + "grad_norm": 0.3605504631996155, + "learning_rate": 5.8828016643550634e-06, + "loss": 0.012, + "step": 5947 + }, + { + "epoch": 4.120540353307932, + "grad_norm": 0.4197753667831421, + "learning_rate": 5.8821081830790575e-06, + "loss": 0.0152, + "step": 5948 + }, + { + "epoch": 4.121233113959127, + "grad_norm": 0.40580058097839355, + "learning_rate": 5.8814147018030524e-06, + "loss": 0.0148, + "step": 5949 + }, + { + "epoch": 4.121925874610322, + "grad_norm": 0.46573224663734436, + "learning_rate": 5.880721220527046e-06, + "loss": 0.0114, + "step": 5950 + }, + { + "epoch": 4.122618635261517, + "grad_norm": 0.49255266785621643, + "learning_rate": 5.880027739251041e-06, + "loss": 0.013, + "step": 5951 + }, + { + "epoch": 4.123311395912713, + "grad_norm": 0.500079333782196, + "learning_rate": 5.8793342579750356e-06, + "loss": 0.0163, + "step": 5952 + }, + { + "epoch": 4.124004156563907, + "grad_norm": 0.3782138526439667, + "learning_rate": 5.87864077669903e-06, + "loss": 0.0145, + "step": 5953 + }, + { + "epoch": 4.124696917215102, + "grad_norm": 0.6033086776733398, + "learning_rate": 5.8779472954230246e-06, + "loss": 0.0214, + "step": 5954 + }, + { + "epoch": 4.1253896778662975, + "grad_norm": 0.5711798667907715, + "learning_rate": 5.877253814147018e-06, + "loss": 0.0181, + "step": 5955 + }, + { + "epoch": 4.126082438517492, + "grad_norm": 0.4918570816516876, + "learning_rate": 5.876560332871013e-06, + "loss": 0.019, + "step": 5956 + }, + { + "epoch": 4.126775199168687, + "grad_norm": 0.4438480734825134, + "learning_rate": 5.875866851595008e-06, + "loss": 0.0181, + "step": 5957 + }, + { + "epoch": 4.1274679598198825, + "grad_norm": 0.5217729210853577, + "learning_rate": 5.875173370319002e-06, + "loss": 0.0111, + "step": 5958 + }, + { + "epoch": 4.128160720471077, + "grad_norm": 0.48910391330718994, + "learning_rate": 5.874479889042997e-06, + "loss": 0.0181, + "step": 5959 + }, + { + "epoch": 4.128853481122272, + "grad_norm": 0.42340871691703796, + "learning_rate": 5.873786407766991e-06, + "loss": 0.0141, + "step": 5960 + }, + { + "epoch": 4.129546241773467, + "grad_norm": 0.39708080887794495, + "learning_rate": 5.873092926490985e-06, + "loss": 0.0133, + "step": 5961 + }, + { + "epoch": 4.130239002424663, + "grad_norm": 1.0382578372955322, + "learning_rate": 5.87239944521498e-06, + "loss": 0.0222, + "step": 5962 + }, + { + "epoch": 4.130931763075857, + "grad_norm": 0.428201287984848, + "learning_rate": 5.871705963938974e-06, + "loss": 0.0126, + "step": 5963 + }, + { + "epoch": 4.131624523727052, + "grad_norm": 0.34670788049697876, + "learning_rate": 5.871012482662969e-06, + "loss": 0.0135, + "step": 5964 + }, + { + "epoch": 4.132317284378248, + "grad_norm": 0.4749477505683899, + "learning_rate": 5.870319001386963e-06, + "loss": 0.012, + "step": 5965 + }, + { + "epoch": 4.133010045029442, + "grad_norm": 0.6095417141914368, + "learning_rate": 5.869625520110958e-06, + "loss": 0.0234, + "step": 5966 + }, + { + "epoch": 4.133702805680637, + "grad_norm": 0.4395025372505188, + "learning_rate": 5.868932038834953e-06, + "loss": 0.0172, + "step": 5967 + }, + { + "epoch": 4.134395566331833, + "grad_norm": 0.4887484014034271, + "learning_rate": 5.868238557558946e-06, + "loss": 0.0198, + "step": 5968 + }, + { + "epoch": 4.135088326983027, + "grad_norm": 0.3763115406036377, + "learning_rate": 5.867545076282941e-06, + "loss": 0.0164, + "step": 5969 + }, + { + "epoch": 4.135781087634222, + "grad_norm": 0.3768296241760254, + "learning_rate": 5.866851595006935e-06, + "loss": 0.0129, + "step": 5970 + }, + { + "epoch": 4.1364738482854175, + "grad_norm": 1.0978857278823853, + "learning_rate": 5.86615811373093e-06, + "loss": 0.0154, + "step": 5971 + }, + { + "epoch": 4.137166608936613, + "grad_norm": 0.5517410039901733, + "learning_rate": 5.865464632454925e-06, + "loss": 0.0157, + "step": 5972 + }, + { + "epoch": 4.137859369587807, + "grad_norm": 0.4258527159690857, + "learning_rate": 5.864771151178918e-06, + "loss": 0.0096, + "step": 5973 + }, + { + "epoch": 4.1385521302390025, + "grad_norm": 0.501067578792572, + "learning_rate": 5.864077669902913e-06, + "loss": 0.011, + "step": 5974 + }, + { + "epoch": 4.139244890890198, + "grad_norm": 0.42069071531295776, + "learning_rate": 5.863384188626907e-06, + "loss": 0.012, + "step": 5975 + }, + { + "epoch": 4.139937651541392, + "grad_norm": 0.4824422001838684, + "learning_rate": 5.862690707350902e-06, + "loss": 0.0173, + "step": 5976 + }, + { + "epoch": 4.140630412192587, + "grad_norm": 0.46342602372169495, + "learning_rate": 5.861997226074897e-06, + "loss": 0.0163, + "step": 5977 + }, + { + "epoch": 4.141323172843783, + "grad_norm": 0.3886893689632416, + "learning_rate": 5.861303744798891e-06, + "loss": 0.0111, + "step": 5978 + }, + { + "epoch": 4.142015933494977, + "grad_norm": 0.5579688549041748, + "learning_rate": 5.860610263522886e-06, + "loss": 0.0156, + "step": 5979 + }, + { + "epoch": 4.142708694146172, + "grad_norm": 0.434151291847229, + "learning_rate": 5.859916782246879e-06, + "loss": 0.0139, + "step": 5980 + }, + { + "epoch": 4.143401454797368, + "grad_norm": 0.42820703983306885, + "learning_rate": 5.859223300970874e-06, + "loss": 0.0165, + "step": 5981 + }, + { + "epoch": 4.144094215448563, + "grad_norm": 0.3862599730491638, + "learning_rate": 5.858529819694869e-06, + "loss": 0.0112, + "step": 5982 + }, + { + "epoch": 4.144786976099757, + "grad_norm": 0.5400574803352356, + "learning_rate": 5.857836338418863e-06, + "loss": 0.0122, + "step": 5983 + }, + { + "epoch": 4.145479736750953, + "grad_norm": 0.47512558102607727, + "learning_rate": 5.857142857142858e-06, + "loss": 0.0181, + "step": 5984 + }, + { + "epoch": 4.146172497402148, + "grad_norm": 0.4970700442790985, + "learning_rate": 5.856449375866851e-06, + "loss": 0.0146, + "step": 5985 + }, + { + "epoch": 4.146865258053342, + "grad_norm": 0.40132901072502136, + "learning_rate": 5.855755894590846e-06, + "loss": 0.0126, + "step": 5986 + }, + { + "epoch": 4.1475580187045376, + "grad_norm": 0.4725714921951294, + "learning_rate": 5.855062413314841e-06, + "loss": 0.015, + "step": 5987 + }, + { + "epoch": 4.148250779355733, + "grad_norm": 0.397342324256897, + "learning_rate": 5.854368932038835e-06, + "loss": 0.019, + "step": 5988 + }, + { + "epoch": 4.148943540006927, + "grad_norm": 0.42282554507255554, + "learning_rate": 5.85367545076283e-06, + "loss": 0.0112, + "step": 5989 + }, + { + "epoch": 4.1496363006581225, + "grad_norm": 0.43358930945396423, + "learning_rate": 5.8529819694868235e-06, + "loss": 0.0139, + "step": 5990 + }, + { + "epoch": 4.150329061309318, + "grad_norm": 0.45167842507362366, + "learning_rate": 5.8522884882108185e-06, + "loss": 0.0209, + "step": 5991 + }, + { + "epoch": 4.151021821960513, + "grad_norm": 0.4032903015613556, + "learning_rate": 5.851595006934813e-06, + "loss": 0.0144, + "step": 5992 + }, + { + "epoch": 4.1517145826117074, + "grad_norm": 0.35296830534935, + "learning_rate": 5.8509015256588075e-06, + "loss": 0.0112, + "step": 5993 + }, + { + "epoch": 4.152407343262903, + "grad_norm": 0.5063374042510986, + "learning_rate": 5.8502080443828024e-06, + "loss": 0.0219, + "step": 5994 + }, + { + "epoch": 4.153100103914098, + "grad_norm": 0.4046128988265991, + "learning_rate": 5.8495145631067965e-06, + "loss": 0.0175, + "step": 5995 + }, + { + "epoch": 4.153792864565292, + "grad_norm": 0.630039632320404, + "learning_rate": 5.8488210818307915e-06, + "loss": 0.017, + "step": 5996 + }, + { + "epoch": 4.154485625216488, + "grad_norm": 0.4538635015487671, + "learning_rate": 5.848127600554786e-06, + "loss": 0.013, + "step": 5997 + }, + { + "epoch": 4.155178385867683, + "grad_norm": 0.40618401765823364, + "learning_rate": 5.84743411927878e-06, + "loss": 0.0129, + "step": 5998 + }, + { + "epoch": 4.155871146518877, + "grad_norm": 0.4100301265716553, + "learning_rate": 5.8467406380027746e-06, + "loss": 0.0168, + "step": 5999 + }, + { + "epoch": 4.156563907170073, + "grad_norm": 0.41093727946281433, + "learning_rate": 5.846047156726769e-06, + "loss": 0.0088, + "step": 6000 + }, + { + "epoch": 4.157256667821268, + "grad_norm": 0.4890499711036682, + "learning_rate": 5.8453536754507636e-06, + "loss": 0.0145, + "step": 6001 + }, + { + "epoch": 4.157949428472463, + "grad_norm": 0.5121679306030273, + "learning_rate": 5.8446601941747585e-06, + "loss": 0.0153, + "step": 6002 + }, + { + "epoch": 4.158642189123658, + "grad_norm": 0.36372610926628113, + "learning_rate": 5.843966712898752e-06, + "loss": 0.0101, + "step": 6003 + }, + { + "epoch": 4.159334949774853, + "grad_norm": 0.4928336441516876, + "learning_rate": 5.843273231622747e-06, + "loss": 0.018, + "step": 6004 + }, + { + "epoch": 4.160027710426048, + "grad_norm": 0.3554008901119232, + "learning_rate": 5.842579750346741e-06, + "loss": 0.0109, + "step": 6005 + }, + { + "epoch": 4.1607204710772425, + "grad_norm": 0.6662534475326538, + "learning_rate": 5.841886269070736e-06, + "loss": 0.0213, + "step": 6006 + }, + { + "epoch": 4.161413231728438, + "grad_norm": 0.4709452986717224, + "learning_rate": 5.841192787794731e-06, + "loss": 0.0133, + "step": 6007 + }, + { + "epoch": 4.162105992379633, + "grad_norm": 0.4719048738479614, + "learning_rate": 5.840499306518725e-06, + "loss": 0.0189, + "step": 6008 + }, + { + "epoch": 4.1627987530308275, + "grad_norm": 0.649800181388855, + "learning_rate": 5.839805825242719e-06, + "loss": 0.0162, + "step": 6009 + }, + { + "epoch": 4.163491513682023, + "grad_norm": 0.459963858127594, + "learning_rate": 5.839112343966713e-06, + "loss": 0.018, + "step": 6010 + }, + { + "epoch": 4.164184274333218, + "grad_norm": 0.5588138103485107, + "learning_rate": 5.838418862690708e-06, + "loss": 0.0122, + "step": 6011 + }, + { + "epoch": 4.164877034984413, + "grad_norm": 0.43948790431022644, + "learning_rate": 5.837725381414703e-06, + "loss": 0.014, + "step": 6012 + }, + { + "epoch": 4.165569795635608, + "grad_norm": 0.38377538323402405, + "learning_rate": 5.837031900138697e-06, + "loss": 0.0117, + "step": 6013 + }, + { + "epoch": 4.166262556286803, + "grad_norm": 0.4332597851753235, + "learning_rate": 5.836338418862692e-06, + "loss": 0.0123, + "step": 6014 + }, + { + "epoch": 4.166955316937998, + "grad_norm": 0.3645802140235901, + "learning_rate": 5.835644937586685e-06, + "loss": 0.0127, + "step": 6015 + }, + { + "epoch": 4.167648077589193, + "grad_norm": 0.42436033487319946, + "learning_rate": 5.83495145631068e-06, + "loss": 0.0122, + "step": 6016 + }, + { + "epoch": 4.168340838240388, + "grad_norm": 0.40074238181114197, + "learning_rate": 5.834257975034675e-06, + "loss": 0.014, + "step": 6017 + }, + { + "epoch": 4.169033598891583, + "grad_norm": 0.49794983863830566, + "learning_rate": 5.833564493758669e-06, + "loss": 0.0135, + "step": 6018 + }, + { + "epoch": 4.169726359542778, + "grad_norm": 0.46749451756477356, + "learning_rate": 5.832871012482664e-06, + "loss": 0.0213, + "step": 6019 + }, + { + "epoch": 4.170419120193973, + "grad_norm": 0.5884116888046265, + "learning_rate": 5.832177531206657e-06, + "loss": 0.0187, + "step": 6020 + }, + { + "epoch": 4.171111880845168, + "grad_norm": 0.4522908926010132, + "learning_rate": 5.831484049930652e-06, + "loss": 0.0157, + "step": 6021 + }, + { + "epoch": 4.171804641496363, + "grad_norm": 0.621612012386322, + "learning_rate": 5.830790568654647e-06, + "loss": 0.0304, + "step": 6022 + }, + { + "epoch": 4.172497402147558, + "grad_norm": 0.3476214110851288, + "learning_rate": 5.830097087378641e-06, + "loss": 0.0128, + "step": 6023 + }, + { + "epoch": 4.173190162798753, + "grad_norm": 0.47607842087745667, + "learning_rate": 5.829403606102636e-06, + "loss": 0.0165, + "step": 6024 + }, + { + "epoch": 4.173882923449948, + "grad_norm": 0.4754858911037445, + "learning_rate": 5.82871012482663e-06, + "loss": 0.0184, + "step": 6025 + }, + { + "epoch": 4.174575684101143, + "grad_norm": 0.382719486951828, + "learning_rate": 5.828016643550625e-06, + "loss": 0.0144, + "step": 6026 + }, + { + "epoch": 4.175268444752338, + "grad_norm": 0.5067079663276672, + "learning_rate": 5.82732316227462e-06, + "loss": 0.0147, + "step": 6027 + }, + { + "epoch": 4.175961205403533, + "grad_norm": 0.5615599751472473, + "learning_rate": 5.826629680998613e-06, + "loss": 0.0251, + "step": 6028 + }, + { + "epoch": 4.176653966054728, + "grad_norm": 0.4316640794277191, + "learning_rate": 5.825936199722608e-06, + "loss": 0.0142, + "step": 6029 + }, + { + "epoch": 4.177346726705923, + "grad_norm": 0.4999231994152069, + "learning_rate": 5.825242718446602e-06, + "loss": 0.0192, + "step": 6030 + }, + { + "epoch": 4.178039487357118, + "grad_norm": 0.3746076822280884, + "learning_rate": 5.824549237170597e-06, + "loss": 0.0118, + "step": 6031 + }, + { + "epoch": 4.1787322480083136, + "grad_norm": 0.38020339608192444, + "learning_rate": 5.823855755894592e-06, + "loss": 0.0107, + "step": 6032 + }, + { + "epoch": 4.179425008659508, + "grad_norm": 0.5295211672782898, + "learning_rate": 5.823162274618585e-06, + "loss": 0.0189, + "step": 6033 + }, + { + "epoch": 4.180117769310703, + "grad_norm": 0.6093255281448364, + "learning_rate": 5.82246879334258e-06, + "loss": 0.0204, + "step": 6034 + }, + { + "epoch": 4.1808105299618985, + "grad_norm": 0.5836412310600281, + "learning_rate": 5.821775312066574e-06, + "loss": 0.0197, + "step": 6035 + }, + { + "epoch": 4.181503290613093, + "grad_norm": 0.4363343119621277, + "learning_rate": 5.821081830790569e-06, + "loss": 0.014, + "step": 6036 + }, + { + "epoch": 4.182196051264288, + "grad_norm": 0.43642446398735046, + "learning_rate": 5.820388349514564e-06, + "loss": 0.017, + "step": 6037 + }, + { + "epoch": 4.1828888119154835, + "grad_norm": 0.3896452486515045, + "learning_rate": 5.8196948682385575e-06, + "loss": 0.0134, + "step": 6038 + }, + { + "epoch": 4.183581572566678, + "grad_norm": 0.3881409168243408, + "learning_rate": 5.819001386962552e-06, + "loss": 0.0129, + "step": 6039 + }, + { + "epoch": 4.184274333217873, + "grad_norm": 0.432341605424881, + "learning_rate": 5.8183079056865465e-06, + "loss": 0.0154, + "step": 6040 + }, + { + "epoch": 4.184967093869068, + "grad_norm": 0.5767948627471924, + "learning_rate": 5.8176144244105414e-06, + "loss": 0.0202, + "step": 6041 + }, + { + "epoch": 4.185659854520264, + "grad_norm": 0.4400050640106201, + "learning_rate": 5.816920943134536e-06, + "loss": 0.0141, + "step": 6042 + }, + { + "epoch": 4.186352615171458, + "grad_norm": 0.4426657259464264, + "learning_rate": 5.8162274618585305e-06, + "loss": 0.016, + "step": 6043 + }, + { + "epoch": 4.187045375822653, + "grad_norm": 0.43351590633392334, + "learning_rate": 5.815533980582525e-06, + "loss": 0.0115, + "step": 6044 + }, + { + "epoch": 4.187738136473849, + "grad_norm": 0.3226017355918884, + "learning_rate": 5.814840499306519e-06, + "loss": 0.0103, + "step": 6045 + }, + { + "epoch": 4.188430897125043, + "grad_norm": 0.45365339517593384, + "learning_rate": 5.8141470180305136e-06, + "loss": 0.0161, + "step": 6046 + }, + { + "epoch": 4.189123657776238, + "grad_norm": 0.419453501701355, + "learning_rate": 5.8134535367545085e-06, + "loss": 0.0153, + "step": 6047 + }, + { + "epoch": 4.189816418427434, + "grad_norm": 0.36882466077804565, + "learning_rate": 5.812760055478503e-06, + "loss": 0.0124, + "step": 6048 + }, + { + "epoch": 4.190509179078628, + "grad_norm": 0.4724201261997223, + "learning_rate": 5.8120665742024975e-06, + "loss": 0.0186, + "step": 6049 + }, + { + "epoch": 4.191201939729823, + "grad_norm": 0.5134153366088867, + "learning_rate": 5.811373092926491e-06, + "loss": 0.0129, + "step": 6050 + }, + { + "epoch": 4.1918947003810185, + "grad_norm": 0.3443831503391266, + "learning_rate": 5.810679611650486e-06, + "loss": 0.0125, + "step": 6051 + }, + { + "epoch": 4.192587461032213, + "grad_norm": 0.36510440707206726, + "learning_rate": 5.809986130374481e-06, + "loss": 0.0118, + "step": 6052 + }, + { + "epoch": 4.193280221683408, + "grad_norm": 0.4110822081565857, + "learning_rate": 5.809292649098475e-06, + "loss": 0.0159, + "step": 6053 + }, + { + "epoch": 4.1939729823346035, + "grad_norm": 0.5295119881629944, + "learning_rate": 5.80859916782247e-06, + "loss": 0.0112, + "step": 6054 + }, + { + "epoch": 4.194665742985799, + "grad_norm": 0.44774559140205383, + "learning_rate": 5.807905686546464e-06, + "loss": 0.0154, + "step": 6055 + }, + { + "epoch": 4.195358503636993, + "grad_norm": 0.39692816138267517, + "learning_rate": 5.807212205270459e-06, + "loss": 0.0148, + "step": 6056 + }, + { + "epoch": 4.196051264288188, + "grad_norm": 0.37895745038986206, + "learning_rate": 5.806518723994453e-06, + "loss": 0.0123, + "step": 6057 + }, + { + "epoch": 4.196744024939384, + "grad_norm": 0.4494268298149109, + "learning_rate": 5.805825242718447e-06, + "loss": 0.0123, + "step": 6058 + }, + { + "epoch": 4.197436785590578, + "grad_norm": 0.44115379452705383, + "learning_rate": 5.805131761442442e-06, + "loss": 0.0146, + "step": 6059 + }, + { + "epoch": 4.198129546241773, + "grad_norm": 0.33701446652412415, + "learning_rate": 5.804438280166436e-06, + "loss": 0.0119, + "step": 6060 + }, + { + "epoch": 4.198822306892969, + "grad_norm": 0.4546017646789551, + "learning_rate": 5.803744798890431e-06, + "loss": 0.0162, + "step": 6061 + }, + { + "epoch": 4.199515067544164, + "grad_norm": 0.34789028763771057, + "learning_rate": 5.803051317614426e-06, + "loss": 0.0097, + "step": 6062 + }, + { + "epoch": 4.200207828195358, + "grad_norm": 0.3898346722126007, + "learning_rate": 5.802357836338419e-06, + "loss": 0.0097, + "step": 6063 + }, + { + "epoch": 4.200900588846554, + "grad_norm": 0.49162551760673523, + "learning_rate": 5.801664355062414e-06, + "loss": 0.0173, + "step": 6064 + }, + { + "epoch": 4.201593349497749, + "grad_norm": 0.3743383288383484, + "learning_rate": 5.800970873786408e-06, + "loss": 0.0123, + "step": 6065 + }, + { + "epoch": 4.202286110148943, + "grad_norm": 0.4847966730594635, + "learning_rate": 5.800277392510403e-06, + "loss": 0.016, + "step": 6066 + }, + { + "epoch": 4.2029788708001385, + "grad_norm": 0.5072675347328186, + "learning_rate": 5.799583911234398e-06, + "loss": 0.0165, + "step": 6067 + }, + { + "epoch": 4.203671631451334, + "grad_norm": 0.39113467931747437, + "learning_rate": 5.798890429958391e-06, + "loss": 0.0122, + "step": 6068 + }, + { + "epoch": 4.204364392102528, + "grad_norm": 0.4836355745792389, + "learning_rate": 5.798196948682386e-06, + "loss": 0.0209, + "step": 6069 + }, + { + "epoch": 4.2050571527537235, + "grad_norm": 0.6245148181915283, + "learning_rate": 5.79750346740638e-06, + "loss": 0.0108, + "step": 6070 + }, + { + "epoch": 4.205749913404919, + "grad_norm": 0.4833970069885254, + "learning_rate": 5.796809986130375e-06, + "loss": 0.0161, + "step": 6071 + }, + { + "epoch": 4.206442674056113, + "grad_norm": 0.5720314979553223, + "learning_rate": 5.79611650485437e-06, + "loss": 0.0224, + "step": 6072 + }, + { + "epoch": 4.207135434707308, + "grad_norm": 0.45294252038002014, + "learning_rate": 5.795423023578364e-06, + "loss": 0.0215, + "step": 6073 + }, + { + "epoch": 4.207828195358504, + "grad_norm": 0.4205402433872223, + "learning_rate": 5.794729542302359e-06, + "loss": 0.0145, + "step": 6074 + }, + { + "epoch": 4.208520956009699, + "grad_norm": 0.48753929138183594, + "learning_rate": 5.794036061026352e-06, + "loss": 0.0203, + "step": 6075 + }, + { + "epoch": 4.209213716660893, + "grad_norm": 0.34081822633743286, + "learning_rate": 5.793342579750347e-06, + "loss": 0.0129, + "step": 6076 + }, + { + "epoch": 4.209906477312089, + "grad_norm": 0.406585156917572, + "learning_rate": 5.792649098474342e-06, + "loss": 0.0144, + "step": 6077 + }, + { + "epoch": 4.210599237963284, + "grad_norm": 0.405737042427063, + "learning_rate": 5.791955617198336e-06, + "loss": 0.0122, + "step": 6078 + }, + { + "epoch": 4.211291998614478, + "grad_norm": 0.3602556884288788, + "learning_rate": 5.791262135922331e-06, + "loss": 0.0158, + "step": 6079 + }, + { + "epoch": 4.211984759265674, + "grad_norm": 0.4407044053077698, + "learning_rate": 5.790568654646324e-06, + "loss": 0.0124, + "step": 6080 + }, + { + "epoch": 4.212677519916869, + "grad_norm": 0.5421193242073059, + "learning_rate": 5.789875173370319e-06, + "loss": 0.0165, + "step": 6081 + }, + { + "epoch": 4.213370280568064, + "grad_norm": 0.3487362563610077, + "learning_rate": 5.789181692094314e-06, + "loss": 0.0113, + "step": 6082 + }, + { + "epoch": 4.214063041219259, + "grad_norm": 0.45292338728904724, + "learning_rate": 5.788488210818308e-06, + "loss": 0.0159, + "step": 6083 + }, + { + "epoch": 4.214755801870454, + "grad_norm": 0.4305095076560974, + "learning_rate": 5.787794729542303e-06, + "loss": 0.0181, + "step": 6084 + }, + { + "epoch": 4.215448562521649, + "grad_norm": 0.413237065076828, + "learning_rate": 5.787101248266297e-06, + "loss": 0.015, + "step": 6085 + }, + { + "epoch": 4.2161413231728435, + "grad_norm": 0.4743373394012451, + "learning_rate": 5.786407766990291e-06, + "loss": 0.0168, + "step": 6086 + }, + { + "epoch": 4.216834083824039, + "grad_norm": 0.42930710315704346, + "learning_rate": 5.785714285714286e-06, + "loss": 0.0135, + "step": 6087 + }, + { + "epoch": 4.217526844475234, + "grad_norm": 0.7027872800827026, + "learning_rate": 5.7850208044382804e-06, + "loss": 0.0151, + "step": 6088 + }, + { + "epoch": 4.2182196051264285, + "grad_norm": 0.4259084463119507, + "learning_rate": 5.784327323162275e-06, + "loss": 0.0132, + "step": 6089 + }, + { + "epoch": 4.218912365777624, + "grad_norm": 0.488026887178421, + "learning_rate": 5.7836338418862695e-06, + "loss": 0.0135, + "step": 6090 + }, + { + "epoch": 4.219605126428819, + "grad_norm": 0.4157416522502899, + "learning_rate": 5.782940360610264e-06, + "loss": 0.0134, + "step": 6091 + }, + { + "epoch": 4.220297887080013, + "grad_norm": 0.4387360215187073, + "learning_rate": 5.782246879334259e-06, + "loss": 0.0129, + "step": 6092 + }, + { + "epoch": 4.220990647731209, + "grad_norm": 0.41994211077690125, + "learning_rate": 5.7815533980582526e-06, + "loss": 0.0123, + "step": 6093 + }, + { + "epoch": 4.221683408382404, + "grad_norm": 0.36253395676612854, + "learning_rate": 5.7808599167822475e-06, + "loss": 0.0119, + "step": 6094 + }, + { + "epoch": 4.222376169033599, + "grad_norm": 0.3640895485877991, + "learning_rate": 5.780166435506242e-06, + "loss": 0.014, + "step": 6095 + }, + { + "epoch": 4.223068929684794, + "grad_norm": 0.41119879484176636, + "learning_rate": 5.7794729542302365e-06, + "loss": 0.0126, + "step": 6096 + }, + { + "epoch": 4.223761690335989, + "grad_norm": 0.4430147707462311, + "learning_rate": 5.7787794729542314e-06, + "loss": 0.0136, + "step": 6097 + }, + { + "epoch": 4.224454450987184, + "grad_norm": 0.5249005556106567, + "learning_rate": 5.778085991678225e-06, + "loss": 0.0176, + "step": 6098 + }, + { + "epoch": 4.225147211638379, + "grad_norm": 0.3413819968700409, + "learning_rate": 5.77739251040222e-06, + "loss": 0.0113, + "step": 6099 + }, + { + "epoch": 4.225839972289574, + "grad_norm": 0.3949770927429199, + "learning_rate": 5.776699029126214e-06, + "loss": 0.0113, + "step": 6100 + }, + { + "epoch": 4.226532732940769, + "grad_norm": 0.43476852774620056, + "learning_rate": 5.776005547850209e-06, + "loss": 0.013, + "step": 6101 + }, + { + "epoch": 4.227225493591964, + "grad_norm": 0.6038212180137634, + "learning_rate": 5.7753120665742036e-06, + "loss": 0.0147, + "step": 6102 + }, + { + "epoch": 4.227918254243159, + "grad_norm": 0.44593775272369385, + "learning_rate": 5.774618585298198e-06, + "loss": 0.0122, + "step": 6103 + }, + { + "epoch": 4.228611014894354, + "grad_norm": 0.43964704871177673, + "learning_rate": 5.773925104022193e-06, + "loss": 0.0141, + "step": 6104 + }, + { + "epoch": 4.229303775545549, + "grad_norm": 0.45246773958206177, + "learning_rate": 5.773231622746186e-06, + "loss": 0.0179, + "step": 6105 + }, + { + "epoch": 4.229996536196744, + "grad_norm": 0.4594613313674927, + "learning_rate": 5.772538141470181e-06, + "loss": 0.0156, + "step": 6106 + }, + { + "epoch": 4.230689296847939, + "grad_norm": 0.44228464365005493, + "learning_rate": 5.771844660194176e-06, + "loss": 0.0139, + "step": 6107 + }, + { + "epoch": 4.231382057499134, + "grad_norm": 0.5431798100471497, + "learning_rate": 5.77115117891817e-06, + "loss": 0.0179, + "step": 6108 + }, + { + "epoch": 4.232074818150329, + "grad_norm": 0.4152578115463257, + "learning_rate": 5.770457697642165e-06, + "loss": 0.0171, + "step": 6109 + }, + { + "epoch": 4.232767578801524, + "grad_norm": 0.4811961352825165, + "learning_rate": 5.769764216366158e-06, + "loss": 0.0164, + "step": 6110 + }, + { + "epoch": 4.233460339452719, + "grad_norm": 0.45115652680397034, + "learning_rate": 5.769070735090153e-06, + "loss": 0.0198, + "step": 6111 + }, + { + "epoch": 4.234153100103914, + "grad_norm": 0.3338993787765503, + "learning_rate": 5.768377253814148e-06, + "loss": 0.0111, + "step": 6112 + }, + { + "epoch": 4.234845860755109, + "grad_norm": 0.5009710788726807, + "learning_rate": 5.767683772538142e-06, + "loss": 0.0159, + "step": 6113 + }, + { + "epoch": 4.235538621406304, + "grad_norm": 0.384583055973053, + "learning_rate": 5.766990291262137e-06, + "loss": 0.0128, + "step": 6114 + }, + { + "epoch": 4.2362313820574995, + "grad_norm": 0.6601429581642151, + "learning_rate": 5.766296809986131e-06, + "loss": 0.0168, + "step": 6115 + }, + { + "epoch": 4.236924142708694, + "grad_norm": 0.37512263655662537, + "learning_rate": 5.765603328710125e-06, + "loss": 0.0111, + "step": 6116 + }, + { + "epoch": 4.237616903359889, + "grad_norm": 0.39865222573280334, + "learning_rate": 5.76490984743412e-06, + "loss": 0.0141, + "step": 6117 + }, + { + "epoch": 4.238309664011084, + "grad_norm": 0.4099164307117462, + "learning_rate": 5.764216366158114e-06, + "loss": 0.0149, + "step": 6118 + }, + { + "epoch": 4.239002424662279, + "grad_norm": 0.4311049282550812, + "learning_rate": 5.763522884882109e-06, + "loss": 0.0171, + "step": 6119 + }, + { + "epoch": 4.239695185313474, + "grad_norm": 0.6434735655784607, + "learning_rate": 5.762829403606103e-06, + "loss": 0.0176, + "step": 6120 + }, + { + "epoch": 4.240387945964669, + "grad_norm": 0.4360758364200592, + "learning_rate": 5.762135922330098e-06, + "loss": 0.0189, + "step": 6121 + }, + { + "epoch": 4.241080706615865, + "grad_norm": 0.5391027331352234, + "learning_rate": 5.761442441054093e-06, + "loss": 0.016, + "step": 6122 + }, + { + "epoch": 4.241773467267059, + "grad_norm": 0.47517144680023193, + "learning_rate": 5.760748959778086e-06, + "loss": 0.0107, + "step": 6123 + }, + { + "epoch": 4.242466227918254, + "grad_norm": 0.4058322310447693, + "learning_rate": 5.760055478502081e-06, + "loss": 0.0121, + "step": 6124 + }, + { + "epoch": 4.24315898856945, + "grad_norm": 0.48418089747428894, + "learning_rate": 5.759361997226075e-06, + "loss": 0.0162, + "step": 6125 + }, + { + "epoch": 4.243851749220644, + "grad_norm": 0.3738580644130707, + "learning_rate": 5.75866851595007e-06, + "loss": 0.0151, + "step": 6126 + }, + { + "epoch": 4.244544509871839, + "grad_norm": 0.4556715488433838, + "learning_rate": 5.757975034674065e-06, + "loss": 0.0166, + "step": 6127 + }, + { + "epoch": 4.245237270523035, + "grad_norm": 0.36216968297958374, + "learning_rate": 5.757281553398058e-06, + "loss": 0.012, + "step": 6128 + }, + { + "epoch": 4.245930031174229, + "grad_norm": 0.6662531495094299, + "learning_rate": 5.756588072122053e-06, + "loss": 0.016, + "step": 6129 + }, + { + "epoch": 4.246622791825424, + "grad_norm": 0.477200984954834, + "learning_rate": 5.755894590846047e-06, + "loss": 0.0195, + "step": 6130 + }, + { + "epoch": 4.2473155524766195, + "grad_norm": 0.45956000685691833, + "learning_rate": 5.755201109570042e-06, + "loss": 0.0129, + "step": 6131 + }, + { + "epoch": 4.248008313127814, + "grad_norm": 0.38181018829345703, + "learning_rate": 5.754507628294037e-06, + "loss": 0.012, + "step": 6132 + }, + { + "epoch": 4.248701073779009, + "grad_norm": 0.48798617720603943, + "learning_rate": 5.753814147018031e-06, + "loss": 0.0152, + "step": 6133 + }, + { + "epoch": 4.2493938344302045, + "grad_norm": 0.48894646763801575, + "learning_rate": 5.753120665742025e-06, + "loss": 0.0173, + "step": 6134 + }, + { + "epoch": 4.2500865950814, + "grad_norm": 0.5880109071731567, + "learning_rate": 5.7524271844660194e-06, + "loss": 0.0203, + "step": 6135 + }, + { + "epoch": 4.250779355732594, + "grad_norm": 0.5441635847091675, + "learning_rate": 5.751733703190014e-06, + "loss": 0.0122, + "step": 6136 + }, + { + "epoch": 4.251472116383789, + "grad_norm": 0.5183771848678589, + "learning_rate": 5.751040221914009e-06, + "loss": 0.0157, + "step": 6137 + }, + { + "epoch": 4.252164877034985, + "grad_norm": 0.39211976528167725, + "learning_rate": 5.750346740638003e-06, + "loss": 0.016, + "step": 6138 + }, + { + "epoch": 4.252857637686179, + "grad_norm": 0.5179469585418701, + "learning_rate": 5.749653259361998e-06, + "loss": 0.0155, + "step": 6139 + }, + { + "epoch": 4.253550398337374, + "grad_norm": 0.6000032424926758, + "learning_rate": 5.7489597780859916e-06, + "loss": 0.0139, + "step": 6140 + }, + { + "epoch": 4.25424315898857, + "grad_norm": 0.6474589109420776, + "learning_rate": 5.7482662968099865e-06, + "loss": 0.0184, + "step": 6141 + }, + { + "epoch": 4.254935919639765, + "grad_norm": 0.37843167781829834, + "learning_rate": 5.747572815533981e-06, + "loss": 0.014, + "step": 6142 + }, + { + "epoch": 4.255628680290959, + "grad_norm": 0.449325293302536, + "learning_rate": 5.7468793342579755e-06, + "loss": 0.0115, + "step": 6143 + }, + { + "epoch": 4.256321440942155, + "grad_norm": 0.4086148738861084, + "learning_rate": 5.7461858529819704e-06, + "loss": 0.0132, + "step": 6144 + }, + { + "epoch": 4.25701420159335, + "grad_norm": 0.7150774002075195, + "learning_rate": 5.745492371705964e-06, + "loss": 0.0158, + "step": 6145 + }, + { + "epoch": 4.257706962244544, + "grad_norm": 0.4631674587726593, + "learning_rate": 5.744798890429959e-06, + "loss": 0.0126, + "step": 6146 + }, + { + "epoch": 4.2583997228957395, + "grad_norm": 0.3581140637397766, + "learning_rate": 5.744105409153953e-06, + "loss": 0.0133, + "step": 6147 + }, + { + "epoch": 4.259092483546935, + "grad_norm": 0.3423003554344177, + "learning_rate": 5.743411927877948e-06, + "loss": 0.0113, + "step": 6148 + }, + { + "epoch": 4.259785244198129, + "grad_norm": 0.5469533205032349, + "learning_rate": 5.7427184466019426e-06, + "loss": 0.0158, + "step": 6149 + }, + { + "epoch": 4.2604780048493245, + "grad_norm": 0.5302712321281433, + "learning_rate": 5.742024965325937e-06, + "loss": 0.0162, + "step": 6150 + }, + { + "epoch": 4.26117076550052, + "grad_norm": 0.5141806602478027, + "learning_rate": 5.741331484049932e-06, + "loss": 0.0149, + "step": 6151 + }, + { + "epoch": 4.261863526151714, + "grad_norm": 0.48647239804267883, + "learning_rate": 5.740638002773925e-06, + "loss": 0.0126, + "step": 6152 + }, + { + "epoch": 4.262556286802909, + "grad_norm": 0.464964896440506, + "learning_rate": 5.73994452149792e-06, + "loss": 0.0117, + "step": 6153 + }, + { + "epoch": 4.263249047454105, + "grad_norm": 0.48162877559661865, + "learning_rate": 5.739251040221915e-06, + "loss": 0.0174, + "step": 6154 + }, + { + "epoch": 4.2639418081053, + "grad_norm": 0.4663795232772827, + "learning_rate": 5.738557558945909e-06, + "loss": 0.0149, + "step": 6155 + }, + { + "epoch": 4.264634568756494, + "grad_norm": 0.3906458616256714, + "learning_rate": 5.737864077669904e-06, + "loss": 0.0196, + "step": 6156 + }, + { + "epoch": 4.26532732940769, + "grad_norm": 0.36130955815315247, + "learning_rate": 5.737170596393897e-06, + "loss": 0.011, + "step": 6157 + }, + { + "epoch": 4.266020090058885, + "grad_norm": 0.4151374399662018, + "learning_rate": 5.736477115117892e-06, + "loss": 0.0154, + "step": 6158 + }, + { + "epoch": 4.266712850710079, + "grad_norm": 0.4259635806083679, + "learning_rate": 5.735783633841887e-06, + "loss": 0.0167, + "step": 6159 + }, + { + "epoch": 4.267405611361275, + "grad_norm": 0.4480990767478943, + "learning_rate": 5.735090152565881e-06, + "loss": 0.0153, + "step": 6160 + }, + { + "epoch": 4.26809837201247, + "grad_norm": 0.4132225811481476, + "learning_rate": 5.734396671289876e-06, + "loss": 0.0128, + "step": 6161 + }, + { + "epoch": 4.268791132663665, + "grad_norm": 0.5066116452217102, + "learning_rate": 5.73370319001387e-06, + "loss": 0.0205, + "step": 6162 + }, + { + "epoch": 4.26948389331486, + "grad_norm": 0.4326753318309784, + "learning_rate": 5.733009708737865e-06, + "loss": 0.0105, + "step": 6163 + }, + { + "epoch": 4.270176653966055, + "grad_norm": 0.37269070744514465, + "learning_rate": 5.732316227461859e-06, + "loss": 0.0114, + "step": 6164 + }, + { + "epoch": 4.27086941461725, + "grad_norm": 0.7043964862823486, + "learning_rate": 5.731622746185853e-06, + "loss": 0.0167, + "step": 6165 + }, + { + "epoch": 4.2715621752684445, + "grad_norm": 0.37173211574554443, + "learning_rate": 5.730929264909848e-06, + "loss": 0.012, + "step": 6166 + }, + { + "epoch": 4.27225493591964, + "grad_norm": 0.5097013711929321, + "learning_rate": 5.730235783633842e-06, + "loss": 0.0162, + "step": 6167 + }, + { + "epoch": 4.272947696570835, + "grad_norm": 0.448404997587204, + "learning_rate": 5.729542302357837e-06, + "loss": 0.0149, + "step": 6168 + }, + { + "epoch": 4.2736404572220295, + "grad_norm": 0.4259950518608093, + "learning_rate": 5.728848821081832e-06, + "loss": 0.0118, + "step": 6169 + }, + { + "epoch": 4.274333217873225, + "grad_norm": 0.5313334465026855, + "learning_rate": 5.728155339805825e-06, + "loss": 0.0178, + "step": 6170 + }, + { + "epoch": 4.27502597852442, + "grad_norm": 0.4327642321586609, + "learning_rate": 5.72746185852982e-06, + "loss": 0.0152, + "step": 6171 + }, + { + "epoch": 4.275718739175614, + "grad_norm": 0.30305030941963196, + "learning_rate": 5.726768377253814e-06, + "loss": 0.0119, + "step": 6172 + }, + { + "epoch": 4.27641149982681, + "grad_norm": 0.42507559061050415, + "learning_rate": 5.726074895977809e-06, + "loss": 0.0166, + "step": 6173 + }, + { + "epoch": 4.277104260478005, + "grad_norm": 0.5407571792602539, + "learning_rate": 5.725381414701804e-06, + "loss": 0.0201, + "step": 6174 + }, + { + "epoch": 4.2777970211292, + "grad_norm": 0.43881678581237793, + "learning_rate": 5.724687933425797e-06, + "loss": 0.0149, + "step": 6175 + }, + { + "epoch": 4.278489781780395, + "grad_norm": 0.47735050320625305, + "learning_rate": 5.723994452149792e-06, + "loss": 0.0165, + "step": 6176 + }, + { + "epoch": 4.27918254243159, + "grad_norm": 0.35856977105140686, + "learning_rate": 5.723300970873786e-06, + "loss": 0.0163, + "step": 6177 + }, + { + "epoch": 4.279875303082785, + "grad_norm": 0.6316174268722534, + "learning_rate": 5.722607489597781e-06, + "loss": 0.0116, + "step": 6178 + }, + { + "epoch": 4.28056806373398, + "grad_norm": 0.4738340377807617, + "learning_rate": 5.721914008321776e-06, + "loss": 0.0174, + "step": 6179 + }, + { + "epoch": 4.281260824385175, + "grad_norm": 0.4427250027656555, + "learning_rate": 5.72122052704577e-06, + "loss": 0.0155, + "step": 6180 + }, + { + "epoch": 4.28195358503637, + "grad_norm": 0.4292624294757843, + "learning_rate": 5.720527045769765e-06, + "loss": 0.0105, + "step": 6181 + }, + { + "epoch": 4.282646345687565, + "grad_norm": 0.35663384199142456, + "learning_rate": 5.7198335644937584e-06, + "loss": 0.011, + "step": 6182 + }, + { + "epoch": 4.28333910633876, + "grad_norm": 0.4772343933582306, + "learning_rate": 5.719140083217753e-06, + "loss": 0.0195, + "step": 6183 + }, + { + "epoch": 4.284031866989955, + "grad_norm": 0.48446860909461975, + "learning_rate": 5.718446601941748e-06, + "loss": 0.0159, + "step": 6184 + }, + { + "epoch": 4.28472462764115, + "grad_norm": 0.5933783650398254, + "learning_rate": 5.717753120665742e-06, + "loss": 0.0184, + "step": 6185 + }, + { + "epoch": 4.285417388292345, + "grad_norm": 0.5756788849830627, + "learning_rate": 5.717059639389737e-06, + "loss": 0.012, + "step": 6186 + }, + { + "epoch": 4.28611014894354, + "grad_norm": 0.5146098136901855, + "learning_rate": 5.7163661581137306e-06, + "loss": 0.0179, + "step": 6187 + }, + { + "epoch": 4.286802909594735, + "grad_norm": 0.47209280729293823, + "learning_rate": 5.7156726768377255e-06, + "loss": 0.0143, + "step": 6188 + }, + { + "epoch": 4.28749567024593, + "grad_norm": 0.37178516387939453, + "learning_rate": 5.7149791955617204e-06, + "loss": 0.0123, + "step": 6189 + }, + { + "epoch": 4.288188430897125, + "grad_norm": 0.4624086320400238, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.0144, + "step": 6190 + }, + { + "epoch": 4.28888119154832, + "grad_norm": 0.5005056262016296, + "learning_rate": 5.7135922330097094e-06, + "loss": 0.0213, + "step": 6191 + }, + { + "epoch": 4.289573952199515, + "grad_norm": 0.5770010948181152, + "learning_rate": 5.7128987517337035e-06, + "loss": 0.0162, + "step": 6192 + }, + { + "epoch": 4.29026671285071, + "grad_norm": 0.3798089623451233, + "learning_rate": 5.712205270457698e-06, + "loss": 0.0135, + "step": 6193 + }, + { + "epoch": 4.290959473501905, + "grad_norm": 0.5228467583656311, + "learning_rate": 5.7115117891816926e-06, + "loss": 0.0164, + "step": 6194 + }, + { + "epoch": 4.2916522341531005, + "grad_norm": 0.4126785695552826, + "learning_rate": 5.710818307905687e-06, + "loss": 0.0165, + "step": 6195 + }, + { + "epoch": 4.292344994804295, + "grad_norm": 0.39764726161956787, + "learning_rate": 5.7101248266296816e-06, + "loss": 0.0157, + "step": 6196 + }, + { + "epoch": 4.29303775545549, + "grad_norm": 0.386638879776001, + "learning_rate": 5.709431345353676e-06, + "loss": 0.0128, + "step": 6197 + }, + { + "epoch": 4.293730516106685, + "grad_norm": 0.5254207849502563, + "learning_rate": 5.708737864077671e-06, + "loss": 0.028, + "step": 6198 + }, + { + "epoch": 4.29442327675788, + "grad_norm": 0.46779918670654297, + "learning_rate": 5.7080443828016655e-06, + "loss": 0.014, + "step": 6199 + }, + { + "epoch": 4.295116037409075, + "grad_norm": 0.31714537739753723, + "learning_rate": 5.707350901525659e-06, + "loss": 0.011, + "step": 6200 + }, + { + "epoch": 4.29580879806027, + "grad_norm": 0.37947049736976624, + "learning_rate": 5.706657420249654e-06, + "loss": 0.0156, + "step": 6201 + }, + { + "epoch": 4.296501558711465, + "grad_norm": 0.4820058047771454, + "learning_rate": 5.705963938973648e-06, + "loss": 0.0151, + "step": 6202 + }, + { + "epoch": 4.29719431936266, + "grad_norm": 0.4857361912727356, + "learning_rate": 5.705270457697643e-06, + "loss": 0.0094, + "step": 6203 + }, + { + "epoch": 4.297887080013855, + "grad_norm": 0.3891202509403229, + "learning_rate": 5.704576976421638e-06, + "loss": 0.0103, + "step": 6204 + }, + { + "epoch": 4.298579840665051, + "grad_norm": 0.5287541151046753, + "learning_rate": 5.703883495145631e-06, + "loss": 0.0161, + "step": 6205 + }, + { + "epoch": 4.299272601316245, + "grad_norm": 0.4331056475639343, + "learning_rate": 5.703190013869626e-06, + "loss": 0.0115, + "step": 6206 + }, + { + "epoch": 4.29996536196744, + "grad_norm": 0.4861893951892853, + "learning_rate": 5.70249653259362e-06, + "loss": 0.0154, + "step": 6207 + }, + { + "epoch": 4.300658122618636, + "grad_norm": 0.43413400650024414, + "learning_rate": 5.701803051317615e-06, + "loss": 0.0124, + "step": 6208 + }, + { + "epoch": 4.30135088326983, + "grad_norm": 0.41063931584358215, + "learning_rate": 5.70110957004161e-06, + "loss": 0.0135, + "step": 6209 + }, + { + "epoch": 4.302043643921025, + "grad_norm": 0.5673719048500061, + "learning_rate": 5.700416088765604e-06, + "loss": 0.016, + "step": 6210 + }, + { + "epoch": 4.3027364045722205, + "grad_norm": 0.4758017361164093, + "learning_rate": 5.699722607489599e-06, + "loss": 0.0151, + "step": 6211 + }, + { + "epoch": 4.303429165223415, + "grad_norm": 0.4562930166721344, + "learning_rate": 5.699029126213592e-06, + "loss": 0.0202, + "step": 6212 + }, + { + "epoch": 4.30412192587461, + "grad_norm": 0.44941607117652893, + "learning_rate": 5.698335644937587e-06, + "loss": 0.0226, + "step": 6213 + }, + { + "epoch": 4.3048146865258055, + "grad_norm": 0.6464312076568604, + "learning_rate": 5.697642163661582e-06, + "loss": 0.0166, + "step": 6214 + }, + { + "epoch": 4.305507447177001, + "grad_norm": 0.4331834316253662, + "learning_rate": 5.696948682385576e-06, + "loss": 0.0149, + "step": 6215 + }, + { + "epoch": 4.306200207828195, + "grad_norm": 0.5660507082939148, + "learning_rate": 5.696255201109571e-06, + "loss": 0.0243, + "step": 6216 + }, + { + "epoch": 4.30689296847939, + "grad_norm": 0.513453483581543, + "learning_rate": 5.695561719833564e-06, + "loss": 0.0168, + "step": 6217 + }, + { + "epoch": 4.307585729130586, + "grad_norm": 0.39191699028015137, + "learning_rate": 5.694868238557559e-06, + "loss": 0.0151, + "step": 6218 + }, + { + "epoch": 4.30827848978178, + "grad_norm": 0.46802085638046265, + "learning_rate": 5.694174757281554e-06, + "loss": 0.0139, + "step": 6219 + }, + { + "epoch": 4.308971250432975, + "grad_norm": 0.5223895907402039, + "learning_rate": 5.693481276005548e-06, + "loss": 0.0151, + "step": 6220 + }, + { + "epoch": 4.309664011084171, + "grad_norm": 0.4044167101383209, + "learning_rate": 5.692787794729543e-06, + "loss": 0.0137, + "step": 6221 + }, + { + "epoch": 4.310356771735365, + "grad_norm": 0.3542228639125824, + "learning_rate": 5.692094313453536e-06, + "loss": 0.0106, + "step": 6222 + }, + { + "epoch": 4.31104953238656, + "grad_norm": 0.44608378410339355, + "learning_rate": 5.691400832177531e-06, + "loss": 0.0156, + "step": 6223 + }, + { + "epoch": 4.311742293037756, + "grad_norm": 0.46396249532699585, + "learning_rate": 5.690707350901526e-06, + "loss": 0.0203, + "step": 6224 + }, + { + "epoch": 4.312435053688951, + "grad_norm": 0.4457850158214569, + "learning_rate": 5.69001386962552e-06, + "loss": 0.0132, + "step": 6225 + }, + { + "epoch": 4.313127814340145, + "grad_norm": 0.4410547912120819, + "learning_rate": 5.689320388349515e-06, + "loss": 0.015, + "step": 6226 + }, + { + "epoch": 4.3138205749913405, + "grad_norm": 0.47477442026138306, + "learning_rate": 5.688626907073509e-06, + "loss": 0.0163, + "step": 6227 + }, + { + "epoch": 4.314513335642536, + "grad_norm": 0.4961939752101898, + "learning_rate": 5.687933425797504e-06, + "loss": 0.0179, + "step": 6228 + }, + { + "epoch": 4.31520609629373, + "grad_norm": 0.33782264590263367, + "learning_rate": 5.687239944521499e-06, + "loss": 0.0092, + "step": 6229 + }, + { + "epoch": 4.3158988569449255, + "grad_norm": 0.5120416879653931, + "learning_rate": 5.686546463245492e-06, + "loss": 0.0124, + "step": 6230 + }, + { + "epoch": 4.316591617596121, + "grad_norm": 0.43162956833839417, + "learning_rate": 5.685852981969487e-06, + "loss": 0.0131, + "step": 6231 + }, + { + "epoch": 4.317284378247315, + "grad_norm": 0.4046199917793274, + "learning_rate": 5.685159500693481e-06, + "loss": 0.0128, + "step": 6232 + }, + { + "epoch": 4.31797713889851, + "grad_norm": 0.4141041934490204, + "learning_rate": 5.684466019417476e-06, + "loss": 0.0125, + "step": 6233 + }, + { + "epoch": 4.318669899549706, + "grad_norm": 0.5312274694442749, + "learning_rate": 5.683772538141471e-06, + "loss": 0.0155, + "step": 6234 + }, + { + "epoch": 4.319362660200901, + "grad_norm": 0.6274176239967346, + "learning_rate": 5.6830790568654645e-06, + "loss": 0.0132, + "step": 6235 + }, + { + "epoch": 4.320055420852095, + "grad_norm": 0.41988468170166016, + "learning_rate": 5.6823855755894594e-06, + "loss": 0.0123, + "step": 6236 + }, + { + "epoch": 4.320748181503291, + "grad_norm": 0.5616729259490967, + "learning_rate": 5.6816920943134535e-06, + "loss": 0.0172, + "step": 6237 + }, + { + "epoch": 4.321440942154486, + "grad_norm": 0.46666640043258667, + "learning_rate": 5.6809986130374484e-06, + "loss": 0.0168, + "step": 6238 + }, + { + "epoch": 4.32213370280568, + "grad_norm": 0.5138194561004639, + "learning_rate": 5.680305131761443e-06, + "loss": 0.015, + "step": 6239 + }, + { + "epoch": 4.322826463456876, + "grad_norm": 0.4188655912876129, + "learning_rate": 5.6796116504854375e-06, + "loss": 0.0161, + "step": 6240 + }, + { + "epoch": 4.323519224108071, + "grad_norm": 0.44582825899124146, + "learning_rate": 5.6789181692094316e-06, + "loss": 0.0128, + "step": 6241 + }, + { + "epoch": 4.324211984759265, + "grad_norm": 0.6632908582687378, + "learning_rate": 5.678224687933426e-06, + "loss": 0.0153, + "step": 6242 + }, + { + "epoch": 4.3249047454104605, + "grad_norm": 0.49136364459991455, + "learning_rate": 5.6775312066574206e-06, + "loss": 0.0132, + "step": 6243 + }, + { + "epoch": 4.325597506061656, + "grad_norm": 0.46757856011390686, + "learning_rate": 5.6768377253814155e-06, + "loss": 0.0174, + "step": 6244 + }, + { + "epoch": 4.326290266712851, + "grad_norm": 0.5878485441207886, + "learning_rate": 5.67614424410541e-06, + "loss": 0.0121, + "step": 6245 + }, + { + "epoch": 4.3269830273640455, + "grad_norm": 0.37498223781585693, + "learning_rate": 5.6754507628294045e-06, + "loss": 0.0152, + "step": 6246 + }, + { + "epoch": 4.327675788015241, + "grad_norm": 0.33918726444244385, + "learning_rate": 5.674757281553398e-06, + "loss": 0.0113, + "step": 6247 + }, + { + "epoch": 4.328368548666436, + "grad_norm": 0.3665863275527954, + "learning_rate": 5.674063800277393e-06, + "loss": 0.0112, + "step": 6248 + }, + { + "epoch": 4.3290613093176304, + "grad_norm": 0.5431647300720215, + "learning_rate": 5.673370319001388e-06, + "loss": 0.0196, + "step": 6249 + }, + { + "epoch": 4.329754069968826, + "grad_norm": 0.36368706822395325, + "learning_rate": 5.672676837725382e-06, + "loss": 0.0126, + "step": 6250 + }, + { + "epoch": 4.330446830620021, + "grad_norm": 0.47009578347206116, + "learning_rate": 5.671983356449377e-06, + "loss": 0.0113, + "step": 6251 + }, + { + "epoch": 4.331139591271215, + "grad_norm": 0.45506036281585693, + "learning_rate": 5.67128987517337e-06, + "loss": 0.0152, + "step": 6252 + }, + { + "epoch": 4.331832351922411, + "grad_norm": 0.5325137972831726, + "learning_rate": 5.670596393897365e-06, + "loss": 0.019, + "step": 6253 + }, + { + "epoch": 4.332525112573606, + "grad_norm": 0.43603190779685974, + "learning_rate": 5.66990291262136e-06, + "loss": 0.0152, + "step": 6254 + }, + { + "epoch": 4.333217873224801, + "grad_norm": 0.6645075082778931, + "learning_rate": 5.669209431345354e-06, + "loss": 0.0222, + "step": 6255 + }, + { + "epoch": 4.333910633875996, + "grad_norm": 0.43905749917030334, + "learning_rate": 5.668515950069349e-06, + "loss": 0.0135, + "step": 6256 + }, + { + "epoch": 4.334603394527191, + "grad_norm": 0.4511397182941437, + "learning_rate": 5.667822468793343e-06, + "loss": 0.0148, + "step": 6257 + }, + { + "epoch": 4.335296155178386, + "grad_norm": 0.3553571403026581, + "learning_rate": 5.667128987517338e-06, + "loss": 0.0101, + "step": 6258 + }, + { + "epoch": 4.335988915829581, + "grad_norm": 0.5838598608970642, + "learning_rate": 5.666435506241333e-06, + "loss": 0.0134, + "step": 6259 + }, + { + "epoch": 4.336681676480776, + "grad_norm": 0.5036539435386658, + "learning_rate": 5.665742024965326e-06, + "loss": 0.0107, + "step": 6260 + }, + { + "epoch": 4.337374437131971, + "grad_norm": 0.4577176868915558, + "learning_rate": 5.665048543689321e-06, + "loss": 0.0141, + "step": 6261 + }, + { + "epoch": 4.3380671977831655, + "grad_norm": 0.35945776104927063, + "learning_rate": 5.664355062413315e-06, + "loss": 0.0125, + "step": 6262 + }, + { + "epoch": 4.338759958434361, + "grad_norm": 0.41411110758781433, + "learning_rate": 5.66366158113731e-06, + "loss": 0.0154, + "step": 6263 + }, + { + "epoch": 4.339452719085556, + "grad_norm": 0.5097288489341736, + "learning_rate": 5.662968099861305e-06, + "loss": 0.0145, + "step": 6264 + }, + { + "epoch": 4.340145479736751, + "grad_norm": 0.5383662581443787, + "learning_rate": 5.662274618585298e-06, + "loss": 0.013, + "step": 6265 + }, + { + "epoch": 4.340838240387946, + "grad_norm": 0.44316715002059937, + "learning_rate": 5.661581137309293e-06, + "loss": 0.0124, + "step": 6266 + }, + { + "epoch": 4.341531001039141, + "grad_norm": 0.5001533627510071, + "learning_rate": 5.660887656033287e-06, + "loss": 0.0168, + "step": 6267 + }, + { + "epoch": 4.342223761690336, + "grad_norm": 0.31020647287368774, + "learning_rate": 5.660194174757282e-06, + "loss": 0.0098, + "step": 6268 + }, + { + "epoch": 4.342916522341531, + "grad_norm": 0.40882495045661926, + "learning_rate": 5.659500693481277e-06, + "loss": 0.0117, + "step": 6269 + }, + { + "epoch": 4.343609282992726, + "grad_norm": 0.4471372663974762, + "learning_rate": 5.65880721220527e-06, + "loss": 0.013, + "step": 6270 + }, + { + "epoch": 4.344302043643921, + "grad_norm": 0.4961498975753784, + "learning_rate": 5.658113730929265e-06, + "loss": 0.0143, + "step": 6271 + }, + { + "epoch": 4.344994804295116, + "grad_norm": 0.5297603011131287, + "learning_rate": 5.657420249653259e-06, + "loss": 0.0173, + "step": 6272 + }, + { + "epoch": 4.345687564946311, + "grad_norm": 0.5457528829574585, + "learning_rate": 5.656726768377254e-06, + "loss": 0.015, + "step": 6273 + }, + { + "epoch": 4.346380325597506, + "grad_norm": 0.922075092792511, + "learning_rate": 5.656033287101249e-06, + "loss": 0.0175, + "step": 6274 + }, + { + "epoch": 4.3470730862487015, + "grad_norm": 0.4334849715232849, + "learning_rate": 5.655339805825243e-06, + "loss": 0.0196, + "step": 6275 + }, + { + "epoch": 4.347765846899896, + "grad_norm": 0.5204346776008606, + "learning_rate": 5.654646324549238e-06, + "loss": 0.0174, + "step": 6276 + }, + { + "epoch": 4.348458607551091, + "grad_norm": 0.4305042624473572, + "learning_rate": 5.653952843273231e-06, + "loss": 0.0137, + "step": 6277 + }, + { + "epoch": 4.349151368202286, + "grad_norm": 0.4151136875152588, + "learning_rate": 5.653259361997226e-06, + "loss": 0.0134, + "step": 6278 + }, + { + "epoch": 4.349844128853481, + "grad_norm": 0.7482634782791138, + "learning_rate": 5.652565880721221e-06, + "loss": 0.0175, + "step": 6279 + }, + { + "epoch": 4.350536889504676, + "grad_norm": 0.5009634494781494, + "learning_rate": 5.651872399445215e-06, + "loss": 0.0155, + "step": 6280 + }, + { + "epoch": 4.351229650155871, + "grad_norm": 0.3669988811016083, + "learning_rate": 5.65117891816921e-06, + "loss": 0.0131, + "step": 6281 + }, + { + "epoch": 4.351922410807066, + "grad_norm": 0.4883120059967041, + "learning_rate": 5.6504854368932035e-06, + "loss": 0.0086, + "step": 6282 + }, + { + "epoch": 4.352615171458261, + "grad_norm": 0.6424862742424011, + "learning_rate": 5.6497919556171984e-06, + "loss": 0.0165, + "step": 6283 + }, + { + "epoch": 4.353307932109456, + "grad_norm": 0.42300087213516235, + "learning_rate": 5.649098474341193e-06, + "loss": 0.0144, + "step": 6284 + }, + { + "epoch": 4.354000692760652, + "grad_norm": 0.33685287833213806, + "learning_rate": 5.6484049930651874e-06, + "loss": 0.0121, + "step": 6285 + }, + { + "epoch": 4.354693453411846, + "grad_norm": 0.38402703404426575, + "learning_rate": 5.647711511789182e-06, + "loss": 0.0146, + "step": 6286 + }, + { + "epoch": 4.355386214063041, + "grad_norm": 0.4445732533931732, + "learning_rate": 5.6470180305131765e-06, + "loss": 0.0178, + "step": 6287 + }, + { + "epoch": 4.3560789747142366, + "grad_norm": 0.5329450964927673, + "learning_rate": 5.646324549237171e-06, + "loss": 0.0165, + "step": 6288 + }, + { + "epoch": 4.356771735365431, + "grad_norm": 0.44079670310020447, + "learning_rate": 5.6456310679611655e-06, + "loss": 0.0141, + "step": 6289 + }, + { + "epoch": 4.357464496016626, + "grad_norm": 0.5818588137626648, + "learning_rate": 5.6449375866851596e-06, + "loss": 0.0192, + "step": 6290 + }, + { + "epoch": 4.3581572566678215, + "grad_norm": 0.4807320833206177, + "learning_rate": 5.6442441054091545e-06, + "loss": 0.0147, + "step": 6291 + }, + { + "epoch": 4.358850017319016, + "grad_norm": 0.45464131236076355, + "learning_rate": 5.643550624133149e-06, + "loss": 0.0124, + "step": 6292 + }, + { + "epoch": 4.359542777970211, + "grad_norm": 0.4575372040271759, + "learning_rate": 5.6428571428571435e-06, + "loss": 0.0171, + "step": 6293 + }, + { + "epoch": 4.3602355386214064, + "grad_norm": 0.5620321035385132, + "learning_rate": 5.6421636615811385e-06, + "loss": 0.0159, + "step": 6294 + }, + { + "epoch": 4.360928299272602, + "grad_norm": 0.407573401927948, + "learning_rate": 5.641470180305132e-06, + "loss": 0.0148, + "step": 6295 + }, + { + "epoch": 4.361621059923796, + "grad_norm": 0.48125845193862915, + "learning_rate": 5.640776699029127e-06, + "loss": 0.0174, + "step": 6296 + }, + { + "epoch": 4.362313820574991, + "grad_norm": 0.5339037179946899, + "learning_rate": 5.640083217753121e-06, + "loss": 0.016, + "step": 6297 + }, + { + "epoch": 4.363006581226187, + "grad_norm": 0.5509049296379089, + "learning_rate": 5.639389736477116e-06, + "loss": 0.0168, + "step": 6298 + }, + { + "epoch": 4.363699341877381, + "grad_norm": 0.47088614106178284, + "learning_rate": 5.638696255201111e-06, + "loss": 0.0186, + "step": 6299 + }, + { + "epoch": 4.364392102528576, + "grad_norm": 0.6244907975196838, + "learning_rate": 5.638002773925104e-06, + "loss": 0.0148, + "step": 6300 + }, + { + "epoch": 4.365084863179772, + "grad_norm": 0.4153135120868683, + "learning_rate": 5.637309292649099e-06, + "loss": 0.0131, + "step": 6301 + }, + { + "epoch": 4.365777623830966, + "grad_norm": 0.452224463224411, + "learning_rate": 5.636615811373093e-06, + "loss": 0.014, + "step": 6302 + }, + { + "epoch": 4.366470384482161, + "grad_norm": 0.4876563549041748, + "learning_rate": 5.635922330097088e-06, + "loss": 0.0155, + "step": 6303 + }, + { + "epoch": 4.367163145133357, + "grad_norm": 0.36006730794906616, + "learning_rate": 5.635228848821083e-06, + "loss": 0.0111, + "step": 6304 + }, + { + "epoch": 4.367855905784552, + "grad_norm": 0.545689046382904, + "learning_rate": 5.634535367545077e-06, + "loss": 0.0178, + "step": 6305 + }, + { + "epoch": 4.368548666435746, + "grad_norm": 0.44589248299598694, + "learning_rate": 5.633841886269072e-06, + "loss": 0.0168, + "step": 6306 + }, + { + "epoch": 4.3692414270869415, + "grad_norm": 0.3454737365245819, + "learning_rate": 5.633148404993065e-06, + "loss": 0.0103, + "step": 6307 + }, + { + "epoch": 4.369934187738137, + "grad_norm": 0.40857359766960144, + "learning_rate": 5.63245492371706e-06, + "loss": 0.0145, + "step": 6308 + }, + { + "epoch": 4.370626948389331, + "grad_norm": 0.42181888222694397, + "learning_rate": 5.631761442441055e-06, + "loss": 0.0137, + "step": 6309 + }, + { + "epoch": 4.3713197090405265, + "grad_norm": 0.5007210969924927, + "learning_rate": 5.631067961165049e-06, + "loss": 0.0117, + "step": 6310 + }, + { + "epoch": 4.372012469691722, + "grad_norm": 0.6799827218055725, + "learning_rate": 5.630374479889044e-06, + "loss": 0.0161, + "step": 6311 + }, + { + "epoch": 4.372705230342916, + "grad_norm": 0.4871334433555603, + "learning_rate": 5.629680998613037e-06, + "loss": 0.0127, + "step": 6312 + }, + { + "epoch": 4.373397990994111, + "grad_norm": 0.3891102373600006, + "learning_rate": 5.628987517337032e-06, + "loss": 0.01, + "step": 6313 + }, + { + "epoch": 4.374090751645307, + "grad_norm": 0.4211445152759552, + "learning_rate": 5.628294036061027e-06, + "loss": 0.0132, + "step": 6314 + }, + { + "epoch": 4.374783512296502, + "grad_norm": 0.5274110436439514, + "learning_rate": 5.627600554785021e-06, + "loss": 0.0112, + "step": 6315 + }, + { + "epoch": 4.375476272947696, + "grad_norm": 0.7541940212249756, + "learning_rate": 5.626907073509016e-06, + "loss": 0.023, + "step": 6316 + }, + { + "epoch": 4.376169033598892, + "grad_norm": 0.5094373226165771, + "learning_rate": 5.62621359223301e-06, + "loss": 0.0155, + "step": 6317 + }, + { + "epoch": 4.376861794250087, + "grad_norm": 0.5532792210578918, + "learning_rate": 5.625520110957004e-06, + "loss": 0.0174, + "step": 6318 + }, + { + "epoch": 4.377554554901281, + "grad_norm": 0.5033500790596008, + "learning_rate": 5.624826629680999e-06, + "loss": 0.0158, + "step": 6319 + }, + { + "epoch": 4.378247315552477, + "grad_norm": 0.3506750464439392, + "learning_rate": 5.624133148404993e-06, + "loss": 0.011, + "step": 6320 + }, + { + "epoch": 4.378940076203672, + "grad_norm": 0.7826740741729736, + "learning_rate": 5.623439667128988e-06, + "loss": 0.0263, + "step": 6321 + }, + { + "epoch": 4.379632836854866, + "grad_norm": 0.5336676239967346, + "learning_rate": 5.622746185852982e-06, + "loss": 0.0159, + "step": 6322 + }, + { + "epoch": 4.3803255975060615, + "grad_norm": 0.6048029661178589, + "learning_rate": 5.622052704576977e-06, + "loss": 0.016, + "step": 6323 + }, + { + "epoch": 4.381018358157257, + "grad_norm": 0.4789120554924011, + "learning_rate": 5.621359223300972e-06, + "loss": 0.0192, + "step": 6324 + }, + { + "epoch": 4.381711118808452, + "grad_norm": 0.38032591342926025, + "learning_rate": 5.620665742024965e-06, + "loss": 0.0118, + "step": 6325 + }, + { + "epoch": 4.3824038794596465, + "grad_norm": 0.46888935565948486, + "learning_rate": 5.61997226074896e-06, + "loss": 0.0169, + "step": 6326 + }, + { + "epoch": 4.383096640110842, + "grad_norm": 0.6135951280593872, + "learning_rate": 5.619278779472954e-06, + "loss": 0.0153, + "step": 6327 + }, + { + "epoch": 4.383789400762037, + "grad_norm": 0.43433281779289246, + "learning_rate": 5.618585298196949e-06, + "loss": 0.015, + "step": 6328 + }, + { + "epoch": 4.384482161413231, + "grad_norm": 0.3692700266838074, + "learning_rate": 5.617891816920944e-06, + "loss": 0.0103, + "step": 6329 + }, + { + "epoch": 4.385174922064427, + "grad_norm": 0.4402207136154175, + "learning_rate": 5.6171983356449374e-06, + "loss": 0.0171, + "step": 6330 + }, + { + "epoch": 4.385867682715622, + "grad_norm": 0.4314385652542114, + "learning_rate": 5.616504854368932e-06, + "loss": 0.0144, + "step": 6331 + }, + { + "epoch": 4.386560443366816, + "grad_norm": 0.46023669838905334, + "learning_rate": 5.6158113730929264e-06, + "loss": 0.0122, + "step": 6332 + }, + { + "epoch": 4.387253204018012, + "grad_norm": 0.5423911213874817, + "learning_rate": 5.615117891816921e-06, + "loss": 0.0178, + "step": 6333 + }, + { + "epoch": 4.387945964669207, + "grad_norm": 0.406222403049469, + "learning_rate": 5.614424410540916e-06, + "loss": 0.015, + "step": 6334 + }, + { + "epoch": 4.388638725320401, + "grad_norm": 0.37357595562934875, + "learning_rate": 5.61373092926491e-06, + "loss": 0.0142, + "step": 6335 + }, + { + "epoch": 4.389331485971597, + "grad_norm": 0.5193958878517151, + "learning_rate": 5.613037447988905e-06, + "loss": 0.015, + "step": 6336 + }, + { + "epoch": 4.390024246622792, + "grad_norm": 0.44530028104782104, + "learning_rate": 5.6123439667128986e-06, + "loss": 0.0184, + "step": 6337 + }, + { + "epoch": 4.390717007273987, + "grad_norm": 0.6473617553710938, + "learning_rate": 5.6116504854368935e-06, + "loss": 0.016, + "step": 6338 + }, + { + "epoch": 4.391409767925182, + "grad_norm": 0.4191620647907257, + "learning_rate": 5.6109570041608884e-06, + "loss": 0.0158, + "step": 6339 + }, + { + "epoch": 4.392102528576377, + "grad_norm": 0.4953300952911377, + "learning_rate": 5.6102635228848825e-06, + "loss": 0.018, + "step": 6340 + }, + { + "epoch": 4.392795289227572, + "grad_norm": 0.4880446493625641, + "learning_rate": 5.6095700416088775e-06, + "loss": 0.0172, + "step": 6341 + }, + { + "epoch": 4.3934880498787665, + "grad_norm": 0.48921796679496765, + "learning_rate": 5.608876560332871e-06, + "loss": 0.0114, + "step": 6342 + }, + { + "epoch": 4.394180810529962, + "grad_norm": 0.6088504791259766, + "learning_rate": 5.608183079056866e-06, + "loss": 0.0147, + "step": 6343 + }, + { + "epoch": 4.394873571181157, + "grad_norm": 0.4283059239387512, + "learning_rate": 5.6074895977808606e-06, + "loss": 0.0152, + "step": 6344 + }, + { + "epoch": 4.395566331832352, + "grad_norm": 0.4862416088581085, + "learning_rate": 5.606796116504855e-06, + "loss": 0.0125, + "step": 6345 + }, + { + "epoch": 4.396259092483547, + "grad_norm": 0.506998598575592, + "learning_rate": 5.60610263522885e-06, + "loss": 0.0159, + "step": 6346 + }, + { + "epoch": 4.396951853134742, + "grad_norm": 0.6488572359085083, + "learning_rate": 5.605409153952843e-06, + "loss": 0.0156, + "step": 6347 + }, + { + "epoch": 4.397644613785937, + "grad_norm": 0.6373663544654846, + "learning_rate": 5.604715672676838e-06, + "loss": 0.0163, + "step": 6348 + }, + { + "epoch": 4.398337374437132, + "grad_norm": 0.6178373694419861, + "learning_rate": 5.604022191400833e-06, + "loss": 0.0239, + "step": 6349 + }, + { + "epoch": 4.399030135088327, + "grad_norm": 0.464708536863327, + "learning_rate": 5.603328710124827e-06, + "loss": 0.0148, + "step": 6350 + }, + { + "epoch": 4.399722895739522, + "grad_norm": 0.40016040205955505, + "learning_rate": 5.602635228848822e-06, + "loss": 0.0123, + "step": 6351 + }, + { + "epoch": 4.400415656390717, + "grad_norm": 0.5595857501029968, + "learning_rate": 5.601941747572816e-06, + "loss": 0.0156, + "step": 6352 + }, + { + "epoch": 4.401108417041912, + "grad_norm": 0.3771488666534424, + "learning_rate": 5.601248266296811e-06, + "loss": 0.0126, + "step": 6353 + }, + { + "epoch": 4.401801177693107, + "grad_norm": 0.3650773763656616, + "learning_rate": 5.600554785020806e-06, + "loss": 0.0136, + "step": 6354 + }, + { + "epoch": 4.402493938344302, + "grad_norm": 0.5128456354141235, + "learning_rate": 5.599861303744799e-06, + "loss": 0.0147, + "step": 6355 + }, + { + "epoch": 4.403186698995497, + "grad_norm": 0.4283827245235443, + "learning_rate": 5.599167822468794e-06, + "loss": 0.0182, + "step": 6356 + }, + { + "epoch": 4.403879459646692, + "grad_norm": 0.46825936436653137, + "learning_rate": 5.598474341192788e-06, + "loss": 0.0129, + "step": 6357 + }, + { + "epoch": 4.404572220297887, + "grad_norm": 0.6075336933135986, + "learning_rate": 5.597780859916783e-06, + "loss": 0.0207, + "step": 6358 + }, + { + "epoch": 4.405264980949082, + "grad_norm": 0.42504626512527466, + "learning_rate": 5.597087378640778e-06, + "loss": 0.0208, + "step": 6359 + }, + { + "epoch": 4.405957741600277, + "grad_norm": 0.3802050054073334, + "learning_rate": 5.596393897364771e-06, + "loss": 0.0119, + "step": 6360 + }, + { + "epoch": 4.406650502251472, + "grad_norm": 0.42043280601501465, + "learning_rate": 5.595700416088766e-06, + "loss": 0.0133, + "step": 6361 + }, + { + "epoch": 4.407343262902667, + "grad_norm": 0.409045934677124, + "learning_rate": 5.59500693481276e-06, + "loss": 0.0125, + "step": 6362 + }, + { + "epoch": 4.408036023553862, + "grad_norm": 0.4809107482433319, + "learning_rate": 5.594313453536755e-06, + "loss": 0.0152, + "step": 6363 + }, + { + "epoch": 4.408728784205057, + "grad_norm": 0.5782892107963562, + "learning_rate": 5.59361997226075e-06, + "loss": 0.0189, + "step": 6364 + }, + { + "epoch": 4.409421544856253, + "grad_norm": 0.4007456600666046, + "learning_rate": 5.592926490984744e-06, + "loss": 0.0127, + "step": 6365 + }, + { + "epoch": 4.410114305507447, + "grad_norm": 0.5185883045196533, + "learning_rate": 5.592233009708738e-06, + "loss": 0.0188, + "step": 6366 + }, + { + "epoch": 4.410807066158642, + "grad_norm": 0.5212296843528748, + "learning_rate": 5.591539528432732e-06, + "loss": 0.017, + "step": 6367 + }, + { + "epoch": 4.4114998268098375, + "grad_norm": 0.5395380258560181, + "learning_rate": 5.590846047156727e-06, + "loss": 0.0148, + "step": 6368 + }, + { + "epoch": 4.412192587461032, + "grad_norm": 0.467286616563797, + "learning_rate": 5.590152565880722e-06, + "loss": 0.0175, + "step": 6369 + }, + { + "epoch": 4.412885348112227, + "grad_norm": 0.45846062898635864, + "learning_rate": 5.589459084604716e-06, + "loss": 0.0139, + "step": 6370 + }, + { + "epoch": 4.4135781087634225, + "grad_norm": 0.4456712603569031, + "learning_rate": 5.588765603328711e-06, + "loss": 0.0149, + "step": 6371 + }, + { + "epoch": 4.414270869414617, + "grad_norm": 0.3855138123035431, + "learning_rate": 5.588072122052704e-06, + "loss": 0.0146, + "step": 6372 + }, + { + "epoch": 4.414963630065812, + "grad_norm": 0.446366548538208, + "learning_rate": 5.587378640776699e-06, + "loss": 0.014, + "step": 6373 + }, + { + "epoch": 4.415656390717007, + "grad_norm": 0.49780362844467163, + "learning_rate": 5.586685159500694e-06, + "loss": 0.0137, + "step": 6374 + }, + { + "epoch": 4.416349151368202, + "grad_norm": 0.5019102692604065, + "learning_rate": 5.585991678224688e-06, + "loss": 0.0153, + "step": 6375 + }, + { + "epoch": 4.417041912019397, + "grad_norm": 0.442880779504776, + "learning_rate": 5.585298196948683e-06, + "loss": 0.0154, + "step": 6376 + }, + { + "epoch": 4.417734672670592, + "grad_norm": 0.5280331373214722, + "learning_rate": 5.5846047156726764e-06, + "loss": 0.0178, + "step": 6377 + }, + { + "epoch": 4.418427433321788, + "grad_norm": 0.5102010369300842, + "learning_rate": 5.583911234396671e-06, + "loss": 0.0138, + "step": 6378 + }, + { + "epoch": 4.419120193972982, + "grad_norm": 0.3986343443393707, + "learning_rate": 5.583217753120666e-06, + "loss": 0.014, + "step": 6379 + }, + { + "epoch": 4.419812954624177, + "grad_norm": 0.47398659586906433, + "learning_rate": 5.58252427184466e-06, + "loss": 0.0132, + "step": 6380 + }, + { + "epoch": 4.420505715275373, + "grad_norm": 0.38692718744277954, + "learning_rate": 5.581830790568655e-06, + "loss": 0.0145, + "step": 6381 + }, + { + "epoch": 4.421198475926567, + "grad_norm": 0.48547956347465515, + "learning_rate": 5.581137309292649e-06, + "loss": 0.0185, + "step": 6382 + }, + { + "epoch": 4.421891236577762, + "grad_norm": 0.37141096591949463, + "learning_rate": 5.580443828016644e-06, + "loss": 0.0109, + "step": 6383 + }, + { + "epoch": 4.422583997228958, + "grad_norm": 0.39800119400024414, + "learning_rate": 5.579750346740639e-06, + "loss": 0.0091, + "step": 6384 + }, + { + "epoch": 4.423276757880153, + "grad_norm": 0.3927205204963684, + "learning_rate": 5.5790568654646325e-06, + "loss": 0.0122, + "step": 6385 + }, + { + "epoch": 4.423969518531347, + "grad_norm": 0.463717520236969, + "learning_rate": 5.5783633841886274e-06, + "loss": 0.0168, + "step": 6386 + }, + { + "epoch": 4.4246622791825425, + "grad_norm": 0.43868792057037354, + "learning_rate": 5.5776699029126215e-06, + "loss": 0.0184, + "step": 6387 + }, + { + "epoch": 4.425355039833738, + "grad_norm": 0.3682408034801483, + "learning_rate": 5.5769764216366165e-06, + "loss": 0.0123, + "step": 6388 + }, + { + "epoch": 4.426047800484932, + "grad_norm": 0.6563029289245605, + "learning_rate": 5.576282940360611e-06, + "loss": 0.0244, + "step": 6389 + }, + { + "epoch": 4.4267405611361275, + "grad_norm": 0.4049411416053772, + "learning_rate": 5.575589459084605e-06, + "loss": 0.0116, + "step": 6390 + }, + { + "epoch": 4.427433321787323, + "grad_norm": 0.5479726195335388, + "learning_rate": 5.5748959778085996e-06, + "loss": 0.0135, + "step": 6391 + }, + { + "epoch": 4.428126082438517, + "grad_norm": 0.6239023208618164, + "learning_rate": 5.574202496532594e-06, + "loss": 0.0172, + "step": 6392 + }, + { + "epoch": 4.428818843089712, + "grad_norm": 0.542202889919281, + "learning_rate": 5.573509015256589e-06, + "loss": 0.0189, + "step": 6393 + }, + { + "epoch": 4.429511603740908, + "grad_norm": 0.470120370388031, + "learning_rate": 5.5728155339805835e-06, + "loss": 0.0158, + "step": 6394 + }, + { + "epoch": 4.430204364392102, + "grad_norm": 0.6321980357170105, + "learning_rate": 5.572122052704577e-06, + "loss": 0.0153, + "step": 6395 + }, + { + "epoch": 4.430897125043297, + "grad_norm": 0.40127214789390564, + "learning_rate": 5.571428571428572e-06, + "loss": 0.0164, + "step": 6396 + }, + { + "epoch": 4.431589885694493, + "grad_norm": 0.3545286953449249, + "learning_rate": 5.570735090152566e-06, + "loss": 0.0143, + "step": 6397 + }, + { + "epoch": 4.432282646345688, + "grad_norm": 0.5121960639953613, + "learning_rate": 5.570041608876561e-06, + "loss": 0.0138, + "step": 6398 + }, + { + "epoch": 4.432975406996882, + "grad_norm": 0.4815506637096405, + "learning_rate": 5.569348127600556e-06, + "loss": 0.0146, + "step": 6399 + }, + { + "epoch": 4.433668167648078, + "grad_norm": 0.4875757694244385, + "learning_rate": 5.56865464632455e-06, + "loss": 0.0148, + "step": 6400 + }, + { + "epoch": 4.434360928299273, + "grad_norm": 0.5487003922462463, + "learning_rate": 5.567961165048545e-06, + "loss": 0.0163, + "step": 6401 + }, + { + "epoch": 4.435053688950467, + "grad_norm": 0.43102291226387024, + "learning_rate": 5.567267683772538e-06, + "loss": 0.0162, + "step": 6402 + }, + { + "epoch": 4.4357464496016625, + "grad_norm": 0.4879804253578186, + "learning_rate": 5.566574202496533e-06, + "loss": 0.0203, + "step": 6403 + }, + { + "epoch": 4.436439210252858, + "grad_norm": 0.49057435989379883, + "learning_rate": 5.565880721220528e-06, + "loss": 0.0116, + "step": 6404 + }, + { + "epoch": 4.437131970904053, + "grad_norm": 0.49575570225715637, + "learning_rate": 5.565187239944522e-06, + "loss": 0.0156, + "step": 6405 + }, + { + "epoch": 4.4378247315552475, + "grad_norm": 0.516643762588501, + "learning_rate": 5.564493758668517e-06, + "loss": 0.0208, + "step": 6406 + }, + { + "epoch": 4.438517492206443, + "grad_norm": 0.3851929306983948, + "learning_rate": 5.56380027739251e-06, + "loss": 0.0166, + "step": 6407 + }, + { + "epoch": 4.439210252857638, + "grad_norm": 0.4506097137928009, + "learning_rate": 5.563106796116505e-06, + "loss": 0.0159, + "step": 6408 + }, + { + "epoch": 4.439903013508832, + "grad_norm": 0.3984452486038208, + "learning_rate": 5.5624133148405e-06, + "loss": 0.0118, + "step": 6409 + }, + { + "epoch": 4.440595774160028, + "grad_norm": 0.5958320498466492, + "learning_rate": 5.561719833564494e-06, + "loss": 0.0127, + "step": 6410 + }, + { + "epoch": 4.441288534811223, + "grad_norm": 0.4138931632041931, + "learning_rate": 5.561026352288489e-06, + "loss": 0.0117, + "step": 6411 + }, + { + "epoch": 4.441981295462417, + "grad_norm": 0.5055581331253052, + "learning_rate": 5.560332871012483e-06, + "loss": 0.0178, + "step": 6412 + }, + { + "epoch": 4.442674056113613, + "grad_norm": 0.4865836799144745, + "learning_rate": 5.559639389736478e-06, + "loss": 0.0151, + "step": 6413 + }, + { + "epoch": 4.443366816764808, + "grad_norm": 0.6675582528114319, + "learning_rate": 5.558945908460472e-06, + "loss": 0.0194, + "step": 6414 + }, + { + "epoch": 4.444059577416002, + "grad_norm": 0.4479790925979614, + "learning_rate": 5.558252427184466e-06, + "loss": 0.019, + "step": 6415 + }, + { + "epoch": 4.444752338067198, + "grad_norm": 0.5455640554428101, + "learning_rate": 5.557558945908461e-06, + "loss": 0.0196, + "step": 6416 + }, + { + "epoch": 4.445445098718393, + "grad_norm": 0.47233420610427856, + "learning_rate": 5.556865464632455e-06, + "loss": 0.0132, + "step": 6417 + }, + { + "epoch": 4.446137859369588, + "grad_norm": 0.4319811463356018, + "learning_rate": 5.55617198335645e-06, + "loss": 0.0152, + "step": 6418 + }, + { + "epoch": 4.4468306200207826, + "grad_norm": 0.4056445062160492, + "learning_rate": 5.555478502080445e-06, + "loss": 0.0138, + "step": 6419 + }, + { + "epoch": 4.447523380671978, + "grad_norm": 0.7867854833602905, + "learning_rate": 5.554785020804438e-06, + "loss": 0.0203, + "step": 6420 + }, + { + "epoch": 4.448216141323173, + "grad_norm": 0.4006311893463135, + "learning_rate": 5.554091539528433e-06, + "loss": 0.0147, + "step": 6421 + }, + { + "epoch": 4.4489089019743675, + "grad_norm": 0.40011733770370483, + "learning_rate": 5.553398058252427e-06, + "loss": 0.0177, + "step": 6422 + }, + { + "epoch": 4.449601662625563, + "grad_norm": 0.5467776656150818, + "learning_rate": 5.552704576976422e-06, + "loss": 0.02, + "step": 6423 + }, + { + "epoch": 4.450294423276758, + "grad_norm": 0.47469064593315125, + "learning_rate": 5.552011095700417e-06, + "loss": 0.0115, + "step": 6424 + }, + { + "epoch": 4.450987183927953, + "grad_norm": 0.4306032359600067, + "learning_rate": 5.55131761442441e-06, + "loss": 0.0183, + "step": 6425 + }, + { + "epoch": 4.451679944579148, + "grad_norm": 0.42458483576774597, + "learning_rate": 5.550624133148405e-06, + "loss": 0.0116, + "step": 6426 + }, + { + "epoch": 4.452372705230343, + "grad_norm": 0.4996929168701172, + "learning_rate": 5.549930651872399e-06, + "loss": 0.0188, + "step": 6427 + }, + { + "epoch": 4.453065465881538, + "grad_norm": 0.41575998067855835, + "learning_rate": 5.549237170596394e-06, + "loss": 0.0165, + "step": 6428 + }, + { + "epoch": 4.453758226532733, + "grad_norm": 0.3469735085964203, + "learning_rate": 5.548543689320389e-06, + "loss": 0.0089, + "step": 6429 + }, + { + "epoch": 4.454450987183928, + "grad_norm": 0.4437747597694397, + "learning_rate": 5.547850208044383e-06, + "loss": 0.016, + "step": 6430 + }, + { + "epoch": 4.455143747835123, + "grad_norm": 0.5774936676025391, + "learning_rate": 5.547156726768378e-06, + "loss": 0.0272, + "step": 6431 + }, + { + "epoch": 4.455836508486318, + "grad_norm": 0.47543248534202576, + "learning_rate": 5.5464632454923715e-06, + "loss": 0.0122, + "step": 6432 + }, + { + "epoch": 4.456529269137513, + "grad_norm": 0.487699955701828, + "learning_rate": 5.5457697642163664e-06, + "loss": 0.0179, + "step": 6433 + }, + { + "epoch": 4.457222029788708, + "grad_norm": 0.4815370738506317, + "learning_rate": 5.545076282940361e-06, + "loss": 0.0157, + "step": 6434 + }, + { + "epoch": 4.457914790439903, + "grad_norm": 0.5126708745956421, + "learning_rate": 5.5443828016643555e-06, + "loss": 0.0149, + "step": 6435 + }, + { + "epoch": 4.458607551091098, + "grad_norm": 0.5073757171630859, + "learning_rate": 5.54368932038835e-06, + "loss": 0.0144, + "step": 6436 + }, + { + "epoch": 4.459300311742293, + "grad_norm": 0.4443979263305664, + "learning_rate": 5.542995839112344e-06, + "loss": 0.0155, + "step": 6437 + }, + { + "epoch": 4.459993072393488, + "grad_norm": 0.34232097864151, + "learning_rate": 5.5423023578363386e-06, + "loss": 0.0096, + "step": 6438 + }, + { + "epoch": 4.460685833044683, + "grad_norm": 0.5201405882835388, + "learning_rate": 5.5416088765603335e-06, + "loss": 0.0172, + "step": 6439 + }, + { + "epoch": 4.461378593695878, + "grad_norm": 0.3482005298137665, + "learning_rate": 5.540915395284328e-06, + "loss": 0.0118, + "step": 6440 + }, + { + "epoch": 4.462071354347073, + "grad_norm": 0.720801591873169, + "learning_rate": 5.5402219140083225e-06, + "loss": 0.0155, + "step": 6441 + }, + { + "epoch": 4.462764114998268, + "grad_norm": 0.38182923197746277, + "learning_rate": 5.539528432732317e-06, + "loss": 0.0179, + "step": 6442 + }, + { + "epoch": 4.463456875649463, + "grad_norm": 0.375409871339798, + "learning_rate": 5.5388349514563115e-06, + "loss": 0.0114, + "step": 6443 + }, + { + "epoch": 4.464149636300658, + "grad_norm": 0.41085493564605713, + "learning_rate": 5.538141470180306e-06, + "loss": 0.0133, + "step": 6444 + }, + { + "epoch": 4.464842396951854, + "grad_norm": 0.42861318588256836, + "learning_rate": 5.5374479889043e-06, + "loss": 0.0152, + "step": 6445 + }, + { + "epoch": 4.465535157603048, + "grad_norm": 0.4964158535003662, + "learning_rate": 5.536754507628295e-06, + "loss": 0.0161, + "step": 6446 + }, + { + "epoch": 4.466227918254243, + "grad_norm": 0.3339213728904724, + "learning_rate": 5.536061026352289e-06, + "loss": 0.013, + "step": 6447 + }, + { + "epoch": 4.4669206789054385, + "grad_norm": 0.5683286190032959, + "learning_rate": 5.535367545076284e-06, + "loss": 0.0189, + "step": 6448 + }, + { + "epoch": 4.467613439556633, + "grad_norm": 0.7219005227088928, + "learning_rate": 5.534674063800279e-06, + "loss": 0.0176, + "step": 6449 + }, + { + "epoch": 4.468306200207828, + "grad_norm": 0.3752721846103668, + "learning_rate": 5.533980582524272e-06, + "loss": 0.0115, + "step": 6450 + }, + { + "epoch": 4.4689989608590235, + "grad_norm": 0.5969683527946472, + "learning_rate": 5.533287101248267e-06, + "loss": 0.0149, + "step": 6451 + }, + { + "epoch": 4.469691721510218, + "grad_norm": 0.4072873890399933, + "learning_rate": 5.532593619972261e-06, + "loss": 0.0148, + "step": 6452 + }, + { + "epoch": 4.470384482161413, + "grad_norm": 0.6940498352050781, + "learning_rate": 5.531900138696256e-06, + "loss": 0.019, + "step": 6453 + }, + { + "epoch": 4.471077242812608, + "grad_norm": 0.5742140412330627, + "learning_rate": 5.531206657420251e-06, + "loss": 0.0138, + "step": 6454 + }, + { + "epoch": 4.471770003463803, + "grad_norm": 0.526165783405304, + "learning_rate": 5.530513176144244e-06, + "loss": 0.0163, + "step": 6455 + }, + { + "epoch": 4.472462764114998, + "grad_norm": 0.4514405131340027, + "learning_rate": 5.529819694868239e-06, + "loss": 0.0171, + "step": 6456 + }, + { + "epoch": 4.473155524766193, + "grad_norm": 0.40915215015411377, + "learning_rate": 5.529126213592233e-06, + "loss": 0.0119, + "step": 6457 + }, + { + "epoch": 4.473848285417389, + "grad_norm": 0.5459065437316895, + "learning_rate": 5.528432732316228e-06, + "loss": 0.014, + "step": 6458 + }, + { + "epoch": 4.474541046068583, + "grad_norm": 0.46588218212127686, + "learning_rate": 5.527739251040223e-06, + "loss": 0.0123, + "step": 6459 + }, + { + "epoch": 4.475233806719778, + "grad_norm": 0.41297537088394165, + "learning_rate": 5.527045769764217e-06, + "loss": 0.0162, + "step": 6460 + }, + { + "epoch": 4.475926567370974, + "grad_norm": 0.8941429853439331, + "learning_rate": 5.526352288488212e-06, + "loss": 0.0218, + "step": 6461 + }, + { + "epoch": 4.476619328022168, + "grad_norm": 0.451428085565567, + "learning_rate": 5.525658807212205e-06, + "loss": 0.016, + "step": 6462 + }, + { + "epoch": 4.477312088673363, + "grad_norm": 0.5240730047225952, + "learning_rate": 5.5249653259362e-06, + "loss": 0.0149, + "step": 6463 + }, + { + "epoch": 4.478004849324559, + "grad_norm": 0.4977891445159912, + "learning_rate": 5.524271844660195e-06, + "loss": 0.0148, + "step": 6464 + }, + { + "epoch": 4.478697609975754, + "grad_norm": 0.6426070928573608, + "learning_rate": 5.523578363384189e-06, + "loss": 0.014, + "step": 6465 + }, + { + "epoch": 4.479390370626948, + "grad_norm": 0.5414949655532837, + "learning_rate": 5.522884882108184e-06, + "loss": 0.0195, + "step": 6466 + }, + { + "epoch": 4.4800831312781435, + "grad_norm": 0.47537606954574585, + "learning_rate": 5.522191400832177e-06, + "loss": 0.0134, + "step": 6467 + }, + { + "epoch": 4.480775891929339, + "grad_norm": 0.5032382607460022, + "learning_rate": 5.521497919556172e-06, + "loss": 0.0129, + "step": 6468 + }, + { + "epoch": 4.481468652580533, + "grad_norm": 0.5442067980766296, + "learning_rate": 5.520804438280167e-06, + "loss": 0.0128, + "step": 6469 + }, + { + "epoch": 4.4821614132317285, + "grad_norm": 0.526715874671936, + "learning_rate": 5.520110957004161e-06, + "loss": 0.0181, + "step": 6470 + }, + { + "epoch": 4.482854173882924, + "grad_norm": 0.6059287786483765, + "learning_rate": 5.519417475728156e-06, + "loss": 0.0167, + "step": 6471 + }, + { + "epoch": 4.483546934534118, + "grad_norm": 0.5071312785148621, + "learning_rate": 5.51872399445215e-06, + "loss": 0.0156, + "step": 6472 + }, + { + "epoch": 4.484239695185313, + "grad_norm": 0.5615072250366211, + "learning_rate": 5.518030513176144e-06, + "loss": 0.0188, + "step": 6473 + }, + { + "epoch": 4.484932455836509, + "grad_norm": 0.37299078702926636, + "learning_rate": 5.517337031900139e-06, + "loss": 0.0155, + "step": 6474 + }, + { + "epoch": 4.485625216487703, + "grad_norm": 0.438225120306015, + "learning_rate": 5.516643550624133e-06, + "loss": 0.0151, + "step": 6475 + }, + { + "epoch": 4.486317977138898, + "grad_norm": 0.49470794200897217, + "learning_rate": 5.515950069348128e-06, + "loss": 0.0139, + "step": 6476 + }, + { + "epoch": 4.487010737790094, + "grad_norm": 0.4833485186100006, + "learning_rate": 5.515256588072122e-06, + "loss": 0.0175, + "step": 6477 + }, + { + "epoch": 4.487703498441289, + "grad_norm": 0.42016899585723877, + "learning_rate": 5.514563106796117e-06, + "loss": 0.0119, + "step": 6478 + }, + { + "epoch": 4.488396259092483, + "grad_norm": 0.5711908340454102, + "learning_rate": 5.513869625520112e-06, + "loss": 0.0169, + "step": 6479 + }, + { + "epoch": 4.489089019743679, + "grad_norm": 0.5124261975288391, + "learning_rate": 5.5131761442441054e-06, + "loss": 0.0195, + "step": 6480 + }, + { + "epoch": 4.489781780394874, + "grad_norm": 0.4338620901107788, + "learning_rate": 5.5124826629681e-06, + "loss": 0.0172, + "step": 6481 + }, + { + "epoch": 4.490474541046068, + "grad_norm": 0.6072943210601807, + "learning_rate": 5.5117891816920945e-06, + "loss": 0.0162, + "step": 6482 + }, + { + "epoch": 4.4911673016972635, + "grad_norm": 0.526709794998169, + "learning_rate": 5.511095700416089e-06, + "loss": 0.0167, + "step": 6483 + }, + { + "epoch": 4.491860062348459, + "grad_norm": 0.4411545693874359, + "learning_rate": 5.510402219140084e-06, + "loss": 0.0153, + "step": 6484 + }, + { + "epoch": 4.492552822999654, + "grad_norm": 0.5905636548995972, + "learning_rate": 5.5097087378640776e-06, + "loss": 0.0211, + "step": 6485 + }, + { + "epoch": 4.4932455836508485, + "grad_norm": 0.48006075620651245, + "learning_rate": 5.5090152565880725e-06, + "loss": 0.0164, + "step": 6486 + }, + { + "epoch": 4.493938344302044, + "grad_norm": 0.4034219980239868, + "learning_rate": 5.508321775312067e-06, + "loss": 0.012, + "step": 6487 + }, + { + "epoch": 4.494631104953239, + "grad_norm": 0.487667441368103, + "learning_rate": 5.5076282940360615e-06, + "loss": 0.0113, + "step": 6488 + }, + { + "epoch": 4.495323865604433, + "grad_norm": 0.48509952425956726, + "learning_rate": 5.5069348127600565e-06, + "loss": 0.0124, + "step": 6489 + }, + { + "epoch": 4.496016626255629, + "grad_norm": 0.6027198433876038, + "learning_rate": 5.5062413314840505e-06, + "loss": 0.0254, + "step": 6490 + }, + { + "epoch": 4.496709386906824, + "grad_norm": 0.43159589171409607, + "learning_rate": 5.5055478502080455e-06, + "loss": 0.0168, + "step": 6491 + }, + { + "epoch": 4.497402147558018, + "grad_norm": 0.5612128376960754, + "learning_rate": 5.504854368932039e-06, + "loss": 0.0182, + "step": 6492 + }, + { + "epoch": 4.498094908209214, + "grad_norm": 0.6271775364875793, + "learning_rate": 5.504160887656034e-06, + "loss": 0.0164, + "step": 6493 + }, + { + "epoch": 4.498787668860409, + "grad_norm": 0.5226729512214661, + "learning_rate": 5.503467406380029e-06, + "loss": 0.0168, + "step": 6494 + }, + { + "epoch": 4.499480429511603, + "grad_norm": 0.5027127265930176, + "learning_rate": 5.502773925104023e-06, + "loss": 0.0167, + "step": 6495 + }, + { + "epoch": 4.500173190162799, + "grad_norm": 0.44412973523139954, + "learning_rate": 5.502080443828018e-06, + "loss": 0.0139, + "step": 6496 + }, + { + "epoch": 4.500865950813994, + "grad_norm": 0.8797883987426758, + "learning_rate": 5.501386962552011e-06, + "loss": 0.0178, + "step": 6497 + }, + { + "epoch": 4.501558711465189, + "grad_norm": 0.4727875292301178, + "learning_rate": 5.500693481276006e-06, + "loss": 0.0125, + "step": 6498 + }, + { + "epoch": 4.5022514721163835, + "grad_norm": 0.5474968552589417, + "learning_rate": 5.500000000000001e-06, + "loss": 0.0188, + "step": 6499 + }, + { + "epoch": 4.502944232767579, + "grad_norm": 0.3545326590538025, + "learning_rate": 5.499306518723995e-06, + "loss": 0.0113, + "step": 6500 + }, + { + "epoch": 4.503636993418774, + "grad_norm": 0.48472627997398376, + "learning_rate": 5.49861303744799e-06, + "loss": 0.0198, + "step": 6501 + }, + { + "epoch": 4.5043297540699685, + "grad_norm": 0.3662717640399933, + "learning_rate": 5.497919556171983e-06, + "loss": 0.0111, + "step": 6502 + }, + { + "epoch": 4.505022514721164, + "grad_norm": 0.4518069326877594, + "learning_rate": 5.497226074895978e-06, + "loss": 0.0162, + "step": 6503 + }, + { + "epoch": 4.505715275372359, + "grad_norm": 0.39849138259887695, + "learning_rate": 5.496532593619973e-06, + "loss": 0.0151, + "step": 6504 + }, + { + "epoch": 4.506408036023554, + "grad_norm": 0.6232315897941589, + "learning_rate": 5.495839112343967e-06, + "loss": 0.0123, + "step": 6505 + }, + { + "epoch": 4.507100796674749, + "grad_norm": 0.6459498405456543, + "learning_rate": 5.495145631067962e-06, + "loss": 0.018, + "step": 6506 + }, + { + "epoch": 4.507793557325944, + "grad_norm": 0.504284679889679, + "learning_rate": 5.494452149791956e-06, + "loss": 0.0232, + "step": 6507 + }, + { + "epoch": 4.508486317977139, + "grad_norm": 0.46955856680870056, + "learning_rate": 5.493758668515951e-06, + "loss": 0.0139, + "step": 6508 + }, + { + "epoch": 4.509179078628334, + "grad_norm": 0.4541158676147461, + "learning_rate": 5.493065187239946e-06, + "loss": 0.0144, + "step": 6509 + }, + { + "epoch": 4.509871839279529, + "grad_norm": 0.4640336036682129, + "learning_rate": 5.492371705963939e-06, + "loss": 0.0189, + "step": 6510 + }, + { + "epoch": 4.510564599930724, + "grad_norm": 0.4979221522808075, + "learning_rate": 5.491678224687934e-06, + "loss": 0.0196, + "step": 6511 + }, + { + "epoch": 4.511257360581919, + "grad_norm": 0.45296889543533325, + "learning_rate": 5.490984743411928e-06, + "loss": 0.0159, + "step": 6512 + }, + { + "epoch": 4.511950121233114, + "grad_norm": 0.4820176064968109, + "learning_rate": 5.490291262135923e-06, + "loss": 0.0129, + "step": 6513 + }, + { + "epoch": 4.512642881884309, + "grad_norm": 0.4541758894920349, + "learning_rate": 5.489597780859918e-06, + "loss": 0.0115, + "step": 6514 + }, + { + "epoch": 4.513335642535504, + "grad_norm": 0.4782460629940033, + "learning_rate": 5.488904299583911e-06, + "loss": 0.0155, + "step": 6515 + }, + { + "epoch": 4.514028403186699, + "grad_norm": 0.4050765335559845, + "learning_rate": 5.488210818307906e-06, + "loss": 0.013, + "step": 6516 + }, + { + "epoch": 4.514721163837894, + "grad_norm": 0.5570684671401978, + "learning_rate": 5.4875173370319e-06, + "loss": 0.0171, + "step": 6517 + }, + { + "epoch": 4.515413924489089, + "grad_norm": 0.3821154832839966, + "learning_rate": 5.486823855755895e-06, + "loss": 0.016, + "step": 6518 + }, + { + "epoch": 4.516106685140284, + "grad_norm": 0.44980764389038086, + "learning_rate": 5.48613037447989e-06, + "loss": 0.0167, + "step": 6519 + }, + { + "epoch": 4.516799445791479, + "grad_norm": 0.47033196687698364, + "learning_rate": 5.485436893203884e-06, + "loss": 0.0159, + "step": 6520 + }, + { + "epoch": 4.517492206442674, + "grad_norm": 0.6494829058647156, + "learning_rate": 5.484743411927878e-06, + "loss": 0.0184, + "step": 6521 + }, + { + "epoch": 4.518184967093869, + "grad_norm": 0.43515223264694214, + "learning_rate": 5.484049930651872e-06, + "loss": 0.0152, + "step": 6522 + }, + { + "epoch": 4.518877727745064, + "grad_norm": 0.4424087405204773, + "learning_rate": 5.483356449375867e-06, + "loss": 0.0124, + "step": 6523 + }, + { + "epoch": 4.519570488396259, + "grad_norm": 0.8619319796562195, + "learning_rate": 5.482662968099862e-06, + "loss": 0.0182, + "step": 6524 + }, + { + "epoch": 4.520263249047455, + "grad_norm": 0.5138509273529053, + "learning_rate": 5.481969486823856e-06, + "loss": 0.0123, + "step": 6525 + }, + { + "epoch": 4.520956009698649, + "grad_norm": 0.4272022843360901, + "learning_rate": 5.481276005547851e-06, + "loss": 0.0116, + "step": 6526 + }, + { + "epoch": 4.521648770349844, + "grad_norm": 0.35826924443244934, + "learning_rate": 5.4805825242718444e-06, + "loss": 0.0109, + "step": 6527 + }, + { + "epoch": 4.5223415310010395, + "grad_norm": 0.46223559975624084, + "learning_rate": 5.479889042995839e-06, + "loss": 0.015, + "step": 6528 + }, + { + "epoch": 4.523034291652234, + "grad_norm": 0.42442476749420166, + "learning_rate": 5.479195561719834e-06, + "loss": 0.0138, + "step": 6529 + }, + { + "epoch": 4.523727052303429, + "grad_norm": 0.4605258107185364, + "learning_rate": 5.478502080443828e-06, + "loss": 0.0135, + "step": 6530 + }, + { + "epoch": 4.5244198129546245, + "grad_norm": 0.6113325953483582, + "learning_rate": 5.477808599167823e-06, + "loss": 0.0199, + "step": 6531 + }, + { + "epoch": 4.525112573605819, + "grad_norm": 0.6036701202392578, + "learning_rate": 5.4771151178918166e-06, + "loss": 0.0179, + "step": 6532 + }, + { + "epoch": 4.525805334257014, + "grad_norm": 0.650604248046875, + "learning_rate": 5.4764216366158115e-06, + "loss": 0.0153, + "step": 6533 + }, + { + "epoch": 4.526498094908209, + "grad_norm": 0.663766622543335, + "learning_rate": 5.4757281553398064e-06, + "loss": 0.0117, + "step": 6534 + }, + { + "epoch": 4.527190855559404, + "grad_norm": 0.4829460680484772, + "learning_rate": 5.4750346740638005e-06, + "loss": 0.0165, + "step": 6535 + }, + { + "epoch": 4.527883616210599, + "grad_norm": 0.4579041004180908, + "learning_rate": 5.4743411927877955e-06, + "loss": 0.0131, + "step": 6536 + }, + { + "epoch": 4.528576376861794, + "grad_norm": 0.49057334661483765, + "learning_rate": 5.4736477115117895e-06, + "loss": 0.013, + "step": 6537 + }, + { + "epoch": 4.529269137512989, + "grad_norm": 0.4914683699607849, + "learning_rate": 5.4729542302357845e-06, + "loss": 0.0176, + "step": 6538 + }, + { + "epoch": 4.529961898164184, + "grad_norm": 0.7309079766273499, + "learning_rate": 5.472260748959779e-06, + "loss": 0.0174, + "step": 6539 + }, + { + "epoch": 4.530654658815379, + "grad_norm": 0.5123947858810425, + "learning_rate": 5.471567267683773e-06, + "loss": 0.0143, + "step": 6540 + }, + { + "epoch": 4.531347419466575, + "grad_norm": 0.6903588175773621, + "learning_rate": 5.470873786407768e-06, + "loss": 0.0175, + "step": 6541 + }, + { + "epoch": 4.532040180117769, + "grad_norm": 0.4115409255027771, + "learning_rate": 5.470180305131762e-06, + "loss": 0.0139, + "step": 6542 + }, + { + "epoch": 4.532732940768964, + "grad_norm": 0.5724513530731201, + "learning_rate": 5.469486823855757e-06, + "loss": 0.0178, + "step": 6543 + }, + { + "epoch": 4.5334257014201595, + "grad_norm": 0.5521072745323181, + "learning_rate": 5.4687933425797515e-06, + "loss": 0.0156, + "step": 6544 + }, + { + "epoch": 4.534118462071355, + "grad_norm": 0.4855845868587494, + "learning_rate": 5.468099861303745e-06, + "loss": 0.0155, + "step": 6545 + }, + { + "epoch": 4.534811222722549, + "grad_norm": 0.3734821081161499, + "learning_rate": 5.46740638002774e-06, + "loss": 0.0112, + "step": 6546 + }, + { + "epoch": 4.5355039833737445, + "grad_norm": 0.3924940526485443, + "learning_rate": 5.466712898751734e-06, + "loss": 0.0121, + "step": 6547 + }, + { + "epoch": 4.53619674402494, + "grad_norm": 0.47242504358291626, + "learning_rate": 5.466019417475729e-06, + "loss": 0.025, + "step": 6548 + }, + { + "epoch": 4.536889504676134, + "grad_norm": 0.4202132523059845, + "learning_rate": 5.465325936199724e-06, + "loss": 0.0159, + "step": 6549 + }, + { + "epoch": 4.5375822653273294, + "grad_norm": 0.4816703200340271, + "learning_rate": 5.464632454923717e-06, + "loss": 0.018, + "step": 6550 + }, + { + "epoch": 4.538275025978525, + "grad_norm": 0.5029601454734802, + "learning_rate": 5.463938973647712e-06, + "loss": 0.0204, + "step": 6551 + }, + { + "epoch": 4.538967786629719, + "grad_norm": 0.47813183069229126, + "learning_rate": 5.463245492371706e-06, + "loss": 0.0176, + "step": 6552 + }, + { + "epoch": 4.539660547280914, + "grad_norm": 0.48515447974205017, + "learning_rate": 5.462552011095701e-06, + "loss": 0.0198, + "step": 6553 + }, + { + "epoch": 4.54035330793211, + "grad_norm": 0.5452256202697754, + "learning_rate": 5.461858529819696e-06, + "loss": 0.0163, + "step": 6554 + }, + { + "epoch": 4.541046068583304, + "grad_norm": 0.6523408889770508, + "learning_rate": 5.46116504854369e-06, + "loss": 0.0219, + "step": 6555 + }, + { + "epoch": 4.541738829234499, + "grad_norm": 0.42483633756637573, + "learning_rate": 5.460471567267685e-06, + "loss": 0.016, + "step": 6556 + }, + { + "epoch": 4.542431589885695, + "grad_norm": 0.4776119589805603, + "learning_rate": 5.459778085991678e-06, + "loss": 0.015, + "step": 6557 + }, + { + "epoch": 4.543124350536889, + "grad_norm": 0.45716530084609985, + "learning_rate": 5.459084604715673e-06, + "loss": 0.0174, + "step": 6558 + }, + { + "epoch": 4.543817111188084, + "grad_norm": 0.4417690634727478, + "learning_rate": 5.458391123439668e-06, + "loss": 0.0155, + "step": 6559 + }, + { + "epoch": 4.54450987183928, + "grad_norm": 0.49368464946746826, + "learning_rate": 5.457697642163662e-06, + "loss": 0.0154, + "step": 6560 + }, + { + "epoch": 4.545202632490475, + "grad_norm": 0.5182321667671204, + "learning_rate": 5.457004160887657e-06, + "loss": 0.0204, + "step": 6561 + }, + { + "epoch": 4.545895393141669, + "grad_norm": 0.5918089151382446, + "learning_rate": 5.45631067961165e-06, + "loss": 0.0112, + "step": 6562 + }, + { + "epoch": 4.5465881537928645, + "grad_norm": 0.6511187553405762, + "learning_rate": 5.455617198335645e-06, + "loss": 0.0187, + "step": 6563 + }, + { + "epoch": 4.54728091444406, + "grad_norm": 0.4714299142360687, + "learning_rate": 5.45492371705964e-06, + "loss": 0.0144, + "step": 6564 + }, + { + "epoch": 4.547973675095255, + "grad_norm": 0.4289691746234894, + "learning_rate": 5.454230235783634e-06, + "loss": 0.0143, + "step": 6565 + }, + { + "epoch": 4.5486664357464495, + "grad_norm": 0.4590403139591217, + "learning_rate": 5.453536754507629e-06, + "loss": 0.0159, + "step": 6566 + }, + { + "epoch": 4.549359196397645, + "grad_norm": 0.3824339509010315, + "learning_rate": 5.452843273231623e-06, + "loss": 0.0115, + "step": 6567 + }, + { + "epoch": 4.55005195704884, + "grad_norm": 0.44767406582832336, + "learning_rate": 5.452149791955618e-06, + "loss": 0.015, + "step": 6568 + }, + { + "epoch": 4.550744717700034, + "grad_norm": 0.5141071677207947, + "learning_rate": 5.451456310679612e-06, + "loss": 0.0194, + "step": 6569 + }, + { + "epoch": 4.55143747835123, + "grad_norm": 0.5163344740867615, + "learning_rate": 5.450762829403606e-06, + "loss": 0.0146, + "step": 6570 + }, + { + "epoch": 4.552130239002425, + "grad_norm": 0.49274778366088867, + "learning_rate": 5.450069348127601e-06, + "loss": 0.0167, + "step": 6571 + }, + { + "epoch": 4.552822999653619, + "grad_norm": 0.4520893096923828, + "learning_rate": 5.449375866851595e-06, + "loss": 0.0145, + "step": 6572 + }, + { + "epoch": 4.553515760304815, + "grad_norm": 1.0724546909332275, + "learning_rate": 5.44868238557559e-06, + "loss": 0.0195, + "step": 6573 + }, + { + "epoch": 4.55420852095601, + "grad_norm": 0.47798892855644226, + "learning_rate": 5.447988904299585e-06, + "loss": 0.0152, + "step": 6574 + }, + { + "epoch": 4.554901281607204, + "grad_norm": 0.580756664276123, + "learning_rate": 5.447295423023578e-06, + "loss": 0.0116, + "step": 6575 + }, + { + "epoch": 4.5555940422584, + "grad_norm": 0.43493279814720154, + "learning_rate": 5.446601941747573e-06, + "loss": 0.0157, + "step": 6576 + }, + { + "epoch": 4.556286802909595, + "grad_norm": 0.4682376980781555, + "learning_rate": 5.445908460471567e-06, + "loss": 0.0164, + "step": 6577 + }, + { + "epoch": 4.556979563560789, + "grad_norm": 0.456667423248291, + "learning_rate": 5.445214979195562e-06, + "loss": 0.0146, + "step": 6578 + }, + { + "epoch": 4.5576723242119845, + "grad_norm": 0.4843285083770752, + "learning_rate": 5.444521497919557e-06, + "loss": 0.0113, + "step": 6579 + }, + { + "epoch": 4.55836508486318, + "grad_norm": 0.6579593420028687, + "learning_rate": 5.4438280166435505e-06, + "loss": 0.0183, + "step": 6580 + }, + { + "epoch": 4.559057845514375, + "grad_norm": 0.6190991997718811, + "learning_rate": 5.4431345353675454e-06, + "loss": 0.0163, + "step": 6581 + }, + { + "epoch": 4.5597506061655695, + "grad_norm": 0.44875380396842957, + "learning_rate": 5.4424410540915395e-06, + "loss": 0.0142, + "step": 6582 + }, + { + "epoch": 4.560443366816765, + "grad_norm": 0.5786736011505127, + "learning_rate": 5.4417475728155345e-06, + "loss": 0.0189, + "step": 6583 + }, + { + "epoch": 4.56113612746796, + "grad_norm": 0.4336460828781128, + "learning_rate": 5.441054091539529e-06, + "loss": 0.0159, + "step": 6584 + }, + { + "epoch": 4.561828888119155, + "grad_norm": 0.5271535515785217, + "learning_rate": 5.4403606102635235e-06, + "loss": 0.0187, + "step": 6585 + }, + { + "epoch": 4.56252164877035, + "grad_norm": 0.4854520857334137, + "learning_rate": 5.439667128987518e-06, + "loss": 0.0153, + "step": 6586 + }, + { + "epoch": 4.563214409421545, + "grad_norm": 0.4231073260307312, + "learning_rate": 5.438973647711512e-06, + "loss": 0.0184, + "step": 6587 + }, + { + "epoch": 4.56390717007274, + "grad_norm": 0.5371063351631165, + "learning_rate": 5.438280166435507e-06, + "loss": 0.0148, + "step": 6588 + }, + { + "epoch": 4.564599930723935, + "grad_norm": 0.4960605800151825, + "learning_rate": 5.4375866851595015e-06, + "loss": 0.0182, + "step": 6589 + }, + { + "epoch": 4.56529269137513, + "grad_norm": 0.7578689455986023, + "learning_rate": 5.436893203883496e-06, + "loss": 0.0229, + "step": 6590 + }, + { + "epoch": 4.565985452026325, + "grad_norm": 0.520584762096405, + "learning_rate": 5.4361997226074905e-06, + "loss": 0.0197, + "step": 6591 + }, + { + "epoch": 4.56667821267752, + "grad_norm": 0.6162667274475098, + "learning_rate": 5.435506241331484e-06, + "loss": 0.0179, + "step": 6592 + }, + { + "epoch": 4.567370973328715, + "grad_norm": 0.7186487317085266, + "learning_rate": 5.434812760055479e-06, + "loss": 0.0181, + "step": 6593 + }, + { + "epoch": 4.56806373397991, + "grad_norm": 0.4948159158229828, + "learning_rate": 5.434119278779474e-06, + "loss": 0.0153, + "step": 6594 + }, + { + "epoch": 4.568756494631105, + "grad_norm": 0.574824869632721, + "learning_rate": 5.433425797503468e-06, + "loss": 0.0178, + "step": 6595 + }, + { + "epoch": 4.5694492552823, + "grad_norm": 0.4070855975151062, + "learning_rate": 5.432732316227463e-06, + "loss": 0.0135, + "step": 6596 + }, + { + "epoch": 4.570142015933495, + "grad_norm": 0.4459018111228943, + "learning_rate": 5.432038834951457e-06, + "loss": 0.0109, + "step": 6597 + }, + { + "epoch": 4.5708347765846895, + "grad_norm": 0.48724040389060974, + "learning_rate": 5.431345353675451e-06, + "loss": 0.0154, + "step": 6598 + }, + { + "epoch": 4.571527537235885, + "grad_norm": 0.43888628482818604, + "learning_rate": 5.430651872399446e-06, + "loss": 0.014, + "step": 6599 + }, + { + "epoch": 4.57222029788708, + "grad_norm": 0.5620042681694031, + "learning_rate": 5.42995839112344e-06, + "loss": 0.0194, + "step": 6600 + }, + { + "epoch": 4.572913058538275, + "grad_norm": 0.499756395816803, + "learning_rate": 5.429264909847435e-06, + "loss": 0.0161, + "step": 6601 + }, + { + "epoch": 4.57360581918947, + "grad_norm": 0.5070227980613708, + "learning_rate": 5.428571428571429e-06, + "loss": 0.0158, + "step": 6602 + }, + { + "epoch": 4.574298579840665, + "grad_norm": 0.5882565379142761, + "learning_rate": 5.427877947295424e-06, + "loss": 0.0198, + "step": 6603 + }, + { + "epoch": 4.57499134049186, + "grad_norm": 0.4141419231891632, + "learning_rate": 5.427184466019419e-06, + "loss": 0.0147, + "step": 6604 + }, + { + "epoch": 4.575684101143056, + "grad_norm": 0.372179239988327, + "learning_rate": 5.426490984743412e-06, + "loss": 0.0127, + "step": 6605 + }, + { + "epoch": 4.57637686179425, + "grad_norm": 0.5299352407455444, + "learning_rate": 5.425797503467407e-06, + "loss": 0.0261, + "step": 6606 + }, + { + "epoch": 4.577069622445445, + "grad_norm": 0.7036036252975464, + "learning_rate": 5.425104022191401e-06, + "loss": 0.0183, + "step": 6607 + }, + { + "epoch": 4.5777623830966405, + "grad_norm": 0.3718984127044678, + "learning_rate": 5.424410540915396e-06, + "loss": 0.0159, + "step": 6608 + }, + { + "epoch": 4.578455143747835, + "grad_norm": 0.5675950646400452, + "learning_rate": 5.423717059639391e-06, + "loss": 0.0136, + "step": 6609 + }, + { + "epoch": 4.57914790439903, + "grad_norm": 0.5094320178031921, + "learning_rate": 5.423023578363384e-06, + "loss": 0.0148, + "step": 6610 + }, + { + "epoch": 4.5798406650502255, + "grad_norm": 0.5102602243423462, + "learning_rate": 5.422330097087379e-06, + "loss": 0.0164, + "step": 6611 + }, + { + "epoch": 4.58053342570142, + "grad_norm": 0.5119156241416931, + "learning_rate": 5.421636615811373e-06, + "loss": 0.0211, + "step": 6612 + }, + { + "epoch": 4.581226186352615, + "grad_norm": 0.49560511112213135, + "learning_rate": 5.420943134535368e-06, + "loss": 0.0138, + "step": 6613 + }, + { + "epoch": 4.58191894700381, + "grad_norm": 0.5669496655464172, + "learning_rate": 5.420249653259363e-06, + "loss": 0.0251, + "step": 6614 + }, + { + "epoch": 4.582611707655005, + "grad_norm": 0.5031467080116272, + "learning_rate": 5.419556171983357e-06, + "loss": 0.0157, + "step": 6615 + }, + { + "epoch": 4.5833044683062, + "grad_norm": 0.5215955376625061, + "learning_rate": 5.418862690707352e-06, + "loss": 0.0184, + "step": 6616 + }, + { + "epoch": 4.583997228957395, + "grad_norm": 0.4663569927215576, + "learning_rate": 5.418169209431345e-06, + "loss": 0.0153, + "step": 6617 + }, + { + "epoch": 4.58468998960859, + "grad_norm": 0.5357926487922668, + "learning_rate": 5.41747572815534e-06, + "loss": 0.0257, + "step": 6618 + }, + { + "epoch": 4.585382750259785, + "grad_norm": 0.8061259388923645, + "learning_rate": 5.416782246879335e-06, + "loss": 0.0189, + "step": 6619 + }, + { + "epoch": 4.58607551091098, + "grad_norm": 0.47571200132369995, + "learning_rate": 5.416088765603329e-06, + "loss": 0.0147, + "step": 6620 + }, + { + "epoch": 4.586768271562176, + "grad_norm": 0.5356894731521606, + "learning_rate": 5.415395284327324e-06, + "loss": 0.0227, + "step": 6621 + }, + { + "epoch": 4.58746103221337, + "grad_norm": 0.556513249874115, + "learning_rate": 5.414701803051317e-06, + "loss": 0.0214, + "step": 6622 + }, + { + "epoch": 4.588153792864565, + "grad_norm": 0.680359423160553, + "learning_rate": 5.414008321775312e-06, + "loss": 0.0167, + "step": 6623 + }, + { + "epoch": 4.5888465535157605, + "grad_norm": 0.512295126914978, + "learning_rate": 5.413314840499307e-06, + "loss": 0.0175, + "step": 6624 + }, + { + "epoch": 4.589539314166955, + "grad_norm": 0.5296668410301208, + "learning_rate": 5.412621359223301e-06, + "loss": 0.0173, + "step": 6625 + }, + { + "epoch": 4.59023207481815, + "grad_norm": 0.40958574414253235, + "learning_rate": 5.411927877947296e-06, + "loss": 0.0129, + "step": 6626 + }, + { + "epoch": 4.5909248354693455, + "grad_norm": 0.4297935962677002, + "learning_rate": 5.4112343966712895e-06, + "loss": 0.0125, + "step": 6627 + }, + { + "epoch": 4.591617596120541, + "grad_norm": 0.4897463619709015, + "learning_rate": 5.4105409153952844e-06, + "loss": 0.0173, + "step": 6628 + }, + { + "epoch": 4.592310356771735, + "grad_norm": 0.40074458718299866, + "learning_rate": 5.409847434119279e-06, + "loss": 0.0166, + "step": 6629 + }, + { + "epoch": 4.59300311742293, + "grad_norm": 1.348474144935608, + "learning_rate": 5.4091539528432735e-06, + "loss": 0.0195, + "step": 6630 + }, + { + "epoch": 4.593695878074126, + "grad_norm": 0.5647010803222656, + "learning_rate": 5.408460471567268e-06, + "loss": 0.0231, + "step": 6631 + }, + { + "epoch": 4.59438863872532, + "grad_norm": 0.43581849336624146, + "learning_rate": 5.4077669902912625e-06, + "loss": 0.0136, + "step": 6632 + }, + { + "epoch": 4.595081399376515, + "grad_norm": 0.46688055992126465, + "learning_rate": 5.407073509015257e-06, + "loss": 0.0168, + "step": 6633 + }, + { + "epoch": 4.595774160027711, + "grad_norm": 0.5506585836410522, + "learning_rate": 5.406380027739252e-06, + "loss": 0.0161, + "step": 6634 + }, + { + "epoch": 4.596466920678905, + "grad_norm": 0.3996039927005768, + "learning_rate": 5.405686546463246e-06, + "loss": 0.0139, + "step": 6635 + }, + { + "epoch": 4.5971596813301, + "grad_norm": 0.4879164397716522, + "learning_rate": 5.4049930651872405e-06, + "loss": 0.0178, + "step": 6636 + }, + { + "epoch": 4.597852441981296, + "grad_norm": 0.7078427672386169, + "learning_rate": 5.404299583911235e-06, + "loss": 0.0154, + "step": 6637 + }, + { + "epoch": 4.59854520263249, + "grad_norm": 0.7090429067611694, + "learning_rate": 5.4036061026352295e-06, + "loss": 0.015, + "step": 6638 + }, + { + "epoch": 4.599237963283685, + "grad_norm": 0.43521860241889954, + "learning_rate": 5.4029126213592245e-06, + "loss": 0.0129, + "step": 6639 + }, + { + "epoch": 4.599930723934881, + "grad_norm": 0.48858875036239624, + "learning_rate": 5.402219140083218e-06, + "loss": 0.0189, + "step": 6640 + }, + { + "epoch": 4.600623484586076, + "grad_norm": 0.5457018613815308, + "learning_rate": 5.401525658807213e-06, + "loss": 0.0164, + "step": 6641 + }, + { + "epoch": 4.60131624523727, + "grad_norm": 0.445328950881958, + "learning_rate": 5.400832177531207e-06, + "loss": 0.0118, + "step": 6642 + }, + { + "epoch": 4.6020090058884655, + "grad_norm": 0.4922443628311157, + "learning_rate": 5.400138696255202e-06, + "loss": 0.0161, + "step": 6643 + }, + { + "epoch": 4.602701766539661, + "grad_norm": 0.584332287311554, + "learning_rate": 5.399445214979197e-06, + "loss": 0.0181, + "step": 6644 + }, + { + "epoch": 4.603394527190855, + "grad_norm": 0.41912952065467834, + "learning_rate": 5.398751733703191e-06, + "loss": 0.0125, + "step": 6645 + }, + { + "epoch": 4.6040872878420505, + "grad_norm": 0.503561794757843, + "learning_rate": 5.398058252427185e-06, + "loss": 0.0234, + "step": 6646 + }, + { + "epoch": 4.604780048493246, + "grad_norm": 0.4105199873447418, + "learning_rate": 5.397364771151179e-06, + "loss": 0.0105, + "step": 6647 + }, + { + "epoch": 4.605472809144441, + "grad_norm": 0.478679358959198, + "learning_rate": 5.396671289875174e-06, + "loss": 0.0177, + "step": 6648 + }, + { + "epoch": 4.606165569795635, + "grad_norm": 0.428882896900177, + "learning_rate": 5.395977808599169e-06, + "loss": 0.0155, + "step": 6649 + }, + { + "epoch": 4.606858330446831, + "grad_norm": 0.48732271790504456, + "learning_rate": 5.395284327323163e-06, + "loss": 0.0186, + "step": 6650 + }, + { + "epoch": 4.607551091098026, + "grad_norm": 0.3989153206348419, + "learning_rate": 5.394590846047158e-06, + "loss": 0.0135, + "step": 6651 + }, + { + "epoch": 4.60824385174922, + "grad_norm": 0.396588534116745, + "learning_rate": 5.393897364771151e-06, + "loss": 0.014, + "step": 6652 + }, + { + "epoch": 4.608936612400416, + "grad_norm": 0.46881407499313354, + "learning_rate": 5.393203883495146e-06, + "loss": 0.0178, + "step": 6653 + }, + { + "epoch": 4.609629373051611, + "grad_norm": 0.42429134249687195, + "learning_rate": 5.392510402219141e-06, + "loss": 0.0131, + "step": 6654 + }, + { + "epoch": 4.610322133702805, + "grad_norm": 0.4902574419975281, + "learning_rate": 5.391816920943135e-06, + "loss": 0.0131, + "step": 6655 + }, + { + "epoch": 4.611014894354001, + "grad_norm": 0.4487420618534088, + "learning_rate": 5.39112343966713e-06, + "loss": 0.014, + "step": 6656 + }, + { + "epoch": 4.611707655005196, + "grad_norm": 0.5300506353378296, + "learning_rate": 5.390429958391123e-06, + "loss": 0.0121, + "step": 6657 + }, + { + "epoch": 4.61240041565639, + "grad_norm": 0.5802969336509705, + "learning_rate": 5.389736477115118e-06, + "loss": 0.0151, + "step": 6658 + }, + { + "epoch": 4.6130931763075855, + "grad_norm": 0.6159478425979614, + "learning_rate": 5.389042995839113e-06, + "loss": 0.0166, + "step": 6659 + }, + { + "epoch": 4.613785936958781, + "grad_norm": 0.36077210307121277, + "learning_rate": 5.388349514563107e-06, + "loss": 0.013, + "step": 6660 + }, + { + "epoch": 4.614478697609976, + "grad_norm": 0.44819772243499756, + "learning_rate": 5.387656033287102e-06, + "loss": 0.0119, + "step": 6661 + }, + { + "epoch": 4.6151714582611705, + "grad_norm": 0.7619090676307678, + "learning_rate": 5.386962552011096e-06, + "loss": 0.0168, + "step": 6662 + }, + { + "epoch": 4.615864218912366, + "grad_norm": 0.5988100171089172, + "learning_rate": 5.386269070735091e-06, + "loss": 0.024, + "step": 6663 + }, + { + "epoch": 4.616556979563561, + "grad_norm": 0.5031140446662903, + "learning_rate": 5.385575589459086e-06, + "loss": 0.0181, + "step": 6664 + }, + { + "epoch": 4.617249740214755, + "grad_norm": 0.38109642267227173, + "learning_rate": 5.384882108183079e-06, + "loss": 0.0149, + "step": 6665 + }, + { + "epoch": 4.617942500865951, + "grad_norm": 0.5986095070838928, + "learning_rate": 5.384188626907074e-06, + "loss": 0.0156, + "step": 6666 + }, + { + "epoch": 4.618635261517146, + "grad_norm": 0.44961240887641907, + "learning_rate": 5.383495145631068e-06, + "loss": 0.0176, + "step": 6667 + }, + { + "epoch": 4.619328022168341, + "grad_norm": 0.4712059795856476, + "learning_rate": 5.382801664355063e-06, + "loss": 0.0216, + "step": 6668 + }, + { + "epoch": 4.620020782819536, + "grad_norm": 0.39034727215766907, + "learning_rate": 5.382108183079058e-06, + "loss": 0.0119, + "step": 6669 + }, + { + "epoch": 4.620713543470731, + "grad_norm": 0.5380370020866394, + "learning_rate": 5.381414701803051e-06, + "loss": 0.0142, + "step": 6670 + }, + { + "epoch": 4.621406304121926, + "grad_norm": 0.45525482296943665, + "learning_rate": 5.380721220527046e-06, + "loss": 0.0156, + "step": 6671 + }, + { + "epoch": 4.622099064773121, + "grad_norm": 0.5401086211204529, + "learning_rate": 5.38002773925104e-06, + "loss": 0.0154, + "step": 6672 + }, + { + "epoch": 4.622791825424316, + "grad_norm": 0.5880857110023499, + "learning_rate": 5.379334257975035e-06, + "loss": 0.0154, + "step": 6673 + }, + { + "epoch": 4.623484586075511, + "grad_norm": 0.5571756362915039, + "learning_rate": 5.37864077669903e-06, + "loss": 0.0185, + "step": 6674 + }, + { + "epoch": 4.6241773467267056, + "grad_norm": 0.4018222391605377, + "learning_rate": 5.3779472954230234e-06, + "loss": 0.0124, + "step": 6675 + }, + { + "epoch": 4.624870107377901, + "grad_norm": 0.4577995538711548, + "learning_rate": 5.377253814147018e-06, + "loss": 0.0126, + "step": 6676 + }, + { + "epoch": 4.625562868029096, + "grad_norm": 0.5859726071357727, + "learning_rate": 5.3765603328710125e-06, + "loss": 0.0167, + "step": 6677 + }, + { + "epoch": 4.6262556286802905, + "grad_norm": 0.44329333305358887, + "learning_rate": 5.375866851595007e-06, + "loss": 0.0113, + "step": 6678 + }, + { + "epoch": 4.626948389331486, + "grad_norm": 0.5705049633979797, + "learning_rate": 5.375173370319002e-06, + "loss": 0.0147, + "step": 6679 + }, + { + "epoch": 4.627641149982681, + "grad_norm": 0.4552869498729706, + "learning_rate": 5.374479889042996e-06, + "loss": 0.0137, + "step": 6680 + }, + { + "epoch": 4.628333910633876, + "grad_norm": 0.5169498324394226, + "learning_rate": 5.373786407766991e-06, + "loss": 0.0173, + "step": 6681 + }, + { + "epoch": 4.629026671285071, + "grad_norm": 0.5297955870628357, + "learning_rate": 5.373092926490985e-06, + "loss": 0.0164, + "step": 6682 + }, + { + "epoch": 4.629719431936266, + "grad_norm": 0.49822568893432617, + "learning_rate": 5.3723994452149795e-06, + "loss": 0.0178, + "step": 6683 + }, + { + "epoch": 4.630412192587461, + "grad_norm": 0.4892895817756653, + "learning_rate": 5.3717059639389744e-06, + "loss": 0.0148, + "step": 6684 + }, + { + "epoch": 4.631104953238656, + "grad_norm": 0.737156093120575, + "learning_rate": 5.3710124826629685e-06, + "loss": 0.0169, + "step": 6685 + }, + { + "epoch": 4.631797713889851, + "grad_norm": 0.5654307007789612, + "learning_rate": 5.3703190013869635e-06, + "loss": 0.0189, + "step": 6686 + }, + { + "epoch": 4.632490474541046, + "grad_norm": 0.6652435660362244, + "learning_rate": 5.369625520110957e-06, + "loss": 0.0186, + "step": 6687 + }, + { + "epoch": 4.6331832351922415, + "grad_norm": 0.4329829812049866, + "learning_rate": 5.368932038834952e-06, + "loss": 0.0142, + "step": 6688 + }, + { + "epoch": 4.633875995843436, + "grad_norm": 0.47517314553260803, + "learning_rate": 5.3682385575589466e-06, + "loss": 0.0154, + "step": 6689 + }, + { + "epoch": 4.634568756494631, + "grad_norm": 0.47700613737106323, + "learning_rate": 5.367545076282941e-06, + "loss": 0.0183, + "step": 6690 + }, + { + "epoch": 4.6352615171458265, + "grad_norm": 0.4429062604904175, + "learning_rate": 5.366851595006936e-06, + "loss": 0.0173, + "step": 6691 + }, + { + "epoch": 4.635954277797021, + "grad_norm": 0.5844759941101074, + "learning_rate": 5.36615811373093e-06, + "loss": 0.0218, + "step": 6692 + }, + { + "epoch": 4.636647038448216, + "grad_norm": 0.5172418355941772, + "learning_rate": 5.365464632454925e-06, + "loss": 0.0184, + "step": 6693 + }, + { + "epoch": 4.637339799099411, + "grad_norm": 0.4225928485393524, + "learning_rate": 5.364771151178919e-06, + "loss": 0.0138, + "step": 6694 + }, + { + "epoch": 4.638032559750606, + "grad_norm": 0.5062415599822998, + "learning_rate": 5.364077669902913e-06, + "loss": 0.0202, + "step": 6695 + }, + { + "epoch": 4.638725320401801, + "grad_norm": 0.44356057047843933, + "learning_rate": 5.363384188626908e-06, + "loss": 0.0165, + "step": 6696 + }, + { + "epoch": 4.639418081052996, + "grad_norm": 0.4556675851345062, + "learning_rate": 5.362690707350902e-06, + "loss": 0.0165, + "step": 6697 + }, + { + "epoch": 4.640110841704191, + "grad_norm": 0.4357486963272095, + "learning_rate": 5.361997226074897e-06, + "loss": 0.0109, + "step": 6698 + }, + { + "epoch": 4.640803602355386, + "grad_norm": 0.44267845153808594, + "learning_rate": 5.361303744798892e-06, + "loss": 0.0135, + "step": 6699 + }, + { + "epoch": 4.641496363006581, + "grad_norm": 0.4378957450389862, + "learning_rate": 5.360610263522885e-06, + "loss": 0.0123, + "step": 6700 + }, + { + "epoch": 4.642189123657777, + "grad_norm": 0.4551459550857544, + "learning_rate": 5.35991678224688e-06, + "loss": 0.0154, + "step": 6701 + }, + { + "epoch": 4.642881884308971, + "grad_norm": 0.5333576202392578, + "learning_rate": 5.359223300970874e-06, + "loss": 0.022, + "step": 6702 + }, + { + "epoch": 4.643574644960166, + "grad_norm": 0.7531522512435913, + "learning_rate": 5.358529819694869e-06, + "loss": 0.0146, + "step": 6703 + }, + { + "epoch": 4.6442674056113615, + "grad_norm": 0.5332921147346497, + "learning_rate": 5.357836338418864e-06, + "loss": 0.0136, + "step": 6704 + }, + { + "epoch": 4.644960166262556, + "grad_norm": 0.4230230748653412, + "learning_rate": 5.357142857142857e-06, + "loss": 0.0157, + "step": 6705 + }, + { + "epoch": 4.645652926913751, + "grad_norm": 0.42440685629844666, + "learning_rate": 5.356449375866852e-06, + "loss": 0.0125, + "step": 6706 + }, + { + "epoch": 4.6463456875649465, + "grad_norm": 0.41778454184532166, + "learning_rate": 5.355755894590846e-06, + "loss": 0.0122, + "step": 6707 + }, + { + "epoch": 4.647038448216142, + "grad_norm": 0.40710264444351196, + "learning_rate": 5.355062413314841e-06, + "loss": 0.0121, + "step": 6708 + }, + { + "epoch": 4.647731208867336, + "grad_norm": 0.6806241869926453, + "learning_rate": 5.354368932038836e-06, + "loss": 0.0177, + "step": 6709 + }, + { + "epoch": 4.648423969518531, + "grad_norm": 0.5046828985214233, + "learning_rate": 5.35367545076283e-06, + "loss": 0.0157, + "step": 6710 + }, + { + "epoch": 4.649116730169727, + "grad_norm": 0.32613328099250793, + "learning_rate": 5.352981969486825e-06, + "loss": 0.0095, + "step": 6711 + }, + { + "epoch": 4.649809490820921, + "grad_norm": 0.4319767653942108, + "learning_rate": 5.352288488210818e-06, + "loss": 0.0142, + "step": 6712 + }, + { + "epoch": 4.650502251472116, + "grad_norm": 0.44726043939590454, + "learning_rate": 5.351595006934813e-06, + "loss": 0.0142, + "step": 6713 + }, + { + "epoch": 4.651195012123312, + "grad_norm": 0.4683282673358917, + "learning_rate": 5.350901525658808e-06, + "loss": 0.0125, + "step": 6714 + }, + { + "epoch": 4.651887772774506, + "grad_norm": 0.44113773107528687, + "learning_rate": 5.350208044382802e-06, + "loss": 0.013, + "step": 6715 + }, + { + "epoch": 4.652580533425701, + "grad_norm": 0.849155604839325, + "learning_rate": 5.349514563106797e-06, + "loss": 0.0154, + "step": 6716 + }, + { + "epoch": 4.653273294076897, + "grad_norm": 0.4086134135723114, + "learning_rate": 5.34882108183079e-06, + "loss": 0.0114, + "step": 6717 + }, + { + "epoch": 4.653966054728091, + "grad_norm": 0.5375789999961853, + "learning_rate": 5.348127600554785e-06, + "loss": 0.0161, + "step": 6718 + }, + { + "epoch": 4.654658815379286, + "grad_norm": 0.3824283480644226, + "learning_rate": 5.34743411927878e-06, + "loss": 0.0121, + "step": 6719 + }, + { + "epoch": 4.6553515760304816, + "grad_norm": 0.5195133090019226, + "learning_rate": 5.346740638002774e-06, + "loss": 0.0163, + "step": 6720 + }, + { + "epoch": 4.656044336681677, + "grad_norm": 0.35725370049476624, + "learning_rate": 5.346047156726769e-06, + "loss": 0.01, + "step": 6721 + }, + { + "epoch": 4.656737097332871, + "grad_norm": 0.38447943329811096, + "learning_rate": 5.345353675450763e-06, + "loss": 0.0204, + "step": 6722 + }, + { + "epoch": 4.6574298579840665, + "grad_norm": 0.4080303907394409, + "learning_rate": 5.344660194174757e-06, + "loss": 0.0146, + "step": 6723 + }, + { + "epoch": 4.658122618635262, + "grad_norm": 0.5283615589141846, + "learning_rate": 5.343966712898752e-06, + "loss": 0.0131, + "step": 6724 + }, + { + "epoch": 4.658815379286456, + "grad_norm": 0.5128315091133118, + "learning_rate": 5.343273231622746e-06, + "loss": 0.0154, + "step": 6725 + }, + { + "epoch": 4.6595081399376514, + "grad_norm": 0.40859082341194153, + "learning_rate": 5.342579750346741e-06, + "loss": 0.0096, + "step": 6726 + }, + { + "epoch": 4.660200900588847, + "grad_norm": 0.6382933855056763, + "learning_rate": 5.341886269070735e-06, + "loss": 0.0196, + "step": 6727 + }, + { + "epoch": 4.660893661240042, + "grad_norm": 0.48991847038269043, + "learning_rate": 5.34119278779473e-06, + "loss": 0.0152, + "step": 6728 + }, + { + "epoch": 4.661586421891236, + "grad_norm": 0.4030665457248688, + "learning_rate": 5.340499306518725e-06, + "loss": 0.0101, + "step": 6729 + }, + { + "epoch": 4.662279182542432, + "grad_norm": 0.5373626947402954, + "learning_rate": 5.3398058252427185e-06, + "loss": 0.0159, + "step": 6730 + }, + { + "epoch": 4.662971943193627, + "grad_norm": 0.5690442323684692, + "learning_rate": 5.3391123439667134e-06, + "loss": 0.0146, + "step": 6731 + }, + { + "epoch": 4.663664703844821, + "grad_norm": 0.4243113696575165, + "learning_rate": 5.3384188626907075e-06, + "loss": 0.0132, + "step": 6732 + }, + { + "epoch": 4.664357464496017, + "grad_norm": 0.39271458983421326, + "learning_rate": 5.3377253814147025e-06, + "loss": 0.0123, + "step": 6733 + }, + { + "epoch": 4.665050225147212, + "grad_norm": 0.501668393611908, + "learning_rate": 5.337031900138697e-06, + "loss": 0.0144, + "step": 6734 + }, + { + "epoch": 4.665742985798406, + "grad_norm": 0.44533124566078186, + "learning_rate": 5.336338418862691e-06, + "loss": 0.014, + "step": 6735 + }, + { + "epoch": 4.666435746449602, + "grad_norm": 0.420138955116272, + "learning_rate": 5.3356449375866856e-06, + "loss": 0.0146, + "step": 6736 + }, + { + "epoch": 4.667128507100797, + "grad_norm": 0.42193126678466797, + "learning_rate": 5.33495145631068e-06, + "loss": 0.0183, + "step": 6737 + }, + { + "epoch": 4.667821267751991, + "grad_norm": 0.4445243775844574, + "learning_rate": 5.334257975034675e-06, + "loss": 0.0143, + "step": 6738 + }, + { + "epoch": 4.6685140284031865, + "grad_norm": 0.44470784068107605, + "learning_rate": 5.3335644937586695e-06, + "loss": 0.013, + "step": 6739 + }, + { + "epoch": 4.669206789054382, + "grad_norm": 0.643493115901947, + "learning_rate": 5.332871012482664e-06, + "loss": 0.0153, + "step": 6740 + }, + { + "epoch": 4.669899549705577, + "grad_norm": 0.4524647295475006, + "learning_rate": 5.3321775312066585e-06, + "loss": 0.0125, + "step": 6741 + }, + { + "epoch": 4.6705923103567715, + "grad_norm": 0.7606937885284424, + "learning_rate": 5.331484049930652e-06, + "loss": 0.0195, + "step": 6742 + }, + { + "epoch": 4.671285071007967, + "grad_norm": 0.4757654368877411, + "learning_rate": 5.330790568654647e-06, + "loss": 0.0125, + "step": 6743 + }, + { + "epoch": 4.671977831659162, + "grad_norm": 0.608229398727417, + "learning_rate": 5.330097087378642e-06, + "loss": 0.0181, + "step": 6744 + }, + { + "epoch": 4.672670592310356, + "grad_norm": 0.3284655213356018, + "learning_rate": 5.329403606102636e-06, + "loss": 0.0102, + "step": 6745 + }, + { + "epoch": 4.673363352961552, + "grad_norm": 0.34887823462486267, + "learning_rate": 5.328710124826631e-06, + "loss": 0.0111, + "step": 6746 + }, + { + "epoch": 4.674056113612747, + "grad_norm": 0.5461851954460144, + "learning_rate": 5.328016643550624e-06, + "loss": 0.0172, + "step": 6747 + }, + { + "epoch": 4.674748874263942, + "grad_norm": 0.38614967465400696, + "learning_rate": 5.327323162274619e-06, + "loss": 0.0136, + "step": 6748 + }, + { + "epoch": 4.675441634915137, + "grad_norm": 0.3726162016391754, + "learning_rate": 5.326629680998614e-06, + "loss": 0.0112, + "step": 6749 + }, + { + "epoch": 4.676134395566332, + "grad_norm": 0.4830145239830017, + "learning_rate": 5.325936199722608e-06, + "loss": 0.0167, + "step": 6750 + }, + { + "epoch": 4.676827156217527, + "grad_norm": 0.39423564076423645, + "learning_rate": 5.325242718446603e-06, + "loss": 0.0128, + "step": 6751 + }, + { + "epoch": 4.677519916868722, + "grad_norm": 0.4504843056201935, + "learning_rate": 5.324549237170597e-06, + "loss": 0.0159, + "step": 6752 + }, + { + "epoch": 4.678212677519917, + "grad_norm": 0.5660965442657471, + "learning_rate": 5.323855755894591e-06, + "loss": 0.0133, + "step": 6753 + }, + { + "epoch": 4.678905438171112, + "grad_norm": 0.48100876808166504, + "learning_rate": 5.323162274618586e-06, + "loss": 0.0183, + "step": 6754 + }, + { + "epoch": 4.6795981988223065, + "grad_norm": 0.4504172205924988, + "learning_rate": 5.32246879334258e-06, + "loss": 0.018, + "step": 6755 + }, + { + "epoch": 4.680290959473502, + "grad_norm": 0.3996303081512451, + "learning_rate": 5.321775312066575e-06, + "loss": 0.013, + "step": 6756 + }, + { + "epoch": 4.680983720124697, + "grad_norm": 0.395956814289093, + "learning_rate": 5.321081830790569e-06, + "loss": 0.0128, + "step": 6757 + }, + { + "epoch": 4.6816764807758915, + "grad_norm": 0.4500115215778351, + "learning_rate": 5.320388349514564e-06, + "loss": 0.0171, + "step": 6758 + }, + { + "epoch": 4.682369241427087, + "grad_norm": 0.4182335138320923, + "learning_rate": 5.319694868238559e-06, + "loss": 0.0125, + "step": 6759 + }, + { + "epoch": 4.683062002078282, + "grad_norm": 0.46114209294319153, + "learning_rate": 5.319001386962552e-06, + "loss": 0.0206, + "step": 6760 + }, + { + "epoch": 4.683754762729477, + "grad_norm": 0.4365215003490448, + "learning_rate": 5.318307905686547e-06, + "loss": 0.0126, + "step": 6761 + }, + { + "epoch": 4.684447523380672, + "grad_norm": 0.5210328102111816, + "learning_rate": 5.317614424410541e-06, + "loss": 0.0198, + "step": 6762 + }, + { + "epoch": 4.685140284031867, + "grad_norm": 0.4795258045196533, + "learning_rate": 5.316920943134536e-06, + "loss": 0.0168, + "step": 6763 + }, + { + "epoch": 4.685833044683062, + "grad_norm": 0.42756351828575134, + "learning_rate": 5.316227461858531e-06, + "loss": 0.0181, + "step": 6764 + }, + { + "epoch": 4.686525805334257, + "grad_norm": 0.46755433082580566, + "learning_rate": 5.315533980582524e-06, + "loss": 0.0189, + "step": 6765 + }, + { + "epoch": 4.687218565985452, + "grad_norm": 0.5363218188285828, + "learning_rate": 5.314840499306519e-06, + "loss": 0.0176, + "step": 6766 + }, + { + "epoch": 4.687911326636647, + "grad_norm": 0.44285398721694946, + "learning_rate": 5.314147018030513e-06, + "loss": 0.0126, + "step": 6767 + }, + { + "epoch": 4.6886040872878425, + "grad_norm": 0.47270604968070984, + "learning_rate": 5.313453536754508e-06, + "loss": 0.0136, + "step": 6768 + }, + { + "epoch": 4.689296847939037, + "grad_norm": 0.5872167348861694, + "learning_rate": 5.312760055478503e-06, + "loss": 0.0202, + "step": 6769 + }, + { + "epoch": 4.689989608590232, + "grad_norm": 0.49353814125061035, + "learning_rate": 5.312066574202497e-06, + "loss": 0.0219, + "step": 6770 + }, + { + "epoch": 4.6906823692414275, + "grad_norm": 0.5511910915374756, + "learning_rate": 5.311373092926492e-06, + "loss": 0.0131, + "step": 6771 + }, + { + "epoch": 4.691375129892622, + "grad_norm": 0.4286562204360962, + "learning_rate": 5.310679611650485e-06, + "loss": 0.0136, + "step": 6772 + }, + { + "epoch": 4.692067890543817, + "grad_norm": 0.41244441270828247, + "learning_rate": 5.30998613037448e-06, + "loss": 0.012, + "step": 6773 + }, + { + "epoch": 4.692760651195012, + "grad_norm": 0.5870745778083801, + "learning_rate": 5.309292649098475e-06, + "loss": 0.0152, + "step": 6774 + }, + { + "epoch": 4.693453411846207, + "grad_norm": 0.36597204208374023, + "learning_rate": 5.308599167822469e-06, + "loss": 0.0117, + "step": 6775 + }, + { + "epoch": 4.694146172497402, + "grad_norm": 0.5219966173171997, + "learning_rate": 5.307905686546464e-06, + "loss": 0.0189, + "step": 6776 + }, + { + "epoch": 4.694838933148597, + "grad_norm": 0.5244627594947815, + "learning_rate": 5.3072122052704575e-06, + "loss": 0.0173, + "step": 6777 + }, + { + "epoch": 4.695531693799792, + "grad_norm": 0.6822748184204102, + "learning_rate": 5.3065187239944524e-06, + "loss": 0.0159, + "step": 6778 + }, + { + "epoch": 4.696224454450987, + "grad_norm": 0.5595081448554993, + "learning_rate": 5.305825242718447e-06, + "loss": 0.0216, + "step": 6779 + }, + { + "epoch": 4.696917215102182, + "grad_norm": 0.49543243646621704, + "learning_rate": 5.3051317614424415e-06, + "loss": 0.0214, + "step": 6780 + }, + { + "epoch": 4.697609975753378, + "grad_norm": 0.5642449855804443, + "learning_rate": 5.304438280166436e-06, + "loss": 0.0122, + "step": 6781 + }, + { + "epoch": 4.698302736404572, + "grad_norm": 0.4676438271999359, + "learning_rate": 5.30374479889043e-06, + "loss": 0.013, + "step": 6782 + }, + { + "epoch": 4.698995497055767, + "grad_norm": 0.5166946053504944, + "learning_rate": 5.3030513176144246e-06, + "loss": 0.0162, + "step": 6783 + }, + { + "epoch": 4.6996882577069625, + "grad_norm": 0.43936529755592346, + "learning_rate": 5.3023578363384195e-06, + "loss": 0.0141, + "step": 6784 + }, + { + "epoch": 4.700381018358157, + "grad_norm": 0.5059449076652527, + "learning_rate": 5.301664355062414e-06, + "loss": 0.0144, + "step": 6785 + }, + { + "epoch": 4.701073779009352, + "grad_norm": 0.42727038264274597, + "learning_rate": 5.3009708737864085e-06, + "loss": 0.0112, + "step": 6786 + }, + { + "epoch": 4.7017665396605475, + "grad_norm": 0.5576202869415283, + "learning_rate": 5.300277392510403e-06, + "loss": 0.0233, + "step": 6787 + }, + { + "epoch": 4.702459300311743, + "grad_norm": 0.4915574789047241, + "learning_rate": 5.2995839112343975e-06, + "loss": 0.016, + "step": 6788 + }, + { + "epoch": 4.703152060962937, + "grad_norm": 0.41665318608283997, + "learning_rate": 5.2988904299583925e-06, + "loss": 0.0123, + "step": 6789 + }, + { + "epoch": 4.703844821614132, + "grad_norm": 0.42135944962501526, + "learning_rate": 5.298196948682386e-06, + "loss": 0.0128, + "step": 6790 + }, + { + "epoch": 4.704537582265328, + "grad_norm": 0.5983707308769226, + "learning_rate": 5.297503467406381e-06, + "loss": 0.0164, + "step": 6791 + }, + { + "epoch": 4.705230342916522, + "grad_norm": 0.4632401466369629, + "learning_rate": 5.296809986130375e-06, + "loss": 0.0145, + "step": 6792 + }, + { + "epoch": 4.705923103567717, + "grad_norm": 0.4878617823123932, + "learning_rate": 5.29611650485437e-06, + "loss": 0.0162, + "step": 6793 + }, + { + "epoch": 4.706615864218913, + "grad_norm": 0.4483562409877777, + "learning_rate": 5.295423023578365e-06, + "loss": 0.0195, + "step": 6794 + }, + { + "epoch": 4.707308624870107, + "grad_norm": 0.462507963180542, + "learning_rate": 5.294729542302358e-06, + "loss": 0.0168, + "step": 6795 + }, + { + "epoch": 4.708001385521302, + "grad_norm": 0.3771485388278961, + "learning_rate": 5.294036061026353e-06, + "loss": 0.011, + "step": 6796 + }, + { + "epoch": 4.708694146172498, + "grad_norm": 0.43506211042404175, + "learning_rate": 5.293342579750347e-06, + "loss": 0.0148, + "step": 6797 + }, + { + "epoch": 4.709386906823692, + "grad_norm": 0.6274265646934509, + "learning_rate": 5.292649098474342e-06, + "loss": 0.0135, + "step": 6798 + }, + { + "epoch": 4.710079667474887, + "grad_norm": 0.5505167841911316, + "learning_rate": 5.291955617198337e-06, + "loss": 0.0129, + "step": 6799 + }, + { + "epoch": 4.7107724281260825, + "grad_norm": 0.44486531615257263, + "learning_rate": 5.291262135922331e-06, + "loss": 0.0164, + "step": 6800 + }, + { + "epoch": 4.711465188777278, + "grad_norm": 0.6184029579162598, + "learning_rate": 5.290568654646325e-06, + "loss": 0.024, + "step": 6801 + }, + { + "epoch": 4.712157949428472, + "grad_norm": 0.5388383269309998, + "learning_rate": 5.289875173370319e-06, + "loss": 0.0181, + "step": 6802 + }, + { + "epoch": 4.7128507100796675, + "grad_norm": 0.43575114011764526, + "learning_rate": 5.289181692094314e-06, + "loss": 0.0142, + "step": 6803 + }, + { + "epoch": 4.713543470730863, + "grad_norm": 0.5159897208213806, + "learning_rate": 5.288488210818309e-06, + "loss": 0.017, + "step": 6804 + }, + { + "epoch": 4.714236231382057, + "grad_norm": 0.5509412288665771, + "learning_rate": 5.287794729542303e-06, + "loss": 0.0199, + "step": 6805 + }, + { + "epoch": 4.714928992033252, + "grad_norm": 0.5837615132331848, + "learning_rate": 5.287101248266298e-06, + "loss": 0.0249, + "step": 6806 + }, + { + "epoch": 4.715621752684448, + "grad_norm": 0.42776304483413696, + "learning_rate": 5.286407766990291e-06, + "loss": 0.0133, + "step": 6807 + }, + { + "epoch": 4.716314513335643, + "grad_norm": 0.4792737066745758, + "learning_rate": 5.285714285714286e-06, + "loss": 0.0165, + "step": 6808 + }, + { + "epoch": 4.717007273986837, + "grad_norm": 0.3855811059474945, + "learning_rate": 5.285020804438281e-06, + "loss": 0.0126, + "step": 6809 + }, + { + "epoch": 4.717700034638033, + "grad_norm": 0.44185054302215576, + "learning_rate": 5.284327323162275e-06, + "loss": 0.0128, + "step": 6810 + }, + { + "epoch": 4.718392795289228, + "grad_norm": 0.5195660591125488, + "learning_rate": 5.28363384188627e-06, + "loss": 0.014, + "step": 6811 + }, + { + "epoch": 4.719085555940422, + "grad_norm": 0.4914865493774414, + "learning_rate": 5.282940360610263e-06, + "loss": 0.0136, + "step": 6812 + }, + { + "epoch": 4.719778316591618, + "grad_norm": 0.46978315711021423, + "learning_rate": 5.282246879334258e-06, + "loss": 0.0139, + "step": 6813 + }, + { + "epoch": 4.720471077242813, + "grad_norm": 0.509129524230957, + "learning_rate": 5.281553398058253e-06, + "loss": 0.0184, + "step": 6814 + }, + { + "epoch": 4.721163837894007, + "grad_norm": 0.4341345429420471, + "learning_rate": 5.280859916782247e-06, + "loss": 0.0151, + "step": 6815 + }, + { + "epoch": 4.721856598545203, + "grad_norm": 0.5684191584587097, + "learning_rate": 5.280166435506242e-06, + "loss": 0.0202, + "step": 6816 + }, + { + "epoch": 4.722549359196398, + "grad_norm": 0.4436483383178711, + "learning_rate": 5.279472954230236e-06, + "loss": 0.0169, + "step": 6817 + }, + { + "epoch": 4.723242119847592, + "grad_norm": 0.6761987805366516, + "learning_rate": 5.278779472954231e-06, + "loss": 0.0189, + "step": 6818 + }, + { + "epoch": 4.7239348804987875, + "grad_norm": 0.6232472658157349, + "learning_rate": 5.278085991678226e-06, + "loss": 0.0204, + "step": 6819 + }, + { + "epoch": 4.724627641149983, + "grad_norm": 0.6841349005699158, + "learning_rate": 5.277392510402219e-06, + "loss": 0.0199, + "step": 6820 + }, + { + "epoch": 4.725320401801178, + "grad_norm": 0.4502756893634796, + "learning_rate": 5.276699029126214e-06, + "loss": 0.0174, + "step": 6821 + }, + { + "epoch": 4.7260131624523725, + "grad_norm": 0.460126668214798, + "learning_rate": 5.276005547850208e-06, + "loss": 0.0133, + "step": 6822 + }, + { + "epoch": 4.726705923103568, + "grad_norm": 0.8593262434005737, + "learning_rate": 5.275312066574203e-06, + "loss": 0.0186, + "step": 6823 + }, + { + "epoch": 4.727398683754763, + "grad_norm": 0.4822736978530884, + "learning_rate": 5.274618585298198e-06, + "loss": 0.016, + "step": 6824 + }, + { + "epoch": 4.728091444405957, + "grad_norm": 0.5434883832931519, + "learning_rate": 5.2739251040221914e-06, + "loss": 0.0179, + "step": 6825 + }, + { + "epoch": 4.728784205057153, + "grad_norm": 0.4977904260158539, + "learning_rate": 5.273231622746186e-06, + "loss": 0.0145, + "step": 6826 + }, + { + "epoch": 4.729476965708348, + "grad_norm": 0.5161583423614502, + "learning_rate": 5.2725381414701805e-06, + "loss": 0.0155, + "step": 6827 + }, + { + "epoch": 4.730169726359543, + "grad_norm": 0.44691187143325806, + "learning_rate": 5.271844660194175e-06, + "loss": 0.0215, + "step": 6828 + }, + { + "epoch": 4.730862487010738, + "grad_norm": 0.40459316968917847, + "learning_rate": 5.27115117891817e-06, + "loss": 0.0137, + "step": 6829 + }, + { + "epoch": 4.731555247661933, + "grad_norm": 0.5111734867095947, + "learning_rate": 5.2704576976421636e-06, + "loss": 0.0155, + "step": 6830 + }, + { + "epoch": 4.732248008313128, + "grad_norm": 0.424892783164978, + "learning_rate": 5.2697642163661585e-06, + "loss": 0.0149, + "step": 6831 + }, + { + "epoch": 4.732940768964323, + "grad_norm": 0.4103415012359619, + "learning_rate": 5.269070735090153e-06, + "loss": 0.0159, + "step": 6832 + }, + { + "epoch": 4.733633529615518, + "grad_norm": 0.5258544683456421, + "learning_rate": 5.2683772538141475e-06, + "loss": 0.0198, + "step": 6833 + }, + { + "epoch": 4.734326290266713, + "grad_norm": 0.4566044807434082, + "learning_rate": 5.2676837725381425e-06, + "loss": 0.0131, + "step": 6834 + }, + { + "epoch": 4.7350190509179075, + "grad_norm": 0.4631848931312561, + "learning_rate": 5.2669902912621365e-06, + "loss": 0.017, + "step": 6835 + }, + { + "epoch": 4.735711811569103, + "grad_norm": 0.4678950011730194, + "learning_rate": 5.2662968099861315e-06, + "loss": 0.0181, + "step": 6836 + }, + { + "epoch": 4.736404572220298, + "grad_norm": 0.5638415217399597, + "learning_rate": 5.265603328710125e-06, + "loss": 0.0173, + "step": 6837 + }, + { + "epoch": 4.7370973328714925, + "grad_norm": 0.49170637130737305, + "learning_rate": 5.26490984743412e-06, + "loss": 0.0205, + "step": 6838 + }, + { + "epoch": 4.737790093522688, + "grad_norm": 0.6220834851264954, + "learning_rate": 5.264216366158115e-06, + "loss": 0.0296, + "step": 6839 + }, + { + "epoch": 4.738482854173883, + "grad_norm": 0.36823028326034546, + "learning_rate": 5.263522884882109e-06, + "loss": 0.0127, + "step": 6840 + }, + { + "epoch": 4.739175614825078, + "grad_norm": 0.4995688796043396, + "learning_rate": 5.262829403606104e-06, + "loss": 0.013, + "step": 6841 + }, + { + "epoch": 4.739868375476273, + "grad_norm": 0.33165818452835083, + "learning_rate": 5.262135922330097e-06, + "loss": 0.0118, + "step": 6842 + }, + { + "epoch": 4.740561136127468, + "grad_norm": 0.6700669527053833, + "learning_rate": 5.261442441054092e-06, + "loss": 0.0161, + "step": 6843 + }, + { + "epoch": 4.741253896778663, + "grad_norm": 0.49722665548324585, + "learning_rate": 5.260748959778087e-06, + "loss": 0.0206, + "step": 6844 + }, + { + "epoch": 4.741946657429858, + "grad_norm": 0.41947928071022034, + "learning_rate": 5.260055478502081e-06, + "loss": 0.0148, + "step": 6845 + }, + { + "epoch": 4.742639418081053, + "grad_norm": 0.37966054677963257, + "learning_rate": 5.259361997226076e-06, + "loss": 0.01, + "step": 6846 + }, + { + "epoch": 4.743332178732248, + "grad_norm": 0.4666665494441986, + "learning_rate": 5.25866851595007e-06, + "loss": 0.0187, + "step": 6847 + }, + { + "epoch": 4.7440249393834435, + "grad_norm": 0.36083927750587463, + "learning_rate": 5.257975034674065e-06, + "loss": 0.013, + "step": 6848 + }, + { + "epoch": 4.744717700034638, + "grad_norm": 0.5228288769721985, + "learning_rate": 5.257281553398059e-06, + "loss": 0.0176, + "step": 6849 + }, + { + "epoch": 4.745410460685833, + "grad_norm": 0.4528660476207733, + "learning_rate": 5.256588072122053e-06, + "loss": 0.0159, + "step": 6850 + }, + { + "epoch": 4.746103221337028, + "grad_norm": 0.5029258131980896, + "learning_rate": 5.255894590846048e-06, + "loss": 0.0217, + "step": 6851 + }, + { + "epoch": 4.746795981988223, + "grad_norm": 0.4450753629207611, + "learning_rate": 5.255201109570042e-06, + "loss": 0.0106, + "step": 6852 + }, + { + "epoch": 4.747488742639418, + "grad_norm": 0.657798707485199, + "learning_rate": 5.254507628294037e-06, + "loss": 0.0136, + "step": 6853 + }, + { + "epoch": 4.748181503290613, + "grad_norm": 0.5315065383911133, + "learning_rate": 5.253814147018032e-06, + "loss": 0.0134, + "step": 6854 + }, + { + "epoch": 4.748874263941808, + "grad_norm": 0.5781354308128357, + "learning_rate": 5.253120665742025e-06, + "loss": 0.0173, + "step": 6855 + }, + { + "epoch": 4.749567024593003, + "grad_norm": 0.42641496658325195, + "learning_rate": 5.25242718446602e-06, + "loss": 0.0167, + "step": 6856 + }, + { + "epoch": 4.750259785244198, + "grad_norm": 0.7987641096115112, + "learning_rate": 5.251733703190014e-06, + "loss": 0.0242, + "step": 6857 + }, + { + "epoch": 4.750952545895393, + "grad_norm": 0.5275469422340393, + "learning_rate": 5.251040221914009e-06, + "loss": 0.013, + "step": 6858 + }, + { + "epoch": 4.751645306546588, + "grad_norm": 0.4702892601490021, + "learning_rate": 5.250346740638004e-06, + "loss": 0.0181, + "step": 6859 + }, + { + "epoch": 4.752338067197783, + "grad_norm": 0.5504516363143921, + "learning_rate": 5.249653259361997e-06, + "loss": 0.015, + "step": 6860 + }, + { + "epoch": 4.753030827848978, + "grad_norm": 0.567467451095581, + "learning_rate": 5.248959778085992e-06, + "loss": 0.0199, + "step": 6861 + }, + { + "epoch": 4.753723588500173, + "grad_norm": 0.9012731313705444, + "learning_rate": 5.248266296809986e-06, + "loss": 0.0158, + "step": 6862 + }, + { + "epoch": 4.754416349151368, + "grad_norm": 0.48847752809524536, + "learning_rate": 5.247572815533981e-06, + "loss": 0.021, + "step": 6863 + }, + { + "epoch": 4.7551091098025635, + "grad_norm": 0.5339195728302002, + "learning_rate": 5.246879334257975e-06, + "loss": 0.0173, + "step": 6864 + }, + { + "epoch": 4.755801870453758, + "grad_norm": 0.5476733446121216, + "learning_rate": 5.24618585298197e-06, + "loss": 0.0192, + "step": 6865 + }, + { + "epoch": 4.756494631104953, + "grad_norm": 0.46313193440437317, + "learning_rate": 5.245492371705965e-06, + "loss": 0.0139, + "step": 6866 + }, + { + "epoch": 4.7571873917561485, + "grad_norm": 0.4554114043712616, + "learning_rate": 5.244798890429958e-06, + "loss": 0.0115, + "step": 6867 + }, + { + "epoch": 4.757880152407344, + "grad_norm": 0.43525463342666626, + "learning_rate": 5.244105409153953e-06, + "loss": 0.0115, + "step": 6868 + }, + { + "epoch": 4.758572913058538, + "grad_norm": 0.5824906229972839, + "learning_rate": 5.243411927877947e-06, + "loss": 0.0157, + "step": 6869 + }, + { + "epoch": 4.759265673709733, + "grad_norm": 0.3750251531600952, + "learning_rate": 5.242718446601942e-06, + "loss": 0.0133, + "step": 6870 + }, + { + "epoch": 4.759958434360929, + "grad_norm": 0.5041017532348633, + "learning_rate": 5.242024965325937e-06, + "loss": 0.0196, + "step": 6871 + }, + { + "epoch": 4.760651195012123, + "grad_norm": 0.6041717529296875, + "learning_rate": 5.2413314840499304e-06, + "loss": 0.0216, + "step": 6872 + }, + { + "epoch": 4.761343955663318, + "grad_norm": 0.4168316423892975, + "learning_rate": 5.240638002773925e-06, + "loss": 0.0128, + "step": 6873 + }, + { + "epoch": 4.762036716314514, + "grad_norm": 0.5820544958114624, + "learning_rate": 5.2399445214979195e-06, + "loss": 0.0156, + "step": 6874 + }, + { + "epoch": 4.762729476965708, + "grad_norm": 0.6230272054672241, + "learning_rate": 5.239251040221914e-06, + "loss": 0.0181, + "step": 6875 + }, + { + "epoch": 4.763422237616903, + "grad_norm": 0.3964560031890869, + "learning_rate": 5.238557558945909e-06, + "loss": 0.0165, + "step": 6876 + }, + { + "epoch": 4.764114998268099, + "grad_norm": 0.4693322777748108, + "learning_rate": 5.237864077669903e-06, + "loss": 0.0143, + "step": 6877 + }, + { + "epoch": 4.764807758919293, + "grad_norm": 0.5114544630050659, + "learning_rate": 5.2371705963938975e-06, + "loss": 0.0161, + "step": 6878 + }, + { + "epoch": 4.765500519570488, + "grad_norm": 0.443996399641037, + "learning_rate": 5.236477115117892e-06, + "loss": 0.011, + "step": 6879 + }, + { + "epoch": 4.7661932802216835, + "grad_norm": 0.5216764807701111, + "learning_rate": 5.2357836338418865e-06, + "loss": 0.0161, + "step": 6880 + }, + { + "epoch": 4.766886040872878, + "grad_norm": 0.5465784072875977, + "learning_rate": 5.2350901525658815e-06, + "loss": 0.021, + "step": 6881 + }, + { + "epoch": 4.767578801524073, + "grad_norm": 0.45073312520980835, + "learning_rate": 5.2343966712898755e-06, + "loss": 0.0155, + "step": 6882 + }, + { + "epoch": 4.7682715621752685, + "grad_norm": 0.4280548393726349, + "learning_rate": 5.2337031900138705e-06, + "loss": 0.0145, + "step": 6883 + }, + { + "epoch": 4.768964322826464, + "grad_norm": 0.5377956628799438, + "learning_rate": 5.233009708737864e-06, + "loss": 0.0135, + "step": 6884 + }, + { + "epoch": 4.769657083477658, + "grad_norm": 0.5947824716567993, + "learning_rate": 5.232316227461859e-06, + "loss": 0.0206, + "step": 6885 + }, + { + "epoch": 4.770349844128853, + "grad_norm": 0.8097753524780273, + "learning_rate": 5.231622746185854e-06, + "loss": 0.0185, + "step": 6886 + }, + { + "epoch": 4.771042604780049, + "grad_norm": 0.3802036941051483, + "learning_rate": 5.230929264909848e-06, + "loss": 0.0115, + "step": 6887 + }, + { + "epoch": 4.771735365431244, + "grad_norm": 0.41896852850914, + "learning_rate": 5.230235783633843e-06, + "loss": 0.0131, + "step": 6888 + }, + { + "epoch": 4.772428126082438, + "grad_norm": 0.5791265964508057, + "learning_rate": 5.229542302357836e-06, + "loss": 0.0155, + "step": 6889 + }, + { + "epoch": 4.773120886733634, + "grad_norm": 0.5431563854217529, + "learning_rate": 5.228848821081831e-06, + "loss": 0.0184, + "step": 6890 + }, + { + "epoch": 4.773813647384829, + "grad_norm": 0.4802630841732025, + "learning_rate": 5.228155339805826e-06, + "loss": 0.0169, + "step": 6891 + }, + { + "epoch": 4.774506408036023, + "grad_norm": 0.42098310589790344, + "learning_rate": 5.22746185852982e-06, + "loss": 0.0128, + "step": 6892 + }, + { + "epoch": 4.775199168687219, + "grad_norm": 0.4361574351787567, + "learning_rate": 5.226768377253815e-06, + "loss": 0.012, + "step": 6893 + }, + { + "epoch": 4.775891929338414, + "grad_norm": 0.6751755475997925, + "learning_rate": 5.226074895977809e-06, + "loss": 0.0217, + "step": 6894 + }, + { + "epoch": 4.776584689989608, + "grad_norm": 0.5671015381813049, + "learning_rate": 5.225381414701804e-06, + "loss": 0.0179, + "step": 6895 + }, + { + "epoch": 4.777277450640804, + "grad_norm": 0.5791196823120117, + "learning_rate": 5.224687933425799e-06, + "loss": 0.019, + "step": 6896 + }, + { + "epoch": 4.777970211291999, + "grad_norm": 0.483121782541275, + "learning_rate": 5.223994452149792e-06, + "loss": 0.0154, + "step": 6897 + }, + { + "epoch": 4.778662971943193, + "grad_norm": 0.48173636198043823, + "learning_rate": 5.223300970873787e-06, + "loss": 0.0197, + "step": 6898 + }, + { + "epoch": 4.7793557325943885, + "grad_norm": 0.6470112800598145, + "learning_rate": 5.222607489597781e-06, + "loss": 0.0157, + "step": 6899 + }, + { + "epoch": 4.780048493245584, + "grad_norm": 0.4605175256729126, + "learning_rate": 5.221914008321776e-06, + "loss": 0.0181, + "step": 6900 + }, + { + "epoch": 4.780741253896778, + "grad_norm": 0.4896812438964844, + "learning_rate": 5.221220527045771e-06, + "loss": 0.0179, + "step": 6901 + }, + { + "epoch": 4.7814340145479735, + "grad_norm": 0.6870664358139038, + "learning_rate": 5.220527045769764e-06, + "loss": 0.0221, + "step": 6902 + }, + { + "epoch": 4.782126775199169, + "grad_norm": 0.47149863839149475, + "learning_rate": 5.219833564493759e-06, + "loss": 0.0147, + "step": 6903 + }, + { + "epoch": 4.782819535850364, + "grad_norm": 0.5091636776924133, + "learning_rate": 5.219140083217753e-06, + "loss": 0.018, + "step": 6904 + }, + { + "epoch": 4.783512296501558, + "grad_norm": 0.4500809907913208, + "learning_rate": 5.218446601941748e-06, + "loss": 0.0129, + "step": 6905 + }, + { + "epoch": 4.784205057152754, + "grad_norm": 0.4145751893520355, + "learning_rate": 5.217753120665743e-06, + "loss": 0.0162, + "step": 6906 + }, + { + "epoch": 4.784897817803949, + "grad_norm": 0.594142496585846, + "learning_rate": 5.217059639389736e-06, + "loss": 0.011, + "step": 6907 + }, + { + "epoch": 4.785590578455144, + "grad_norm": 0.7651152014732361, + "learning_rate": 5.216366158113731e-06, + "loss": 0.0222, + "step": 6908 + }, + { + "epoch": 4.786283339106339, + "grad_norm": 0.5690034031867981, + "learning_rate": 5.215672676837725e-06, + "loss": 0.018, + "step": 6909 + }, + { + "epoch": 4.786976099757534, + "grad_norm": 0.5289434790611267, + "learning_rate": 5.21497919556172e-06, + "loss": 0.0154, + "step": 6910 + }, + { + "epoch": 4.787668860408729, + "grad_norm": 0.4769068658351898, + "learning_rate": 5.214285714285715e-06, + "loss": 0.0142, + "step": 6911 + }, + { + "epoch": 4.788361621059924, + "grad_norm": 0.5074408054351807, + "learning_rate": 5.213592233009709e-06, + "loss": 0.0188, + "step": 6912 + }, + { + "epoch": 4.789054381711119, + "grad_norm": 0.536114513874054, + "learning_rate": 5.212898751733704e-06, + "loss": 0.0205, + "step": 6913 + }, + { + "epoch": 4.789747142362314, + "grad_norm": 0.4882396459579468, + "learning_rate": 5.212205270457697e-06, + "loss": 0.0151, + "step": 6914 + }, + { + "epoch": 4.7904399030135085, + "grad_norm": 0.5209609866142273, + "learning_rate": 5.211511789181692e-06, + "loss": 0.0219, + "step": 6915 + }, + { + "epoch": 4.791132663664704, + "grad_norm": 0.577211856842041, + "learning_rate": 5.210818307905687e-06, + "loss": 0.0189, + "step": 6916 + }, + { + "epoch": 4.791825424315899, + "grad_norm": 0.8125247359275818, + "learning_rate": 5.210124826629681e-06, + "loss": 0.015, + "step": 6917 + }, + { + "epoch": 4.7925181849670935, + "grad_norm": 0.662858247756958, + "learning_rate": 5.209431345353676e-06, + "loss": 0.0147, + "step": 6918 + }, + { + "epoch": 4.793210945618289, + "grad_norm": 0.37163200974464417, + "learning_rate": 5.2087378640776694e-06, + "loss": 0.0126, + "step": 6919 + }, + { + "epoch": 4.793903706269484, + "grad_norm": 0.49915891885757446, + "learning_rate": 5.208044382801664e-06, + "loss": 0.0135, + "step": 6920 + }, + { + "epoch": 4.794596466920678, + "grad_norm": 0.44924038648605347, + "learning_rate": 5.207350901525659e-06, + "loss": 0.0153, + "step": 6921 + }, + { + "epoch": 4.795289227571874, + "grad_norm": 0.42528730630874634, + "learning_rate": 5.206657420249653e-06, + "loss": 0.0144, + "step": 6922 + }, + { + "epoch": 4.795981988223069, + "grad_norm": 0.5404415130615234, + "learning_rate": 5.205963938973648e-06, + "loss": 0.0193, + "step": 6923 + }, + { + "epoch": 4.796674748874264, + "grad_norm": 0.4966762661933899, + "learning_rate": 5.205270457697642e-06, + "loss": 0.0155, + "step": 6924 + }, + { + "epoch": 4.797367509525459, + "grad_norm": 0.5383783578872681, + "learning_rate": 5.204576976421637e-06, + "loss": 0.02, + "step": 6925 + }, + { + "epoch": 4.798060270176654, + "grad_norm": 0.4352760910987854, + "learning_rate": 5.2038834951456314e-06, + "loss": 0.0146, + "step": 6926 + }, + { + "epoch": 4.798753030827849, + "grad_norm": 0.5450789928436279, + "learning_rate": 5.2031900138696255e-06, + "loss": 0.0191, + "step": 6927 + }, + { + "epoch": 4.7994457914790445, + "grad_norm": 0.601810097694397, + "learning_rate": 5.2024965325936205e-06, + "loss": 0.0205, + "step": 6928 + }, + { + "epoch": 4.800138552130239, + "grad_norm": 0.6019473075866699, + "learning_rate": 5.2018030513176145e-06, + "loss": 0.0149, + "step": 6929 + }, + { + "epoch": 4.800831312781434, + "grad_norm": 0.43512487411499023, + "learning_rate": 5.2011095700416095e-06, + "loss": 0.0159, + "step": 6930 + }, + { + "epoch": 4.801524073432629, + "grad_norm": 0.5419918894767761, + "learning_rate": 5.200416088765604e-06, + "loss": 0.017, + "step": 6931 + }, + { + "epoch": 4.802216834083824, + "grad_norm": 0.5038084387779236, + "learning_rate": 5.199722607489598e-06, + "loss": 0.0173, + "step": 6932 + }, + { + "epoch": 4.802909594735019, + "grad_norm": 0.5100458264350891, + "learning_rate": 5.199029126213593e-06, + "loss": 0.0205, + "step": 6933 + }, + { + "epoch": 4.803602355386214, + "grad_norm": 0.47111448645591736, + "learning_rate": 5.198335644937587e-06, + "loss": 0.0174, + "step": 6934 + }, + { + "epoch": 4.804295116037409, + "grad_norm": 0.46273452043533325, + "learning_rate": 5.197642163661582e-06, + "loss": 0.0153, + "step": 6935 + }, + { + "epoch": 4.804987876688604, + "grad_norm": 0.5884197950363159, + "learning_rate": 5.1969486823855765e-06, + "loss": 0.0225, + "step": 6936 + }, + { + "epoch": 4.805680637339799, + "grad_norm": 0.5098763704299927, + "learning_rate": 5.19625520110957e-06, + "loss": 0.0145, + "step": 6937 + }, + { + "epoch": 4.806373397990994, + "grad_norm": 0.500295877456665, + "learning_rate": 5.195561719833565e-06, + "loss": 0.0211, + "step": 6938 + }, + { + "epoch": 4.807066158642189, + "grad_norm": 0.505456268787384, + "learning_rate": 5.194868238557559e-06, + "loss": 0.0145, + "step": 6939 + }, + { + "epoch": 4.807758919293384, + "grad_norm": 0.44865840673446655, + "learning_rate": 5.194174757281554e-06, + "loss": 0.0151, + "step": 6940 + }, + { + "epoch": 4.808451679944579, + "grad_norm": 0.4600048065185547, + "learning_rate": 5.193481276005549e-06, + "loss": 0.0133, + "step": 6941 + }, + { + "epoch": 4.809144440595774, + "grad_norm": 0.764250636100769, + "learning_rate": 5.192787794729543e-06, + "loss": 0.0129, + "step": 6942 + }, + { + "epoch": 4.809837201246969, + "grad_norm": 0.4583585560321808, + "learning_rate": 5.192094313453538e-06, + "loss": 0.0131, + "step": 6943 + }, + { + "epoch": 4.8105299618981645, + "grad_norm": 0.4157699644565582, + "learning_rate": 5.191400832177531e-06, + "loss": 0.0114, + "step": 6944 + }, + { + "epoch": 4.811222722549359, + "grad_norm": 0.62359219789505, + "learning_rate": 5.190707350901526e-06, + "loss": 0.018, + "step": 6945 + }, + { + "epoch": 4.811915483200554, + "grad_norm": 0.37869542837142944, + "learning_rate": 5.190013869625521e-06, + "loss": 0.0141, + "step": 6946 + }, + { + "epoch": 4.8126082438517495, + "grad_norm": 0.42418041825294495, + "learning_rate": 5.189320388349515e-06, + "loss": 0.0133, + "step": 6947 + }, + { + "epoch": 4.813301004502944, + "grad_norm": 0.4623964726924896, + "learning_rate": 5.18862690707351e-06, + "loss": 0.0139, + "step": 6948 + }, + { + "epoch": 4.813993765154139, + "grad_norm": 0.5853286385536194, + "learning_rate": 5.187933425797503e-06, + "loss": 0.0159, + "step": 6949 + }, + { + "epoch": 4.814686525805334, + "grad_norm": 0.5568832159042358, + "learning_rate": 5.187239944521498e-06, + "loss": 0.0184, + "step": 6950 + }, + { + "epoch": 4.81537928645653, + "grad_norm": 0.48901286721229553, + "learning_rate": 5.186546463245493e-06, + "loss": 0.0193, + "step": 6951 + }, + { + "epoch": 4.816072047107724, + "grad_norm": 0.4346074163913727, + "learning_rate": 5.185852981969487e-06, + "loss": 0.012, + "step": 6952 + }, + { + "epoch": 4.816764807758919, + "grad_norm": 0.6015490293502808, + "learning_rate": 5.185159500693482e-06, + "loss": 0.0261, + "step": 6953 + }, + { + "epoch": 4.817457568410115, + "grad_norm": 0.46684080362319946, + "learning_rate": 5.184466019417476e-06, + "loss": 0.0147, + "step": 6954 + }, + { + "epoch": 4.818150329061309, + "grad_norm": 0.5361834764480591, + "learning_rate": 5.18377253814147e-06, + "loss": 0.0208, + "step": 6955 + }, + { + "epoch": 4.818843089712504, + "grad_norm": 0.4664507210254669, + "learning_rate": 5.183079056865465e-06, + "loss": 0.0181, + "step": 6956 + }, + { + "epoch": 4.8195358503637, + "grad_norm": 0.5465168356895447, + "learning_rate": 5.182385575589459e-06, + "loss": 0.0181, + "step": 6957 + }, + { + "epoch": 4.820228611014894, + "grad_norm": 0.3864993155002594, + "learning_rate": 5.181692094313454e-06, + "loss": 0.0111, + "step": 6958 + }, + { + "epoch": 4.820921371666089, + "grad_norm": 0.4204265773296356, + "learning_rate": 5.180998613037448e-06, + "loss": 0.013, + "step": 6959 + }, + { + "epoch": 4.8216141323172845, + "grad_norm": 0.6744337677955627, + "learning_rate": 5.180305131761443e-06, + "loss": 0.0253, + "step": 6960 + }, + { + "epoch": 4.822306892968479, + "grad_norm": 0.3722931146621704, + "learning_rate": 5.179611650485438e-06, + "loss": 0.0125, + "step": 6961 + }, + { + "epoch": 4.822999653619674, + "grad_norm": 0.6089905500411987, + "learning_rate": 5.178918169209431e-06, + "loss": 0.0189, + "step": 6962 + }, + { + "epoch": 4.8236924142708695, + "grad_norm": 0.5528891086578369, + "learning_rate": 5.178224687933426e-06, + "loss": 0.0186, + "step": 6963 + }, + { + "epoch": 4.824385174922065, + "grad_norm": 0.4733254909515381, + "learning_rate": 5.17753120665742e-06, + "loss": 0.0139, + "step": 6964 + }, + { + "epoch": 4.825077935573259, + "grad_norm": 0.38616856932640076, + "learning_rate": 5.176837725381415e-06, + "loss": 0.0118, + "step": 6965 + }, + { + "epoch": 4.825770696224454, + "grad_norm": 0.4355665445327759, + "learning_rate": 5.17614424410541e-06, + "loss": 0.0178, + "step": 6966 + }, + { + "epoch": 4.82646345687565, + "grad_norm": 0.5209665894508362, + "learning_rate": 5.175450762829403e-06, + "loss": 0.014, + "step": 6967 + }, + { + "epoch": 4.827156217526844, + "grad_norm": 0.996882438659668, + "learning_rate": 5.174757281553398e-06, + "loss": 0.0213, + "step": 6968 + }, + { + "epoch": 4.827848978178039, + "grad_norm": 0.5259845852851868, + "learning_rate": 5.174063800277392e-06, + "loss": 0.0284, + "step": 6969 + }, + { + "epoch": 4.828541738829235, + "grad_norm": 0.49833589792251587, + "learning_rate": 5.173370319001387e-06, + "loss": 0.0218, + "step": 6970 + }, + { + "epoch": 4.82923449948043, + "grad_norm": 0.4214405119419098, + "learning_rate": 5.172676837725382e-06, + "loss": 0.0121, + "step": 6971 + }, + { + "epoch": 4.829927260131624, + "grad_norm": 0.571959376335144, + "learning_rate": 5.171983356449376e-06, + "loss": 0.0165, + "step": 6972 + }, + { + "epoch": 4.83062002078282, + "grad_norm": 0.5179031491279602, + "learning_rate": 5.171289875173371e-06, + "loss": 0.0218, + "step": 6973 + }, + { + "epoch": 4.831312781434015, + "grad_norm": 0.43725210428237915, + "learning_rate": 5.1705963938973645e-06, + "loss": 0.0135, + "step": 6974 + }, + { + "epoch": 4.832005542085209, + "grad_norm": 0.40078112483024597, + "learning_rate": 5.1699029126213595e-06, + "loss": 0.0111, + "step": 6975 + }, + { + "epoch": 4.8326983027364045, + "grad_norm": 0.4938311278820038, + "learning_rate": 5.169209431345354e-06, + "loss": 0.0126, + "step": 6976 + }, + { + "epoch": 4.8333910633876, + "grad_norm": 0.7092520594596863, + "learning_rate": 5.1685159500693485e-06, + "loss": 0.0227, + "step": 6977 + }, + { + "epoch": 4.834083824038794, + "grad_norm": 0.5861003398895264, + "learning_rate": 5.167822468793343e-06, + "loss": 0.0172, + "step": 6978 + }, + { + "epoch": 4.8347765846899895, + "grad_norm": 0.5506494641304016, + "learning_rate": 5.167128987517337e-06, + "loss": 0.0158, + "step": 6979 + }, + { + "epoch": 4.835469345341185, + "grad_norm": 0.5054938793182373, + "learning_rate": 5.166435506241332e-06, + "loss": 0.0178, + "step": 6980 + }, + { + "epoch": 4.836162105992379, + "grad_norm": 0.3828156888484955, + "learning_rate": 5.1657420249653265e-06, + "loss": 0.0136, + "step": 6981 + }, + { + "epoch": 4.8368548666435744, + "grad_norm": 0.610649824142456, + "learning_rate": 5.165048543689321e-06, + "loss": 0.0203, + "step": 6982 + }, + { + "epoch": 4.83754762729477, + "grad_norm": 0.5363022685050964, + "learning_rate": 5.1643550624133155e-06, + "loss": 0.0196, + "step": 6983 + }, + { + "epoch": 4.838240387945965, + "grad_norm": 0.41401076316833496, + "learning_rate": 5.163661581137309e-06, + "loss": 0.0174, + "step": 6984 + }, + { + "epoch": 4.838933148597159, + "grad_norm": 0.584434986114502, + "learning_rate": 5.162968099861304e-06, + "loss": 0.0152, + "step": 6985 + }, + { + "epoch": 4.839625909248355, + "grad_norm": 0.5189743041992188, + "learning_rate": 5.162274618585299e-06, + "loss": 0.016, + "step": 6986 + }, + { + "epoch": 4.84031866989955, + "grad_norm": 0.42707058787345886, + "learning_rate": 5.161581137309293e-06, + "loss": 0.0183, + "step": 6987 + }, + { + "epoch": 4.841011430550744, + "grad_norm": 0.5152461528778076, + "learning_rate": 5.160887656033288e-06, + "loss": 0.0144, + "step": 6988 + }, + { + "epoch": 4.84170419120194, + "grad_norm": 0.5386325120925903, + "learning_rate": 5.160194174757282e-06, + "loss": 0.0212, + "step": 6989 + }, + { + "epoch": 4.842396951853135, + "grad_norm": 0.4806686043739319, + "learning_rate": 5.159500693481277e-06, + "loss": 0.019, + "step": 6990 + }, + { + "epoch": 4.84308971250433, + "grad_norm": 0.48570263385772705, + "learning_rate": 5.158807212205272e-06, + "loss": 0.0184, + "step": 6991 + }, + { + "epoch": 4.843782473155525, + "grad_norm": 0.45237165689468384, + "learning_rate": 5.158113730929265e-06, + "loss": 0.0127, + "step": 6992 + }, + { + "epoch": 4.84447523380672, + "grad_norm": 0.5239438414573669, + "learning_rate": 5.15742024965326e-06, + "loss": 0.0167, + "step": 6993 + }, + { + "epoch": 4.845167994457915, + "grad_norm": 0.5638341307640076, + "learning_rate": 5.156726768377254e-06, + "loss": 0.0243, + "step": 6994 + }, + { + "epoch": 4.8458607551091095, + "grad_norm": 0.41066622734069824, + "learning_rate": 5.156033287101249e-06, + "loss": 0.0123, + "step": 6995 + }, + { + "epoch": 4.846553515760305, + "grad_norm": 0.47895199060440063, + "learning_rate": 5.155339805825244e-06, + "loss": 0.0157, + "step": 6996 + }, + { + "epoch": 4.8472462764115, + "grad_norm": 0.4531351327896118, + "learning_rate": 5.154646324549237e-06, + "loss": 0.0139, + "step": 6997 + }, + { + "epoch": 4.8479390370626945, + "grad_norm": 0.43267518281936646, + "learning_rate": 5.153952843273232e-06, + "loss": 0.0165, + "step": 6998 + }, + { + "epoch": 4.84863179771389, + "grad_norm": 0.49868276715278625, + "learning_rate": 5.153259361997226e-06, + "loss": 0.0131, + "step": 6999 + }, + { + "epoch": 4.849324558365085, + "grad_norm": 0.3782249093055725, + "learning_rate": 5.152565880721221e-06, + "loss": 0.0114, + "step": 7000 + }, + { + "epoch": 4.850017319016279, + "grad_norm": 0.4751749038696289, + "learning_rate": 5.151872399445216e-06, + "loss": 0.0144, + "step": 7001 + }, + { + "epoch": 4.850710079667475, + "grad_norm": 0.4537191092967987, + "learning_rate": 5.15117891816921e-06, + "loss": 0.0148, + "step": 7002 + }, + { + "epoch": 4.85140284031867, + "grad_norm": 0.5568353533744812, + "learning_rate": 5.150485436893204e-06, + "loss": 0.0181, + "step": 7003 + }, + { + "epoch": 4.852095600969865, + "grad_norm": 0.360664963722229, + "learning_rate": 5.149791955617198e-06, + "loss": 0.0098, + "step": 7004 + }, + { + "epoch": 4.85278836162106, + "grad_norm": 0.4307069182395935, + "learning_rate": 5.149098474341193e-06, + "loss": 0.0198, + "step": 7005 + }, + { + "epoch": 4.853481122272255, + "grad_norm": 0.5229548811912537, + "learning_rate": 5.148404993065188e-06, + "loss": 0.0174, + "step": 7006 + }, + { + "epoch": 4.85417388292345, + "grad_norm": 1.091782569885254, + "learning_rate": 5.147711511789182e-06, + "loss": 0.0196, + "step": 7007 + }, + { + "epoch": 4.854866643574645, + "grad_norm": 0.36841440200805664, + "learning_rate": 5.147018030513177e-06, + "loss": 0.0113, + "step": 7008 + }, + { + "epoch": 4.85555940422584, + "grad_norm": 0.5098832845687866, + "learning_rate": 5.14632454923717e-06, + "loss": 0.0131, + "step": 7009 + }, + { + "epoch": 4.856252164877035, + "grad_norm": 0.534648597240448, + "learning_rate": 5.145631067961165e-06, + "loss": 0.0194, + "step": 7010 + }, + { + "epoch": 4.85694492552823, + "grad_norm": 0.6079515814781189, + "learning_rate": 5.14493758668516e-06, + "loss": 0.0164, + "step": 7011 + }, + { + "epoch": 4.857637686179425, + "grad_norm": 0.5154495239257812, + "learning_rate": 5.144244105409154e-06, + "loss": 0.0174, + "step": 7012 + }, + { + "epoch": 4.85833044683062, + "grad_norm": 0.5022515058517456, + "learning_rate": 5.143550624133149e-06, + "loss": 0.0188, + "step": 7013 + }, + { + "epoch": 4.859023207481815, + "grad_norm": 0.4352777302265167, + "learning_rate": 5.142857142857142e-06, + "loss": 0.0135, + "step": 7014 + }, + { + "epoch": 4.85971596813301, + "grad_norm": 0.5344070196151733, + "learning_rate": 5.142163661581137e-06, + "loss": 0.0154, + "step": 7015 + }, + { + "epoch": 4.860408728784205, + "grad_norm": 0.45540904998779297, + "learning_rate": 5.141470180305132e-06, + "loss": 0.0136, + "step": 7016 + }, + { + "epoch": 4.8611014894354, + "grad_norm": 0.5667386054992676, + "learning_rate": 5.140776699029126e-06, + "loss": 0.0177, + "step": 7017 + }, + { + "epoch": 4.861794250086595, + "grad_norm": 0.7090626358985901, + "learning_rate": 5.140083217753121e-06, + "loss": 0.0235, + "step": 7018 + }, + { + "epoch": 4.86248701073779, + "grad_norm": 0.5996568202972412, + "learning_rate": 5.139389736477115e-06, + "loss": 0.0163, + "step": 7019 + }, + { + "epoch": 4.863179771388985, + "grad_norm": 0.731913149356842, + "learning_rate": 5.13869625520111e-06, + "loss": 0.0196, + "step": 7020 + }, + { + "epoch": 4.86387253204018, + "grad_norm": 0.7054145932197571, + "learning_rate": 5.138002773925105e-06, + "loss": 0.0182, + "step": 7021 + }, + { + "epoch": 4.864565292691375, + "grad_norm": 0.5450630187988281, + "learning_rate": 5.1373092926490985e-06, + "loss": 0.0181, + "step": 7022 + }, + { + "epoch": 4.86525805334257, + "grad_norm": 0.5680803060531616, + "learning_rate": 5.136615811373093e-06, + "loss": 0.0225, + "step": 7023 + }, + { + "epoch": 4.8659508139937655, + "grad_norm": 0.43003177642822266, + "learning_rate": 5.1359223300970875e-06, + "loss": 0.0191, + "step": 7024 + }, + { + "epoch": 4.86664357464496, + "grad_norm": 0.4550376832485199, + "learning_rate": 5.135228848821082e-06, + "loss": 0.014, + "step": 7025 + }, + { + "epoch": 4.867336335296155, + "grad_norm": 0.46195662021636963, + "learning_rate": 5.134535367545077e-06, + "loss": 0.0146, + "step": 7026 + }, + { + "epoch": 4.8680290959473504, + "grad_norm": 0.4369540512561798, + "learning_rate": 5.133841886269071e-06, + "loss": 0.0121, + "step": 7027 + }, + { + "epoch": 4.868721856598545, + "grad_norm": 0.5560098886489868, + "learning_rate": 5.1331484049930655e-06, + "loss": 0.0205, + "step": 7028 + }, + { + "epoch": 4.86941461724974, + "grad_norm": 0.5415427088737488, + "learning_rate": 5.13245492371706e-06, + "loss": 0.0164, + "step": 7029 + }, + { + "epoch": 4.870107377900935, + "grad_norm": 0.6134937405586243, + "learning_rate": 5.1317614424410545e-06, + "loss": 0.0154, + "step": 7030 + }, + { + "epoch": 4.870800138552131, + "grad_norm": 0.41532644629478455, + "learning_rate": 5.1310679611650495e-06, + "loss": 0.0137, + "step": 7031 + }, + { + "epoch": 4.871492899203325, + "grad_norm": 0.34864580631256104, + "learning_rate": 5.130374479889043e-06, + "loss": 0.0122, + "step": 7032 + }, + { + "epoch": 4.87218565985452, + "grad_norm": 0.5494161248207092, + "learning_rate": 5.129680998613038e-06, + "loss": 0.0199, + "step": 7033 + }, + { + "epoch": 4.872878420505716, + "grad_norm": 0.6478584408760071, + "learning_rate": 5.128987517337032e-06, + "loss": 0.0191, + "step": 7034 + }, + { + "epoch": 4.87357118115691, + "grad_norm": 0.43007680773735046, + "learning_rate": 5.128294036061027e-06, + "loss": 0.016, + "step": 7035 + }, + { + "epoch": 4.874263941808105, + "grad_norm": 0.5305352210998535, + "learning_rate": 5.127600554785022e-06, + "loss": 0.0202, + "step": 7036 + }, + { + "epoch": 4.874956702459301, + "grad_norm": 0.4242437481880188, + "learning_rate": 5.126907073509016e-06, + "loss": 0.0154, + "step": 7037 + }, + { + "epoch": 4.875649463110495, + "grad_norm": 0.4685125946998596, + "learning_rate": 5.126213592233011e-06, + "loss": 0.0175, + "step": 7038 + }, + { + "epoch": 4.87634222376169, + "grad_norm": 0.5354596972465515, + "learning_rate": 5.125520110957004e-06, + "loss": 0.0256, + "step": 7039 + }, + { + "epoch": 4.8770349844128855, + "grad_norm": 0.6412429809570312, + "learning_rate": 5.124826629680999e-06, + "loss": 0.0184, + "step": 7040 + }, + { + "epoch": 4.87772774506408, + "grad_norm": 0.4054080843925476, + "learning_rate": 5.124133148404994e-06, + "loss": 0.0148, + "step": 7041 + }, + { + "epoch": 4.878420505715275, + "grad_norm": 0.4576794505119324, + "learning_rate": 5.123439667128988e-06, + "loss": 0.0165, + "step": 7042 + }, + { + "epoch": 4.8791132663664705, + "grad_norm": 0.5380566120147705, + "learning_rate": 5.122746185852983e-06, + "loss": 0.0181, + "step": 7043 + }, + { + "epoch": 4.879806027017666, + "grad_norm": 0.477628231048584, + "learning_rate": 5.122052704576976e-06, + "loss": 0.0243, + "step": 7044 + }, + { + "epoch": 4.88049878766886, + "grad_norm": 0.7909729480743408, + "learning_rate": 5.121359223300971e-06, + "loss": 0.0232, + "step": 7045 + }, + { + "epoch": 4.881191548320055, + "grad_norm": 0.7681084871292114, + "learning_rate": 5.120665742024966e-06, + "loss": 0.0165, + "step": 7046 + }, + { + "epoch": 4.881884308971251, + "grad_norm": 0.49574559926986694, + "learning_rate": 5.11997226074896e-06, + "loss": 0.0181, + "step": 7047 + }, + { + "epoch": 4.882577069622445, + "grad_norm": 0.46113428473472595, + "learning_rate": 5.119278779472955e-06, + "loss": 0.0126, + "step": 7048 + }, + { + "epoch": 4.88326983027364, + "grad_norm": 0.5326622128486633, + "learning_rate": 5.118585298196949e-06, + "loss": 0.0183, + "step": 7049 + }, + { + "epoch": 4.883962590924836, + "grad_norm": 0.4911978244781494, + "learning_rate": 5.117891816920944e-06, + "loss": 0.0113, + "step": 7050 + }, + { + "epoch": 4.884655351576031, + "grad_norm": 0.5054808855056763, + "learning_rate": 5.117198335644938e-06, + "loss": 0.019, + "step": 7051 + }, + { + "epoch": 4.885348112227225, + "grad_norm": 0.5193711519241333, + "learning_rate": 5.116504854368932e-06, + "loss": 0.0182, + "step": 7052 + }, + { + "epoch": 4.886040872878421, + "grad_norm": 0.49985775351524353, + "learning_rate": 5.115811373092927e-06, + "loss": 0.0205, + "step": 7053 + }, + { + "epoch": 4.886733633529616, + "grad_norm": 0.5126142501831055, + "learning_rate": 5.115117891816921e-06, + "loss": 0.016, + "step": 7054 + }, + { + "epoch": 4.88742639418081, + "grad_norm": 0.41101741790771484, + "learning_rate": 5.114424410540916e-06, + "loss": 0.0127, + "step": 7055 + }, + { + "epoch": 4.8881191548320055, + "grad_norm": 0.5187044143676758, + "learning_rate": 5.113730929264911e-06, + "loss": 0.0124, + "step": 7056 + }, + { + "epoch": 4.888811915483201, + "grad_norm": 0.6034947633743286, + "learning_rate": 5.113037447988904e-06, + "loss": 0.0239, + "step": 7057 + }, + { + "epoch": 4.889504676134395, + "grad_norm": 0.5134703516960144, + "learning_rate": 5.112343966712899e-06, + "loss": 0.0142, + "step": 7058 + }, + { + "epoch": 4.8901974367855905, + "grad_norm": 0.5486952066421509, + "learning_rate": 5.111650485436893e-06, + "loss": 0.0214, + "step": 7059 + }, + { + "epoch": 4.890890197436786, + "grad_norm": 0.5766928791999817, + "learning_rate": 5.110957004160888e-06, + "loss": 0.0163, + "step": 7060 + }, + { + "epoch": 4.89158295808798, + "grad_norm": 0.5219733119010925, + "learning_rate": 5.110263522884883e-06, + "loss": 0.0164, + "step": 7061 + }, + { + "epoch": 4.892275718739175, + "grad_norm": 0.44417136907577515, + "learning_rate": 5.109570041608876e-06, + "loss": 0.0161, + "step": 7062 + }, + { + "epoch": 4.892968479390371, + "grad_norm": 0.5025767683982849, + "learning_rate": 5.108876560332871e-06, + "loss": 0.0152, + "step": 7063 + }, + { + "epoch": 4.893661240041566, + "grad_norm": 0.5078673958778381, + "learning_rate": 5.108183079056865e-06, + "loss": 0.0146, + "step": 7064 + }, + { + "epoch": 4.89435400069276, + "grad_norm": 0.5547715425491333, + "learning_rate": 5.10748959778086e-06, + "loss": 0.0159, + "step": 7065 + }, + { + "epoch": 4.895046761343956, + "grad_norm": 0.5483763217926025, + "learning_rate": 5.106796116504855e-06, + "loss": 0.0159, + "step": 7066 + }, + { + "epoch": 4.895739521995151, + "grad_norm": 0.506777822971344, + "learning_rate": 5.106102635228849e-06, + "loss": 0.0229, + "step": 7067 + }, + { + "epoch": 4.896432282646345, + "grad_norm": 0.5792956948280334, + "learning_rate": 5.105409153952844e-06, + "loss": 0.0167, + "step": 7068 + }, + { + "epoch": 4.897125043297541, + "grad_norm": 0.47799813747406006, + "learning_rate": 5.1047156726768375e-06, + "loss": 0.0175, + "step": 7069 + }, + { + "epoch": 4.897817803948736, + "grad_norm": 0.4106331467628479, + "learning_rate": 5.104022191400832e-06, + "loss": 0.0137, + "step": 7070 + }, + { + "epoch": 4.898510564599931, + "grad_norm": 0.5419927835464478, + "learning_rate": 5.103328710124827e-06, + "loss": 0.0181, + "step": 7071 + }, + { + "epoch": 4.899203325251126, + "grad_norm": 0.8085773587226868, + "learning_rate": 5.102635228848821e-06, + "loss": 0.0216, + "step": 7072 + }, + { + "epoch": 4.899896085902321, + "grad_norm": 0.43125635385513306, + "learning_rate": 5.101941747572816e-06, + "loss": 0.0208, + "step": 7073 + }, + { + "epoch": 4.900588846553516, + "grad_norm": 0.49862387776374817, + "learning_rate": 5.10124826629681e-06, + "loss": 0.0141, + "step": 7074 + }, + { + "epoch": 4.9012816072047105, + "grad_norm": 0.4481689929962158, + "learning_rate": 5.1005547850208045e-06, + "loss": 0.0125, + "step": 7075 + }, + { + "epoch": 4.901974367855906, + "grad_norm": 0.6605470776557922, + "learning_rate": 5.0998613037447995e-06, + "loss": 0.0205, + "step": 7076 + }, + { + "epoch": 4.902667128507101, + "grad_norm": 0.5066818594932556, + "learning_rate": 5.0991678224687935e-06, + "loss": 0.0146, + "step": 7077 + }, + { + "epoch": 4.9033598891582955, + "grad_norm": 0.515537679195404, + "learning_rate": 5.0984743411927885e-06, + "loss": 0.0155, + "step": 7078 + }, + { + "epoch": 4.904052649809491, + "grad_norm": 0.5556579828262329, + "learning_rate": 5.0977808599167826e-06, + "loss": 0.021, + "step": 7079 + }, + { + "epoch": 4.904745410460686, + "grad_norm": 0.45318371057510376, + "learning_rate": 5.0970873786407775e-06, + "loss": 0.0129, + "step": 7080 + }, + { + "epoch": 4.90543817111188, + "grad_norm": 0.5929465889930725, + "learning_rate": 5.096393897364772e-06, + "loss": 0.0168, + "step": 7081 + }, + { + "epoch": 4.906130931763076, + "grad_norm": 0.4716765284538269, + "learning_rate": 5.095700416088766e-06, + "loss": 0.0116, + "step": 7082 + }, + { + "epoch": 4.906823692414271, + "grad_norm": 0.49768924713134766, + "learning_rate": 5.095006934812761e-06, + "loss": 0.0179, + "step": 7083 + }, + { + "epoch": 4.907516453065466, + "grad_norm": 0.37211477756500244, + "learning_rate": 5.094313453536755e-06, + "loss": 0.0109, + "step": 7084 + }, + { + "epoch": 4.908209213716661, + "grad_norm": 0.41021373867988586, + "learning_rate": 5.09361997226075e-06, + "loss": 0.0143, + "step": 7085 + }, + { + "epoch": 4.908901974367856, + "grad_norm": 0.4520041346549988, + "learning_rate": 5.0929264909847446e-06, + "loss": 0.0119, + "step": 7086 + }, + { + "epoch": 4.909594735019051, + "grad_norm": 0.47611114382743835, + "learning_rate": 5.092233009708738e-06, + "loss": 0.0188, + "step": 7087 + }, + { + "epoch": 4.910287495670246, + "grad_norm": 0.5464442372322083, + "learning_rate": 5.091539528432733e-06, + "loss": 0.0184, + "step": 7088 + }, + { + "epoch": 4.910980256321441, + "grad_norm": 0.5857868790626526, + "learning_rate": 5.090846047156727e-06, + "loss": 0.0218, + "step": 7089 + }, + { + "epoch": 4.911673016972636, + "grad_norm": 0.4725109338760376, + "learning_rate": 5.090152565880722e-06, + "loss": 0.0167, + "step": 7090 + }, + { + "epoch": 4.912365777623831, + "grad_norm": 0.42381516098976135, + "learning_rate": 5.089459084604717e-06, + "loss": 0.0164, + "step": 7091 + }, + { + "epoch": 4.913058538275026, + "grad_norm": 0.40553903579711914, + "learning_rate": 5.08876560332871e-06, + "loss": 0.012, + "step": 7092 + }, + { + "epoch": 4.913751298926221, + "grad_norm": 0.6457179188728333, + "learning_rate": 5.088072122052705e-06, + "loss": 0.0164, + "step": 7093 + }, + { + "epoch": 4.914444059577416, + "grad_norm": 0.39204317331314087, + "learning_rate": 5.087378640776699e-06, + "loss": 0.0122, + "step": 7094 + }, + { + "epoch": 4.915136820228611, + "grad_norm": 0.5320116877555847, + "learning_rate": 5.086685159500694e-06, + "loss": 0.0161, + "step": 7095 + }, + { + "epoch": 4.915829580879806, + "grad_norm": 0.42274361848831177, + "learning_rate": 5.085991678224689e-06, + "loss": 0.0129, + "step": 7096 + }, + { + "epoch": 4.916522341531001, + "grad_norm": 0.42949315905570984, + "learning_rate": 5.085298196948683e-06, + "loss": 0.0153, + "step": 7097 + }, + { + "epoch": 4.917215102182196, + "grad_norm": 0.49816930294036865, + "learning_rate": 5.084604715672678e-06, + "loss": 0.0195, + "step": 7098 + }, + { + "epoch": 4.917907862833391, + "grad_norm": 0.449648916721344, + "learning_rate": 5.083911234396671e-06, + "loss": 0.014, + "step": 7099 + }, + { + "epoch": 4.918600623484586, + "grad_norm": 0.5509673357009888, + "learning_rate": 5.083217753120666e-06, + "loss": 0.0162, + "step": 7100 + }, + { + "epoch": 4.919293384135781, + "grad_norm": 0.487145334482193, + "learning_rate": 5.082524271844661e-06, + "loss": 0.0166, + "step": 7101 + }, + { + "epoch": 4.919986144786976, + "grad_norm": 0.4487362205982208, + "learning_rate": 5.081830790568655e-06, + "loss": 0.0122, + "step": 7102 + }, + { + "epoch": 4.920678905438171, + "grad_norm": 0.4550587832927704, + "learning_rate": 5.08113730929265e-06, + "loss": 0.021, + "step": 7103 + }, + { + "epoch": 4.9213716660893665, + "grad_norm": 0.6630377769470215, + "learning_rate": 5.080443828016643e-06, + "loss": 0.0275, + "step": 7104 + }, + { + "epoch": 4.922064426740561, + "grad_norm": 0.5580195188522339, + "learning_rate": 5.079750346740638e-06, + "loss": 0.0157, + "step": 7105 + }, + { + "epoch": 4.922757187391756, + "grad_norm": 0.4646613895893097, + "learning_rate": 5.079056865464633e-06, + "loss": 0.017, + "step": 7106 + }, + { + "epoch": 4.923449948042951, + "grad_norm": 0.5144959688186646, + "learning_rate": 5.078363384188627e-06, + "loss": 0.0153, + "step": 7107 + }, + { + "epoch": 4.924142708694146, + "grad_norm": 0.5957618951797485, + "learning_rate": 5.077669902912622e-06, + "loss": 0.0207, + "step": 7108 + }, + { + "epoch": 4.924835469345341, + "grad_norm": 0.4713325798511505, + "learning_rate": 5.076976421636616e-06, + "loss": 0.0189, + "step": 7109 + }, + { + "epoch": 4.925528229996536, + "grad_norm": 0.486539751291275, + "learning_rate": 5.07628294036061e-06, + "loss": 0.0168, + "step": 7110 + }, + { + "epoch": 4.926220990647732, + "grad_norm": 0.5931656360626221, + "learning_rate": 5.075589459084605e-06, + "loss": 0.0189, + "step": 7111 + }, + { + "epoch": 4.926913751298926, + "grad_norm": 0.4117998778820038, + "learning_rate": 5.074895977808599e-06, + "loss": 0.0146, + "step": 7112 + }, + { + "epoch": 4.927606511950121, + "grad_norm": 0.42907875776290894, + "learning_rate": 5.074202496532594e-06, + "loss": 0.0121, + "step": 7113 + }, + { + "epoch": 4.928299272601317, + "grad_norm": 0.4469527006149292, + "learning_rate": 5.073509015256588e-06, + "loss": 0.0122, + "step": 7114 + }, + { + "epoch": 4.928992033252511, + "grad_norm": 0.4878230094909668, + "learning_rate": 5.072815533980583e-06, + "loss": 0.0184, + "step": 7115 + }, + { + "epoch": 4.929684793903706, + "grad_norm": 0.6271370053291321, + "learning_rate": 5.072122052704578e-06, + "loss": 0.0149, + "step": 7116 + }, + { + "epoch": 4.930377554554902, + "grad_norm": 0.4054403007030487, + "learning_rate": 5.071428571428571e-06, + "loss": 0.0113, + "step": 7117 + }, + { + "epoch": 4.931070315206096, + "grad_norm": 0.52023845911026, + "learning_rate": 5.070735090152566e-06, + "loss": 0.0193, + "step": 7118 + }, + { + "epoch": 4.931763075857291, + "grad_norm": 0.38854172825813293, + "learning_rate": 5.07004160887656e-06, + "loss": 0.0128, + "step": 7119 + }, + { + "epoch": 4.9324558365084865, + "grad_norm": 0.42394208908081055, + "learning_rate": 5.069348127600555e-06, + "loss": 0.0149, + "step": 7120 + }, + { + "epoch": 4.933148597159681, + "grad_norm": 0.37314775586128235, + "learning_rate": 5.06865464632455e-06, + "loss": 0.0103, + "step": 7121 + }, + { + "epoch": 4.933841357810876, + "grad_norm": 0.3798964321613312, + "learning_rate": 5.0679611650485435e-06, + "loss": 0.017, + "step": 7122 + }, + { + "epoch": 4.9345341184620715, + "grad_norm": 0.6011042594909668, + "learning_rate": 5.0672676837725385e-06, + "loss": 0.0244, + "step": 7123 + }, + { + "epoch": 4.935226879113267, + "grad_norm": 0.5264334678649902, + "learning_rate": 5.0665742024965325e-06, + "loss": 0.0184, + "step": 7124 + }, + { + "epoch": 4.935919639764461, + "grad_norm": 0.5989033579826355, + "learning_rate": 5.0658807212205275e-06, + "loss": 0.0165, + "step": 7125 + }, + { + "epoch": 4.936612400415656, + "grad_norm": 0.6535217761993408, + "learning_rate": 5.065187239944522e-06, + "loss": 0.0165, + "step": 7126 + }, + { + "epoch": 4.937305161066852, + "grad_norm": 0.44317764043807983, + "learning_rate": 5.0644937586685165e-06, + "loss": 0.0117, + "step": 7127 + }, + { + "epoch": 4.937997921718046, + "grad_norm": 0.4648471772670746, + "learning_rate": 5.0638002773925114e-06, + "loss": 0.0141, + "step": 7128 + }, + { + "epoch": 4.938690682369241, + "grad_norm": 0.7307034134864807, + "learning_rate": 5.063106796116505e-06, + "loss": 0.0212, + "step": 7129 + }, + { + "epoch": 4.939383443020437, + "grad_norm": 0.40065693855285645, + "learning_rate": 5.0624133148405e-06, + "loss": 0.0143, + "step": 7130 + }, + { + "epoch": 4.940076203671632, + "grad_norm": 0.4784284830093384, + "learning_rate": 5.0617198335644945e-06, + "loss": 0.0111, + "step": 7131 + }, + { + "epoch": 4.940768964322826, + "grad_norm": 0.4007522165775299, + "learning_rate": 5.061026352288489e-06, + "loss": 0.0106, + "step": 7132 + }, + { + "epoch": 4.941461724974022, + "grad_norm": 0.4763968586921692, + "learning_rate": 5.0603328710124836e-06, + "loss": 0.0157, + "step": 7133 + }, + { + "epoch": 4.942154485625217, + "grad_norm": 0.7442196607589722, + "learning_rate": 5.059639389736477e-06, + "loss": 0.0147, + "step": 7134 + }, + { + "epoch": 4.942847246276411, + "grad_norm": 0.48007553815841675, + "learning_rate": 5.058945908460472e-06, + "loss": 0.0145, + "step": 7135 + }, + { + "epoch": 4.9435400069276065, + "grad_norm": 0.5864841938018799, + "learning_rate": 5.058252427184467e-06, + "loss": 0.0156, + "step": 7136 + }, + { + "epoch": 4.944232767578802, + "grad_norm": 0.5981998443603516, + "learning_rate": 5.057558945908461e-06, + "loss": 0.0239, + "step": 7137 + }, + { + "epoch": 4.944925528229996, + "grad_norm": 0.5885744094848633, + "learning_rate": 5.056865464632456e-06, + "loss": 0.0199, + "step": 7138 + }, + { + "epoch": 4.9456182888811915, + "grad_norm": 0.41168013215065, + "learning_rate": 5.056171983356449e-06, + "loss": 0.013, + "step": 7139 + }, + { + "epoch": 4.946311049532387, + "grad_norm": 0.5568606853485107, + "learning_rate": 5.055478502080444e-06, + "loss": 0.0224, + "step": 7140 + }, + { + "epoch": 4.947003810183581, + "grad_norm": 0.6031033396720886, + "learning_rate": 5.054785020804439e-06, + "loss": 0.0225, + "step": 7141 + }, + { + "epoch": 4.947696570834776, + "grad_norm": 0.49166953563690186, + "learning_rate": 5.054091539528433e-06, + "loss": 0.0176, + "step": 7142 + }, + { + "epoch": 4.948389331485972, + "grad_norm": 0.4761636555194855, + "learning_rate": 5.053398058252428e-06, + "loss": 0.0206, + "step": 7143 + }, + { + "epoch": 4.949082092137167, + "grad_norm": 0.4680408537387848, + "learning_rate": 5.052704576976422e-06, + "loss": 0.0131, + "step": 7144 + }, + { + "epoch": 4.949774852788361, + "grad_norm": 0.5143632888793945, + "learning_rate": 5.052011095700417e-06, + "loss": 0.0181, + "step": 7145 + }, + { + "epoch": 4.950467613439557, + "grad_norm": 0.44326531887054443, + "learning_rate": 5.051317614424412e-06, + "loss": 0.0128, + "step": 7146 + }, + { + "epoch": 4.951160374090752, + "grad_norm": 0.6614006757736206, + "learning_rate": 5.050624133148405e-06, + "loss": 0.0143, + "step": 7147 + }, + { + "epoch": 4.951853134741946, + "grad_norm": 0.6010259389877319, + "learning_rate": 5.0499306518724e-06, + "loss": 0.0217, + "step": 7148 + }, + { + "epoch": 4.952545895393142, + "grad_norm": 0.5312649607658386, + "learning_rate": 5.049237170596394e-06, + "loss": 0.0177, + "step": 7149 + }, + { + "epoch": 4.953238656044337, + "grad_norm": 0.4602656960487366, + "learning_rate": 5.048543689320389e-06, + "loss": 0.015, + "step": 7150 + }, + { + "epoch": 4.953931416695532, + "grad_norm": 0.4423941373825073, + "learning_rate": 5.047850208044384e-06, + "loss": 0.0111, + "step": 7151 + }, + { + "epoch": 4.9546241773467266, + "grad_norm": 0.608744740486145, + "learning_rate": 5.047156726768377e-06, + "loss": 0.0155, + "step": 7152 + }, + { + "epoch": 4.955316937997922, + "grad_norm": 0.5473113059997559, + "learning_rate": 5.046463245492372e-06, + "loss": 0.0165, + "step": 7153 + }, + { + "epoch": 4.956009698649117, + "grad_norm": 0.6941458582878113, + "learning_rate": 5.045769764216366e-06, + "loss": 0.0142, + "step": 7154 + }, + { + "epoch": 4.9567024593003115, + "grad_norm": 0.500970184803009, + "learning_rate": 5.045076282940361e-06, + "loss": 0.0227, + "step": 7155 + }, + { + "epoch": 4.957395219951507, + "grad_norm": 0.4593062698841095, + "learning_rate": 5.044382801664356e-06, + "loss": 0.0153, + "step": 7156 + }, + { + "epoch": 4.958087980602702, + "grad_norm": 0.6693342924118042, + "learning_rate": 5.04368932038835e-06, + "loss": 0.0138, + "step": 7157 + }, + { + "epoch": 4.9587807412538965, + "grad_norm": 0.5365694761276245, + "learning_rate": 5.042995839112344e-06, + "loss": 0.0173, + "step": 7158 + }, + { + "epoch": 4.959473501905092, + "grad_norm": 0.37168315052986145, + "learning_rate": 5.042302357836338e-06, + "loss": 0.0132, + "step": 7159 + }, + { + "epoch": 4.960166262556287, + "grad_norm": 0.489751398563385, + "learning_rate": 5.041608876560333e-06, + "loss": 0.0153, + "step": 7160 + }, + { + "epoch": 4.960859023207481, + "grad_norm": 0.8007532358169556, + "learning_rate": 5.040915395284328e-06, + "loss": 0.0175, + "step": 7161 + }, + { + "epoch": 4.961551783858677, + "grad_norm": 0.41315120458602905, + "learning_rate": 5.040221914008322e-06, + "loss": 0.0164, + "step": 7162 + }, + { + "epoch": 4.962244544509872, + "grad_norm": 0.4592771828174591, + "learning_rate": 5.039528432732317e-06, + "loss": 0.0137, + "step": 7163 + }, + { + "epoch": 4.962937305161067, + "grad_norm": 0.5716656446456909, + "learning_rate": 5.03883495145631e-06, + "loss": 0.0232, + "step": 7164 + }, + { + "epoch": 4.963630065812262, + "grad_norm": 0.5507204532623291, + "learning_rate": 5.038141470180305e-06, + "loss": 0.0113, + "step": 7165 + }, + { + "epoch": 4.964322826463457, + "grad_norm": 0.46243104338645935, + "learning_rate": 5.0374479889043e-06, + "loss": 0.0174, + "step": 7166 + }, + { + "epoch": 4.965015587114652, + "grad_norm": 0.4487704634666443, + "learning_rate": 5.036754507628294e-06, + "loss": 0.0134, + "step": 7167 + }, + { + "epoch": 4.965708347765847, + "grad_norm": 0.5466247797012329, + "learning_rate": 5.036061026352289e-06, + "loss": 0.0175, + "step": 7168 + }, + { + "epoch": 4.966401108417042, + "grad_norm": 0.48058101534843445, + "learning_rate": 5.0353675450762825e-06, + "loss": 0.0165, + "step": 7169 + }, + { + "epoch": 4.967093869068237, + "grad_norm": 0.5494735240936279, + "learning_rate": 5.0346740638002775e-06, + "loss": 0.0234, + "step": 7170 + }, + { + "epoch": 4.967786629719432, + "grad_norm": 0.5059705972671509, + "learning_rate": 5.033980582524272e-06, + "loss": 0.0162, + "step": 7171 + }, + { + "epoch": 4.968479390370627, + "grad_norm": 0.5217702388763428, + "learning_rate": 5.0332871012482665e-06, + "loss": 0.0205, + "step": 7172 + }, + { + "epoch": 4.969172151021822, + "grad_norm": 0.6256882548332214, + "learning_rate": 5.032593619972261e-06, + "loss": 0.0234, + "step": 7173 + }, + { + "epoch": 4.969864911673017, + "grad_norm": 0.5773966908454895, + "learning_rate": 5.0319001386962555e-06, + "loss": 0.0195, + "step": 7174 + }, + { + "epoch": 4.970557672324212, + "grad_norm": 0.5373857021331787, + "learning_rate": 5.0312066574202504e-06, + "loss": 0.012, + "step": 7175 + }, + { + "epoch": 4.971250432975407, + "grad_norm": 0.48725610971450806, + "learning_rate": 5.030513176144245e-06, + "loss": 0.0126, + "step": 7176 + }, + { + "epoch": 4.971943193626602, + "grad_norm": 0.6250792741775513, + "learning_rate": 5.029819694868239e-06, + "loss": 0.0192, + "step": 7177 + }, + { + "epoch": 4.972635954277797, + "grad_norm": 0.4702279269695282, + "learning_rate": 5.0291262135922335e-06, + "loss": 0.0169, + "step": 7178 + }, + { + "epoch": 4.973328714928992, + "grad_norm": 0.3435475826263428, + "learning_rate": 5.028432732316228e-06, + "loss": 0.0124, + "step": 7179 + }, + { + "epoch": 4.974021475580187, + "grad_norm": 0.6400315761566162, + "learning_rate": 5.0277392510402226e-06, + "loss": 0.0115, + "step": 7180 + }, + { + "epoch": 4.974714236231382, + "grad_norm": 0.5794646143913269, + "learning_rate": 5.0270457697642175e-06, + "loss": 0.0223, + "step": 7181 + }, + { + "epoch": 4.975406996882577, + "grad_norm": 0.38571569323539734, + "learning_rate": 5.026352288488211e-06, + "loss": 0.0114, + "step": 7182 + }, + { + "epoch": 4.976099757533772, + "grad_norm": 0.5065745711326599, + "learning_rate": 5.025658807212206e-06, + "loss": 0.0219, + "step": 7183 + }, + { + "epoch": 4.976792518184967, + "grad_norm": 0.4838719069957733, + "learning_rate": 5.0249653259362e-06, + "loss": 0.0161, + "step": 7184 + }, + { + "epoch": 4.977485278836162, + "grad_norm": 0.5162280201911926, + "learning_rate": 5.024271844660195e-06, + "loss": 0.0189, + "step": 7185 + }, + { + "epoch": 4.978178039487357, + "grad_norm": 0.3886980712413788, + "learning_rate": 5.02357836338419e-06, + "loss": 0.0097, + "step": 7186 + }, + { + "epoch": 4.978870800138552, + "grad_norm": 0.5071686506271362, + "learning_rate": 5.022884882108183e-06, + "loss": 0.0181, + "step": 7187 + }, + { + "epoch": 4.979563560789747, + "grad_norm": 0.5926240086555481, + "learning_rate": 5.022191400832178e-06, + "loss": 0.018, + "step": 7188 + }, + { + "epoch": 4.980256321440942, + "grad_norm": 0.3423044979572296, + "learning_rate": 5.021497919556172e-06, + "loss": 0.0099, + "step": 7189 + }, + { + "epoch": 4.980949082092137, + "grad_norm": 0.5712215900421143, + "learning_rate": 5.020804438280167e-06, + "loss": 0.0191, + "step": 7190 + }, + { + "epoch": 4.981641842743333, + "grad_norm": 0.42569872736930847, + "learning_rate": 5.020110957004162e-06, + "loss": 0.0149, + "step": 7191 + }, + { + "epoch": 4.982334603394527, + "grad_norm": 0.5604143738746643, + "learning_rate": 5.019417475728156e-06, + "loss": 0.0145, + "step": 7192 + }, + { + "epoch": 4.983027364045722, + "grad_norm": 0.5349857807159424, + "learning_rate": 5.018723994452151e-06, + "loss": 0.0178, + "step": 7193 + }, + { + "epoch": 4.983720124696918, + "grad_norm": 0.5973315238952637, + "learning_rate": 5.018030513176144e-06, + "loss": 0.0188, + "step": 7194 + }, + { + "epoch": 4.984412885348112, + "grad_norm": 0.40893352031707764, + "learning_rate": 5.017337031900139e-06, + "loss": 0.0139, + "step": 7195 + }, + { + "epoch": 4.985105645999307, + "grad_norm": 0.39439642429351807, + "learning_rate": 5.016643550624134e-06, + "loss": 0.0125, + "step": 7196 + }, + { + "epoch": 4.985798406650503, + "grad_norm": 0.4668135643005371, + "learning_rate": 5.015950069348128e-06, + "loss": 0.0107, + "step": 7197 + }, + { + "epoch": 4.986491167301697, + "grad_norm": 0.5245904922485352, + "learning_rate": 5.015256588072123e-06, + "loss": 0.0226, + "step": 7198 + }, + { + "epoch": 4.987183927952892, + "grad_norm": 0.6042958498001099, + "learning_rate": 5.014563106796116e-06, + "loss": 0.0191, + "step": 7199 + }, + { + "epoch": 4.9878766886040875, + "grad_norm": 0.553793728351593, + "learning_rate": 5.013869625520111e-06, + "loss": 0.0203, + "step": 7200 + }, + { + "epoch": 4.988569449255282, + "grad_norm": 0.5331743955612183, + "learning_rate": 5.013176144244106e-06, + "loss": 0.0149, + "step": 7201 + }, + { + "epoch": 4.989262209906477, + "grad_norm": 0.6598172783851624, + "learning_rate": 5.0124826629681e-06, + "loss": 0.0151, + "step": 7202 + }, + { + "epoch": 4.9899549705576725, + "grad_norm": 0.5911411046981812, + "learning_rate": 5.011789181692095e-06, + "loss": 0.0174, + "step": 7203 + }, + { + "epoch": 4.990647731208867, + "grad_norm": 0.4001295268535614, + "learning_rate": 5.011095700416089e-06, + "loss": 0.0133, + "step": 7204 + }, + { + "epoch": 4.991340491860062, + "grad_norm": 0.4572732448577881, + "learning_rate": 5.010402219140084e-06, + "loss": 0.0116, + "step": 7205 + }, + { + "epoch": 4.992033252511257, + "grad_norm": 0.635037899017334, + "learning_rate": 5.009708737864078e-06, + "loss": 0.0109, + "step": 7206 + }, + { + "epoch": 4.992726013162453, + "grad_norm": 0.6858110427856445, + "learning_rate": 5.009015256588072e-06, + "loss": 0.0138, + "step": 7207 + }, + { + "epoch": 4.993418773813647, + "grad_norm": 0.48281699419021606, + "learning_rate": 5.008321775312067e-06, + "loss": 0.0145, + "step": 7208 + }, + { + "epoch": 4.994111534464842, + "grad_norm": 0.5513356924057007, + "learning_rate": 5.007628294036061e-06, + "loss": 0.0131, + "step": 7209 + }, + { + "epoch": 4.994804295116038, + "grad_norm": 0.6060163378715515, + "learning_rate": 5.006934812760056e-06, + "loss": 0.0181, + "step": 7210 + }, + { + "epoch": 4.995497055767233, + "grad_norm": 0.4762422442436218, + "learning_rate": 5.006241331484051e-06, + "loss": 0.0167, + "step": 7211 + }, + { + "epoch": 4.996189816418427, + "grad_norm": 0.44957783818244934, + "learning_rate": 5.005547850208044e-06, + "loss": 0.0188, + "step": 7212 + }, + { + "epoch": 4.996882577069623, + "grad_norm": 0.48919323086738586, + "learning_rate": 5.004854368932039e-06, + "loss": 0.0158, + "step": 7213 + }, + { + "epoch": 4.997575337720818, + "grad_norm": 0.4546241760253906, + "learning_rate": 5.004160887656033e-06, + "loss": 0.0161, + "step": 7214 + }, + { + "epoch": 4.998268098372012, + "grad_norm": 0.6716766357421875, + "learning_rate": 5.003467406380028e-06, + "loss": 0.0199, + "step": 7215 + }, + { + "epoch": 4.9989608590232075, + "grad_norm": 0.5103355050086975, + "learning_rate": 5.002773925104023e-06, + "loss": 0.0185, + "step": 7216 + }, + { + "epoch": 4.999653619674403, + "grad_norm": 0.4889625906944275, + "learning_rate": 5.0020804438280165e-06, + "loss": 0.0197, + "step": 7217 + }, + { + "epoch": 4.999653619674403, + "eval_loss": 0.2697892189025879, + "eval_runtime": 7671.3155, + "eval_samples_per_second": 1.043, + "eval_steps_per_second": 0.033, + "eval_wer": 12.525478411777724, + "step": 7217 + }, + { + "epoch": 5.000346380325597, + "grad_norm": 0.45957180857658386, + "learning_rate": 5.001386962552011e-06, + "loss": 0.0137, + "step": 7218 + }, + { + "epoch": 5.0010391409767925, + "grad_norm": 0.3669784963130951, + "learning_rate": 5.0006934812760055e-06, + "loss": 0.0076, + "step": 7219 + }, + { + "epoch": 5.001731901627988, + "grad_norm": 0.36706864833831787, + "learning_rate": 5e-06, + "loss": 0.0104, + "step": 7220 + }, + { + "epoch": 5.002424662279182, + "grad_norm": 0.43076092004776, + "learning_rate": 4.9993065187239945e-06, + "loss": 0.01, + "step": 7221 + }, + { + "epoch": 5.003117422930377, + "grad_norm": 0.32632872462272644, + "learning_rate": 4.9986130374479894e-06, + "loss": 0.0072, + "step": 7222 + }, + { + "epoch": 5.003810183581573, + "grad_norm": 0.3419957458972931, + "learning_rate": 4.997919556171984e-06, + "loss": 0.0128, + "step": 7223 + }, + { + "epoch": 5.004502944232768, + "grad_norm": 0.29304713010787964, + "learning_rate": 4.9972260748959784e-06, + "loss": 0.0087, + "step": 7224 + }, + { + "epoch": 5.005195704883962, + "grad_norm": 0.34947752952575684, + "learning_rate": 4.9965325936199725e-06, + "loss": 0.0088, + "step": 7225 + }, + { + "epoch": 5.005888465535158, + "grad_norm": 0.29535210132598877, + "learning_rate": 4.995839112343967e-06, + "loss": 0.0076, + "step": 7226 + }, + { + "epoch": 5.006581226186353, + "grad_norm": 0.4918957054615021, + "learning_rate": 4.9951456310679616e-06, + "loss": 0.0122, + "step": 7227 + }, + { + "epoch": 5.007273986837547, + "grad_norm": 0.277309775352478, + "learning_rate": 4.9944521497919565e-06, + "loss": 0.0065, + "step": 7228 + }, + { + "epoch": 5.007966747488743, + "grad_norm": 0.4154784083366394, + "learning_rate": 4.9937586685159506e-06, + "loss": 0.0099, + "step": 7229 + }, + { + "epoch": 5.008659508139938, + "grad_norm": 0.25265270471572876, + "learning_rate": 4.993065187239945e-06, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 5.009352268791132, + "grad_norm": 0.2866215407848358, + "learning_rate": 4.99237170596394e-06, + "loss": 0.0094, + "step": 7231 + }, + { + "epoch": 5.0100450294423275, + "grad_norm": 0.30181148648262024, + "learning_rate": 4.991678224687934e-06, + "loss": 0.0072, + "step": 7232 + }, + { + "epoch": 5.010737790093523, + "grad_norm": 0.3594330847263336, + "learning_rate": 4.990984743411929e-06, + "loss": 0.0087, + "step": 7233 + }, + { + "epoch": 5.011430550744718, + "grad_norm": 0.26650872826576233, + "learning_rate": 4.990291262135923e-06, + "loss": 0.0053, + "step": 7234 + }, + { + "epoch": 5.0121233113959125, + "grad_norm": 0.5954672694206238, + "learning_rate": 4.989597780859917e-06, + "loss": 0.0079, + "step": 7235 + }, + { + "epoch": 5.012816072047108, + "grad_norm": 0.27336519956588745, + "learning_rate": 4.988904299583912e-06, + "loss": 0.006, + "step": 7236 + }, + { + "epoch": 5.013508832698303, + "grad_norm": 0.3480377495288849, + "learning_rate": 4.988210818307906e-06, + "loss": 0.0085, + "step": 7237 + }, + { + "epoch": 5.014201593349497, + "grad_norm": 0.38182446360588074, + "learning_rate": 4.987517337031901e-06, + "loss": 0.0091, + "step": 7238 + }, + { + "epoch": 5.014894354000693, + "grad_norm": 0.36752888560295105, + "learning_rate": 4.986823855755895e-06, + "loss": 0.007, + "step": 7239 + }, + { + "epoch": 5.015587114651888, + "grad_norm": 0.30378803610801697, + "learning_rate": 4.98613037447989e-06, + "loss": 0.0075, + "step": 7240 + }, + { + "epoch": 5.016279875303082, + "grad_norm": 0.31696900725364685, + "learning_rate": 4.985436893203884e-06, + "loss": 0.0081, + "step": 7241 + }, + { + "epoch": 5.016972635954278, + "grad_norm": 0.23790748417377472, + "learning_rate": 4.984743411927878e-06, + "loss": 0.0062, + "step": 7242 + }, + { + "epoch": 5.017665396605473, + "grad_norm": 0.3500445783138275, + "learning_rate": 4.984049930651873e-06, + "loss": 0.0069, + "step": 7243 + }, + { + "epoch": 5.018358157256668, + "grad_norm": 0.31131619215011597, + "learning_rate": 4.983356449375868e-06, + "loss": 0.0064, + "step": 7244 + }, + { + "epoch": 5.019050917907863, + "grad_norm": 0.31733691692352295, + "learning_rate": 4.982662968099862e-06, + "loss": 0.0074, + "step": 7245 + }, + { + "epoch": 5.019743678559058, + "grad_norm": 0.3574613928794861, + "learning_rate": 4.981969486823856e-06, + "loss": 0.0102, + "step": 7246 + }, + { + "epoch": 5.020436439210253, + "grad_norm": 0.2864325940608978, + "learning_rate": 4.98127600554785e-06, + "loss": 0.0069, + "step": 7247 + }, + { + "epoch": 5.021129199861448, + "grad_norm": 0.24767273664474487, + "learning_rate": 4.980582524271845e-06, + "loss": 0.0059, + "step": 7248 + }, + { + "epoch": 5.021821960512643, + "grad_norm": 0.5374817848205566, + "learning_rate": 4.97988904299584e-06, + "loss": 0.0096, + "step": 7249 + }, + { + "epoch": 5.022514721163838, + "grad_norm": 0.5407879948616028, + "learning_rate": 4.979195561719834e-06, + "loss": 0.0123, + "step": 7250 + }, + { + "epoch": 5.0232074818150325, + "grad_norm": 0.30810102820396423, + "learning_rate": 4.978502080443828e-06, + "loss": 0.0075, + "step": 7251 + }, + { + "epoch": 5.023900242466228, + "grad_norm": 0.26572948694229126, + "learning_rate": 4.977808599167823e-06, + "loss": 0.0069, + "step": 7252 + }, + { + "epoch": 5.024593003117423, + "grad_norm": 0.3364541232585907, + "learning_rate": 4.977115117891818e-06, + "loss": 0.0067, + "step": 7253 + }, + { + "epoch": 5.025285763768618, + "grad_norm": 0.3556959629058838, + "learning_rate": 4.976421636615812e-06, + "loss": 0.0058, + "step": 7254 + }, + { + "epoch": 5.025978524419813, + "grad_norm": 0.25573965907096863, + "learning_rate": 4.975728155339806e-06, + "loss": 0.0057, + "step": 7255 + }, + { + "epoch": 5.026671285071008, + "grad_norm": 0.315265417098999, + "learning_rate": 4.9750346740638e-06, + "loss": 0.008, + "step": 7256 + }, + { + "epoch": 5.027364045722203, + "grad_norm": 0.5426953434944153, + "learning_rate": 4.974341192787795e-06, + "loss": 0.009, + "step": 7257 + }, + { + "epoch": 5.028056806373398, + "grad_norm": 0.312299907207489, + "learning_rate": 4.97364771151179e-06, + "loss": 0.0067, + "step": 7258 + }, + { + "epoch": 5.028749567024593, + "grad_norm": 0.24006272852420807, + "learning_rate": 4.972954230235784e-06, + "loss": 0.0062, + "step": 7259 + }, + { + "epoch": 5.029442327675788, + "grad_norm": 0.3840598165988922, + "learning_rate": 4.972260748959778e-06, + "loss": 0.0084, + "step": 7260 + }, + { + "epoch": 5.030135088326983, + "grad_norm": 0.23196850717067719, + "learning_rate": 4.971567267683773e-06, + "loss": 0.0065, + "step": 7261 + }, + { + "epoch": 5.030827848978178, + "grad_norm": 0.27836501598358154, + "learning_rate": 4.970873786407767e-06, + "loss": 0.0065, + "step": 7262 + }, + { + "epoch": 5.031520609629373, + "grad_norm": 0.3527560532093048, + "learning_rate": 4.970180305131762e-06, + "loss": 0.0079, + "step": 7263 + }, + { + "epoch": 5.0322133702805685, + "grad_norm": 0.41922900080680847, + "learning_rate": 4.969486823855756e-06, + "loss": 0.0105, + "step": 7264 + }, + { + "epoch": 5.032906130931763, + "grad_norm": 0.27435484528541565, + "learning_rate": 4.96879334257975e-06, + "loss": 0.0064, + "step": 7265 + }, + { + "epoch": 5.033598891582958, + "grad_norm": 0.32516488432884216, + "learning_rate": 4.968099861303745e-06, + "loss": 0.0071, + "step": 7266 + }, + { + "epoch": 5.034291652234153, + "grad_norm": 0.37962162494659424, + "learning_rate": 4.967406380027739e-06, + "loss": 0.0117, + "step": 7267 + }, + { + "epoch": 5.034984412885348, + "grad_norm": 0.3413487672805786, + "learning_rate": 4.966712898751734e-06, + "loss": 0.0073, + "step": 7268 + }, + { + "epoch": 5.035677173536543, + "grad_norm": 0.26364386081695557, + "learning_rate": 4.9660194174757284e-06, + "loss": 0.0074, + "step": 7269 + }, + { + "epoch": 5.036369934187738, + "grad_norm": 0.3675404191017151, + "learning_rate": 4.965325936199723e-06, + "loss": 0.0088, + "step": 7270 + }, + { + "epoch": 5.037062694838933, + "grad_norm": 0.36805999279022217, + "learning_rate": 4.9646324549237174e-06, + "loss": 0.0078, + "step": 7271 + }, + { + "epoch": 5.037755455490128, + "grad_norm": 0.33098238706588745, + "learning_rate": 4.9639389736477115e-06, + "loss": 0.0093, + "step": 7272 + }, + { + "epoch": 5.038448216141323, + "grad_norm": 0.2958826720714569, + "learning_rate": 4.9632454923717065e-06, + "loss": 0.0083, + "step": 7273 + }, + { + "epoch": 5.039140976792519, + "grad_norm": 0.3946586847305298, + "learning_rate": 4.9625520110957006e-06, + "loss": 0.0106, + "step": 7274 + }, + { + "epoch": 5.039833737443713, + "grad_norm": 0.3046253025531769, + "learning_rate": 4.9618585298196955e-06, + "loss": 0.0079, + "step": 7275 + }, + { + "epoch": 5.040526498094908, + "grad_norm": 0.4130062162876129, + "learning_rate": 4.9611650485436896e-06, + "loss": 0.011, + "step": 7276 + }, + { + "epoch": 5.0412192587461035, + "grad_norm": 0.24970969557762146, + "learning_rate": 4.960471567267684e-06, + "loss": 0.0057, + "step": 7277 + }, + { + "epoch": 5.041912019397298, + "grad_norm": 0.29134806990623474, + "learning_rate": 4.959778085991679e-06, + "loss": 0.0079, + "step": 7278 + }, + { + "epoch": 5.042604780048493, + "grad_norm": 0.2696983814239502, + "learning_rate": 4.9590846047156735e-06, + "loss": 0.0074, + "step": 7279 + }, + { + "epoch": 5.0432975406996885, + "grad_norm": 0.22379395365715027, + "learning_rate": 4.958391123439668e-06, + "loss": 0.0059, + "step": 7280 + }, + { + "epoch": 5.043990301350883, + "grad_norm": 0.5553340315818787, + "learning_rate": 4.957697642163662e-06, + "loss": 0.0077, + "step": 7281 + }, + { + "epoch": 5.044683062002078, + "grad_norm": 0.46020668745040894, + "learning_rate": 4.957004160887657e-06, + "loss": 0.0088, + "step": 7282 + }, + { + "epoch": 5.0453758226532734, + "grad_norm": 0.2178874909877777, + "learning_rate": 4.956310679611651e-06, + "loss": 0.0041, + "step": 7283 + }, + { + "epoch": 5.046068583304469, + "grad_norm": 0.41293787956237793, + "learning_rate": 4.955617198335646e-06, + "loss": 0.0074, + "step": 7284 + }, + { + "epoch": 5.046761343955663, + "grad_norm": 0.33358272910118103, + "learning_rate": 4.95492371705964e-06, + "loss": 0.0075, + "step": 7285 + }, + { + "epoch": 5.047454104606858, + "grad_norm": 0.33142977952957153, + "learning_rate": 4.954230235783634e-06, + "loss": 0.0056, + "step": 7286 + }, + { + "epoch": 5.048146865258054, + "grad_norm": 0.3636792004108429, + "learning_rate": 4.953536754507629e-06, + "loss": 0.0071, + "step": 7287 + }, + { + "epoch": 5.048839625909248, + "grad_norm": 0.24365943670272827, + "learning_rate": 4.952843273231624e-06, + "loss": 0.0046, + "step": 7288 + }, + { + "epoch": 5.049532386560443, + "grad_norm": 0.2750144600868225, + "learning_rate": 4.952149791955618e-06, + "loss": 0.0071, + "step": 7289 + }, + { + "epoch": 5.050225147211639, + "grad_norm": 0.2605492174625397, + "learning_rate": 4.951456310679612e-06, + "loss": 0.005, + "step": 7290 + }, + { + "epoch": 5.050917907862833, + "grad_norm": 0.2993355989456177, + "learning_rate": 4.950762829403607e-06, + "loss": 0.0067, + "step": 7291 + }, + { + "epoch": 5.051610668514028, + "grad_norm": 0.4299032986164093, + "learning_rate": 4.950069348127601e-06, + "loss": 0.007, + "step": 7292 + }, + { + "epoch": 5.052303429165224, + "grad_norm": 0.3229838013648987, + "learning_rate": 4.949375866851596e-06, + "loss": 0.0062, + "step": 7293 + }, + { + "epoch": 5.052996189816419, + "grad_norm": 0.3593423366546631, + "learning_rate": 4.94868238557559e-06, + "loss": 0.006, + "step": 7294 + }, + { + "epoch": 5.053688950467613, + "grad_norm": 0.40031617879867554, + "learning_rate": 4.947988904299584e-06, + "loss": 0.0054, + "step": 7295 + }, + { + "epoch": 5.0543817111188085, + "grad_norm": 0.3411800265312195, + "learning_rate": 4.947295423023579e-06, + "loss": 0.0082, + "step": 7296 + }, + { + "epoch": 5.055074471770004, + "grad_norm": 0.37931033968925476, + "learning_rate": 4.946601941747573e-06, + "loss": 0.0069, + "step": 7297 + }, + { + "epoch": 5.055767232421198, + "grad_norm": 0.290792316198349, + "learning_rate": 4.945908460471568e-06, + "loss": 0.0085, + "step": 7298 + }, + { + "epoch": 5.0564599930723935, + "grad_norm": 0.23591336607933044, + "learning_rate": 4.945214979195562e-06, + "loss": 0.0064, + "step": 7299 + }, + { + "epoch": 5.057152753723589, + "grad_norm": 0.25219523906707764, + "learning_rate": 4.944521497919557e-06, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 5.057845514374783, + "grad_norm": 0.18537768721580505, + "learning_rate": 4.943828016643551e-06, + "loss": 0.0054, + "step": 7301 + }, + { + "epoch": 5.058538275025978, + "grad_norm": 0.3577527105808258, + "learning_rate": 4.943134535367545e-06, + "loss": 0.0099, + "step": 7302 + }, + { + "epoch": 5.059231035677174, + "grad_norm": 0.3047337234020233, + "learning_rate": 4.94244105409154e-06, + "loss": 0.0072, + "step": 7303 + }, + { + "epoch": 5.059923796328369, + "grad_norm": 0.13182613253593445, + "learning_rate": 4.941747572815534e-06, + "loss": 0.0033, + "step": 7304 + }, + { + "epoch": 5.060616556979563, + "grad_norm": 0.30081212520599365, + "learning_rate": 4.941054091539529e-06, + "loss": 0.0071, + "step": 7305 + }, + { + "epoch": 5.061309317630759, + "grad_norm": 0.23153644800186157, + "learning_rate": 4.940360610263523e-06, + "loss": 0.0059, + "step": 7306 + }, + { + "epoch": 5.062002078281954, + "grad_norm": 0.2580624222755432, + "learning_rate": 4.939667128987517e-06, + "loss": 0.0071, + "step": 7307 + }, + { + "epoch": 5.062694838933148, + "grad_norm": 0.22963561117649078, + "learning_rate": 4.938973647711512e-06, + "loss": 0.0051, + "step": 7308 + }, + { + "epoch": 5.063387599584344, + "grad_norm": 0.23751002550125122, + "learning_rate": 4.938280166435507e-06, + "loss": 0.0058, + "step": 7309 + }, + { + "epoch": 5.064080360235539, + "grad_norm": 0.2023371160030365, + "learning_rate": 4.937586685159501e-06, + "loss": 0.0055, + "step": 7310 + }, + { + "epoch": 5.064773120886733, + "grad_norm": 0.2542552053928375, + "learning_rate": 4.936893203883495e-06, + "loss": 0.0066, + "step": 7311 + }, + { + "epoch": 5.0654658815379285, + "grad_norm": 0.4164277911186218, + "learning_rate": 4.936199722607489e-06, + "loss": 0.0086, + "step": 7312 + }, + { + "epoch": 5.066158642189124, + "grad_norm": 0.5117143392562866, + "learning_rate": 4.935506241331484e-06, + "loss": 0.0076, + "step": 7313 + }, + { + "epoch": 5.066851402840319, + "grad_norm": 0.23055152595043182, + "learning_rate": 4.934812760055479e-06, + "loss": 0.0056, + "step": 7314 + }, + { + "epoch": 5.0675441634915135, + "grad_norm": 0.351272851228714, + "learning_rate": 4.934119278779473e-06, + "loss": 0.0081, + "step": 7315 + }, + { + "epoch": 5.068236924142709, + "grad_norm": 0.38055822253227234, + "learning_rate": 4.9334257975034674e-06, + "loss": 0.0077, + "step": 7316 + }, + { + "epoch": 5.068929684793904, + "grad_norm": 0.38499513268470764, + "learning_rate": 4.932732316227462e-06, + "loss": 0.0076, + "step": 7317 + }, + { + "epoch": 5.069622445445098, + "grad_norm": 0.23655486106872559, + "learning_rate": 4.932038834951457e-06, + "loss": 0.0049, + "step": 7318 + }, + { + "epoch": 5.070315206096294, + "grad_norm": 0.25031062960624695, + "learning_rate": 4.931345353675451e-06, + "loss": 0.0085, + "step": 7319 + }, + { + "epoch": 5.071007966747489, + "grad_norm": 0.9640106558799744, + "learning_rate": 4.9306518723994455e-06, + "loss": 0.0098, + "step": 7320 + }, + { + "epoch": 5.071700727398683, + "grad_norm": 0.44875720143318176, + "learning_rate": 4.92995839112344e-06, + "loss": 0.0054, + "step": 7321 + }, + { + "epoch": 5.072393488049879, + "grad_norm": 0.31168317794799805, + "learning_rate": 4.9292649098474345e-06, + "loss": 0.0065, + "step": 7322 + }, + { + "epoch": 5.073086248701074, + "grad_norm": 0.4042441248893738, + "learning_rate": 4.928571428571429e-06, + "loss": 0.0057, + "step": 7323 + }, + { + "epoch": 5.073779009352269, + "grad_norm": 0.28621089458465576, + "learning_rate": 4.9278779472954235e-06, + "loss": 0.0077, + "step": 7324 + }, + { + "epoch": 5.074471770003464, + "grad_norm": 0.33940061926841736, + "learning_rate": 4.927184466019418e-06, + "loss": 0.0077, + "step": 7325 + }, + { + "epoch": 5.075164530654659, + "grad_norm": 0.41262534260749817, + "learning_rate": 4.9264909847434125e-06, + "loss": 0.0102, + "step": 7326 + }, + { + "epoch": 5.075857291305854, + "grad_norm": 0.23722289502620697, + "learning_rate": 4.925797503467407e-06, + "loss": 0.0053, + "step": 7327 + }, + { + "epoch": 5.076550051957049, + "grad_norm": 0.3129745125770569, + "learning_rate": 4.9251040221914015e-06, + "loss": 0.0067, + "step": 7328 + }, + { + "epoch": 5.077242812608244, + "grad_norm": 0.27803748846054077, + "learning_rate": 4.924410540915396e-06, + "loss": 0.0072, + "step": 7329 + }, + { + "epoch": 5.077935573259439, + "grad_norm": 0.29906079173088074, + "learning_rate": 4.9237170596393906e-06, + "loss": 0.0064, + "step": 7330 + }, + { + "epoch": 5.0786283339106335, + "grad_norm": 0.37967410683631897, + "learning_rate": 4.923023578363385e-06, + "loss": 0.0087, + "step": 7331 + }, + { + "epoch": 5.079321094561829, + "grad_norm": 0.1984589397907257, + "learning_rate": 4.922330097087379e-06, + "loss": 0.0047, + "step": 7332 + }, + { + "epoch": 5.080013855213024, + "grad_norm": 0.3248573839664459, + "learning_rate": 4.921636615811374e-06, + "loss": 0.0072, + "step": 7333 + }, + { + "epoch": 5.080706615864219, + "grad_norm": 0.22171147167682648, + "learning_rate": 4.920943134535368e-06, + "loss": 0.0051, + "step": 7334 + }, + { + "epoch": 5.081399376515414, + "grad_norm": 0.2765241265296936, + "learning_rate": 4.920249653259363e-06, + "loss": 0.006, + "step": 7335 + }, + { + "epoch": 5.082092137166609, + "grad_norm": 0.40976372361183167, + "learning_rate": 4.919556171983357e-06, + "loss": 0.0089, + "step": 7336 + }, + { + "epoch": 5.082784897817804, + "grad_norm": 0.3001773953437805, + "learning_rate": 4.918862690707351e-06, + "loss": 0.0058, + "step": 7337 + }, + { + "epoch": 5.083477658468999, + "grad_norm": 0.29388663172721863, + "learning_rate": 4.918169209431346e-06, + "loss": 0.0087, + "step": 7338 + }, + { + "epoch": 5.084170419120194, + "grad_norm": 0.3032190799713135, + "learning_rate": 4.917475728155341e-06, + "loss": 0.0054, + "step": 7339 + }, + { + "epoch": 5.084863179771389, + "grad_norm": 0.32250088453292847, + "learning_rate": 4.916782246879335e-06, + "loss": 0.0073, + "step": 7340 + }, + { + "epoch": 5.085555940422584, + "grad_norm": 0.3910949230194092, + "learning_rate": 4.916088765603329e-06, + "loss": 0.0084, + "step": 7341 + }, + { + "epoch": 5.086248701073779, + "grad_norm": 0.3047500550746918, + "learning_rate": 4.915395284327323e-06, + "loss": 0.01, + "step": 7342 + }, + { + "epoch": 5.086941461724974, + "grad_norm": 0.31394195556640625, + "learning_rate": 4.914701803051318e-06, + "loss": 0.006, + "step": 7343 + }, + { + "epoch": 5.0876342223761695, + "grad_norm": 0.3759842813014984, + "learning_rate": 4.914008321775313e-06, + "loss": 0.0063, + "step": 7344 + }, + { + "epoch": 5.088326983027364, + "grad_norm": 0.3286186158657074, + "learning_rate": 4.913314840499307e-06, + "loss": 0.0058, + "step": 7345 + }, + { + "epoch": 5.089019743678559, + "grad_norm": 0.2560966908931732, + "learning_rate": 4.912621359223301e-06, + "loss": 0.008, + "step": 7346 + }, + { + "epoch": 5.089712504329754, + "grad_norm": 0.2549499571323395, + "learning_rate": 4.911927877947296e-06, + "loss": 0.0066, + "step": 7347 + }, + { + "epoch": 5.090405264980949, + "grad_norm": 0.2322193682193756, + "learning_rate": 4.911234396671291e-06, + "loss": 0.0055, + "step": 7348 + }, + { + "epoch": 5.091098025632144, + "grad_norm": 0.3046237826347351, + "learning_rate": 4.910540915395285e-06, + "loss": 0.0069, + "step": 7349 + }, + { + "epoch": 5.091790786283339, + "grad_norm": 0.45353612303733826, + "learning_rate": 4.909847434119279e-06, + "loss": 0.0058, + "step": 7350 + }, + { + "epoch": 5.092483546934534, + "grad_norm": 0.2574712634086609, + "learning_rate": 4.909153952843273e-06, + "loss": 0.0065, + "step": 7351 + }, + { + "epoch": 5.093176307585729, + "grad_norm": 0.4106120765209198, + "learning_rate": 4.908460471567268e-06, + "loss": 0.0073, + "step": 7352 + }, + { + "epoch": 5.093869068236924, + "grad_norm": 0.416460782289505, + "learning_rate": 4.907766990291263e-06, + "loss": 0.0057, + "step": 7353 + }, + { + "epoch": 5.09456182888812, + "grad_norm": 0.30797287821769714, + "learning_rate": 4.907073509015257e-06, + "loss": 0.0071, + "step": 7354 + }, + { + "epoch": 5.095254589539314, + "grad_norm": 0.4874429404735565, + "learning_rate": 4.906380027739251e-06, + "loss": 0.0113, + "step": 7355 + }, + { + "epoch": 5.095947350190509, + "grad_norm": 0.2572321593761444, + "learning_rate": 4.905686546463246e-06, + "loss": 0.005, + "step": 7356 + }, + { + "epoch": 5.0966401108417045, + "grad_norm": 0.24332685768604279, + "learning_rate": 4.90499306518724e-06, + "loss": 0.0061, + "step": 7357 + }, + { + "epoch": 5.097332871492899, + "grad_norm": 0.3736319839954376, + "learning_rate": 4.904299583911235e-06, + "loss": 0.0064, + "step": 7358 + }, + { + "epoch": 5.098025632144094, + "grad_norm": 0.4131574332714081, + "learning_rate": 4.903606102635229e-06, + "loss": 0.0063, + "step": 7359 + }, + { + "epoch": 5.0987183927952895, + "grad_norm": 0.412183940410614, + "learning_rate": 4.902912621359223e-06, + "loss": 0.0122, + "step": 7360 + }, + { + "epoch": 5.099411153446484, + "grad_norm": 0.25543296337127686, + "learning_rate": 4.902219140083218e-06, + "loss": 0.0066, + "step": 7361 + }, + { + "epoch": 5.100103914097679, + "grad_norm": 0.25338611006736755, + "learning_rate": 4.901525658807212e-06, + "loss": 0.005, + "step": 7362 + }, + { + "epoch": 5.100796674748874, + "grad_norm": 0.29101377725601196, + "learning_rate": 4.900832177531207e-06, + "loss": 0.0065, + "step": 7363 + }, + { + "epoch": 5.101489435400069, + "grad_norm": 0.25856027007102966, + "learning_rate": 4.900138696255201e-06, + "loss": 0.0047, + "step": 7364 + }, + { + "epoch": 5.102182196051264, + "grad_norm": 0.2946327030658722, + "learning_rate": 4.899445214979196e-06, + "loss": 0.0058, + "step": 7365 + }, + { + "epoch": 5.102874956702459, + "grad_norm": 0.25694847106933594, + "learning_rate": 4.89875173370319e-06, + "loss": 0.0046, + "step": 7366 + }, + { + "epoch": 5.103567717353655, + "grad_norm": 0.308318167924881, + "learning_rate": 4.8980582524271845e-06, + "loss": 0.0065, + "step": 7367 + }, + { + "epoch": 5.104260478004849, + "grad_norm": 0.33156275749206543, + "learning_rate": 4.897364771151179e-06, + "loss": 0.007, + "step": 7368 + }, + { + "epoch": 5.104953238656044, + "grad_norm": 0.19715631008148193, + "learning_rate": 4.896671289875174e-06, + "loss": 0.0041, + "step": 7369 + }, + { + "epoch": 5.10564599930724, + "grad_norm": 0.328476220369339, + "learning_rate": 4.895977808599168e-06, + "loss": 0.0065, + "step": 7370 + }, + { + "epoch": 5.106338759958434, + "grad_norm": 0.40526121854782104, + "learning_rate": 4.8952843273231625e-06, + "loss": 0.0087, + "step": 7371 + }, + { + "epoch": 5.107031520609629, + "grad_norm": 0.4126852750778198, + "learning_rate": 4.894590846047157e-06, + "loss": 0.0056, + "step": 7372 + }, + { + "epoch": 5.107724281260825, + "grad_norm": 0.2388870120048523, + "learning_rate": 4.8938973647711515e-06, + "loss": 0.0046, + "step": 7373 + }, + { + "epoch": 5.10841704191202, + "grad_norm": 0.31237688660621643, + "learning_rate": 4.8932038834951465e-06, + "loss": 0.0081, + "step": 7374 + }, + { + "epoch": 5.109109802563214, + "grad_norm": 0.2459273785352707, + "learning_rate": 4.8925104022191405e-06, + "loss": 0.0058, + "step": 7375 + }, + { + "epoch": 5.1098025632144095, + "grad_norm": 0.31756120920181274, + "learning_rate": 4.891816920943135e-06, + "loss": 0.0071, + "step": 7376 + }, + { + "epoch": 5.110495323865605, + "grad_norm": 0.26058638095855713, + "learning_rate": 4.8911234396671296e-06, + "loss": 0.0066, + "step": 7377 + }, + { + "epoch": 5.111188084516799, + "grad_norm": 0.4566001892089844, + "learning_rate": 4.8904299583911245e-06, + "loss": 0.0057, + "step": 7378 + }, + { + "epoch": 5.1118808451679945, + "grad_norm": 0.25763601064682007, + "learning_rate": 4.889736477115119e-06, + "loss": 0.0067, + "step": 7379 + }, + { + "epoch": 5.11257360581919, + "grad_norm": 0.27179789543151855, + "learning_rate": 4.889042995839113e-06, + "loss": 0.0068, + "step": 7380 + }, + { + "epoch": 5.113266366470384, + "grad_norm": 0.3337440490722656, + "learning_rate": 4.888349514563107e-06, + "loss": 0.0068, + "step": 7381 + }, + { + "epoch": 5.113959127121579, + "grad_norm": 0.30211561918258667, + "learning_rate": 4.887656033287102e-06, + "loss": 0.0049, + "step": 7382 + }, + { + "epoch": 5.114651887772775, + "grad_norm": 0.30484941601753235, + "learning_rate": 4.886962552011097e-06, + "loss": 0.0085, + "step": 7383 + }, + { + "epoch": 5.115344648423969, + "grad_norm": 0.26404300332069397, + "learning_rate": 4.886269070735091e-06, + "loss": 0.0072, + "step": 7384 + }, + { + "epoch": 5.116037409075164, + "grad_norm": 0.26851800084114075, + "learning_rate": 4.885575589459085e-06, + "loss": 0.0066, + "step": 7385 + }, + { + "epoch": 5.11673016972636, + "grad_norm": 0.1804685890674591, + "learning_rate": 4.88488210818308e-06, + "loss": 0.0047, + "step": 7386 + }, + { + "epoch": 5.117422930377555, + "grad_norm": 0.40278634428977966, + "learning_rate": 4.884188626907074e-06, + "loss": 0.0074, + "step": 7387 + }, + { + "epoch": 5.118115691028749, + "grad_norm": 0.316924124956131, + "learning_rate": 4.883495145631069e-06, + "loss": 0.0098, + "step": 7388 + }, + { + "epoch": 5.118808451679945, + "grad_norm": 0.3233623504638672, + "learning_rate": 4.882801664355063e-06, + "loss": 0.0059, + "step": 7389 + }, + { + "epoch": 5.11950121233114, + "grad_norm": 0.8516749143600464, + "learning_rate": 4.882108183079057e-06, + "loss": 0.0052, + "step": 7390 + }, + { + "epoch": 5.120193972982334, + "grad_norm": 0.3691239655017853, + "learning_rate": 4.881414701803052e-06, + "loss": 0.0086, + "step": 7391 + }, + { + "epoch": 5.1208867336335295, + "grad_norm": 0.25633835792541504, + "learning_rate": 4.880721220527046e-06, + "loss": 0.005, + "step": 7392 + }, + { + "epoch": 5.121579494284725, + "grad_norm": 0.2983531057834625, + "learning_rate": 4.880027739251041e-06, + "loss": 0.0058, + "step": 7393 + }, + { + "epoch": 5.12227225493592, + "grad_norm": 0.3667583465576172, + "learning_rate": 4.879334257975035e-06, + "loss": 0.006, + "step": 7394 + }, + { + "epoch": 5.1229650155871145, + "grad_norm": 0.27222368121147156, + "learning_rate": 4.87864077669903e-06, + "loss": 0.009, + "step": 7395 + }, + { + "epoch": 5.12365777623831, + "grad_norm": 0.3674112856388092, + "learning_rate": 4.877947295423024e-06, + "loss": 0.0061, + "step": 7396 + }, + { + "epoch": 5.124350536889505, + "grad_norm": 0.4103373885154724, + "learning_rate": 4.877253814147018e-06, + "loss": 0.0075, + "step": 7397 + }, + { + "epoch": 5.125043297540699, + "grad_norm": 0.44219517707824707, + "learning_rate": 4.876560332871013e-06, + "loss": 0.0061, + "step": 7398 + }, + { + "epoch": 5.125736058191895, + "grad_norm": 0.42452719807624817, + "learning_rate": 4.875866851595007e-06, + "loss": 0.0078, + "step": 7399 + }, + { + "epoch": 5.12642881884309, + "grad_norm": 0.28701063990592957, + "learning_rate": 4.875173370319002e-06, + "loss": 0.0068, + "step": 7400 + }, + { + "epoch": 5.127121579494284, + "grad_norm": 0.28020837903022766, + "learning_rate": 4.874479889042996e-06, + "loss": 0.0059, + "step": 7401 + }, + { + "epoch": 5.12781434014548, + "grad_norm": 0.44853636622428894, + "learning_rate": 4.87378640776699e-06, + "loss": 0.0093, + "step": 7402 + }, + { + "epoch": 5.128507100796675, + "grad_norm": 0.34930720925331116, + "learning_rate": 4.873092926490985e-06, + "loss": 0.0066, + "step": 7403 + }, + { + "epoch": 5.129199861447869, + "grad_norm": 0.501854658126831, + "learning_rate": 4.87239944521498e-06, + "loss": 0.0056, + "step": 7404 + }, + { + "epoch": 5.129892622099065, + "grad_norm": 0.2529129087924957, + "learning_rate": 4.871705963938974e-06, + "loss": 0.0055, + "step": 7405 + }, + { + "epoch": 5.13058538275026, + "grad_norm": 0.7160843014717102, + "learning_rate": 4.871012482662968e-06, + "loss": 0.0091, + "step": 7406 + }, + { + "epoch": 5.131278143401455, + "grad_norm": 0.39088577032089233, + "learning_rate": 4.870319001386963e-06, + "loss": 0.0114, + "step": 7407 + }, + { + "epoch": 5.1319709040526496, + "grad_norm": 0.43904075026512146, + "learning_rate": 4.869625520110957e-06, + "loss": 0.0084, + "step": 7408 + }, + { + "epoch": 5.132663664703845, + "grad_norm": 0.2854726016521454, + "learning_rate": 4.868932038834952e-06, + "loss": 0.0082, + "step": 7409 + }, + { + "epoch": 5.13335642535504, + "grad_norm": 0.233002707362175, + "learning_rate": 4.868238557558946e-06, + "loss": 0.007, + "step": 7410 + }, + { + "epoch": 5.1340491860062345, + "grad_norm": 0.3564295172691345, + "learning_rate": 4.86754507628294e-06, + "loss": 0.0083, + "step": 7411 + }, + { + "epoch": 5.13474194665743, + "grad_norm": 0.32407668232917786, + "learning_rate": 4.866851595006935e-06, + "loss": 0.0071, + "step": 7412 + }, + { + "epoch": 5.135434707308625, + "grad_norm": 0.24932849407196045, + "learning_rate": 4.866158113730929e-06, + "loss": 0.0072, + "step": 7413 + }, + { + "epoch": 5.13612746795982, + "grad_norm": 0.35719776153564453, + "learning_rate": 4.865464632454924e-06, + "loss": 0.0078, + "step": 7414 + }, + { + "epoch": 5.136820228611015, + "grad_norm": 0.36703062057495117, + "learning_rate": 4.864771151178918e-06, + "loss": 0.0098, + "step": 7415 + }, + { + "epoch": 5.13751298926221, + "grad_norm": 0.3459904193878174, + "learning_rate": 4.864077669902913e-06, + "loss": 0.0075, + "step": 7416 + }, + { + "epoch": 5.138205749913405, + "grad_norm": 0.3044803738594055, + "learning_rate": 4.863384188626907e-06, + "loss": 0.0075, + "step": 7417 + }, + { + "epoch": 5.1388985105646, + "grad_norm": 0.3431564271450043, + "learning_rate": 4.8626907073509015e-06, + "loss": 0.008, + "step": 7418 + }, + { + "epoch": 5.139591271215795, + "grad_norm": 0.3503017723560333, + "learning_rate": 4.8619972260748964e-06, + "loss": 0.0103, + "step": 7419 + }, + { + "epoch": 5.14028403186699, + "grad_norm": 0.3378048241138458, + "learning_rate": 4.8613037447988905e-06, + "loss": 0.0053, + "step": 7420 + }, + { + "epoch": 5.140976792518185, + "grad_norm": 0.22293899953365326, + "learning_rate": 4.8606102635228855e-06, + "loss": 0.0051, + "step": 7421 + }, + { + "epoch": 5.14166955316938, + "grad_norm": 0.251714289188385, + "learning_rate": 4.8599167822468795e-06, + "loss": 0.0057, + "step": 7422 + }, + { + "epoch": 5.142362313820575, + "grad_norm": 0.2762012183666229, + "learning_rate": 4.859223300970874e-06, + "loss": 0.0078, + "step": 7423 + }, + { + "epoch": 5.14305507447177, + "grad_norm": 0.41653135418891907, + "learning_rate": 4.8585298196948686e-06, + "loss": 0.0088, + "step": 7424 + }, + { + "epoch": 5.143747835122965, + "grad_norm": 0.42686501145362854, + "learning_rate": 4.8578363384188635e-06, + "loss": 0.0107, + "step": 7425 + }, + { + "epoch": 5.14444059577416, + "grad_norm": 0.2930285334587097, + "learning_rate": 4.857142857142858e-06, + "loss": 0.0066, + "step": 7426 + }, + { + "epoch": 5.145133356425355, + "grad_norm": 0.45484989881515503, + "learning_rate": 4.856449375866852e-06, + "loss": 0.005, + "step": 7427 + }, + { + "epoch": 5.14582611707655, + "grad_norm": 0.412971556186676, + "learning_rate": 4.855755894590846e-06, + "loss": 0.0083, + "step": 7428 + }, + { + "epoch": 5.146518877727745, + "grad_norm": 0.3098788857460022, + "learning_rate": 4.855062413314841e-06, + "loss": 0.0064, + "step": 7429 + }, + { + "epoch": 5.14721163837894, + "grad_norm": 0.29858165979385376, + "learning_rate": 4.854368932038836e-06, + "loss": 0.0055, + "step": 7430 + }, + { + "epoch": 5.147904399030135, + "grad_norm": 0.30740079283714294, + "learning_rate": 4.85367545076283e-06, + "loss": 0.0083, + "step": 7431 + }, + { + "epoch": 5.14859715968133, + "grad_norm": 0.29849037528038025, + "learning_rate": 4.852981969486824e-06, + "loss": 0.0064, + "step": 7432 + }, + { + "epoch": 5.149289920332525, + "grad_norm": 0.6315860748291016, + "learning_rate": 4.852288488210819e-06, + "loss": 0.0069, + "step": 7433 + }, + { + "epoch": 5.14998268098372, + "grad_norm": 0.40717795491218567, + "learning_rate": 4.851595006934814e-06, + "loss": 0.006, + "step": 7434 + }, + { + "epoch": 5.150675441634915, + "grad_norm": 0.3300587236881256, + "learning_rate": 4.850901525658808e-06, + "loss": 0.0073, + "step": 7435 + }, + { + "epoch": 5.15136820228611, + "grad_norm": 0.26116156578063965, + "learning_rate": 4.850208044382802e-06, + "loss": 0.0056, + "step": 7436 + }, + { + "epoch": 5.1520609629373055, + "grad_norm": 0.27881118655204773, + "learning_rate": 4.849514563106797e-06, + "loss": 0.0056, + "step": 7437 + }, + { + "epoch": 5.1527537235885, + "grad_norm": 0.46491703391075134, + "learning_rate": 4.848821081830791e-06, + "loss": 0.0068, + "step": 7438 + }, + { + "epoch": 5.153446484239695, + "grad_norm": 0.3274855315685272, + "learning_rate": 4.848127600554786e-06, + "loss": 0.0102, + "step": 7439 + }, + { + "epoch": 5.1541392448908905, + "grad_norm": 0.36323603987693787, + "learning_rate": 4.84743411927878e-06, + "loss": 0.0069, + "step": 7440 + }, + { + "epoch": 5.154832005542085, + "grad_norm": 1.0078736543655396, + "learning_rate": 4.846740638002774e-06, + "loss": 0.0106, + "step": 7441 + }, + { + "epoch": 5.15552476619328, + "grad_norm": 0.3542138934135437, + "learning_rate": 4.846047156726769e-06, + "loss": 0.006, + "step": 7442 + }, + { + "epoch": 5.156217526844475, + "grad_norm": 0.3060283958911896, + "learning_rate": 4.845353675450763e-06, + "loss": 0.005, + "step": 7443 + }, + { + "epoch": 5.15691028749567, + "grad_norm": 0.39142200350761414, + "learning_rate": 4.844660194174758e-06, + "loss": 0.007, + "step": 7444 + }, + { + "epoch": 5.157603048146865, + "grad_norm": 0.4000868499279022, + "learning_rate": 4.843966712898752e-06, + "loss": 0.0062, + "step": 7445 + }, + { + "epoch": 5.15829580879806, + "grad_norm": 0.3588486313819885, + "learning_rate": 4.843273231622747e-06, + "loss": 0.0104, + "step": 7446 + }, + { + "epoch": 5.158988569449256, + "grad_norm": 0.26281294226646423, + "learning_rate": 4.842579750346741e-06, + "loss": 0.0049, + "step": 7447 + }, + { + "epoch": 5.15968133010045, + "grad_norm": 0.27854153513908386, + "learning_rate": 4.841886269070735e-06, + "loss": 0.0047, + "step": 7448 + }, + { + "epoch": 5.160374090751645, + "grad_norm": 0.34578070044517517, + "learning_rate": 4.84119278779473e-06, + "loss": 0.0106, + "step": 7449 + }, + { + "epoch": 5.161066851402841, + "grad_norm": 0.3037815988063812, + "learning_rate": 4.840499306518724e-06, + "loss": 0.0057, + "step": 7450 + }, + { + "epoch": 5.161759612054035, + "grad_norm": 0.30182409286499023, + "learning_rate": 4.839805825242719e-06, + "loss": 0.0087, + "step": 7451 + }, + { + "epoch": 5.16245237270523, + "grad_norm": 0.277145117521286, + "learning_rate": 4.839112343966713e-06, + "loss": 0.0053, + "step": 7452 + }, + { + "epoch": 5.1631451333564256, + "grad_norm": 0.2836417257785797, + "learning_rate": 4.838418862690707e-06, + "loss": 0.0059, + "step": 7453 + }, + { + "epoch": 5.16383789400762, + "grad_norm": 0.4726655185222626, + "learning_rate": 4.837725381414702e-06, + "loss": 0.0079, + "step": 7454 + }, + { + "epoch": 5.164530654658815, + "grad_norm": 0.33091259002685547, + "learning_rate": 4.837031900138697e-06, + "loss": 0.0061, + "step": 7455 + }, + { + "epoch": 5.1652234153100105, + "grad_norm": 0.3957972228527069, + "learning_rate": 4.836338418862691e-06, + "loss": 0.0085, + "step": 7456 + }, + { + "epoch": 5.165916175961206, + "grad_norm": 0.42114585638046265, + "learning_rate": 4.835644937586685e-06, + "loss": 0.0102, + "step": 7457 + }, + { + "epoch": 5.1666089366124, + "grad_norm": 0.33106687664985657, + "learning_rate": 4.834951456310679e-06, + "loss": 0.007, + "step": 7458 + }, + { + "epoch": 5.1673016972635955, + "grad_norm": 0.28504499793052673, + "learning_rate": 4.834257975034674e-06, + "loss": 0.0063, + "step": 7459 + }, + { + "epoch": 5.167994457914791, + "grad_norm": 0.3100307881832123, + "learning_rate": 4.833564493758669e-06, + "loss": 0.009, + "step": 7460 + }, + { + "epoch": 5.168687218565985, + "grad_norm": 0.5789116621017456, + "learning_rate": 4.832871012482663e-06, + "loss": 0.007, + "step": 7461 + }, + { + "epoch": 5.16937997921718, + "grad_norm": 0.25984102487564087, + "learning_rate": 4.832177531206657e-06, + "loss": 0.0045, + "step": 7462 + }, + { + "epoch": 5.170072739868376, + "grad_norm": 0.46192842721939087, + "learning_rate": 4.831484049930652e-06, + "loss": 0.0085, + "step": 7463 + }, + { + "epoch": 5.17076550051957, + "grad_norm": 0.48657897114753723, + "learning_rate": 4.830790568654647e-06, + "loss": 0.0084, + "step": 7464 + }, + { + "epoch": 5.171458261170765, + "grad_norm": 0.23726239800453186, + "learning_rate": 4.830097087378641e-06, + "loss": 0.0046, + "step": 7465 + }, + { + "epoch": 5.172151021821961, + "grad_norm": 0.37569645047187805, + "learning_rate": 4.8294036061026354e-06, + "loss": 0.0085, + "step": 7466 + }, + { + "epoch": 5.172843782473156, + "grad_norm": 0.3211580514907837, + "learning_rate": 4.8287101248266295e-06, + "loss": 0.0059, + "step": 7467 + }, + { + "epoch": 5.17353654312435, + "grad_norm": 0.24997110664844513, + "learning_rate": 4.8280166435506245e-06, + "loss": 0.0057, + "step": 7468 + }, + { + "epoch": 5.174229303775546, + "grad_norm": 0.28084564208984375, + "learning_rate": 4.827323162274619e-06, + "loss": 0.0089, + "step": 7469 + }, + { + "epoch": 5.174922064426741, + "grad_norm": 0.36999210715293884, + "learning_rate": 4.8266296809986135e-06, + "loss": 0.0079, + "step": 7470 + }, + { + "epoch": 5.175614825077935, + "grad_norm": 0.36330100893974304, + "learning_rate": 4.8259361997226076e-06, + "loss": 0.0078, + "step": 7471 + }, + { + "epoch": 5.1763075857291305, + "grad_norm": 0.45394986867904663, + "learning_rate": 4.8252427184466025e-06, + "loss": 0.0064, + "step": 7472 + }, + { + "epoch": 5.177000346380326, + "grad_norm": 0.42422235012054443, + "learning_rate": 4.824549237170597e-06, + "loss": 0.0091, + "step": 7473 + }, + { + "epoch": 5.17769310703152, + "grad_norm": 0.21235227584838867, + "learning_rate": 4.8238557558945915e-06, + "loss": 0.0048, + "step": 7474 + }, + { + "epoch": 5.1783858676827155, + "grad_norm": 0.42402246594429016, + "learning_rate": 4.823162274618586e-06, + "loss": 0.0055, + "step": 7475 + }, + { + "epoch": 5.179078628333911, + "grad_norm": 0.33939579129219055, + "learning_rate": 4.82246879334258e-06, + "loss": 0.0066, + "step": 7476 + }, + { + "epoch": 5.179771388985106, + "grad_norm": 0.3085120916366577, + "learning_rate": 4.821775312066575e-06, + "loss": 0.0068, + "step": 7477 + }, + { + "epoch": 5.1804641496363, + "grad_norm": 0.4421272575855255, + "learning_rate": 4.821081830790569e-06, + "loss": 0.0111, + "step": 7478 + }, + { + "epoch": 5.181156910287496, + "grad_norm": 0.3783268630504608, + "learning_rate": 4.820388349514564e-06, + "loss": 0.0073, + "step": 7479 + }, + { + "epoch": 5.181849670938691, + "grad_norm": 0.2809380292892456, + "learning_rate": 4.819694868238558e-06, + "loss": 0.0078, + "step": 7480 + }, + { + "epoch": 5.182542431589885, + "grad_norm": 0.31724241375923157, + "learning_rate": 4.819001386962553e-06, + "loss": 0.0079, + "step": 7481 + }, + { + "epoch": 5.183235192241081, + "grad_norm": 0.5670633912086487, + "learning_rate": 4.818307905686547e-06, + "loss": 0.0049, + "step": 7482 + }, + { + "epoch": 5.183927952892276, + "grad_norm": 0.27482205629348755, + "learning_rate": 4.817614424410541e-06, + "loss": 0.006, + "step": 7483 + }, + { + "epoch": 5.18462071354347, + "grad_norm": 0.2535606622695923, + "learning_rate": 4.816920943134536e-06, + "loss": 0.0063, + "step": 7484 + }, + { + "epoch": 5.185313474194666, + "grad_norm": 0.28620415925979614, + "learning_rate": 4.816227461858531e-06, + "loss": 0.0093, + "step": 7485 + }, + { + "epoch": 5.186006234845861, + "grad_norm": 0.2894137501716614, + "learning_rate": 4.815533980582525e-06, + "loss": 0.0084, + "step": 7486 + }, + { + "epoch": 5.186698995497056, + "grad_norm": 0.36168500781059265, + "learning_rate": 4.814840499306519e-06, + "loss": 0.0079, + "step": 7487 + }, + { + "epoch": 5.1873917561482505, + "grad_norm": 0.2551228404045105, + "learning_rate": 4.814147018030513e-06, + "loss": 0.0069, + "step": 7488 + }, + { + "epoch": 5.188084516799446, + "grad_norm": 0.29550832509994507, + "learning_rate": 4.813453536754508e-06, + "loss": 0.0086, + "step": 7489 + }, + { + "epoch": 5.188777277450641, + "grad_norm": 0.3164914548397064, + "learning_rate": 4.812760055478503e-06, + "loss": 0.0077, + "step": 7490 + }, + { + "epoch": 5.1894700381018355, + "grad_norm": 0.20070882141590118, + "learning_rate": 4.812066574202497e-06, + "loss": 0.0052, + "step": 7491 + }, + { + "epoch": 5.190162798753031, + "grad_norm": 0.33653250336647034, + "learning_rate": 4.811373092926491e-06, + "loss": 0.007, + "step": 7492 + }, + { + "epoch": 5.190855559404226, + "grad_norm": 0.38830628991127014, + "learning_rate": 4.810679611650486e-06, + "loss": 0.0091, + "step": 7493 + }, + { + "epoch": 5.19154832005542, + "grad_norm": 0.25447553396224976, + "learning_rate": 4.809986130374481e-06, + "loss": 0.0056, + "step": 7494 + }, + { + "epoch": 5.192241080706616, + "grad_norm": 0.356847882270813, + "learning_rate": 4.809292649098475e-06, + "loss": 0.0105, + "step": 7495 + }, + { + "epoch": 5.192933841357811, + "grad_norm": 0.3971211910247803, + "learning_rate": 4.808599167822469e-06, + "loss": 0.0073, + "step": 7496 + }, + { + "epoch": 5.193626602009006, + "grad_norm": 0.2789386212825775, + "learning_rate": 4.807905686546463e-06, + "loss": 0.0086, + "step": 7497 + }, + { + "epoch": 5.194319362660201, + "grad_norm": 0.4150344431400299, + "learning_rate": 4.807212205270458e-06, + "loss": 0.0069, + "step": 7498 + }, + { + "epoch": 5.195012123311396, + "grad_norm": 0.3591524660587311, + "learning_rate": 4.806518723994453e-06, + "loss": 0.0064, + "step": 7499 + }, + { + "epoch": 5.195704883962591, + "grad_norm": 0.30835917592048645, + "learning_rate": 4.805825242718447e-06, + "loss": 0.007, + "step": 7500 + }, + { + "epoch": 5.196397644613786, + "grad_norm": 0.36681753396987915, + "learning_rate": 4.805131761442441e-06, + "loss": 0.008, + "step": 7501 + }, + { + "epoch": 5.197090405264981, + "grad_norm": 0.30755317211151123, + "learning_rate": 4.804438280166436e-06, + "loss": 0.0073, + "step": 7502 + }, + { + "epoch": 5.197783165916176, + "grad_norm": 0.40762585401535034, + "learning_rate": 4.80374479889043e-06, + "loss": 0.0083, + "step": 7503 + }, + { + "epoch": 5.198475926567371, + "grad_norm": 0.26317718625068665, + "learning_rate": 4.803051317614425e-06, + "loss": 0.0051, + "step": 7504 + }, + { + "epoch": 5.199168687218566, + "grad_norm": 0.48002341389656067, + "learning_rate": 4.802357836338419e-06, + "loss": 0.0064, + "step": 7505 + }, + { + "epoch": 5.199861447869761, + "grad_norm": 0.3856843411922455, + "learning_rate": 4.801664355062413e-06, + "loss": 0.0081, + "step": 7506 + }, + { + "epoch": 5.200554208520956, + "grad_norm": 0.3101407289505005, + "learning_rate": 4.800970873786408e-06, + "loss": 0.0062, + "step": 7507 + }, + { + "epoch": 5.201246969172151, + "grad_norm": 0.29151254892349243, + "learning_rate": 4.800277392510402e-06, + "loss": 0.0072, + "step": 7508 + }, + { + "epoch": 5.201939729823346, + "grad_norm": 0.3234097361564636, + "learning_rate": 4.799583911234397e-06, + "loss": 0.0076, + "step": 7509 + }, + { + "epoch": 5.202632490474541, + "grad_norm": 0.32400745153427124, + "learning_rate": 4.798890429958391e-06, + "loss": 0.0118, + "step": 7510 + }, + { + "epoch": 5.203325251125736, + "grad_norm": 0.350826233625412, + "learning_rate": 4.798196948682386e-06, + "loss": 0.0098, + "step": 7511 + }, + { + "epoch": 5.204018011776931, + "grad_norm": 0.2611273229122162, + "learning_rate": 4.79750346740638e-06, + "loss": 0.005, + "step": 7512 + }, + { + "epoch": 5.204710772428126, + "grad_norm": 0.5315921306610107, + "learning_rate": 4.7968099861303744e-06, + "loss": 0.008, + "step": 7513 + }, + { + "epoch": 5.205403533079321, + "grad_norm": 0.30067697167396545, + "learning_rate": 4.796116504854369e-06, + "loss": 0.0069, + "step": 7514 + }, + { + "epoch": 5.206096293730516, + "grad_norm": 0.37230080366134644, + "learning_rate": 4.7954230235783635e-06, + "loss": 0.0065, + "step": 7515 + }, + { + "epoch": 5.206789054381711, + "grad_norm": 0.36270418763160706, + "learning_rate": 4.794729542302358e-06, + "loss": 0.0084, + "step": 7516 + }, + { + "epoch": 5.2074818150329065, + "grad_norm": 0.29917728900909424, + "learning_rate": 4.7940360610263525e-06, + "loss": 0.0063, + "step": 7517 + }, + { + "epoch": 5.208174575684101, + "grad_norm": 0.3706039488315582, + "learning_rate": 4.7933425797503466e-06, + "loss": 0.0076, + "step": 7518 + }, + { + "epoch": 5.208867336335296, + "grad_norm": 0.37487998604774475, + "learning_rate": 4.7926490984743415e-06, + "loss": 0.0068, + "step": 7519 + }, + { + "epoch": 5.2095600969864915, + "grad_norm": 0.20690150558948517, + "learning_rate": 4.7919556171983364e-06, + "loss": 0.0046, + "step": 7520 + }, + { + "epoch": 5.210252857637686, + "grad_norm": 0.340813010931015, + "learning_rate": 4.7912621359223305e-06, + "loss": 0.0071, + "step": 7521 + }, + { + "epoch": 5.210945618288881, + "grad_norm": 0.19376729428768158, + "learning_rate": 4.790568654646325e-06, + "loss": 0.0038, + "step": 7522 + }, + { + "epoch": 5.211638378940076, + "grad_norm": 0.3811071515083313, + "learning_rate": 4.7898751733703195e-06, + "loss": 0.0101, + "step": 7523 + }, + { + "epoch": 5.212331139591271, + "grad_norm": 0.26559945940971375, + "learning_rate": 4.789181692094314e-06, + "loss": 0.0057, + "step": 7524 + }, + { + "epoch": 5.213023900242466, + "grad_norm": 0.2858808636665344, + "learning_rate": 4.7884882108183086e-06, + "loss": 0.0064, + "step": 7525 + }, + { + "epoch": 5.213716660893661, + "grad_norm": 0.29177382588386536, + "learning_rate": 4.787794729542303e-06, + "loss": 0.0077, + "step": 7526 + }, + { + "epoch": 5.214409421544857, + "grad_norm": 0.20582586526870728, + "learning_rate": 4.787101248266297e-06, + "loss": 0.0059, + "step": 7527 + }, + { + "epoch": 5.215102182196051, + "grad_norm": 0.26234903931617737, + "learning_rate": 4.786407766990292e-06, + "loss": 0.0061, + "step": 7528 + }, + { + "epoch": 5.215794942847246, + "grad_norm": 0.32922959327697754, + "learning_rate": 4.785714285714287e-06, + "loss": 0.0084, + "step": 7529 + }, + { + "epoch": 5.216487703498442, + "grad_norm": 0.3447861671447754, + "learning_rate": 4.785020804438281e-06, + "loss": 0.0073, + "step": 7530 + }, + { + "epoch": 5.217180464149636, + "grad_norm": 0.5398739576339722, + "learning_rate": 4.784327323162275e-06, + "loss": 0.0053, + "step": 7531 + }, + { + "epoch": 5.217873224800831, + "grad_norm": 0.3625837564468384, + "learning_rate": 4.78363384188627e-06, + "loss": 0.0108, + "step": 7532 + }, + { + "epoch": 5.2185659854520265, + "grad_norm": 0.4039744436740875, + "learning_rate": 4.782940360610264e-06, + "loss": 0.0072, + "step": 7533 + }, + { + "epoch": 5.219258746103221, + "grad_norm": 0.3538312315940857, + "learning_rate": 4.782246879334259e-06, + "loss": 0.0078, + "step": 7534 + }, + { + "epoch": 5.219951506754416, + "grad_norm": 0.3611268103122711, + "learning_rate": 4.781553398058253e-06, + "loss": 0.007, + "step": 7535 + }, + { + "epoch": 5.2206442674056115, + "grad_norm": 0.35851985216140747, + "learning_rate": 4.780859916782247e-06, + "loss": 0.0074, + "step": 7536 + }, + { + "epoch": 5.221337028056807, + "grad_norm": 0.3169112205505371, + "learning_rate": 4.780166435506242e-06, + "loss": 0.0076, + "step": 7537 + }, + { + "epoch": 5.222029788708001, + "grad_norm": 0.34461987018585205, + "learning_rate": 4.779472954230236e-06, + "loss": 0.0073, + "step": 7538 + }, + { + "epoch": 5.222722549359196, + "grad_norm": 0.27043017745018005, + "learning_rate": 4.778779472954231e-06, + "loss": 0.0058, + "step": 7539 + }, + { + "epoch": 5.223415310010392, + "grad_norm": 0.3345284163951874, + "learning_rate": 4.778085991678225e-06, + "loss": 0.0065, + "step": 7540 + }, + { + "epoch": 5.224108070661586, + "grad_norm": 0.42769333720207214, + "learning_rate": 4.77739251040222e-06, + "loss": 0.0082, + "step": 7541 + }, + { + "epoch": 5.224800831312781, + "grad_norm": 0.3785141706466675, + "learning_rate": 4.776699029126214e-06, + "loss": 0.0092, + "step": 7542 + }, + { + "epoch": 5.225493591963977, + "grad_norm": 0.2181730568408966, + "learning_rate": 4.776005547850208e-06, + "loss": 0.0041, + "step": 7543 + }, + { + "epoch": 5.226186352615171, + "grad_norm": 0.3941933214664459, + "learning_rate": 4.775312066574203e-06, + "loss": 0.0061, + "step": 7544 + }, + { + "epoch": 5.226879113266366, + "grad_norm": 0.22325095534324646, + "learning_rate": 4.774618585298197e-06, + "loss": 0.0058, + "step": 7545 + }, + { + "epoch": 5.227571873917562, + "grad_norm": 0.36726585030555725, + "learning_rate": 4.773925104022192e-06, + "loss": 0.0067, + "step": 7546 + }, + { + "epoch": 5.228264634568757, + "grad_norm": 0.25020918250083923, + "learning_rate": 4.773231622746186e-06, + "loss": 0.0075, + "step": 7547 + }, + { + "epoch": 5.228957395219951, + "grad_norm": 0.2851022779941559, + "learning_rate": 4.77253814147018e-06, + "loss": 0.0069, + "step": 7548 + }, + { + "epoch": 5.229650155871147, + "grad_norm": 0.28374752402305603, + "learning_rate": 4.771844660194175e-06, + "loss": 0.0071, + "step": 7549 + }, + { + "epoch": 5.230342916522342, + "grad_norm": 0.3818637728691101, + "learning_rate": 4.77115117891817e-06, + "loss": 0.0081, + "step": 7550 + }, + { + "epoch": 5.231035677173536, + "grad_norm": 0.8067923784255981, + "learning_rate": 4.770457697642164e-06, + "loss": 0.0081, + "step": 7551 + }, + { + "epoch": 5.2317284378247315, + "grad_norm": 0.4985780715942383, + "learning_rate": 4.769764216366158e-06, + "loss": 0.0086, + "step": 7552 + }, + { + "epoch": 5.232421198475927, + "grad_norm": 0.3261982798576355, + "learning_rate": 4.769070735090153e-06, + "loss": 0.007, + "step": 7553 + }, + { + "epoch": 5.233113959127121, + "grad_norm": 0.3235166668891907, + "learning_rate": 4.768377253814147e-06, + "loss": 0.0077, + "step": 7554 + }, + { + "epoch": 5.2338067197783165, + "grad_norm": 0.3782225251197815, + "learning_rate": 4.767683772538142e-06, + "loss": 0.005, + "step": 7555 + }, + { + "epoch": 5.234499480429512, + "grad_norm": 0.3849373459815979, + "learning_rate": 4.766990291262136e-06, + "loss": 0.0092, + "step": 7556 + }, + { + "epoch": 5.235192241080707, + "grad_norm": 0.36994263529777527, + "learning_rate": 4.76629680998613e-06, + "loss": 0.007, + "step": 7557 + }, + { + "epoch": 5.235885001731901, + "grad_norm": 0.3145955502986908, + "learning_rate": 4.765603328710125e-06, + "loss": 0.0066, + "step": 7558 + }, + { + "epoch": 5.236577762383097, + "grad_norm": 0.2789524793624878, + "learning_rate": 4.76490984743412e-06, + "loss": 0.0066, + "step": 7559 + }, + { + "epoch": 5.237270523034292, + "grad_norm": 0.509216845035553, + "learning_rate": 4.764216366158114e-06, + "loss": 0.0066, + "step": 7560 + }, + { + "epoch": 5.237963283685486, + "grad_norm": 0.44652724266052246, + "learning_rate": 4.763522884882108e-06, + "loss": 0.0074, + "step": 7561 + }, + { + "epoch": 5.238656044336682, + "grad_norm": 0.3717997074127197, + "learning_rate": 4.762829403606103e-06, + "loss": 0.008, + "step": 7562 + }, + { + "epoch": 5.239348804987877, + "grad_norm": 0.30628785490989685, + "learning_rate": 4.762135922330097e-06, + "loss": 0.0068, + "step": 7563 + }, + { + "epoch": 5.240041565639071, + "grad_norm": 0.27223464846611023, + "learning_rate": 4.761442441054092e-06, + "loss": 0.0059, + "step": 7564 + }, + { + "epoch": 5.240734326290267, + "grad_norm": 0.2744392454624176, + "learning_rate": 4.760748959778086e-06, + "loss": 0.0058, + "step": 7565 + }, + { + "epoch": 5.241427086941462, + "grad_norm": 0.3380274474620819, + "learning_rate": 4.7600554785020805e-06, + "loss": 0.0074, + "step": 7566 + }, + { + "epoch": 5.242119847592657, + "grad_norm": 0.41907307505607605, + "learning_rate": 4.7593619972260754e-06, + "loss": 0.0086, + "step": 7567 + }, + { + "epoch": 5.2428126082438515, + "grad_norm": 0.2549433410167694, + "learning_rate": 4.7586685159500695e-06, + "loss": 0.0054, + "step": 7568 + }, + { + "epoch": 5.243505368895047, + "grad_norm": 0.5552940368652344, + "learning_rate": 4.7579750346740645e-06, + "loss": 0.0075, + "step": 7569 + }, + { + "epoch": 5.244198129546242, + "grad_norm": 0.4340652525424957, + "learning_rate": 4.7572815533980585e-06, + "loss": 0.0109, + "step": 7570 + }, + { + "epoch": 5.2448908901974365, + "grad_norm": 0.3300117254257202, + "learning_rate": 4.7565880721220535e-06, + "loss": 0.0074, + "step": 7571 + }, + { + "epoch": 5.245583650848632, + "grad_norm": 0.3856252431869507, + "learning_rate": 4.7558945908460476e-06, + "loss": 0.007, + "step": 7572 + }, + { + "epoch": 5.246276411499827, + "grad_norm": 0.5354393720626831, + "learning_rate": 4.755201109570042e-06, + "loss": 0.0061, + "step": 7573 + }, + { + "epoch": 5.246969172151021, + "grad_norm": 0.43714696168899536, + "learning_rate": 4.754507628294037e-06, + "loss": 0.0089, + "step": 7574 + }, + { + "epoch": 5.247661932802217, + "grad_norm": 0.2474542111158371, + "learning_rate": 4.753814147018031e-06, + "loss": 0.0056, + "step": 7575 + }, + { + "epoch": 5.248354693453412, + "grad_norm": 0.31160688400268555, + "learning_rate": 4.753120665742026e-06, + "loss": 0.0069, + "step": 7576 + }, + { + "epoch": 5.249047454104607, + "grad_norm": 0.3125763237476349, + "learning_rate": 4.75242718446602e-06, + "loss": 0.0075, + "step": 7577 + }, + { + "epoch": 5.249740214755802, + "grad_norm": 0.354356586933136, + "learning_rate": 4.751733703190014e-06, + "loss": 0.0088, + "step": 7578 + }, + { + "epoch": 5.250432975406997, + "grad_norm": 0.26645520329475403, + "learning_rate": 4.751040221914009e-06, + "loss": 0.0063, + "step": 7579 + }, + { + "epoch": 5.251125736058192, + "grad_norm": 0.4143542945384979, + "learning_rate": 4.750346740638004e-06, + "loss": 0.0088, + "step": 7580 + }, + { + "epoch": 5.251818496709387, + "grad_norm": 0.2791270315647125, + "learning_rate": 4.749653259361998e-06, + "loss": 0.006, + "step": 7581 + }, + { + "epoch": 5.252511257360582, + "grad_norm": 0.32284653186798096, + "learning_rate": 4.748959778085992e-06, + "loss": 0.0079, + "step": 7582 + }, + { + "epoch": 5.253204018011777, + "grad_norm": 0.3721042275428772, + "learning_rate": 4.748266296809986e-06, + "loss": 0.0085, + "step": 7583 + }, + { + "epoch": 5.253896778662972, + "grad_norm": 0.27737486362457275, + "learning_rate": 4.747572815533981e-06, + "loss": 0.0056, + "step": 7584 + }, + { + "epoch": 5.254589539314167, + "grad_norm": 0.3773183226585388, + "learning_rate": 4.746879334257976e-06, + "loss": 0.0112, + "step": 7585 + }, + { + "epoch": 5.255282299965362, + "grad_norm": 0.6213601231575012, + "learning_rate": 4.74618585298197e-06, + "loss": 0.0089, + "step": 7586 + }, + { + "epoch": 5.255975060616557, + "grad_norm": 0.37571918964385986, + "learning_rate": 4.745492371705964e-06, + "loss": 0.008, + "step": 7587 + }, + { + "epoch": 5.256667821267752, + "grad_norm": 0.39039427042007446, + "learning_rate": 4.744798890429959e-06, + "loss": 0.0074, + "step": 7588 + }, + { + "epoch": 5.257360581918947, + "grad_norm": 0.3099873661994934, + "learning_rate": 4.744105409153954e-06, + "loss": 0.0068, + "step": 7589 + }, + { + "epoch": 5.258053342570142, + "grad_norm": 0.41337692737579346, + "learning_rate": 4.743411927877948e-06, + "loss": 0.0077, + "step": 7590 + }, + { + "epoch": 5.258746103221337, + "grad_norm": 0.38124918937683105, + "learning_rate": 4.742718446601942e-06, + "loss": 0.0063, + "step": 7591 + }, + { + "epoch": 5.259438863872532, + "grad_norm": 0.295126736164093, + "learning_rate": 4.742024965325936e-06, + "loss": 0.008, + "step": 7592 + }, + { + "epoch": 5.260131624523727, + "grad_norm": 0.27431103587150574, + "learning_rate": 4.741331484049931e-06, + "loss": 0.0054, + "step": 7593 + }, + { + "epoch": 5.260824385174922, + "grad_norm": 0.28997060656547546, + "learning_rate": 4.740638002773926e-06, + "loss": 0.005, + "step": 7594 + }, + { + "epoch": 5.261517145826117, + "grad_norm": 0.37890487909317017, + "learning_rate": 4.73994452149792e-06, + "loss": 0.0051, + "step": 7595 + }, + { + "epoch": 5.262209906477312, + "grad_norm": 0.44293665885925293, + "learning_rate": 4.739251040221914e-06, + "loss": 0.0104, + "step": 7596 + }, + { + "epoch": 5.2629026671285075, + "grad_norm": 0.24331487715244293, + "learning_rate": 4.738557558945909e-06, + "loss": 0.0062, + "step": 7597 + }, + { + "epoch": 5.263595427779702, + "grad_norm": 0.4369862675666809, + "learning_rate": 4.737864077669903e-06, + "loss": 0.0062, + "step": 7598 + }, + { + "epoch": 5.264288188430897, + "grad_norm": 0.5328646302223206, + "learning_rate": 4.737170596393898e-06, + "loss": 0.0089, + "step": 7599 + }, + { + "epoch": 5.2649809490820925, + "grad_norm": 0.4848049283027649, + "learning_rate": 4.736477115117892e-06, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 5.265673709733287, + "grad_norm": 0.5734719634056091, + "learning_rate": 4.735783633841887e-06, + "loss": 0.0088, + "step": 7601 + }, + { + "epoch": 5.266366470384482, + "grad_norm": 0.3014770448207855, + "learning_rate": 4.735090152565881e-06, + "loss": 0.005, + "step": 7602 + }, + { + "epoch": 5.267059231035677, + "grad_norm": 0.4135104715824127, + "learning_rate": 4.734396671289875e-06, + "loss": 0.006, + "step": 7603 + }, + { + "epoch": 5.267751991686872, + "grad_norm": 0.40195783972740173, + "learning_rate": 4.73370319001387e-06, + "loss": 0.007, + "step": 7604 + }, + { + "epoch": 5.268444752338067, + "grad_norm": 0.25867411494255066, + "learning_rate": 4.733009708737864e-06, + "loss": 0.0065, + "step": 7605 + }, + { + "epoch": 5.269137512989262, + "grad_norm": 0.4152539372444153, + "learning_rate": 4.732316227461859e-06, + "loss": 0.0096, + "step": 7606 + }, + { + "epoch": 5.269830273640457, + "grad_norm": 0.38939064741134644, + "learning_rate": 4.731622746185853e-06, + "loss": 0.0075, + "step": 7607 + }, + { + "epoch": 5.270523034291652, + "grad_norm": 0.317944198846817, + "learning_rate": 4.730929264909847e-06, + "loss": 0.007, + "step": 7608 + }, + { + "epoch": 5.271215794942847, + "grad_norm": 0.29861703515052795, + "learning_rate": 4.730235783633842e-06, + "loss": 0.0062, + "step": 7609 + }, + { + "epoch": 5.271908555594043, + "grad_norm": 0.2839019000530243, + "learning_rate": 4.729542302357837e-06, + "loss": 0.0068, + "step": 7610 + }, + { + "epoch": 5.272601316245237, + "grad_norm": 0.3510481119155884, + "learning_rate": 4.728848821081831e-06, + "loss": 0.0072, + "step": 7611 + }, + { + "epoch": 5.273294076896432, + "grad_norm": 0.3694562017917633, + "learning_rate": 4.728155339805825e-06, + "loss": 0.0051, + "step": 7612 + }, + { + "epoch": 5.2739868375476275, + "grad_norm": 0.392853707075119, + "learning_rate": 4.7274618585298195e-06, + "loss": 0.0119, + "step": 7613 + }, + { + "epoch": 5.274679598198822, + "grad_norm": 0.32790088653564453, + "learning_rate": 4.7267683772538144e-06, + "loss": 0.007, + "step": 7614 + }, + { + "epoch": 5.275372358850017, + "grad_norm": 0.39944061636924744, + "learning_rate": 4.726074895977809e-06, + "loss": 0.0092, + "step": 7615 + }, + { + "epoch": 5.2760651195012125, + "grad_norm": 0.31208524107933044, + "learning_rate": 4.7253814147018035e-06, + "loss": 0.0058, + "step": 7616 + }, + { + "epoch": 5.276757880152408, + "grad_norm": 0.30092665553092957, + "learning_rate": 4.7246879334257975e-06, + "loss": 0.0082, + "step": 7617 + }, + { + "epoch": 5.277450640803602, + "grad_norm": 0.3765978217124939, + "learning_rate": 4.7239944521497925e-06, + "loss": 0.0073, + "step": 7618 + }, + { + "epoch": 5.278143401454797, + "grad_norm": 0.2820228338241577, + "learning_rate": 4.723300970873787e-06, + "loss": 0.0071, + "step": 7619 + }, + { + "epoch": 5.278836162105993, + "grad_norm": 0.26217740774154663, + "learning_rate": 4.7226074895977815e-06, + "loss": 0.0054, + "step": 7620 + }, + { + "epoch": 5.279528922757187, + "grad_norm": 0.5804328918457031, + "learning_rate": 4.721914008321776e-06, + "loss": 0.0094, + "step": 7621 + }, + { + "epoch": 5.280221683408382, + "grad_norm": 0.32458844780921936, + "learning_rate": 4.72122052704577e-06, + "loss": 0.007, + "step": 7622 + }, + { + "epoch": 5.280914444059578, + "grad_norm": 0.3897286057472229, + "learning_rate": 4.720527045769765e-06, + "loss": 0.0089, + "step": 7623 + }, + { + "epoch": 5.281607204710772, + "grad_norm": 0.4801514744758606, + "learning_rate": 4.7198335644937595e-06, + "loss": 0.0129, + "step": 7624 + }, + { + "epoch": 5.282299965361967, + "grad_norm": 0.31313589215278625, + "learning_rate": 4.719140083217754e-06, + "loss": 0.0083, + "step": 7625 + }, + { + "epoch": 5.282992726013163, + "grad_norm": 0.4712836444377899, + "learning_rate": 4.718446601941748e-06, + "loss": 0.0099, + "step": 7626 + }, + { + "epoch": 5.283685486664357, + "grad_norm": 0.30827465653419495, + "learning_rate": 4.717753120665743e-06, + "loss": 0.0065, + "step": 7627 + }, + { + "epoch": 5.284378247315552, + "grad_norm": 0.2368597537279129, + "learning_rate": 4.717059639389737e-06, + "loss": 0.0052, + "step": 7628 + }, + { + "epoch": 5.285071007966748, + "grad_norm": 0.3655679523944855, + "learning_rate": 4.716366158113732e-06, + "loss": 0.0064, + "step": 7629 + }, + { + "epoch": 5.285763768617943, + "grad_norm": 0.3163033127784729, + "learning_rate": 4.715672676837726e-06, + "loss": 0.0066, + "step": 7630 + }, + { + "epoch": 5.286456529269137, + "grad_norm": 0.32988113164901733, + "learning_rate": 4.71497919556172e-06, + "loss": 0.009, + "step": 7631 + }, + { + "epoch": 5.2871492899203325, + "grad_norm": 0.4472014904022217, + "learning_rate": 4.714285714285715e-06, + "loss": 0.0087, + "step": 7632 + }, + { + "epoch": 5.287842050571528, + "grad_norm": 0.2753150463104248, + "learning_rate": 4.713592233009709e-06, + "loss": 0.005, + "step": 7633 + }, + { + "epoch": 5.288534811222722, + "grad_norm": 0.4464603364467621, + "learning_rate": 4.712898751733704e-06, + "loss": 0.0088, + "step": 7634 + }, + { + "epoch": 5.2892275718739175, + "grad_norm": 0.18909068405628204, + "learning_rate": 4.712205270457698e-06, + "loss": 0.0045, + "step": 7635 + }, + { + "epoch": 5.289920332525113, + "grad_norm": 0.4636782705783844, + "learning_rate": 4.711511789181693e-06, + "loss": 0.0164, + "step": 7636 + }, + { + "epoch": 5.290613093176308, + "grad_norm": 0.3970611095428467, + "learning_rate": 4.710818307905687e-06, + "loss": 0.0069, + "step": 7637 + }, + { + "epoch": 5.291305853827502, + "grad_norm": 0.3895193934440613, + "learning_rate": 4.710124826629681e-06, + "loss": 0.0075, + "step": 7638 + }, + { + "epoch": 5.291998614478698, + "grad_norm": 0.3456301689147949, + "learning_rate": 4.709431345353676e-06, + "loss": 0.0091, + "step": 7639 + }, + { + "epoch": 5.292691375129893, + "grad_norm": 0.47521817684173584, + "learning_rate": 4.70873786407767e-06, + "loss": 0.0098, + "step": 7640 + }, + { + "epoch": 5.293384135781087, + "grad_norm": 0.3610929846763611, + "learning_rate": 4.708044382801665e-06, + "loss": 0.0085, + "step": 7641 + }, + { + "epoch": 5.294076896432283, + "grad_norm": 0.739068329334259, + "learning_rate": 4.707350901525659e-06, + "loss": 0.0069, + "step": 7642 + }, + { + "epoch": 5.294769657083478, + "grad_norm": 0.3092059791088104, + "learning_rate": 4.706657420249653e-06, + "loss": 0.0062, + "step": 7643 + }, + { + "epoch": 5.295462417734672, + "grad_norm": 0.3299868106842041, + "learning_rate": 4.705963938973648e-06, + "loss": 0.0072, + "step": 7644 + }, + { + "epoch": 5.296155178385868, + "grad_norm": 1.6031320095062256, + "learning_rate": 4.705270457697643e-06, + "loss": 0.0109, + "step": 7645 + }, + { + "epoch": 5.296847939037063, + "grad_norm": 0.3581763803958893, + "learning_rate": 4.704576976421637e-06, + "loss": 0.0073, + "step": 7646 + }, + { + "epoch": 5.297540699688257, + "grad_norm": 0.5154359936714172, + "learning_rate": 4.703883495145631e-06, + "loss": 0.0119, + "step": 7647 + }, + { + "epoch": 5.2982334603394525, + "grad_norm": 0.27011722326278687, + "learning_rate": 4.703190013869626e-06, + "loss": 0.0083, + "step": 7648 + }, + { + "epoch": 5.298926220990648, + "grad_norm": 0.4391592741012573, + "learning_rate": 4.702496532593621e-06, + "loss": 0.0112, + "step": 7649 + }, + { + "epoch": 5.299618981641843, + "grad_norm": 0.3810083270072937, + "learning_rate": 4.701803051317615e-06, + "loss": 0.0063, + "step": 7650 + }, + { + "epoch": 5.3003117422930375, + "grad_norm": 0.3371776044368744, + "learning_rate": 4.701109570041609e-06, + "loss": 0.0088, + "step": 7651 + }, + { + "epoch": 5.301004502944233, + "grad_norm": 0.2894117534160614, + "learning_rate": 4.700416088765603e-06, + "loss": 0.0077, + "step": 7652 + }, + { + "epoch": 5.301697263595428, + "grad_norm": 0.3736479878425598, + "learning_rate": 4.699722607489598e-06, + "loss": 0.0079, + "step": 7653 + }, + { + "epoch": 5.302390024246622, + "grad_norm": 0.2794745862483978, + "learning_rate": 4.699029126213593e-06, + "loss": 0.0053, + "step": 7654 + }, + { + "epoch": 5.303082784897818, + "grad_norm": 0.34775087237358093, + "learning_rate": 4.698335644937587e-06, + "loss": 0.0082, + "step": 7655 + }, + { + "epoch": 5.303775545549013, + "grad_norm": 0.45007821917533875, + "learning_rate": 4.697642163661581e-06, + "loss": 0.0076, + "step": 7656 + }, + { + "epoch": 5.304468306200208, + "grad_norm": 0.2986973524093628, + "learning_rate": 4.696948682385576e-06, + "loss": 0.0064, + "step": 7657 + }, + { + "epoch": 5.305161066851403, + "grad_norm": 0.3737943768501282, + "learning_rate": 4.69625520110957e-06, + "loss": 0.0054, + "step": 7658 + }, + { + "epoch": 5.305853827502598, + "grad_norm": 0.2908320128917694, + "learning_rate": 4.695561719833565e-06, + "loss": 0.006, + "step": 7659 + }, + { + "epoch": 5.306546588153793, + "grad_norm": 0.5494328737258911, + "learning_rate": 4.694868238557559e-06, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 5.307239348804988, + "grad_norm": 0.22597812116146088, + "learning_rate": 4.6941747572815534e-06, + "loss": 0.0053, + "step": 7661 + }, + { + "epoch": 5.307932109456183, + "grad_norm": 0.46040821075439453, + "learning_rate": 4.693481276005548e-06, + "loss": 0.0099, + "step": 7662 + }, + { + "epoch": 5.308624870107378, + "grad_norm": 0.381632536649704, + "learning_rate": 4.6927877947295425e-06, + "loss": 0.0086, + "step": 7663 + }, + { + "epoch": 5.3093176307585725, + "grad_norm": 0.283383309841156, + "learning_rate": 4.692094313453537e-06, + "loss": 0.006, + "step": 7664 + }, + { + "epoch": 5.310010391409768, + "grad_norm": 0.3281424045562744, + "learning_rate": 4.6914008321775315e-06, + "loss": 0.0061, + "step": 7665 + }, + { + "epoch": 5.310703152060963, + "grad_norm": 0.4245662987232208, + "learning_rate": 4.690707350901526e-06, + "loss": 0.0066, + "step": 7666 + }, + { + "epoch": 5.3113959127121575, + "grad_norm": 0.3177310824394226, + "learning_rate": 4.6900138696255205e-06, + "loss": 0.0064, + "step": 7667 + }, + { + "epoch": 5.312088673363353, + "grad_norm": 0.3352442681789398, + "learning_rate": 4.689320388349515e-06, + "loss": 0.0074, + "step": 7668 + }, + { + "epoch": 5.312781434014548, + "grad_norm": 0.24328415095806122, + "learning_rate": 4.6886269070735095e-06, + "loss": 0.0051, + "step": 7669 + }, + { + "epoch": 5.313474194665743, + "grad_norm": 0.32478755712509155, + "learning_rate": 4.687933425797504e-06, + "loss": 0.0083, + "step": 7670 + }, + { + "epoch": 5.314166955316938, + "grad_norm": 0.2994299530982971, + "learning_rate": 4.6872399445214985e-06, + "loss": 0.0073, + "step": 7671 + }, + { + "epoch": 5.314859715968133, + "grad_norm": 0.4358683228492737, + "learning_rate": 4.686546463245493e-06, + "loss": 0.0063, + "step": 7672 + }, + { + "epoch": 5.315552476619328, + "grad_norm": 0.27254951000213623, + "learning_rate": 4.685852981969487e-06, + "loss": 0.0062, + "step": 7673 + }, + { + "epoch": 5.316245237270523, + "grad_norm": 0.26246801018714905, + "learning_rate": 4.685159500693482e-06, + "loss": 0.005, + "step": 7674 + }, + { + "epoch": 5.316937997921718, + "grad_norm": 0.3456709384918213, + "learning_rate": 4.6844660194174766e-06, + "loss": 0.0069, + "step": 7675 + }, + { + "epoch": 5.317630758572913, + "grad_norm": 0.4331519901752472, + "learning_rate": 4.683772538141471e-06, + "loss": 0.0089, + "step": 7676 + }, + { + "epoch": 5.3183235192241085, + "grad_norm": 0.27074652910232544, + "learning_rate": 4.683079056865465e-06, + "loss": 0.006, + "step": 7677 + }, + { + "epoch": 5.319016279875303, + "grad_norm": 0.3390403985977173, + "learning_rate": 4.68238557558946e-06, + "loss": 0.0082, + "step": 7678 + }, + { + "epoch": 5.319709040526498, + "grad_norm": 0.3624959886074066, + "learning_rate": 4.681692094313454e-06, + "loss": 0.0057, + "step": 7679 + }, + { + "epoch": 5.3204018011776935, + "grad_norm": 0.28021004796028137, + "learning_rate": 4.680998613037449e-06, + "loss": 0.0051, + "step": 7680 + }, + { + "epoch": 5.321094561828888, + "grad_norm": 0.5038958191871643, + "learning_rate": 4.680305131761443e-06, + "loss": 0.0065, + "step": 7681 + }, + { + "epoch": 5.321787322480083, + "grad_norm": 0.31561705470085144, + "learning_rate": 4.679611650485437e-06, + "loss": 0.008, + "step": 7682 + }, + { + "epoch": 5.322480083131278, + "grad_norm": 0.39957281947135925, + "learning_rate": 4.678918169209432e-06, + "loss": 0.0071, + "step": 7683 + }, + { + "epoch": 5.323172843782473, + "grad_norm": 0.3455754816532135, + "learning_rate": 4.678224687933427e-06, + "loss": 0.0103, + "step": 7684 + }, + { + "epoch": 5.323865604433668, + "grad_norm": 0.5441429615020752, + "learning_rate": 4.677531206657421e-06, + "loss": 0.0082, + "step": 7685 + }, + { + "epoch": 5.324558365084863, + "grad_norm": 0.26968908309936523, + "learning_rate": 4.676837725381415e-06, + "loss": 0.0068, + "step": 7686 + }, + { + "epoch": 5.325251125736058, + "grad_norm": 0.32514292001724243, + "learning_rate": 4.67614424410541e-06, + "loss": 0.01, + "step": 7687 + }, + { + "epoch": 5.325943886387253, + "grad_norm": 0.3907332718372345, + "learning_rate": 4.675450762829404e-06, + "loss": 0.0095, + "step": 7688 + }, + { + "epoch": 5.326636647038448, + "grad_norm": 0.4088253974914551, + "learning_rate": 4.674757281553399e-06, + "loss": 0.0094, + "step": 7689 + }, + { + "epoch": 5.327329407689644, + "grad_norm": 0.3880894184112549, + "learning_rate": 4.674063800277393e-06, + "loss": 0.006, + "step": 7690 + }, + { + "epoch": 5.328022168340838, + "grad_norm": 0.266454815864563, + "learning_rate": 4.673370319001387e-06, + "loss": 0.0075, + "step": 7691 + }, + { + "epoch": 5.328714928992033, + "grad_norm": 0.4018932580947876, + "learning_rate": 4.672676837725382e-06, + "loss": 0.0087, + "step": 7692 + }, + { + "epoch": 5.3294076896432285, + "grad_norm": 0.42245224118232727, + "learning_rate": 4.671983356449376e-06, + "loss": 0.0077, + "step": 7693 + }, + { + "epoch": 5.330100450294423, + "grad_norm": 0.2805998623371124, + "learning_rate": 4.671289875173371e-06, + "loss": 0.0063, + "step": 7694 + }, + { + "epoch": 5.330793210945618, + "grad_norm": 0.37564656138420105, + "learning_rate": 4.670596393897365e-06, + "loss": 0.0095, + "step": 7695 + }, + { + "epoch": 5.3314859715968135, + "grad_norm": 0.29174837470054626, + "learning_rate": 4.66990291262136e-06, + "loss": 0.0099, + "step": 7696 + }, + { + "epoch": 5.332178732248009, + "grad_norm": 0.47423186898231506, + "learning_rate": 4.669209431345354e-06, + "loss": 0.0079, + "step": 7697 + }, + { + "epoch": 5.332871492899203, + "grad_norm": 0.3046198785305023, + "learning_rate": 4.668515950069348e-06, + "loss": 0.0065, + "step": 7698 + }, + { + "epoch": 5.333564253550398, + "grad_norm": 0.3165915608406067, + "learning_rate": 4.667822468793343e-06, + "loss": 0.0066, + "step": 7699 + }, + { + "epoch": 5.334257014201594, + "grad_norm": 0.29851290583610535, + "learning_rate": 4.667128987517337e-06, + "loss": 0.0062, + "step": 7700 + }, + { + "epoch": 5.334949774852788, + "grad_norm": 0.22100408375263214, + "learning_rate": 4.666435506241332e-06, + "loss": 0.006, + "step": 7701 + }, + { + "epoch": 5.335642535503983, + "grad_norm": 0.30884596705436707, + "learning_rate": 4.665742024965326e-06, + "loss": 0.0052, + "step": 7702 + }, + { + "epoch": 5.336335296155179, + "grad_norm": 0.2811336815357208, + "learning_rate": 4.66504854368932e-06, + "loss": 0.0071, + "step": 7703 + }, + { + "epoch": 5.337028056806373, + "grad_norm": 0.29546523094177246, + "learning_rate": 4.664355062413315e-06, + "loss": 0.0066, + "step": 7704 + }, + { + "epoch": 5.337720817457568, + "grad_norm": 0.4193173348903656, + "learning_rate": 4.66366158113731e-06, + "loss": 0.0061, + "step": 7705 + }, + { + "epoch": 5.338413578108764, + "grad_norm": 0.2141648828983307, + "learning_rate": 4.662968099861304e-06, + "loss": 0.0045, + "step": 7706 + }, + { + "epoch": 5.339106338759958, + "grad_norm": 0.30084890127182007, + "learning_rate": 4.662274618585298e-06, + "loss": 0.0071, + "step": 7707 + }, + { + "epoch": 5.339799099411153, + "grad_norm": 0.46289554238319397, + "learning_rate": 4.6615811373092924e-06, + "loss": 0.0058, + "step": 7708 + }, + { + "epoch": 5.3404918600623486, + "grad_norm": 0.34725630283355713, + "learning_rate": 4.660887656033287e-06, + "loss": 0.0063, + "step": 7709 + }, + { + "epoch": 5.341184620713544, + "grad_norm": 0.37042534351348877, + "learning_rate": 4.660194174757282e-06, + "loss": 0.0097, + "step": 7710 + }, + { + "epoch": 5.341877381364738, + "grad_norm": 0.30515289306640625, + "learning_rate": 4.659500693481276e-06, + "loss": 0.0066, + "step": 7711 + }, + { + "epoch": 5.3425701420159335, + "grad_norm": 0.5584580302238464, + "learning_rate": 4.6588072122052705e-06, + "loss": 0.0123, + "step": 7712 + }, + { + "epoch": 5.343262902667129, + "grad_norm": 0.42978665232658386, + "learning_rate": 4.658113730929265e-06, + "loss": 0.0088, + "step": 7713 + }, + { + "epoch": 5.343955663318323, + "grad_norm": 0.5490087270736694, + "learning_rate": 4.65742024965326e-06, + "loss": 0.0061, + "step": 7714 + }, + { + "epoch": 5.3446484239695184, + "grad_norm": 0.7571777105331421, + "learning_rate": 4.6567267683772544e-06, + "loss": 0.0154, + "step": 7715 + }, + { + "epoch": 5.345341184620714, + "grad_norm": 0.8619104623794556, + "learning_rate": 4.6560332871012485e-06, + "loss": 0.0091, + "step": 7716 + }, + { + "epoch": 5.346033945271909, + "grad_norm": 0.3772400915622711, + "learning_rate": 4.6553398058252435e-06, + "loss": 0.0082, + "step": 7717 + }, + { + "epoch": 5.346726705923103, + "grad_norm": 0.2627728581428528, + "learning_rate": 4.6546463245492375e-06, + "loss": 0.0063, + "step": 7718 + }, + { + "epoch": 5.347419466574299, + "grad_norm": 0.262724906206131, + "learning_rate": 4.6539528432732325e-06, + "loss": 0.0058, + "step": 7719 + }, + { + "epoch": 5.348112227225494, + "grad_norm": 0.23938456177711487, + "learning_rate": 4.6532593619972266e-06, + "loss": 0.0055, + "step": 7720 + }, + { + "epoch": 5.348804987876688, + "grad_norm": 0.3758114278316498, + "learning_rate": 4.652565880721221e-06, + "loss": 0.0081, + "step": 7721 + }, + { + "epoch": 5.349497748527884, + "grad_norm": 0.45761778950691223, + "learning_rate": 4.6518723994452156e-06, + "loss": 0.0113, + "step": 7722 + }, + { + "epoch": 5.350190509179079, + "grad_norm": 0.30680564045906067, + "learning_rate": 4.65117891816921e-06, + "loss": 0.0083, + "step": 7723 + }, + { + "epoch": 5.350883269830273, + "grad_norm": 0.42523193359375, + "learning_rate": 4.650485436893205e-06, + "loss": 0.0081, + "step": 7724 + }, + { + "epoch": 5.351576030481469, + "grad_norm": 0.4331026077270508, + "learning_rate": 4.649791955617199e-06, + "loss": 0.0112, + "step": 7725 + }, + { + "epoch": 5.352268791132664, + "grad_norm": 0.3689337372779846, + "learning_rate": 4.649098474341194e-06, + "loss": 0.0057, + "step": 7726 + }, + { + "epoch": 5.352961551783858, + "grad_norm": 0.5229584574699402, + "learning_rate": 4.648404993065188e-06, + "loss": 0.0087, + "step": 7727 + }, + { + "epoch": 5.3536543124350535, + "grad_norm": 0.7035202383995056, + "learning_rate": 4.647711511789182e-06, + "loss": 0.0119, + "step": 7728 + }, + { + "epoch": 5.354347073086249, + "grad_norm": 0.3946300148963928, + "learning_rate": 4.647018030513177e-06, + "loss": 0.0063, + "step": 7729 + }, + { + "epoch": 5.355039833737444, + "grad_norm": 0.49915385246276855, + "learning_rate": 4.646324549237171e-06, + "loss": 0.0104, + "step": 7730 + }, + { + "epoch": 5.3557325943886385, + "grad_norm": 0.3928295373916626, + "learning_rate": 4.645631067961166e-06, + "loss": 0.0085, + "step": 7731 + }, + { + "epoch": 5.356425355039834, + "grad_norm": 0.30644491314888, + "learning_rate": 4.64493758668516e-06, + "loss": 0.0068, + "step": 7732 + }, + { + "epoch": 5.357118115691029, + "grad_norm": 0.31353700160980225, + "learning_rate": 4.644244105409154e-06, + "loss": 0.0075, + "step": 7733 + }, + { + "epoch": 5.357810876342223, + "grad_norm": 0.43064355850219727, + "learning_rate": 4.643550624133149e-06, + "loss": 0.0088, + "step": 7734 + }, + { + "epoch": 5.358503636993419, + "grad_norm": 0.6127991080284119, + "learning_rate": 4.642857142857144e-06, + "loss": 0.0092, + "step": 7735 + }, + { + "epoch": 5.359196397644614, + "grad_norm": 0.39949271082878113, + "learning_rate": 4.642163661581138e-06, + "loss": 0.0073, + "step": 7736 + }, + { + "epoch": 5.359889158295809, + "grad_norm": 0.35690221190452576, + "learning_rate": 4.641470180305132e-06, + "loss": 0.0107, + "step": 7737 + }, + { + "epoch": 5.360581918947004, + "grad_norm": 0.34157228469848633, + "learning_rate": 4.640776699029126e-06, + "loss": 0.009, + "step": 7738 + }, + { + "epoch": 5.361274679598199, + "grad_norm": 0.3978438675403595, + "learning_rate": 4.640083217753121e-06, + "loss": 0.0081, + "step": 7739 + }, + { + "epoch": 5.361967440249394, + "grad_norm": 0.468851238489151, + "learning_rate": 4.639389736477116e-06, + "loss": 0.0127, + "step": 7740 + }, + { + "epoch": 5.362660200900589, + "grad_norm": 0.297496497631073, + "learning_rate": 4.63869625520111e-06, + "loss": 0.0086, + "step": 7741 + }, + { + "epoch": 5.363352961551784, + "grad_norm": 0.32384419441223145, + "learning_rate": 4.638002773925104e-06, + "loss": 0.0057, + "step": 7742 + }, + { + "epoch": 5.364045722202979, + "grad_norm": 0.2854473888874054, + "learning_rate": 4.637309292649099e-06, + "loss": 0.0052, + "step": 7743 + }, + { + "epoch": 5.3647384828541735, + "grad_norm": 0.5612171292304993, + "learning_rate": 4.636615811373094e-06, + "loss": 0.0086, + "step": 7744 + }, + { + "epoch": 5.365431243505369, + "grad_norm": 0.329746276140213, + "learning_rate": 4.635922330097088e-06, + "loss": 0.0089, + "step": 7745 + }, + { + "epoch": 5.366124004156564, + "grad_norm": 0.43329378962516785, + "learning_rate": 4.635228848821082e-06, + "loss": 0.0099, + "step": 7746 + }, + { + "epoch": 5.3668167648077585, + "grad_norm": 0.3019446134567261, + "learning_rate": 4.634535367545076e-06, + "loss": 0.0081, + "step": 7747 + }, + { + "epoch": 5.367509525458954, + "grad_norm": 0.30500757694244385, + "learning_rate": 4.633841886269071e-06, + "loss": 0.0068, + "step": 7748 + }, + { + "epoch": 5.368202286110149, + "grad_norm": 0.3315373361110687, + "learning_rate": 4.633148404993066e-06, + "loss": 0.0058, + "step": 7749 + }, + { + "epoch": 5.368895046761344, + "grad_norm": 0.42689260840415955, + "learning_rate": 4.63245492371706e-06, + "loss": 0.0082, + "step": 7750 + }, + { + "epoch": 5.369587807412539, + "grad_norm": 0.4819112718105316, + "learning_rate": 4.631761442441054e-06, + "loss": 0.0069, + "step": 7751 + }, + { + "epoch": 5.370280568063734, + "grad_norm": 0.4219898581504822, + "learning_rate": 4.631067961165049e-06, + "loss": 0.0072, + "step": 7752 + }, + { + "epoch": 5.370973328714929, + "grad_norm": 0.48407548666000366, + "learning_rate": 4.630374479889043e-06, + "loss": 0.0078, + "step": 7753 + }, + { + "epoch": 5.371666089366124, + "grad_norm": 0.33480745553970337, + "learning_rate": 4.629680998613038e-06, + "loss": 0.0059, + "step": 7754 + }, + { + "epoch": 5.372358850017319, + "grad_norm": 0.3247593939304352, + "learning_rate": 4.628987517337032e-06, + "loss": 0.0068, + "step": 7755 + }, + { + "epoch": 5.373051610668514, + "grad_norm": 0.38313329219818115, + "learning_rate": 4.628294036061026e-06, + "loss": 0.0116, + "step": 7756 + }, + { + "epoch": 5.3737443713197095, + "grad_norm": 0.4368196129798889, + "learning_rate": 4.627600554785021e-06, + "loss": 0.0074, + "step": 7757 + }, + { + "epoch": 5.374437131970904, + "grad_norm": 0.30036941170692444, + "learning_rate": 4.626907073509015e-06, + "loss": 0.0067, + "step": 7758 + }, + { + "epoch": 5.375129892622099, + "grad_norm": 0.2873584032058716, + "learning_rate": 4.62621359223301e-06, + "loss": 0.0063, + "step": 7759 + }, + { + "epoch": 5.3758226532732944, + "grad_norm": 0.40364065766334534, + "learning_rate": 4.625520110957004e-06, + "loss": 0.0082, + "step": 7760 + }, + { + "epoch": 5.376515413924489, + "grad_norm": 0.4257599413394928, + "learning_rate": 4.624826629680999e-06, + "loss": 0.0084, + "step": 7761 + }, + { + "epoch": 5.377208174575684, + "grad_norm": 0.3269195258617401, + "learning_rate": 4.6241331484049934e-06, + "loss": 0.0068, + "step": 7762 + }, + { + "epoch": 5.377900935226879, + "grad_norm": 0.40550678968429565, + "learning_rate": 4.6234396671289875e-06, + "loss": 0.0077, + "step": 7763 + }, + { + "epoch": 5.378593695878074, + "grad_norm": 0.4186185896396637, + "learning_rate": 4.6227461858529825e-06, + "loss": 0.0104, + "step": 7764 + }, + { + "epoch": 5.379286456529269, + "grad_norm": 0.45377662777900696, + "learning_rate": 4.622052704576977e-06, + "loss": 0.0093, + "step": 7765 + }, + { + "epoch": 5.379979217180464, + "grad_norm": 0.4113067090511322, + "learning_rate": 4.6213592233009715e-06, + "loss": 0.0079, + "step": 7766 + }, + { + "epoch": 5.380671977831659, + "grad_norm": 0.29760265350341797, + "learning_rate": 4.6206657420249656e-06, + "loss": 0.0056, + "step": 7767 + }, + { + "epoch": 5.381364738482854, + "grad_norm": 0.2859029769897461, + "learning_rate": 4.61997226074896e-06, + "loss": 0.0092, + "step": 7768 + }, + { + "epoch": 5.382057499134049, + "grad_norm": 0.40182769298553467, + "learning_rate": 4.619278779472955e-06, + "loss": 0.008, + "step": 7769 + }, + { + "epoch": 5.382750259785245, + "grad_norm": 0.3549944758415222, + "learning_rate": 4.6185852981969495e-06, + "loss": 0.007, + "step": 7770 + }, + { + "epoch": 5.383443020436439, + "grad_norm": 0.30594438314437866, + "learning_rate": 4.617891816920944e-06, + "loss": 0.0064, + "step": 7771 + }, + { + "epoch": 5.384135781087634, + "grad_norm": 0.3039894104003906, + "learning_rate": 4.617198335644938e-06, + "loss": 0.0044, + "step": 7772 + }, + { + "epoch": 5.3848285417388295, + "grad_norm": 0.256325900554657, + "learning_rate": 4.616504854368933e-06, + "loss": 0.0064, + "step": 7773 + }, + { + "epoch": 5.385521302390024, + "grad_norm": 0.3893841803073883, + "learning_rate": 4.615811373092927e-06, + "loss": 0.0061, + "step": 7774 + }, + { + "epoch": 5.386214063041219, + "grad_norm": 0.5432993769645691, + "learning_rate": 4.615117891816922e-06, + "loss": 0.011, + "step": 7775 + }, + { + "epoch": 5.3869068236924145, + "grad_norm": 0.30605578422546387, + "learning_rate": 4.614424410540916e-06, + "loss": 0.0062, + "step": 7776 + }, + { + "epoch": 5.38759958434361, + "grad_norm": 0.3061191439628601, + "learning_rate": 4.61373092926491e-06, + "loss": 0.0072, + "step": 7777 + }, + { + "epoch": 5.388292344994804, + "grad_norm": 0.597377598285675, + "learning_rate": 4.613037447988905e-06, + "loss": 0.0133, + "step": 7778 + }, + { + "epoch": 5.388985105645999, + "grad_norm": 0.5816665887832642, + "learning_rate": 4.612343966712899e-06, + "loss": 0.0083, + "step": 7779 + }, + { + "epoch": 5.389677866297195, + "grad_norm": 0.4072982370853424, + "learning_rate": 4.611650485436894e-06, + "loss": 0.0104, + "step": 7780 + }, + { + "epoch": 5.390370626948389, + "grad_norm": 0.3289441168308258, + "learning_rate": 4.610957004160888e-06, + "loss": 0.0064, + "step": 7781 + }, + { + "epoch": 5.391063387599584, + "grad_norm": 0.7042639851570129, + "learning_rate": 4.610263522884883e-06, + "loss": 0.0097, + "step": 7782 + }, + { + "epoch": 5.39175614825078, + "grad_norm": 0.3567148447036743, + "learning_rate": 4.609570041608877e-06, + "loss": 0.0125, + "step": 7783 + }, + { + "epoch": 5.392448908901974, + "grad_norm": 0.27430447936058044, + "learning_rate": 4.608876560332871e-06, + "loss": 0.0056, + "step": 7784 + }, + { + "epoch": 5.393141669553169, + "grad_norm": 0.9605273008346558, + "learning_rate": 4.608183079056866e-06, + "loss": 0.0092, + "step": 7785 + }, + { + "epoch": 5.393834430204365, + "grad_norm": 0.36006471514701843, + "learning_rate": 4.60748959778086e-06, + "loss": 0.0094, + "step": 7786 + }, + { + "epoch": 5.394527190855559, + "grad_norm": 0.3300853967666626, + "learning_rate": 4.606796116504855e-06, + "loss": 0.0092, + "step": 7787 + }, + { + "epoch": 5.395219951506754, + "grad_norm": 0.36270296573638916, + "learning_rate": 4.606102635228849e-06, + "loss": 0.0068, + "step": 7788 + }, + { + "epoch": 5.3959127121579495, + "grad_norm": 0.3791219890117645, + "learning_rate": 4.605409153952843e-06, + "loss": 0.0064, + "step": 7789 + }, + { + "epoch": 5.396605472809145, + "grad_norm": 0.26875039935112, + "learning_rate": 4.604715672676838e-06, + "loss": 0.0065, + "step": 7790 + }, + { + "epoch": 5.397298233460339, + "grad_norm": 0.33566567301750183, + "learning_rate": 4.604022191400833e-06, + "loss": 0.0054, + "step": 7791 + }, + { + "epoch": 5.3979909941115345, + "grad_norm": 0.39481306076049805, + "learning_rate": 4.603328710124827e-06, + "loss": 0.0085, + "step": 7792 + }, + { + "epoch": 5.39868375476273, + "grad_norm": 0.29390987753868103, + "learning_rate": 4.602635228848821e-06, + "loss": 0.0089, + "step": 7793 + }, + { + "epoch": 5.399376515413924, + "grad_norm": 0.3850598931312561, + "learning_rate": 4.601941747572816e-06, + "loss": 0.0088, + "step": 7794 + }, + { + "epoch": 5.400069276065119, + "grad_norm": 0.3381340503692627, + "learning_rate": 4.60124826629681e-06, + "loss": 0.0106, + "step": 7795 + }, + { + "epoch": 5.400762036716315, + "grad_norm": 0.3669929802417755, + "learning_rate": 4.600554785020805e-06, + "loss": 0.0091, + "step": 7796 + }, + { + "epoch": 5.40145479736751, + "grad_norm": 0.35967838764190674, + "learning_rate": 4.599861303744799e-06, + "loss": 0.0067, + "step": 7797 + }, + { + "epoch": 5.402147558018704, + "grad_norm": 0.3908219039440155, + "learning_rate": 4.599167822468793e-06, + "loss": 0.0059, + "step": 7798 + }, + { + "epoch": 5.4028403186699, + "grad_norm": 0.3632335364818573, + "learning_rate": 4.598474341192788e-06, + "loss": 0.007, + "step": 7799 + }, + { + "epoch": 5.403533079321095, + "grad_norm": 0.3082124590873718, + "learning_rate": 4.597780859916783e-06, + "loss": 0.0085, + "step": 7800 + }, + { + "epoch": 5.404225839972289, + "grad_norm": 0.338107168674469, + "learning_rate": 4.597087378640777e-06, + "loss": 0.0089, + "step": 7801 + }, + { + "epoch": 5.404918600623485, + "grad_norm": 0.3868095874786377, + "learning_rate": 4.596393897364771e-06, + "loss": 0.0098, + "step": 7802 + }, + { + "epoch": 5.40561136127468, + "grad_norm": 0.341936856508255, + "learning_rate": 4.595700416088766e-06, + "loss": 0.0078, + "step": 7803 + }, + { + "epoch": 5.406304121925874, + "grad_norm": 0.3599020540714264, + "learning_rate": 4.59500693481276e-06, + "loss": 0.006, + "step": 7804 + }, + { + "epoch": 5.40699688257707, + "grad_norm": 0.34239375591278076, + "learning_rate": 4.594313453536755e-06, + "loss": 0.0079, + "step": 7805 + }, + { + "epoch": 5.407689643228265, + "grad_norm": 0.3811340034008026, + "learning_rate": 4.593619972260749e-06, + "loss": 0.0101, + "step": 7806 + }, + { + "epoch": 5.408382403879459, + "grad_norm": 0.3789340555667877, + "learning_rate": 4.592926490984743e-06, + "loss": 0.0095, + "step": 7807 + }, + { + "epoch": 5.4090751645306545, + "grad_norm": 0.6665703654289246, + "learning_rate": 4.592233009708738e-06, + "loss": 0.0082, + "step": 7808 + }, + { + "epoch": 5.40976792518185, + "grad_norm": 0.41004064679145813, + "learning_rate": 4.5915395284327324e-06, + "loss": 0.0092, + "step": 7809 + }, + { + "epoch": 5.410460685833045, + "grad_norm": 0.5761812329292297, + "learning_rate": 4.590846047156727e-06, + "loss": 0.0107, + "step": 7810 + }, + { + "epoch": 5.4111534464842395, + "grad_norm": 0.4545287489891052, + "learning_rate": 4.5901525658807215e-06, + "loss": 0.0092, + "step": 7811 + }, + { + "epoch": 5.411846207135435, + "grad_norm": 0.3607684075832367, + "learning_rate": 4.589459084604716e-06, + "loss": 0.008, + "step": 7812 + }, + { + "epoch": 5.41253896778663, + "grad_norm": 0.24616214632987976, + "learning_rate": 4.5887656033287105e-06, + "loss": 0.0046, + "step": 7813 + }, + { + "epoch": 5.413231728437824, + "grad_norm": 0.2663242816925049, + "learning_rate": 4.5880721220527046e-06, + "loss": 0.0062, + "step": 7814 + }, + { + "epoch": 5.41392448908902, + "grad_norm": 0.3745112717151642, + "learning_rate": 4.5873786407766995e-06, + "loss": 0.0085, + "step": 7815 + }, + { + "epoch": 5.414617249740215, + "grad_norm": 0.35963526368141174, + "learning_rate": 4.586685159500694e-06, + "loss": 0.0072, + "step": 7816 + }, + { + "epoch": 5.415310010391409, + "grad_norm": 0.5829387307167053, + "learning_rate": 4.5859916782246885e-06, + "loss": 0.0087, + "step": 7817 + }, + { + "epoch": 5.416002771042605, + "grad_norm": 0.5039746165275574, + "learning_rate": 4.585298196948683e-06, + "loss": 0.0092, + "step": 7818 + }, + { + "epoch": 5.4166955316938, + "grad_norm": 0.42750078439712524, + "learning_rate": 4.584604715672677e-06, + "loss": 0.012, + "step": 7819 + }, + { + "epoch": 5.417388292344995, + "grad_norm": 0.4325467050075531, + "learning_rate": 4.583911234396672e-06, + "loss": 0.0081, + "step": 7820 + }, + { + "epoch": 5.41808105299619, + "grad_norm": 0.3611724078655243, + "learning_rate": 4.5832177531206666e-06, + "loss": 0.009, + "step": 7821 + }, + { + "epoch": 5.418773813647385, + "grad_norm": 0.4679085910320282, + "learning_rate": 4.582524271844661e-06, + "loss": 0.0086, + "step": 7822 + }, + { + "epoch": 5.41946657429858, + "grad_norm": 0.4069465100765228, + "learning_rate": 4.581830790568655e-06, + "loss": 0.0065, + "step": 7823 + }, + { + "epoch": 5.4201593349497745, + "grad_norm": 0.34985414147377014, + "learning_rate": 4.581137309292649e-06, + "loss": 0.0082, + "step": 7824 + }, + { + "epoch": 5.42085209560097, + "grad_norm": 0.3227502703666687, + "learning_rate": 4.580443828016644e-06, + "loss": 0.0062, + "step": 7825 + }, + { + "epoch": 5.421544856252165, + "grad_norm": 0.3587585687637329, + "learning_rate": 4.579750346740639e-06, + "loss": 0.0093, + "step": 7826 + }, + { + "epoch": 5.4222376169033595, + "grad_norm": 0.4926263689994812, + "learning_rate": 4.579056865464633e-06, + "loss": 0.0081, + "step": 7827 + }, + { + "epoch": 5.422930377554555, + "grad_norm": 0.47826123237609863, + "learning_rate": 4.578363384188627e-06, + "loss": 0.0099, + "step": 7828 + }, + { + "epoch": 5.42362313820575, + "grad_norm": 0.19716590642929077, + "learning_rate": 4.577669902912622e-06, + "loss": 0.0051, + "step": 7829 + }, + { + "epoch": 5.424315898856945, + "grad_norm": 0.22191399335861206, + "learning_rate": 4.576976421636617e-06, + "loss": 0.0049, + "step": 7830 + }, + { + "epoch": 5.42500865950814, + "grad_norm": 0.3407326340675354, + "learning_rate": 4.576282940360611e-06, + "loss": 0.0093, + "step": 7831 + }, + { + "epoch": 5.425701420159335, + "grad_norm": 0.39827200770378113, + "learning_rate": 4.575589459084605e-06, + "loss": 0.0132, + "step": 7832 + }, + { + "epoch": 5.42639418081053, + "grad_norm": 0.3168533146381378, + "learning_rate": 4.574895977808599e-06, + "loss": 0.0098, + "step": 7833 + }, + { + "epoch": 5.427086941461725, + "grad_norm": 0.2461700439453125, + "learning_rate": 4.574202496532594e-06, + "loss": 0.0073, + "step": 7834 + }, + { + "epoch": 5.42777970211292, + "grad_norm": 0.2408578246831894, + "learning_rate": 4.573509015256589e-06, + "loss": 0.0052, + "step": 7835 + }, + { + "epoch": 5.428472462764115, + "grad_norm": 0.38545313477516174, + "learning_rate": 4.572815533980583e-06, + "loss": 0.0099, + "step": 7836 + }, + { + "epoch": 5.42916522341531, + "grad_norm": 0.28080132603645325, + "learning_rate": 4.572122052704577e-06, + "loss": 0.0058, + "step": 7837 + }, + { + "epoch": 5.429857984066505, + "grad_norm": 0.367831826210022, + "learning_rate": 4.571428571428572e-06, + "loss": 0.0083, + "step": 7838 + }, + { + "epoch": 5.4305507447177, + "grad_norm": 0.35450807213783264, + "learning_rate": 4.570735090152566e-06, + "loss": 0.009, + "step": 7839 + }, + { + "epoch": 5.431243505368895, + "grad_norm": 0.5089789032936096, + "learning_rate": 4.570041608876561e-06, + "loss": 0.0099, + "step": 7840 + }, + { + "epoch": 5.43193626602009, + "grad_norm": 0.42711472511291504, + "learning_rate": 4.569348127600555e-06, + "loss": 0.0078, + "step": 7841 + }, + { + "epoch": 5.432629026671285, + "grad_norm": 0.46557801961898804, + "learning_rate": 4.56865464632455e-06, + "loss": 0.01, + "step": 7842 + }, + { + "epoch": 5.43332178732248, + "grad_norm": 0.29302334785461426, + "learning_rate": 4.567961165048544e-06, + "loss": 0.0067, + "step": 7843 + }, + { + "epoch": 5.434014547973675, + "grad_norm": 0.4012078046798706, + "learning_rate": 4.567267683772538e-06, + "loss": 0.0063, + "step": 7844 + }, + { + "epoch": 5.43470730862487, + "grad_norm": 0.5524371266365051, + "learning_rate": 4.566574202496533e-06, + "loss": 0.0114, + "step": 7845 + }, + { + "epoch": 5.435400069276065, + "grad_norm": 0.40067043900489807, + "learning_rate": 4.565880721220527e-06, + "loss": 0.0107, + "step": 7846 + }, + { + "epoch": 5.43609282992726, + "grad_norm": 0.3479993939399719, + "learning_rate": 4.565187239944522e-06, + "loss": 0.0094, + "step": 7847 + }, + { + "epoch": 5.436785590578455, + "grad_norm": 0.42340004444122314, + "learning_rate": 4.564493758668516e-06, + "loss": 0.0055, + "step": 7848 + }, + { + "epoch": 5.43747835122965, + "grad_norm": 0.32815006375312805, + "learning_rate": 4.56380027739251e-06, + "loss": 0.0075, + "step": 7849 + }, + { + "epoch": 5.438171111880846, + "grad_norm": 0.3321520984172821, + "learning_rate": 4.563106796116505e-06, + "loss": 0.0084, + "step": 7850 + }, + { + "epoch": 5.43886387253204, + "grad_norm": 0.3828342854976654, + "learning_rate": 4.5624133148405e-06, + "loss": 0.0126, + "step": 7851 + }, + { + "epoch": 5.439556633183235, + "grad_norm": 0.3134111166000366, + "learning_rate": 4.561719833564494e-06, + "loss": 0.0072, + "step": 7852 + }, + { + "epoch": 5.4402493938344305, + "grad_norm": 0.3222011923789978, + "learning_rate": 4.561026352288488e-06, + "loss": 0.0086, + "step": 7853 + }, + { + "epoch": 5.440942154485625, + "grad_norm": 0.3499179780483246, + "learning_rate": 4.560332871012482e-06, + "loss": 0.0067, + "step": 7854 + }, + { + "epoch": 5.44163491513682, + "grad_norm": 0.3237707316875458, + "learning_rate": 4.559639389736477e-06, + "loss": 0.006, + "step": 7855 + }, + { + "epoch": 5.4423276757880155, + "grad_norm": 0.3217258155345917, + "learning_rate": 4.558945908460472e-06, + "loss": 0.0061, + "step": 7856 + }, + { + "epoch": 5.44302043643921, + "grad_norm": 0.3172626793384552, + "learning_rate": 4.558252427184466e-06, + "loss": 0.0062, + "step": 7857 + }, + { + "epoch": 5.443713197090405, + "grad_norm": 0.23633909225463867, + "learning_rate": 4.5575589459084605e-06, + "loss": 0.0062, + "step": 7858 + }, + { + "epoch": 5.4444059577416, + "grad_norm": 0.3542218506336212, + "learning_rate": 4.556865464632455e-06, + "loss": 0.0068, + "step": 7859 + }, + { + "epoch": 5.445098718392796, + "grad_norm": 0.36565008759498596, + "learning_rate": 4.55617198335645e-06, + "loss": 0.0063, + "step": 7860 + }, + { + "epoch": 5.44579147904399, + "grad_norm": 0.4428595304489136, + "learning_rate": 4.555478502080444e-06, + "loss": 0.0061, + "step": 7861 + }, + { + "epoch": 5.446484239695185, + "grad_norm": 0.4287349283695221, + "learning_rate": 4.5547850208044385e-06, + "loss": 0.0068, + "step": 7862 + }, + { + "epoch": 5.447177000346381, + "grad_norm": 0.3254203200340271, + "learning_rate": 4.554091539528433e-06, + "loss": 0.0054, + "step": 7863 + }, + { + "epoch": 5.447869760997575, + "grad_norm": 0.4210508465766907, + "learning_rate": 4.5533980582524275e-06, + "loss": 0.0091, + "step": 7864 + }, + { + "epoch": 5.44856252164877, + "grad_norm": 0.36349746584892273, + "learning_rate": 4.5527045769764224e-06, + "loss": 0.0068, + "step": 7865 + }, + { + "epoch": 5.449255282299966, + "grad_norm": 0.5324582457542419, + "learning_rate": 4.5520110957004165e-06, + "loss": 0.01, + "step": 7866 + }, + { + "epoch": 5.44994804295116, + "grad_norm": 0.5588488578796387, + "learning_rate": 4.551317614424411e-06, + "loss": 0.0111, + "step": 7867 + }, + { + "epoch": 5.450640803602355, + "grad_norm": 0.2990497052669525, + "learning_rate": 4.5506241331484056e-06, + "loss": 0.0084, + "step": 7868 + }, + { + "epoch": 5.4513335642535505, + "grad_norm": 0.31489789485931396, + "learning_rate": 4.5499306518724e-06, + "loss": 0.0086, + "step": 7869 + }, + { + "epoch": 5.452026324904746, + "grad_norm": 0.2952931523323059, + "learning_rate": 4.5492371705963946e-06, + "loss": 0.0068, + "step": 7870 + }, + { + "epoch": 5.45271908555594, + "grad_norm": 0.32959821820259094, + "learning_rate": 4.548543689320389e-06, + "loss": 0.0086, + "step": 7871 + }, + { + "epoch": 5.4534118462071355, + "grad_norm": 0.44898277521133423, + "learning_rate": 4.547850208044383e-06, + "loss": 0.0074, + "step": 7872 + }, + { + "epoch": 5.454104606858331, + "grad_norm": 0.5427644848823547, + "learning_rate": 4.547156726768378e-06, + "loss": 0.0107, + "step": 7873 + }, + { + "epoch": 5.454797367509525, + "grad_norm": 0.33111703395843506, + "learning_rate": 4.546463245492372e-06, + "loss": 0.0064, + "step": 7874 + }, + { + "epoch": 5.45549012816072, + "grad_norm": 0.20245476067066193, + "learning_rate": 4.545769764216367e-06, + "loss": 0.0042, + "step": 7875 + }, + { + "epoch": 5.456182888811916, + "grad_norm": 0.5624719262123108, + "learning_rate": 4.545076282940361e-06, + "loss": 0.0114, + "step": 7876 + }, + { + "epoch": 5.45687564946311, + "grad_norm": 0.49390777945518494, + "learning_rate": 4.544382801664356e-06, + "loss": 0.0105, + "step": 7877 + }, + { + "epoch": 5.457568410114305, + "grad_norm": 0.34519869089126587, + "learning_rate": 4.54368932038835e-06, + "loss": 0.0081, + "step": 7878 + }, + { + "epoch": 5.458261170765501, + "grad_norm": 0.27565476298332214, + "learning_rate": 4.542995839112344e-06, + "loss": 0.006, + "step": 7879 + }, + { + "epoch": 5.458953931416696, + "grad_norm": 0.3185814917087555, + "learning_rate": 4.542302357836339e-06, + "loss": 0.0076, + "step": 7880 + }, + { + "epoch": 5.45964669206789, + "grad_norm": 0.40839284658432007, + "learning_rate": 4.541608876560334e-06, + "loss": 0.0084, + "step": 7881 + }, + { + "epoch": 5.460339452719086, + "grad_norm": 0.39189034700393677, + "learning_rate": 4.540915395284328e-06, + "loss": 0.0078, + "step": 7882 + }, + { + "epoch": 5.461032213370281, + "grad_norm": 0.2857482433319092, + "learning_rate": 4.540221914008322e-06, + "loss": 0.007, + "step": 7883 + }, + { + "epoch": 5.461724974021475, + "grad_norm": 0.35811492800712585, + "learning_rate": 4.539528432732316e-06, + "loss": 0.0086, + "step": 7884 + }, + { + "epoch": 5.4624177346726706, + "grad_norm": 0.3011959195137024, + "learning_rate": 4.538834951456311e-06, + "loss": 0.0069, + "step": 7885 + }, + { + "epoch": 5.463110495323866, + "grad_norm": 0.24721817672252655, + "learning_rate": 4.538141470180306e-06, + "loss": 0.0057, + "step": 7886 + }, + { + "epoch": 5.46380325597506, + "grad_norm": 0.3397451937198639, + "learning_rate": 4.5374479889043e-06, + "loss": 0.0075, + "step": 7887 + }, + { + "epoch": 5.4644960166262555, + "grad_norm": 0.3887694478034973, + "learning_rate": 4.536754507628294e-06, + "loss": 0.0083, + "step": 7888 + }, + { + "epoch": 5.465188777277451, + "grad_norm": 0.396060973405838, + "learning_rate": 4.536061026352289e-06, + "loss": 0.0092, + "step": 7889 + }, + { + "epoch": 5.465881537928646, + "grad_norm": 0.38492250442504883, + "learning_rate": 4.535367545076284e-06, + "loss": 0.0064, + "step": 7890 + }, + { + "epoch": 5.4665742985798405, + "grad_norm": 0.331540584564209, + "learning_rate": 4.534674063800278e-06, + "loss": 0.0092, + "step": 7891 + }, + { + "epoch": 5.467267059231036, + "grad_norm": 0.5112343430519104, + "learning_rate": 4.533980582524272e-06, + "loss": 0.0098, + "step": 7892 + }, + { + "epoch": 5.467959819882231, + "grad_norm": 0.21132290363311768, + "learning_rate": 4.533287101248266e-06, + "loss": 0.0051, + "step": 7893 + }, + { + "epoch": 5.468652580533425, + "grad_norm": 0.2255939096212387, + "learning_rate": 4.532593619972261e-06, + "loss": 0.0045, + "step": 7894 + }, + { + "epoch": 5.469345341184621, + "grad_norm": 0.3799256384372711, + "learning_rate": 4.531900138696256e-06, + "loss": 0.0072, + "step": 7895 + }, + { + "epoch": 5.470038101835816, + "grad_norm": 0.2559894323348999, + "learning_rate": 4.53120665742025e-06, + "loss": 0.0064, + "step": 7896 + }, + { + "epoch": 5.47073086248701, + "grad_norm": 0.27032461762428284, + "learning_rate": 4.530513176144244e-06, + "loss": 0.0076, + "step": 7897 + }, + { + "epoch": 5.471423623138206, + "grad_norm": 0.40068286657333374, + "learning_rate": 4.529819694868239e-06, + "loss": 0.0115, + "step": 7898 + }, + { + "epoch": 5.472116383789401, + "grad_norm": 0.33502882719039917, + "learning_rate": 4.529126213592233e-06, + "loss": 0.0078, + "step": 7899 + }, + { + "epoch": 5.472809144440596, + "grad_norm": 0.3175271153450012, + "learning_rate": 4.528432732316228e-06, + "loss": 0.0077, + "step": 7900 + }, + { + "epoch": 5.473501905091791, + "grad_norm": 0.2385866641998291, + "learning_rate": 4.527739251040222e-06, + "loss": 0.0054, + "step": 7901 + }, + { + "epoch": 5.474194665742986, + "grad_norm": 0.2599616050720215, + "learning_rate": 4.527045769764216e-06, + "loss": 0.0054, + "step": 7902 + }, + { + "epoch": 5.474887426394181, + "grad_norm": 0.30545392632484436, + "learning_rate": 4.526352288488211e-06, + "loss": 0.007, + "step": 7903 + }, + { + "epoch": 5.4755801870453755, + "grad_norm": 0.32731249928474426, + "learning_rate": 4.525658807212205e-06, + "loss": 0.0089, + "step": 7904 + }, + { + "epoch": 5.476272947696571, + "grad_norm": 0.37971776723861694, + "learning_rate": 4.5249653259362e-06, + "loss": 0.0076, + "step": 7905 + }, + { + "epoch": 5.476965708347766, + "grad_norm": 0.40346983075141907, + "learning_rate": 4.524271844660194e-06, + "loss": 0.0058, + "step": 7906 + }, + { + "epoch": 5.4776584689989605, + "grad_norm": 0.37472909688949585, + "learning_rate": 4.523578363384189e-06, + "loss": 0.0075, + "step": 7907 + }, + { + "epoch": 5.478351229650156, + "grad_norm": 0.6266420483589172, + "learning_rate": 4.522884882108183e-06, + "loss": 0.0102, + "step": 7908 + }, + { + "epoch": 5.479043990301351, + "grad_norm": 0.43866774439811707, + "learning_rate": 4.5221914008321775e-06, + "loss": 0.0077, + "step": 7909 + }, + { + "epoch": 5.479736750952546, + "grad_norm": 0.3060195744037628, + "learning_rate": 4.5214979195561724e-06, + "loss": 0.0057, + "step": 7910 + }, + { + "epoch": 5.480429511603741, + "grad_norm": 0.3488198518753052, + "learning_rate": 4.5208044382801665e-06, + "loss": 0.0082, + "step": 7911 + }, + { + "epoch": 5.481122272254936, + "grad_norm": 0.2779553532600403, + "learning_rate": 4.5201109570041614e-06, + "loss": 0.0057, + "step": 7912 + }, + { + "epoch": 5.481815032906131, + "grad_norm": 0.20279371738433838, + "learning_rate": 4.5194174757281555e-06, + "loss": 0.0055, + "step": 7913 + }, + { + "epoch": 5.482507793557326, + "grad_norm": 0.24038439989089966, + "learning_rate": 4.51872399445215e-06, + "loss": 0.0046, + "step": 7914 + }, + { + "epoch": 5.483200554208521, + "grad_norm": 0.28487125039100647, + "learning_rate": 4.5180305131761446e-06, + "loss": 0.0063, + "step": 7915 + }, + { + "epoch": 5.483893314859716, + "grad_norm": 0.2544689178466797, + "learning_rate": 4.5173370319001395e-06, + "loss": 0.007, + "step": 7916 + }, + { + "epoch": 5.484586075510911, + "grad_norm": 0.3484251797199249, + "learning_rate": 4.5166435506241336e-06, + "loss": 0.0096, + "step": 7917 + }, + { + "epoch": 5.485278836162106, + "grad_norm": 0.2558421790599823, + "learning_rate": 4.515950069348128e-06, + "loss": 0.0055, + "step": 7918 + }, + { + "epoch": 5.485971596813301, + "grad_norm": 0.28450798988342285, + "learning_rate": 4.515256588072123e-06, + "loss": 0.006, + "step": 7919 + }, + { + "epoch": 5.486664357464496, + "grad_norm": 0.30505290627479553, + "learning_rate": 4.514563106796117e-06, + "loss": 0.0054, + "step": 7920 + }, + { + "epoch": 5.487357118115691, + "grad_norm": 0.32605981826782227, + "learning_rate": 4.513869625520112e-06, + "loss": 0.0102, + "step": 7921 + }, + { + "epoch": 5.488049878766886, + "grad_norm": 0.3867191672325134, + "learning_rate": 4.513176144244106e-06, + "loss": 0.0097, + "step": 7922 + }, + { + "epoch": 5.488742639418081, + "grad_norm": 0.33257973194122314, + "learning_rate": 4.5124826629681e-06, + "loss": 0.0103, + "step": 7923 + }, + { + "epoch": 5.489435400069276, + "grad_norm": 0.3549240231513977, + "learning_rate": 4.511789181692095e-06, + "loss": 0.0079, + "step": 7924 + }, + { + "epoch": 5.490128160720471, + "grad_norm": 0.530772864818573, + "learning_rate": 4.51109570041609e-06, + "loss": 0.0091, + "step": 7925 + }, + { + "epoch": 5.490820921371666, + "grad_norm": 0.5482567548751831, + "learning_rate": 4.510402219140084e-06, + "loss": 0.0077, + "step": 7926 + }, + { + "epoch": 5.491513682022861, + "grad_norm": 0.24993811547756195, + "learning_rate": 4.509708737864078e-06, + "loss": 0.0061, + "step": 7927 + }, + { + "epoch": 5.492206442674056, + "grad_norm": 0.43241819739341736, + "learning_rate": 4.509015256588073e-06, + "loss": 0.0081, + "step": 7928 + }, + { + "epoch": 5.492899203325251, + "grad_norm": 0.48345640301704407, + "learning_rate": 4.508321775312067e-06, + "loss": 0.0136, + "step": 7929 + }, + { + "epoch": 5.493591963976446, + "grad_norm": 0.2781326472759247, + "learning_rate": 4.507628294036062e-06, + "loss": 0.006, + "step": 7930 + }, + { + "epoch": 5.494284724627641, + "grad_norm": 0.3042835593223572, + "learning_rate": 4.506934812760056e-06, + "loss": 0.0077, + "step": 7931 + }, + { + "epoch": 5.494977485278836, + "grad_norm": 0.32773521542549133, + "learning_rate": 4.50624133148405e-06, + "loss": 0.0066, + "step": 7932 + }, + { + "epoch": 5.4956702459300315, + "grad_norm": 0.41129395365715027, + "learning_rate": 4.505547850208045e-06, + "loss": 0.0077, + "step": 7933 + }, + { + "epoch": 5.496363006581226, + "grad_norm": 0.3410695493221283, + "learning_rate": 4.504854368932039e-06, + "loss": 0.0073, + "step": 7934 + }, + { + "epoch": 5.497055767232421, + "grad_norm": 0.41703978180885315, + "learning_rate": 4.504160887656034e-06, + "loss": 0.0084, + "step": 7935 + }, + { + "epoch": 5.4977485278836165, + "grad_norm": 0.5004681944847107, + "learning_rate": 4.503467406380028e-06, + "loss": 0.0082, + "step": 7936 + }, + { + "epoch": 5.498441288534811, + "grad_norm": 0.35817739367485046, + "learning_rate": 4.502773925104023e-06, + "loss": 0.007, + "step": 7937 + }, + { + "epoch": 5.499134049186006, + "grad_norm": 0.2824252247810364, + "learning_rate": 4.502080443828017e-06, + "loss": 0.0046, + "step": 7938 + }, + { + "epoch": 5.499826809837201, + "grad_norm": 0.3260112404823303, + "learning_rate": 4.501386962552011e-06, + "loss": 0.007, + "step": 7939 + }, + { + "epoch": 5.500519570488397, + "grad_norm": 0.2906835079193115, + "learning_rate": 4.500693481276006e-06, + "loss": 0.0053, + "step": 7940 + }, + { + "epoch": 5.501212331139591, + "grad_norm": 0.36243653297424316, + "learning_rate": 4.5e-06, + "loss": 0.0059, + "step": 7941 + }, + { + "epoch": 5.501905091790786, + "grad_norm": 0.3283509314060211, + "learning_rate": 4.499306518723995e-06, + "loss": 0.0064, + "step": 7942 + }, + { + "epoch": 5.502597852441982, + "grad_norm": 0.4903525114059448, + "learning_rate": 4.498613037447989e-06, + "loss": 0.0085, + "step": 7943 + }, + { + "epoch": 5.503290613093176, + "grad_norm": 0.48571836948394775, + "learning_rate": 4.497919556171983e-06, + "loss": 0.0077, + "step": 7944 + }, + { + "epoch": 5.503983373744371, + "grad_norm": 0.40723055601119995, + "learning_rate": 4.497226074895978e-06, + "loss": 0.0111, + "step": 7945 + }, + { + "epoch": 5.504676134395567, + "grad_norm": 0.2866261303424835, + "learning_rate": 4.496532593619973e-06, + "loss": 0.005, + "step": 7946 + }, + { + "epoch": 5.505368895046761, + "grad_norm": 0.2458246499300003, + "learning_rate": 4.495839112343967e-06, + "loss": 0.005, + "step": 7947 + }, + { + "epoch": 5.506061655697956, + "grad_norm": 0.5143068432807922, + "learning_rate": 4.495145631067961e-06, + "loss": 0.008, + "step": 7948 + }, + { + "epoch": 5.5067544163491515, + "grad_norm": 0.32183995842933655, + "learning_rate": 4.494452149791955e-06, + "loss": 0.0075, + "step": 7949 + }, + { + "epoch": 5.507447177000346, + "grad_norm": 0.2839590609073639, + "learning_rate": 4.49375866851595e-06, + "loss": 0.0058, + "step": 7950 + }, + { + "epoch": 5.508139937651541, + "grad_norm": 0.40156957507133484, + "learning_rate": 4.493065187239945e-06, + "loss": 0.0053, + "step": 7951 + }, + { + "epoch": 5.5088326983027365, + "grad_norm": 0.169826939702034, + "learning_rate": 4.492371705963939e-06, + "loss": 0.0039, + "step": 7952 + }, + { + "epoch": 5.509525458953932, + "grad_norm": 0.3291435241699219, + "learning_rate": 4.491678224687933e-06, + "loss": 0.01, + "step": 7953 + }, + { + "epoch": 5.510218219605126, + "grad_norm": 0.28348952531814575, + "learning_rate": 4.490984743411928e-06, + "loss": 0.0056, + "step": 7954 + }, + { + "epoch": 5.510910980256321, + "grad_norm": 0.2896806299686432, + "learning_rate": 4.490291262135923e-06, + "loss": 0.0072, + "step": 7955 + }, + { + "epoch": 5.511603740907517, + "grad_norm": 0.30665791034698486, + "learning_rate": 4.489597780859917e-06, + "loss": 0.0054, + "step": 7956 + }, + { + "epoch": 5.512296501558711, + "grad_norm": 0.3430306613445282, + "learning_rate": 4.4889042995839114e-06, + "loss": 0.0065, + "step": 7957 + }, + { + "epoch": 5.512989262209906, + "grad_norm": 0.40963706374168396, + "learning_rate": 4.488210818307906e-06, + "loss": 0.0062, + "step": 7958 + }, + { + "epoch": 5.513682022861102, + "grad_norm": 0.3337137997150421, + "learning_rate": 4.4875173370319004e-06, + "loss": 0.0076, + "step": 7959 + }, + { + "epoch": 5.514374783512297, + "grad_norm": 0.39136621356010437, + "learning_rate": 4.486823855755895e-06, + "loss": 0.008, + "step": 7960 + }, + { + "epoch": 5.515067544163491, + "grad_norm": 0.35691195726394653, + "learning_rate": 4.4861303744798895e-06, + "loss": 0.0074, + "step": 7961 + }, + { + "epoch": 5.515760304814687, + "grad_norm": 0.40752115845680237, + "learning_rate": 4.4854368932038836e-06, + "loss": 0.0067, + "step": 7962 + }, + { + "epoch": 5.516453065465882, + "grad_norm": 0.6623293161392212, + "learning_rate": 4.4847434119278785e-06, + "loss": 0.0132, + "step": 7963 + }, + { + "epoch": 5.517145826117076, + "grad_norm": 0.2455042600631714, + "learning_rate": 4.4840499306518726e-06, + "loss": 0.005, + "step": 7964 + }, + { + "epoch": 5.5178385867682715, + "grad_norm": 0.41237589716911316, + "learning_rate": 4.4833564493758675e-06, + "loss": 0.0058, + "step": 7965 + }, + { + "epoch": 5.518531347419467, + "grad_norm": 0.4208453297615051, + "learning_rate": 4.482662968099862e-06, + "loss": 0.0068, + "step": 7966 + }, + { + "epoch": 5.519224108070661, + "grad_norm": 0.46869662404060364, + "learning_rate": 4.4819694868238565e-06, + "loss": 0.0131, + "step": 7967 + }, + { + "epoch": 5.5199168687218565, + "grad_norm": 0.5133314728736877, + "learning_rate": 4.481276005547851e-06, + "loss": 0.0109, + "step": 7968 + }, + { + "epoch": 5.520609629373052, + "grad_norm": 0.28814154863357544, + "learning_rate": 4.480582524271845e-06, + "loss": 0.0059, + "step": 7969 + }, + { + "epoch": 5.521302390024246, + "grad_norm": 0.36473405361175537, + "learning_rate": 4.47988904299584e-06, + "loss": 0.0096, + "step": 7970 + }, + { + "epoch": 5.521995150675441, + "grad_norm": 0.3517254889011383, + "learning_rate": 4.479195561719834e-06, + "loss": 0.0085, + "step": 7971 + }, + { + "epoch": 5.522687911326637, + "grad_norm": 0.2761370837688446, + "learning_rate": 4.478502080443829e-06, + "loss": 0.0062, + "step": 7972 + }, + { + "epoch": 5.523380671977832, + "grad_norm": 0.3428029716014862, + "learning_rate": 4.477808599167823e-06, + "loss": 0.0074, + "step": 7973 + }, + { + "epoch": 5.524073432629026, + "grad_norm": 0.36943402886390686, + "learning_rate": 4.477115117891817e-06, + "loss": 0.0084, + "step": 7974 + }, + { + "epoch": 5.524766193280222, + "grad_norm": 0.48313096165657043, + "learning_rate": 4.476421636615812e-06, + "loss": 0.011, + "step": 7975 + }, + { + "epoch": 5.525458953931417, + "grad_norm": 0.32100197672843933, + "learning_rate": 4.475728155339807e-06, + "loss": 0.0062, + "step": 7976 + }, + { + "epoch": 5.526151714582611, + "grad_norm": 0.2223215103149414, + "learning_rate": 4.475034674063801e-06, + "loss": 0.0049, + "step": 7977 + }, + { + "epoch": 5.526844475233807, + "grad_norm": 0.3528873324394226, + "learning_rate": 4.474341192787795e-06, + "loss": 0.0075, + "step": 7978 + }, + { + "epoch": 5.527537235885002, + "grad_norm": 0.29667994379997253, + "learning_rate": 4.473647711511789e-06, + "loss": 0.0074, + "step": 7979 + }, + { + "epoch": 5.528229996536197, + "grad_norm": 0.2997491657733917, + "learning_rate": 4.472954230235784e-06, + "loss": 0.0075, + "step": 7980 + }, + { + "epoch": 5.528922757187392, + "grad_norm": 0.26284554600715637, + "learning_rate": 4.472260748959779e-06, + "loss": 0.0059, + "step": 7981 + }, + { + "epoch": 5.529615517838587, + "grad_norm": 0.4108389616012573, + "learning_rate": 4.471567267683773e-06, + "loss": 0.0085, + "step": 7982 + }, + { + "epoch": 5.530308278489782, + "grad_norm": 0.4730110764503479, + "learning_rate": 4.470873786407767e-06, + "loss": 0.0072, + "step": 7983 + }, + { + "epoch": 5.5310010391409765, + "grad_norm": 0.5832785367965698, + "learning_rate": 4.470180305131762e-06, + "loss": 0.0112, + "step": 7984 + }, + { + "epoch": 5.531693799792172, + "grad_norm": 0.3889367878437042, + "learning_rate": 4.469486823855757e-06, + "loss": 0.0063, + "step": 7985 + }, + { + "epoch": 5.532386560443367, + "grad_norm": 0.332314670085907, + "learning_rate": 4.468793342579751e-06, + "loss": 0.0072, + "step": 7986 + }, + { + "epoch": 5.5330793210945615, + "grad_norm": 0.32923293113708496, + "learning_rate": 4.468099861303745e-06, + "loss": 0.0068, + "step": 7987 + }, + { + "epoch": 5.533772081745757, + "grad_norm": 0.3131104111671448, + "learning_rate": 4.467406380027739e-06, + "loss": 0.0077, + "step": 7988 + }, + { + "epoch": 5.534464842396952, + "grad_norm": 0.5339545011520386, + "learning_rate": 4.466712898751734e-06, + "loss": 0.0098, + "step": 7989 + }, + { + "epoch": 5.535157603048146, + "grad_norm": 0.32775241136550903, + "learning_rate": 4.466019417475729e-06, + "loss": 0.0063, + "step": 7990 + }, + { + "epoch": 5.535850363699342, + "grad_norm": 0.2774968445301056, + "learning_rate": 4.465325936199723e-06, + "loss": 0.005, + "step": 7991 + }, + { + "epoch": 5.536543124350537, + "grad_norm": 0.33874380588531494, + "learning_rate": 4.464632454923717e-06, + "loss": 0.0069, + "step": 7992 + }, + { + "epoch": 5.537235885001732, + "grad_norm": 0.30800774693489075, + "learning_rate": 4.463938973647712e-06, + "loss": 0.005, + "step": 7993 + }, + { + "epoch": 5.537928645652927, + "grad_norm": 0.3378745913505554, + "learning_rate": 4.463245492371706e-06, + "loss": 0.0076, + "step": 7994 + }, + { + "epoch": 5.538621406304122, + "grad_norm": 0.3484329879283905, + "learning_rate": 4.462552011095701e-06, + "loss": 0.007, + "step": 7995 + }, + { + "epoch": 5.539314166955317, + "grad_norm": 0.3457847833633423, + "learning_rate": 4.461858529819695e-06, + "loss": 0.0086, + "step": 7996 + }, + { + "epoch": 5.540006927606512, + "grad_norm": 0.3561868965625763, + "learning_rate": 4.461165048543689e-06, + "loss": 0.0062, + "step": 7997 + }, + { + "epoch": 5.540699688257707, + "grad_norm": 0.3368603587150574, + "learning_rate": 4.460471567267684e-06, + "loss": 0.0083, + "step": 7998 + }, + { + "epoch": 5.541392448908902, + "grad_norm": 0.46497246623039246, + "learning_rate": 4.459778085991678e-06, + "loss": 0.006, + "step": 7999 + }, + { + "epoch": 5.542085209560097, + "grad_norm": 0.5414943695068359, + "learning_rate": 4.459084604715673e-06, + "loss": 0.0078, + "step": 8000 + }, + { + "epoch": 5.542777970211292, + "grad_norm": 0.47564202547073364, + "learning_rate": 4.458391123439667e-06, + "loss": 0.0089, + "step": 8001 + }, + { + "epoch": 5.543470730862487, + "grad_norm": 0.2951892018318176, + "learning_rate": 4.457697642163662e-06, + "loss": 0.0056, + "step": 8002 + }, + { + "epoch": 5.544163491513682, + "grad_norm": 0.24927709996700287, + "learning_rate": 4.457004160887656e-06, + "loss": 0.0059, + "step": 8003 + }, + { + "epoch": 5.544856252164877, + "grad_norm": 0.44315189123153687, + "learning_rate": 4.4563106796116504e-06, + "loss": 0.0095, + "step": 8004 + }, + { + "epoch": 5.545549012816072, + "grad_norm": 0.44241759181022644, + "learning_rate": 4.455617198335645e-06, + "loss": 0.0113, + "step": 8005 + }, + { + "epoch": 5.546241773467267, + "grad_norm": 0.3930674195289612, + "learning_rate": 4.45492371705964e-06, + "loss": 0.0071, + "step": 8006 + }, + { + "epoch": 5.546934534118462, + "grad_norm": 0.3861844837665558, + "learning_rate": 4.454230235783634e-06, + "loss": 0.007, + "step": 8007 + }, + { + "epoch": 5.547627294769657, + "grad_norm": 0.2811047434806824, + "learning_rate": 4.4535367545076285e-06, + "loss": 0.0067, + "step": 8008 + }, + { + "epoch": 5.548320055420852, + "grad_norm": 0.43750813603401184, + "learning_rate": 4.4528432732316226e-06, + "loss": 0.0077, + "step": 8009 + }, + { + "epoch": 5.549012816072047, + "grad_norm": 0.26658254861831665, + "learning_rate": 4.4521497919556175e-06, + "loss": 0.0056, + "step": 8010 + }, + { + "epoch": 5.549705576723242, + "grad_norm": 0.5692890286445618, + "learning_rate": 4.451456310679612e-06, + "loss": 0.0111, + "step": 8011 + }, + { + "epoch": 5.550398337374437, + "grad_norm": 0.4793515205383301, + "learning_rate": 4.4507628294036065e-06, + "loss": 0.0093, + "step": 8012 + }, + { + "epoch": 5.5510910980256325, + "grad_norm": 0.34090977907180786, + "learning_rate": 4.450069348127601e-06, + "loss": 0.0099, + "step": 8013 + }, + { + "epoch": 5.551783858676827, + "grad_norm": 0.5725679397583008, + "learning_rate": 4.4493758668515955e-06, + "loss": 0.0099, + "step": 8014 + }, + { + "epoch": 5.552476619328022, + "grad_norm": 0.30535051226615906, + "learning_rate": 4.4486823855755905e-06, + "loss": 0.0042, + "step": 8015 + }, + { + "epoch": 5.5531693799792174, + "grad_norm": 0.45637768507003784, + "learning_rate": 4.4479889042995845e-06, + "loss": 0.0106, + "step": 8016 + }, + { + "epoch": 5.553862140630412, + "grad_norm": 0.31168192625045776, + "learning_rate": 4.447295423023579e-06, + "loss": 0.006, + "step": 8017 + }, + { + "epoch": 5.554554901281607, + "grad_norm": 0.29973074793815613, + "learning_rate": 4.446601941747573e-06, + "loss": 0.006, + "step": 8018 + }, + { + "epoch": 5.555247661932802, + "grad_norm": 0.5181790590286255, + "learning_rate": 4.445908460471568e-06, + "loss": 0.0067, + "step": 8019 + }, + { + "epoch": 5.555940422583998, + "grad_norm": 0.573112964630127, + "learning_rate": 4.445214979195563e-06, + "loss": 0.0104, + "step": 8020 + }, + { + "epoch": 5.556633183235192, + "grad_norm": 1.0122028589248657, + "learning_rate": 4.444521497919557e-06, + "loss": 0.0075, + "step": 8021 + }, + { + "epoch": 5.557325943886387, + "grad_norm": 0.35621264576911926, + "learning_rate": 4.443828016643551e-06, + "loss": 0.0066, + "step": 8022 + }, + { + "epoch": 5.558018704537583, + "grad_norm": 0.4715633690357208, + "learning_rate": 4.443134535367546e-06, + "loss": 0.0085, + "step": 8023 + }, + { + "epoch": 5.558711465188777, + "grad_norm": 0.5500308275222778, + "learning_rate": 4.44244105409154e-06, + "loss": 0.0074, + "step": 8024 + }, + { + "epoch": 5.559404225839972, + "grad_norm": 0.48288464546203613, + "learning_rate": 4.441747572815535e-06, + "loss": 0.009, + "step": 8025 + }, + { + "epoch": 5.560096986491168, + "grad_norm": 0.36384445428848267, + "learning_rate": 4.441054091539529e-06, + "loss": 0.0085, + "step": 8026 + }, + { + "epoch": 5.560789747142362, + "grad_norm": 0.4242739975452423, + "learning_rate": 4.440360610263523e-06, + "loss": 0.0125, + "step": 8027 + }, + { + "epoch": 5.561482507793557, + "grad_norm": 0.30751749873161316, + "learning_rate": 4.439667128987518e-06, + "loss": 0.0072, + "step": 8028 + }, + { + "epoch": 5.5621752684447525, + "grad_norm": 0.3378515839576721, + "learning_rate": 4.438973647711512e-06, + "loss": 0.0083, + "step": 8029 + }, + { + "epoch": 5.562868029095947, + "grad_norm": 0.4439280331134796, + "learning_rate": 4.438280166435507e-06, + "loss": 0.0136, + "step": 8030 + }, + { + "epoch": 5.563560789747142, + "grad_norm": 0.33129703998565674, + "learning_rate": 4.437586685159501e-06, + "loss": 0.0097, + "step": 8031 + }, + { + "epoch": 5.5642535503983375, + "grad_norm": 0.25259390473365784, + "learning_rate": 4.436893203883496e-06, + "loss": 0.0074, + "step": 8032 + }, + { + "epoch": 5.564946311049533, + "grad_norm": 0.27036571502685547, + "learning_rate": 4.43619972260749e-06, + "loss": 0.0066, + "step": 8033 + }, + { + "epoch": 5.565639071700727, + "grad_norm": 0.30866026878356934, + "learning_rate": 4.435506241331484e-06, + "loss": 0.0066, + "step": 8034 + }, + { + "epoch": 5.566331832351922, + "grad_norm": 0.3763596713542938, + "learning_rate": 4.434812760055479e-06, + "loss": 0.0083, + "step": 8035 + }, + { + "epoch": 5.567024593003118, + "grad_norm": 0.44617846608161926, + "learning_rate": 4.434119278779473e-06, + "loss": 0.0074, + "step": 8036 + }, + { + "epoch": 5.567717353654312, + "grad_norm": 0.4959987699985504, + "learning_rate": 4.433425797503468e-06, + "loss": 0.0088, + "step": 8037 + }, + { + "epoch": 5.568410114305507, + "grad_norm": 0.41230276226997375, + "learning_rate": 4.432732316227462e-06, + "loss": 0.0081, + "step": 8038 + }, + { + "epoch": 5.569102874956703, + "grad_norm": 0.330306738615036, + "learning_rate": 4.432038834951456e-06, + "loss": 0.007, + "step": 8039 + }, + { + "epoch": 5.569795635607898, + "grad_norm": 0.33954522013664246, + "learning_rate": 4.431345353675451e-06, + "loss": 0.0066, + "step": 8040 + }, + { + "epoch": 5.570488396259092, + "grad_norm": 0.47080641984939575, + "learning_rate": 4.430651872399446e-06, + "loss": 0.0121, + "step": 8041 + }, + { + "epoch": 5.571181156910288, + "grad_norm": 0.3344554305076599, + "learning_rate": 4.42995839112344e-06, + "loss": 0.0079, + "step": 8042 + }, + { + "epoch": 5.571873917561483, + "grad_norm": 0.4823695421218872, + "learning_rate": 4.429264909847434e-06, + "loss": 0.0102, + "step": 8043 + }, + { + "epoch": 5.572566678212677, + "grad_norm": 0.35360637307167053, + "learning_rate": 4.428571428571429e-06, + "loss": 0.0057, + "step": 8044 + }, + { + "epoch": 5.5732594388638725, + "grad_norm": 0.31102123856544495, + "learning_rate": 4.427877947295424e-06, + "loss": 0.0062, + "step": 8045 + }, + { + "epoch": 5.573952199515068, + "grad_norm": 0.4725441038608551, + "learning_rate": 4.427184466019418e-06, + "loss": 0.0127, + "step": 8046 + }, + { + "epoch": 5.574644960166262, + "grad_norm": 0.7464017271995544, + "learning_rate": 4.426490984743412e-06, + "loss": 0.0073, + "step": 8047 + }, + { + "epoch": 5.5753377208174575, + "grad_norm": 0.4012281000614166, + "learning_rate": 4.425797503467406e-06, + "loss": 0.0092, + "step": 8048 + }, + { + "epoch": 5.576030481468653, + "grad_norm": 0.27611708641052246, + "learning_rate": 4.425104022191401e-06, + "loss": 0.0062, + "step": 8049 + }, + { + "epoch": 5.576723242119847, + "grad_norm": 0.3412626385688782, + "learning_rate": 4.424410540915396e-06, + "loss": 0.0063, + "step": 8050 + }, + { + "epoch": 5.577416002771042, + "grad_norm": 0.3446902930736542, + "learning_rate": 4.42371705963939e-06, + "loss": 0.0066, + "step": 8051 + }, + { + "epoch": 5.578108763422238, + "grad_norm": 0.30243048071861267, + "learning_rate": 4.423023578363384e-06, + "loss": 0.0075, + "step": 8052 + }, + { + "epoch": 5.578801524073433, + "grad_norm": 0.316084623336792, + "learning_rate": 4.422330097087379e-06, + "loss": 0.0095, + "step": 8053 + }, + { + "epoch": 5.579494284724627, + "grad_norm": 0.36083683371543884, + "learning_rate": 4.421636615811373e-06, + "loss": 0.0088, + "step": 8054 + }, + { + "epoch": 5.580187045375823, + "grad_norm": 0.4106943905353546, + "learning_rate": 4.420943134535368e-06, + "loss": 0.0079, + "step": 8055 + }, + { + "epoch": 5.580879806027018, + "grad_norm": 0.465559184551239, + "learning_rate": 4.420249653259362e-06, + "loss": 0.007, + "step": 8056 + }, + { + "epoch": 5.581572566678212, + "grad_norm": 0.3268836438655853, + "learning_rate": 4.4195561719833565e-06, + "loss": 0.0065, + "step": 8057 + }, + { + "epoch": 5.582265327329408, + "grad_norm": 0.458279550075531, + "learning_rate": 4.418862690707351e-06, + "loss": 0.0106, + "step": 8058 + }, + { + "epoch": 5.582958087980603, + "grad_norm": 0.2828100323677063, + "learning_rate": 4.4181692094313455e-06, + "loss": 0.0056, + "step": 8059 + }, + { + "epoch": 5.583650848631798, + "grad_norm": 0.5296974778175354, + "learning_rate": 4.4174757281553404e-06, + "loss": 0.0066, + "step": 8060 + }, + { + "epoch": 5.584343609282993, + "grad_norm": 0.43544599413871765, + "learning_rate": 4.4167822468793345e-06, + "loss": 0.0104, + "step": 8061 + }, + { + "epoch": 5.585036369934188, + "grad_norm": 0.3860195577144623, + "learning_rate": 4.4160887656033295e-06, + "loss": 0.0083, + "step": 8062 + }, + { + "epoch": 5.585729130585383, + "grad_norm": 0.3217155933380127, + "learning_rate": 4.4153952843273235e-06, + "loss": 0.0071, + "step": 8063 + }, + { + "epoch": 5.5864218912365775, + "grad_norm": 0.43554022908210754, + "learning_rate": 4.414701803051318e-06, + "loss": 0.0071, + "step": 8064 + }, + { + "epoch": 5.587114651887773, + "grad_norm": 0.34503835439682007, + "learning_rate": 4.4140083217753126e-06, + "loss": 0.0094, + "step": 8065 + }, + { + "epoch": 5.587807412538968, + "grad_norm": 0.33823269605636597, + "learning_rate": 4.413314840499307e-06, + "loss": 0.0071, + "step": 8066 + }, + { + "epoch": 5.5885001731901625, + "grad_norm": 0.35930171608924866, + "learning_rate": 4.412621359223302e-06, + "loss": 0.0078, + "step": 8067 + }, + { + "epoch": 5.589192933841358, + "grad_norm": 0.5095252394676208, + "learning_rate": 4.411927877947296e-06, + "loss": 0.0087, + "step": 8068 + }, + { + "epoch": 5.589885694492553, + "grad_norm": 0.28732743859291077, + "learning_rate": 4.41123439667129e-06, + "loss": 0.0063, + "step": 8069 + }, + { + "epoch": 5.590578455143747, + "grad_norm": 0.37983229756355286, + "learning_rate": 4.410540915395285e-06, + "loss": 0.0079, + "step": 8070 + }, + { + "epoch": 5.591271215794943, + "grad_norm": 0.5520676374435425, + "learning_rate": 4.40984743411928e-06, + "loss": 0.0136, + "step": 8071 + }, + { + "epoch": 5.591963976446138, + "grad_norm": 0.2988365590572357, + "learning_rate": 4.409153952843274e-06, + "loss": 0.0057, + "step": 8072 + }, + { + "epoch": 5.592656737097333, + "grad_norm": 0.33575090765953064, + "learning_rate": 4.408460471567268e-06, + "loss": 0.0101, + "step": 8073 + }, + { + "epoch": 5.593349497748528, + "grad_norm": 0.2537892758846283, + "learning_rate": 4.407766990291263e-06, + "loss": 0.0049, + "step": 8074 + }, + { + "epoch": 5.594042258399723, + "grad_norm": 0.31315284967422485, + "learning_rate": 4.407073509015257e-06, + "loss": 0.0073, + "step": 8075 + }, + { + "epoch": 5.594735019050918, + "grad_norm": 0.5253440737724304, + "learning_rate": 4.406380027739252e-06, + "loss": 0.0085, + "step": 8076 + }, + { + "epoch": 5.595427779702113, + "grad_norm": 0.5795570015907288, + "learning_rate": 4.405686546463246e-06, + "loss": 0.0092, + "step": 8077 + }, + { + "epoch": 5.596120540353308, + "grad_norm": 0.3438153564929962, + "learning_rate": 4.40499306518724e-06, + "loss": 0.0061, + "step": 8078 + }, + { + "epoch": 5.596813301004503, + "grad_norm": 0.33880171179771423, + "learning_rate": 4.404299583911235e-06, + "loss": 0.0084, + "step": 8079 + }, + { + "epoch": 5.597506061655698, + "grad_norm": 0.3033163547515869, + "learning_rate": 4.40360610263523e-06, + "loss": 0.0056, + "step": 8080 + }, + { + "epoch": 5.598198822306893, + "grad_norm": 0.28660401701927185, + "learning_rate": 4.402912621359224e-06, + "loss": 0.0071, + "step": 8081 + }, + { + "epoch": 5.598891582958088, + "grad_norm": 0.22764852643013, + "learning_rate": 4.402219140083218e-06, + "loss": 0.0069, + "step": 8082 + }, + { + "epoch": 5.599584343609283, + "grad_norm": 0.28526607155799866, + "learning_rate": 4.401525658807213e-06, + "loss": 0.0064, + "step": 8083 + }, + { + "epoch": 5.600277104260478, + "grad_norm": 0.36071881651878357, + "learning_rate": 4.400832177531207e-06, + "loss": 0.0082, + "step": 8084 + }, + { + "epoch": 5.600969864911673, + "grad_norm": 0.3269389569759369, + "learning_rate": 4.400138696255202e-06, + "loss": 0.0079, + "step": 8085 + }, + { + "epoch": 5.601662625562868, + "grad_norm": 0.3415747284889221, + "learning_rate": 4.399445214979196e-06, + "loss": 0.0078, + "step": 8086 + }, + { + "epoch": 5.602355386214063, + "grad_norm": 0.23844791948795319, + "learning_rate": 4.39875173370319e-06, + "loss": 0.0058, + "step": 8087 + }, + { + "epoch": 5.603048146865258, + "grad_norm": 0.49624690413475037, + "learning_rate": 4.398058252427185e-06, + "loss": 0.0078, + "step": 8088 + }, + { + "epoch": 5.603740907516453, + "grad_norm": 0.42024341225624084, + "learning_rate": 4.397364771151179e-06, + "loss": 0.0101, + "step": 8089 + }, + { + "epoch": 5.604433668167648, + "grad_norm": 0.4434609115123749, + "learning_rate": 4.396671289875174e-06, + "loss": 0.0077, + "step": 8090 + }, + { + "epoch": 5.605126428818843, + "grad_norm": 0.37759512662887573, + "learning_rate": 4.395977808599168e-06, + "loss": 0.009, + "step": 8091 + }, + { + "epoch": 5.605819189470038, + "grad_norm": 0.39756596088409424, + "learning_rate": 4.395284327323163e-06, + "loss": 0.0085, + "step": 8092 + }, + { + "epoch": 5.6065119501212335, + "grad_norm": 0.33069220185279846, + "learning_rate": 4.394590846047157e-06, + "loss": 0.0066, + "step": 8093 + }, + { + "epoch": 5.607204710772428, + "grad_norm": 0.3371417224407196, + "learning_rate": 4.393897364771151e-06, + "loss": 0.0044, + "step": 8094 + }, + { + "epoch": 5.607897471423623, + "grad_norm": 0.3080448508262634, + "learning_rate": 4.393203883495146e-06, + "loss": 0.0071, + "step": 8095 + }, + { + "epoch": 5.608590232074818, + "grad_norm": 0.30030423402786255, + "learning_rate": 4.39251040221914e-06, + "loss": 0.0049, + "step": 8096 + }, + { + "epoch": 5.609282992726013, + "grad_norm": 0.4218898117542267, + "learning_rate": 4.391816920943135e-06, + "loss": 0.0093, + "step": 8097 + }, + { + "epoch": 5.609975753377208, + "grad_norm": 0.3682861030101776, + "learning_rate": 4.391123439667129e-06, + "loss": 0.0069, + "step": 8098 + }, + { + "epoch": 5.610668514028403, + "grad_norm": 0.5085688829421997, + "learning_rate": 4.390429958391123e-06, + "loss": 0.0086, + "step": 8099 + }, + { + "epoch": 5.611361274679599, + "grad_norm": 0.37401247024536133, + "learning_rate": 4.389736477115118e-06, + "loss": 0.0081, + "step": 8100 + }, + { + "epoch": 5.612054035330793, + "grad_norm": 0.3405166566371918, + "learning_rate": 4.389042995839113e-06, + "loss": 0.0063, + "step": 8101 + }, + { + "epoch": 5.612746795981988, + "grad_norm": 0.279582679271698, + "learning_rate": 4.388349514563107e-06, + "loss": 0.007, + "step": 8102 + }, + { + "epoch": 5.613439556633184, + "grad_norm": 0.2885138690471649, + "learning_rate": 4.387656033287101e-06, + "loss": 0.0063, + "step": 8103 + }, + { + "epoch": 5.614132317284378, + "grad_norm": 0.2857093811035156, + "learning_rate": 4.3869625520110955e-06, + "loss": 0.0055, + "step": 8104 + }, + { + "epoch": 5.614825077935573, + "grad_norm": 0.3629360496997833, + "learning_rate": 4.38626907073509e-06, + "loss": 0.0074, + "step": 8105 + }, + { + "epoch": 5.615517838586769, + "grad_norm": 0.8724383115768433, + "learning_rate": 4.385575589459085e-06, + "loss": 0.0094, + "step": 8106 + }, + { + "epoch": 5.616210599237963, + "grad_norm": 0.48356372117996216, + "learning_rate": 4.3848821081830794e-06, + "loss": 0.0081, + "step": 8107 + }, + { + "epoch": 5.616903359889158, + "grad_norm": 0.299801230430603, + "learning_rate": 4.3841886269070735e-06, + "loss": 0.008, + "step": 8108 + }, + { + "epoch": 5.6175961205403535, + "grad_norm": 0.28150179982185364, + "learning_rate": 4.3834951456310685e-06, + "loss": 0.0053, + "step": 8109 + }, + { + "epoch": 5.618288881191548, + "grad_norm": 0.320665180683136, + "learning_rate": 4.382801664355063e-06, + "loss": 0.0057, + "step": 8110 + }, + { + "epoch": 5.618981641842743, + "grad_norm": 0.35002484917640686, + "learning_rate": 4.3821081830790575e-06, + "loss": 0.006, + "step": 8111 + }, + { + "epoch": 5.6196744024939385, + "grad_norm": 0.31823593378067017, + "learning_rate": 4.3814147018030516e-06, + "loss": 0.008, + "step": 8112 + }, + { + "epoch": 5.620367163145134, + "grad_norm": 0.3462357521057129, + "learning_rate": 4.380721220527046e-06, + "loss": 0.0056, + "step": 8113 + }, + { + "epoch": 5.621059923796328, + "grad_norm": 0.3695269525051117, + "learning_rate": 4.380027739251041e-06, + "loss": 0.0099, + "step": 8114 + }, + { + "epoch": 5.621752684447523, + "grad_norm": 0.4130893349647522, + "learning_rate": 4.3793342579750355e-06, + "loss": 0.0091, + "step": 8115 + }, + { + "epoch": 5.622445445098719, + "grad_norm": 0.40944939851760864, + "learning_rate": 4.37864077669903e-06, + "loss": 0.0083, + "step": 8116 + }, + { + "epoch": 5.623138205749913, + "grad_norm": 0.6065247654914856, + "learning_rate": 4.377947295423024e-06, + "loss": 0.0092, + "step": 8117 + }, + { + "epoch": 5.623830966401108, + "grad_norm": 0.4101960062980652, + "learning_rate": 4.377253814147019e-06, + "loss": 0.0083, + "step": 8118 + }, + { + "epoch": 5.624523727052304, + "grad_norm": 0.35071033239364624, + "learning_rate": 4.376560332871013e-06, + "loss": 0.0058, + "step": 8119 + }, + { + "epoch": 5.625216487703499, + "grad_norm": 0.34921368956565857, + "learning_rate": 4.375866851595008e-06, + "loss": 0.0091, + "step": 8120 + }, + { + "epoch": 5.625909248354693, + "grad_norm": 0.3462977409362793, + "learning_rate": 4.375173370319002e-06, + "loss": 0.0061, + "step": 8121 + }, + { + "epoch": 5.626602009005889, + "grad_norm": 0.35768160223960876, + "learning_rate": 4.374479889042997e-06, + "loss": 0.0085, + "step": 8122 + }, + { + "epoch": 5.627294769657084, + "grad_norm": 0.41447946429252625, + "learning_rate": 4.373786407766991e-06, + "loss": 0.0059, + "step": 8123 + }, + { + "epoch": 5.627987530308278, + "grad_norm": 0.37148943543434143, + "learning_rate": 4.373092926490985e-06, + "loss": 0.0072, + "step": 8124 + }, + { + "epoch": 5.6286802909594735, + "grad_norm": 0.34060126543045044, + "learning_rate": 4.372399445214979e-06, + "loss": 0.0083, + "step": 8125 + }, + { + "epoch": 5.629373051610669, + "grad_norm": 0.3396386206150055, + "learning_rate": 4.371705963938974e-06, + "loss": 0.0099, + "step": 8126 + }, + { + "epoch": 5.630065812261863, + "grad_norm": 0.3414992690086365, + "learning_rate": 4.371012482662969e-06, + "loss": 0.0092, + "step": 8127 + }, + { + "epoch": 5.6307585729130585, + "grad_norm": 0.487488329410553, + "learning_rate": 4.370319001386963e-06, + "loss": 0.0101, + "step": 8128 + }, + { + "epoch": 5.631451333564254, + "grad_norm": 0.3113594353199005, + "learning_rate": 4.369625520110957e-06, + "loss": 0.0072, + "step": 8129 + }, + { + "epoch": 5.632144094215448, + "grad_norm": 0.3420415222644806, + "learning_rate": 4.368932038834952e-06, + "loss": 0.0083, + "step": 8130 + }, + { + "epoch": 5.632836854866643, + "grad_norm": 0.5112042427062988, + "learning_rate": 4.368238557558947e-06, + "loss": 0.0064, + "step": 8131 + }, + { + "epoch": 5.633529615517839, + "grad_norm": 0.27378049492836, + "learning_rate": 4.367545076282941e-06, + "loss": 0.0073, + "step": 8132 + }, + { + "epoch": 5.634222376169034, + "grad_norm": 0.35176554322242737, + "learning_rate": 4.366851595006935e-06, + "loss": 0.0116, + "step": 8133 + }, + { + "epoch": 5.634915136820228, + "grad_norm": 0.40267062187194824, + "learning_rate": 4.366158113730929e-06, + "loss": 0.0086, + "step": 8134 + }, + { + "epoch": 5.635607897471424, + "grad_norm": 0.35776427388191223, + "learning_rate": 4.365464632454924e-06, + "loss": 0.0102, + "step": 8135 + }, + { + "epoch": 5.636300658122619, + "grad_norm": 0.34870702028274536, + "learning_rate": 4.364771151178919e-06, + "loss": 0.0065, + "step": 8136 + }, + { + "epoch": 5.636993418773813, + "grad_norm": 0.42673560976982117, + "learning_rate": 4.364077669902913e-06, + "loss": 0.007, + "step": 8137 + }, + { + "epoch": 5.637686179425009, + "grad_norm": 0.27707332372665405, + "learning_rate": 4.363384188626907e-06, + "loss": 0.0068, + "step": 8138 + }, + { + "epoch": 5.638378940076204, + "grad_norm": 0.2589033842086792, + "learning_rate": 4.362690707350902e-06, + "loss": 0.0053, + "step": 8139 + }, + { + "epoch": 5.639071700727399, + "grad_norm": 0.5663381218910217, + "learning_rate": 4.361997226074896e-06, + "loss": 0.0111, + "step": 8140 + }, + { + "epoch": 5.6397644613785936, + "grad_norm": 0.5260714888572693, + "learning_rate": 4.361303744798891e-06, + "loss": 0.0088, + "step": 8141 + }, + { + "epoch": 5.640457222029789, + "grad_norm": 0.3698810338973999, + "learning_rate": 4.360610263522885e-06, + "loss": 0.0097, + "step": 8142 + }, + { + "epoch": 5.641149982680984, + "grad_norm": 0.4451572000980377, + "learning_rate": 4.359916782246879e-06, + "loss": 0.008, + "step": 8143 + }, + { + "epoch": 5.6418427433321785, + "grad_norm": 0.4416818916797638, + "learning_rate": 4.359223300970874e-06, + "loss": 0.0076, + "step": 8144 + }, + { + "epoch": 5.642535503983374, + "grad_norm": 0.4071839451789856, + "learning_rate": 4.358529819694868e-06, + "loss": 0.007, + "step": 8145 + }, + { + "epoch": 5.643228264634569, + "grad_norm": 0.38316574692726135, + "learning_rate": 4.357836338418863e-06, + "loss": 0.0076, + "step": 8146 + }, + { + "epoch": 5.6439210252857634, + "grad_norm": 0.3717910945415497, + "learning_rate": 4.357142857142857e-06, + "loss": 0.0078, + "step": 8147 + }, + { + "epoch": 5.644613785936959, + "grad_norm": 0.2634609639644623, + "learning_rate": 4.356449375866852e-06, + "loss": 0.0061, + "step": 8148 + }, + { + "epoch": 5.645306546588154, + "grad_norm": 0.41189005970954895, + "learning_rate": 4.355755894590846e-06, + "loss": 0.0099, + "step": 8149 + }, + { + "epoch": 5.645999307239348, + "grad_norm": 0.41990867257118225, + "learning_rate": 4.35506241331484e-06, + "loss": 0.0067, + "step": 8150 + }, + { + "epoch": 5.646692067890544, + "grad_norm": 0.31547811627388, + "learning_rate": 4.354368932038835e-06, + "loss": 0.0064, + "step": 8151 + }, + { + "epoch": 5.647384828541739, + "grad_norm": 0.39213982224464417, + "learning_rate": 4.353675450762829e-06, + "loss": 0.0071, + "step": 8152 + }, + { + "epoch": 5.648077589192933, + "grad_norm": 0.2914406657218933, + "learning_rate": 4.352981969486824e-06, + "loss": 0.0042, + "step": 8153 + }, + { + "epoch": 5.648770349844129, + "grad_norm": 0.38466131687164307, + "learning_rate": 4.3522884882108184e-06, + "loss": 0.0127, + "step": 8154 + }, + { + "epoch": 5.649463110495324, + "grad_norm": 0.3522290289402008, + "learning_rate": 4.3515950069348125e-06, + "loss": 0.0057, + "step": 8155 + }, + { + "epoch": 5.650155871146519, + "grad_norm": 0.4373476803302765, + "learning_rate": 4.3509015256588075e-06, + "loss": 0.009, + "step": 8156 + }, + { + "epoch": 5.650848631797714, + "grad_norm": 0.30540671944618225, + "learning_rate": 4.350208044382802e-06, + "loss": 0.0074, + "step": 8157 + }, + { + "epoch": 5.651541392448909, + "grad_norm": 0.4660751223564148, + "learning_rate": 4.3495145631067965e-06, + "loss": 0.0086, + "step": 8158 + }, + { + "epoch": 5.652234153100104, + "grad_norm": 0.41871729493141174, + "learning_rate": 4.3488210818307906e-06, + "loss": 0.0091, + "step": 8159 + }, + { + "epoch": 5.652926913751299, + "grad_norm": 0.43146413564682007, + "learning_rate": 4.3481276005547855e-06, + "loss": 0.0098, + "step": 8160 + }, + { + "epoch": 5.653619674402494, + "grad_norm": 0.4287135899066925, + "learning_rate": 4.34743411927878e-06, + "loss": 0.0091, + "step": 8161 + }, + { + "epoch": 5.654312435053689, + "grad_norm": 0.3094720244407654, + "learning_rate": 4.3467406380027745e-06, + "loss": 0.0058, + "step": 8162 + }, + { + "epoch": 5.655005195704884, + "grad_norm": 0.7001898288726807, + "learning_rate": 4.346047156726769e-06, + "loss": 0.0068, + "step": 8163 + }, + { + "epoch": 5.655697956356079, + "grad_norm": 0.28040841221809387, + "learning_rate": 4.345353675450763e-06, + "loss": 0.0064, + "step": 8164 + }, + { + "epoch": 5.656390717007274, + "grad_norm": 0.4943999648094177, + "learning_rate": 4.344660194174758e-06, + "loss": 0.0097, + "step": 8165 + }, + { + "epoch": 5.657083477658469, + "grad_norm": 0.2820316553115845, + "learning_rate": 4.3439667128987526e-06, + "loss": 0.0052, + "step": 8166 + }, + { + "epoch": 5.657776238309664, + "grad_norm": 0.4877709746360779, + "learning_rate": 4.343273231622747e-06, + "loss": 0.0085, + "step": 8167 + }, + { + "epoch": 5.658468998960859, + "grad_norm": 0.36580315232276917, + "learning_rate": 4.342579750346741e-06, + "loss": 0.0068, + "step": 8168 + }, + { + "epoch": 5.659161759612054, + "grad_norm": 0.5931958556175232, + "learning_rate": 4.341886269070736e-06, + "loss": 0.0078, + "step": 8169 + }, + { + "epoch": 5.659854520263249, + "grad_norm": 0.3309081494808197, + "learning_rate": 4.34119278779473e-06, + "loss": 0.0093, + "step": 8170 + }, + { + "epoch": 5.660547280914444, + "grad_norm": 0.4991997182369232, + "learning_rate": 4.340499306518725e-06, + "loss": 0.0069, + "step": 8171 + }, + { + "epoch": 5.661240041565639, + "grad_norm": 0.23241691291332245, + "learning_rate": 4.339805825242719e-06, + "loss": 0.0047, + "step": 8172 + }, + { + "epoch": 5.661932802216834, + "grad_norm": 0.3474172353744507, + "learning_rate": 4.339112343966713e-06, + "loss": 0.0059, + "step": 8173 + }, + { + "epoch": 5.662625562868029, + "grad_norm": 0.3061229884624481, + "learning_rate": 4.338418862690708e-06, + "loss": 0.0073, + "step": 8174 + }, + { + "epoch": 5.663318323519224, + "grad_norm": 0.4371403753757477, + "learning_rate": 4.337725381414702e-06, + "loss": 0.0078, + "step": 8175 + }, + { + "epoch": 5.664011084170419, + "grad_norm": 0.3339928090572357, + "learning_rate": 4.337031900138697e-06, + "loss": 0.0105, + "step": 8176 + }, + { + "epoch": 5.664703844821614, + "grad_norm": 0.5151131749153137, + "learning_rate": 4.336338418862691e-06, + "loss": 0.0104, + "step": 8177 + }, + { + "epoch": 5.665396605472809, + "grad_norm": 0.33382347226142883, + "learning_rate": 4.335644937586686e-06, + "loss": 0.0078, + "step": 8178 + }, + { + "epoch": 5.666089366124004, + "grad_norm": 0.33786681294441223, + "learning_rate": 4.33495145631068e-06, + "loss": 0.0073, + "step": 8179 + }, + { + "epoch": 5.6667821267752, + "grad_norm": 0.32782819867134094, + "learning_rate": 4.334257975034674e-06, + "loss": 0.0081, + "step": 8180 + }, + { + "epoch": 5.667474887426394, + "grad_norm": 0.32676103711128235, + "learning_rate": 4.333564493758669e-06, + "loss": 0.0067, + "step": 8181 + }, + { + "epoch": 5.668167648077589, + "grad_norm": 0.44742289185523987, + "learning_rate": 4.332871012482663e-06, + "loss": 0.0072, + "step": 8182 + }, + { + "epoch": 5.668860408728785, + "grad_norm": 0.6208932995796204, + "learning_rate": 4.332177531206658e-06, + "loss": 0.0094, + "step": 8183 + }, + { + "epoch": 5.669553169379979, + "grad_norm": 0.43178027868270874, + "learning_rate": 4.331484049930652e-06, + "loss": 0.0078, + "step": 8184 + }, + { + "epoch": 5.670245930031174, + "grad_norm": 0.32541170716285706, + "learning_rate": 4.330790568654646e-06, + "loss": 0.0072, + "step": 8185 + }, + { + "epoch": 5.6709386906823696, + "grad_norm": 0.3351057469844818, + "learning_rate": 4.330097087378641e-06, + "loss": 0.0082, + "step": 8186 + }, + { + "epoch": 5.671631451333564, + "grad_norm": 0.4671008586883545, + "learning_rate": 4.329403606102636e-06, + "loss": 0.0093, + "step": 8187 + }, + { + "epoch": 5.672324211984759, + "grad_norm": 0.4235295057296753, + "learning_rate": 4.32871012482663e-06, + "loss": 0.0082, + "step": 8188 + }, + { + "epoch": 5.6730169726359545, + "grad_norm": 0.30958467721939087, + "learning_rate": 4.328016643550624e-06, + "loss": 0.0068, + "step": 8189 + }, + { + "epoch": 5.673709733287149, + "grad_norm": 0.2938699424266815, + "learning_rate": 4.327323162274619e-06, + "loss": 0.0057, + "step": 8190 + }, + { + "epoch": 5.674402493938344, + "grad_norm": 0.3453691303730011, + "learning_rate": 4.326629680998613e-06, + "loss": 0.0075, + "step": 8191 + }, + { + "epoch": 5.6750952545895395, + "grad_norm": 0.2740063965320587, + "learning_rate": 4.325936199722608e-06, + "loss": 0.0062, + "step": 8192 + }, + { + "epoch": 5.675788015240734, + "grad_norm": 0.4536712169647217, + "learning_rate": 4.325242718446602e-06, + "loss": 0.0118, + "step": 8193 + }, + { + "epoch": 5.676480775891929, + "grad_norm": 0.3655270040035248, + "learning_rate": 4.324549237170596e-06, + "loss": 0.0072, + "step": 8194 + }, + { + "epoch": 5.677173536543124, + "grad_norm": 0.2964143753051758, + "learning_rate": 4.323855755894591e-06, + "loss": 0.0065, + "step": 8195 + }, + { + "epoch": 5.67786629719432, + "grad_norm": 0.3953753709793091, + "learning_rate": 4.323162274618586e-06, + "loss": 0.0057, + "step": 8196 + }, + { + "epoch": 5.678559057845514, + "grad_norm": 0.230918750166893, + "learning_rate": 4.32246879334258e-06, + "loss": 0.0054, + "step": 8197 + }, + { + "epoch": 5.679251818496709, + "grad_norm": 0.32712507247924805, + "learning_rate": 4.321775312066574e-06, + "loss": 0.0086, + "step": 8198 + }, + { + "epoch": 5.679944579147905, + "grad_norm": 0.24037069082260132, + "learning_rate": 4.321081830790569e-06, + "loss": 0.0054, + "step": 8199 + }, + { + "epoch": 5.6806373397991, + "grad_norm": 0.34376099705696106, + "learning_rate": 4.320388349514563e-06, + "loss": 0.0065, + "step": 8200 + }, + { + "epoch": 5.681330100450294, + "grad_norm": 0.4202915132045746, + "learning_rate": 4.319694868238558e-06, + "loss": 0.0061, + "step": 8201 + }, + { + "epoch": 5.68202286110149, + "grad_norm": 0.3023999333381653, + "learning_rate": 4.319001386962552e-06, + "loss": 0.0069, + "step": 8202 + }, + { + "epoch": 5.682715621752685, + "grad_norm": 0.281676709651947, + "learning_rate": 4.3183079056865465e-06, + "loss": 0.0056, + "step": 8203 + }, + { + "epoch": 5.683408382403879, + "grad_norm": 0.41399192810058594, + "learning_rate": 4.317614424410541e-06, + "loss": 0.0104, + "step": 8204 + }, + { + "epoch": 5.6841011430550745, + "grad_norm": 0.349400132894516, + "learning_rate": 4.3169209431345355e-06, + "loss": 0.0065, + "step": 8205 + }, + { + "epoch": 5.68479390370627, + "grad_norm": 0.25694534182548523, + "learning_rate": 4.31622746185853e-06, + "loss": 0.0065, + "step": 8206 + }, + { + "epoch": 5.685486664357464, + "grad_norm": 0.3727432191371918, + "learning_rate": 4.3155339805825245e-06, + "loss": 0.0103, + "step": 8207 + }, + { + "epoch": 5.6861794250086595, + "grad_norm": 0.3524590730667114, + "learning_rate": 4.3148404993065194e-06, + "loss": 0.0075, + "step": 8208 + }, + { + "epoch": 5.686872185659855, + "grad_norm": 0.35577112436294556, + "learning_rate": 4.3141470180305135e-06, + "loss": 0.0099, + "step": 8209 + }, + { + "epoch": 5.687564946311049, + "grad_norm": 0.48710888624191284, + "learning_rate": 4.313453536754508e-06, + "loss": 0.0126, + "step": 8210 + }, + { + "epoch": 5.688257706962244, + "grad_norm": 0.3545190095901489, + "learning_rate": 4.3127600554785025e-06, + "loss": 0.0102, + "step": 8211 + }, + { + "epoch": 5.68895046761344, + "grad_norm": 0.3064030110836029, + "learning_rate": 4.312066574202497e-06, + "loss": 0.0066, + "step": 8212 + }, + { + "epoch": 5.689643228264634, + "grad_norm": 0.5038630366325378, + "learning_rate": 4.3113730929264916e-06, + "loss": 0.0075, + "step": 8213 + }, + { + "epoch": 5.690335988915829, + "grad_norm": 0.29983583092689514, + "learning_rate": 4.310679611650486e-06, + "loss": 0.0057, + "step": 8214 + }, + { + "epoch": 5.691028749567025, + "grad_norm": 0.6348174810409546, + "learning_rate": 4.30998613037448e-06, + "loss": 0.0089, + "step": 8215 + }, + { + "epoch": 5.69172151021822, + "grad_norm": 0.26088616251945496, + "learning_rate": 4.309292649098475e-06, + "loss": 0.0062, + "step": 8216 + }, + { + "epoch": 5.692414270869414, + "grad_norm": 0.3356310725212097, + "learning_rate": 4.30859916782247e-06, + "loss": 0.0072, + "step": 8217 + }, + { + "epoch": 5.69310703152061, + "grad_norm": 0.36710891127586365, + "learning_rate": 4.307905686546464e-06, + "loss": 0.0073, + "step": 8218 + }, + { + "epoch": 5.693799792171805, + "grad_norm": 0.3252546489238739, + "learning_rate": 4.307212205270458e-06, + "loss": 0.0064, + "step": 8219 + }, + { + "epoch": 5.694492552822999, + "grad_norm": 0.45620712637901306, + "learning_rate": 4.306518723994452e-06, + "loss": 0.0126, + "step": 8220 + }, + { + "epoch": 5.6951853134741945, + "grad_norm": 0.21465623378753662, + "learning_rate": 4.305825242718447e-06, + "loss": 0.0047, + "step": 8221 + }, + { + "epoch": 5.69587807412539, + "grad_norm": 0.40564826130867004, + "learning_rate": 4.305131761442442e-06, + "loss": 0.0074, + "step": 8222 + }, + { + "epoch": 5.696570834776585, + "grad_norm": 0.3543236255645752, + "learning_rate": 4.304438280166436e-06, + "loss": 0.0066, + "step": 8223 + }, + { + "epoch": 5.6972635954277795, + "grad_norm": 0.29081037640571594, + "learning_rate": 4.30374479889043e-06, + "loss": 0.0065, + "step": 8224 + }, + { + "epoch": 5.697956356078975, + "grad_norm": 0.26856741309165955, + "learning_rate": 4.303051317614425e-06, + "loss": 0.006, + "step": 8225 + }, + { + "epoch": 5.69864911673017, + "grad_norm": 0.42368850111961365, + "learning_rate": 4.30235783633842e-06, + "loss": 0.0066, + "step": 8226 + }, + { + "epoch": 5.699341877381364, + "grad_norm": 0.3613841235637665, + "learning_rate": 4.301664355062414e-06, + "loss": 0.0077, + "step": 8227 + }, + { + "epoch": 5.70003463803256, + "grad_norm": 0.314153254032135, + "learning_rate": 4.300970873786408e-06, + "loss": 0.0107, + "step": 8228 + }, + { + "epoch": 5.700727398683755, + "grad_norm": 0.3639216125011444, + "learning_rate": 4.300277392510402e-06, + "loss": 0.0095, + "step": 8229 + }, + { + "epoch": 5.701420159334949, + "grad_norm": 0.3123050630092621, + "learning_rate": 4.299583911234397e-06, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 5.702112919986145, + "grad_norm": 0.31937894225120544, + "learning_rate": 4.298890429958392e-06, + "loss": 0.0062, + "step": 8231 + }, + { + "epoch": 5.70280568063734, + "grad_norm": 0.43743598461151123, + "learning_rate": 4.298196948682386e-06, + "loss": 0.0144, + "step": 8232 + }, + { + "epoch": 5.703498441288534, + "grad_norm": 0.4170362055301666, + "learning_rate": 4.29750346740638e-06, + "loss": 0.007, + "step": 8233 + }, + { + "epoch": 5.70419120193973, + "grad_norm": 0.2906656861305237, + "learning_rate": 4.296809986130375e-06, + "loss": 0.0053, + "step": 8234 + }, + { + "epoch": 5.704883962590925, + "grad_norm": 0.2628159821033478, + "learning_rate": 4.296116504854369e-06, + "loss": 0.0054, + "step": 8235 + }, + { + "epoch": 5.70557672324212, + "grad_norm": 0.27946367859840393, + "learning_rate": 4.295423023578364e-06, + "loss": 0.0055, + "step": 8236 + }, + { + "epoch": 5.706269483893315, + "grad_norm": 0.40050429105758667, + "learning_rate": 4.294729542302358e-06, + "loss": 0.008, + "step": 8237 + }, + { + "epoch": 5.70696224454451, + "grad_norm": 0.6546024680137634, + "learning_rate": 4.294036061026353e-06, + "loss": 0.0105, + "step": 8238 + }, + { + "epoch": 5.707655005195705, + "grad_norm": 0.5611664056777954, + "learning_rate": 4.293342579750347e-06, + "loss": 0.0107, + "step": 8239 + }, + { + "epoch": 5.7083477658468995, + "grad_norm": 0.25876954197883606, + "learning_rate": 4.292649098474341e-06, + "loss": 0.0054, + "step": 8240 + }, + { + "epoch": 5.709040526498095, + "grad_norm": 0.38433337211608887, + "learning_rate": 4.291955617198336e-06, + "loss": 0.008, + "step": 8241 + }, + { + "epoch": 5.70973328714929, + "grad_norm": 0.294488787651062, + "learning_rate": 4.29126213592233e-06, + "loss": 0.0062, + "step": 8242 + }, + { + "epoch": 5.710426047800485, + "grad_norm": 0.3941182494163513, + "learning_rate": 4.290568654646325e-06, + "loss": 0.0076, + "step": 8243 + }, + { + "epoch": 5.71111880845168, + "grad_norm": 0.4684038460254669, + "learning_rate": 4.289875173370319e-06, + "loss": 0.0086, + "step": 8244 + }, + { + "epoch": 5.711811569102875, + "grad_norm": 0.2668977975845337, + "learning_rate": 4.289181692094313e-06, + "loss": 0.0048, + "step": 8245 + }, + { + "epoch": 5.71250432975407, + "grad_norm": 0.9417048692703247, + "learning_rate": 4.288488210818308e-06, + "loss": 0.0064, + "step": 8246 + }, + { + "epoch": 5.713197090405265, + "grad_norm": 0.40976911783218384, + "learning_rate": 4.287794729542303e-06, + "loss": 0.0063, + "step": 8247 + }, + { + "epoch": 5.71388985105646, + "grad_norm": 0.46163731813430786, + "learning_rate": 4.287101248266297e-06, + "loss": 0.0102, + "step": 8248 + }, + { + "epoch": 5.714582611707655, + "grad_norm": 0.2681505084037781, + "learning_rate": 4.286407766990291e-06, + "loss": 0.0061, + "step": 8249 + }, + { + "epoch": 5.71527537235885, + "grad_norm": 0.5496297478675842, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.008, + "step": 8250 + }, + { + "epoch": 5.715968133010045, + "grad_norm": 0.38196346163749695, + "learning_rate": 4.28502080443828e-06, + "loss": 0.0076, + "step": 8251 + }, + { + "epoch": 5.71666089366124, + "grad_norm": 0.8958604335784912, + "learning_rate": 4.284327323162275e-06, + "loss": 0.0133, + "step": 8252 + }, + { + "epoch": 5.717353654312435, + "grad_norm": 0.34037333726882935, + "learning_rate": 4.283633841886269e-06, + "loss": 0.0081, + "step": 8253 + }, + { + "epoch": 5.71804641496363, + "grad_norm": 0.34312424063682556, + "learning_rate": 4.2829403606102635e-06, + "loss": 0.0053, + "step": 8254 + }, + { + "epoch": 5.718739175614825, + "grad_norm": 0.3917454481124878, + "learning_rate": 4.2822468793342584e-06, + "loss": 0.0065, + "step": 8255 + }, + { + "epoch": 5.71943193626602, + "grad_norm": 0.2833869457244873, + "learning_rate": 4.281553398058253e-06, + "loss": 0.0092, + "step": 8256 + }, + { + "epoch": 5.720124696917215, + "grad_norm": 0.32059454917907715, + "learning_rate": 4.2808599167822475e-06, + "loss": 0.0058, + "step": 8257 + }, + { + "epoch": 5.72081745756841, + "grad_norm": 0.38847750425338745, + "learning_rate": 4.2801664355062415e-06, + "loss": 0.0135, + "step": 8258 + }, + { + "epoch": 5.721510218219605, + "grad_norm": 0.4522644877433777, + "learning_rate": 4.279472954230236e-06, + "loss": 0.009, + "step": 8259 + }, + { + "epoch": 5.7222029788708, + "grad_norm": 0.41455429792404175, + "learning_rate": 4.2787794729542306e-06, + "loss": 0.0106, + "step": 8260 + }, + { + "epoch": 5.722895739521995, + "grad_norm": 0.3959617614746094, + "learning_rate": 4.2780859916782255e-06, + "loss": 0.0077, + "step": 8261 + }, + { + "epoch": 5.72358850017319, + "grad_norm": 0.44272294640541077, + "learning_rate": 4.27739251040222e-06, + "loss": 0.01, + "step": 8262 + }, + { + "epoch": 5.724281260824386, + "grad_norm": 0.3831024765968323, + "learning_rate": 4.276699029126214e-06, + "loss": 0.0085, + "step": 8263 + }, + { + "epoch": 5.72497402147558, + "grad_norm": 0.3204551935195923, + "learning_rate": 4.276005547850209e-06, + "loss": 0.0071, + "step": 8264 + }, + { + "epoch": 5.725666782126775, + "grad_norm": 0.25566670298576355, + "learning_rate": 4.275312066574203e-06, + "loss": 0.0058, + "step": 8265 + }, + { + "epoch": 5.7263595427779705, + "grad_norm": 0.44943755865097046, + "learning_rate": 4.274618585298198e-06, + "loss": 0.006, + "step": 8266 + }, + { + "epoch": 5.727052303429165, + "grad_norm": 0.32006701827049255, + "learning_rate": 4.273925104022192e-06, + "loss": 0.0077, + "step": 8267 + }, + { + "epoch": 5.72774506408036, + "grad_norm": 0.3889787197113037, + "learning_rate": 4.273231622746186e-06, + "loss": 0.0065, + "step": 8268 + }, + { + "epoch": 5.7284378247315555, + "grad_norm": 0.34079816937446594, + "learning_rate": 4.272538141470181e-06, + "loss": 0.0079, + "step": 8269 + }, + { + "epoch": 5.72913058538275, + "grad_norm": 0.45281341671943665, + "learning_rate": 4.271844660194175e-06, + "loss": 0.0096, + "step": 8270 + }, + { + "epoch": 5.729823346033945, + "grad_norm": 0.7545556426048279, + "learning_rate": 4.27115117891817e-06, + "loss": 0.0089, + "step": 8271 + }, + { + "epoch": 5.73051610668514, + "grad_norm": 0.36646124720573425, + "learning_rate": 4.270457697642164e-06, + "loss": 0.009, + "step": 8272 + }, + { + "epoch": 5.731208867336335, + "grad_norm": 0.2675376236438751, + "learning_rate": 4.269764216366159e-06, + "loss": 0.0072, + "step": 8273 + }, + { + "epoch": 5.73190162798753, + "grad_norm": 0.46366944909095764, + "learning_rate": 4.269070735090153e-06, + "loss": 0.011, + "step": 8274 + }, + { + "epoch": 5.732594388638725, + "grad_norm": 0.3251652121543884, + "learning_rate": 4.268377253814147e-06, + "loss": 0.008, + "step": 8275 + }, + { + "epoch": 5.733287149289921, + "grad_norm": 0.48200687766075134, + "learning_rate": 4.267683772538142e-06, + "loss": 0.0077, + "step": 8276 + }, + { + "epoch": 5.733979909941115, + "grad_norm": 0.45379742980003357, + "learning_rate": 4.266990291262136e-06, + "loss": 0.0109, + "step": 8277 + }, + { + "epoch": 5.73467267059231, + "grad_norm": 0.4099942445755005, + "learning_rate": 4.266296809986131e-06, + "loss": 0.0076, + "step": 8278 + }, + { + "epoch": 5.735365431243506, + "grad_norm": 0.35289695858955383, + "learning_rate": 4.265603328710125e-06, + "loss": 0.0055, + "step": 8279 + }, + { + "epoch": 5.7360581918947, + "grad_norm": 0.37739306688308716, + "learning_rate": 4.264909847434119e-06, + "loss": 0.0082, + "step": 8280 + }, + { + "epoch": 5.736750952545895, + "grad_norm": 0.5464060306549072, + "learning_rate": 4.264216366158114e-06, + "loss": 0.0074, + "step": 8281 + }, + { + "epoch": 5.737443713197091, + "grad_norm": 0.3523530960083008, + "learning_rate": 4.263522884882109e-06, + "loss": 0.0067, + "step": 8282 + }, + { + "epoch": 5.738136473848286, + "grad_norm": 0.36967411637306213, + "learning_rate": 4.262829403606103e-06, + "loss": 0.0072, + "step": 8283 + }, + { + "epoch": 5.73882923449948, + "grad_norm": 0.4134407341480255, + "learning_rate": 4.262135922330097e-06, + "loss": 0.0088, + "step": 8284 + }, + { + "epoch": 5.7395219951506755, + "grad_norm": 0.3396783769130707, + "learning_rate": 4.261442441054092e-06, + "loss": 0.0086, + "step": 8285 + }, + { + "epoch": 5.740214755801871, + "grad_norm": 0.48782989382743835, + "learning_rate": 4.260748959778087e-06, + "loss": 0.0088, + "step": 8286 + }, + { + "epoch": 5.740907516453065, + "grad_norm": 0.5654097199440002, + "learning_rate": 4.260055478502081e-06, + "loss": 0.0072, + "step": 8287 + }, + { + "epoch": 5.7416002771042605, + "grad_norm": 0.2882134020328522, + "learning_rate": 4.259361997226075e-06, + "loss": 0.0076, + "step": 8288 + }, + { + "epoch": 5.742293037755456, + "grad_norm": 0.3056911528110504, + "learning_rate": 4.258668515950069e-06, + "loss": 0.0065, + "step": 8289 + }, + { + "epoch": 5.74298579840665, + "grad_norm": 0.3661186695098877, + "learning_rate": 4.257975034674064e-06, + "loss": 0.0099, + "step": 8290 + }, + { + "epoch": 5.743678559057845, + "grad_norm": 0.30900242924690247, + "learning_rate": 4.257281553398059e-06, + "loss": 0.0051, + "step": 8291 + }, + { + "epoch": 5.744371319709041, + "grad_norm": 0.29524505138397217, + "learning_rate": 4.256588072122053e-06, + "loss": 0.0057, + "step": 8292 + }, + { + "epoch": 5.745064080360235, + "grad_norm": 0.25422176718711853, + "learning_rate": 4.255894590846047e-06, + "loss": 0.0056, + "step": 8293 + }, + { + "epoch": 5.74575684101143, + "grad_norm": 0.3724766969680786, + "learning_rate": 4.255201109570042e-06, + "loss": 0.008, + "step": 8294 + }, + { + "epoch": 5.746449601662626, + "grad_norm": 0.40852609276771545, + "learning_rate": 4.254507628294036e-06, + "loss": 0.0064, + "step": 8295 + }, + { + "epoch": 5.747142362313821, + "grad_norm": 0.38329029083251953, + "learning_rate": 4.253814147018031e-06, + "loss": 0.0088, + "step": 8296 + }, + { + "epoch": 5.747835122965015, + "grad_norm": 0.4243938624858856, + "learning_rate": 4.253120665742025e-06, + "loss": 0.0077, + "step": 8297 + }, + { + "epoch": 5.748527883616211, + "grad_norm": 0.5794067978858948, + "learning_rate": 4.252427184466019e-06, + "loss": 0.0081, + "step": 8298 + }, + { + "epoch": 5.749220644267406, + "grad_norm": 0.38931483030319214, + "learning_rate": 4.251733703190014e-06, + "loss": 0.0084, + "step": 8299 + }, + { + "epoch": 5.7499134049186, + "grad_norm": 0.383065789937973, + "learning_rate": 4.251040221914008e-06, + "loss": 0.0072, + "step": 8300 + }, + { + "epoch": 5.7506061655697955, + "grad_norm": 0.30684277415275574, + "learning_rate": 4.250346740638003e-06, + "loss": 0.0085, + "step": 8301 + }, + { + "epoch": 5.751298926220991, + "grad_norm": 0.5193754434585571, + "learning_rate": 4.2496532593619974e-06, + "loss": 0.0082, + "step": 8302 + }, + { + "epoch": 5.751991686872186, + "grad_norm": 0.3613142669200897, + "learning_rate": 4.248959778085992e-06, + "loss": 0.0071, + "step": 8303 + }, + { + "epoch": 5.7526844475233805, + "grad_norm": 0.5218485593795776, + "learning_rate": 4.2482662968099865e-06, + "loss": 0.0092, + "step": 8304 + }, + { + "epoch": 5.753377208174576, + "grad_norm": 0.30625155568122864, + "learning_rate": 4.2475728155339805e-06, + "loss": 0.0104, + "step": 8305 + }, + { + "epoch": 5.754069968825771, + "grad_norm": 0.3795376718044281, + "learning_rate": 4.2468793342579755e-06, + "loss": 0.0075, + "step": 8306 + }, + { + "epoch": 5.754762729476965, + "grad_norm": 0.38593757152557373, + "learning_rate": 4.2461858529819696e-06, + "loss": 0.0074, + "step": 8307 + }, + { + "epoch": 5.755455490128161, + "grad_norm": 0.49432623386383057, + "learning_rate": 4.2454923717059645e-06, + "loss": 0.0104, + "step": 8308 + }, + { + "epoch": 5.756148250779356, + "grad_norm": 0.32483917474746704, + "learning_rate": 4.244798890429959e-06, + "loss": 0.0072, + "step": 8309 + }, + { + "epoch": 5.75684101143055, + "grad_norm": 0.4211333990097046, + "learning_rate": 4.244105409153953e-06, + "loss": 0.0101, + "step": 8310 + }, + { + "epoch": 5.757533772081746, + "grad_norm": 0.24202485382556915, + "learning_rate": 4.243411927877948e-06, + "loss": 0.0058, + "step": 8311 + }, + { + "epoch": 5.758226532732941, + "grad_norm": 0.3968794047832489, + "learning_rate": 4.2427184466019425e-06, + "loss": 0.01, + "step": 8312 + }, + { + "epoch": 5.758919293384135, + "grad_norm": 0.4180951416492462, + "learning_rate": 4.242024965325937e-06, + "loss": 0.0143, + "step": 8313 + }, + { + "epoch": 5.759612054035331, + "grad_norm": 0.2683388292789459, + "learning_rate": 4.241331484049931e-06, + "loss": 0.0047, + "step": 8314 + }, + { + "epoch": 5.760304814686526, + "grad_norm": 0.35669052600860596, + "learning_rate": 4.240638002773926e-06, + "loss": 0.0102, + "step": 8315 + }, + { + "epoch": 5.760997575337721, + "grad_norm": 0.6181167364120483, + "learning_rate": 4.23994452149792e-06, + "loss": 0.0077, + "step": 8316 + }, + { + "epoch": 5.761690335988916, + "grad_norm": 0.45315179228782654, + "learning_rate": 4.239251040221915e-06, + "loss": 0.0105, + "step": 8317 + }, + { + "epoch": 5.762383096640111, + "grad_norm": 0.38807427883148193, + "learning_rate": 4.238557558945909e-06, + "loss": 0.0079, + "step": 8318 + }, + { + "epoch": 5.763075857291306, + "grad_norm": 0.4046410024166107, + "learning_rate": 4.237864077669903e-06, + "loss": 0.0126, + "step": 8319 + }, + { + "epoch": 5.7637686179425005, + "grad_norm": 0.42906343936920166, + "learning_rate": 4.237170596393898e-06, + "loss": 0.0086, + "step": 8320 + }, + { + "epoch": 5.764461378593696, + "grad_norm": 0.40467703342437744, + "learning_rate": 4.236477115117893e-06, + "loss": 0.0095, + "step": 8321 + }, + { + "epoch": 5.765154139244891, + "grad_norm": 0.42137759923934937, + "learning_rate": 4.235783633841887e-06, + "loss": 0.0089, + "step": 8322 + }, + { + "epoch": 5.765846899896086, + "grad_norm": 0.3476032614707947, + "learning_rate": 4.235090152565881e-06, + "loss": 0.01, + "step": 8323 + }, + { + "epoch": 5.766539660547281, + "grad_norm": 0.37140029668807983, + "learning_rate": 4.234396671289876e-06, + "loss": 0.0088, + "step": 8324 + }, + { + "epoch": 5.767232421198476, + "grad_norm": 0.31196829676628113, + "learning_rate": 4.23370319001387e-06, + "loss": 0.007, + "step": 8325 + }, + { + "epoch": 5.767925181849671, + "grad_norm": 0.4869900941848755, + "learning_rate": 4.233009708737865e-06, + "loss": 0.0085, + "step": 8326 + }, + { + "epoch": 5.768617942500866, + "grad_norm": 0.5487848520278931, + "learning_rate": 4.232316227461859e-06, + "loss": 0.0096, + "step": 8327 + }, + { + "epoch": 5.769310703152061, + "grad_norm": 0.3647545874118805, + "learning_rate": 4.231622746185853e-06, + "loss": 0.009, + "step": 8328 + }, + { + "epoch": 5.770003463803256, + "grad_norm": 0.47813114523887634, + "learning_rate": 4.230929264909848e-06, + "loss": 0.0125, + "step": 8329 + }, + { + "epoch": 5.770696224454451, + "grad_norm": 0.17446501553058624, + "learning_rate": 4.230235783633842e-06, + "loss": 0.0038, + "step": 8330 + }, + { + "epoch": 5.771388985105646, + "grad_norm": 0.35993754863739014, + "learning_rate": 4.229542302357837e-06, + "loss": 0.0099, + "step": 8331 + }, + { + "epoch": 5.772081745756841, + "grad_norm": 1.0426440238952637, + "learning_rate": 4.228848821081831e-06, + "loss": 0.0098, + "step": 8332 + }, + { + "epoch": 5.772774506408036, + "grad_norm": 0.36029863357543945, + "learning_rate": 4.228155339805826e-06, + "loss": 0.0061, + "step": 8333 + }, + { + "epoch": 5.773467267059231, + "grad_norm": 0.550159752368927, + "learning_rate": 4.22746185852982e-06, + "loss": 0.012, + "step": 8334 + }, + { + "epoch": 5.774160027710426, + "grad_norm": 0.32115405797958374, + "learning_rate": 4.226768377253814e-06, + "loss": 0.0076, + "step": 8335 + }, + { + "epoch": 5.774852788361621, + "grad_norm": 0.5311328768730164, + "learning_rate": 4.226074895977809e-06, + "loss": 0.0106, + "step": 8336 + }, + { + "epoch": 5.775545549012816, + "grad_norm": 0.2964485287666321, + "learning_rate": 4.225381414701803e-06, + "loss": 0.0073, + "step": 8337 + }, + { + "epoch": 5.776238309664011, + "grad_norm": 0.28040528297424316, + "learning_rate": 4.224687933425798e-06, + "loss": 0.0088, + "step": 8338 + }, + { + "epoch": 5.776931070315206, + "grad_norm": 0.4162251949310303, + "learning_rate": 4.223994452149792e-06, + "loss": 0.0087, + "step": 8339 + }, + { + "epoch": 5.777623830966401, + "grad_norm": 0.27482980489730835, + "learning_rate": 4.223300970873786e-06, + "loss": 0.0058, + "step": 8340 + }, + { + "epoch": 5.778316591617596, + "grad_norm": 0.4677090048789978, + "learning_rate": 4.222607489597781e-06, + "loss": 0.0079, + "step": 8341 + }, + { + "epoch": 5.779009352268791, + "grad_norm": 0.4626348912715912, + "learning_rate": 4.221914008321776e-06, + "loss": 0.0071, + "step": 8342 + }, + { + "epoch": 5.779702112919987, + "grad_norm": 0.5672571063041687, + "learning_rate": 4.22122052704577e-06, + "loss": 0.0071, + "step": 8343 + }, + { + "epoch": 5.780394873571181, + "grad_norm": 0.37879785895347595, + "learning_rate": 4.220527045769764e-06, + "loss": 0.0074, + "step": 8344 + }, + { + "epoch": 5.781087634222376, + "grad_norm": 0.3782157599925995, + "learning_rate": 4.219833564493758e-06, + "loss": 0.007, + "step": 8345 + }, + { + "epoch": 5.7817803948735715, + "grad_norm": 0.3514493703842163, + "learning_rate": 4.219140083217753e-06, + "loss": 0.0076, + "step": 8346 + }, + { + "epoch": 5.782473155524766, + "grad_norm": 0.33222493529319763, + "learning_rate": 4.218446601941748e-06, + "loss": 0.0074, + "step": 8347 + }, + { + "epoch": 5.783165916175961, + "grad_norm": 0.4822169244289398, + "learning_rate": 4.217753120665742e-06, + "loss": 0.0077, + "step": 8348 + }, + { + "epoch": 5.7838586768271565, + "grad_norm": 0.31460967659950256, + "learning_rate": 4.2170596393897364e-06, + "loss": 0.0068, + "step": 8349 + }, + { + "epoch": 5.784551437478351, + "grad_norm": 0.27153342962265015, + "learning_rate": 4.216366158113731e-06, + "loss": 0.0071, + "step": 8350 + }, + { + "epoch": 5.785244198129546, + "grad_norm": 0.8966512680053711, + "learning_rate": 4.215672676837726e-06, + "loss": 0.0084, + "step": 8351 + }, + { + "epoch": 5.785936958780741, + "grad_norm": 0.48668205738067627, + "learning_rate": 4.21497919556172e-06, + "loss": 0.0103, + "step": 8352 + }, + { + "epoch": 5.786629719431936, + "grad_norm": 0.422843337059021, + "learning_rate": 4.2142857142857145e-06, + "loss": 0.0086, + "step": 8353 + }, + { + "epoch": 5.787322480083131, + "grad_norm": 0.3376331031322479, + "learning_rate": 4.213592233009709e-06, + "loss": 0.0079, + "step": 8354 + }, + { + "epoch": 5.788015240734326, + "grad_norm": 0.5531777739524841, + "learning_rate": 4.2128987517337035e-06, + "loss": 0.0111, + "step": 8355 + }, + { + "epoch": 5.788708001385522, + "grad_norm": 0.5027947425842285, + "learning_rate": 4.2122052704576984e-06, + "loss": 0.0107, + "step": 8356 + }, + { + "epoch": 5.789400762036716, + "grad_norm": 0.4449463486671448, + "learning_rate": 4.2115117891816925e-06, + "loss": 0.0092, + "step": 8357 + }, + { + "epoch": 5.790093522687911, + "grad_norm": 0.3338002860546112, + "learning_rate": 4.210818307905687e-06, + "loss": 0.0092, + "step": 8358 + }, + { + "epoch": 5.790786283339107, + "grad_norm": 0.8274686932563782, + "learning_rate": 4.2101248266296815e-06, + "loss": 0.0122, + "step": 8359 + }, + { + "epoch": 5.791479043990301, + "grad_norm": 0.49187445640563965, + "learning_rate": 4.209431345353676e-06, + "loss": 0.008, + "step": 8360 + }, + { + "epoch": 5.792171804641496, + "grad_norm": 0.3872314989566803, + "learning_rate": 4.2087378640776706e-06, + "loss": 0.0065, + "step": 8361 + }, + { + "epoch": 5.792864565292692, + "grad_norm": 0.39518681168556213, + "learning_rate": 4.208044382801665e-06, + "loss": 0.0103, + "step": 8362 + }, + { + "epoch": 5.793557325943887, + "grad_norm": 0.4513131082057953, + "learning_rate": 4.2073509015256596e-06, + "loss": 0.0077, + "step": 8363 + }, + { + "epoch": 5.794250086595081, + "grad_norm": 0.4230335056781769, + "learning_rate": 4.206657420249654e-06, + "loss": 0.0069, + "step": 8364 + }, + { + "epoch": 5.7949428472462765, + "grad_norm": 0.44150733947753906, + "learning_rate": 4.205963938973648e-06, + "loss": 0.007, + "step": 8365 + }, + { + "epoch": 5.795635607897472, + "grad_norm": 0.3151010572910309, + "learning_rate": 4.205270457697643e-06, + "loss": 0.0067, + "step": 8366 + }, + { + "epoch": 5.796328368548666, + "grad_norm": 0.31523779034614563, + "learning_rate": 4.204576976421637e-06, + "loss": 0.0086, + "step": 8367 + }, + { + "epoch": 5.7970211291998615, + "grad_norm": 0.34788498282432556, + "learning_rate": 4.203883495145632e-06, + "loss": 0.0081, + "step": 8368 + }, + { + "epoch": 5.797713889851057, + "grad_norm": 0.6555057764053345, + "learning_rate": 4.203190013869626e-06, + "loss": 0.0103, + "step": 8369 + }, + { + "epoch": 5.798406650502251, + "grad_norm": 0.47564446926116943, + "learning_rate": 4.20249653259362e-06, + "loss": 0.0058, + "step": 8370 + }, + { + "epoch": 5.799099411153446, + "grad_norm": 0.3363701105117798, + "learning_rate": 4.201803051317615e-06, + "loss": 0.0094, + "step": 8371 + }, + { + "epoch": 5.799792171804642, + "grad_norm": 0.2784354090690613, + "learning_rate": 4.20110957004161e-06, + "loss": 0.0064, + "step": 8372 + }, + { + "epoch": 5.800484932455836, + "grad_norm": 0.3875786364078522, + "learning_rate": 4.200416088765604e-06, + "loss": 0.0077, + "step": 8373 + }, + { + "epoch": 5.801177693107031, + "grad_norm": 0.373428076505661, + "learning_rate": 4.199722607489598e-06, + "loss": 0.0093, + "step": 8374 + }, + { + "epoch": 5.801870453758227, + "grad_norm": 0.41150420904159546, + "learning_rate": 4.199029126213592e-06, + "loss": 0.0109, + "step": 8375 + }, + { + "epoch": 5.802563214409422, + "grad_norm": 0.3832527697086334, + "learning_rate": 4.198335644937587e-06, + "loss": 0.0081, + "step": 8376 + }, + { + "epoch": 5.803255975060616, + "grad_norm": 0.3092479109764099, + "learning_rate": 4.197642163661582e-06, + "loss": 0.0061, + "step": 8377 + }, + { + "epoch": 5.803948735711812, + "grad_norm": 0.6134652495384216, + "learning_rate": 4.196948682385576e-06, + "loss": 0.0085, + "step": 8378 + }, + { + "epoch": 5.804641496363007, + "grad_norm": 0.3552073538303375, + "learning_rate": 4.19625520110957e-06, + "loss": 0.0101, + "step": 8379 + }, + { + "epoch": 5.805334257014201, + "grad_norm": 0.3787483274936676, + "learning_rate": 4.195561719833565e-06, + "loss": 0.0064, + "step": 8380 + }, + { + "epoch": 5.8060270176653965, + "grad_norm": 0.3213479518890381, + "learning_rate": 4.19486823855756e-06, + "loss": 0.009, + "step": 8381 + }, + { + "epoch": 5.806719778316592, + "grad_norm": 0.5946488380432129, + "learning_rate": 4.194174757281554e-06, + "loss": 0.0085, + "step": 8382 + }, + { + "epoch": 5.807412538967787, + "grad_norm": 0.4807218611240387, + "learning_rate": 4.193481276005548e-06, + "loss": 0.0102, + "step": 8383 + }, + { + "epoch": 5.8081052996189815, + "grad_norm": 0.367820680141449, + "learning_rate": 4.192787794729542e-06, + "loss": 0.0088, + "step": 8384 + }, + { + "epoch": 5.808798060270177, + "grad_norm": 0.4575831890106201, + "learning_rate": 4.192094313453537e-06, + "loss": 0.0127, + "step": 8385 + }, + { + "epoch": 5.809490820921372, + "grad_norm": 0.3799903988838196, + "learning_rate": 4.191400832177532e-06, + "loss": 0.0084, + "step": 8386 + }, + { + "epoch": 5.810183581572566, + "grad_norm": 0.3574175536632538, + "learning_rate": 4.190707350901526e-06, + "loss": 0.0059, + "step": 8387 + }, + { + "epoch": 5.810876342223762, + "grad_norm": 0.44677236676216125, + "learning_rate": 4.19001386962552e-06, + "loss": 0.0107, + "step": 8388 + }, + { + "epoch": 5.811569102874957, + "grad_norm": 0.40432292222976685, + "learning_rate": 4.189320388349515e-06, + "loss": 0.0089, + "step": 8389 + }, + { + "epoch": 5.812261863526151, + "grad_norm": 0.34104475378990173, + "learning_rate": 4.188626907073509e-06, + "loss": 0.0075, + "step": 8390 + }, + { + "epoch": 5.812954624177347, + "grad_norm": 0.4250873625278473, + "learning_rate": 4.187933425797504e-06, + "loss": 0.011, + "step": 8391 + }, + { + "epoch": 5.813647384828542, + "grad_norm": 0.2522217333316803, + "learning_rate": 4.187239944521498e-06, + "loss": 0.0067, + "step": 8392 + }, + { + "epoch": 5.814340145479736, + "grad_norm": 0.415157675743103, + "learning_rate": 4.186546463245492e-06, + "loss": 0.0062, + "step": 8393 + }, + { + "epoch": 5.815032906130932, + "grad_norm": 0.6287251114845276, + "learning_rate": 4.185852981969487e-06, + "loss": 0.0104, + "step": 8394 + }, + { + "epoch": 5.815725666782127, + "grad_norm": 0.3798479735851288, + "learning_rate": 4.185159500693481e-06, + "loss": 0.0073, + "step": 8395 + }, + { + "epoch": 5.816418427433322, + "grad_norm": 0.34912198781967163, + "learning_rate": 4.184466019417476e-06, + "loss": 0.0072, + "step": 8396 + }, + { + "epoch": 5.8171111880845165, + "grad_norm": 0.39099520444869995, + "learning_rate": 4.18377253814147e-06, + "loss": 0.0074, + "step": 8397 + }, + { + "epoch": 5.817803948735712, + "grad_norm": 0.458638072013855, + "learning_rate": 4.183079056865465e-06, + "loss": 0.0076, + "step": 8398 + }, + { + "epoch": 5.818496709386907, + "grad_norm": 0.3762831687927246, + "learning_rate": 4.182385575589459e-06, + "loss": 0.007, + "step": 8399 + }, + { + "epoch": 5.8191894700381015, + "grad_norm": 0.381137877702713, + "learning_rate": 4.1816920943134535e-06, + "loss": 0.0099, + "step": 8400 + }, + { + "epoch": 5.819882230689297, + "grad_norm": 0.4109715223312378, + "learning_rate": 4.180998613037448e-06, + "loss": 0.0091, + "step": 8401 + }, + { + "epoch": 5.820574991340492, + "grad_norm": 0.4500003159046173, + "learning_rate": 4.180305131761443e-06, + "loss": 0.0117, + "step": 8402 + }, + { + "epoch": 5.821267751991687, + "grad_norm": 0.49851664900779724, + "learning_rate": 4.1796116504854374e-06, + "loss": 0.0115, + "step": 8403 + }, + { + "epoch": 5.821960512642882, + "grad_norm": 0.29713866114616394, + "learning_rate": 4.1789181692094315e-06, + "loss": 0.0063, + "step": 8404 + }, + { + "epoch": 5.822653273294077, + "grad_norm": 0.34780338406562805, + "learning_rate": 4.178224687933426e-06, + "loss": 0.009, + "step": 8405 + }, + { + "epoch": 5.823346033945272, + "grad_norm": 0.3469697833061218, + "learning_rate": 4.1775312066574205e-06, + "loss": 0.0081, + "step": 8406 + }, + { + "epoch": 5.824038794596467, + "grad_norm": 0.27404332160949707, + "learning_rate": 4.1768377253814155e-06, + "loss": 0.0055, + "step": 8407 + }, + { + "epoch": 5.824731555247662, + "grad_norm": 0.30559876561164856, + "learning_rate": 4.1761442441054096e-06, + "loss": 0.0069, + "step": 8408 + }, + { + "epoch": 5.825424315898857, + "grad_norm": 0.38184332847595215, + "learning_rate": 4.175450762829404e-06, + "loss": 0.0088, + "step": 8409 + }, + { + "epoch": 5.826117076550052, + "grad_norm": 0.24550074338912964, + "learning_rate": 4.1747572815533986e-06, + "loss": 0.0055, + "step": 8410 + }, + { + "epoch": 5.826809837201247, + "grad_norm": 0.29619890451431274, + "learning_rate": 4.1740638002773935e-06, + "loss": 0.0061, + "step": 8411 + }, + { + "epoch": 5.827502597852442, + "grad_norm": 0.45778506994247437, + "learning_rate": 4.173370319001388e-06, + "loss": 0.0074, + "step": 8412 + }, + { + "epoch": 5.828195358503637, + "grad_norm": 0.3315967917442322, + "learning_rate": 4.172676837725382e-06, + "loss": 0.0074, + "step": 8413 + }, + { + "epoch": 5.828888119154832, + "grad_norm": 0.31582432985305786, + "learning_rate": 4.171983356449376e-06, + "loss": 0.0058, + "step": 8414 + }, + { + "epoch": 5.829580879806027, + "grad_norm": 0.3864191174507141, + "learning_rate": 4.171289875173371e-06, + "loss": 0.0085, + "step": 8415 + }, + { + "epoch": 5.830273640457222, + "grad_norm": 0.34668925404548645, + "learning_rate": 4.170596393897366e-06, + "loss": 0.0097, + "step": 8416 + }, + { + "epoch": 5.830966401108417, + "grad_norm": 0.4442277252674103, + "learning_rate": 4.16990291262136e-06, + "loss": 0.0094, + "step": 8417 + }, + { + "epoch": 5.831659161759612, + "grad_norm": 0.23464812338352203, + "learning_rate": 4.169209431345354e-06, + "loss": 0.0051, + "step": 8418 + }, + { + "epoch": 5.832351922410807, + "grad_norm": 0.26087555289268494, + "learning_rate": 4.168515950069349e-06, + "loss": 0.0071, + "step": 8419 + }, + { + "epoch": 5.833044683062002, + "grad_norm": 0.3434668481349945, + "learning_rate": 4.167822468793343e-06, + "loss": 0.0071, + "step": 8420 + }, + { + "epoch": 5.833737443713197, + "grad_norm": 0.4209461808204651, + "learning_rate": 4.167128987517338e-06, + "loss": 0.0077, + "step": 8421 + }, + { + "epoch": 5.834430204364392, + "grad_norm": 0.417118102312088, + "learning_rate": 4.166435506241332e-06, + "loss": 0.011, + "step": 8422 + }, + { + "epoch": 5.835122965015588, + "grad_norm": 0.406374454498291, + "learning_rate": 4.165742024965326e-06, + "loss": 0.0069, + "step": 8423 + }, + { + "epoch": 5.835815725666782, + "grad_norm": 0.4506526589393616, + "learning_rate": 4.165048543689321e-06, + "loss": 0.0105, + "step": 8424 + }, + { + "epoch": 5.836508486317977, + "grad_norm": 0.24989177286624908, + "learning_rate": 4.164355062413315e-06, + "loss": 0.0056, + "step": 8425 + }, + { + "epoch": 5.8372012469691725, + "grad_norm": 0.45859843492507935, + "learning_rate": 4.16366158113731e-06, + "loss": 0.0087, + "step": 8426 + }, + { + "epoch": 5.837894007620367, + "grad_norm": 0.3756405711174011, + "learning_rate": 4.162968099861304e-06, + "loss": 0.0068, + "step": 8427 + }, + { + "epoch": 5.838586768271562, + "grad_norm": 0.34702837467193604, + "learning_rate": 4.162274618585299e-06, + "loss": 0.0058, + "step": 8428 + }, + { + "epoch": 5.8392795289227575, + "grad_norm": 0.28181061148643494, + "learning_rate": 4.161581137309293e-06, + "loss": 0.006, + "step": 8429 + }, + { + "epoch": 5.839972289573952, + "grad_norm": 0.2980850040912628, + "learning_rate": 4.160887656033287e-06, + "loss": 0.0083, + "step": 8430 + }, + { + "epoch": 5.840665050225147, + "grad_norm": 0.408311665058136, + "learning_rate": 4.160194174757282e-06, + "loss": 0.0072, + "step": 8431 + }, + { + "epoch": 5.841357810876342, + "grad_norm": 0.2777591943740845, + "learning_rate": 4.159500693481276e-06, + "loss": 0.0061, + "step": 8432 + }, + { + "epoch": 5.842050571527537, + "grad_norm": 0.36505457758903503, + "learning_rate": 4.158807212205271e-06, + "loss": 0.0071, + "step": 8433 + }, + { + "epoch": 5.842743332178732, + "grad_norm": 0.32132652401924133, + "learning_rate": 4.158113730929265e-06, + "loss": 0.0075, + "step": 8434 + }, + { + "epoch": 5.843436092829927, + "grad_norm": 0.24043220281600952, + "learning_rate": 4.157420249653259e-06, + "loss": 0.0052, + "step": 8435 + }, + { + "epoch": 5.844128853481123, + "grad_norm": 0.4256744682788849, + "learning_rate": 4.156726768377254e-06, + "loss": 0.0071, + "step": 8436 + }, + { + "epoch": 5.844821614132317, + "grad_norm": 0.4054756760597229, + "learning_rate": 4.156033287101249e-06, + "loss": 0.0088, + "step": 8437 + }, + { + "epoch": 5.845514374783512, + "grad_norm": 0.37333884835243225, + "learning_rate": 4.155339805825243e-06, + "loss": 0.0092, + "step": 8438 + }, + { + "epoch": 5.846207135434708, + "grad_norm": 0.4519575834274292, + "learning_rate": 4.154646324549237e-06, + "loss": 0.01, + "step": 8439 + }, + { + "epoch": 5.846899896085902, + "grad_norm": 0.362836092710495, + "learning_rate": 4.153952843273232e-06, + "loss": 0.0082, + "step": 8440 + }, + { + "epoch": 5.847592656737097, + "grad_norm": 0.4783961772918701, + "learning_rate": 4.153259361997226e-06, + "loss": 0.0119, + "step": 8441 + }, + { + "epoch": 5.8482854173882926, + "grad_norm": 0.5307455062866211, + "learning_rate": 4.152565880721221e-06, + "loss": 0.0095, + "step": 8442 + }, + { + "epoch": 5.848978178039488, + "grad_norm": 0.3113182485103607, + "learning_rate": 4.151872399445215e-06, + "loss": 0.0078, + "step": 8443 + }, + { + "epoch": 5.849670938690682, + "grad_norm": 0.41187363862991333, + "learning_rate": 4.151178918169209e-06, + "loss": 0.007, + "step": 8444 + }, + { + "epoch": 5.8503636993418775, + "grad_norm": 0.3689092993736267, + "learning_rate": 4.150485436893204e-06, + "loss": 0.0092, + "step": 8445 + }, + { + "epoch": 5.851056459993073, + "grad_norm": 0.20535829663276672, + "learning_rate": 4.149791955617199e-06, + "loss": 0.004, + "step": 8446 + }, + { + "epoch": 5.851749220644267, + "grad_norm": 0.4785582721233368, + "learning_rate": 4.149098474341193e-06, + "loss": 0.0106, + "step": 8447 + }, + { + "epoch": 5.8524419812954624, + "grad_norm": 0.5107148885726929, + "learning_rate": 4.148404993065187e-06, + "loss": 0.0078, + "step": 8448 + }, + { + "epoch": 5.853134741946658, + "grad_norm": 0.4586354196071625, + "learning_rate": 4.147711511789182e-06, + "loss": 0.0082, + "step": 8449 + }, + { + "epoch": 5.853827502597852, + "grad_norm": 0.21495041251182556, + "learning_rate": 4.1470180305131764e-06, + "loss": 0.0047, + "step": 8450 + }, + { + "epoch": 5.854520263249047, + "grad_norm": 0.4510405957698822, + "learning_rate": 4.146324549237171e-06, + "loss": 0.0087, + "step": 8451 + }, + { + "epoch": 5.855213023900243, + "grad_norm": 0.8113586902618408, + "learning_rate": 4.1456310679611654e-06, + "loss": 0.0131, + "step": 8452 + }, + { + "epoch": 5.855905784551437, + "grad_norm": 0.41880154609680176, + "learning_rate": 4.1449375866851595e-06, + "loss": 0.0072, + "step": 8453 + }, + { + "epoch": 5.856598545202632, + "grad_norm": 0.32437968254089355, + "learning_rate": 4.1442441054091545e-06, + "loss": 0.0075, + "step": 8454 + }, + { + "epoch": 5.857291305853828, + "grad_norm": 0.27792876958847046, + "learning_rate": 4.1435506241331486e-06, + "loss": 0.0063, + "step": 8455 + }, + { + "epoch": 5.857984066505023, + "grad_norm": 0.32808801531791687, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.0078, + "step": 8456 + }, + { + "epoch": 5.858676827156217, + "grad_norm": 0.34345778822898865, + "learning_rate": 4.1421636615811376e-06, + "loss": 0.0054, + "step": 8457 + }, + { + "epoch": 5.859369587807413, + "grad_norm": 0.29215386509895325, + "learning_rate": 4.1414701803051325e-06, + "loss": 0.0059, + "step": 8458 + }, + { + "epoch": 5.860062348458608, + "grad_norm": 0.4607574939727783, + "learning_rate": 4.140776699029127e-06, + "loss": 0.0086, + "step": 8459 + }, + { + "epoch": 5.860755109109802, + "grad_norm": 0.3328019678592682, + "learning_rate": 4.140083217753121e-06, + "loss": 0.0085, + "step": 8460 + }, + { + "epoch": 5.8614478697609975, + "grad_norm": 0.22415997087955475, + "learning_rate": 4.139389736477116e-06, + "loss": 0.0049, + "step": 8461 + }, + { + "epoch": 5.862140630412193, + "grad_norm": 0.43069514632225037, + "learning_rate": 4.13869625520111e-06, + "loss": 0.0086, + "step": 8462 + }, + { + "epoch": 5.862833391063388, + "grad_norm": 0.5360902547836304, + "learning_rate": 4.138002773925105e-06, + "loss": 0.0069, + "step": 8463 + }, + { + "epoch": 5.8635261517145825, + "grad_norm": 0.4504351317882538, + "learning_rate": 4.137309292649099e-06, + "loss": 0.0083, + "step": 8464 + }, + { + "epoch": 5.864218912365778, + "grad_norm": 0.4289409816265106, + "learning_rate": 4.136615811373093e-06, + "loss": 0.0066, + "step": 8465 + }, + { + "epoch": 5.864911673016973, + "grad_norm": 0.2925226092338562, + "learning_rate": 4.135922330097088e-06, + "loss": 0.0064, + "step": 8466 + }, + { + "epoch": 5.865604433668167, + "grad_norm": 0.4810274541378021, + "learning_rate": 4.135228848821083e-06, + "loss": 0.0073, + "step": 8467 + }, + { + "epoch": 5.866297194319363, + "grad_norm": 0.44315850734710693, + "learning_rate": 4.134535367545077e-06, + "loss": 0.0098, + "step": 8468 + }, + { + "epoch": 5.866989954970558, + "grad_norm": 0.4919215142726898, + "learning_rate": 4.133841886269071e-06, + "loss": 0.0092, + "step": 8469 + }, + { + "epoch": 5.867682715621752, + "grad_norm": 0.33163192868232727, + "learning_rate": 4.133148404993066e-06, + "loss": 0.0087, + "step": 8470 + }, + { + "epoch": 5.868375476272948, + "grad_norm": 0.34440740942955017, + "learning_rate": 4.13245492371706e-06, + "loss": 0.008, + "step": 8471 + }, + { + "epoch": 5.869068236924143, + "grad_norm": 0.3269757032394409, + "learning_rate": 4.131761442441055e-06, + "loss": 0.0076, + "step": 8472 + }, + { + "epoch": 5.869760997575337, + "grad_norm": 0.5475027561187744, + "learning_rate": 4.131067961165049e-06, + "loss": 0.0097, + "step": 8473 + }, + { + "epoch": 5.870453758226533, + "grad_norm": 0.461850106716156, + "learning_rate": 4.130374479889043e-06, + "loss": 0.0095, + "step": 8474 + }, + { + "epoch": 5.871146518877728, + "grad_norm": 0.3912562131881714, + "learning_rate": 4.129680998613038e-06, + "loss": 0.0085, + "step": 8475 + }, + { + "epoch": 5.871839279528922, + "grad_norm": 0.38020938634872437, + "learning_rate": 4.128987517337033e-06, + "loss": 0.0092, + "step": 8476 + }, + { + "epoch": 5.8725320401801175, + "grad_norm": 0.4405178129673004, + "learning_rate": 4.128294036061027e-06, + "loss": 0.0087, + "step": 8477 + }, + { + "epoch": 5.873224800831313, + "grad_norm": 0.3280518054962158, + "learning_rate": 4.127600554785021e-06, + "loss": 0.0053, + "step": 8478 + }, + { + "epoch": 5.873917561482508, + "grad_norm": 0.3536124527454376, + "learning_rate": 4.126907073509016e-06, + "loss": 0.0073, + "step": 8479 + }, + { + "epoch": 5.8746103221337025, + "grad_norm": 0.38063618540763855, + "learning_rate": 4.12621359223301e-06, + "loss": 0.0056, + "step": 8480 + }, + { + "epoch": 5.875303082784898, + "grad_norm": 0.35610726475715637, + "learning_rate": 4.125520110957005e-06, + "loss": 0.0072, + "step": 8481 + }, + { + "epoch": 5.875995843436093, + "grad_norm": 0.3490633964538574, + "learning_rate": 4.124826629680999e-06, + "loss": 0.006, + "step": 8482 + }, + { + "epoch": 5.876688604087288, + "grad_norm": 0.39876359701156616, + "learning_rate": 4.124133148404993e-06, + "loss": 0.007, + "step": 8483 + }, + { + "epoch": 5.877381364738483, + "grad_norm": 0.3289647400379181, + "learning_rate": 4.123439667128988e-06, + "loss": 0.0059, + "step": 8484 + }, + { + "epoch": 5.878074125389678, + "grad_norm": 0.4408869445323944, + "learning_rate": 4.122746185852982e-06, + "loss": 0.0073, + "step": 8485 + }, + { + "epoch": 5.878766886040873, + "grad_norm": 0.43645215034484863, + "learning_rate": 4.122052704576976e-06, + "loss": 0.0114, + "step": 8486 + }, + { + "epoch": 5.879459646692068, + "grad_norm": 0.48829320073127747, + "learning_rate": 4.121359223300971e-06, + "loss": 0.0079, + "step": 8487 + }, + { + "epoch": 5.880152407343263, + "grad_norm": 0.5117582082748413, + "learning_rate": 4.120665742024966e-06, + "loss": 0.0124, + "step": 8488 + }, + { + "epoch": 5.880845167994458, + "grad_norm": 0.31155481934547424, + "learning_rate": 4.11997226074896e-06, + "loss": 0.0072, + "step": 8489 + }, + { + "epoch": 5.881537928645653, + "grad_norm": 0.4325661063194275, + "learning_rate": 4.119278779472954e-06, + "loss": 0.0069, + "step": 8490 + }, + { + "epoch": 5.882230689296848, + "grad_norm": 0.2997148334980011, + "learning_rate": 4.118585298196948e-06, + "loss": 0.0062, + "step": 8491 + }, + { + "epoch": 5.882923449948043, + "grad_norm": 0.25914114713668823, + "learning_rate": 4.117891816920943e-06, + "loss": 0.0043, + "step": 8492 + }, + { + "epoch": 5.883616210599238, + "grad_norm": 0.30932387709617615, + "learning_rate": 4.117198335644938e-06, + "loss": 0.0066, + "step": 8493 + }, + { + "epoch": 5.884308971250433, + "grad_norm": 0.3908243179321289, + "learning_rate": 4.116504854368932e-06, + "loss": 0.0062, + "step": 8494 + }, + { + "epoch": 5.885001731901628, + "grad_norm": 0.28193148970603943, + "learning_rate": 4.115811373092926e-06, + "loss": 0.0068, + "step": 8495 + }, + { + "epoch": 5.8856944925528225, + "grad_norm": 0.3283005952835083, + "learning_rate": 4.115117891816921e-06, + "loss": 0.0056, + "step": 8496 + }, + { + "epoch": 5.886387253204018, + "grad_norm": 0.5039186477661133, + "learning_rate": 4.114424410540916e-06, + "loss": 0.0118, + "step": 8497 + }, + { + "epoch": 5.887080013855213, + "grad_norm": 0.4128674268722534, + "learning_rate": 4.11373092926491e-06, + "loss": 0.0095, + "step": 8498 + }, + { + "epoch": 5.887772774506408, + "grad_norm": 0.2646946609020233, + "learning_rate": 4.1130374479889044e-06, + "loss": 0.0054, + "step": 8499 + }, + { + "epoch": 5.888465535157603, + "grad_norm": 0.5354933142662048, + "learning_rate": 4.1123439667128985e-06, + "loss": 0.0107, + "step": 8500 + }, + { + "epoch": 5.889158295808798, + "grad_norm": 0.41395577788352966, + "learning_rate": 4.1116504854368935e-06, + "loss": 0.0084, + "step": 8501 + }, + { + "epoch": 5.889851056459993, + "grad_norm": 0.43025678396224976, + "learning_rate": 4.110957004160888e-06, + "loss": 0.0112, + "step": 8502 + }, + { + "epoch": 5.890543817111189, + "grad_norm": 0.2974132299423218, + "learning_rate": 4.1102635228848825e-06, + "loss": 0.0077, + "step": 8503 + }, + { + "epoch": 5.891236577762383, + "grad_norm": 0.41498416662216187, + "learning_rate": 4.1095700416088766e-06, + "loss": 0.0085, + "step": 8504 + }, + { + "epoch": 5.891929338413578, + "grad_norm": 0.31983187794685364, + "learning_rate": 4.1088765603328715e-06, + "loss": 0.0087, + "step": 8505 + }, + { + "epoch": 5.8926220990647735, + "grad_norm": 0.44971662759780884, + "learning_rate": 4.108183079056866e-06, + "loss": 0.0086, + "step": 8506 + }, + { + "epoch": 5.893314859715968, + "grad_norm": 0.5393744111061096, + "learning_rate": 4.1074895977808605e-06, + "loss": 0.0096, + "step": 8507 + }, + { + "epoch": 5.894007620367163, + "grad_norm": 0.42491066455841064, + "learning_rate": 4.106796116504855e-06, + "loss": 0.0075, + "step": 8508 + }, + { + "epoch": 5.8947003810183585, + "grad_norm": 0.44292858242988586, + "learning_rate": 4.106102635228849e-06, + "loss": 0.0092, + "step": 8509 + }, + { + "epoch": 5.895393141669553, + "grad_norm": 0.4139257073402405, + "learning_rate": 4.105409153952844e-06, + "loss": 0.0076, + "step": 8510 + }, + { + "epoch": 5.896085902320748, + "grad_norm": 0.40488094091415405, + "learning_rate": 4.104715672676838e-06, + "loss": 0.009, + "step": 8511 + }, + { + "epoch": 5.896778662971943, + "grad_norm": 0.46385952830314636, + "learning_rate": 4.104022191400833e-06, + "loss": 0.0096, + "step": 8512 + }, + { + "epoch": 5.897471423623138, + "grad_norm": 0.2889416515827179, + "learning_rate": 4.103328710124827e-06, + "loss": 0.0061, + "step": 8513 + }, + { + "epoch": 5.898164184274333, + "grad_norm": 0.29356223344802856, + "learning_rate": 4.102635228848822e-06, + "loss": 0.0054, + "step": 8514 + }, + { + "epoch": 5.898856944925528, + "grad_norm": 0.46368181705474854, + "learning_rate": 4.101941747572816e-06, + "loss": 0.0076, + "step": 8515 + }, + { + "epoch": 5.899549705576723, + "grad_norm": 0.3494319021701813, + "learning_rate": 4.10124826629681e-06, + "loss": 0.0071, + "step": 8516 + }, + { + "epoch": 5.900242466227918, + "grad_norm": 0.3383325934410095, + "learning_rate": 4.100554785020805e-06, + "loss": 0.0062, + "step": 8517 + }, + { + "epoch": 5.900935226879113, + "grad_norm": 0.37720346450805664, + "learning_rate": 4.0998613037448e-06, + "loss": 0.0068, + "step": 8518 + }, + { + "epoch": 5.901627987530309, + "grad_norm": 0.33118075132369995, + "learning_rate": 4.099167822468794e-06, + "loss": 0.0069, + "step": 8519 + }, + { + "epoch": 5.902320748181503, + "grad_norm": 0.39278867840766907, + "learning_rate": 4.098474341192788e-06, + "loss": 0.0083, + "step": 8520 + }, + { + "epoch": 5.903013508832698, + "grad_norm": 0.3461000323295593, + "learning_rate": 4.097780859916782e-06, + "loss": 0.0078, + "step": 8521 + }, + { + "epoch": 5.9037062694838935, + "grad_norm": 0.35026121139526367, + "learning_rate": 4.097087378640777e-06, + "loss": 0.0101, + "step": 8522 + }, + { + "epoch": 5.904399030135089, + "grad_norm": 0.4571658968925476, + "learning_rate": 4.096393897364772e-06, + "loss": 0.0094, + "step": 8523 + }, + { + "epoch": 5.905091790786283, + "grad_norm": 0.4728372097015381, + "learning_rate": 4.095700416088766e-06, + "loss": 0.0081, + "step": 8524 + }, + { + "epoch": 5.9057845514374785, + "grad_norm": 0.35921093821525574, + "learning_rate": 4.09500693481276e-06, + "loss": 0.0075, + "step": 8525 + }, + { + "epoch": 5.906477312088674, + "grad_norm": 0.2806636095046997, + "learning_rate": 4.094313453536755e-06, + "loss": 0.0057, + "step": 8526 + }, + { + "epoch": 5.907170072739868, + "grad_norm": 0.37967145442962646, + "learning_rate": 4.09361997226075e-06, + "loss": 0.0082, + "step": 8527 + }, + { + "epoch": 5.907862833391063, + "grad_norm": 0.30859291553497314, + "learning_rate": 4.092926490984744e-06, + "loss": 0.007, + "step": 8528 + }, + { + "epoch": 5.908555594042259, + "grad_norm": 0.4443950057029724, + "learning_rate": 4.092233009708738e-06, + "loss": 0.0059, + "step": 8529 + }, + { + "epoch": 5.909248354693453, + "grad_norm": 0.20901168882846832, + "learning_rate": 4.091539528432732e-06, + "loss": 0.006, + "step": 8530 + }, + { + "epoch": 5.909941115344648, + "grad_norm": 0.2895027995109558, + "learning_rate": 4.090846047156727e-06, + "loss": 0.0053, + "step": 8531 + }, + { + "epoch": 5.910633875995844, + "grad_norm": 0.41502273082733154, + "learning_rate": 4.090152565880722e-06, + "loss": 0.0098, + "step": 8532 + }, + { + "epoch": 5.911326636647038, + "grad_norm": 0.35625556111335754, + "learning_rate": 4.089459084604716e-06, + "loss": 0.0079, + "step": 8533 + }, + { + "epoch": 5.912019397298233, + "grad_norm": 0.714909017086029, + "learning_rate": 4.08876560332871e-06, + "loss": 0.0085, + "step": 8534 + }, + { + "epoch": 5.912712157949429, + "grad_norm": 0.39728614687919617, + "learning_rate": 4.088072122052705e-06, + "loss": 0.0089, + "step": 8535 + }, + { + "epoch": 5.913404918600623, + "grad_norm": 0.45335593819618225, + "learning_rate": 4.087378640776699e-06, + "loss": 0.0103, + "step": 8536 + }, + { + "epoch": 5.914097679251818, + "grad_norm": 0.3070891797542572, + "learning_rate": 4.086685159500694e-06, + "loss": 0.0067, + "step": 8537 + }, + { + "epoch": 5.914790439903014, + "grad_norm": 0.5755982995033264, + "learning_rate": 4.085991678224688e-06, + "loss": 0.0096, + "step": 8538 + }, + { + "epoch": 5.915483200554209, + "grad_norm": 0.4692467451095581, + "learning_rate": 4.085298196948682e-06, + "loss": 0.0122, + "step": 8539 + }, + { + "epoch": 5.916175961205403, + "grad_norm": 0.48478642106056213, + "learning_rate": 4.084604715672677e-06, + "loss": 0.0109, + "step": 8540 + }, + { + "epoch": 5.9168687218565985, + "grad_norm": 0.2915709316730499, + "learning_rate": 4.083911234396671e-06, + "loss": 0.0092, + "step": 8541 + }, + { + "epoch": 5.917561482507794, + "grad_norm": 0.3141952157020569, + "learning_rate": 4.083217753120666e-06, + "loss": 0.0057, + "step": 8542 + }, + { + "epoch": 5.918254243158988, + "grad_norm": 0.3410860598087311, + "learning_rate": 4.08252427184466e-06, + "loss": 0.0056, + "step": 8543 + }, + { + "epoch": 5.9189470038101835, + "grad_norm": 0.3486231863498688, + "learning_rate": 4.081830790568655e-06, + "loss": 0.0092, + "step": 8544 + }, + { + "epoch": 5.919639764461379, + "grad_norm": 0.2818567454814911, + "learning_rate": 4.081137309292649e-06, + "loss": 0.0068, + "step": 8545 + }, + { + "epoch": 5.920332525112574, + "grad_norm": 0.4283974766731262, + "learning_rate": 4.0804438280166434e-06, + "loss": 0.0129, + "step": 8546 + }, + { + "epoch": 5.921025285763768, + "grad_norm": 0.44637128710746765, + "learning_rate": 4.079750346740638e-06, + "loss": 0.0062, + "step": 8547 + }, + { + "epoch": 5.921718046414964, + "grad_norm": 0.34497445821762085, + "learning_rate": 4.0790568654646325e-06, + "loss": 0.0076, + "step": 8548 + }, + { + "epoch": 5.922410807066159, + "grad_norm": 0.2993614971637726, + "learning_rate": 4.078363384188627e-06, + "loss": 0.0064, + "step": 8549 + }, + { + "epoch": 5.923103567717353, + "grad_norm": 0.2649892568588257, + "learning_rate": 4.0776699029126215e-06, + "loss": 0.0055, + "step": 8550 + }, + { + "epoch": 5.923796328368549, + "grad_norm": 0.47700437903404236, + "learning_rate": 4.0769764216366156e-06, + "loss": 0.0097, + "step": 8551 + }, + { + "epoch": 5.924489089019744, + "grad_norm": 0.539458155632019, + "learning_rate": 4.0762829403606105e-06, + "loss": 0.0078, + "step": 8552 + }, + { + "epoch": 5.925181849670938, + "grad_norm": 0.34400475025177, + "learning_rate": 4.0755894590846054e-06, + "loss": 0.0071, + "step": 8553 + }, + { + "epoch": 5.925874610322134, + "grad_norm": 0.518405556678772, + "learning_rate": 4.0748959778085995e-06, + "loss": 0.0073, + "step": 8554 + }, + { + "epoch": 5.926567370973329, + "grad_norm": 0.397615522146225, + "learning_rate": 4.074202496532594e-06, + "loss": 0.008, + "step": 8555 + }, + { + "epoch": 5.927260131624523, + "grad_norm": 0.3987257480621338, + "learning_rate": 4.0735090152565885e-06, + "loss": 0.0085, + "step": 8556 + }, + { + "epoch": 5.9279528922757185, + "grad_norm": 0.5365645289421082, + "learning_rate": 4.072815533980583e-06, + "loss": 0.0071, + "step": 8557 + }, + { + "epoch": 5.928645652926914, + "grad_norm": 0.2973618805408478, + "learning_rate": 4.0721220527045776e-06, + "loss": 0.0054, + "step": 8558 + }, + { + "epoch": 5.929338413578109, + "grad_norm": 0.4930473566055298, + "learning_rate": 4.071428571428572e-06, + "loss": 0.0103, + "step": 8559 + }, + { + "epoch": 5.9300311742293035, + "grad_norm": 0.3181535303592682, + "learning_rate": 4.070735090152566e-06, + "loss": 0.0066, + "step": 8560 + }, + { + "epoch": 5.930723934880499, + "grad_norm": 0.2664131224155426, + "learning_rate": 4.070041608876561e-06, + "loss": 0.0053, + "step": 8561 + }, + { + "epoch": 5.931416695531694, + "grad_norm": 0.40227028727531433, + "learning_rate": 4.069348127600556e-06, + "loss": 0.0057, + "step": 8562 + }, + { + "epoch": 5.932109456182888, + "grad_norm": 0.6118664145469666, + "learning_rate": 4.06865464632455e-06, + "loss": 0.0067, + "step": 8563 + }, + { + "epoch": 5.932802216834084, + "grad_norm": 0.4364956021308899, + "learning_rate": 4.067961165048544e-06, + "loss": 0.0095, + "step": 8564 + }, + { + "epoch": 5.933494977485279, + "grad_norm": 0.5452849864959717, + "learning_rate": 4.067267683772539e-06, + "loss": 0.0077, + "step": 8565 + }, + { + "epoch": 5.934187738136474, + "grad_norm": 0.33377793431282043, + "learning_rate": 4.066574202496533e-06, + "loss": 0.0072, + "step": 8566 + }, + { + "epoch": 5.934880498787669, + "grad_norm": 0.2791842520236969, + "learning_rate": 4.065880721220528e-06, + "loss": 0.0067, + "step": 8567 + }, + { + "epoch": 5.935573259438864, + "grad_norm": 0.3220636546611786, + "learning_rate": 4.065187239944522e-06, + "loss": 0.0069, + "step": 8568 + }, + { + "epoch": 5.936266020090059, + "grad_norm": 0.44998860359191895, + "learning_rate": 4.064493758668516e-06, + "loss": 0.008, + "step": 8569 + }, + { + "epoch": 5.936958780741254, + "grad_norm": 0.26550137996673584, + "learning_rate": 4.063800277392511e-06, + "loss": 0.005, + "step": 8570 + }, + { + "epoch": 5.937651541392449, + "grad_norm": 0.4603528380393982, + "learning_rate": 4.063106796116505e-06, + "loss": 0.0091, + "step": 8571 + }, + { + "epoch": 5.938344302043644, + "grad_norm": 0.37784427404403687, + "learning_rate": 4.0624133148405e-06, + "loss": 0.0082, + "step": 8572 + }, + { + "epoch": 5.9390370626948386, + "grad_norm": 0.481948584318161, + "learning_rate": 4.061719833564494e-06, + "loss": 0.0103, + "step": 8573 + }, + { + "epoch": 5.939729823346034, + "grad_norm": 0.30039507150650024, + "learning_rate": 4.061026352288489e-06, + "loss": 0.0057, + "step": 8574 + }, + { + "epoch": 5.940422583997229, + "grad_norm": 0.29231733083724976, + "learning_rate": 4.060332871012483e-06, + "loss": 0.0048, + "step": 8575 + }, + { + "epoch": 5.9411153446484235, + "grad_norm": 0.39617523550987244, + "learning_rate": 4.059639389736477e-06, + "loss": 0.0104, + "step": 8576 + }, + { + "epoch": 5.941808105299619, + "grad_norm": 0.3052685558795929, + "learning_rate": 4.058945908460472e-06, + "loss": 0.0083, + "step": 8577 + }, + { + "epoch": 5.942500865950814, + "grad_norm": 0.2841261625289917, + "learning_rate": 4.058252427184466e-06, + "loss": 0.0056, + "step": 8578 + }, + { + "epoch": 5.943193626602009, + "grad_norm": 0.3798808157444, + "learning_rate": 4.057558945908461e-06, + "loss": 0.0075, + "step": 8579 + }, + { + "epoch": 5.943886387253204, + "grad_norm": 0.3799256384372711, + "learning_rate": 4.056865464632455e-06, + "loss": 0.0099, + "step": 8580 + }, + { + "epoch": 5.944579147904399, + "grad_norm": 0.45977383852005005, + "learning_rate": 4.056171983356449e-06, + "loss": 0.008, + "step": 8581 + }, + { + "epoch": 5.945271908555594, + "grad_norm": 0.5314000248908997, + "learning_rate": 4.055478502080444e-06, + "loss": 0.0101, + "step": 8582 + }, + { + "epoch": 5.945964669206789, + "grad_norm": 0.4125184416770935, + "learning_rate": 4.054785020804439e-06, + "loss": 0.0082, + "step": 8583 + }, + { + "epoch": 5.946657429857984, + "grad_norm": 0.3740287721157074, + "learning_rate": 4.054091539528433e-06, + "loss": 0.0062, + "step": 8584 + }, + { + "epoch": 5.947350190509179, + "grad_norm": 0.30348846316337585, + "learning_rate": 4.053398058252427e-06, + "loss": 0.0069, + "step": 8585 + }, + { + "epoch": 5.9480429511603745, + "grad_norm": 0.43956172466278076, + "learning_rate": 4.052704576976421e-06, + "loss": 0.0123, + "step": 8586 + }, + { + "epoch": 5.948735711811569, + "grad_norm": 0.4679591655731201, + "learning_rate": 4.052011095700416e-06, + "loss": 0.0091, + "step": 8587 + }, + { + "epoch": 5.949428472462764, + "grad_norm": 0.5678568482398987, + "learning_rate": 4.051317614424411e-06, + "loss": 0.0128, + "step": 8588 + }, + { + "epoch": 5.9501212331139595, + "grad_norm": 0.37824150919914246, + "learning_rate": 4.050624133148405e-06, + "loss": 0.0068, + "step": 8589 + }, + { + "epoch": 5.950813993765154, + "grad_norm": 0.38412854075431824, + "learning_rate": 4.049930651872399e-06, + "loss": 0.0073, + "step": 8590 + }, + { + "epoch": 5.951506754416349, + "grad_norm": 0.40824028849601746, + "learning_rate": 4.049237170596394e-06, + "loss": 0.0084, + "step": 8591 + }, + { + "epoch": 5.952199515067544, + "grad_norm": 0.6860206723213196, + "learning_rate": 4.048543689320389e-06, + "loss": 0.0077, + "step": 8592 + }, + { + "epoch": 5.952892275718739, + "grad_norm": 0.28051894903182983, + "learning_rate": 4.047850208044383e-06, + "loss": 0.0074, + "step": 8593 + }, + { + "epoch": 5.953585036369934, + "grad_norm": 0.32923415303230286, + "learning_rate": 4.047156726768377e-06, + "loss": 0.0077, + "step": 8594 + }, + { + "epoch": 5.954277797021129, + "grad_norm": 0.2752898931503296, + "learning_rate": 4.046463245492372e-06, + "loss": 0.0059, + "step": 8595 + }, + { + "epoch": 5.954970557672324, + "grad_norm": 0.4531700313091278, + "learning_rate": 4.045769764216366e-06, + "loss": 0.0086, + "step": 8596 + }, + { + "epoch": 5.955663318323519, + "grad_norm": 0.3138566315174103, + "learning_rate": 4.045076282940361e-06, + "loss": 0.0053, + "step": 8597 + }, + { + "epoch": 5.956356078974714, + "grad_norm": 0.2832315266132355, + "learning_rate": 4.044382801664355e-06, + "loss": 0.0059, + "step": 8598 + }, + { + "epoch": 5.95704883962591, + "grad_norm": 0.33605942130088806, + "learning_rate": 4.0436893203883495e-06, + "loss": 0.0084, + "step": 8599 + }, + { + "epoch": 5.957741600277104, + "grad_norm": 0.44466447830200195, + "learning_rate": 4.0429958391123444e-06, + "loss": 0.0078, + "step": 8600 + }, + { + "epoch": 5.958434360928299, + "grad_norm": 0.3474466800689697, + "learning_rate": 4.0423023578363385e-06, + "loss": 0.0065, + "step": 8601 + }, + { + "epoch": 5.9591271215794945, + "grad_norm": 0.36882033944129944, + "learning_rate": 4.0416088765603335e-06, + "loss": 0.0057, + "step": 8602 + }, + { + "epoch": 5.959819882230689, + "grad_norm": 0.31977614760398865, + "learning_rate": 4.0409153952843275e-06, + "loss": 0.0064, + "step": 8603 + }, + { + "epoch": 5.960512642881884, + "grad_norm": 0.33210012316703796, + "learning_rate": 4.0402219140083225e-06, + "loss": 0.009, + "step": 8604 + }, + { + "epoch": 5.9612054035330795, + "grad_norm": 0.32554617524147034, + "learning_rate": 4.0395284327323166e-06, + "loss": 0.0061, + "step": 8605 + }, + { + "epoch": 5.961898164184275, + "grad_norm": 0.32885587215423584, + "learning_rate": 4.038834951456311e-06, + "loss": 0.0062, + "step": 8606 + }, + { + "epoch": 5.962590924835469, + "grad_norm": 0.2488987147808075, + "learning_rate": 4.038141470180306e-06, + "loss": 0.0048, + "step": 8607 + }, + { + "epoch": 5.963283685486664, + "grad_norm": 0.2795836925506592, + "learning_rate": 4.0374479889043e-06, + "loss": 0.0061, + "step": 8608 + }, + { + "epoch": 5.96397644613786, + "grad_norm": 0.47058913111686707, + "learning_rate": 4.036754507628295e-06, + "loss": 0.0077, + "step": 8609 + }, + { + "epoch": 5.964669206789054, + "grad_norm": 0.5544996857643127, + "learning_rate": 4.036061026352289e-06, + "loss": 0.0074, + "step": 8610 + }, + { + "epoch": 5.965361967440249, + "grad_norm": 0.3298545479774475, + "learning_rate": 4.035367545076283e-06, + "loss": 0.0069, + "step": 8611 + }, + { + "epoch": 5.966054728091445, + "grad_norm": 0.3296002745628357, + "learning_rate": 4.034674063800278e-06, + "loss": 0.0099, + "step": 8612 + }, + { + "epoch": 5.966747488742639, + "grad_norm": 0.4145912826061249, + "learning_rate": 4.033980582524273e-06, + "loss": 0.0099, + "step": 8613 + }, + { + "epoch": 5.967440249393834, + "grad_norm": 0.257587194442749, + "learning_rate": 4.033287101248267e-06, + "loss": 0.0058, + "step": 8614 + }, + { + "epoch": 5.96813301004503, + "grad_norm": 0.22960102558135986, + "learning_rate": 4.032593619972261e-06, + "loss": 0.0046, + "step": 8615 + }, + { + "epoch": 5.968825770696224, + "grad_norm": 0.2833264172077179, + "learning_rate": 4.031900138696255e-06, + "loss": 0.0057, + "step": 8616 + }, + { + "epoch": 5.969518531347419, + "grad_norm": 0.33171358704566956, + "learning_rate": 4.03120665742025e-06, + "loss": 0.0058, + "step": 8617 + }, + { + "epoch": 5.9702112919986146, + "grad_norm": 0.3521496057510376, + "learning_rate": 4.030513176144245e-06, + "loss": 0.0072, + "step": 8618 + }, + { + "epoch": 5.97090405264981, + "grad_norm": 0.41624125838279724, + "learning_rate": 4.029819694868239e-06, + "loss": 0.0071, + "step": 8619 + }, + { + "epoch": 5.971596813301004, + "grad_norm": 0.25357767939567566, + "learning_rate": 4.029126213592233e-06, + "loss": 0.0052, + "step": 8620 + }, + { + "epoch": 5.9722895739521995, + "grad_norm": 0.3554152250289917, + "learning_rate": 4.028432732316228e-06, + "loss": 0.0063, + "step": 8621 + }, + { + "epoch": 5.972982334603395, + "grad_norm": 0.48351994156837463, + "learning_rate": 4.027739251040223e-06, + "loss": 0.0073, + "step": 8622 + }, + { + "epoch": 5.973675095254589, + "grad_norm": 0.3916809558868408, + "learning_rate": 4.027045769764217e-06, + "loss": 0.0082, + "step": 8623 + }, + { + "epoch": 5.9743678559057845, + "grad_norm": 0.4536936283111572, + "learning_rate": 4.026352288488211e-06, + "loss": 0.0085, + "step": 8624 + }, + { + "epoch": 5.97506061655698, + "grad_norm": 0.3828856348991394, + "learning_rate": 4.025658807212205e-06, + "loss": 0.0078, + "step": 8625 + }, + { + "epoch": 5.975753377208175, + "grad_norm": 0.2626187205314636, + "learning_rate": 4.0249653259362e-06, + "loss": 0.0046, + "step": 8626 + }, + { + "epoch": 5.976446137859369, + "grad_norm": 0.40936556458473206, + "learning_rate": 4.024271844660195e-06, + "loss": 0.0065, + "step": 8627 + }, + { + "epoch": 5.977138898510565, + "grad_norm": 0.991696298122406, + "learning_rate": 4.023578363384189e-06, + "loss": 0.0096, + "step": 8628 + }, + { + "epoch": 5.97783165916176, + "grad_norm": 0.36345651745796204, + "learning_rate": 4.022884882108183e-06, + "loss": 0.0065, + "step": 8629 + }, + { + "epoch": 5.978524419812954, + "grad_norm": 0.43531450629234314, + "learning_rate": 4.022191400832178e-06, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 5.97921718046415, + "grad_norm": 0.36994925141334534, + "learning_rate": 4.021497919556172e-06, + "loss": 0.0081, + "step": 8631 + }, + { + "epoch": 5.979909941115345, + "grad_norm": 0.27669623494148254, + "learning_rate": 4.020804438280167e-06, + "loss": 0.0061, + "step": 8632 + }, + { + "epoch": 5.980602701766539, + "grad_norm": 0.7132675051689148, + "learning_rate": 4.020110957004161e-06, + "loss": 0.0104, + "step": 8633 + }, + { + "epoch": 5.981295462417735, + "grad_norm": 0.3345467746257782, + "learning_rate": 4.019417475728156e-06, + "loss": 0.0081, + "step": 8634 + }, + { + "epoch": 5.98198822306893, + "grad_norm": 0.3310278654098511, + "learning_rate": 4.01872399445215e-06, + "loss": 0.0061, + "step": 8635 + }, + { + "epoch": 5.982680983720124, + "grad_norm": 0.48464733362197876, + "learning_rate": 4.018030513176144e-06, + "loss": 0.0054, + "step": 8636 + }, + { + "epoch": 5.9833737443713195, + "grad_norm": 0.7874982953071594, + "learning_rate": 4.017337031900139e-06, + "loss": 0.0089, + "step": 8637 + }, + { + "epoch": 5.984066505022515, + "grad_norm": 0.4229066073894501, + "learning_rate": 4.016643550624133e-06, + "loss": 0.0082, + "step": 8638 + }, + { + "epoch": 5.98475926567371, + "grad_norm": 0.43128204345703125, + "learning_rate": 4.015950069348128e-06, + "loss": 0.0106, + "step": 8639 + }, + { + "epoch": 5.9854520263249045, + "grad_norm": 0.44995054602622986, + "learning_rate": 4.015256588072122e-06, + "loss": 0.0071, + "step": 8640 + }, + { + "epoch": 5.9861447869761, + "grad_norm": 0.32228732109069824, + "learning_rate": 4.014563106796116e-06, + "loss": 0.005, + "step": 8641 + }, + { + "epoch": 5.986837547627295, + "grad_norm": 0.514509916305542, + "learning_rate": 4.013869625520111e-06, + "loss": 0.0068, + "step": 8642 + }, + { + "epoch": 5.987530308278489, + "grad_norm": 0.4599981904029846, + "learning_rate": 4.013176144244106e-06, + "loss": 0.01, + "step": 8643 + }, + { + "epoch": 5.988223068929685, + "grad_norm": 0.3862370550632477, + "learning_rate": 4.0124826629681e-06, + "loss": 0.0074, + "step": 8644 + }, + { + "epoch": 5.98891582958088, + "grad_norm": 0.4147402048110962, + "learning_rate": 4.011789181692094e-06, + "loss": 0.0089, + "step": 8645 + }, + { + "epoch": 5.989608590232075, + "grad_norm": 0.34358009696006775, + "learning_rate": 4.0110957004160885e-06, + "loss": 0.0071, + "step": 8646 + }, + { + "epoch": 5.99030135088327, + "grad_norm": 0.33831122517585754, + "learning_rate": 4.0104022191400834e-06, + "loss": 0.0078, + "step": 8647 + }, + { + "epoch": 5.990994111534465, + "grad_norm": 0.2806399464607239, + "learning_rate": 4.009708737864078e-06, + "loss": 0.0049, + "step": 8648 + }, + { + "epoch": 5.99168687218566, + "grad_norm": 0.5958036780357361, + "learning_rate": 4.0090152565880725e-06, + "loss": 0.012, + "step": 8649 + }, + { + "epoch": 5.992379632836855, + "grad_norm": 0.42996886372566223, + "learning_rate": 4.0083217753120665e-06, + "loss": 0.0085, + "step": 8650 + }, + { + "epoch": 5.99307239348805, + "grad_norm": 0.29199519753456116, + "learning_rate": 4.0076282940360615e-06, + "loss": 0.0066, + "step": 8651 + }, + { + "epoch": 5.993765154139245, + "grad_norm": 0.5027766823768616, + "learning_rate": 4.006934812760056e-06, + "loss": 0.0141, + "step": 8652 + }, + { + "epoch": 5.9944579147904395, + "grad_norm": 0.3328966200351715, + "learning_rate": 4.0062413314840505e-06, + "loss": 0.0071, + "step": 8653 + }, + { + "epoch": 5.995150675441635, + "grad_norm": 0.5493451952934265, + "learning_rate": 4.005547850208045e-06, + "loss": 0.0088, + "step": 8654 + }, + { + "epoch": 5.99584343609283, + "grad_norm": 0.4130370318889618, + "learning_rate": 4.004854368932039e-06, + "loss": 0.0072, + "step": 8655 + }, + { + "epoch": 5.9965361967440245, + "grad_norm": 0.30309581756591797, + "learning_rate": 4.004160887656034e-06, + "loss": 0.009, + "step": 8656 + }, + { + "epoch": 5.99722895739522, + "grad_norm": 0.431990385055542, + "learning_rate": 4.0034674063800285e-06, + "loss": 0.0098, + "step": 8657 + }, + { + "epoch": 5.997921718046415, + "grad_norm": 0.35093095898628235, + "learning_rate": 4.002773925104023e-06, + "loss": 0.0079, + "step": 8658 + }, + { + "epoch": 5.99861447869761, + "grad_norm": 0.4860779345035553, + "learning_rate": 4.002080443828017e-06, + "loss": 0.0087, + "step": 8659 + }, + { + "epoch": 5.999307239348805, + "grad_norm": 0.31936201453208923, + "learning_rate": 4.001386962552012e-06, + "loss": 0.0057, + "step": 8660 + }, + { + "epoch": 6.0, + "grad_norm": 0.3616134226322174, + "learning_rate": 4.000693481276006e-06, + "loss": 0.0079, + "step": 8661 + }, + { + "epoch": 6.0, + "eval_loss": 0.28647318482398987, + "eval_runtime": 7657.2252, + "eval_samples_per_second": 1.045, + "eval_steps_per_second": 0.033, + "eval_wer": 12.49403077209779, + "step": 8661 + }, + { + "epoch": 6.000692760651195, + "grad_norm": 0.18963384628295898, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0033, + "step": 8662 + }, + { + "epoch": 6.00138552130239, + "grad_norm": 0.21622170507907867, + "learning_rate": 3.999306518723995e-06, + "loss": 0.0042, + "step": 8663 + }, + { + "epoch": 6.002078281953585, + "grad_norm": 0.3468954563140869, + "learning_rate": 3.998613037447989e-06, + "loss": 0.0066, + "step": 8664 + }, + { + "epoch": 6.00277104260478, + "grad_norm": 0.1983821988105774, + "learning_rate": 3.997919556171984e-06, + "loss": 0.0039, + "step": 8665 + }, + { + "epoch": 6.003463803255975, + "grad_norm": 0.2540256381034851, + "learning_rate": 3.997226074895978e-06, + "loss": 0.0056, + "step": 8666 + }, + { + "epoch": 6.00415656390717, + "grad_norm": 0.22275549173355103, + "learning_rate": 3.996532593619973e-06, + "loss": 0.0043, + "step": 8667 + }, + { + "epoch": 6.004849324558365, + "grad_norm": 0.31321004033088684, + "learning_rate": 3.995839112343967e-06, + "loss": 0.004, + "step": 8668 + }, + { + "epoch": 6.0055420852095605, + "grad_norm": 0.3807545006275177, + "learning_rate": 3.995145631067962e-06, + "loss": 0.0044, + "step": 8669 + }, + { + "epoch": 6.006234845860755, + "grad_norm": 0.3322286903858185, + "learning_rate": 3.994452149791956e-06, + "loss": 0.0058, + "step": 8670 + }, + { + "epoch": 6.00692760651195, + "grad_norm": 0.2569510340690613, + "learning_rate": 3.99375866851595e-06, + "loss": 0.0037, + "step": 8671 + }, + { + "epoch": 6.007620367163145, + "grad_norm": 0.17756123840808868, + "learning_rate": 3.993065187239945e-06, + "loss": 0.0038, + "step": 8672 + }, + { + "epoch": 6.00831312781434, + "grad_norm": 0.35195431113243103, + "learning_rate": 3.992371705963939e-06, + "loss": 0.0063, + "step": 8673 + }, + { + "epoch": 6.009005888465535, + "grad_norm": 0.1642107516527176, + "learning_rate": 3.991678224687934e-06, + "loss": 0.0029, + "step": 8674 + }, + { + "epoch": 6.00969864911673, + "grad_norm": 0.2845403850078583, + "learning_rate": 3.990984743411928e-06, + "loss": 0.004, + "step": 8675 + }, + { + "epoch": 6.010391409767925, + "grad_norm": 0.22731952369213104, + "learning_rate": 3.990291262135922e-06, + "loss": 0.0045, + "step": 8676 + }, + { + "epoch": 6.01108417041912, + "grad_norm": 0.3325885236263275, + "learning_rate": 3.989597780859917e-06, + "loss": 0.0041, + "step": 8677 + }, + { + "epoch": 6.011776931070315, + "grad_norm": 0.20064549148082733, + "learning_rate": 3.988904299583912e-06, + "loss": 0.0038, + "step": 8678 + }, + { + "epoch": 6.012469691721511, + "grad_norm": 0.1457013040781021, + "learning_rate": 3.988210818307906e-06, + "loss": 0.0029, + "step": 8679 + }, + { + "epoch": 6.013162452372705, + "grad_norm": 0.171246737241745, + "learning_rate": 3.9875173370319e-06, + "loss": 0.0026, + "step": 8680 + }, + { + "epoch": 6.0138552130239, + "grad_norm": 0.1949104517698288, + "learning_rate": 3.986823855755895e-06, + "loss": 0.0032, + "step": 8681 + }, + { + "epoch": 6.0145479736750955, + "grad_norm": 0.1342078000307083, + "learning_rate": 3.98613037447989e-06, + "loss": 0.003, + "step": 8682 + }, + { + "epoch": 6.01524073432629, + "grad_norm": 0.3117418885231018, + "learning_rate": 3.985436893203884e-06, + "loss": 0.0052, + "step": 8683 + }, + { + "epoch": 6.015933494977485, + "grad_norm": 0.28929945826530457, + "learning_rate": 3.984743411927878e-06, + "loss": 0.005, + "step": 8684 + }, + { + "epoch": 6.0166262556286805, + "grad_norm": 0.32413914799690247, + "learning_rate": 3.984049930651872e-06, + "loss": 0.0054, + "step": 8685 + }, + { + "epoch": 6.017319016279875, + "grad_norm": 0.3699895739555359, + "learning_rate": 3.983356449375867e-06, + "loss": 0.004, + "step": 8686 + }, + { + "epoch": 6.01801177693107, + "grad_norm": 0.23407377302646637, + "learning_rate": 3.982662968099862e-06, + "loss": 0.0032, + "step": 8687 + }, + { + "epoch": 6.018704537582265, + "grad_norm": 0.2936857342720032, + "learning_rate": 3.981969486823856e-06, + "loss": 0.0044, + "step": 8688 + }, + { + "epoch": 6.019397298233461, + "grad_norm": 0.43710726499557495, + "learning_rate": 3.98127600554785e-06, + "loss": 0.0044, + "step": 8689 + }, + { + "epoch": 6.020090058884655, + "grad_norm": 0.2992812693119049, + "learning_rate": 3.980582524271845e-06, + "loss": 0.0057, + "step": 8690 + }, + { + "epoch": 6.02078281953585, + "grad_norm": 0.3120293617248535, + "learning_rate": 3.979889042995839e-06, + "loss": 0.0039, + "step": 8691 + }, + { + "epoch": 6.021475580187046, + "grad_norm": 0.18237251043319702, + "learning_rate": 3.979195561719834e-06, + "loss": 0.0035, + "step": 8692 + }, + { + "epoch": 6.02216834083824, + "grad_norm": 0.1504957675933838, + "learning_rate": 3.978502080443828e-06, + "loss": 0.0023, + "step": 8693 + }, + { + "epoch": 6.022861101489435, + "grad_norm": 0.5236929655075073, + "learning_rate": 3.9778085991678224e-06, + "loss": 0.0043, + "step": 8694 + }, + { + "epoch": 6.023553862140631, + "grad_norm": 0.08897154033184052, + "learning_rate": 3.977115117891817e-06, + "loss": 0.0024, + "step": 8695 + }, + { + "epoch": 6.024246622791825, + "grad_norm": 0.24160565435886383, + "learning_rate": 3.9764216366158115e-06, + "loss": 0.0035, + "step": 8696 + }, + { + "epoch": 6.02493938344302, + "grad_norm": 0.21429160237312317, + "learning_rate": 3.975728155339806e-06, + "loss": 0.0033, + "step": 8697 + }, + { + "epoch": 6.0256321440942155, + "grad_norm": 0.19688105583190918, + "learning_rate": 3.9750346740638005e-06, + "loss": 0.0033, + "step": 8698 + }, + { + "epoch": 6.026324904745411, + "grad_norm": 0.14925949275493622, + "learning_rate": 3.974341192787795e-06, + "loss": 0.0029, + "step": 8699 + }, + { + "epoch": 6.027017665396605, + "grad_norm": 0.33217620849609375, + "learning_rate": 3.9736477115117895e-06, + "loss": 0.0042, + "step": 8700 + }, + { + "epoch": 6.0277104260478005, + "grad_norm": 0.3502248227596283, + "learning_rate": 3.972954230235784e-06, + "loss": 0.0101, + "step": 8701 + }, + { + "epoch": 6.028403186698996, + "grad_norm": 0.1795787215232849, + "learning_rate": 3.9722607489597785e-06, + "loss": 0.0033, + "step": 8702 + }, + { + "epoch": 6.02909594735019, + "grad_norm": 0.2041216939687729, + "learning_rate": 3.971567267683773e-06, + "loss": 0.0039, + "step": 8703 + }, + { + "epoch": 6.0297887080013854, + "grad_norm": 0.19797015190124512, + "learning_rate": 3.9708737864077675e-06, + "loss": 0.0034, + "step": 8704 + }, + { + "epoch": 6.030481468652581, + "grad_norm": 0.2640591561794281, + "learning_rate": 3.970180305131762e-06, + "loss": 0.0039, + "step": 8705 + }, + { + "epoch": 6.031174229303775, + "grad_norm": 0.2120644450187683, + "learning_rate": 3.969486823855756e-06, + "loss": 0.004, + "step": 8706 + }, + { + "epoch": 6.03186698995497, + "grad_norm": 0.4690014719963074, + "learning_rate": 3.968793342579751e-06, + "loss": 0.0065, + "step": 8707 + }, + { + "epoch": 6.032559750606166, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.968099861303746e-06, + "loss": 0.0034, + "step": 8708 + }, + { + "epoch": 6.033252511257361, + "grad_norm": 0.2979377806186676, + "learning_rate": 3.96740638002774e-06, + "loss": 0.0059, + "step": 8709 + }, + { + "epoch": 6.033945271908555, + "grad_norm": 0.14429903030395508, + "learning_rate": 3.966712898751734e-06, + "loss": 0.0031, + "step": 8710 + }, + { + "epoch": 6.034638032559751, + "grad_norm": 0.2544229328632355, + "learning_rate": 3.966019417475729e-06, + "loss": 0.0038, + "step": 8711 + }, + { + "epoch": 6.035330793210946, + "grad_norm": 0.13747026026248932, + "learning_rate": 3.965325936199723e-06, + "loss": 0.0029, + "step": 8712 + }, + { + "epoch": 6.03602355386214, + "grad_norm": 0.3124599754810333, + "learning_rate": 3.964632454923718e-06, + "loss": 0.005, + "step": 8713 + }, + { + "epoch": 6.036716314513336, + "grad_norm": 0.14773334562778473, + "learning_rate": 3.963938973647712e-06, + "loss": 0.003, + "step": 8714 + }, + { + "epoch": 6.037409075164531, + "grad_norm": 0.40750443935394287, + "learning_rate": 3.963245492371706e-06, + "loss": 0.004, + "step": 8715 + }, + { + "epoch": 6.038101835815725, + "grad_norm": 0.1526937633752823, + "learning_rate": 3.962552011095701e-06, + "loss": 0.0025, + "step": 8716 + }, + { + "epoch": 6.0387945964669205, + "grad_norm": 0.17284125089645386, + "learning_rate": 3.961858529819696e-06, + "loss": 0.0035, + "step": 8717 + }, + { + "epoch": 6.039487357118116, + "grad_norm": 0.3093487620353699, + "learning_rate": 3.96116504854369e-06, + "loss": 0.0038, + "step": 8718 + }, + { + "epoch": 6.040180117769311, + "grad_norm": 0.4244932532310486, + "learning_rate": 3.960471567267684e-06, + "loss": 0.0062, + "step": 8719 + }, + { + "epoch": 6.0408728784205055, + "grad_norm": 0.20396049320697784, + "learning_rate": 3.959778085991679e-06, + "loss": 0.0034, + "step": 8720 + }, + { + "epoch": 6.041565639071701, + "grad_norm": 0.20464245975017548, + "learning_rate": 3.959084604715673e-06, + "loss": 0.0033, + "step": 8721 + }, + { + "epoch": 6.042258399722896, + "grad_norm": 0.17733533680438995, + "learning_rate": 3.958391123439668e-06, + "loss": 0.0034, + "step": 8722 + }, + { + "epoch": 6.04295116037409, + "grad_norm": 0.5362047553062439, + "learning_rate": 3.957697642163662e-06, + "loss": 0.0059, + "step": 8723 + }, + { + "epoch": 6.043643921025286, + "grad_norm": 0.19199024140834808, + "learning_rate": 3.957004160887656e-06, + "loss": 0.0035, + "step": 8724 + }, + { + "epoch": 6.044336681676481, + "grad_norm": 0.2026488333940506, + "learning_rate": 3.956310679611651e-06, + "loss": 0.0034, + "step": 8725 + }, + { + "epoch": 6.045029442327675, + "grad_norm": 0.34202784299850464, + "learning_rate": 3.955617198335645e-06, + "loss": 0.006, + "step": 8726 + }, + { + "epoch": 6.045722202978871, + "grad_norm": 0.18430756032466888, + "learning_rate": 3.95492371705964e-06, + "loss": 0.0028, + "step": 8727 + }, + { + "epoch": 6.046414963630066, + "grad_norm": 0.2855680584907532, + "learning_rate": 3.954230235783634e-06, + "loss": 0.0046, + "step": 8728 + }, + { + "epoch": 6.047107724281261, + "grad_norm": 0.20251581072807312, + "learning_rate": 3.953536754507629e-06, + "loss": 0.0026, + "step": 8729 + }, + { + "epoch": 6.047800484932456, + "grad_norm": 0.35224398970603943, + "learning_rate": 3.952843273231623e-06, + "loss": 0.004, + "step": 8730 + }, + { + "epoch": 6.048493245583651, + "grad_norm": 0.23019564151763916, + "learning_rate": 3.952149791955617e-06, + "loss": 0.0033, + "step": 8731 + }, + { + "epoch": 6.049186006234846, + "grad_norm": 0.20461471378803253, + "learning_rate": 3.951456310679612e-06, + "loss": 0.0033, + "step": 8732 + }, + { + "epoch": 6.0498787668860405, + "grad_norm": 0.482501745223999, + "learning_rate": 3.950762829403606e-06, + "loss": 0.0077, + "step": 8733 + }, + { + "epoch": 6.050571527537236, + "grad_norm": 0.1371850222349167, + "learning_rate": 3.950069348127601e-06, + "loss": 0.0029, + "step": 8734 + }, + { + "epoch": 6.051264288188431, + "grad_norm": 0.26176711916923523, + "learning_rate": 3.949375866851595e-06, + "loss": 0.0048, + "step": 8735 + }, + { + "epoch": 6.0519570488396255, + "grad_norm": 0.17871302366256714, + "learning_rate": 3.948682385575589e-06, + "loss": 0.0031, + "step": 8736 + }, + { + "epoch": 6.052649809490821, + "grad_norm": 0.353868305683136, + "learning_rate": 3.947988904299584e-06, + "loss": 0.0034, + "step": 8737 + }, + { + "epoch": 6.053342570142016, + "grad_norm": 0.15032900869846344, + "learning_rate": 3.947295423023579e-06, + "loss": 0.0034, + "step": 8738 + }, + { + "epoch": 6.054035330793211, + "grad_norm": 0.19332966208457947, + "learning_rate": 3.946601941747573e-06, + "loss": 0.0035, + "step": 8739 + }, + { + "epoch": 6.054728091444406, + "grad_norm": 0.14572642743587494, + "learning_rate": 3.945908460471567e-06, + "loss": 0.003, + "step": 8740 + }, + { + "epoch": 6.055420852095601, + "grad_norm": 0.1290379911661148, + "learning_rate": 3.9452149791955614e-06, + "loss": 0.0026, + "step": 8741 + }, + { + "epoch": 6.056113612746796, + "grad_norm": 0.1428215503692627, + "learning_rate": 3.944521497919556e-06, + "loss": 0.0024, + "step": 8742 + }, + { + "epoch": 6.056806373397991, + "grad_norm": 0.27382057905197144, + "learning_rate": 3.943828016643551e-06, + "loss": 0.0039, + "step": 8743 + }, + { + "epoch": 6.057499134049186, + "grad_norm": 0.1334439218044281, + "learning_rate": 3.943134535367545e-06, + "loss": 0.0025, + "step": 8744 + }, + { + "epoch": 6.058191894700381, + "grad_norm": 0.33077719807624817, + "learning_rate": 3.9424410540915395e-06, + "loss": 0.0048, + "step": 8745 + }, + { + "epoch": 6.058884655351576, + "grad_norm": 0.21852080523967743, + "learning_rate": 3.941747572815534e-06, + "loss": 0.0034, + "step": 8746 + }, + { + "epoch": 6.059577416002771, + "grad_norm": 0.1549639254808426, + "learning_rate": 3.941054091539529e-06, + "loss": 0.0033, + "step": 8747 + }, + { + "epoch": 6.060270176653966, + "grad_norm": 0.31655940413475037, + "learning_rate": 3.9403606102635234e-06, + "loss": 0.0054, + "step": 8748 + }, + { + "epoch": 6.0609629373051614, + "grad_norm": 0.2889525890350342, + "learning_rate": 3.9396671289875175e-06, + "loss": 0.0037, + "step": 8749 + }, + { + "epoch": 6.061655697956356, + "grad_norm": 0.25274062156677246, + "learning_rate": 3.938973647711512e-06, + "loss": 0.0039, + "step": 8750 + }, + { + "epoch": 6.062348458607551, + "grad_norm": 0.26409873366355896, + "learning_rate": 3.9382801664355065e-06, + "loss": 0.0045, + "step": 8751 + }, + { + "epoch": 6.063041219258746, + "grad_norm": 0.3435945212841034, + "learning_rate": 3.9375866851595015e-06, + "loss": 0.0041, + "step": 8752 + }, + { + "epoch": 6.063733979909941, + "grad_norm": 0.15153640508651733, + "learning_rate": 3.9368932038834956e-06, + "loss": 0.0042, + "step": 8753 + }, + { + "epoch": 6.064426740561136, + "grad_norm": 0.1768701821565628, + "learning_rate": 3.93619972260749e-06, + "loss": 0.0034, + "step": 8754 + }, + { + "epoch": 6.065119501212331, + "grad_norm": 0.2529245913028717, + "learning_rate": 3.935506241331485e-06, + "loss": 0.0053, + "step": 8755 + }, + { + "epoch": 6.065812261863526, + "grad_norm": 0.35842397809028625, + "learning_rate": 3.934812760055479e-06, + "loss": 0.0035, + "step": 8756 + }, + { + "epoch": 6.066505022514721, + "grad_norm": 0.17603647708892822, + "learning_rate": 3.934119278779474e-06, + "loss": 0.0028, + "step": 8757 + }, + { + "epoch": 6.067197783165916, + "grad_norm": 0.36040037870407104, + "learning_rate": 3.933425797503468e-06, + "loss": 0.0038, + "step": 8758 + }, + { + "epoch": 6.067890543817112, + "grad_norm": 0.2548173666000366, + "learning_rate": 3.932732316227463e-06, + "loss": 0.0035, + "step": 8759 + }, + { + "epoch": 6.068583304468306, + "grad_norm": 0.24014891684055328, + "learning_rate": 3.932038834951457e-06, + "loss": 0.0052, + "step": 8760 + }, + { + "epoch": 6.069276065119501, + "grad_norm": 0.14708179235458374, + "learning_rate": 3.931345353675451e-06, + "loss": 0.0032, + "step": 8761 + }, + { + "epoch": 6.0699688257706965, + "grad_norm": 0.2969667911529541, + "learning_rate": 3.930651872399446e-06, + "loss": 0.0059, + "step": 8762 + }, + { + "epoch": 6.070661586421891, + "grad_norm": 0.3556962311267853, + "learning_rate": 3.92995839112344e-06, + "loss": 0.005, + "step": 8763 + }, + { + "epoch": 6.071354347073086, + "grad_norm": 0.21978433430194855, + "learning_rate": 3.929264909847435e-06, + "loss": 0.0033, + "step": 8764 + }, + { + "epoch": 6.0720471077242815, + "grad_norm": 0.1866694539785385, + "learning_rate": 3.928571428571429e-06, + "loss": 0.0038, + "step": 8765 + }, + { + "epoch": 6.072739868375476, + "grad_norm": 0.1878751814365387, + "learning_rate": 3.927877947295423e-06, + "loss": 0.0034, + "step": 8766 + }, + { + "epoch": 6.073432629026671, + "grad_norm": 0.2639264166355133, + "learning_rate": 3.927184466019418e-06, + "loss": 0.0049, + "step": 8767 + }, + { + "epoch": 6.074125389677866, + "grad_norm": 0.16790176928043365, + "learning_rate": 3.926490984743413e-06, + "loss": 0.0035, + "step": 8768 + }, + { + "epoch": 6.074818150329062, + "grad_norm": 0.21823951601982117, + "learning_rate": 3.925797503467407e-06, + "loss": 0.0044, + "step": 8769 + }, + { + "epoch": 6.075510910980256, + "grad_norm": 0.25152066349983215, + "learning_rate": 3.925104022191401e-06, + "loss": 0.0029, + "step": 8770 + }, + { + "epoch": 6.076203671631451, + "grad_norm": 0.27631130814552307, + "learning_rate": 3.924410540915395e-06, + "loss": 0.0051, + "step": 8771 + }, + { + "epoch": 6.076896432282647, + "grad_norm": 0.18931017816066742, + "learning_rate": 3.92371705963939e-06, + "loss": 0.0028, + "step": 8772 + }, + { + "epoch": 6.077589192933841, + "grad_norm": 0.15689823031425476, + "learning_rate": 3.923023578363385e-06, + "loss": 0.0028, + "step": 8773 + }, + { + "epoch": 6.078281953585036, + "grad_norm": 0.15919512510299683, + "learning_rate": 3.922330097087379e-06, + "loss": 0.0034, + "step": 8774 + }, + { + "epoch": 6.078974714236232, + "grad_norm": 0.2085564285516739, + "learning_rate": 3.921636615811373e-06, + "loss": 0.0049, + "step": 8775 + }, + { + "epoch": 6.079667474887426, + "grad_norm": 0.20154698193073273, + "learning_rate": 3.920943134535368e-06, + "loss": 0.004, + "step": 8776 + }, + { + "epoch": 6.080360235538621, + "grad_norm": 0.32937222719192505, + "learning_rate": 3.920249653259363e-06, + "loss": 0.0074, + "step": 8777 + }, + { + "epoch": 6.0810529961898165, + "grad_norm": 0.36560726165771484, + "learning_rate": 3.919556171983357e-06, + "loss": 0.0038, + "step": 8778 + }, + { + "epoch": 6.081745756841012, + "grad_norm": 0.15626446902751923, + "learning_rate": 3.918862690707351e-06, + "loss": 0.0027, + "step": 8779 + }, + { + "epoch": 6.082438517492206, + "grad_norm": 0.5248779654502869, + "learning_rate": 3.918169209431345e-06, + "loss": 0.003, + "step": 8780 + }, + { + "epoch": 6.0831312781434015, + "grad_norm": 0.1562829613685608, + "learning_rate": 3.91747572815534e-06, + "loss": 0.0038, + "step": 8781 + }, + { + "epoch": 6.083824038794597, + "grad_norm": 0.16246558725833893, + "learning_rate": 3.916782246879335e-06, + "loss": 0.0033, + "step": 8782 + }, + { + "epoch": 6.084516799445791, + "grad_norm": 0.11507923901081085, + "learning_rate": 3.916088765603329e-06, + "loss": 0.0028, + "step": 8783 + }, + { + "epoch": 6.085209560096986, + "grad_norm": 0.26600173115730286, + "learning_rate": 3.915395284327323e-06, + "loss": 0.0047, + "step": 8784 + }, + { + "epoch": 6.085902320748182, + "grad_norm": 0.26927924156188965, + "learning_rate": 3.914701803051318e-06, + "loss": 0.0036, + "step": 8785 + }, + { + "epoch": 6.086595081399376, + "grad_norm": 0.3145221173763275, + "learning_rate": 3.914008321775312e-06, + "loss": 0.0034, + "step": 8786 + }, + { + "epoch": 6.087287842050571, + "grad_norm": 0.27800190448760986, + "learning_rate": 3.913314840499307e-06, + "loss": 0.0029, + "step": 8787 + }, + { + "epoch": 6.087980602701767, + "grad_norm": 0.19154535233974457, + "learning_rate": 3.912621359223301e-06, + "loss": 0.0036, + "step": 8788 + }, + { + "epoch": 6.088673363352962, + "grad_norm": 0.8603630065917969, + "learning_rate": 3.911927877947295e-06, + "loss": 0.0046, + "step": 8789 + }, + { + "epoch": 6.089366124004156, + "grad_norm": 0.3638884127140045, + "learning_rate": 3.91123439667129e-06, + "loss": 0.003, + "step": 8790 + }, + { + "epoch": 6.090058884655352, + "grad_norm": 0.19491951167583466, + "learning_rate": 3.910540915395284e-06, + "loss": 0.0037, + "step": 8791 + }, + { + "epoch": 6.090751645306547, + "grad_norm": 0.22996185719966888, + "learning_rate": 3.909847434119279e-06, + "loss": 0.0034, + "step": 8792 + }, + { + "epoch": 6.091444405957741, + "grad_norm": 0.29953622817993164, + "learning_rate": 3.909153952843273e-06, + "loss": 0.0026, + "step": 8793 + }, + { + "epoch": 6.092137166608937, + "grad_norm": 0.1870751529932022, + "learning_rate": 3.908460471567268e-06, + "loss": 0.0029, + "step": 8794 + }, + { + "epoch": 6.092829927260132, + "grad_norm": 0.20994138717651367, + "learning_rate": 3.9077669902912624e-06, + "loss": 0.0039, + "step": 8795 + }, + { + "epoch": 6.093522687911326, + "grad_norm": 0.1280861347913742, + "learning_rate": 3.9070735090152565e-06, + "loss": 0.0029, + "step": 8796 + }, + { + "epoch": 6.0942154485625215, + "grad_norm": 0.33243700861930847, + "learning_rate": 3.9063800277392515e-06, + "loss": 0.0038, + "step": 8797 + }, + { + "epoch": 6.094908209213717, + "grad_norm": 0.21579419076442719, + "learning_rate": 3.905686546463246e-06, + "loss": 0.0027, + "step": 8798 + }, + { + "epoch": 6.095600969864912, + "grad_norm": 0.3789650499820709, + "learning_rate": 3.9049930651872405e-06, + "loss": 0.0067, + "step": 8799 + }, + { + "epoch": 6.0962937305161065, + "grad_norm": 0.25341564416885376, + "learning_rate": 3.9042995839112346e-06, + "loss": 0.0032, + "step": 8800 + }, + { + "epoch": 6.096986491167302, + "grad_norm": 0.27467408776283264, + "learning_rate": 3.903606102635229e-06, + "loss": 0.0049, + "step": 8801 + }, + { + "epoch": 6.097679251818497, + "grad_norm": 0.14345599710941315, + "learning_rate": 3.902912621359224e-06, + "loss": 0.0026, + "step": 8802 + }, + { + "epoch": 6.098372012469691, + "grad_norm": 0.2033206969499588, + "learning_rate": 3.9022191400832185e-06, + "loss": 0.0032, + "step": 8803 + }, + { + "epoch": 6.099064773120887, + "grad_norm": 0.17043140530586243, + "learning_rate": 3.901525658807213e-06, + "loss": 0.0031, + "step": 8804 + }, + { + "epoch": 6.099757533772082, + "grad_norm": 0.23245030641555786, + "learning_rate": 3.900832177531207e-06, + "loss": 0.0038, + "step": 8805 + }, + { + "epoch": 6.100450294423276, + "grad_norm": 0.27172887325286865, + "learning_rate": 3.900138696255202e-06, + "loss": 0.0035, + "step": 8806 + }, + { + "epoch": 6.101143055074472, + "grad_norm": 0.17954404652118683, + "learning_rate": 3.8994452149791966e-06, + "loss": 0.003, + "step": 8807 + }, + { + "epoch": 6.101835815725667, + "grad_norm": 0.20926862955093384, + "learning_rate": 3.898751733703191e-06, + "loss": 0.003, + "step": 8808 + }, + { + "epoch": 6.102528576376862, + "grad_norm": 0.27428656816482544, + "learning_rate": 3.898058252427185e-06, + "loss": 0.004, + "step": 8809 + }, + { + "epoch": 6.103221337028057, + "grad_norm": 0.22884447872638702, + "learning_rate": 3.897364771151179e-06, + "loss": 0.0026, + "step": 8810 + }, + { + "epoch": 6.103914097679252, + "grad_norm": 0.13015517592430115, + "learning_rate": 3.896671289875174e-06, + "loss": 0.0027, + "step": 8811 + }, + { + "epoch": 6.104606858330447, + "grad_norm": 0.19904249906539917, + "learning_rate": 3.895977808599169e-06, + "loss": 0.003, + "step": 8812 + }, + { + "epoch": 6.1052996189816415, + "grad_norm": 0.1918070763349533, + "learning_rate": 3.895284327323163e-06, + "loss": 0.0039, + "step": 8813 + }, + { + "epoch": 6.105992379632837, + "grad_norm": 0.15057818591594696, + "learning_rate": 3.894590846047157e-06, + "loss": 0.0032, + "step": 8814 + }, + { + "epoch": 6.106685140284032, + "grad_norm": 0.09813568741083145, + "learning_rate": 3.893897364771152e-06, + "loss": 0.002, + "step": 8815 + }, + { + "epoch": 6.1073779009352265, + "grad_norm": 0.45844826102256775, + "learning_rate": 3.893203883495146e-06, + "loss": 0.0032, + "step": 8816 + }, + { + "epoch": 6.108070661586422, + "grad_norm": 0.2586424648761749, + "learning_rate": 3.892510402219141e-06, + "loss": 0.0039, + "step": 8817 + }, + { + "epoch": 6.108763422237617, + "grad_norm": 0.184055358171463, + "learning_rate": 3.891816920943135e-06, + "loss": 0.0032, + "step": 8818 + }, + { + "epoch": 6.109456182888812, + "grad_norm": 0.23174357414245605, + "learning_rate": 3.891123439667129e-06, + "loss": 0.0031, + "step": 8819 + }, + { + "epoch": 6.110148943540007, + "grad_norm": 0.23705334961414337, + "learning_rate": 3.890429958391124e-06, + "loss": 0.0026, + "step": 8820 + }, + { + "epoch": 6.110841704191202, + "grad_norm": 0.12069892883300781, + "learning_rate": 3.889736477115118e-06, + "loss": 0.0026, + "step": 8821 + }, + { + "epoch": 6.111534464842397, + "grad_norm": 0.19660265743732452, + "learning_rate": 3.889042995839113e-06, + "loss": 0.0033, + "step": 8822 + }, + { + "epoch": 6.112227225493592, + "grad_norm": 0.2262856662273407, + "learning_rate": 3.888349514563107e-06, + "loss": 0.0028, + "step": 8823 + }, + { + "epoch": 6.112919986144787, + "grad_norm": 0.1962341070175171, + "learning_rate": 3.887656033287102e-06, + "loss": 0.0032, + "step": 8824 + }, + { + "epoch": 6.113612746795982, + "grad_norm": 0.2887985408306122, + "learning_rate": 3.886962552011096e-06, + "loss": 0.0033, + "step": 8825 + }, + { + "epoch": 6.114305507447177, + "grad_norm": 0.17293819785118103, + "learning_rate": 3.88626907073509e-06, + "loss": 0.0033, + "step": 8826 + }, + { + "epoch": 6.114998268098372, + "grad_norm": 0.47911661863327026, + "learning_rate": 3.885575589459085e-06, + "loss": 0.0052, + "step": 8827 + }, + { + "epoch": 6.115691028749567, + "grad_norm": 0.24279531836509705, + "learning_rate": 3.884882108183079e-06, + "loss": 0.0036, + "step": 8828 + }, + { + "epoch": 6.116383789400762, + "grad_norm": 0.19363507628440857, + "learning_rate": 3.884188626907074e-06, + "loss": 0.003, + "step": 8829 + }, + { + "epoch": 6.117076550051957, + "grad_norm": 0.2090897113084793, + "learning_rate": 3.883495145631068e-06, + "loss": 0.0029, + "step": 8830 + }, + { + "epoch": 6.117769310703152, + "grad_norm": 0.512018620967865, + "learning_rate": 3.882801664355062e-06, + "loss": 0.0063, + "step": 8831 + }, + { + "epoch": 6.118462071354347, + "grad_norm": 0.2961897552013397, + "learning_rate": 3.882108183079057e-06, + "loss": 0.0036, + "step": 8832 + }, + { + "epoch": 6.119154832005542, + "grad_norm": 0.12247592955827713, + "learning_rate": 3.881414701803052e-06, + "loss": 0.0027, + "step": 8833 + }, + { + "epoch": 6.119847592656737, + "grad_norm": 0.4169470965862274, + "learning_rate": 3.880721220527046e-06, + "loss": 0.0041, + "step": 8834 + }, + { + "epoch": 6.120540353307932, + "grad_norm": 0.15056347846984863, + "learning_rate": 3.88002773925104e-06, + "loss": 0.0025, + "step": 8835 + }, + { + "epoch": 6.121233113959127, + "grad_norm": 0.36095133423805237, + "learning_rate": 3.879334257975035e-06, + "loss": 0.0035, + "step": 8836 + }, + { + "epoch": 6.121925874610322, + "grad_norm": 0.2863277792930603, + "learning_rate": 3.878640776699029e-06, + "loss": 0.0059, + "step": 8837 + }, + { + "epoch": 6.122618635261517, + "grad_norm": 0.36874857544898987, + "learning_rate": 3.877947295423024e-06, + "loss": 0.0055, + "step": 8838 + }, + { + "epoch": 6.123311395912713, + "grad_norm": 0.1970696896314621, + "learning_rate": 3.877253814147018e-06, + "loss": 0.0035, + "step": 8839 + }, + { + "epoch": 6.124004156563907, + "grad_norm": 0.34227070212364197, + "learning_rate": 3.876560332871012e-06, + "loss": 0.004, + "step": 8840 + }, + { + "epoch": 6.124696917215102, + "grad_norm": 0.13455091416835785, + "learning_rate": 3.875866851595007e-06, + "loss": 0.002, + "step": 8841 + }, + { + "epoch": 6.1253896778662975, + "grad_norm": 0.17953209578990936, + "learning_rate": 3.875173370319002e-06, + "loss": 0.0027, + "step": 8842 + }, + { + "epoch": 6.126082438517492, + "grad_norm": 0.23272685706615448, + "learning_rate": 3.874479889042996e-06, + "loss": 0.0034, + "step": 8843 + }, + { + "epoch": 6.126775199168687, + "grad_norm": 0.14728665351867676, + "learning_rate": 3.8737864077669905e-06, + "loss": 0.0026, + "step": 8844 + }, + { + "epoch": 6.1274679598198825, + "grad_norm": 0.27838802337646484, + "learning_rate": 3.873092926490985e-06, + "loss": 0.0037, + "step": 8845 + }, + { + "epoch": 6.128160720471077, + "grad_norm": 0.26910194754600525, + "learning_rate": 3.8723994452149795e-06, + "loss": 0.0069, + "step": 8846 + }, + { + "epoch": 6.128853481122272, + "grad_norm": 0.18215250968933105, + "learning_rate": 3.8717059639389736e-06, + "loss": 0.0028, + "step": 8847 + }, + { + "epoch": 6.129546241773467, + "grad_norm": 0.26187875866889954, + "learning_rate": 3.8710124826629685e-06, + "loss": 0.0047, + "step": 8848 + }, + { + "epoch": 6.130239002424663, + "grad_norm": 0.4058743417263031, + "learning_rate": 3.870319001386963e-06, + "loss": 0.0051, + "step": 8849 + }, + { + "epoch": 6.130931763075857, + "grad_norm": 0.2035028338432312, + "learning_rate": 3.8696255201109575e-06, + "loss": 0.0032, + "step": 8850 + }, + { + "epoch": 6.131624523727052, + "grad_norm": 0.34671515226364136, + "learning_rate": 3.868932038834952e-06, + "loss": 0.0033, + "step": 8851 + }, + { + "epoch": 6.132317284378248, + "grad_norm": 0.27328479290008545, + "learning_rate": 3.868238557558946e-06, + "loss": 0.0042, + "step": 8852 + }, + { + "epoch": 6.133010045029442, + "grad_norm": 0.16158245503902435, + "learning_rate": 3.867545076282941e-06, + "loss": 0.0039, + "step": 8853 + }, + { + "epoch": 6.133702805680637, + "grad_norm": 0.2262895405292511, + "learning_rate": 3.8668515950069356e-06, + "loss": 0.0042, + "step": 8854 + }, + { + "epoch": 6.134395566331833, + "grad_norm": 0.19910332560539246, + "learning_rate": 3.86615811373093e-06, + "loss": 0.0027, + "step": 8855 + }, + { + "epoch": 6.135088326983027, + "grad_norm": 0.332500696182251, + "learning_rate": 3.865464632454924e-06, + "loss": 0.0065, + "step": 8856 + }, + { + "epoch": 6.135781087634222, + "grad_norm": 0.20140956342220306, + "learning_rate": 3.864771151178918e-06, + "loss": 0.0036, + "step": 8857 + }, + { + "epoch": 6.1364738482854175, + "grad_norm": 0.18646806478500366, + "learning_rate": 3.864077669902913e-06, + "loss": 0.0036, + "step": 8858 + }, + { + "epoch": 6.137166608936613, + "grad_norm": 0.3805655539035797, + "learning_rate": 3.863384188626908e-06, + "loss": 0.0036, + "step": 8859 + }, + { + "epoch": 6.137859369587807, + "grad_norm": 0.3395026922225952, + "learning_rate": 3.862690707350902e-06, + "loss": 0.0049, + "step": 8860 + }, + { + "epoch": 6.1385521302390025, + "grad_norm": 0.14605139195919037, + "learning_rate": 3.861997226074896e-06, + "loss": 0.0024, + "step": 8861 + }, + { + "epoch": 6.139244890890198, + "grad_norm": 0.22433006763458252, + "learning_rate": 3.861303744798891e-06, + "loss": 0.0035, + "step": 8862 + }, + { + "epoch": 6.139937651541392, + "grad_norm": 0.20515076816082, + "learning_rate": 3.860610263522886e-06, + "loss": 0.0033, + "step": 8863 + }, + { + "epoch": 6.140630412192587, + "grad_norm": 0.18175749480724335, + "learning_rate": 3.85991678224688e-06, + "loss": 0.0031, + "step": 8864 + }, + { + "epoch": 6.141323172843783, + "grad_norm": 0.2024599313735962, + "learning_rate": 3.859223300970874e-06, + "loss": 0.0034, + "step": 8865 + }, + { + "epoch": 6.142015933494977, + "grad_norm": 0.21010836958885193, + "learning_rate": 3.858529819694868e-06, + "loss": 0.0035, + "step": 8866 + }, + { + "epoch": 6.142708694146172, + "grad_norm": 0.15664875507354736, + "learning_rate": 3.857836338418863e-06, + "loss": 0.0035, + "step": 8867 + }, + { + "epoch": 6.143401454797368, + "grad_norm": 0.21486541628837585, + "learning_rate": 3.857142857142858e-06, + "loss": 0.0031, + "step": 8868 + }, + { + "epoch": 6.144094215448563, + "grad_norm": 0.25232917070388794, + "learning_rate": 3.856449375866852e-06, + "loss": 0.0053, + "step": 8869 + }, + { + "epoch": 6.144786976099757, + "grad_norm": 0.2670959234237671, + "learning_rate": 3.855755894590846e-06, + "loss": 0.003, + "step": 8870 + }, + { + "epoch": 6.145479736750953, + "grad_norm": 0.16176125407218933, + "learning_rate": 3.855062413314841e-06, + "loss": 0.0033, + "step": 8871 + }, + { + "epoch": 6.146172497402148, + "grad_norm": 0.16566051542758942, + "learning_rate": 3.854368932038835e-06, + "loss": 0.0036, + "step": 8872 + }, + { + "epoch": 6.146865258053342, + "grad_norm": 0.297456294298172, + "learning_rate": 3.85367545076283e-06, + "loss": 0.0037, + "step": 8873 + }, + { + "epoch": 6.1475580187045376, + "grad_norm": 0.16764581203460693, + "learning_rate": 3.852981969486824e-06, + "loss": 0.0034, + "step": 8874 + }, + { + "epoch": 6.148250779355733, + "grad_norm": 0.2256138175725937, + "learning_rate": 3.852288488210819e-06, + "loss": 0.0033, + "step": 8875 + }, + { + "epoch": 6.148943540006927, + "grad_norm": 0.23719216883182526, + "learning_rate": 3.851595006934813e-06, + "loss": 0.0033, + "step": 8876 + }, + { + "epoch": 6.1496363006581225, + "grad_norm": 0.4487541913986206, + "learning_rate": 3.850901525658807e-06, + "loss": 0.0029, + "step": 8877 + }, + { + "epoch": 6.150329061309318, + "grad_norm": 0.14292246103286743, + "learning_rate": 3.850208044382802e-06, + "loss": 0.0029, + "step": 8878 + }, + { + "epoch": 6.151021821960513, + "grad_norm": 0.25498414039611816, + "learning_rate": 3.849514563106796e-06, + "loss": 0.004, + "step": 8879 + }, + { + "epoch": 6.1517145826117074, + "grad_norm": 0.19159753620624542, + "learning_rate": 3.848821081830791e-06, + "loss": 0.0032, + "step": 8880 + }, + { + "epoch": 6.152407343262903, + "grad_norm": 0.24804987013339996, + "learning_rate": 3.848127600554785e-06, + "loss": 0.0028, + "step": 8881 + }, + { + "epoch": 6.153100103914098, + "grad_norm": 0.2274547666311264, + "learning_rate": 3.847434119278779e-06, + "loss": 0.0027, + "step": 8882 + }, + { + "epoch": 6.153792864565292, + "grad_norm": 0.3069051206111908, + "learning_rate": 3.846740638002774e-06, + "loss": 0.0063, + "step": 8883 + }, + { + "epoch": 6.154485625216488, + "grad_norm": 0.15738233923912048, + "learning_rate": 3.846047156726769e-06, + "loss": 0.0033, + "step": 8884 + }, + { + "epoch": 6.155178385867683, + "grad_norm": 0.23025180399417877, + "learning_rate": 3.845353675450763e-06, + "loss": 0.0031, + "step": 8885 + }, + { + "epoch": 6.155871146518877, + "grad_norm": 0.18047180771827698, + "learning_rate": 3.844660194174757e-06, + "loss": 0.0035, + "step": 8886 + }, + { + "epoch": 6.156563907170073, + "grad_norm": 0.18933671712875366, + "learning_rate": 3.843966712898751e-06, + "loss": 0.003, + "step": 8887 + }, + { + "epoch": 6.157256667821268, + "grad_norm": 0.9460041522979736, + "learning_rate": 3.843273231622746e-06, + "loss": 0.0063, + "step": 8888 + }, + { + "epoch": 6.157949428472463, + "grad_norm": 0.16287913918495178, + "learning_rate": 3.842579750346741e-06, + "loss": 0.003, + "step": 8889 + }, + { + "epoch": 6.158642189123658, + "grad_norm": 0.23413816094398499, + "learning_rate": 3.841886269070735e-06, + "loss": 0.0043, + "step": 8890 + }, + { + "epoch": 6.159334949774853, + "grad_norm": 0.1540103405714035, + "learning_rate": 3.8411927877947295e-06, + "loss": 0.0027, + "step": 8891 + }, + { + "epoch": 6.160027710426048, + "grad_norm": 0.22090907394886017, + "learning_rate": 3.840499306518724e-06, + "loss": 0.0036, + "step": 8892 + }, + { + "epoch": 6.1607204710772425, + "grad_norm": 0.5158216953277588, + "learning_rate": 3.839805825242719e-06, + "loss": 0.0063, + "step": 8893 + }, + { + "epoch": 6.161413231728438, + "grad_norm": 0.14484673738479614, + "learning_rate": 3.839112343966713e-06, + "loss": 0.0027, + "step": 8894 + }, + { + "epoch": 6.162105992379633, + "grad_norm": 0.4610350430011749, + "learning_rate": 3.8384188626907075e-06, + "loss": 0.01, + "step": 8895 + }, + { + "epoch": 6.1627987530308275, + "grad_norm": 0.1954057514667511, + "learning_rate": 3.837725381414702e-06, + "loss": 0.0023, + "step": 8896 + }, + { + "epoch": 6.163491513682023, + "grad_norm": 0.1904076486825943, + "learning_rate": 3.8370319001386965e-06, + "loss": 0.0041, + "step": 8897 + }, + { + "epoch": 6.164184274333218, + "grad_norm": 0.3598301112651825, + "learning_rate": 3.8363384188626914e-06, + "loss": 0.0023, + "step": 8898 + }, + { + "epoch": 6.164877034984413, + "grad_norm": 0.3696455657482147, + "learning_rate": 3.8356449375866855e-06, + "loss": 0.0053, + "step": 8899 + }, + { + "epoch": 6.165569795635608, + "grad_norm": 0.22022351622581482, + "learning_rate": 3.83495145631068e-06, + "loss": 0.0041, + "step": 8900 + }, + { + "epoch": 6.166262556286803, + "grad_norm": 0.13488420844078064, + "learning_rate": 3.8342579750346746e-06, + "loss": 0.0026, + "step": 8901 + }, + { + "epoch": 6.166955316937998, + "grad_norm": 0.18695005774497986, + "learning_rate": 3.833564493758669e-06, + "loss": 0.003, + "step": 8902 + }, + { + "epoch": 6.167648077589193, + "grad_norm": 0.2914468050003052, + "learning_rate": 3.8328710124826636e-06, + "loss": 0.0055, + "step": 8903 + }, + { + "epoch": 6.168340838240388, + "grad_norm": 0.28238797187805176, + "learning_rate": 3.832177531206658e-06, + "loss": 0.0054, + "step": 8904 + }, + { + "epoch": 6.169033598891583, + "grad_norm": 0.46053773164749146, + "learning_rate": 3.831484049930652e-06, + "loss": 0.0053, + "step": 8905 + }, + { + "epoch": 6.169726359542778, + "grad_norm": 0.23768968880176544, + "learning_rate": 3.830790568654647e-06, + "loss": 0.004, + "step": 8906 + }, + { + "epoch": 6.170419120193973, + "grad_norm": 0.26686280965805054, + "learning_rate": 3.830097087378641e-06, + "loss": 0.0042, + "step": 8907 + }, + { + "epoch": 6.171111880845168, + "grad_norm": 0.3420552611351013, + "learning_rate": 3.829403606102636e-06, + "loss": 0.0049, + "step": 8908 + }, + { + "epoch": 6.171804641496363, + "grad_norm": 0.2604672312736511, + "learning_rate": 3.82871012482663e-06, + "loss": 0.0041, + "step": 8909 + }, + { + "epoch": 6.172497402147558, + "grad_norm": 0.18973568081855774, + "learning_rate": 3.828016643550625e-06, + "loss": 0.0027, + "step": 8910 + }, + { + "epoch": 6.173190162798753, + "grad_norm": 0.2012564092874527, + "learning_rate": 3.827323162274619e-06, + "loss": 0.0037, + "step": 8911 + }, + { + "epoch": 6.173882923449948, + "grad_norm": 0.1997278928756714, + "learning_rate": 3.826629680998613e-06, + "loss": 0.0043, + "step": 8912 + }, + { + "epoch": 6.174575684101143, + "grad_norm": 0.5193144679069519, + "learning_rate": 3.825936199722608e-06, + "loss": 0.0067, + "step": 8913 + }, + { + "epoch": 6.175268444752338, + "grad_norm": 0.26644983887672424, + "learning_rate": 3.825242718446602e-06, + "loss": 0.0036, + "step": 8914 + }, + { + "epoch": 6.175961205403533, + "grad_norm": 0.15375620126724243, + "learning_rate": 3.824549237170597e-06, + "loss": 0.0032, + "step": 8915 + }, + { + "epoch": 6.176653966054728, + "grad_norm": 0.1635560542345047, + "learning_rate": 3.823855755894591e-06, + "loss": 0.003, + "step": 8916 + }, + { + "epoch": 6.177346726705923, + "grad_norm": 0.5810548067092896, + "learning_rate": 3.823162274618585e-06, + "loss": 0.0041, + "step": 8917 + }, + { + "epoch": 6.178039487357118, + "grad_norm": 0.21411103010177612, + "learning_rate": 3.82246879334258e-06, + "loss": 0.0034, + "step": 8918 + }, + { + "epoch": 6.1787322480083136, + "grad_norm": 0.17415635287761688, + "learning_rate": 3.821775312066575e-06, + "loss": 0.0026, + "step": 8919 + }, + { + "epoch": 6.179425008659508, + "grad_norm": 0.17464856803417206, + "learning_rate": 3.821081830790569e-06, + "loss": 0.0024, + "step": 8920 + }, + { + "epoch": 6.180117769310703, + "grad_norm": 0.1878119260072708, + "learning_rate": 3.820388349514563e-06, + "loss": 0.0034, + "step": 8921 + }, + { + "epoch": 6.1808105299618985, + "grad_norm": 0.2702119052410126, + "learning_rate": 3.819694868238558e-06, + "loss": 0.004, + "step": 8922 + }, + { + "epoch": 6.181503290613093, + "grad_norm": 0.2339838743209839, + "learning_rate": 3.819001386962553e-06, + "loss": 0.0031, + "step": 8923 + }, + { + "epoch": 6.182196051264288, + "grad_norm": 0.18894006311893463, + "learning_rate": 3.818307905686547e-06, + "loss": 0.0038, + "step": 8924 + }, + { + "epoch": 6.1828888119154835, + "grad_norm": 0.13782832026481628, + "learning_rate": 3.817614424410541e-06, + "loss": 0.0031, + "step": 8925 + }, + { + "epoch": 6.183581572566678, + "grad_norm": 0.19864635169506073, + "learning_rate": 3.816920943134535e-06, + "loss": 0.0045, + "step": 8926 + }, + { + "epoch": 6.184274333217873, + "grad_norm": 1.2593319416046143, + "learning_rate": 3.81622746185853e-06, + "loss": 0.006, + "step": 8927 + }, + { + "epoch": 6.184967093869068, + "grad_norm": 0.1914646029472351, + "learning_rate": 3.815533980582525e-06, + "loss": 0.0035, + "step": 8928 + }, + { + "epoch": 6.185659854520264, + "grad_norm": 0.20021909475326538, + "learning_rate": 3.814840499306519e-06, + "loss": 0.003, + "step": 8929 + }, + { + "epoch": 6.186352615171458, + "grad_norm": 0.3417031764984131, + "learning_rate": 3.8141470180305136e-06, + "loss": 0.0041, + "step": 8930 + }, + { + "epoch": 6.187045375822653, + "grad_norm": 0.13619321584701538, + "learning_rate": 3.8134535367545077e-06, + "loss": 0.0027, + "step": 8931 + }, + { + "epoch": 6.187738136473849, + "grad_norm": 0.2461363822221756, + "learning_rate": 3.8127600554785022e-06, + "loss": 0.0028, + "step": 8932 + }, + { + "epoch": 6.188430897125043, + "grad_norm": 0.10541187971830368, + "learning_rate": 3.812066574202497e-06, + "loss": 0.002, + "step": 8933 + }, + { + "epoch": 6.189123657776238, + "grad_norm": 0.1136985495686531, + "learning_rate": 3.8113730929264913e-06, + "loss": 0.0027, + "step": 8934 + }, + { + "epoch": 6.189816418427434, + "grad_norm": 0.31341642141342163, + "learning_rate": 3.8106796116504858e-06, + "loss": 0.004, + "step": 8935 + }, + { + "epoch": 6.190509179078628, + "grad_norm": 0.23905427753925323, + "learning_rate": 3.80998613037448e-06, + "loss": 0.0039, + "step": 8936 + }, + { + "epoch": 6.191201939729823, + "grad_norm": 0.28960779309272766, + "learning_rate": 3.8092926490984744e-06, + "loss": 0.0045, + "step": 8937 + }, + { + "epoch": 6.1918947003810185, + "grad_norm": 0.1412201076745987, + "learning_rate": 3.8085991678224693e-06, + "loss": 0.0028, + "step": 8938 + }, + { + "epoch": 6.192587461032213, + "grad_norm": 0.13628625869750977, + "learning_rate": 3.807905686546464e-06, + "loss": 0.0025, + "step": 8939 + }, + { + "epoch": 6.193280221683408, + "grad_norm": 0.1394803524017334, + "learning_rate": 3.807212205270458e-06, + "loss": 0.0027, + "step": 8940 + }, + { + "epoch": 6.1939729823346035, + "grad_norm": 0.35515934228897095, + "learning_rate": 3.8065187239944524e-06, + "loss": 0.0035, + "step": 8941 + }, + { + "epoch": 6.194665742985799, + "grad_norm": 0.5569301843643188, + "learning_rate": 3.8058252427184465e-06, + "loss": 0.0041, + "step": 8942 + }, + { + "epoch": 6.195358503636993, + "grad_norm": 0.3013700544834137, + "learning_rate": 3.8051317614424414e-06, + "loss": 0.0033, + "step": 8943 + }, + { + "epoch": 6.196051264288188, + "grad_norm": 0.25852084159851074, + "learning_rate": 3.804438280166436e-06, + "loss": 0.0037, + "step": 8944 + }, + { + "epoch": 6.196744024939384, + "grad_norm": 0.3149358928203583, + "learning_rate": 3.8037447988904304e-06, + "loss": 0.0043, + "step": 8945 + }, + { + "epoch": 6.197436785590578, + "grad_norm": 0.5216568112373352, + "learning_rate": 3.8030513176144245e-06, + "loss": 0.0046, + "step": 8946 + }, + { + "epoch": 6.198129546241773, + "grad_norm": 0.4101194739341736, + "learning_rate": 3.802357836338419e-06, + "loss": 0.0041, + "step": 8947 + }, + { + "epoch": 6.198822306892969, + "grad_norm": 0.2276059091091156, + "learning_rate": 3.801664355062414e-06, + "loss": 0.0038, + "step": 8948 + }, + { + "epoch": 6.199515067544164, + "grad_norm": 0.37515920400619507, + "learning_rate": 3.800970873786408e-06, + "loss": 0.0043, + "step": 8949 + }, + { + "epoch": 6.200207828195358, + "grad_norm": 0.3137504458427429, + "learning_rate": 3.8002773925104026e-06, + "loss": 0.0032, + "step": 8950 + }, + { + "epoch": 6.200900588846554, + "grad_norm": 0.21343262493610382, + "learning_rate": 3.7995839112343967e-06, + "loss": 0.0037, + "step": 8951 + }, + { + "epoch": 6.201593349497749, + "grad_norm": 0.17816847562789917, + "learning_rate": 3.798890429958391e-06, + "loss": 0.0037, + "step": 8952 + }, + { + "epoch": 6.202286110148943, + "grad_norm": 0.386277973651886, + "learning_rate": 3.798196948682386e-06, + "loss": 0.0051, + "step": 8953 + }, + { + "epoch": 6.2029788708001385, + "grad_norm": 0.3896544277667999, + "learning_rate": 3.7975034674063806e-06, + "loss": 0.0054, + "step": 8954 + }, + { + "epoch": 6.203671631451334, + "grad_norm": 0.2156912237405777, + "learning_rate": 3.7968099861303747e-06, + "loss": 0.0029, + "step": 8955 + }, + { + "epoch": 6.204364392102528, + "grad_norm": 0.17616605758666992, + "learning_rate": 3.796116504854369e-06, + "loss": 0.0028, + "step": 8956 + }, + { + "epoch": 6.2050571527537235, + "grad_norm": 0.20233075320720673, + "learning_rate": 3.7954230235783633e-06, + "loss": 0.003, + "step": 8957 + }, + { + "epoch": 6.205749913404919, + "grad_norm": 0.44021105766296387, + "learning_rate": 3.7947295423023582e-06, + "loss": 0.0054, + "step": 8958 + }, + { + "epoch": 6.206442674056113, + "grad_norm": 0.2861934006214142, + "learning_rate": 3.7940360610263527e-06, + "loss": 0.0032, + "step": 8959 + }, + { + "epoch": 6.207135434707308, + "grad_norm": 0.39539748430252075, + "learning_rate": 3.793342579750347e-06, + "loss": 0.0044, + "step": 8960 + }, + { + "epoch": 6.207828195358504, + "grad_norm": 0.2111402004957199, + "learning_rate": 3.7926490984743413e-06, + "loss": 0.0041, + "step": 8961 + }, + { + "epoch": 6.208520956009699, + "grad_norm": 0.22310729324817657, + "learning_rate": 3.791955617198336e-06, + "loss": 0.0048, + "step": 8962 + }, + { + "epoch": 6.209213716660893, + "grad_norm": 0.18399444222450256, + "learning_rate": 3.7912621359223308e-06, + "loss": 0.0035, + "step": 8963 + }, + { + "epoch": 6.209906477312089, + "grad_norm": 0.18670117855072021, + "learning_rate": 3.790568654646325e-06, + "loss": 0.0042, + "step": 8964 + }, + { + "epoch": 6.210599237963284, + "grad_norm": 0.5374297499656677, + "learning_rate": 3.7898751733703194e-06, + "loss": 0.0046, + "step": 8965 + }, + { + "epoch": 6.211291998614478, + "grad_norm": 0.24439114332199097, + "learning_rate": 3.7891816920943135e-06, + "loss": 0.0051, + "step": 8966 + }, + { + "epoch": 6.211984759265674, + "grad_norm": 0.2810538411140442, + "learning_rate": 3.788488210818308e-06, + "loss": 0.0049, + "step": 8967 + }, + { + "epoch": 6.212677519916869, + "grad_norm": 0.2148081213235855, + "learning_rate": 3.787794729542303e-06, + "loss": 0.0033, + "step": 8968 + }, + { + "epoch": 6.213370280568064, + "grad_norm": 0.13280628621578217, + "learning_rate": 3.7871012482662974e-06, + "loss": 0.0024, + "step": 8969 + }, + { + "epoch": 6.214063041219259, + "grad_norm": 0.152400404214859, + "learning_rate": 3.7864077669902915e-06, + "loss": 0.0027, + "step": 8970 + }, + { + "epoch": 6.214755801870454, + "grad_norm": 0.14377716183662415, + "learning_rate": 3.785714285714286e-06, + "loss": 0.0025, + "step": 8971 + }, + { + "epoch": 6.215448562521649, + "grad_norm": 0.17485810816287994, + "learning_rate": 3.78502080443828e-06, + "loss": 0.0029, + "step": 8972 + }, + { + "epoch": 6.2161413231728435, + "grad_norm": 0.19294202327728271, + "learning_rate": 3.784327323162275e-06, + "loss": 0.0038, + "step": 8973 + }, + { + "epoch": 6.216834083824039, + "grad_norm": 0.23651130497455597, + "learning_rate": 3.7836338418862695e-06, + "loss": 0.003, + "step": 8974 + }, + { + "epoch": 6.217526844475234, + "grad_norm": 0.22184260189533234, + "learning_rate": 3.7829403606102636e-06, + "loss": 0.0045, + "step": 8975 + }, + { + "epoch": 6.2182196051264285, + "grad_norm": 0.24723675847053528, + "learning_rate": 3.782246879334258e-06, + "loss": 0.0033, + "step": 8976 + }, + { + "epoch": 6.218912365777624, + "grad_norm": 0.20020835101604462, + "learning_rate": 3.7815533980582526e-06, + "loss": 0.0054, + "step": 8977 + }, + { + "epoch": 6.219605126428819, + "grad_norm": 0.25527140498161316, + "learning_rate": 3.7808599167822476e-06, + "loss": 0.0037, + "step": 8978 + }, + { + "epoch": 6.220297887080013, + "grad_norm": 0.354015976190567, + "learning_rate": 3.7801664355062417e-06, + "loss": 0.0029, + "step": 8979 + }, + { + "epoch": 6.220990647731209, + "grad_norm": 0.23956342041492462, + "learning_rate": 3.779472954230236e-06, + "loss": 0.0043, + "step": 8980 + }, + { + "epoch": 6.221683408382404, + "grad_norm": 0.1630607545375824, + "learning_rate": 3.7787794729542303e-06, + "loss": 0.0029, + "step": 8981 + }, + { + "epoch": 6.222376169033599, + "grad_norm": 0.2843337059020996, + "learning_rate": 3.7780859916782248e-06, + "loss": 0.0064, + "step": 8982 + }, + { + "epoch": 6.223068929684794, + "grad_norm": 0.28237929940223694, + "learning_rate": 3.7773925104022197e-06, + "loss": 0.0042, + "step": 8983 + }, + { + "epoch": 6.223761690335989, + "grad_norm": 0.409702330827713, + "learning_rate": 3.776699029126214e-06, + "loss": 0.0052, + "step": 8984 + }, + { + "epoch": 6.224454450987184, + "grad_norm": 0.16289186477661133, + "learning_rate": 3.7760055478502083e-06, + "loss": 0.0027, + "step": 8985 + }, + { + "epoch": 6.225147211638379, + "grad_norm": 0.20507176220417023, + "learning_rate": 3.775312066574203e-06, + "loss": 0.0028, + "step": 8986 + }, + { + "epoch": 6.225839972289574, + "grad_norm": 0.20279560983181, + "learning_rate": 3.774618585298197e-06, + "loss": 0.0028, + "step": 8987 + }, + { + "epoch": 6.226532732940769, + "grad_norm": 0.1872999668121338, + "learning_rate": 3.773925104022192e-06, + "loss": 0.0036, + "step": 8988 + }, + { + "epoch": 6.227225493591964, + "grad_norm": 0.147945836186409, + "learning_rate": 3.7732316227461863e-06, + "loss": 0.0028, + "step": 8989 + }, + { + "epoch": 6.227918254243159, + "grad_norm": 0.4329070448875427, + "learning_rate": 3.7725381414701804e-06, + "loss": 0.0034, + "step": 8990 + }, + { + "epoch": 6.228611014894354, + "grad_norm": 0.21519504487514496, + "learning_rate": 3.771844660194175e-06, + "loss": 0.0036, + "step": 8991 + }, + { + "epoch": 6.229303775545549, + "grad_norm": 0.5421343445777893, + "learning_rate": 3.7711511789181694e-06, + "loss": 0.0045, + "step": 8992 + }, + { + "epoch": 6.229996536196744, + "grad_norm": 0.2251807004213333, + "learning_rate": 3.7704576976421644e-06, + "loss": 0.0034, + "step": 8993 + }, + { + "epoch": 6.230689296847939, + "grad_norm": 0.23039962351322174, + "learning_rate": 3.7697642163661585e-06, + "loss": 0.0026, + "step": 8994 + }, + { + "epoch": 6.231382057499134, + "grad_norm": 0.2990110218524933, + "learning_rate": 3.769070735090153e-06, + "loss": 0.0033, + "step": 8995 + }, + { + "epoch": 6.232074818150329, + "grad_norm": 0.5308482050895691, + "learning_rate": 3.768377253814147e-06, + "loss": 0.0049, + "step": 8996 + }, + { + "epoch": 6.232767578801524, + "grad_norm": 0.20214372873306274, + "learning_rate": 3.7676837725381416e-06, + "loss": 0.0023, + "step": 8997 + }, + { + "epoch": 6.233460339452719, + "grad_norm": 0.175472691655159, + "learning_rate": 3.7669902912621365e-06, + "loss": 0.0025, + "step": 8998 + }, + { + "epoch": 6.234153100103914, + "grad_norm": 0.22035038471221924, + "learning_rate": 3.7662968099861306e-06, + "loss": 0.0034, + "step": 8999 + }, + { + "epoch": 6.234845860755109, + "grad_norm": 0.3284253478050232, + "learning_rate": 3.765603328710125e-06, + "loss": 0.0037, + "step": 9000 + }, + { + "epoch": 6.235538621406304, + "grad_norm": 0.20636343955993652, + "learning_rate": 3.7649098474341196e-06, + "loss": 0.0028, + "step": 9001 + }, + { + "epoch": 6.2362313820574995, + "grad_norm": 0.40715131163597107, + "learning_rate": 3.7642163661581137e-06, + "loss": 0.0054, + "step": 9002 + }, + { + "epoch": 6.236924142708694, + "grad_norm": 0.428727388381958, + "learning_rate": 3.7635228848821086e-06, + "loss": 0.0037, + "step": 9003 + }, + { + "epoch": 6.237616903359889, + "grad_norm": 0.28343698382377625, + "learning_rate": 3.762829403606103e-06, + "loss": 0.0048, + "step": 9004 + }, + { + "epoch": 6.238309664011084, + "grad_norm": 0.4409272372722626, + "learning_rate": 3.7621359223300972e-06, + "loss": 0.0047, + "step": 9005 + }, + { + "epoch": 6.239002424662279, + "grad_norm": 0.167841374874115, + "learning_rate": 3.7614424410540917e-06, + "loss": 0.003, + "step": 9006 + }, + { + "epoch": 6.239695185313474, + "grad_norm": 0.2910407781600952, + "learning_rate": 3.7607489597780863e-06, + "loss": 0.0036, + "step": 9007 + }, + { + "epoch": 6.240387945964669, + "grad_norm": 0.23353824019432068, + "learning_rate": 3.7600554785020808e-06, + "loss": 0.0033, + "step": 9008 + }, + { + "epoch": 6.241080706615865, + "grad_norm": 0.17986933887004852, + "learning_rate": 3.7593619972260753e-06, + "loss": 0.0041, + "step": 9009 + }, + { + "epoch": 6.241773467267059, + "grad_norm": 0.27407851815223694, + "learning_rate": 3.7586685159500698e-06, + "loss": 0.0045, + "step": 9010 + }, + { + "epoch": 6.242466227918254, + "grad_norm": 0.42741575837135315, + "learning_rate": 3.757975034674064e-06, + "loss": 0.0036, + "step": 9011 + }, + { + "epoch": 6.24315898856945, + "grad_norm": 0.17926374077796936, + "learning_rate": 3.7572815533980584e-06, + "loss": 0.0026, + "step": 9012 + }, + { + "epoch": 6.243851749220644, + "grad_norm": 0.1855120211839676, + "learning_rate": 3.7565880721220533e-06, + "loss": 0.0033, + "step": 9013 + }, + { + "epoch": 6.244544509871839, + "grad_norm": 0.16290566325187683, + "learning_rate": 3.7558945908460474e-06, + "loss": 0.0027, + "step": 9014 + }, + { + "epoch": 6.245237270523035, + "grad_norm": 0.20021256804466248, + "learning_rate": 3.755201109570042e-06, + "loss": 0.0032, + "step": 9015 + }, + { + "epoch": 6.245930031174229, + "grad_norm": 0.1684054583311081, + "learning_rate": 3.7545076282940364e-06, + "loss": 0.0035, + "step": 9016 + }, + { + "epoch": 6.246622791825424, + "grad_norm": 0.34178709983825684, + "learning_rate": 3.7538141470180305e-06, + "loss": 0.0046, + "step": 9017 + }, + { + "epoch": 6.2473155524766195, + "grad_norm": 0.19220145046710968, + "learning_rate": 3.7531206657420254e-06, + "loss": 0.0034, + "step": 9018 + }, + { + "epoch": 6.248008313127814, + "grad_norm": 0.20446428656578064, + "learning_rate": 3.75242718446602e-06, + "loss": 0.0035, + "step": 9019 + }, + { + "epoch": 6.248701073779009, + "grad_norm": 0.3254857361316681, + "learning_rate": 3.751733703190014e-06, + "loss": 0.0051, + "step": 9020 + }, + { + "epoch": 6.2493938344302045, + "grad_norm": 0.2723582684993744, + "learning_rate": 3.7510402219140085e-06, + "loss": 0.0058, + "step": 9021 + }, + { + "epoch": 6.2500865950814, + "grad_norm": 0.15010446310043335, + "learning_rate": 3.750346740638003e-06, + "loss": 0.0032, + "step": 9022 + }, + { + "epoch": 6.250779355732594, + "grad_norm": 0.1501779556274414, + "learning_rate": 3.7496532593619976e-06, + "loss": 0.0024, + "step": 9023 + }, + { + "epoch": 6.251472116383789, + "grad_norm": 0.17819537222385406, + "learning_rate": 3.748959778085992e-06, + "loss": 0.0032, + "step": 9024 + }, + { + "epoch": 6.252164877034985, + "grad_norm": 0.28279098868370056, + "learning_rate": 3.7482662968099866e-06, + "loss": 0.0039, + "step": 9025 + }, + { + "epoch": 6.252857637686179, + "grad_norm": 0.3389202952384949, + "learning_rate": 3.7475728155339807e-06, + "loss": 0.0036, + "step": 9026 + }, + { + "epoch": 6.253550398337374, + "grad_norm": 0.21087242662906647, + "learning_rate": 3.746879334257975e-06, + "loss": 0.0037, + "step": 9027 + }, + { + "epoch": 6.25424315898857, + "grad_norm": 0.3403334617614746, + "learning_rate": 3.74618585298197e-06, + "loss": 0.004, + "step": 9028 + }, + { + "epoch": 6.254935919639765, + "grad_norm": 0.28383970260620117, + "learning_rate": 3.745492371705964e-06, + "loss": 0.0035, + "step": 9029 + }, + { + "epoch": 6.255628680290959, + "grad_norm": 0.49126681685447693, + "learning_rate": 3.7447988904299587e-06, + "loss": 0.0046, + "step": 9030 + }, + { + "epoch": 6.256321440942155, + "grad_norm": 0.3545149266719818, + "learning_rate": 3.7441054091539532e-06, + "loss": 0.0038, + "step": 9031 + }, + { + "epoch": 6.25701420159335, + "grad_norm": 0.36593419313430786, + "learning_rate": 3.7434119278779473e-06, + "loss": 0.0044, + "step": 9032 + }, + { + "epoch": 6.257706962244544, + "grad_norm": 0.15204544365406036, + "learning_rate": 3.7427184466019422e-06, + "loss": 0.0033, + "step": 9033 + }, + { + "epoch": 6.2583997228957395, + "grad_norm": 0.19848747551441193, + "learning_rate": 3.7420249653259367e-06, + "loss": 0.0027, + "step": 9034 + }, + { + "epoch": 6.259092483546935, + "grad_norm": 0.26260146498680115, + "learning_rate": 3.741331484049931e-06, + "loss": 0.0028, + "step": 9035 + }, + { + "epoch": 6.259785244198129, + "grad_norm": 0.19948506355285645, + "learning_rate": 3.7406380027739253e-06, + "loss": 0.0033, + "step": 9036 + }, + { + "epoch": 6.2604780048493245, + "grad_norm": 0.17205795645713806, + "learning_rate": 3.73994452149792e-06, + "loss": 0.0031, + "step": 9037 + }, + { + "epoch": 6.26117076550052, + "grad_norm": 0.4595890939235687, + "learning_rate": 3.7392510402219144e-06, + "loss": 0.0046, + "step": 9038 + }, + { + "epoch": 6.261863526151714, + "grad_norm": 0.25202450156211853, + "learning_rate": 3.738557558945909e-06, + "loss": 0.0054, + "step": 9039 + }, + { + "epoch": 6.262556286802909, + "grad_norm": 0.2603699862957001, + "learning_rate": 3.7378640776699034e-06, + "loss": 0.0036, + "step": 9040 + }, + { + "epoch": 6.263249047454105, + "grad_norm": 0.19549359381198883, + "learning_rate": 3.7371705963938975e-06, + "loss": 0.0041, + "step": 9041 + }, + { + "epoch": 6.2639418081053, + "grad_norm": 0.19719092547893524, + "learning_rate": 3.736477115117892e-06, + "loss": 0.0046, + "step": 9042 + }, + { + "epoch": 6.264634568756494, + "grad_norm": 0.2393694818019867, + "learning_rate": 3.735783633841887e-06, + "loss": 0.0029, + "step": 9043 + }, + { + "epoch": 6.26532732940769, + "grad_norm": 0.2887903153896332, + "learning_rate": 3.735090152565881e-06, + "loss": 0.0049, + "step": 9044 + }, + { + "epoch": 6.266020090058885, + "grad_norm": 0.39004719257354736, + "learning_rate": 3.7343966712898755e-06, + "loss": 0.0063, + "step": 9045 + }, + { + "epoch": 6.266712850710079, + "grad_norm": 0.3693449795246124, + "learning_rate": 3.73370319001387e-06, + "loss": 0.0035, + "step": 9046 + }, + { + "epoch": 6.267405611361275, + "grad_norm": 0.34467846155166626, + "learning_rate": 3.733009708737864e-06, + "loss": 0.0039, + "step": 9047 + }, + { + "epoch": 6.26809837201247, + "grad_norm": 0.3364553451538086, + "learning_rate": 3.732316227461859e-06, + "loss": 0.0039, + "step": 9048 + }, + { + "epoch": 6.268791132663665, + "grad_norm": 0.23317886888980865, + "learning_rate": 3.7316227461858535e-06, + "loss": 0.0035, + "step": 9049 + }, + { + "epoch": 6.26948389331486, + "grad_norm": 0.42350003123283386, + "learning_rate": 3.7309292649098476e-06, + "loss": 0.0063, + "step": 9050 + }, + { + "epoch": 6.270176653966055, + "grad_norm": 0.3047276437282562, + "learning_rate": 3.730235783633842e-06, + "loss": 0.0047, + "step": 9051 + }, + { + "epoch": 6.27086941461725, + "grad_norm": 0.10471628606319427, + "learning_rate": 3.7295423023578362e-06, + "loss": 0.0023, + "step": 9052 + }, + { + "epoch": 6.2715621752684445, + "grad_norm": 0.18865561485290527, + "learning_rate": 3.728848821081831e-06, + "loss": 0.0044, + "step": 9053 + }, + { + "epoch": 6.27225493591964, + "grad_norm": 0.41251006722450256, + "learning_rate": 3.7281553398058257e-06, + "loss": 0.0037, + "step": 9054 + }, + { + "epoch": 6.272947696570835, + "grad_norm": 0.3191385567188263, + "learning_rate": 3.72746185852982e-06, + "loss": 0.0038, + "step": 9055 + }, + { + "epoch": 6.2736404572220295, + "grad_norm": 0.20665396749973297, + "learning_rate": 3.7267683772538143e-06, + "loss": 0.0043, + "step": 9056 + }, + { + "epoch": 6.274333217873225, + "grad_norm": 0.19908887147903442, + "learning_rate": 3.7260748959778088e-06, + "loss": 0.0038, + "step": 9057 + }, + { + "epoch": 6.27502597852442, + "grad_norm": 0.17986100912094116, + "learning_rate": 3.7253814147018037e-06, + "loss": 0.0035, + "step": 9058 + }, + { + "epoch": 6.275718739175614, + "grad_norm": 0.280498743057251, + "learning_rate": 3.724687933425798e-06, + "loss": 0.0049, + "step": 9059 + }, + { + "epoch": 6.27641149982681, + "grad_norm": 0.13933268189430237, + "learning_rate": 3.7239944521497923e-06, + "loss": 0.0028, + "step": 9060 + }, + { + "epoch": 6.277104260478005, + "grad_norm": 0.15762725472450256, + "learning_rate": 3.723300970873787e-06, + "loss": 0.0027, + "step": 9061 + }, + { + "epoch": 6.2777970211292, + "grad_norm": 0.25061821937561035, + "learning_rate": 3.722607489597781e-06, + "loss": 0.0029, + "step": 9062 + }, + { + "epoch": 6.278489781780395, + "grad_norm": 0.2139478325843811, + "learning_rate": 3.721914008321776e-06, + "loss": 0.0048, + "step": 9063 + }, + { + "epoch": 6.27918254243159, + "grad_norm": 0.2980344295501709, + "learning_rate": 3.7212205270457703e-06, + "loss": 0.0043, + "step": 9064 + }, + { + "epoch": 6.279875303082785, + "grad_norm": 0.17846964299678802, + "learning_rate": 3.7205270457697644e-06, + "loss": 0.0028, + "step": 9065 + }, + { + "epoch": 6.28056806373398, + "grad_norm": 0.3059777319431305, + "learning_rate": 3.719833564493759e-06, + "loss": 0.0046, + "step": 9066 + }, + { + "epoch": 6.281260824385175, + "grad_norm": 0.3137226700782776, + "learning_rate": 3.719140083217753e-06, + "loss": 0.0063, + "step": 9067 + }, + { + "epoch": 6.28195358503637, + "grad_norm": 0.1962714046239853, + "learning_rate": 3.718446601941748e-06, + "loss": 0.0033, + "step": 9068 + }, + { + "epoch": 6.282646345687565, + "grad_norm": 0.11992142349481583, + "learning_rate": 3.7177531206657425e-06, + "loss": 0.0024, + "step": 9069 + }, + { + "epoch": 6.28333910633876, + "grad_norm": 0.1776193380355835, + "learning_rate": 3.717059639389737e-06, + "loss": 0.0031, + "step": 9070 + }, + { + "epoch": 6.284031866989955, + "grad_norm": 0.25049513578414917, + "learning_rate": 3.716366158113731e-06, + "loss": 0.0043, + "step": 9071 + }, + { + "epoch": 6.28472462764115, + "grad_norm": 0.33176901936531067, + "learning_rate": 3.7156726768377256e-06, + "loss": 0.0039, + "step": 9072 + }, + { + "epoch": 6.285417388292345, + "grad_norm": 0.37309005856513977, + "learning_rate": 3.7149791955617205e-06, + "loss": 0.005, + "step": 9073 + }, + { + "epoch": 6.28611014894354, + "grad_norm": 0.18733099102973938, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.0029, + "step": 9074 + }, + { + "epoch": 6.286802909594735, + "grad_norm": 0.3493751585483551, + "learning_rate": 3.713592233009709e-06, + "loss": 0.0033, + "step": 9075 + }, + { + "epoch": 6.28749567024593, + "grad_norm": 0.1516132354736328, + "learning_rate": 3.712898751733703e-06, + "loss": 0.0029, + "step": 9076 + }, + { + "epoch": 6.288188430897125, + "grad_norm": 0.24067988991737366, + "learning_rate": 3.7122052704576977e-06, + "loss": 0.0038, + "step": 9077 + }, + { + "epoch": 6.28888119154832, + "grad_norm": 0.15279389917850494, + "learning_rate": 3.7115117891816926e-06, + "loss": 0.0038, + "step": 9078 + }, + { + "epoch": 6.289573952199515, + "grad_norm": 0.1782047152519226, + "learning_rate": 3.710818307905687e-06, + "loss": 0.0029, + "step": 9079 + }, + { + "epoch": 6.29026671285071, + "grad_norm": 0.17505787312984467, + "learning_rate": 3.7101248266296812e-06, + "loss": 0.0032, + "step": 9080 + }, + { + "epoch": 6.290959473501905, + "grad_norm": 0.22981473803520203, + "learning_rate": 3.7094313453536757e-06, + "loss": 0.0033, + "step": 9081 + }, + { + "epoch": 6.2916522341531005, + "grad_norm": 0.16924026608467102, + "learning_rate": 3.70873786407767e-06, + "loss": 0.0029, + "step": 9082 + }, + { + "epoch": 6.292344994804295, + "grad_norm": 0.18179747462272644, + "learning_rate": 3.7080443828016648e-06, + "loss": 0.0029, + "step": 9083 + }, + { + "epoch": 6.29303775545549, + "grad_norm": 0.21053244173526764, + "learning_rate": 3.7073509015256593e-06, + "loss": 0.0031, + "step": 9084 + }, + { + "epoch": 6.293730516106685, + "grad_norm": 0.19924941658973694, + "learning_rate": 3.7066574202496538e-06, + "loss": 0.0039, + "step": 9085 + }, + { + "epoch": 6.29442327675788, + "grad_norm": 0.2505203187465668, + "learning_rate": 3.705963938973648e-06, + "loss": 0.0034, + "step": 9086 + }, + { + "epoch": 6.295116037409075, + "grad_norm": 0.20534180104732513, + "learning_rate": 3.7052704576976424e-06, + "loss": 0.0036, + "step": 9087 + }, + { + "epoch": 6.29580879806027, + "grad_norm": 0.19099000096321106, + "learning_rate": 3.7045769764216373e-06, + "loss": 0.0027, + "step": 9088 + }, + { + "epoch": 6.296501558711465, + "grad_norm": 0.1680712103843689, + "learning_rate": 3.7038834951456314e-06, + "loss": 0.0037, + "step": 9089 + }, + { + "epoch": 6.29719431936266, + "grad_norm": 0.3262440860271454, + "learning_rate": 3.703190013869626e-06, + "loss": 0.0043, + "step": 9090 + }, + { + "epoch": 6.297887080013855, + "grad_norm": 0.21914781630039215, + "learning_rate": 3.70249653259362e-06, + "loss": 0.0034, + "step": 9091 + }, + { + "epoch": 6.298579840665051, + "grad_norm": 0.19973184168338776, + "learning_rate": 3.7018030513176145e-06, + "loss": 0.004, + "step": 9092 + }, + { + "epoch": 6.299272601316245, + "grad_norm": 0.7196505069732666, + "learning_rate": 3.7011095700416094e-06, + "loss": 0.0044, + "step": 9093 + }, + { + "epoch": 6.29996536196744, + "grad_norm": 0.15338726341724396, + "learning_rate": 3.700416088765604e-06, + "loss": 0.0031, + "step": 9094 + }, + { + "epoch": 6.300658122618636, + "grad_norm": 0.5259151458740234, + "learning_rate": 3.699722607489598e-06, + "loss": 0.0036, + "step": 9095 + }, + { + "epoch": 6.30135088326983, + "grad_norm": 0.5824711918830872, + "learning_rate": 3.6990291262135925e-06, + "loss": 0.0062, + "step": 9096 + }, + { + "epoch": 6.302043643921025, + "grad_norm": 0.2186335027217865, + "learning_rate": 3.6983356449375866e-06, + "loss": 0.0028, + "step": 9097 + }, + { + "epoch": 6.3027364045722205, + "grad_norm": 0.26481255888938904, + "learning_rate": 3.6976421636615816e-06, + "loss": 0.0036, + "step": 9098 + }, + { + "epoch": 6.303429165223415, + "grad_norm": 0.1980435848236084, + "learning_rate": 3.696948682385576e-06, + "loss": 0.0039, + "step": 9099 + }, + { + "epoch": 6.30412192587461, + "grad_norm": 0.3058741092681885, + "learning_rate": 3.69625520110957e-06, + "loss": 0.0045, + "step": 9100 + }, + { + "epoch": 6.3048146865258055, + "grad_norm": 0.1924455761909485, + "learning_rate": 3.6955617198335647e-06, + "loss": 0.0039, + "step": 9101 + }, + { + "epoch": 6.305507447177001, + "grad_norm": 0.45326587557792664, + "learning_rate": 3.694868238557559e-06, + "loss": 0.0062, + "step": 9102 + }, + { + "epoch": 6.306200207828195, + "grad_norm": 0.18722014129161835, + "learning_rate": 3.694174757281554e-06, + "loss": 0.0037, + "step": 9103 + }, + { + "epoch": 6.30689296847939, + "grad_norm": 0.17146499454975128, + "learning_rate": 3.693481276005548e-06, + "loss": 0.0026, + "step": 9104 + }, + { + "epoch": 6.307585729130586, + "grad_norm": 0.31631070375442505, + "learning_rate": 3.6927877947295427e-06, + "loss": 0.0036, + "step": 9105 + }, + { + "epoch": 6.30827848978178, + "grad_norm": 0.15640586614608765, + "learning_rate": 3.692094313453537e-06, + "loss": 0.0029, + "step": 9106 + }, + { + "epoch": 6.308971250432975, + "grad_norm": 0.23377051949501038, + "learning_rate": 3.6914008321775313e-06, + "loss": 0.005, + "step": 9107 + }, + { + "epoch": 6.309664011084171, + "grad_norm": 0.19398950040340424, + "learning_rate": 3.6907073509015262e-06, + "loss": 0.0029, + "step": 9108 + }, + { + "epoch": 6.310356771735365, + "grad_norm": 0.3420952558517456, + "learning_rate": 3.6900138696255208e-06, + "loss": 0.0034, + "step": 9109 + }, + { + "epoch": 6.31104953238656, + "grad_norm": 0.29911771416664124, + "learning_rate": 3.689320388349515e-06, + "loss": 0.0048, + "step": 9110 + }, + { + "epoch": 6.311742293037756, + "grad_norm": 0.2814820408821106, + "learning_rate": 3.6886269070735094e-06, + "loss": 0.0061, + "step": 9111 + }, + { + "epoch": 6.312435053688951, + "grad_norm": 0.22791898250579834, + "learning_rate": 3.6879334257975034e-06, + "loss": 0.0056, + "step": 9112 + }, + { + "epoch": 6.313127814340145, + "grad_norm": 0.15516510605812073, + "learning_rate": 3.6872399445214984e-06, + "loss": 0.003, + "step": 9113 + }, + { + "epoch": 6.3138205749913405, + "grad_norm": 0.1726953387260437, + "learning_rate": 3.686546463245493e-06, + "loss": 0.0027, + "step": 9114 + }, + { + "epoch": 6.314513335642536, + "grad_norm": 0.13942740857601166, + "learning_rate": 3.685852981969487e-06, + "loss": 0.0027, + "step": 9115 + }, + { + "epoch": 6.31520609629373, + "grad_norm": 0.37250882387161255, + "learning_rate": 3.6851595006934815e-06, + "loss": 0.0055, + "step": 9116 + }, + { + "epoch": 6.3158988569449255, + "grad_norm": 0.44557350873947144, + "learning_rate": 3.684466019417476e-06, + "loss": 0.0039, + "step": 9117 + }, + { + "epoch": 6.316591617596121, + "grad_norm": 0.2854665517807007, + "learning_rate": 3.683772538141471e-06, + "loss": 0.0043, + "step": 9118 + }, + { + "epoch": 6.317284378247315, + "grad_norm": 0.4197855591773987, + "learning_rate": 3.683079056865465e-06, + "loss": 0.0032, + "step": 9119 + }, + { + "epoch": 6.31797713889851, + "grad_norm": 0.2187676578760147, + "learning_rate": 3.6823855755894595e-06, + "loss": 0.0032, + "step": 9120 + }, + { + "epoch": 6.318669899549706, + "grad_norm": 0.19195012748241425, + "learning_rate": 3.6816920943134536e-06, + "loss": 0.0029, + "step": 9121 + }, + { + "epoch": 6.319362660200901, + "grad_norm": 0.2230643630027771, + "learning_rate": 3.680998613037448e-06, + "loss": 0.0039, + "step": 9122 + }, + { + "epoch": 6.320055420852095, + "grad_norm": 0.21015210449695587, + "learning_rate": 3.680305131761443e-06, + "loss": 0.0047, + "step": 9123 + }, + { + "epoch": 6.320748181503291, + "grad_norm": 0.1933145821094513, + "learning_rate": 3.679611650485437e-06, + "loss": 0.003, + "step": 9124 + }, + { + "epoch": 6.321440942154486, + "grad_norm": 0.22975307703018188, + "learning_rate": 3.6789181692094316e-06, + "loss": 0.0038, + "step": 9125 + }, + { + "epoch": 6.32213370280568, + "grad_norm": 0.21831192076206207, + "learning_rate": 3.678224687933426e-06, + "loss": 0.0036, + "step": 9126 + }, + { + "epoch": 6.322826463456876, + "grad_norm": 0.2836846113204956, + "learning_rate": 3.6775312066574202e-06, + "loss": 0.0035, + "step": 9127 + }, + { + "epoch": 6.323519224108071, + "grad_norm": 0.13700388371944427, + "learning_rate": 3.676837725381415e-06, + "loss": 0.003, + "step": 9128 + }, + { + "epoch": 6.324211984759265, + "grad_norm": 0.20510956645011902, + "learning_rate": 3.6761442441054097e-06, + "loss": 0.0033, + "step": 9129 + }, + { + "epoch": 6.3249047454104605, + "grad_norm": 0.1835632026195526, + "learning_rate": 3.6754507628294038e-06, + "loss": 0.0038, + "step": 9130 + }, + { + "epoch": 6.325597506061656, + "grad_norm": 0.26688703894615173, + "learning_rate": 3.6747572815533983e-06, + "loss": 0.0035, + "step": 9131 + }, + { + "epoch": 6.326290266712851, + "grad_norm": 0.21331727504730225, + "learning_rate": 3.6740638002773928e-06, + "loss": 0.0047, + "step": 9132 + }, + { + "epoch": 6.3269830273640455, + "grad_norm": 0.1545047014951706, + "learning_rate": 3.6733703190013877e-06, + "loss": 0.0023, + "step": 9133 + }, + { + "epoch": 6.327675788015241, + "grad_norm": 0.17784884572029114, + "learning_rate": 3.672676837725382e-06, + "loss": 0.0033, + "step": 9134 + }, + { + "epoch": 6.328368548666436, + "grad_norm": 0.1269831359386444, + "learning_rate": 3.6719833564493763e-06, + "loss": 0.0028, + "step": 9135 + }, + { + "epoch": 6.3290613093176304, + "grad_norm": 0.2010103464126587, + "learning_rate": 3.6712898751733704e-06, + "loss": 0.0034, + "step": 9136 + }, + { + "epoch": 6.329754069968826, + "grad_norm": 0.38688895106315613, + "learning_rate": 3.670596393897365e-06, + "loss": 0.0051, + "step": 9137 + }, + { + "epoch": 6.330446830620021, + "grad_norm": 0.2992427945137024, + "learning_rate": 3.66990291262136e-06, + "loss": 0.0037, + "step": 9138 + }, + { + "epoch": 6.331139591271215, + "grad_norm": 0.19218286871910095, + "learning_rate": 3.669209431345354e-06, + "loss": 0.0034, + "step": 9139 + }, + { + "epoch": 6.331832351922411, + "grad_norm": 0.27887171506881714, + "learning_rate": 3.6685159500693484e-06, + "loss": 0.0036, + "step": 9140 + }, + { + "epoch": 6.332525112573606, + "grad_norm": 0.2271146923303604, + "learning_rate": 3.667822468793343e-06, + "loss": 0.0041, + "step": 9141 + }, + { + "epoch": 6.333217873224801, + "grad_norm": 0.2160869836807251, + "learning_rate": 3.667128987517337e-06, + "loss": 0.0029, + "step": 9142 + }, + { + "epoch": 6.333910633875996, + "grad_norm": 0.2749139964580536, + "learning_rate": 3.666435506241332e-06, + "loss": 0.004, + "step": 9143 + }, + { + "epoch": 6.334603394527191, + "grad_norm": 0.24140483140945435, + "learning_rate": 3.6657420249653265e-06, + "loss": 0.0031, + "step": 9144 + }, + { + "epoch": 6.335296155178386, + "grad_norm": 0.22241345047950745, + "learning_rate": 3.6650485436893206e-06, + "loss": 0.0038, + "step": 9145 + }, + { + "epoch": 6.335988915829581, + "grad_norm": 0.33373576402664185, + "learning_rate": 3.664355062413315e-06, + "loss": 0.0044, + "step": 9146 + }, + { + "epoch": 6.336681676480776, + "grad_norm": 0.45689862966537476, + "learning_rate": 3.6636615811373096e-06, + "loss": 0.0042, + "step": 9147 + }, + { + "epoch": 6.337374437131971, + "grad_norm": 0.2773498594760895, + "learning_rate": 3.662968099861304e-06, + "loss": 0.0038, + "step": 9148 + }, + { + "epoch": 6.3380671977831655, + "grad_norm": 0.151902437210083, + "learning_rate": 3.6622746185852986e-06, + "loss": 0.0028, + "step": 9149 + }, + { + "epoch": 6.338759958434361, + "grad_norm": 0.30610865354537964, + "learning_rate": 3.661581137309293e-06, + "loss": 0.0057, + "step": 9150 + }, + { + "epoch": 6.339452719085556, + "grad_norm": 0.3921205699443817, + "learning_rate": 3.660887656033287e-06, + "loss": 0.005, + "step": 9151 + }, + { + "epoch": 6.340145479736751, + "grad_norm": 0.16900771856307983, + "learning_rate": 3.6601941747572817e-06, + "loss": 0.0033, + "step": 9152 + }, + { + "epoch": 6.340838240387946, + "grad_norm": 0.17052240669727325, + "learning_rate": 3.6595006934812766e-06, + "loss": 0.0037, + "step": 9153 + }, + { + "epoch": 6.341531001039141, + "grad_norm": 0.4854407012462616, + "learning_rate": 3.6588072122052707e-06, + "loss": 0.0045, + "step": 9154 + }, + { + "epoch": 6.342223761690336, + "grad_norm": 0.26672014594078064, + "learning_rate": 3.6581137309292652e-06, + "loss": 0.0033, + "step": 9155 + }, + { + "epoch": 6.342916522341531, + "grad_norm": 0.41833311319351196, + "learning_rate": 3.6574202496532598e-06, + "loss": 0.0051, + "step": 9156 + }, + { + "epoch": 6.343609282992726, + "grad_norm": 0.27049949765205383, + "learning_rate": 3.656726768377254e-06, + "loss": 0.0048, + "step": 9157 + }, + { + "epoch": 6.344302043643921, + "grad_norm": 0.41585874557495117, + "learning_rate": 3.6560332871012488e-06, + "loss": 0.0065, + "step": 9158 + }, + { + "epoch": 6.344994804295116, + "grad_norm": 0.17797231674194336, + "learning_rate": 3.6553398058252433e-06, + "loss": 0.0036, + "step": 9159 + }, + { + "epoch": 6.345687564946311, + "grad_norm": 0.16923600435256958, + "learning_rate": 3.6546463245492374e-06, + "loss": 0.0028, + "step": 9160 + }, + { + "epoch": 6.346380325597506, + "grad_norm": 0.43312036991119385, + "learning_rate": 3.653952843273232e-06, + "loss": 0.0045, + "step": 9161 + }, + { + "epoch": 6.3470730862487015, + "grad_norm": 0.1832997351884842, + "learning_rate": 3.6532593619972264e-06, + "loss": 0.0031, + "step": 9162 + }, + { + "epoch": 6.347765846899896, + "grad_norm": 0.21749837696552277, + "learning_rate": 3.652565880721221e-06, + "loss": 0.0031, + "step": 9163 + }, + { + "epoch": 6.348458607551091, + "grad_norm": 0.19867324829101562, + "learning_rate": 3.6518723994452154e-06, + "loss": 0.0033, + "step": 9164 + }, + { + "epoch": 6.349151368202286, + "grad_norm": 0.20064476132392883, + "learning_rate": 3.65117891816921e-06, + "loss": 0.0035, + "step": 9165 + }, + { + "epoch": 6.349844128853481, + "grad_norm": 0.39225077629089355, + "learning_rate": 3.650485436893204e-06, + "loss": 0.0058, + "step": 9166 + }, + { + "epoch": 6.350536889504676, + "grad_norm": 0.18574102222919464, + "learning_rate": 3.6497919556171985e-06, + "loss": 0.0025, + "step": 9167 + }, + { + "epoch": 6.351229650155871, + "grad_norm": 0.32468369603157043, + "learning_rate": 3.6490984743411934e-06, + "loss": 0.0043, + "step": 9168 + }, + { + "epoch": 6.351922410807066, + "grad_norm": 0.18796774744987488, + "learning_rate": 3.6484049930651875e-06, + "loss": 0.0031, + "step": 9169 + }, + { + "epoch": 6.352615171458261, + "grad_norm": 0.13876697421073914, + "learning_rate": 3.647711511789182e-06, + "loss": 0.0028, + "step": 9170 + }, + { + "epoch": 6.353307932109456, + "grad_norm": 0.2559381425380707, + "learning_rate": 3.6470180305131766e-06, + "loss": 0.0039, + "step": 9171 + }, + { + "epoch": 6.354000692760652, + "grad_norm": 0.10826985538005829, + "learning_rate": 3.6463245492371706e-06, + "loss": 0.0023, + "step": 9172 + }, + { + "epoch": 6.354693453411846, + "grad_norm": 0.2699757218360901, + "learning_rate": 3.6456310679611656e-06, + "loss": 0.0053, + "step": 9173 + }, + { + "epoch": 6.355386214063041, + "grad_norm": 0.29450279474258423, + "learning_rate": 3.64493758668516e-06, + "loss": 0.0033, + "step": 9174 + }, + { + "epoch": 6.3560789747142366, + "grad_norm": 0.45780789852142334, + "learning_rate": 3.644244105409154e-06, + "loss": 0.0029, + "step": 9175 + }, + { + "epoch": 6.356771735365431, + "grad_norm": 0.23914526402950287, + "learning_rate": 3.6435506241331487e-06, + "loss": 0.0043, + "step": 9176 + }, + { + "epoch": 6.357464496016626, + "grad_norm": 0.23474185168743134, + "learning_rate": 3.642857142857143e-06, + "loss": 0.0041, + "step": 9177 + }, + { + "epoch": 6.3581572566678215, + "grad_norm": 0.4006239175796509, + "learning_rate": 3.6421636615811377e-06, + "loss": 0.0041, + "step": 9178 + }, + { + "epoch": 6.358850017319016, + "grad_norm": 0.31819018721580505, + "learning_rate": 3.641470180305132e-06, + "loss": 0.0038, + "step": 9179 + }, + { + "epoch": 6.359542777970211, + "grad_norm": 0.18153499066829681, + "learning_rate": 3.6407766990291267e-06, + "loss": 0.0034, + "step": 9180 + }, + { + "epoch": 6.3602355386214064, + "grad_norm": 0.32186275720596313, + "learning_rate": 3.640083217753121e-06, + "loss": 0.0036, + "step": 9181 + }, + { + "epoch": 6.360928299272602, + "grad_norm": 0.16871248185634613, + "learning_rate": 3.6393897364771153e-06, + "loss": 0.0028, + "step": 9182 + }, + { + "epoch": 6.361621059923796, + "grad_norm": 0.2546533942222595, + "learning_rate": 3.6386962552011103e-06, + "loss": 0.0033, + "step": 9183 + }, + { + "epoch": 6.362313820574991, + "grad_norm": 0.30084696412086487, + "learning_rate": 3.6380027739251043e-06, + "loss": 0.0031, + "step": 9184 + }, + { + "epoch": 6.363006581226187, + "grad_norm": 0.24564731121063232, + "learning_rate": 3.637309292649099e-06, + "loss": 0.0035, + "step": 9185 + }, + { + "epoch": 6.363699341877381, + "grad_norm": 0.5032068490982056, + "learning_rate": 3.6366158113730934e-06, + "loss": 0.0037, + "step": 9186 + }, + { + "epoch": 6.364392102528576, + "grad_norm": 0.5219812989234924, + "learning_rate": 3.6359223300970874e-06, + "loss": 0.0037, + "step": 9187 + }, + { + "epoch": 6.365084863179772, + "grad_norm": 0.15681755542755127, + "learning_rate": 3.6352288488210824e-06, + "loss": 0.0026, + "step": 9188 + }, + { + "epoch": 6.365777623830966, + "grad_norm": 0.16239261627197266, + "learning_rate": 3.634535367545077e-06, + "loss": 0.0032, + "step": 9189 + }, + { + "epoch": 6.366470384482161, + "grad_norm": 0.24097208678722382, + "learning_rate": 3.633841886269071e-06, + "loss": 0.0033, + "step": 9190 + }, + { + "epoch": 6.367163145133357, + "grad_norm": 0.18855173885822296, + "learning_rate": 3.6331484049930655e-06, + "loss": 0.0024, + "step": 9191 + }, + { + "epoch": 6.367855905784552, + "grad_norm": 0.45409417152404785, + "learning_rate": 3.6324549237170596e-06, + "loss": 0.005, + "step": 9192 + }, + { + "epoch": 6.368548666435746, + "grad_norm": 0.18834039568901062, + "learning_rate": 3.6317614424410545e-06, + "loss": 0.0031, + "step": 9193 + }, + { + "epoch": 6.3692414270869415, + "grad_norm": 0.2133660614490509, + "learning_rate": 3.631067961165049e-06, + "loss": 0.0042, + "step": 9194 + }, + { + "epoch": 6.369934187738137, + "grad_norm": 0.2551059424877167, + "learning_rate": 3.6303744798890435e-06, + "loss": 0.0042, + "step": 9195 + }, + { + "epoch": 6.370626948389331, + "grad_norm": 0.19197168946266174, + "learning_rate": 3.6296809986130376e-06, + "loss": 0.0048, + "step": 9196 + }, + { + "epoch": 6.3713197090405265, + "grad_norm": 0.2950360178947449, + "learning_rate": 3.628987517337032e-06, + "loss": 0.0047, + "step": 9197 + }, + { + "epoch": 6.372012469691722, + "grad_norm": 0.48158374428749084, + "learning_rate": 3.628294036061027e-06, + "loss": 0.0061, + "step": 9198 + }, + { + "epoch": 6.372705230342916, + "grad_norm": 0.34344348311424255, + "learning_rate": 3.627600554785021e-06, + "loss": 0.0035, + "step": 9199 + }, + { + "epoch": 6.373397990994111, + "grad_norm": 0.3352544605731964, + "learning_rate": 3.6269070735090156e-06, + "loss": 0.004, + "step": 9200 + }, + { + "epoch": 6.374090751645307, + "grad_norm": 0.28213703632354736, + "learning_rate": 3.62621359223301e-06, + "loss": 0.0038, + "step": 9201 + }, + { + "epoch": 6.374783512296502, + "grad_norm": 0.25421786308288574, + "learning_rate": 3.6255201109570042e-06, + "loss": 0.0044, + "step": 9202 + }, + { + "epoch": 6.375476272947696, + "grad_norm": 0.21430328488349915, + "learning_rate": 3.6248266296809988e-06, + "loss": 0.003, + "step": 9203 + }, + { + "epoch": 6.376169033598892, + "grad_norm": 0.319079726934433, + "learning_rate": 3.6241331484049937e-06, + "loss": 0.0044, + "step": 9204 + }, + { + "epoch": 6.376861794250087, + "grad_norm": 0.3768300414085388, + "learning_rate": 3.6234396671289878e-06, + "loss": 0.0034, + "step": 9205 + }, + { + "epoch": 6.377554554901281, + "grad_norm": 0.2236616015434265, + "learning_rate": 3.6227461858529823e-06, + "loss": 0.0034, + "step": 9206 + }, + { + "epoch": 6.378247315552477, + "grad_norm": 0.30210188031196594, + "learning_rate": 3.6220527045769764e-06, + "loss": 0.0039, + "step": 9207 + }, + { + "epoch": 6.378940076203672, + "grad_norm": 0.32834911346435547, + "learning_rate": 3.621359223300971e-06, + "loss": 0.005, + "step": 9208 + }, + { + "epoch": 6.379632836854866, + "grad_norm": 0.253227561712265, + "learning_rate": 3.620665742024966e-06, + "loss": 0.0038, + "step": 9209 + }, + { + "epoch": 6.3803255975060615, + "grad_norm": 0.2744441330432892, + "learning_rate": 3.6199722607489603e-06, + "loss": 0.0044, + "step": 9210 + }, + { + "epoch": 6.381018358157257, + "grad_norm": 0.278323769569397, + "learning_rate": 3.6192787794729544e-06, + "loss": 0.0034, + "step": 9211 + }, + { + "epoch": 6.381711118808452, + "grad_norm": 0.14826999604701996, + "learning_rate": 3.618585298196949e-06, + "loss": 0.0025, + "step": 9212 + }, + { + "epoch": 6.3824038794596465, + "grad_norm": 0.15909333527088165, + "learning_rate": 3.617891816920943e-06, + "loss": 0.0028, + "step": 9213 + }, + { + "epoch": 6.383096640110842, + "grad_norm": 0.21053312718868256, + "learning_rate": 3.617198335644938e-06, + "loss": 0.0034, + "step": 9214 + }, + { + "epoch": 6.383789400762037, + "grad_norm": 0.34448710083961487, + "learning_rate": 3.6165048543689324e-06, + "loss": 0.0037, + "step": 9215 + }, + { + "epoch": 6.384482161413231, + "grad_norm": 0.2399231195449829, + "learning_rate": 3.6158113730929265e-06, + "loss": 0.0038, + "step": 9216 + }, + { + "epoch": 6.385174922064427, + "grad_norm": 0.1950463205575943, + "learning_rate": 3.615117891816921e-06, + "loss": 0.0031, + "step": 9217 + }, + { + "epoch": 6.385867682715622, + "grad_norm": 0.2300952523946762, + "learning_rate": 3.6144244105409156e-06, + "loss": 0.0029, + "step": 9218 + }, + { + "epoch": 6.386560443366816, + "grad_norm": 0.2720438539981842, + "learning_rate": 3.6137309292649105e-06, + "loss": 0.0033, + "step": 9219 + }, + { + "epoch": 6.387253204018012, + "grad_norm": 0.4348621964454651, + "learning_rate": 3.6130374479889046e-06, + "loss": 0.004, + "step": 9220 + }, + { + "epoch": 6.387945964669207, + "grad_norm": 0.23593385517597198, + "learning_rate": 3.612343966712899e-06, + "loss": 0.006, + "step": 9221 + }, + { + "epoch": 6.388638725320401, + "grad_norm": 0.4404006004333496, + "learning_rate": 3.611650485436893e-06, + "loss": 0.0039, + "step": 9222 + }, + { + "epoch": 6.389331485971597, + "grad_norm": 0.3923046588897705, + "learning_rate": 3.6109570041608877e-06, + "loss": 0.0039, + "step": 9223 + }, + { + "epoch": 6.390024246622792, + "grad_norm": 0.1145739033818245, + "learning_rate": 3.6102635228848826e-06, + "loss": 0.0023, + "step": 9224 + }, + { + "epoch": 6.390717007273987, + "grad_norm": 0.24858969449996948, + "learning_rate": 3.609570041608877e-06, + "loss": 0.0025, + "step": 9225 + }, + { + "epoch": 6.391409767925182, + "grad_norm": 0.20601050555706024, + "learning_rate": 3.608876560332871e-06, + "loss": 0.0052, + "step": 9226 + }, + { + "epoch": 6.392102528576377, + "grad_norm": 0.4553786814212799, + "learning_rate": 3.6081830790568657e-06, + "loss": 0.0047, + "step": 9227 + }, + { + "epoch": 6.392795289227572, + "grad_norm": 0.30672401189804077, + "learning_rate": 3.60748959778086e-06, + "loss": 0.004, + "step": 9228 + }, + { + "epoch": 6.3934880498787665, + "grad_norm": 0.3431636095046997, + "learning_rate": 3.6067961165048547e-06, + "loss": 0.0036, + "step": 9229 + }, + { + "epoch": 6.394180810529962, + "grad_norm": 0.37345394492149353, + "learning_rate": 3.6061026352288493e-06, + "loss": 0.011, + "step": 9230 + }, + { + "epoch": 6.394873571181157, + "grad_norm": 0.506641685962677, + "learning_rate": 3.6054091539528433e-06, + "loss": 0.0032, + "step": 9231 + }, + { + "epoch": 6.395566331832352, + "grad_norm": 0.3148530125617981, + "learning_rate": 3.604715672676838e-06, + "loss": 0.0036, + "step": 9232 + }, + { + "epoch": 6.396259092483547, + "grad_norm": 0.1761692464351654, + "learning_rate": 3.6040221914008324e-06, + "loss": 0.0029, + "step": 9233 + }, + { + "epoch": 6.396951853134742, + "grad_norm": 0.21109837293624878, + "learning_rate": 3.6033287101248273e-06, + "loss": 0.0036, + "step": 9234 + }, + { + "epoch": 6.397644613785937, + "grad_norm": 0.16789598762989044, + "learning_rate": 3.6026352288488214e-06, + "loss": 0.0025, + "step": 9235 + }, + { + "epoch": 6.398337374437132, + "grad_norm": 0.23829500377178192, + "learning_rate": 3.601941747572816e-06, + "loss": 0.0037, + "step": 9236 + }, + { + "epoch": 6.399030135088327, + "grad_norm": 0.23452800512313843, + "learning_rate": 3.60124826629681e-06, + "loss": 0.0035, + "step": 9237 + }, + { + "epoch": 6.399722895739522, + "grad_norm": 0.13841485977172852, + "learning_rate": 3.6005547850208045e-06, + "loss": 0.0027, + "step": 9238 + }, + { + "epoch": 6.400415656390717, + "grad_norm": 0.20119313895702362, + "learning_rate": 3.5998613037447994e-06, + "loss": 0.0049, + "step": 9239 + }, + { + "epoch": 6.401108417041912, + "grad_norm": 0.16879573464393616, + "learning_rate": 3.5991678224687935e-06, + "loss": 0.0032, + "step": 9240 + }, + { + "epoch": 6.401801177693107, + "grad_norm": 0.1579677313566208, + "learning_rate": 3.598474341192788e-06, + "loss": 0.0023, + "step": 9241 + }, + { + "epoch": 6.402493938344302, + "grad_norm": 0.5832002758979797, + "learning_rate": 3.5977808599167825e-06, + "loss": 0.0061, + "step": 9242 + }, + { + "epoch": 6.403186698995497, + "grad_norm": 0.21203866600990295, + "learning_rate": 3.5970873786407766e-06, + "loss": 0.0037, + "step": 9243 + }, + { + "epoch": 6.403879459646692, + "grad_norm": 0.39060166478157043, + "learning_rate": 3.5963938973647715e-06, + "loss": 0.0076, + "step": 9244 + }, + { + "epoch": 6.404572220297887, + "grad_norm": 0.23987151682376862, + "learning_rate": 3.595700416088766e-06, + "loss": 0.0034, + "step": 9245 + }, + { + "epoch": 6.405264980949082, + "grad_norm": 0.6974042057991028, + "learning_rate": 3.59500693481276e-06, + "loss": 0.0039, + "step": 9246 + }, + { + "epoch": 6.405957741600277, + "grad_norm": 0.36157554388046265, + "learning_rate": 3.5943134535367546e-06, + "loss": 0.0047, + "step": 9247 + }, + { + "epoch": 6.406650502251472, + "grad_norm": 0.44548600912094116, + "learning_rate": 3.593619972260749e-06, + "loss": 0.0058, + "step": 9248 + }, + { + "epoch": 6.407343262902667, + "grad_norm": 0.21171928942203522, + "learning_rate": 3.592926490984744e-06, + "loss": 0.003, + "step": 9249 + }, + { + "epoch": 6.408036023553862, + "grad_norm": 0.22173413634300232, + "learning_rate": 3.592233009708738e-06, + "loss": 0.0039, + "step": 9250 + }, + { + "epoch": 6.408728784205057, + "grad_norm": 0.5000897645950317, + "learning_rate": 3.5915395284327327e-06, + "loss": 0.0059, + "step": 9251 + }, + { + "epoch": 6.409421544856253, + "grad_norm": 0.18607904016971588, + "learning_rate": 3.5908460471567268e-06, + "loss": 0.0033, + "step": 9252 + }, + { + "epoch": 6.410114305507447, + "grad_norm": 0.12777851521968842, + "learning_rate": 3.5901525658807213e-06, + "loss": 0.0029, + "step": 9253 + }, + { + "epoch": 6.410807066158642, + "grad_norm": 0.13433578610420227, + "learning_rate": 3.5894590846047162e-06, + "loss": 0.0028, + "step": 9254 + }, + { + "epoch": 6.4114998268098375, + "grad_norm": 0.19018560647964478, + "learning_rate": 3.5887656033287103e-06, + "loss": 0.0031, + "step": 9255 + }, + { + "epoch": 6.412192587461032, + "grad_norm": 0.47062408924102783, + "learning_rate": 3.588072122052705e-06, + "loss": 0.0055, + "step": 9256 + }, + { + "epoch": 6.412885348112227, + "grad_norm": 0.2807880938053131, + "learning_rate": 3.5873786407766993e-06, + "loss": 0.0046, + "step": 9257 + }, + { + "epoch": 6.4135781087634225, + "grad_norm": 0.3895263671875, + "learning_rate": 3.5866851595006934e-06, + "loss": 0.0042, + "step": 9258 + }, + { + "epoch": 6.414270869414617, + "grad_norm": 0.5025876760482788, + "learning_rate": 3.5859916782246883e-06, + "loss": 0.0045, + "step": 9259 + }, + { + "epoch": 6.414963630065812, + "grad_norm": 0.14714136719703674, + "learning_rate": 3.585298196948683e-06, + "loss": 0.0032, + "step": 9260 + }, + { + "epoch": 6.415656390717007, + "grad_norm": 0.23582592606544495, + "learning_rate": 3.584604715672677e-06, + "loss": 0.003, + "step": 9261 + }, + { + "epoch": 6.416349151368202, + "grad_norm": 0.2524140179157257, + "learning_rate": 3.5839112343966715e-06, + "loss": 0.0042, + "step": 9262 + }, + { + "epoch": 6.417041912019397, + "grad_norm": 0.29775092005729675, + "learning_rate": 3.583217753120666e-06, + "loss": 0.0048, + "step": 9263 + }, + { + "epoch": 6.417734672670592, + "grad_norm": 0.14647287130355835, + "learning_rate": 3.5825242718446605e-06, + "loss": 0.0028, + "step": 9264 + }, + { + "epoch": 6.418427433321788, + "grad_norm": 0.2190023958683014, + "learning_rate": 3.581830790568655e-06, + "loss": 0.0028, + "step": 9265 + }, + { + "epoch": 6.419120193972982, + "grad_norm": 0.44877180457115173, + "learning_rate": 3.5811373092926495e-06, + "loss": 0.0031, + "step": 9266 + }, + { + "epoch": 6.419812954624177, + "grad_norm": 0.2303936928510666, + "learning_rate": 3.5804438280166436e-06, + "loss": 0.0039, + "step": 9267 + }, + { + "epoch": 6.420505715275373, + "grad_norm": 0.19964253902435303, + "learning_rate": 3.579750346740638e-06, + "loss": 0.0031, + "step": 9268 + }, + { + "epoch": 6.421198475926567, + "grad_norm": 0.2720945477485657, + "learning_rate": 3.579056865464633e-06, + "loss": 0.0041, + "step": 9269 + }, + { + "epoch": 6.421891236577762, + "grad_norm": 0.2579038441181183, + "learning_rate": 3.578363384188627e-06, + "loss": 0.0036, + "step": 9270 + }, + { + "epoch": 6.422583997228958, + "grad_norm": 0.2768496572971344, + "learning_rate": 3.5776699029126216e-06, + "loss": 0.0026, + "step": 9271 + }, + { + "epoch": 6.423276757880153, + "grad_norm": 0.3870190382003784, + "learning_rate": 3.576976421636616e-06, + "loss": 0.006, + "step": 9272 + }, + { + "epoch": 6.423969518531347, + "grad_norm": 0.2708127200603485, + "learning_rate": 3.5762829403606102e-06, + "loss": 0.0037, + "step": 9273 + }, + { + "epoch": 6.4246622791825425, + "grad_norm": 0.1558944284915924, + "learning_rate": 3.575589459084605e-06, + "loss": 0.0023, + "step": 9274 + }, + { + "epoch": 6.425355039833738, + "grad_norm": 0.40814536809921265, + "learning_rate": 3.5748959778085997e-06, + "loss": 0.0054, + "step": 9275 + }, + { + "epoch": 6.426047800484932, + "grad_norm": 0.20814627408981323, + "learning_rate": 3.5742024965325937e-06, + "loss": 0.0028, + "step": 9276 + }, + { + "epoch": 6.4267405611361275, + "grad_norm": 0.19707806408405304, + "learning_rate": 3.5735090152565883e-06, + "loss": 0.0034, + "step": 9277 + }, + { + "epoch": 6.427433321787323, + "grad_norm": 0.19731505215168, + "learning_rate": 3.5728155339805828e-06, + "loss": 0.0031, + "step": 9278 + }, + { + "epoch": 6.428126082438517, + "grad_norm": 0.43673670291900635, + "learning_rate": 3.5721220527045773e-06, + "loss": 0.0065, + "step": 9279 + }, + { + "epoch": 6.428818843089712, + "grad_norm": 0.46689996123313904, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.0069, + "step": 9280 + }, + { + "epoch": 6.429511603740908, + "grad_norm": 0.2391880601644516, + "learning_rate": 3.5707350901525663e-06, + "loss": 0.003, + "step": 9281 + }, + { + "epoch": 6.430204364392102, + "grad_norm": 0.13410313427448273, + "learning_rate": 3.5700416088765604e-06, + "loss": 0.0029, + "step": 9282 + }, + { + "epoch": 6.430897125043297, + "grad_norm": 0.3177035450935364, + "learning_rate": 3.569348127600555e-06, + "loss": 0.004, + "step": 9283 + }, + { + "epoch": 6.431589885694493, + "grad_norm": 0.16396614909172058, + "learning_rate": 3.56865464632455e-06, + "loss": 0.003, + "step": 9284 + }, + { + "epoch": 6.432282646345688, + "grad_norm": 0.23232145607471466, + "learning_rate": 3.567961165048544e-06, + "loss": 0.0035, + "step": 9285 + }, + { + "epoch": 6.432975406996882, + "grad_norm": 0.30791035294532776, + "learning_rate": 3.5672676837725384e-06, + "loss": 0.0038, + "step": 9286 + }, + { + "epoch": 6.433668167648078, + "grad_norm": 0.27299022674560547, + "learning_rate": 3.566574202496533e-06, + "loss": 0.0031, + "step": 9287 + }, + { + "epoch": 6.434360928299273, + "grad_norm": 0.15564918518066406, + "learning_rate": 3.565880721220527e-06, + "loss": 0.0034, + "step": 9288 + }, + { + "epoch": 6.435053688950467, + "grad_norm": 0.31566473841667175, + "learning_rate": 3.565187239944522e-06, + "loss": 0.0036, + "step": 9289 + }, + { + "epoch": 6.4357464496016625, + "grad_norm": 0.1905096173286438, + "learning_rate": 3.5644937586685165e-06, + "loss": 0.0028, + "step": 9290 + }, + { + "epoch": 6.436439210252858, + "grad_norm": 0.42711296677589417, + "learning_rate": 3.5638002773925105e-06, + "loss": 0.0041, + "step": 9291 + }, + { + "epoch": 6.437131970904053, + "grad_norm": 0.2646515369415283, + "learning_rate": 3.563106796116505e-06, + "loss": 0.0032, + "step": 9292 + }, + { + "epoch": 6.4378247315552475, + "grad_norm": 0.20329734683036804, + "learning_rate": 3.562413314840499e-06, + "loss": 0.0038, + "step": 9293 + }, + { + "epoch": 6.438517492206443, + "grad_norm": 0.2283334732055664, + "learning_rate": 3.561719833564494e-06, + "loss": 0.003, + "step": 9294 + }, + { + "epoch": 6.439210252857638, + "grad_norm": 0.27285119891166687, + "learning_rate": 3.5610263522884886e-06, + "loss": 0.0031, + "step": 9295 + }, + { + "epoch": 6.439903013508832, + "grad_norm": 0.5273153781890869, + "learning_rate": 3.560332871012483e-06, + "loss": 0.0034, + "step": 9296 + }, + { + "epoch": 6.440595774160028, + "grad_norm": 0.2023629993200302, + "learning_rate": 3.559639389736477e-06, + "loss": 0.0039, + "step": 9297 + }, + { + "epoch": 6.441288534811223, + "grad_norm": 0.29897841811180115, + "learning_rate": 3.5589459084604717e-06, + "loss": 0.0042, + "step": 9298 + }, + { + "epoch": 6.441981295462417, + "grad_norm": 0.16852521896362305, + "learning_rate": 3.5582524271844666e-06, + "loss": 0.0041, + "step": 9299 + }, + { + "epoch": 6.442674056113613, + "grad_norm": 0.22829200327396393, + "learning_rate": 3.5575589459084607e-06, + "loss": 0.0046, + "step": 9300 + }, + { + "epoch": 6.443366816764808, + "grad_norm": 0.43147629499435425, + "learning_rate": 3.5568654646324552e-06, + "loss": 0.0041, + "step": 9301 + }, + { + "epoch": 6.444059577416002, + "grad_norm": 0.17013610899448395, + "learning_rate": 3.5561719833564497e-06, + "loss": 0.0026, + "step": 9302 + }, + { + "epoch": 6.444752338067198, + "grad_norm": 0.19303420186042786, + "learning_rate": 3.555478502080444e-06, + "loss": 0.0036, + "step": 9303 + }, + { + "epoch": 6.445445098718393, + "grad_norm": 0.2566279172897339, + "learning_rate": 3.5547850208044387e-06, + "loss": 0.0046, + "step": 9304 + }, + { + "epoch": 6.446137859369588, + "grad_norm": 0.4512794315814972, + "learning_rate": 3.5540915395284333e-06, + "loss": 0.0043, + "step": 9305 + }, + { + "epoch": 6.4468306200207826, + "grad_norm": 0.16259980201721191, + "learning_rate": 3.5533980582524273e-06, + "loss": 0.0028, + "step": 9306 + }, + { + "epoch": 6.447523380671978, + "grad_norm": 0.15087482333183289, + "learning_rate": 3.552704576976422e-06, + "loss": 0.003, + "step": 9307 + }, + { + "epoch": 6.448216141323173, + "grad_norm": 0.19279569387435913, + "learning_rate": 3.552011095700416e-06, + "loss": 0.0038, + "step": 9308 + }, + { + "epoch": 6.4489089019743675, + "grad_norm": 0.2542977035045624, + "learning_rate": 3.551317614424411e-06, + "loss": 0.0026, + "step": 9309 + }, + { + "epoch": 6.449601662625563, + "grad_norm": 0.20152418315410614, + "learning_rate": 3.5506241331484054e-06, + "loss": 0.0029, + "step": 9310 + }, + { + "epoch": 6.450294423276758, + "grad_norm": 0.2344168871641159, + "learning_rate": 3.5499306518724e-06, + "loss": 0.0039, + "step": 9311 + }, + { + "epoch": 6.450987183927953, + "grad_norm": 0.11551791429519653, + "learning_rate": 3.549237170596394e-06, + "loss": 0.0024, + "step": 9312 + }, + { + "epoch": 6.451679944579148, + "grad_norm": 0.1495424211025238, + "learning_rate": 3.5485436893203885e-06, + "loss": 0.0026, + "step": 9313 + }, + { + "epoch": 6.452372705230343, + "grad_norm": 0.36056405305862427, + "learning_rate": 3.5478502080443834e-06, + "loss": 0.0059, + "step": 9314 + }, + { + "epoch": 6.453065465881538, + "grad_norm": 0.1673860251903534, + "learning_rate": 3.5471567267683775e-06, + "loss": 0.0028, + "step": 9315 + }, + { + "epoch": 6.453758226532733, + "grad_norm": 0.46804893016815186, + "learning_rate": 3.546463245492372e-06, + "loss": 0.007, + "step": 9316 + }, + { + "epoch": 6.454450987183928, + "grad_norm": 0.18465572595596313, + "learning_rate": 3.545769764216366e-06, + "loss": 0.0028, + "step": 9317 + }, + { + "epoch": 6.455143747835123, + "grad_norm": 0.3037743866443634, + "learning_rate": 3.5450762829403606e-06, + "loss": 0.0035, + "step": 9318 + }, + { + "epoch": 6.455836508486318, + "grad_norm": 0.5053547024726868, + "learning_rate": 3.5443828016643555e-06, + "loss": 0.0043, + "step": 9319 + }, + { + "epoch": 6.456529269137513, + "grad_norm": 0.1676502823829651, + "learning_rate": 3.54368932038835e-06, + "loss": 0.0027, + "step": 9320 + }, + { + "epoch": 6.457222029788708, + "grad_norm": 0.28507503867149353, + "learning_rate": 3.542995839112344e-06, + "loss": 0.0034, + "step": 9321 + }, + { + "epoch": 6.457914790439903, + "grad_norm": 0.18971438705921173, + "learning_rate": 3.5423023578363387e-06, + "loss": 0.0035, + "step": 9322 + }, + { + "epoch": 6.458607551091098, + "grad_norm": 0.26438286900520325, + "learning_rate": 3.5416088765603327e-06, + "loss": 0.0037, + "step": 9323 + }, + { + "epoch": 6.459300311742293, + "grad_norm": 0.29785943031311035, + "learning_rate": 3.5409153952843277e-06, + "loss": 0.0039, + "step": 9324 + }, + { + "epoch": 6.459993072393488, + "grad_norm": 0.21474888920783997, + "learning_rate": 3.540221914008322e-06, + "loss": 0.0034, + "step": 9325 + }, + { + "epoch": 6.460685833044683, + "grad_norm": 0.1615288406610489, + "learning_rate": 3.5395284327323167e-06, + "loss": 0.003, + "step": 9326 + }, + { + "epoch": 6.461378593695878, + "grad_norm": 0.25554540753364563, + "learning_rate": 3.5388349514563108e-06, + "loss": 0.0073, + "step": 9327 + }, + { + "epoch": 6.462071354347073, + "grad_norm": 0.21433214843273163, + "learning_rate": 3.5381414701803053e-06, + "loss": 0.0031, + "step": 9328 + }, + { + "epoch": 6.462764114998268, + "grad_norm": 0.2230767160654068, + "learning_rate": 3.5374479889043002e-06, + "loss": 0.0033, + "step": 9329 + }, + { + "epoch": 6.463456875649463, + "grad_norm": 0.27657797932624817, + "learning_rate": 3.5367545076282943e-06, + "loss": 0.0036, + "step": 9330 + }, + { + "epoch": 6.464149636300658, + "grad_norm": 0.22122372686862946, + "learning_rate": 3.536061026352289e-06, + "loss": 0.0036, + "step": 9331 + }, + { + "epoch": 6.464842396951854, + "grad_norm": 0.2776499092578888, + "learning_rate": 3.535367545076283e-06, + "loss": 0.0034, + "step": 9332 + }, + { + "epoch": 6.465535157603048, + "grad_norm": 0.3153015077114105, + "learning_rate": 3.5346740638002774e-06, + "loss": 0.0025, + "step": 9333 + }, + { + "epoch": 6.466227918254243, + "grad_norm": 0.1284671276807785, + "learning_rate": 3.5339805825242724e-06, + "loss": 0.0022, + "step": 9334 + }, + { + "epoch": 6.4669206789054385, + "grad_norm": 0.30792906880378723, + "learning_rate": 3.533287101248267e-06, + "loss": 0.004, + "step": 9335 + }, + { + "epoch": 6.467613439556633, + "grad_norm": 0.6116307377815247, + "learning_rate": 3.532593619972261e-06, + "loss": 0.0028, + "step": 9336 + }, + { + "epoch": 6.468306200207828, + "grad_norm": 0.12195632606744766, + "learning_rate": 3.5319001386962555e-06, + "loss": 0.0024, + "step": 9337 + }, + { + "epoch": 6.4689989608590235, + "grad_norm": 0.20950274169445038, + "learning_rate": 3.5312066574202495e-06, + "loss": 0.003, + "step": 9338 + }, + { + "epoch": 6.469691721510218, + "grad_norm": 0.30685269832611084, + "learning_rate": 3.5305131761442445e-06, + "loss": 0.0041, + "step": 9339 + }, + { + "epoch": 6.470384482161413, + "grad_norm": 0.34830111265182495, + "learning_rate": 3.529819694868239e-06, + "loss": 0.0046, + "step": 9340 + }, + { + "epoch": 6.471077242812608, + "grad_norm": 0.22837074100971222, + "learning_rate": 3.5291262135922335e-06, + "loss": 0.0027, + "step": 9341 + }, + { + "epoch": 6.471770003463803, + "grad_norm": 0.38324078917503357, + "learning_rate": 3.5284327323162276e-06, + "loss": 0.0028, + "step": 9342 + }, + { + "epoch": 6.472462764114998, + "grad_norm": 0.3997797667980194, + "learning_rate": 3.527739251040222e-06, + "loss": 0.0032, + "step": 9343 + }, + { + "epoch": 6.473155524766193, + "grad_norm": 0.18856115639209747, + "learning_rate": 3.527045769764217e-06, + "loss": 0.0041, + "step": 9344 + }, + { + "epoch": 6.473848285417389, + "grad_norm": 0.2529909610748291, + "learning_rate": 3.526352288488211e-06, + "loss": 0.0034, + "step": 9345 + }, + { + "epoch": 6.474541046068583, + "grad_norm": 0.16508552432060242, + "learning_rate": 3.5256588072122056e-06, + "loss": 0.0025, + "step": 9346 + }, + { + "epoch": 6.475233806719778, + "grad_norm": 0.14784452319145203, + "learning_rate": 3.5249653259361997e-06, + "loss": 0.0024, + "step": 9347 + }, + { + "epoch": 6.475926567370974, + "grad_norm": 0.2928319573402405, + "learning_rate": 3.5242718446601942e-06, + "loss": 0.0038, + "step": 9348 + }, + { + "epoch": 6.476619328022168, + "grad_norm": 0.2895059287548065, + "learning_rate": 3.523578363384189e-06, + "loss": 0.0043, + "step": 9349 + }, + { + "epoch": 6.477312088673363, + "grad_norm": 0.3710174560546875, + "learning_rate": 3.5228848821081837e-06, + "loss": 0.0033, + "step": 9350 + }, + { + "epoch": 6.478004849324559, + "grad_norm": 0.3162163197994232, + "learning_rate": 3.5221914008321777e-06, + "loss": 0.0033, + "step": 9351 + }, + { + "epoch": 6.478697609975754, + "grad_norm": 0.29709571599960327, + "learning_rate": 3.5214979195561723e-06, + "loss": 0.0033, + "step": 9352 + }, + { + "epoch": 6.479390370626948, + "grad_norm": 0.34101665019989014, + "learning_rate": 3.5208044382801663e-06, + "loss": 0.0055, + "step": 9353 + }, + { + "epoch": 6.4800831312781435, + "grad_norm": 0.28122788667678833, + "learning_rate": 3.5201109570041613e-06, + "loss": 0.0037, + "step": 9354 + }, + { + "epoch": 6.480775891929339, + "grad_norm": 0.16156338155269623, + "learning_rate": 3.5194174757281558e-06, + "loss": 0.0028, + "step": 9355 + }, + { + "epoch": 6.481468652580533, + "grad_norm": 0.24210387468338013, + "learning_rate": 3.51872399445215e-06, + "loss": 0.0044, + "step": 9356 + }, + { + "epoch": 6.4821614132317285, + "grad_norm": 0.30856701731681824, + "learning_rate": 3.5180305131761444e-06, + "loss": 0.0041, + "step": 9357 + }, + { + "epoch": 6.482854173882924, + "grad_norm": 0.3893034756183624, + "learning_rate": 3.517337031900139e-06, + "loss": 0.0034, + "step": 9358 + }, + { + "epoch": 6.483546934534118, + "grad_norm": 0.13166674971580505, + "learning_rate": 3.516643550624134e-06, + "loss": 0.0021, + "step": 9359 + }, + { + "epoch": 6.484239695185313, + "grad_norm": 0.32749444246292114, + "learning_rate": 3.515950069348128e-06, + "loss": 0.0051, + "step": 9360 + }, + { + "epoch": 6.484932455836509, + "grad_norm": 0.36538147926330566, + "learning_rate": 3.5152565880721224e-06, + "loss": 0.0043, + "step": 9361 + }, + { + "epoch": 6.485625216487703, + "grad_norm": 0.32941997051239014, + "learning_rate": 3.5145631067961165e-06, + "loss": 0.0037, + "step": 9362 + }, + { + "epoch": 6.486317977138898, + "grad_norm": 0.10568477213382721, + "learning_rate": 3.513869625520111e-06, + "loss": 0.0022, + "step": 9363 + }, + { + "epoch": 6.487010737790094, + "grad_norm": 0.160566508769989, + "learning_rate": 3.513176144244106e-06, + "loss": 0.0025, + "step": 9364 + }, + { + "epoch": 6.487703498441289, + "grad_norm": 0.16372741758823395, + "learning_rate": 3.5124826629681005e-06, + "loss": 0.0026, + "step": 9365 + }, + { + "epoch": 6.488396259092483, + "grad_norm": 0.1735554039478302, + "learning_rate": 3.5117891816920945e-06, + "loss": 0.0027, + "step": 9366 + }, + { + "epoch": 6.489089019743679, + "grad_norm": 0.34052911400794983, + "learning_rate": 3.511095700416089e-06, + "loss": 0.0037, + "step": 9367 + }, + { + "epoch": 6.489781780394874, + "grad_norm": 0.47544294595718384, + "learning_rate": 3.510402219140083e-06, + "loss": 0.0071, + "step": 9368 + }, + { + "epoch": 6.490474541046068, + "grad_norm": 0.28894108533859253, + "learning_rate": 3.509708737864078e-06, + "loss": 0.0043, + "step": 9369 + }, + { + "epoch": 6.4911673016972635, + "grad_norm": 0.18008123338222504, + "learning_rate": 3.5090152565880726e-06, + "loss": 0.0027, + "step": 9370 + }, + { + "epoch": 6.491860062348459, + "grad_norm": 0.17859375476837158, + "learning_rate": 3.5083217753120667e-06, + "loss": 0.0026, + "step": 9371 + }, + { + "epoch": 6.492552822999654, + "grad_norm": 0.36944127082824707, + "learning_rate": 3.507628294036061e-06, + "loss": 0.0058, + "step": 9372 + }, + { + "epoch": 6.4932455836508485, + "grad_norm": 0.13610002398490906, + "learning_rate": 3.5069348127600557e-06, + "loss": 0.0023, + "step": 9373 + }, + { + "epoch": 6.493938344302044, + "grad_norm": 0.26454663276672363, + "learning_rate": 3.5062413314840506e-06, + "loss": 0.004, + "step": 9374 + }, + { + "epoch": 6.494631104953239, + "grad_norm": 0.15831910073757172, + "learning_rate": 3.5055478502080447e-06, + "loss": 0.0029, + "step": 9375 + }, + { + "epoch": 6.495323865604433, + "grad_norm": 0.44009876251220703, + "learning_rate": 3.5048543689320392e-06, + "loss": 0.0035, + "step": 9376 + }, + { + "epoch": 6.496016626255629, + "grad_norm": 0.21103429794311523, + "learning_rate": 3.5041608876560333e-06, + "loss": 0.0039, + "step": 9377 + }, + { + "epoch": 6.496709386906824, + "grad_norm": 0.6784105896949768, + "learning_rate": 3.503467406380028e-06, + "loss": 0.0049, + "step": 9378 + }, + { + "epoch": 6.497402147558018, + "grad_norm": 0.14801089465618134, + "learning_rate": 3.5027739251040228e-06, + "loss": 0.0025, + "step": 9379 + }, + { + "epoch": 6.498094908209214, + "grad_norm": 0.21305952966213226, + "learning_rate": 3.502080443828017e-06, + "loss": 0.0026, + "step": 9380 + }, + { + "epoch": 6.498787668860409, + "grad_norm": 0.3113575279712677, + "learning_rate": 3.5013869625520114e-06, + "loss": 0.0046, + "step": 9381 + }, + { + "epoch": 6.499480429511603, + "grad_norm": 0.569118082523346, + "learning_rate": 3.500693481276006e-06, + "loss": 0.0031, + "step": 9382 + }, + { + "epoch": 6.500173190162799, + "grad_norm": 0.447287380695343, + "learning_rate": 3.5e-06, + "loss": 0.0047, + "step": 9383 + }, + { + "epoch": 6.500865950813994, + "grad_norm": 0.2354438304901123, + "learning_rate": 3.499306518723995e-06, + "loss": 0.0039, + "step": 9384 + }, + { + "epoch": 6.501558711465189, + "grad_norm": 0.2539510428905487, + "learning_rate": 3.4986130374479894e-06, + "loss": 0.003, + "step": 9385 + }, + { + "epoch": 6.5022514721163835, + "grad_norm": 0.17222854495048523, + "learning_rate": 3.4979195561719835e-06, + "loss": 0.0024, + "step": 9386 + }, + { + "epoch": 6.502944232767579, + "grad_norm": 0.21358181536197662, + "learning_rate": 3.497226074895978e-06, + "loss": 0.0037, + "step": 9387 + }, + { + "epoch": 6.503636993418774, + "grad_norm": 0.15497736632823944, + "learning_rate": 3.4965325936199725e-06, + "loss": 0.0028, + "step": 9388 + }, + { + "epoch": 6.5043297540699685, + "grad_norm": 0.15973694622516632, + "learning_rate": 3.4958391123439674e-06, + "loss": 0.0035, + "step": 9389 + }, + { + "epoch": 6.505022514721164, + "grad_norm": 0.21442706882953644, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.0035, + "step": 9390 + }, + { + "epoch": 6.505715275372359, + "grad_norm": 0.3782165050506592, + "learning_rate": 3.494452149791956e-06, + "loss": 0.0073, + "step": 9391 + }, + { + "epoch": 6.506408036023554, + "grad_norm": 0.2783690094947815, + "learning_rate": 3.49375866851595e-06, + "loss": 0.0034, + "step": 9392 + }, + { + "epoch": 6.507100796674749, + "grad_norm": 0.13924583792686462, + "learning_rate": 3.4930651872399446e-06, + "loss": 0.0025, + "step": 9393 + }, + { + "epoch": 6.507793557325944, + "grad_norm": 0.2684783935546875, + "learning_rate": 3.4923717059639396e-06, + "loss": 0.0049, + "step": 9394 + }, + { + "epoch": 6.508486317977139, + "grad_norm": 0.25154218077659607, + "learning_rate": 3.4916782246879336e-06, + "loss": 0.0049, + "step": 9395 + }, + { + "epoch": 6.509179078628334, + "grad_norm": 0.22599586844444275, + "learning_rate": 3.490984743411928e-06, + "loss": 0.0044, + "step": 9396 + }, + { + "epoch": 6.509871839279529, + "grad_norm": 0.14814086258411407, + "learning_rate": 3.4902912621359227e-06, + "loss": 0.0027, + "step": 9397 + }, + { + "epoch": 6.510564599930724, + "grad_norm": 0.241136834025383, + "learning_rate": 3.4895977808599167e-06, + "loss": 0.0037, + "step": 9398 + }, + { + "epoch": 6.511257360581919, + "grad_norm": 0.6824232935905457, + "learning_rate": 3.4889042995839117e-06, + "loss": 0.0042, + "step": 9399 + }, + { + "epoch": 6.511950121233114, + "grad_norm": 0.1383947730064392, + "learning_rate": 3.488210818307906e-06, + "loss": 0.0027, + "step": 9400 + }, + { + "epoch": 6.512642881884309, + "grad_norm": 0.1966572403907776, + "learning_rate": 3.4875173370319003e-06, + "loss": 0.0027, + "step": 9401 + }, + { + "epoch": 6.513335642535504, + "grad_norm": 0.2254183143377304, + "learning_rate": 3.486823855755895e-06, + "loss": 0.003, + "step": 9402 + }, + { + "epoch": 6.514028403186699, + "grad_norm": 0.1570611596107483, + "learning_rate": 3.4861303744798893e-06, + "loss": 0.0027, + "step": 9403 + }, + { + "epoch": 6.514721163837894, + "grad_norm": 0.29516130685806274, + "learning_rate": 3.485436893203884e-06, + "loss": 0.0038, + "step": 9404 + }, + { + "epoch": 6.515413924489089, + "grad_norm": 0.12006194144487381, + "learning_rate": 3.4847434119278783e-06, + "loss": 0.0025, + "step": 9405 + }, + { + "epoch": 6.516106685140284, + "grad_norm": 0.5796042680740356, + "learning_rate": 3.484049930651873e-06, + "loss": 0.0036, + "step": 9406 + }, + { + "epoch": 6.516799445791479, + "grad_norm": 0.1770615428686142, + "learning_rate": 3.483356449375867e-06, + "loss": 0.0032, + "step": 9407 + }, + { + "epoch": 6.517492206442674, + "grad_norm": 0.3312567472457886, + "learning_rate": 3.4826629680998614e-06, + "loss": 0.0044, + "step": 9408 + }, + { + "epoch": 6.518184967093869, + "grad_norm": 0.20188383758068085, + "learning_rate": 3.4819694868238564e-06, + "loss": 0.003, + "step": 9409 + }, + { + "epoch": 6.518877727745064, + "grad_norm": 0.2747334837913513, + "learning_rate": 3.4812760055478504e-06, + "loss": 0.0028, + "step": 9410 + }, + { + "epoch": 6.519570488396259, + "grad_norm": 0.2189648300409317, + "learning_rate": 3.480582524271845e-06, + "loss": 0.0033, + "step": 9411 + }, + { + "epoch": 6.520263249047455, + "grad_norm": 0.23750540614128113, + "learning_rate": 3.4798890429958395e-06, + "loss": 0.0049, + "step": 9412 + }, + { + "epoch": 6.520956009698649, + "grad_norm": 0.6962100267410278, + "learning_rate": 3.4791955617198336e-06, + "loss": 0.0024, + "step": 9413 + }, + { + "epoch": 6.521648770349844, + "grad_norm": 0.12846559286117554, + "learning_rate": 3.4785020804438285e-06, + "loss": 0.0022, + "step": 9414 + }, + { + "epoch": 6.5223415310010395, + "grad_norm": 0.4862101972103119, + "learning_rate": 3.477808599167823e-06, + "loss": 0.0053, + "step": 9415 + }, + { + "epoch": 6.523034291652234, + "grad_norm": 0.2817215025424957, + "learning_rate": 3.477115117891817e-06, + "loss": 0.0058, + "step": 9416 + }, + { + "epoch": 6.523727052303429, + "grad_norm": 0.23052118718624115, + "learning_rate": 3.4764216366158116e-06, + "loss": 0.003, + "step": 9417 + }, + { + "epoch": 6.5244198129546245, + "grad_norm": 0.1892869770526886, + "learning_rate": 3.475728155339806e-06, + "loss": 0.0028, + "step": 9418 + }, + { + "epoch": 6.525112573605819, + "grad_norm": 0.49516624212265015, + "learning_rate": 3.4750346740638006e-06, + "loss": 0.0049, + "step": 9419 + }, + { + "epoch": 6.525805334257014, + "grad_norm": 0.21081629395484924, + "learning_rate": 3.474341192787795e-06, + "loss": 0.0045, + "step": 9420 + }, + { + "epoch": 6.526498094908209, + "grad_norm": 0.16002444922924042, + "learning_rate": 3.4736477115117896e-06, + "loss": 0.0034, + "step": 9421 + }, + { + "epoch": 6.527190855559404, + "grad_norm": 0.4152107536792755, + "learning_rate": 3.4729542302357837e-06, + "loss": 0.0032, + "step": 9422 + }, + { + "epoch": 6.527883616210599, + "grad_norm": 0.244332954287529, + "learning_rate": 3.4722607489597782e-06, + "loss": 0.0025, + "step": 9423 + }, + { + "epoch": 6.528576376861794, + "grad_norm": 0.22425228357315063, + "learning_rate": 3.471567267683773e-06, + "loss": 0.0043, + "step": 9424 + }, + { + "epoch": 6.529269137512989, + "grad_norm": 0.3269823491573334, + "learning_rate": 3.4708737864077672e-06, + "loss": 0.003, + "step": 9425 + }, + { + "epoch": 6.529961898164184, + "grad_norm": 0.4144686162471771, + "learning_rate": 3.4701803051317618e-06, + "loss": 0.0064, + "step": 9426 + }, + { + "epoch": 6.530654658815379, + "grad_norm": 0.25652065873146057, + "learning_rate": 3.4694868238557563e-06, + "loss": 0.0051, + "step": 9427 + }, + { + "epoch": 6.531347419466575, + "grad_norm": 0.8899679183959961, + "learning_rate": 3.4687933425797504e-06, + "loss": 0.0044, + "step": 9428 + }, + { + "epoch": 6.532040180117769, + "grad_norm": 0.12251868844032288, + "learning_rate": 3.4680998613037453e-06, + "loss": 0.0023, + "step": 9429 + }, + { + "epoch": 6.532732940768964, + "grad_norm": 0.2892351448535919, + "learning_rate": 3.46740638002774e-06, + "loss": 0.004, + "step": 9430 + }, + { + "epoch": 6.5334257014201595, + "grad_norm": 0.3377503752708435, + "learning_rate": 3.466712898751734e-06, + "loss": 0.0056, + "step": 9431 + }, + { + "epoch": 6.534118462071355, + "grad_norm": 0.7726133465766907, + "learning_rate": 3.4660194174757284e-06, + "loss": 0.0044, + "step": 9432 + }, + { + "epoch": 6.534811222722549, + "grad_norm": 0.27427834272384644, + "learning_rate": 3.4653259361997225e-06, + "loss": 0.0037, + "step": 9433 + }, + { + "epoch": 6.5355039833737445, + "grad_norm": 0.3087518811225891, + "learning_rate": 3.4646324549237174e-06, + "loss": 0.0046, + "step": 9434 + }, + { + "epoch": 6.53619674402494, + "grad_norm": 0.15330451726913452, + "learning_rate": 3.463938973647712e-06, + "loss": 0.0027, + "step": 9435 + }, + { + "epoch": 6.536889504676134, + "grad_norm": 0.4843771755695343, + "learning_rate": 3.4632454923717064e-06, + "loss": 0.0036, + "step": 9436 + }, + { + "epoch": 6.5375822653273294, + "grad_norm": 0.2843921184539795, + "learning_rate": 3.4625520110957005e-06, + "loss": 0.0036, + "step": 9437 + }, + { + "epoch": 6.538275025978525, + "grad_norm": 0.2954002022743225, + "learning_rate": 3.461858529819695e-06, + "loss": 0.0045, + "step": 9438 + }, + { + "epoch": 6.538967786629719, + "grad_norm": 0.23253130912780762, + "learning_rate": 3.46116504854369e-06, + "loss": 0.0031, + "step": 9439 + }, + { + "epoch": 6.539660547280914, + "grad_norm": 0.2086314707994461, + "learning_rate": 3.460471567267684e-06, + "loss": 0.0029, + "step": 9440 + }, + { + "epoch": 6.54035330793211, + "grad_norm": 0.40789994597435, + "learning_rate": 3.4597780859916786e-06, + "loss": 0.006, + "step": 9441 + }, + { + "epoch": 6.541046068583304, + "grad_norm": 0.15205442905426025, + "learning_rate": 3.459084604715673e-06, + "loss": 0.0029, + "step": 9442 + }, + { + "epoch": 6.541738829234499, + "grad_norm": 0.21897590160369873, + "learning_rate": 3.458391123439667e-06, + "loss": 0.0035, + "step": 9443 + }, + { + "epoch": 6.542431589885695, + "grad_norm": 0.3793187737464905, + "learning_rate": 3.457697642163662e-06, + "loss": 0.0049, + "step": 9444 + }, + { + "epoch": 6.543124350536889, + "grad_norm": 0.48277217149734497, + "learning_rate": 3.4570041608876566e-06, + "loss": 0.0062, + "step": 9445 + }, + { + "epoch": 6.543817111188084, + "grad_norm": 0.2049214243888855, + "learning_rate": 3.4563106796116507e-06, + "loss": 0.0036, + "step": 9446 + }, + { + "epoch": 6.54450987183928, + "grad_norm": 0.2654341459274292, + "learning_rate": 3.455617198335645e-06, + "loss": 0.0037, + "step": 9447 + }, + { + "epoch": 6.545202632490475, + "grad_norm": 0.6070582270622253, + "learning_rate": 3.4549237170596393e-06, + "loss": 0.004, + "step": 9448 + }, + { + "epoch": 6.545895393141669, + "grad_norm": 0.7338226437568665, + "learning_rate": 3.4542302357836342e-06, + "loss": 0.0083, + "step": 9449 + }, + { + "epoch": 6.5465881537928645, + "grad_norm": 0.19648605585098267, + "learning_rate": 3.4535367545076287e-06, + "loss": 0.0043, + "step": 9450 + }, + { + "epoch": 6.54728091444406, + "grad_norm": 0.436946839094162, + "learning_rate": 3.4528432732316232e-06, + "loss": 0.0039, + "step": 9451 + }, + { + "epoch": 6.547973675095255, + "grad_norm": 0.2569490969181061, + "learning_rate": 3.4521497919556173e-06, + "loss": 0.0042, + "step": 9452 + }, + { + "epoch": 6.5486664357464495, + "grad_norm": 0.3160373568534851, + "learning_rate": 3.451456310679612e-06, + "loss": 0.0038, + "step": 9453 + }, + { + "epoch": 6.549359196397645, + "grad_norm": 0.23284515738487244, + "learning_rate": 3.4507628294036068e-06, + "loss": 0.0032, + "step": 9454 + }, + { + "epoch": 6.55005195704884, + "grad_norm": 0.45133253931999207, + "learning_rate": 3.450069348127601e-06, + "loss": 0.0041, + "step": 9455 + }, + { + "epoch": 6.550744717700034, + "grad_norm": 0.5596340298652649, + "learning_rate": 3.4493758668515954e-06, + "loss": 0.0057, + "step": 9456 + }, + { + "epoch": 6.55143747835123, + "grad_norm": 0.4074881374835968, + "learning_rate": 3.4486823855755894e-06, + "loss": 0.0068, + "step": 9457 + }, + { + "epoch": 6.552130239002425, + "grad_norm": 0.21876654028892517, + "learning_rate": 3.447988904299584e-06, + "loss": 0.0037, + "step": 9458 + }, + { + "epoch": 6.552822999653619, + "grad_norm": 0.17608174681663513, + "learning_rate": 3.447295423023579e-06, + "loss": 0.0028, + "step": 9459 + }, + { + "epoch": 6.553515760304815, + "grad_norm": 0.17134511470794678, + "learning_rate": 3.4466019417475734e-06, + "loss": 0.0032, + "step": 9460 + }, + { + "epoch": 6.55420852095601, + "grad_norm": 0.8364170789718628, + "learning_rate": 3.4459084604715675e-06, + "loss": 0.0043, + "step": 9461 + }, + { + "epoch": 6.554901281607204, + "grad_norm": 0.39484527707099915, + "learning_rate": 3.445214979195562e-06, + "loss": 0.0039, + "step": 9462 + }, + { + "epoch": 6.5555940422584, + "grad_norm": 0.35075441002845764, + "learning_rate": 3.444521497919556e-06, + "loss": 0.006, + "step": 9463 + }, + { + "epoch": 6.556286802909595, + "grad_norm": 0.278879851102829, + "learning_rate": 3.443828016643551e-06, + "loss": 0.0037, + "step": 9464 + }, + { + "epoch": 6.556979563560789, + "grad_norm": 0.2113790512084961, + "learning_rate": 3.4431345353675455e-06, + "loss": 0.0037, + "step": 9465 + }, + { + "epoch": 6.5576723242119845, + "grad_norm": 0.2821394205093384, + "learning_rate": 3.44244105409154e-06, + "loss": 0.0054, + "step": 9466 + }, + { + "epoch": 6.55836508486318, + "grad_norm": 0.2962658107280731, + "learning_rate": 3.441747572815534e-06, + "loss": 0.0033, + "step": 9467 + }, + { + "epoch": 6.559057845514375, + "grad_norm": 0.16165822744369507, + "learning_rate": 3.4410540915395286e-06, + "loss": 0.0029, + "step": 9468 + }, + { + "epoch": 6.5597506061655695, + "grad_norm": 0.560104250907898, + "learning_rate": 3.4403606102635236e-06, + "loss": 0.0068, + "step": 9469 + }, + { + "epoch": 6.560443366816765, + "grad_norm": 0.19999182224273682, + "learning_rate": 3.4396671289875176e-06, + "loss": 0.003, + "step": 9470 + }, + { + "epoch": 6.56113612746796, + "grad_norm": 0.3037041425704956, + "learning_rate": 3.438973647711512e-06, + "loss": 0.0053, + "step": 9471 + }, + { + "epoch": 6.561828888119155, + "grad_norm": 0.2861514985561371, + "learning_rate": 3.4382801664355062e-06, + "loss": 0.0031, + "step": 9472 + }, + { + "epoch": 6.56252164877035, + "grad_norm": 0.24474790692329407, + "learning_rate": 3.4375866851595008e-06, + "loss": 0.0037, + "step": 9473 + }, + { + "epoch": 6.563214409421545, + "grad_norm": 0.3315252959728241, + "learning_rate": 3.4368932038834957e-06, + "loss": 0.0054, + "step": 9474 + }, + { + "epoch": 6.56390717007274, + "grad_norm": 0.24230967462062836, + "learning_rate": 3.43619972260749e-06, + "loss": 0.0051, + "step": 9475 + }, + { + "epoch": 6.564599930723935, + "grad_norm": 0.25363656878471375, + "learning_rate": 3.4355062413314843e-06, + "loss": 0.0035, + "step": 9476 + }, + { + "epoch": 6.56529269137513, + "grad_norm": 0.2314106523990631, + "learning_rate": 3.434812760055479e-06, + "loss": 0.0043, + "step": 9477 + }, + { + "epoch": 6.565985452026325, + "grad_norm": 0.3255745470523834, + "learning_rate": 3.434119278779473e-06, + "loss": 0.0036, + "step": 9478 + }, + { + "epoch": 6.56667821267752, + "grad_norm": 0.25917908549308777, + "learning_rate": 3.433425797503468e-06, + "loss": 0.0057, + "step": 9479 + }, + { + "epoch": 6.567370973328715, + "grad_norm": 0.1794329434633255, + "learning_rate": 3.4327323162274623e-06, + "loss": 0.0031, + "step": 9480 + }, + { + "epoch": 6.56806373397991, + "grad_norm": 0.31018608808517456, + "learning_rate": 3.4320388349514564e-06, + "loss": 0.0051, + "step": 9481 + }, + { + "epoch": 6.568756494631105, + "grad_norm": 0.25144803524017334, + "learning_rate": 3.431345353675451e-06, + "loss": 0.005, + "step": 9482 + }, + { + "epoch": 6.5694492552823, + "grad_norm": 0.7011420130729675, + "learning_rate": 3.4306518723994454e-06, + "loss": 0.0065, + "step": 9483 + }, + { + "epoch": 6.570142015933495, + "grad_norm": 0.2495323121547699, + "learning_rate": 3.4299583911234404e-06, + "loss": 0.0044, + "step": 9484 + }, + { + "epoch": 6.5708347765846895, + "grad_norm": 0.20269997417926788, + "learning_rate": 3.4292649098474345e-06, + "loss": 0.0038, + "step": 9485 + }, + { + "epoch": 6.571527537235885, + "grad_norm": 0.29128703474998474, + "learning_rate": 3.428571428571429e-06, + "loss": 0.0043, + "step": 9486 + }, + { + "epoch": 6.57222029788708, + "grad_norm": 0.48349523544311523, + "learning_rate": 3.427877947295423e-06, + "loss": 0.0046, + "step": 9487 + }, + { + "epoch": 6.572913058538275, + "grad_norm": 0.2877331078052521, + "learning_rate": 3.4271844660194176e-06, + "loss": 0.0038, + "step": 9488 + }, + { + "epoch": 6.57360581918947, + "grad_norm": 0.19083867967128754, + "learning_rate": 3.4264909847434125e-06, + "loss": 0.0034, + "step": 9489 + }, + { + "epoch": 6.574298579840665, + "grad_norm": 0.39765849709510803, + "learning_rate": 3.425797503467407e-06, + "loss": 0.0044, + "step": 9490 + }, + { + "epoch": 6.57499134049186, + "grad_norm": 0.39917293190956116, + "learning_rate": 3.425104022191401e-06, + "loss": 0.0048, + "step": 9491 + }, + { + "epoch": 6.575684101143056, + "grad_norm": 0.3934260308742523, + "learning_rate": 3.4244105409153956e-06, + "loss": 0.0071, + "step": 9492 + }, + { + "epoch": 6.57637686179425, + "grad_norm": 0.30203375220298767, + "learning_rate": 3.4237170596393897e-06, + "loss": 0.0041, + "step": 9493 + }, + { + "epoch": 6.577069622445445, + "grad_norm": 0.29003268480300903, + "learning_rate": 3.4230235783633846e-06, + "loss": 0.0045, + "step": 9494 + }, + { + "epoch": 6.5777623830966405, + "grad_norm": 0.30671316385269165, + "learning_rate": 3.422330097087379e-06, + "loss": 0.0047, + "step": 9495 + }, + { + "epoch": 6.578455143747835, + "grad_norm": 0.44722458720207214, + "learning_rate": 3.4216366158113732e-06, + "loss": 0.0034, + "step": 9496 + }, + { + "epoch": 6.57914790439903, + "grad_norm": 0.2024839073419571, + "learning_rate": 3.4209431345353677e-06, + "loss": 0.0041, + "step": 9497 + }, + { + "epoch": 6.5798406650502255, + "grad_norm": 0.3256869316101074, + "learning_rate": 3.4202496532593622e-06, + "loss": 0.0035, + "step": 9498 + }, + { + "epoch": 6.58053342570142, + "grad_norm": 0.19540810585021973, + "learning_rate": 3.419556171983357e-06, + "loss": 0.0027, + "step": 9499 + }, + { + "epoch": 6.581226186352615, + "grad_norm": 0.664091169834137, + "learning_rate": 3.4188626907073513e-06, + "loss": 0.0035, + "step": 9500 + }, + { + "epoch": 6.58191894700381, + "grad_norm": 0.3423573970794678, + "learning_rate": 3.4181692094313458e-06, + "loss": 0.0056, + "step": 9501 + }, + { + "epoch": 6.582611707655005, + "grad_norm": 0.27303338050842285, + "learning_rate": 3.41747572815534e-06, + "loss": 0.0039, + "step": 9502 + }, + { + "epoch": 6.5833044683062, + "grad_norm": 0.18630249798297882, + "learning_rate": 3.4167822468793344e-06, + "loss": 0.003, + "step": 9503 + }, + { + "epoch": 6.583997228957395, + "grad_norm": 0.4994889795780182, + "learning_rate": 3.4160887656033293e-06, + "loss": 0.0059, + "step": 9504 + }, + { + "epoch": 6.58468998960859, + "grad_norm": 0.37421154975891113, + "learning_rate": 3.415395284327324e-06, + "loss": 0.0077, + "step": 9505 + }, + { + "epoch": 6.585382750259785, + "grad_norm": 0.17351457476615906, + "learning_rate": 3.414701803051318e-06, + "loss": 0.0031, + "step": 9506 + }, + { + "epoch": 6.58607551091098, + "grad_norm": 0.2850160300731659, + "learning_rate": 3.4140083217753124e-06, + "loss": 0.0036, + "step": 9507 + }, + { + "epoch": 6.586768271562176, + "grad_norm": 0.30493614077568054, + "learning_rate": 3.4133148404993065e-06, + "loss": 0.0044, + "step": 9508 + }, + { + "epoch": 6.58746103221337, + "grad_norm": 0.7638171315193176, + "learning_rate": 3.4126213592233014e-06, + "loss": 0.0046, + "step": 9509 + }, + { + "epoch": 6.588153792864565, + "grad_norm": 0.2794395685195923, + "learning_rate": 3.411927877947296e-06, + "loss": 0.0035, + "step": 9510 + }, + { + "epoch": 6.5888465535157605, + "grad_norm": 0.41764628887176514, + "learning_rate": 3.41123439667129e-06, + "loss": 0.0053, + "step": 9511 + }, + { + "epoch": 6.589539314166955, + "grad_norm": 0.5977731347084045, + "learning_rate": 3.4105409153952845e-06, + "loss": 0.0068, + "step": 9512 + }, + { + "epoch": 6.59023207481815, + "grad_norm": 0.14589254558086395, + "learning_rate": 3.409847434119279e-06, + "loss": 0.0033, + "step": 9513 + }, + { + "epoch": 6.5909248354693455, + "grad_norm": 0.22986231744289398, + "learning_rate": 3.409153952843274e-06, + "loss": 0.0041, + "step": 9514 + }, + { + "epoch": 6.591617596120541, + "grad_norm": 0.266808420419693, + "learning_rate": 3.408460471567268e-06, + "loss": 0.0036, + "step": 9515 + }, + { + "epoch": 6.592310356771735, + "grad_norm": 0.35455387830734253, + "learning_rate": 3.4077669902912626e-06, + "loss": 0.0061, + "step": 9516 + }, + { + "epoch": 6.59300311742293, + "grad_norm": Infinity, + "learning_rate": 3.4077669902912626e-06, + "loss": 0.0046, + "step": 9517 + }, + { + "epoch": 6.593695878074126, + "grad_norm": 0.28056949377059937, + "learning_rate": 3.4070735090152566e-06, + "loss": 0.0051, + "step": 9518 + }, + { + "epoch": 6.59438863872532, + "grad_norm": 0.22811107337474823, + "learning_rate": 3.406380027739251e-06, + "loss": 0.0033, + "step": 9519 + }, + { + "epoch": 6.595081399376515, + "grad_norm": 0.24492451548576355, + "learning_rate": 3.405686546463246e-06, + "loss": 0.0038, + "step": 9520 + }, + { + "epoch": 6.595774160027711, + "grad_norm": 0.27364006638526917, + "learning_rate": 3.40499306518724e-06, + "loss": 0.0037, + "step": 9521 + }, + { + "epoch": 6.596466920678905, + "grad_norm": 0.3937765955924988, + "learning_rate": 3.4042995839112347e-06, + "loss": 0.0031, + "step": 9522 + }, + { + "epoch": 6.5971596813301, + "grad_norm": 0.5127364993095398, + "learning_rate": 3.403606102635229e-06, + "loss": 0.004, + "step": 9523 + }, + { + "epoch": 6.597852441981296, + "grad_norm": 0.2622610032558441, + "learning_rate": 3.4029126213592233e-06, + "loss": 0.0041, + "step": 9524 + }, + { + "epoch": 6.59854520263249, + "grad_norm": 0.2696210741996765, + "learning_rate": 3.4022191400832182e-06, + "loss": 0.0031, + "step": 9525 + }, + { + "epoch": 6.599237963283685, + "grad_norm": 0.29049065709114075, + "learning_rate": 3.4015256588072127e-06, + "loss": 0.0049, + "step": 9526 + }, + { + "epoch": 6.599930723934881, + "grad_norm": 0.48603224754333496, + "learning_rate": 3.400832177531207e-06, + "loss": 0.0055, + "step": 9527 + }, + { + "epoch": 6.600623484586076, + "grad_norm": 0.33300310373306274, + "learning_rate": 3.4001386962552013e-06, + "loss": 0.004, + "step": 9528 + }, + { + "epoch": 6.60131624523727, + "grad_norm": 0.25161683559417725, + "learning_rate": 3.399445214979196e-06, + "loss": 0.0032, + "step": 9529 + }, + { + "epoch": 6.6020090058884655, + "grad_norm": 0.2611345946788788, + "learning_rate": 3.3987517337031908e-06, + "loss": 0.0042, + "step": 9530 + }, + { + "epoch": 6.602701766539661, + "grad_norm": 0.23604142665863037, + "learning_rate": 3.398058252427185e-06, + "loss": 0.0044, + "step": 9531 + }, + { + "epoch": 6.603394527190855, + "grad_norm": 0.2808261215686798, + "learning_rate": 3.3973647711511794e-06, + "loss": 0.0052, + "step": 9532 + }, + { + "epoch": 6.6040872878420505, + "grad_norm": 0.17070677876472473, + "learning_rate": 3.3966712898751735e-06, + "loss": 0.0031, + "step": 9533 + }, + { + "epoch": 6.604780048493246, + "grad_norm": 0.2397252768278122, + "learning_rate": 3.395977808599168e-06, + "loss": 0.0045, + "step": 9534 + }, + { + "epoch": 6.605472809144441, + "grad_norm": 0.25312379002571106, + "learning_rate": 3.395284327323163e-06, + "loss": 0.0042, + "step": 9535 + }, + { + "epoch": 6.606165569795635, + "grad_norm": 0.22014859318733215, + "learning_rate": 3.394590846047157e-06, + "loss": 0.0028, + "step": 9536 + }, + { + "epoch": 6.606858330446831, + "grad_norm": 0.2628215551376343, + "learning_rate": 3.3938973647711515e-06, + "loss": 0.0042, + "step": 9537 + }, + { + "epoch": 6.607551091098026, + "grad_norm": 0.36326444149017334, + "learning_rate": 3.393203883495146e-06, + "loss": 0.0041, + "step": 9538 + }, + { + "epoch": 6.60824385174922, + "grad_norm": 0.15043947100639343, + "learning_rate": 3.39251040221914e-06, + "loss": 0.0027, + "step": 9539 + }, + { + "epoch": 6.608936612400416, + "grad_norm": 0.18471574783325195, + "learning_rate": 3.391816920943135e-06, + "loss": 0.0034, + "step": 9540 + }, + { + "epoch": 6.609629373051611, + "grad_norm": 0.2622855007648468, + "learning_rate": 3.3911234396671295e-06, + "loss": 0.0044, + "step": 9541 + }, + { + "epoch": 6.610322133702805, + "grad_norm": 0.3304399251937866, + "learning_rate": 3.3904299583911236e-06, + "loss": 0.0036, + "step": 9542 + }, + { + "epoch": 6.611014894354001, + "grad_norm": 0.19413425028324127, + "learning_rate": 3.389736477115118e-06, + "loss": 0.0046, + "step": 9543 + }, + { + "epoch": 6.611707655005196, + "grad_norm": 0.2844732403755188, + "learning_rate": 3.3890429958391126e-06, + "loss": 0.0053, + "step": 9544 + }, + { + "epoch": 6.61240041565639, + "grad_norm": 0.2866968512535095, + "learning_rate": 3.388349514563107e-06, + "loss": 0.0043, + "step": 9545 + }, + { + "epoch": 6.6130931763075855, + "grad_norm": 0.2924291491508484, + "learning_rate": 3.3876560332871017e-06, + "loss": 0.0038, + "step": 9546 + }, + { + "epoch": 6.613785936958781, + "grad_norm": 0.24228566884994507, + "learning_rate": 3.386962552011096e-06, + "loss": 0.005, + "step": 9547 + }, + { + "epoch": 6.614478697609976, + "grad_norm": 0.31746983528137207, + "learning_rate": 3.3862690707350903e-06, + "loss": 0.0052, + "step": 9548 + }, + { + "epoch": 6.6151714582611705, + "grad_norm": 0.18304447829723358, + "learning_rate": 3.3855755894590848e-06, + "loss": 0.0039, + "step": 9549 + }, + { + "epoch": 6.615864218912366, + "grad_norm": 0.27996575832366943, + "learning_rate": 3.3848821081830797e-06, + "loss": 0.0042, + "step": 9550 + }, + { + "epoch": 6.616556979563561, + "grad_norm": 0.35377106070518494, + "learning_rate": 3.3841886269070738e-06, + "loss": 0.0037, + "step": 9551 + }, + { + "epoch": 6.617249740214755, + "grad_norm": 0.2977728247642517, + "learning_rate": 3.3834951456310683e-06, + "loss": 0.0038, + "step": 9552 + }, + { + "epoch": 6.617942500865951, + "grad_norm": 0.40454521775245667, + "learning_rate": 3.382801664355063e-06, + "loss": 0.0061, + "step": 9553 + }, + { + "epoch": 6.618635261517146, + "grad_norm": 0.2601086497306824, + "learning_rate": 3.382108183079057e-06, + "loss": 0.0048, + "step": 9554 + }, + { + "epoch": 6.619328022168341, + "grad_norm": 0.2615302503108978, + "learning_rate": 3.381414701803052e-06, + "loss": 0.0044, + "step": 9555 + }, + { + "epoch": 6.620020782819536, + "grad_norm": 0.19293847680091858, + "learning_rate": 3.3807212205270463e-06, + "loss": 0.0029, + "step": 9556 + }, + { + "epoch": 6.620713543470731, + "grad_norm": 0.24798336625099182, + "learning_rate": 3.3800277392510404e-06, + "loss": 0.0046, + "step": 9557 + }, + { + "epoch": 6.621406304121926, + "grad_norm": 0.266619473695755, + "learning_rate": 3.379334257975035e-06, + "loss": 0.0037, + "step": 9558 + }, + { + "epoch": 6.622099064773121, + "grad_norm": 0.2793313264846802, + "learning_rate": 3.3786407766990294e-06, + "loss": 0.0041, + "step": 9559 + }, + { + "epoch": 6.622791825424316, + "grad_norm": 0.20297564566135406, + "learning_rate": 3.377947295423024e-06, + "loss": 0.0037, + "step": 9560 + }, + { + "epoch": 6.623484586075511, + "grad_norm": 0.3926437199115753, + "learning_rate": 3.3772538141470185e-06, + "loss": 0.0054, + "step": 9561 + }, + { + "epoch": 6.6241773467267056, + "grad_norm": 0.27789077162742615, + "learning_rate": 3.376560332871013e-06, + "loss": 0.0039, + "step": 9562 + }, + { + "epoch": 6.624870107377901, + "grad_norm": 0.3504057824611664, + "learning_rate": 3.375866851595007e-06, + "loss": 0.0046, + "step": 9563 + }, + { + "epoch": 6.625562868029096, + "grad_norm": 0.7347978353500366, + "learning_rate": 3.3751733703190016e-06, + "loss": 0.0046, + "step": 9564 + }, + { + "epoch": 6.6262556286802905, + "grad_norm": 0.2537793219089508, + "learning_rate": 3.3744798890429957e-06, + "loss": 0.0039, + "step": 9565 + }, + { + "epoch": 6.626948389331486, + "grad_norm": 0.36182963848114014, + "learning_rate": 3.3737864077669906e-06, + "loss": 0.0034, + "step": 9566 + }, + { + "epoch": 6.627641149982681, + "grad_norm": 0.14422914385795593, + "learning_rate": 3.373092926490985e-06, + "loss": 0.0025, + "step": 9567 + }, + { + "epoch": 6.628333910633876, + "grad_norm": 0.25976353883743286, + "learning_rate": 3.3723994452149796e-06, + "loss": 0.0043, + "step": 9568 + }, + { + "epoch": 6.629026671285071, + "grad_norm": 0.3340349495410919, + "learning_rate": 3.3717059639389737e-06, + "loss": 0.0063, + "step": 9569 + }, + { + "epoch": 6.629719431936266, + "grad_norm": 0.46246954798698425, + "learning_rate": 3.371012482662968e-06, + "loss": 0.0049, + "step": 9570 + }, + { + "epoch": 6.630412192587461, + "grad_norm": 0.2905452251434326, + "learning_rate": 3.370319001386963e-06, + "loss": 0.0051, + "step": 9571 + }, + { + "epoch": 6.631104953238656, + "grad_norm": 0.26880714297294617, + "learning_rate": 3.3696255201109572e-06, + "loss": 0.0032, + "step": 9572 + }, + { + "epoch": 6.631797713889851, + "grad_norm": 0.2723860740661621, + "learning_rate": 3.3689320388349517e-06, + "loss": 0.0046, + "step": 9573 + }, + { + "epoch": 6.632490474541046, + "grad_norm": 0.23139895498752594, + "learning_rate": 3.368238557558946e-06, + "loss": 0.0046, + "step": 9574 + }, + { + "epoch": 6.6331832351922415, + "grad_norm": 0.40411585569381714, + "learning_rate": 3.3675450762829403e-06, + "loss": 0.0039, + "step": 9575 + }, + { + "epoch": 6.633875995843436, + "grad_norm": 0.20245085656642914, + "learning_rate": 3.3668515950069353e-06, + "loss": 0.0043, + "step": 9576 + }, + { + "epoch": 6.634568756494631, + "grad_norm": 0.1982521414756775, + "learning_rate": 3.3661581137309298e-06, + "loss": 0.003, + "step": 9577 + }, + { + "epoch": 6.6352615171458265, + "grad_norm": 0.290549099445343, + "learning_rate": 3.365464632454924e-06, + "loss": 0.0048, + "step": 9578 + }, + { + "epoch": 6.635954277797021, + "grad_norm": 0.23341302573680878, + "learning_rate": 3.3647711511789184e-06, + "loss": 0.005, + "step": 9579 + }, + { + "epoch": 6.636647038448216, + "grad_norm": 0.31134483218193054, + "learning_rate": 3.3640776699029125e-06, + "loss": 0.0039, + "step": 9580 + }, + { + "epoch": 6.637339799099411, + "grad_norm": 0.26159578561782837, + "learning_rate": 3.3633841886269074e-06, + "loss": 0.0043, + "step": 9581 + }, + { + "epoch": 6.638032559750606, + "grad_norm": 0.22782525420188904, + "learning_rate": 3.362690707350902e-06, + "loss": 0.0036, + "step": 9582 + }, + { + "epoch": 6.638725320401801, + "grad_norm": 0.21495933830738068, + "learning_rate": 3.3619972260748964e-06, + "loss": 0.0031, + "step": 9583 + }, + { + "epoch": 6.639418081052996, + "grad_norm": 0.13852988183498383, + "learning_rate": 3.3613037447988905e-06, + "loss": 0.0026, + "step": 9584 + }, + { + "epoch": 6.640110841704191, + "grad_norm": 0.3558557331562042, + "learning_rate": 3.360610263522885e-06, + "loss": 0.0043, + "step": 9585 + }, + { + "epoch": 6.640803602355386, + "grad_norm": 0.32118579745292664, + "learning_rate": 3.35991678224688e-06, + "loss": 0.0037, + "step": 9586 + }, + { + "epoch": 6.641496363006581, + "grad_norm": 0.25010016560554504, + "learning_rate": 3.359223300970874e-06, + "loss": 0.0035, + "step": 9587 + }, + { + "epoch": 6.642189123657777, + "grad_norm": 0.1745690405368805, + "learning_rate": 3.3585298196948685e-06, + "loss": 0.0026, + "step": 9588 + }, + { + "epoch": 6.642881884308971, + "grad_norm": 0.20617301762104034, + "learning_rate": 3.3578363384188626e-06, + "loss": 0.0043, + "step": 9589 + }, + { + "epoch": 6.643574644960166, + "grad_norm": 0.3337268829345703, + "learning_rate": 3.357142857142857e-06, + "loss": 0.0042, + "step": 9590 + }, + { + "epoch": 6.6442674056113615, + "grad_norm": 0.37391218543052673, + "learning_rate": 3.356449375866852e-06, + "loss": 0.0046, + "step": 9591 + }, + { + "epoch": 6.644960166262556, + "grad_norm": 0.42142197489738464, + "learning_rate": 3.3557558945908466e-06, + "loss": 0.0031, + "step": 9592 + }, + { + "epoch": 6.645652926913751, + "grad_norm": 0.36293545365333557, + "learning_rate": 3.3550624133148407e-06, + "loss": 0.0036, + "step": 9593 + }, + { + "epoch": 6.6463456875649465, + "grad_norm": 0.10082734376192093, + "learning_rate": 3.354368932038835e-06, + "loss": 0.002, + "step": 9594 + }, + { + "epoch": 6.647038448216142, + "grad_norm": 0.386010080575943, + "learning_rate": 3.3536754507628293e-06, + "loss": 0.0034, + "step": 9595 + }, + { + "epoch": 6.647731208867336, + "grad_norm": 0.3421812355518341, + "learning_rate": 3.352981969486824e-06, + "loss": 0.005, + "step": 9596 + }, + { + "epoch": 6.648423969518531, + "grad_norm": 0.3279930353164673, + "learning_rate": 3.3522884882108187e-06, + "loss": 0.0035, + "step": 9597 + }, + { + "epoch": 6.649116730169727, + "grad_norm": 0.17352928221225739, + "learning_rate": 3.3515950069348128e-06, + "loss": 0.0037, + "step": 9598 + }, + { + "epoch": 6.649809490820921, + "grad_norm": 0.16105639934539795, + "learning_rate": 3.3509015256588073e-06, + "loss": 0.0025, + "step": 9599 + }, + { + "epoch": 6.650502251472116, + "grad_norm": 0.44193294644355774, + "learning_rate": 3.350208044382802e-06, + "loss": 0.0048, + "step": 9600 + }, + { + "epoch": 6.651195012123312, + "grad_norm": 0.21000400185585022, + "learning_rate": 3.3495145631067967e-06, + "loss": 0.0029, + "step": 9601 + }, + { + "epoch": 6.651887772774506, + "grad_norm": 0.2680608630180359, + "learning_rate": 3.348821081830791e-06, + "loss": 0.0039, + "step": 9602 + }, + { + "epoch": 6.652580533425701, + "grad_norm": 0.45542842149734497, + "learning_rate": 3.3481276005547853e-06, + "loss": 0.0046, + "step": 9603 + }, + { + "epoch": 6.653273294076897, + "grad_norm": 0.27576544880867004, + "learning_rate": 3.3474341192787794e-06, + "loss": 0.0037, + "step": 9604 + }, + { + "epoch": 6.653966054728091, + "grad_norm": 0.31914380192756653, + "learning_rate": 3.346740638002774e-06, + "loss": 0.0042, + "step": 9605 + }, + { + "epoch": 6.654658815379286, + "grad_norm": 0.44567424058914185, + "learning_rate": 3.346047156726769e-06, + "loss": 0.0065, + "step": 9606 + }, + { + "epoch": 6.6553515760304816, + "grad_norm": 0.24427129328250885, + "learning_rate": 3.3453536754507634e-06, + "loss": 0.0038, + "step": 9607 + }, + { + "epoch": 6.656044336681677, + "grad_norm": 0.24412909150123596, + "learning_rate": 3.3446601941747575e-06, + "loss": 0.0031, + "step": 9608 + }, + { + "epoch": 6.656737097332871, + "grad_norm": 0.3295021653175354, + "learning_rate": 3.343966712898752e-06, + "loss": 0.0037, + "step": 9609 + }, + { + "epoch": 6.6574298579840665, + "grad_norm": 0.2805141806602478, + "learning_rate": 3.343273231622746e-06, + "loss": 0.0057, + "step": 9610 + }, + { + "epoch": 6.658122618635262, + "grad_norm": 0.28327521681785583, + "learning_rate": 3.342579750346741e-06, + "loss": 0.0049, + "step": 9611 + }, + { + "epoch": 6.658815379286456, + "grad_norm": 0.20474810898303986, + "learning_rate": 3.3418862690707355e-06, + "loss": 0.0038, + "step": 9612 + }, + { + "epoch": 6.6595081399376514, + "grad_norm": 0.42438840866088867, + "learning_rate": 3.3411927877947296e-06, + "loss": 0.0061, + "step": 9613 + }, + { + "epoch": 6.660200900588847, + "grad_norm": 0.1490003913640976, + "learning_rate": 3.340499306518724e-06, + "loss": 0.0034, + "step": 9614 + }, + { + "epoch": 6.660893661240042, + "grad_norm": 0.4401046931743622, + "learning_rate": 3.3398058252427186e-06, + "loss": 0.0041, + "step": 9615 + }, + { + "epoch": 6.661586421891236, + "grad_norm": 0.1705765724182129, + "learning_rate": 3.3391123439667135e-06, + "loss": 0.0027, + "step": 9616 + }, + { + "epoch": 6.662279182542432, + "grad_norm": 0.30720090866088867, + "learning_rate": 3.3384188626907076e-06, + "loss": 0.0044, + "step": 9617 + }, + { + "epoch": 6.662971943193627, + "grad_norm": 0.33989569544792175, + "learning_rate": 3.337725381414702e-06, + "loss": 0.0044, + "step": 9618 + }, + { + "epoch": 6.663664703844821, + "grad_norm": 0.20146344602108002, + "learning_rate": 3.3370319001386962e-06, + "loss": 0.0039, + "step": 9619 + }, + { + "epoch": 6.664357464496017, + "grad_norm": 0.180581197142601, + "learning_rate": 3.3363384188626907e-06, + "loss": 0.0041, + "step": 9620 + }, + { + "epoch": 6.665050225147212, + "grad_norm": 0.18227854371070862, + "learning_rate": 3.3356449375866857e-06, + "loss": 0.0032, + "step": 9621 + }, + { + "epoch": 6.665742985798406, + "grad_norm": 0.7514982223510742, + "learning_rate": 3.3349514563106797e-06, + "loss": 0.0044, + "step": 9622 + }, + { + "epoch": 6.666435746449602, + "grad_norm": 0.2514946162700653, + "learning_rate": 3.3342579750346743e-06, + "loss": 0.0035, + "step": 9623 + }, + { + "epoch": 6.667128507100797, + "grad_norm": 0.2893274128437042, + "learning_rate": 3.3335644937586688e-06, + "loss": 0.0055, + "step": 9624 + }, + { + "epoch": 6.667821267751991, + "grad_norm": 0.3611743748188019, + "learning_rate": 3.332871012482663e-06, + "loss": 0.0048, + "step": 9625 + }, + { + "epoch": 6.6685140284031865, + "grad_norm": 0.2338365763425827, + "learning_rate": 3.332177531206658e-06, + "loss": 0.0033, + "step": 9626 + }, + { + "epoch": 6.669206789054382, + "grad_norm": 0.22902587056159973, + "learning_rate": 3.3314840499306523e-06, + "loss": 0.0037, + "step": 9627 + }, + { + "epoch": 6.669899549705577, + "grad_norm": 0.24569883942604065, + "learning_rate": 3.3307905686546464e-06, + "loss": 0.0038, + "step": 9628 + }, + { + "epoch": 6.6705923103567715, + "grad_norm": 0.723281979560852, + "learning_rate": 3.330097087378641e-06, + "loss": 0.005, + "step": 9629 + }, + { + "epoch": 6.671285071007967, + "grad_norm": 0.27090781927108765, + "learning_rate": 3.3294036061026354e-06, + "loss": 0.0038, + "step": 9630 + }, + { + "epoch": 6.671977831659162, + "grad_norm": 0.44531315565109253, + "learning_rate": 3.3287101248266303e-06, + "loss": 0.0037, + "step": 9631 + }, + { + "epoch": 6.672670592310356, + "grad_norm": 0.4721844494342804, + "learning_rate": 3.3280166435506244e-06, + "loss": 0.0067, + "step": 9632 + }, + { + "epoch": 6.673363352961552, + "grad_norm": 0.2333526611328125, + "learning_rate": 3.327323162274619e-06, + "loss": 0.0038, + "step": 9633 + }, + { + "epoch": 6.674056113612747, + "grad_norm": 0.22444242238998413, + "learning_rate": 3.326629680998613e-06, + "loss": 0.0048, + "step": 9634 + }, + { + "epoch": 6.674748874263942, + "grad_norm": 0.7265631556510925, + "learning_rate": 3.3259361997226075e-06, + "loss": 0.0065, + "step": 9635 + }, + { + "epoch": 6.675441634915137, + "grad_norm": 0.45793724060058594, + "learning_rate": 3.3252427184466025e-06, + "loss": 0.0038, + "step": 9636 + }, + { + "epoch": 6.676134395566332, + "grad_norm": 0.21917425096035004, + "learning_rate": 3.3245492371705966e-06, + "loss": 0.0038, + "step": 9637 + }, + { + "epoch": 6.676827156217527, + "grad_norm": 0.26209592819213867, + "learning_rate": 3.323855755894591e-06, + "loss": 0.0043, + "step": 9638 + }, + { + "epoch": 6.677519916868722, + "grad_norm": 0.25796249508857727, + "learning_rate": 3.3231622746185856e-06, + "loss": 0.0048, + "step": 9639 + }, + { + "epoch": 6.678212677519917, + "grad_norm": 0.26896247267723083, + "learning_rate": 3.3224687933425797e-06, + "loss": 0.0037, + "step": 9640 + }, + { + "epoch": 6.678905438171112, + "grad_norm": 0.2653186023235321, + "learning_rate": 3.3217753120665746e-06, + "loss": 0.004, + "step": 9641 + }, + { + "epoch": 6.6795981988223065, + "grad_norm": 0.16648922860622406, + "learning_rate": 3.321081830790569e-06, + "loss": 0.0035, + "step": 9642 + }, + { + "epoch": 6.680290959473502, + "grad_norm": 0.26881250739097595, + "learning_rate": 3.320388349514563e-06, + "loss": 0.0042, + "step": 9643 + }, + { + "epoch": 6.680983720124697, + "grad_norm": 0.2109580636024475, + "learning_rate": 3.3196948682385577e-06, + "loss": 0.0029, + "step": 9644 + }, + { + "epoch": 6.6816764807758915, + "grad_norm": 0.38735663890838623, + "learning_rate": 3.319001386962552e-06, + "loss": 0.0059, + "step": 9645 + }, + { + "epoch": 6.682369241427087, + "grad_norm": 0.17858567833900452, + "learning_rate": 3.3183079056865467e-06, + "loss": 0.0037, + "step": 9646 + }, + { + "epoch": 6.683062002078282, + "grad_norm": 0.2825709879398346, + "learning_rate": 3.3176144244105412e-06, + "loss": 0.0043, + "step": 9647 + }, + { + "epoch": 6.683754762729477, + "grad_norm": 0.18355581164360046, + "learning_rate": 3.3169209431345357e-06, + "loss": 0.0033, + "step": 9648 + }, + { + "epoch": 6.684447523380672, + "grad_norm": 0.21532532572746277, + "learning_rate": 3.31622746185853e-06, + "loss": 0.0035, + "step": 9649 + }, + { + "epoch": 6.685140284031867, + "grad_norm": 0.4155719578266144, + "learning_rate": 3.3155339805825243e-06, + "loss": 0.0043, + "step": 9650 + }, + { + "epoch": 6.685833044683062, + "grad_norm": 0.4123566150665283, + "learning_rate": 3.3148404993065193e-06, + "loss": 0.006, + "step": 9651 + }, + { + "epoch": 6.686525805334257, + "grad_norm": 0.236257404088974, + "learning_rate": 3.3141470180305134e-06, + "loss": 0.0027, + "step": 9652 + }, + { + "epoch": 6.687218565985452, + "grad_norm": 0.31226691603660583, + "learning_rate": 3.313453536754508e-06, + "loss": 0.0047, + "step": 9653 + }, + { + "epoch": 6.687911326636647, + "grad_norm": 0.35732463002204895, + "learning_rate": 3.3127600554785024e-06, + "loss": 0.0059, + "step": 9654 + }, + { + "epoch": 6.6886040872878425, + "grad_norm": 0.3564273715019226, + "learning_rate": 3.3120665742024965e-06, + "loss": 0.0047, + "step": 9655 + }, + { + "epoch": 6.689296847939037, + "grad_norm": 0.30896562337875366, + "learning_rate": 3.3113730929264914e-06, + "loss": 0.0059, + "step": 9656 + }, + { + "epoch": 6.689989608590232, + "grad_norm": 0.7997479438781738, + "learning_rate": 3.310679611650486e-06, + "loss": 0.0056, + "step": 9657 + }, + { + "epoch": 6.6906823692414275, + "grad_norm": 0.22391101717948914, + "learning_rate": 3.30998613037448e-06, + "loss": 0.0027, + "step": 9658 + }, + { + "epoch": 6.691375129892622, + "grad_norm": 0.2137303203344345, + "learning_rate": 3.3092926490984745e-06, + "loss": 0.0031, + "step": 9659 + }, + { + "epoch": 6.692067890543817, + "grad_norm": 0.14612312614917755, + "learning_rate": 3.308599167822469e-06, + "loss": 0.0025, + "step": 9660 + }, + { + "epoch": 6.692760651195012, + "grad_norm": 0.47494062781333923, + "learning_rate": 3.3079056865464635e-06, + "loss": 0.0042, + "step": 9661 + }, + { + "epoch": 6.693453411846207, + "grad_norm": 0.36330175399780273, + "learning_rate": 3.307212205270458e-06, + "loss": 0.0042, + "step": 9662 + }, + { + "epoch": 6.694146172497402, + "grad_norm": 0.4094347059726715, + "learning_rate": 3.3065187239944525e-06, + "loss": 0.0049, + "step": 9663 + }, + { + "epoch": 6.694838933148597, + "grad_norm": 0.2243693619966507, + "learning_rate": 3.3058252427184466e-06, + "loss": 0.0037, + "step": 9664 + }, + { + "epoch": 6.695531693799792, + "grad_norm": 0.46385589241981506, + "learning_rate": 3.305131761442441e-06, + "loss": 0.0038, + "step": 9665 + }, + { + "epoch": 6.696224454450987, + "grad_norm": 0.3777003288269043, + "learning_rate": 3.304438280166436e-06, + "loss": 0.0053, + "step": 9666 + }, + { + "epoch": 6.696917215102182, + "grad_norm": 0.46151116490364075, + "learning_rate": 3.30374479889043e-06, + "loss": 0.0046, + "step": 9667 + }, + { + "epoch": 6.697609975753378, + "grad_norm": 0.24645903706550598, + "learning_rate": 3.3030513176144247e-06, + "loss": 0.0027, + "step": 9668 + }, + { + "epoch": 6.698302736404572, + "grad_norm": 0.3040834367275238, + "learning_rate": 3.302357836338419e-06, + "loss": 0.0037, + "step": 9669 + }, + { + "epoch": 6.698995497055767, + "grad_norm": 0.3605433404445648, + "learning_rate": 3.3016643550624133e-06, + "loss": 0.006, + "step": 9670 + }, + { + "epoch": 6.6996882577069625, + "grad_norm": 0.21150998771190643, + "learning_rate": 3.300970873786408e-06, + "loss": 0.0038, + "step": 9671 + }, + { + "epoch": 6.700381018358157, + "grad_norm": 0.23658040165901184, + "learning_rate": 3.3002773925104027e-06, + "loss": 0.0033, + "step": 9672 + }, + { + "epoch": 6.701073779009352, + "grad_norm": 0.19974714517593384, + "learning_rate": 3.299583911234397e-06, + "loss": 0.0034, + "step": 9673 + }, + { + "epoch": 6.7017665396605475, + "grad_norm": 0.2631831467151642, + "learning_rate": 3.2988904299583913e-06, + "loss": 0.0039, + "step": 9674 + }, + { + "epoch": 6.702459300311743, + "grad_norm": 0.22800016403198242, + "learning_rate": 3.298196948682386e-06, + "loss": 0.0033, + "step": 9675 + }, + { + "epoch": 6.703152060962937, + "grad_norm": 0.1849680095911026, + "learning_rate": 3.2975034674063803e-06, + "loss": 0.0031, + "step": 9676 + }, + { + "epoch": 6.703844821614132, + "grad_norm": 0.160861536860466, + "learning_rate": 3.296809986130375e-06, + "loss": 0.0024, + "step": 9677 + }, + { + "epoch": 6.704537582265328, + "grad_norm": 0.2611650824546814, + "learning_rate": 3.2961165048543693e-06, + "loss": 0.0037, + "step": 9678 + }, + { + "epoch": 6.705230342916522, + "grad_norm": 0.17863458395004272, + "learning_rate": 3.2954230235783634e-06, + "loss": 0.0033, + "step": 9679 + }, + { + "epoch": 6.705923103567717, + "grad_norm": 0.37167879939079285, + "learning_rate": 3.294729542302358e-06, + "loss": 0.0059, + "step": 9680 + }, + { + "epoch": 6.706615864218913, + "grad_norm": 0.22416667640209198, + "learning_rate": 3.294036061026353e-06, + "loss": 0.0048, + "step": 9681 + }, + { + "epoch": 6.707308624870107, + "grad_norm": 0.3475745618343353, + "learning_rate": 3.293342579750347e-06, + "loss": 0.0046, + "step": 9682 + }, + { + "epoch": 6.708001385521302, + "grad_norm": 0.20298218727111816, + "learning_rate": 3.2926490984743415e-06, + "loss": 0.0032, + "step": 9683 + }, + { + "epoch": 6.708694146172498, + "grad_norm": 0.33679860830307007, + "learning_rate": 3.291955617198336e-06, + "loss": 0.0041, + "step": 9684 + }, + { + "epoch": 6.709386906823692, + "grad_norm": 0.23559480905532837, + "learning_rate": 3.29126213592233e-06, + "loss": 0.0034, + "step": 9685 + }, + { + "epoch": 6.710079667474887, + "grad_norm": 0.19997474551200867, + "learning_rate": 3.290568654646325e-06, + "loss": 0.0037, + "step": 9686 + }, + { + "epoch": 6.7107724281260825, + "grad_norm": 0.1411164402961731, + "learning_rate": 3.2898751733703195e-06, + "loss": 0.0026, + "step": 9687 + }, + { + "epoch": 6.711465188777278, + "grad_norm": 0.3438756763935089, + "learning_rate": 3.2891816920943136e-06, + "loss": 0.006, + "step": 9688 + }, + { + "epoch": 6.712157949428472, + "grad_norm": 0.2956550419330597, + "learning_rate": 3.288488210818308e-06, + "loss": 0.0045, + "step": 9689 + }, + { + "epoch": 6.7128507100796675, + "grad_norm": 0.327735036611557, + "learning_rate": 3.287794729542302e-06, + "loss": 0.0038, + "step": 9690 + }, + { + "epoch": 6.713543470730863, + "grad_norm": 0.2549278736114502, + "learning_rate": 3.287101248266297e-06, + "loss": 0.0041, + "step": 9691 + }, + { + "epoch": 6.714236231382057, + "grad_norm": 0.2906319200992584, + "learning_rate": 3.2864077669902916e-06, + "loss": 0.0037, + "step": 9692 + }, + { + "epoch": 6.714928992033252, + "grad_norm": 0.19674977660179138, + "learning_rate": 3.285714285714286e-06, + "loss": 0.0027, + "step": 9693 + }, + { + "epoch": 6.715621752684448, + "grad_norm": 0.6923297047615051, + "learning_rate": 3.2850208044382802e-06, + "loss": 0.0054, + "step": 9694 + }, + { + "epoch": 6.716314513335643, + "grad_norm": 0.2080959528684616, + "learning_rate": 3.2843273231622747e-06, + "loss": 0.0029, + "step": 9695 + }, + { + "epoch": 6.717007273986837, + "grad_norm": 0.3292155861854553, + "learning_rate": 3.2836338418862697e-06, + "loss": 0.0049, + "step": 9696 + }, + { + "epoch": 6.717700034638033, + "grad_norm": 0.18534091114997864, + "learning_rate": 3.2829403606102638e-06, + "loss": 0.0045, + "step": 9697 + }, + { + "epoch": 6.718392795289228, + "grad_norm": 0.11694613099098206, + "learning_rate": 3.2822468793342583e-06, + "loss": 0.002, + "step": 9698 + }, + { + "epoch": 6.719085555940422, + "grad_norm": 0.24391762912273407, + "learning_rate": 3.2815533980582528e-06, + "loss": 0.0028, + "step": 9699 + }, + { + "epoch": 6.719778316591618, + "grad_norm": 0.11731547862291336, + "learning_rate": 3.280859916782247e-06, + "loss": 0.0025, + "step": 9700 + }, + { + "epoch": 6.720471077242813, + "grad_norm": 0.3219974637031555, + "learning_rate": 3.280166435506242e-06, + "loss": 0.0038, + "step": 9701 + }, + { + "epoch": 6.721163837894007, + "grad_norm": 0.42592859268188477, + "learning_rate": 3.2794729542302363e-06, + "loss": 0.0046, + "step": 9702 + }, + { + "epoch": 6.721856598545203, + "grad_norm": 0.3236500322818756, + "learning_rate": 3.2787794729542304e-06, + "loss": 0.0056, + "step": 9703 + }, + { + "epoch": 6.722549359196398, + "grad_norm": 0.20326076447963715, + "learning_rate": 3.278085991678225e-06, + "loss": 0.0034, + "step": 9704 + }, + { + "epoch": 6.723242119847592, + "grad_norm": 0.4166771471500397, + "learning_rate": 3.277392510402219e-06, + "loss": 0.0045, + "step": 9705 + }, + { + "epoch": 6.7239348804987875, + "grad_norm": 0.32326415181159973, + "learning_rate": 3.276699029126214e-06, + "loss": 0.005, + "step": 9706 + }, + { + "epoch": 6.724627641149983, + "grad_norm": 0.1668633371591568, + "learning_rate": 3.2760055478502084e-06, + "loss": 0.0027, + "step": 9707 + }, + { + "epoch": 6.725320401801178, + "grad_norm": 0.21399752795696259, + "learning_rate": 3.275312066574203e-06, + "loss": 0.0046, + "step": 9708 + }, + { + "epoch": 6.7260131624523725, + "grad_norm": 0.23568181693553925, + "learning_rate": 3.274618585298197e-06, + "loss": 0.0032, + "step": 9709 + }, + { + "epoch": 6.726705923103568, + "grad_norm": 0.18345728516578674, + "learning_rate": 3.2739251040221915e-06, + "loss": 0.0041, + "step": 9710 + }, + { + "epoch": 6.727398683754763, + "grad_norm": 0.28310561180114746, + "learning_rate": 3.2732316227461865e-06, + "loss": 0.003, + "step": 9711 + }, + { + "epoch": 6.728091444405957, + "grad_norm": 0.5181269645690918, + "learning_rate": 3.2725381414701806e-06, + "loss": 0.0059, + "step": 9712 + }, + { + "epoch": 6.728784205057153, + "grad_norm": 0.19630587100982666, + "learning_rate": 3.271844660194175e-06, + "loss": 0.0035, + "step": 9713 + }, + { + "epoch": 6.729476965708348, + "grad_norm": 0.21144691109657288, + "learning_rate": 3.271151178918169e-06, + "loss": 0.0035, + "step": 9714 + }, + { + "epoch": 6.730169726359543, + "grad_norm": 0.46461763978004456, + "learning_rate": 3.2704576976421637e-06, + "loss": 0.0049, + "step": 9715 + }, + { + "epoch": 6.730862487010738, + "grad_norm": 0.2225027084350586, + "learning_rate": 3.2697642163661586e-06, + "loss": 0.0044, + "step": 9716 + }, + { + "epoch": 6.731555247661933, + "grad_norm": 0.15940137207508087, + "learning_rate": 3.269070735090153e-06, + "loss": 0.003, + "step": 9717 + }, + { + "epoch": 6.732248008313128, + "grad_norm": 0.2563859522342682, + "learning_rate": 3.268377253814147e-06, + "loss": 0.0034, + "step": 9718 + }, + { + "epoch": 6.732940768964323, + "grad_norm": 0.23835010826587677, + "learning_rate": 3.2676837725381417e-06, + "loss": 0.0043, + "step": 9719 + }, + { + "epoch": 6.733633529615518, + "grad_norm": 0.4149261713027954, + "learning_rate": 3.266990291262136e-06, + "loss": 0.0049, + "step": 9720 + }, + { + "epoch": 6.734326290266713, + "grad_norm": 0.16650626063346863, + "learning_rate": 3.2662968099861307e-06, + "loss": 0.003, + "step": 9721 + }, + { + "epoch": 6.7350190509179075, + "grad_norm": 0.37145712971687317, + "learning_rate": 3.2656033287101252e-06, + "loss": 0.0056, + "step": 9722 + }, + { + "epoch": 6.735711811569103, + "grad_norm": 0.2124325931072235, + "learning_rate": 3.2649098474341197e-06, + "loss": 0.0039, + "step": 9723 + }, + { + "epoch": 6.736404572220298, + "grad_norm": 0.4586735665798187, + "learning_rate": 3.264216366158114e-06, + "loss": 0.0059, + "step": 9724 + }, + { + "epoch": 6.7370973328714925, + "grad_norm": 0.30123084783554077, + "learning_rate": 3.2635228848821083e-06, + "loss": 0.0066, + "step": 9725 + }, + { + "epoch": 6.737790093522688, + "grad_norm": 0.24534186720848083, + "learning_rate": 3.2628294036061033e-06, + "loss": 0.0043, + "step": 9726 + }, + { + "epoch": 6.738482854173883, + "grad_norm": 0.27212992310523987, + "learning_rate": 3.2621359223300974e-06, + "loss": 0.0043, + "step": 9727 + }, + { + "epoch": 6.739175614825078, + "grad_norm": 0.3368600904941559, + "learning_rate": 3.261442441054092e-06, + "loss": 0.0045, + "step": 9728 + }, + { + "epoch": 6.739868375476273, + "grad_norm": 0.46149179339408875, + "learning_rate": 3.260748959778086e-06, + "loss": 0.005, + "step": 9729 + }, + { + "epoch": 6.740561136127468, + "grad_norm": 0.4382191002368927, + "learning_rate": 3.2600554785020805e-06, + "loss": 0.0035, + "step": 9730 + }, + { + "epoch": 6.741253896778663, + "grad_norm": 0.1537993997335434, + "learning_rate": 3.2593619972260754e-06, + "loss": 0.0039, + "step": 9731 + }, + { + "epoch": 6.741946657429858, + "grad_norm": 0.25139740109443665, + "learning_rate": 3.25866851595007e-06, + "loss": 0.0058, + "step": 9732 + }, + { + "epoch": 6.742639418081053, + "grad_norm": 0.16098076105117798, + "learning_rate": 3.257975034674064e-06, + "loss": 0.0029, + "step": 9733 + }, + { + "epoch": 6.743332178732248, + "grad_norm": 0.25203338265419006, + "learning_rate": 3.2572815533980585e-06, + "loss": 0.0042, + "step": 9734 + }, + { + "epoch": 6.7440249393834435, + "grad_norm": 0.6023163199424744, + "learning_rate": 3.2565880721220526e-06, + "loss": 0.0064, + "step": 9735 + }, + { + "epoch": 6.744717700034638, + "grad_norm": 0.18290142714977264, + "learning_rate": 3.2558945908460475e-06, + "loss": 0.0035, + "step": 9736 + }, + { + "epoch": 6.745410460685833, + "grad_norm": 0.2506641447544098, + "learning_rate": 3.255201109570042e-06, + "loss": 0.0054, + "step": 9737 + }, + { + "epoch": 6.746103221337028, + "grad_norm": 0.30213823914527893, + "learning_rate": 3.254507628294036e-06, + "loss": 0.0042, + "step": 9738 + }, + { + "epoch": 6.746795981988223, + "grad_norm": 0.2335348129272461, + "learning_rate": 3.2538141470180306e-06, + "loss": 0.0034, + "step": 9739 + }, + { + "epoch": 6.747488742639418, + "grad_norm": 0.26008355617523193, + "learning_rate": 3.253120665742025e-06, + "loss": 0.0031, + "step": 9740 + }, + { + "epoch": 6.748181503290613, + "grad_norm": 0.46285054087638855, + "learning_rate": 3.25242718446602e-06, + "loss": 0.0048, + "step": 9741 + }, + { + "epoch": 6.748874263941808, + "grad_norm": 0.16118820011615753, + "learning_rate": 3.251733703190014e-06, + "loss": 0.0029, + "step": 9742 + }, + { + "epoch": 6.749567024593003, + "grad_norm": 0.22637991607189178, + "learning_rate": 3.2510402219140087e-06, + "loss": 0.0035, + "step": 9743 + }, + { + "epoch": 6.750259785244198, + "grad_norm": 0.25915202498435974, + "learning_rate": 3.2503467406380028e-06, + "loss": 0.0038, + "step": 9744 + }, + { + "epoch": 6.750952545895393, + "grad_norm": 0.31419217586517334, + "learning_rate": 3.2496532593619973e-06, + "loss": 0.0046, + "step": 9745 + }, + { + "epoch": 6.751645306546588, + "grad_norm": 0.21347445249557495, + "learning_rate": 3.248959778085992e-06, + "loss": 0.0033, + "step": 9746 + }, + { + "epoch": 6.752338067197783, + "grad_norm": 0.4536197781562805, + "learning_rate": 3.2482662968099867e-06, + "loss": 0.003, + "step": 9747 + }, + { + "epoch": 6.753030827848978, + "grad_norm": 0.3877834379673004, + "learning_rate": 3.247572815533981e-06, + "loss": 0.0053, + "step": 9748 + }, + { + "epoch": 6.753723588500173, + "grad_norm": 0.1879110336303711, + "learning_rate": 3.2468793342579753e-06, + "loss": 0.0024, + "step": 9749 + }, + { + "epoch": 6.754416349151368, + "grad_norm": 0.2452160269021988, + "learning_rate": 3.2461858529819694e-06, + "loss": 0.0039, + "step": 9750 + }, + { + "epoch": 6.7551091098025635, + "grad_norm": 0.48815909028053284, + "learning_rate": 3.2454923717059643e-06, + "loss": 0.0034, + "step": 9751 + }, + { + "epoch": 6.755801870453758, + "grad_norm": 0.2948278784751892, + "learning_rate": 3.244798890429959e-06, + "loss": 0.0044, + "step": 9752 + }, + { + "epoch": 6.756494631104953, + "grad_norm": 0.22150950133800507, + "learning_rate": 3.244105409153953e-06, + "loss": 0.0034, + "step": 9753 + }, + { + "epoch": 6.7571873917561485, + "grad_norm": 0.2298438400030136, + "learning_rate": 3.2434119278779474e-06, + "loss": 0.0043, + "step": 9754 + }, + { + "epoch": 6.757880152407344, + "grad_norm": 0.22544124722480774, + "learning_rate": 3.242718446601942e-06, + "loss": 0.0037, + "step": 9755 + }, + { + "epoch": 6.758572913058538, + "grad_norm": 0.22202441096305847, + "learning_rate": 3.242024965325937e-06, + "loss": 0.0032, + "step": 9756 + }, + { + "epoch": 6.759265673709733, + "grad_norm": 0.20543573796749115, + "learning_rate": 3.241331484049931e-06, + "loss": 0.0036, + "step": 9757 + }, + { + "epoch": 6.759958434360929, + "grad_norm": 0.285784512758255, + "learning_rate": 3.2406380027739255e-06, + "loss": 0.003, + "step": 9758 + }, + { + "epoch": 6.760651195012123, + "grad_norm": 0.25166475772857666, + "learning_rate": 3.2399445214979196e-06, + "loss": 0.0031, + "step": 9759 + }, + { + "epoch": 6.761343955663318, + "grad_norm": 0.13722099363803864, + "learning_rate": 3.239251040221914e-06, + "loss": 0.0022, + "step": 9760 + }, + { + "epoch": 6.762036716314514, + "grad_norm": 0.25914523005485535, + "learning_rate": 3.238557558945909e-06, + "loss": 0.0033, + "step": 9761 + }, + { + "epoch": 6.762729476965708, + "grad_norm": 0.1303299069404602, + "learning_rate": 3.237864077669903e-06, + "loss": 0.0026, + "step": 9762 + }, + { + "epoch": 6.763422237616903, + "grad_norm": 0.38397571444511414, + "learning_rate": 3.2371705963938976e-06, + "loss": 0.0037, + "step": 9763 + }, + { + "epoch": 6.764114998268099, + "grad_norm": 0.2579515278339386, + "learning_rate": 3.236477115117892e-06, + "loss": 0.0039, + "step": 9764 + }, + { + "epoch": 6.764807758919293, + "grad_norm": 0.1735793650150299, + "learning_rate": 3.235783633841886e-06, + "loss": 0.0025, + "step": 9765 + }, + { + "epoch": 6.765500519570488, + "grad_norm": 0.18444368243217468, + "learning_rate": 3.235090152565881e-06, + "loss": 0.0038, + "step": 9766 + }, + { + "epoch": 6.7661932802216835, + "grad_norm": 0.29714348912239075, + "learning_rate": 3.2343966712898756e-06, + "loss": 0.0042, + "step": 9767 + }, + { + "epoch": 6.766886040872878, + "grad_norm": 0.29417502880096436, + "learning_rate": 3.2337031900138697e-06, + "loss": 0.0037, + "step": 9768 + }, + { + "epoch": 6.767578801524073, + "grad_norm": 0.16488131880760193, + "learning_rate": 3.2330097087378642e-06, + "loss": 0.0028, + "step": 9769 + }, + { + "epoch": 6.7682715621752685, + "grad_norm": 0.224967360496521, + "learning_rate": 3.2323162274618587e-06, + "loss": 0.0042, + "step": 9770 + }, + { + "epoch": 6.768964322826464, + "grad_norm": 0.25119632482528687, + "learning_rate": 3.2316227461858537e-06, + "loss": 0.0032, + "step": 9771 + }, + { + "epoch": 6.769657083477658, + "grad_norm": 0.2366628646850586, + "learning_rate": 3.2309292649098478e-06, + "loss": 0.0035, + "step": 9772 + }, + { + "epoch": 6.770349844128853, + "grad_norm": 0.2102373242378235, + "learning_rate": 3.2302357836338423e-06, + "loss": 0.0029, + "step": 9773 + }, + { + "epoch": 6.771042604780049, + "grad_norm": 0.2694167494773865, + "learning_rate": 3.2295423023578364e-06, + "loss": 0.0041, + "step": 9774 + }, + { + "epoch": 6.771735365431244, + "grad_norm": 0.26727476716041565, + "learning_rate": 3.228848821081831e-06, + "loss": 0.0034, + "step": 9775 + }, + { + "epoch": 6.772428126082438, + "grad_norm": 0.23914754390716553, + "learning_rate": 3.228155339805826e-06, + "loss": 0.0054, + "step": 9776 + }, + { + "epoch": 6.773120886733634, + "grad_norm": 0.4744378626346588, + "learning_rate": 3.22746185852982e-06, + "loss": 0.0062, + "step": 9777 + }, + { + "epoch": 6.773813647384829, + "grad_norm": 0.22059325873851776, + "learning_rate": 3.2267683772538144e-06, + "loss": 0.004, + "step": 9778 + }, + { + "epoch": 6.774506408036023, + "grad_norm": 0.2267928123474121, + "learning_rate": 3.226074895977809e-06, + "loss": 0.0044, + "step": 9779 + }, + { + "epoch": 6.775199168687219, + "grad_norm": 0.3121776878833771, + "learning_rate": 3.225381414701803e-06, + "loss": 0.0051, + "step": 9780 + }, + { + "epoch": 6.775891929338414, + "grad_norm": 0.35926979780197144, + "learning_rate": 3.224687933425798e-06, + "loss": 0.0049, + "step": 9781 + }, + { + "epoch": 6.776584689989608, + "grad_norm": 0.7782687544822693, + "learning_rate": 3.2239944521497924e-06, + "loss": 0.0033, + "step": 9782 + }, + { + "epoch": 6.777277450640804, + "grad_norm": 0.37896329164505005, + "learning_rate": 3.2233009708737865e-06, + "loss": 0.0031, + "step": 9783 + }, + { + "epoch": 6.777970211291999, + "grad_norm": 0.3360580503940582, + "learning_rate": 3.222607489597781e-06, + "loss": 0.0037, + "step": 9784 + }, + { + "epoch": 6.778662971943193, + "grad_norm": 0.5456976890563965, + "learning_rate": 3.2219140083217755e-06, + "loss": 0.0077, + "step": 9785 + }, + { + "epoch": 6.7793557325943885, + "grad_norm": 0.16573163866996765, + "learning_rate": 3.22122052704577e-06, + "loss": 0.0035, + "step": 9786 + }, + { + "epoch": 6.780048493245584, + "grad_norm": 0.34573253989219666, + "learning_rate": 3.2205270457697646e-06, + "loss": 0.0062, + "step": 9787 + }, + { + "epoch": 6.780741253896778, + "grad_norm": 0.28036412596702576, + "learning_rate": 3.219833564493759e-06, + "loss": 0.0038, + "step": 9788 + }, + { + "epoch": 6.7814340145479735, + "grad_norm": 0.25258782505989075, + "learning_rate": 3.219140083217753e-06, + "loss": 0.0034, + "step": 9789 + }, + { + "epoch": 6.782126775199169, + "grad_norm": 0.23531854152679443, + "learning_rate": 3.2184466019417477e-06, + "loss": 0.0031, + "step": 9790 + }, + { + "epoch": 6.782819535850364, + "grad_norm": 0.3496544063091278, + "learning_rate": 3.2177531206657426e-06, + "loss": 0.0042, + "step": 9791 + }, + { + "epoch": 6.783512296501558, + "grad_norm": 0.2098337858915329, + "learning_rate": 3.2170596393897367e-06, + "loss": 0.0038, + "step": 9792 + }, + { + "epoch": 6.784205057152754, + "grad_norm": 0.3733336627483368, + "learning_rate": 3.216366158113731e-06, + "loss": 0.0046, + "step": 9793 + }, + { + "epoch": 6.784897817803949, + "grad_norm": 0.5599023103713989, + "learning_rate": 3.2156726768377257e-06, + "loss": 0.0044, + "step": 9794 + }, + { + "epoch": 6.785590578455144, + "grad_norm": 0.1788777858018875, + "learning_rate": 3.21497919556172e-06, + "loss": 0.003, + "step": 9795 + }, + { + "epoch": 6.786283339106339, + "grad_norm": 0.15402033925056458, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.0028, + "step": 9796 + }, + { + "epoch": 6.786976099757534, + "grad_norm": 0.24567104876041412, + "learning_rate": 3.2135922330097092e-06, + "loss": 0.0051, + "step": 9797 + }, + { + "epoch": 6.787668860408729, + "grad_norm": 0.33602938055992126, + "learning_rate": 3.2128987517337033e-06, + "loss": 0.0044, + "step": 9798 + }, + { + "epoch": 6.788361621059924, + "grad_norm": 0.1702132374048233, + "learning_rate": 3.212205270457698e-06, + "loss": 0.0027, + "step": 9799 + }, + { + "epoch": 6.789054381711119, + "grad_norm": 0.26751288771629333, + "learning_rate": 3.2115117891816923e-06, + "loss": 0.0039, + "step": 9800 + }, + { + "epoch": 6.789747142362314, + "grad_norm": 0.312261164188385, + "learning_rate": 3.210818307905687e-06, + "loss": 0.0057, + "step": 9801 + }, + { + "epoch": 6.7904399030135085, + "grad_norm": 0.27268752455711365, + "learning_rate": 3.2101248266296814e-06, + "loss": 0.0041, + "step": 9802 + }, + { + "epoch": 6.791132663664704, + "grad_norm": 0.20988130569458008, + "learning_rate": 3.209431345353676e-06, + "loss": 0.0042, + "step": 9803 + }, + { + "epoch": 6.791825424315899, + "grad_norm": 0.22025372087955475, + "learning_rate": 3.20873786407767e-06, + "loss": 0.0034, + "step": 9804 + }, + { + "epoch": 6.7925181849670935, + "grad_norm": 0.3386022448539734, + "learning_rate": 3.2080443828016645e-06, + "loss": 0.0073, + "step": 9805 + }, + { + "epoch": 6.793210945618289, + "grad_norm": 0.2105453461408615, + "learning_rate": 3.2073509015256594e-06, + "loss": 0.0035, + "step": 9806 + }, + { + "epoch": 6.793903706269484, + "grad_norm": 0.45902219414711, + "learning_rate": 3.2066574202496535e-06, + "loss": 0.0062, + "step": 9807 + }, + { + "epoch": 6.794596466920678, + "grad_norm": 0.13881872594356537, + "learning_rate": 3.205963938973648e-06, + "loss": 0.0031, + "step": 9808 + }, + { + "epoch": 6.795289227571874, + "grad_norm": 0.21973024308681488, + "learning_rate": 3.2052704576976425e-06, + "loss": 0.0035, + "step": 9809 + }, + { + "epoch": 6.795981988223069, + "grad_norm": 0.2734771966934204, + "learning_rate": 3.2045769764216366e-06, + "loss": 0.004, + "step": 9810 + }, + { + "epoch": 6.796674748874264, + "grad_norm": 0.2917121648788452, + "learning_rate": 3.2038834951456315e-06, + "loss": 0.0046, + "step": 9811 + }, + { + "epoch": 6.797367509525459, + "grad_norm": 0.32434505224227905, + "learning_rate": 3.203190013869626e-06, + "loss": 0.0036, + "step": 9812 + }, + { + "epoch": 6.798060270176654, + "grad_norm": 0.2449207454919815, + "learning_rate": 3.20249653259362e-06, + "loss": 0.0044, + "step": 9813 + }, + { + "epoch": 6.798753030827849, + "grad_norm": 0.6148754954338074, + "learning_rate": 3.2018030513176146e-06, + "loss": 0.004, + "step": 9814 + }, + { + "epoch": 6.7994457914790445, + "grad_norm": 0.2852209508419037, + "learning_rate": 3.201109570041609e-06, + "loss": 0.0059, + "step": 9815 + }, + { + "epoch": 6.800138552130239, + "grad_norm": 0.3754134178161621, + "learning_rate": 3.2004160887656037e-06, + "loss": 0.0042, + "step": 9816 + }, + { + "epoch": 6.800831312781434, + "grad_norm": 0.30358991026878357, + "learning_rate": 3.199722607489598e-06, + "loss": 0.0037, + "step": 9817 + }, + { + "epoch": 6.801524073432629, + "grad_norm": 0.25480136275291443, + "learning_rate": 3.1990291262135927e-06, + "loss": 0.0042, + "step": 9818 + }, + { + "epoch": 6.802216834083824, + "grad_norm": 0.2729668617248535, + "learning_rate": 3.1983356449375868e-06, + "loss": 0.0044, + "step": 9819 + }, + { + "epoch": 6.802909594735019, + "grad_norm": 0.4184122383594513, + "learning_rate": 3.1976421636615813e-06, + "loss": 0.006, + "step": 9820 + }, + { + "epoch": 6.803602355386214, + "grad_norm": 0.3179779648780823, + "learning_rate": 3.196948682385576e-06, + "loss": 0.0033, + "step": 9821 + }, + { + "epoch": 6.804295116037409, + "grad_norm": 0.4185744524002075, + "learning_rate": 3.1962552011095703e-06, + "loss": 0.0034, + "step": 9822 + }, + { + "epoch": 6.804987876688604, + "grad_norm": 0.16288727521896362, + "learning_rate": 3.195561719833565e-06, + "loss": 0.0032, + "step": 9823 + }, + { + "epoch": 6.805680637339799, + "grad_norm": 0.18902811408042908, + "learning_rate": 3.1948682385575593e-06, + "loss": 0.0032, + "step": 9824 + }, + { + "epoch": 6.806373397990994, + "grad_norm": 0.16687580943107605, + "learning_rate": 3.1941747572815534e-06, + "loss": 0.0041, + "step": 9825 + }, + { + "epoch": 6.807066158642189, + "grad_norm": 0.23390142619609833, + "learning_rate": 3.1934812760055483e-06, + "loss": 0.0033, + "step": 9826 + }, + { + "epoch": 6.807758919293384, + "grad_norm": 0.26326286792755127, + "learning_rate": 3.192787794729543e-06, + "loss": 0.004, + "step": 9827 + }, + { + "epoch": 6.808451679944579, + "grad_norm": 0.4257308840751648, + "learning_rate": 3.192094313453537e-06, + "loss": 0.0043, + "step": 9828 + }, + { + "epoch": 6.809144440595774, + "grad_norm": 0.1480860710144043, + "learning_rate": 3.1914008321775314e-06, + "loss": 0.0026, + "step": 9829 + }, + { + "epoch": 6.809837201246969, + "grad_norm": 0.23659858107566833, + "learning_rate": 3.1907073509015255e-06, + "loss": 0.0029, + "step": 9830 + }, + { + "epoch": 6.8105299618981645, + "grad_norm": 0.4106503129005432, + "learning_rate": 3.1900138696255205e-06, + "loss": 0.0037, + "step": 9831 + }, + { + "epoch": 6.811222722549359, + "grad_norm": 0.3442458212375641, + "learning_rate": 3.189320388349515e-06, + "loss": 0.0051, + "step": 9832 + }, + { + "epoch": 6.811915483200554, + "grad_norm": 0.21609291434288025, + "learning_rate": 3.1886269070735095e-06, + "loss": 0.0032, + "step": 9833 + }, + { + "epoch": 6.8126082438517495, + "grad_norm": 0.17556637525558472, + "learning_rate": 3.1879334257975036e-06, + "loss": 0.003, + "step": 9834 + }, + { + "epoch": 6.813301004502944, + "grad_norm": 0.3292466402053833, + "learning_rate": 3.187239944521498e-06, + "loss": 0.0039, + "step": 9835 + }, + { + "epoch": 6.813993765154139, + "grad_norm": 0.25917738676071167, + "learning_rate": 3.186546463245493e-06, + "loss": 0.0048, + "step": 9836 + }, + { + "epoch": 6.814686525805334, + "grad_norm": 0.17207622528076172, + "learning_rate": 3.185852981969487e-06, + "loss": 0.0042, + "step": 9837 + }, + { + "epoch": 6.81537928645653, + "grad_norm": 0.24776802957057953, + "learning_rate": 3.1851595006934816e-06, + "loss": 0.0043, + "step": 9838 + }, + { + "epoch": 6.816072047107724, + "grad_norm": 0.32751888036727905, + "learning_rate": 3.184466019417476e-06, + "loss": 0.0045, + "step": 9839 + }, + { + "epoch": 6.816764807758919, + "grad_norm": 0.11641302704811096, + "learning_rate": 3.18377253814147e-06, + "loss": 0.0023, + "step": 9840 + }, + { + "epoch": 6.817457568410115, + "grad_norm": 0.450704425573349, + "learning_rate": 3.183079056865465e-06, + "loss": 0.0057, + "step": 9841 + }, + { + "epoch": 6.818150329061309, + "grad_norm": 0.2343761920928955, + "learning_rate": 3.1823855755894596e-06, + "loss": 0.0033, + "step": 9842 + }, + { + "epoch": 6.818843089712504, + "grad_norm": 0.29512107372283936, + "learning_rate": 3.1816920943134537e-06, + "loss": 0.0032, + "step": 9843 + }, + { + "epoch": 6.8195358503637, + "grad_norm": 0.16802075505256653, + "learning_rate": 3.1809986130374482e-06, + "loss": 0.0029, + "step": 9844 + }, + { + "epoch": 6.820228611014894, + "grad_norm": 0.16243338584899902, + "learning_rate": 3.1803051317614423e-06, + "loss": 0.0036, + "step": 9845 + }, + { + "epoch": 6.820921371666089, + "grad_norm": 0.398639053106308, + "learning_rate": 3.1796116504854373e-06, + "loss": 0.0051, + "step": 9846 + }, + { + "epoch": 6.8216141323172845, + "grad_norm": 0.317801833152771, + "learning_rate": 3.1789181692094318e-06, + "loss": 0.0059, + "step": 9847 + }, + { + "epoch": 6.822306892968479, + "grad_norm": 0.6256863474845886, + "learning_rate": 3.1782246879334263e-06, + "loss": 0.0056, + "step": 9848 + }, + { + "epoch": 6.822999653619674, + "grad_norm": 0.3846816420555115, + "learning_rate": 3.1775312066574204e-06, + "loss": 0.0054, + "step": 9849 + }, + { + "epoch": 6.8236924142708695, + "grad_norm": 0.1945173144340515, + "learning_rate": 3.176837725381415e-06, + "loss": 0.004, + "step": 9850 + }, + { + "epoch": 6.824385174922065, + "grad_norm": 0.21418678760528564, + "learning_rate": 3.17614424410541e-06, + "loss": 0.003, + "step": 9851 + }, + { + "epoch": 6.825077935573259, + "grad_norm": 0.29843252897262573, + "learning_rate": 3.175450762829404e-06, + "loss": 0.0059, + "step": 9852 + }, + { + "epoch": 6.825770696224454, + "grad_norm": 0.19400453567504883, + "learning_rate": 3.1747572815533984e-06, + "loss": 0.0034, + "step": 9853 + }, + { + "epoch": 6.82646345687565, + "grad_norm": 0.2794119715690613, + "learning_rate": 3.1740638002773925e-06, + "loss": 0.0045, + "step": 9854 + }, + { + "epoch": 6.827156217526844, + "grad_norm": 0.23559890687465668, + "learning_rate": 3.173370319001387e-06, + "loss": 0.0028, + "step": 9855 + }, + { + "epoch": 6.827848978178039, + "grad_norm": 0.1915004700422287, + "learning_rate": 3.172676837725382e-06, + "loss": 0.0041, + "step": 9856 + }, + { + "epoch": 6.828541738829235, + "grad_norm": 0.212208554148674, + "learning_rate": 3.1719833564493764e-06, + "loss": 0.0033, + "step": 9857 + }, + { + "epoch": 6.82923449948043, + "grad_norm": 0.23424312472343445, + "learning_rate": 3.1712898751733705e-06, + "loss": 0.003, + "step": 9858 + }, + { + "epoch": 6.829927260131624, + "grad_norm": 0.34941333532333374, + "learning_rate": 3.170596393897365e-06, + "loss": 0.0043, + "step": 9859 + }, + { + "epoch": 6.83062002078282, + "grad_norm": 0.2985086143016815, + "learning_rate": 3.169902912621359e-06, + "loss": 0.0053, + "step": 9860 + }, + { + "epoch": 6.831312781434015, + "grad_norm": 0.4434652328491211, + "learning_rate": 3.169209431345354e-06, + "loss": 0.0064, + "step": 9861 + }, + { + "epoch": 6.832005542085209, + "grad_norm": 0.20575271546840668, + "learning_rate": 3.1685159500693486e-06, + "loss": 0.0034, + "step": 9862 + }, + { + "epoch": 6.8326983027364045, + "grad_norm": 0.1959100216627121, + "learning_rate": 3.167822468793343e-06, + "loss": 0.0035, + "step": 9863 + }, + { + "epoch": 6.8333910633876, + "grad_norm": 0.38107800483703613, + "learning_rate": 3.167128987517337e-06, + "loss": 0.0044, + "step": 9864 + }, + { + "epoch": 6.834083824038794, + "grad_norm": 0.15644213557243347, + "learning_rate": 3.1664355062413317e-06, + "loss": 0.0025, + "step": 9865 + }, + { + "epoch": 6.8347765846899895, + "grad_norm": 0.19525551795959473, + "learning_rate": 3.1657420249653266e-06, + "loss": 0.0032, + "step": 9866 + }, + { + "epoch": 6.835469345341185, + "grad_norm": 0.29288479685783386, + "learning_rate": 3.1650485436893207e-06, + "loss": 0.0038, + "step": 9867 + }, + { + "epoch": 6.836162105992379, + "grad_norm": 0.5829963684082031, + "learning_rate": 3.164355062413315e-06, + "loss": 0.0058, + "step": 9868 + }, + { + "epoch": 6.8368548666435744, + "grad_norm": 0.21414701640605927, + "learning_rate": 3.1636615811373093e-06, + "loss": 0.0045, + "step": 9869 + }, + { + "epoch": 6.83754762729477, + "grad_norm": 0.21531203389167786, + "learning_rate": 3.162968099861304e-06, + "loss": 0.0032, + "step": 9870 + }, + { + "epoch": 6.838240387945965, + "grad_norm": 0.1464047133922577, + "learning_rate": 3.1622746185852987e-06, + "loss": 0.0026, + "step": 9871 + }, + { + "epoch": 6.838933148597159, + "grad_norm": 0.20787616074085236, + "learning_rate": 3.1615811373092932e-06, + "loss": 0.003, + "step": 9872 + }, + { + "epoch": 6.839625909248355, + "grad_norm": 0.7196558117866516, + "learning_rate": 3.1608876560332873e-06, + "loss": 0.0073, + "step": 9873 + }, + { + "epoch": 6.84031866989955, + "grad_norm": 0.2806764841079712, + "learning_rate": 3.160194174757282e-06, + "loss": 0.0055, + "step": 9874 + }, + { + "epoch": 6.841011430550744, + "grad_norm": 0.23197102546691895, + "learning_rate": 3.159500693481276e-06, + "loss": 0.0034, + "step": 9875 + }, + { + "epoch": 6.84170419120194, + "grad_norm": 0.208356112241745, + "learning_rate": 3.158807212205271e-06, + "loss": 0.0031, + "step": 9876 + }, + { + "epoch": 6.842396951853135, + "grad_norm": 0.35312914848327637, + "learning_rate": 3.1581137309292654e-06, + "loss": 0.0044, + "step": 9877 + }, + { + "epoch": 6.84308971250433, + "grad_norm": 0.20556029677391052, + "learning_rate": 3.1574202496532595e-06, + "loss": 0.0036, + "step": 9878 + }, + { + "epoch": 6.843782473155525, + "grad_norm": 0.1898297816514969, + "learning_rate": 3.156726768377254e-06, + "loss": 0.0036, + "step": 9879 + }, + { + "epoch": 6.84447523380672, + "grad_norm": 0.1932491511106491, + "learning_rate": 3.1560332871012485e-06, + "loss": 0.0029, + "step": 9880 + }, + { + "epoch": 6.845167994457915, + "grad_norm": 0.2936733365058899, + "learning_rate": 3.1553398058252434e-06, + "loss": 0.0039, + "step": 9881 + }, + { + "epoch": 6.8458607551091095, + "grad_norm": 0.274087131023407, + "learning_rate": 3.1546463245492375e-06, + "loss": 0.0039, + "step": 9882 + }, + { + "epoch": 6.846553515760305, + "grad_norm": 0.3380624055862427, + "learning_rate": 3.153952843273232e-06, + "loss": 0.007, + "step": 9883 + }, + { + "epoch": 6.8472462764115, + "grad_norm": 0.14727269113063812, + "learning_rate": 3.153259361997226e-06, + "loss": 0.0026, + "step": 9884 + }, + { + "epoch": 6.8479390370626945, + "grad_norm": 0.17885534465312958, + "learning_rate": 3.1525658807212206e-06, + "loss": 0.0035, + "step": 9885 + }, + { + "epoch": 6.84863179771389, + "grad_norm": 0.32952526211738586, + "learning_rate": 3.1518723994452155e-06, + "loss": 0.0036, + "step": 9886 + }, + { + "epoch": 6.849324558365085, + "grad_norm": 0.2858526110649109, + "learning_rate": 3.15117891816921e-06, + "loss": 0.004, + "step": 9887 + }, + { + "epoch": 6.850017319016279, + "grad_norm": 0.21263548731803894, + "learning_rate": 3.150485436893204e-06, + "loss": 0.0031, + "step": 9888 + }, + { + "epoch": 6.850710079667475, + "grad_norm": 0.17841656506061554, + "learning_rate": 3.1497919556171986e-06, + "loss": 0.0029, + "step": 9889 + }, + { + "epoch": 6.85140284031867, + "grad_norm": 0.1647372841835022, + "learning_rate": 3.1490984743411927e-06, + "loss": 0.0025, + "step": 9890 + }, + { + "epoch": 6.852095600969865, + "grad_norm": 0.1923486292362213, + "learning_rate": 3.1484049930651877e-06, + "loss": 0.0032, + "step": 9891 + }, + { + "epoch": 6.85278836162106, + "grad_norm": 0.19028408825397491, + "learning_rate": 3.147711511789182e-06, + "loss": 0.0032, + "step": 9892 + }, + { + "epoch": 6.853481122272255, + "grad_norm": 0.11258408427238464, + "learning_rate": 3.1470180305131763e-06, + "loss": 0.0022, + "step": 9893 + }, + { + "epoch": 6.85417388292345, + "grad_norm": 0.23631267249584198, + "learning_rate": 3.1463245492371708e-06, + "loss": 0.003, + "step": 9894 + }, + { + "epoch": 6.854866643574645, + "grad_norm": 0.14479823410511017, + "learning_rate": 3.1456310679611653e-06, + "loss": 0.0024, + "step": 9895 + }, + { + "epoch": 6.85555940422584, + "grad_norm": 0.16995778679847717, + "learning_rate": 3.1449375866851602e-06, + "loss": 0.0027, + "step": 9896 + }, + { + "epoch": 6.856252164877035, + "grad_norm": 0.27697867155075073, + "learning_rate": 3.1442441054091543e-06, + "loss": 0.0032, + "step": 9897 + }, + { + "epoch": 6.85694492552823, + "grad_norm": 0.19249166548252106, + "learning_rate": 3.143550624133149e-06, + "loss": 0.0028, + "step": 9898 + }, + { + "epoch": 6.857637686179425, + "grad_norm": 0.22699804604053497, + "learning_rate": 3.142857142857143e-06, + "loss": 0.0044, + "step": 9899 + }, + { + "epoch": 6.85833044683062, + "grad_norm": 0.30924737453460693, + "learning_rate": 3.1421636615811374e-06, + "loss": 0.0048, + "step": 9900 + }, + { + "epoch": 6.859023207481815, + "grad_norm": 0.25368955731391907, + "learning_rate": 3.1414701803051323e-06, + "loss": 0.0046, + "step": 9901 + }, + { + "epoch": 6.85971596813301, + "grad_norm": 0.2596815526485443, + "learning_rate": 3.1407766990291264e-06, + "loss": 0.0039, + "step": 9902 + }, + { + "epoch": 6.860408728784205, + "grad_norm": 0.28993862867355347, + "learning_rate": 3.140083217753121e-06, + "loss": 0.0047, + "step": 9903 + }, + { + "epoch": 6.8611014894354, + "grad_norm": 0.23070913553237915, + "learning_rate": 3.1393897364771154e-06, + "loss": 0.0042, + "step": 9904 + }, + { + "epoch": 6.861794250086595, + "grad_norm": 0.27373337745666504, + "learning_rate": 3.1386962552011095e-06, + "loss": 0.0038, + "step": 9905 + }, + { + "epoch": 6.86248701073779, + "grad_norm": 0.17786487936973572, + "learning_rate": 3.1380027739251045e-06, + "loss": 0.0026, + "step": 9906 + }, + { + "epoch": 6.863179771388985, + "grad_norm": 0.1329381763935089, + "learning_rate": 3.137309292649099e-06, + "loss": 0.0023, + "step": 9907 + }, + { + "epoch": 6.86387253204018, + "grad_norm": 0.25502750277519226, + "learning_rate": 3.136615811373093e-06, + "loss": 0.0041, + "step": 9908 + }, + { + "epoch": 6.864565292691375, + "grad_norm": 0.2774457037448883, + "learning_rate": 3.1359223300970876e-06, + "loss": 0.0042, + "step": 9909 + }, + { + "epoch": 6.86525805334257, + "grad_norm": 0.20210881531238556, + "learning_rate": 3.135228848821082e-06, + "loss": 0.0032, + "step": 9910 + }, + { + "epoch": 6.8659508139937655, + "grad_norm": 0.2541934549808502, + "learning_rate": 3.134535367545077e-06, + "loss": 0.0044, + "step": 9911 + }, + { + "epoch": 6.86664357464496, + "grad_norm": 0.25267860293388367, + "learning_rate": 3.133841886269071e-06, + "loss": 0.0046, + "step": 9912 + }, + { + "epoch": 6.867336335296155, + "grad_norm": 0.1430072784423828, + "learning_rate": 3.1331484049930656e-06, + "loss": 0.0023, + "step": 9913 + }, + { + "epoch": 6.8680290959473504, + "grad_norm": 0.3600761592388153, + "learning_rate": 3.1324549237170597e-06, + "loss": 0.0043, + "step": 9914 + }, + { + "epoch": 6.868721856598545, + "grad_norm": 0.2174580842256546, + "learning_rate": 3.131761442441054e-06, + "loss": 0.0033, + "step": 9915 + }, + { + "epoch": 6.86941461724974, + "grad_norm": 0.49101021885871887, + "learning_rate": 3.131067961165049e-06, + "loss": 0.0045, + "step": 9916 + }, + { + "epoch": 6.870107377900935, + "grad_norm": 0.10587871074676514, + "learning_rate": 3.1303744798890432e-06, + "loss": 0.002, + "step": 9917 + }, + { + "epoch": 6.870800138552131, + "grad_norm": 0.32948845624923706, + "learning_rate": 3.1296809986130377e-06, + "loss": 0.0036, + "step": 9918 + }, + { + "epoch": 6.871492899203325, + "grad_norm": 0.30850082635879517, + "learning_rate": 3.1289875173370322e-06, + "loss": 0.0039, + "step": 9919 + }, + { + "epoch": 6.87218565985452, + "grad_norm": 0.3731090724468231, + "learning_rate": 3.1282940360610263e-06, + "loss": 0.0052, + "step": 9920 + }, + { + "epoch": 6.872878420505716, + "grad_norm": 0.23356278240680695, + "learning_rate": 3.1276005547850213e-06, + "loss": 0.0022, + "step": 9921 + }, + { + "epoch": 6.87357118115691, + "grad_norm": 0.14465276896953583, + "learning_rate": 3.1269070735090158e-06, + "loss": 0.0027, + "step": 9922 + }, + { + "epoch": 6.874263941808105, + "grad_norm": 0.35693371295928955, + "learning_rate": 3.12621359223301e-06, + "loss": 0.0037, + "step": 9923 + }, + { + "epoch": 6.874956702459301, + "grad_norm": 0.505262017250061, + "learning_rate": 3.1255201109570044e-06, + "loss": 0.0059, + "step": 9924 + }, + { + "epoch": 6.875649463110495, + "grad_norm": 0.3666226267814636, + "learning_rate": 3.124826629680999e-06, + "loss": 0.0059, + "step": 9925 + }, + { + "epoch": 6.87634222376169, + "grad_norm": 0.35455337166786194, + "learning_rate": 3.124133148404993e-06, + "loss": 0.006, + "step": 9926 + }, + { + "epoch": 6.8770349844128855, + "grad_norm": 0.1944286972284317, + "learning_rate": 3.123439667128988e-06, + "loss": 0.0036, + "step": 9927 + }, + { + "epoch": 6.87772774506408, + "grad_norm": 0.2120799571275711, + "learning_rate": 3.1227461858529824e-06, + "loss": 0.004, + "step": 9928 + }, + { + "epoch": 6.878420505715275, + "grad_norm": 0.27662771940231323, + "learning_rate": 3.1220527045769765e-06, + "loss": 0.0029, + "step": 9929 + }, + { + "epoch": 6.8791132663664705, + "grad_norm": 0.20927362143993378, + "learning_rate": 3.121359223300971e-06, + "loss": 0.0036, + "step": 9930 + }, + { + "epoch": 6.879806027017666, + "grad_norm": 0.28317469358444214, + "learning_rate": 3.120665742024965e-06, + "loss": 0.0032, + "step": 9931 + }, + { + "epoch": 6.88049878766886, + "grad_norm": 0.2607945501804352, + "learning_rate": 3.11997226074896e-06, + "loss": 0.0037, + "step": 9932 + }, + { + "epoch": 6.881191548320055, + "grad_norm": 0.3832789659500122, + "learning_rate": 3.1192787794729545e-06, + "loss": 0.0053, + "step": 9933 + }, + { + "epoch": 6.881884308971251, + "grad_norm": 0.2038079798221588, + "learning_rate": 3.118585298196949e-06, + "loss": 0.0027, + "step": 9934 + }, + { + "epoch": 6.882577069622445, + "grad_norm": 0.3464046120643616, + "learning_rate": 3.117891816920943e-06, + "loss": 0.0045, + "step": 9935 + }, + { + "epoch": 6.88326983027364, + "grad_norm": 0.3392944037914276, + "learning_rate": 3.1171983356449376e-06, + "loss": 0.0049, + "step": 9936 + }, + { + "epoch": 6.883962590924836, + "grad_norm": 0.3029744327068329, + "learning_rate": 3.1165048543689326e-06, + "loss": 0.0035, + "step": 9937 + }, + { + "epoch": 6.884655351576031, + "grad_norm": 0.15682591497898102, + "learning_rate": 3.1158113730929267e-06, + "loss": 0.0032, + "step": 9938 + }, + { + "epoch": 6.885348112227225, + "grad_norm": 0.3812074661254883, + "learning_rate": 3.115117891816921e-06, + "loss": 0.0033, + "step": 9939 + }, + { + "epoch": 6.886040872878421, + "grad_norm": 0.3690953850746155, + "learning_rate": 3.1144244105409157e-06, + "loss": 0.0058, + "step": 9940 + }, + { + "epoch": 6.886733633529616, + "grad_norm": 0.298159658908844, + "learning_rate": 3.1137309292649098e-06, + "loss": 0.0047, + "step": 9941 + }, + { + "epoch": 6.88742639418081, + "grad_norm": 0.40038490295410156, + "learning_rate": 3.1130374479889047e-06, + "loss": 0.0056, + "step": 9942 + }, + { + "epoch": 6.8881191548320055, + "grad_norm": 0.1882338970899582, + "learning_rate": 3.1123439667128992e-06, + "loss": 0.004, + "step": 9943 + }, + { + "epoch": 6.888811915483201, + "grad_norm": 0.20926852524280548, + "learning_rate": 3.1116504854368933e-06, + "loss": 0.0036, + "step": 9944 + }, + { + "epoch": 6.889504676134395, + "grad_norm": 0.1722264587879181, + "learning_rate": 3.110957004160888e-06, + "loss": 0.0027, + "step": 9945 + }, + { + "epoch": 6.8901974367855905, + "grad_norm": 0.23840396106243134, + "learning_rate": 3.110263522884882e-06, + "loss": 0.0029, + "step": 9946 + }, + { + "epoch": 6.890890197436786, + "grad_norm": 0.2524208426475525, + "learning_rate": 3.109570041608877e-06, + "loss": 0.0044, + "step": 9947 + }, + { + "epoch": 6.89158295808798, + "grad_norm": 0.40240007638931274, + "learning_rate": 3.1088765603328713e-06, + "loss": 0.0047, + "step": 9948 + }, + { + "epoch": 6.892275718739175, + "grad_norm": 0.17475059628486633, + "learning_rate": 3.108183079056866e-06, + "loss": 0.0024, + "step": 9949 + }, + { + "epoch": 6.892968479390371, + "grad_norm": 0.28309381008148193, + "learning_rate": 3.10748959778086e-06, + "loss": 0.003, + "step": 9950 + }, + { + "epoch": 6.893661240041566, + "grad_norm": 0.1873553842306137, + "learning_rate": 3.1067961165048544e-06, + "loss": 0.0032, + "step": 9951 + }, + { + "epoch": 6.89435400069276, + "grad_norm": 0.40987929701805115, + "learning_rate": 3.1061026352288494e-06, + "loss": 0.0039, + "step": 9952 + }, + { + "epoch": 6.895046761343956, + "grad_norm": 0.28078773617744446, + "learning_rate": 3.1054091539528435e-06, + "loss": 0.0034, + "step": 9953 + }, + { + "epoch": 6.895739521995151, + "grad_norm": 0.34553733468055725, + "learning_rate": 3.104715672676838e-06, + "loss": 0.0041, + "step": 9954 + }, + { + "epoch": 6.896432282646345, + "grad_norm": 0.19245411455631256, + "learning_rate": 3.1040221914008325e-06, + "loss": 0.0028, + "step": 9955 + }, + { + "epoch": 6.897125043297541, + "grad_norm": 0.8639165759086609, + "learning_rate": 3.1033287101248266e-06, + "loss": 0.0045, + "step": 9956 + }, + { + "epoch": 6.897817803948736, + "grad_norm": 0.22493526339530945, + "learning_rate": 3.1026352288488215e-06, + "loss": 0.0041, + "step": 9957 + }, + { + "epoch": 6.898510564599931, + "grad_norm": 0.22028587758541107, + "learning_rate": 3.101941747572816e-06, + "loss": 0.0036, + "step": 9958 + }, + { + "epoch": 6.899203325251126, + "grad_norm": 0.21007047593593597, + "learning_rate": 3.10124826629681e-06, + "loss": 0.0037, + "step": 9959 + }, + { + "epoch": 6.899896085902321, + "grad_norm": 0.3950655162334442, + "learning_rate": 3.1005547850208046e-06, + "loss": 0.0048, + "step": 9960 + }, + { + "epoch": 6.900588846553516, + "grad_norm": 0.13465934991836548, + "learning_rate": 3.0998613037447987e-06, + "loss": 0.0024, + "step": 9961 + }, + { + "epoch": 6.9012816072047105, + "grad_norm": 0.3211439251899719, + "learning_rate": 3.0991678224687936e-06, + "loss": 0.0042, + "step": 9962 + }, + { + "epoch": 6.901974367855906, + "grad_norm": 0.25775355100631714, + "learning_rate": 3.098474341192788e-06, + "loss": 0.0038, + "step": 9963 + }, + { + "epoch": 6.902667128507101, + "grad_norm": 0.30540579557418823, + "learning_rate": 3.0977808599167827e-06, + "loss": 0.0034, + "step": 9964 + }, + { + "epoch": 6.9033598891582955, + "grad_norm": 0.39724627137184143, + "learning_rate": 3.0970873786407767e-06, + "loss": 0.0033, + "step": 9965 + }, + { + "epoch": 6.904052649809491, + "grad_norm": 0.20717176795005798, + "learning_rate": 3.0963938973647712e-06, + "loss": 0.0047, + "step": 9966 + }, + { + "epoch": 6.904745410460686, + "grad_norm": 0.25329551100730896, + "learning_rate": 3.095700416088766e-06, + "loss": 0.0053, + "step": 9967 + }, + { + "epoch": 6.90543817111188, + "grad_norm": 0.5993894338607788, + "learning_rate": 3.0950069348127603e-06, + "loss": 0.0048, + "step": 9968 + }, + { + "epoch": 6.906130931763076, + "grad_norm": 0.3660557270050049, + "learning_rate": 3.0943134535367548e-06, + "loss": 0.0051, + "step": 9969 + }, + { + "epoch": 6.906823692414271, + "grad_norm": 0.1983492374420166, + "learning_rate": 3.093619972260749e-06, + "loss": 0.0033, + "step": 9970 + }, + { + "epoch": 6.907516453065466, + "grad_norm": 0.1262977570295334, + "learning_rate": 3.0929264909847434e-06, + "loss": 0.0026, + "step": 9971 + }, + { + "epoch": 6.908209213716661, + "grad_norm": 0.4371584951877594, + "learning_rate": 3.0922330097087383e-06, + "loss": 0.0037, + "step": 9972 + }, + { + "epoch": 6.908901974367856, + "grad_norm": 0.19370682537555695, + "learning_rate": 3.091539528432733e-06, + "loss": 0.0032, + "step": 9973 + }, + { + "epoch": 6.909594735019051, + "grad_norm": 0.3977658748626709, + "learning_rate": 3.090846047156727e-06, + "loss": 0.0043, + "step": 9974 + }, + { + "epoch": 6.910287495670246, + "grad_norm": 0.2266015261411667, + "learning_rate": 3.0901525658807214e-06, + "loss": 0.0039, + "step": 9975 + }, + { + "epoch": 6.910980256321441, + "grad_norm": 0.18956920504570007, + "learning_rate": 3.0894590846047155e-06, + "loss": 0.0031, + "step": 9976 + }, + { + "epoch": 6.911673016972636, + "grad_norm": 0.2873472571372986, + "learning_rate": 3.0887656033287104e-06, + "loss": 0.0051, + "step": 9977 + }, + { + "epoch": 6.912365777623831, + "grad_norm": 0.25905776023864746, + "learning_rate": 3.088072122052705e-06, + "loss": 0.0059, + "step": 9978 + }, + { + "epoch": 6.913058538275026, + "grad_norm": 0.5050122141838074, + "learning_rate": 3.0873786407766995e-06, + "loss": 0.006, + "step": 9979 + }, + { + "epoch": 6.913751298926221, + "grad_norm": 0.28577563166618347, + "learning_rate": 3.0866851595006935e-06, + "loss": 0.0055, + "step": 9980 + }, + { + "epoch": 6.914444059577416, + "grad_norm": 0.20023463666439056, + "learning_rate": 3.085991678224688e-06, + "loss": 0.0044, + "step": 9981 + }, + { + "epoch": 6.915136820228611, + "grad_norm": 0.30305594205856323, + "learning_rate": 3.085298196948683e-06, + "loss": 0.0031, + "step": 9982 + }, + { + "epoch": 6.915829580879806, + "grad_norm": 0.19310308992862701, + "learning_rate": 3.084604715672677e-06, + "loss": 0.0028, + "step": 9983 + }, + { + "epoch": 6.916522341531001, + "grad_norm": 0.2212955355644226, + "learning_rate": 3.0839112343966716e-06, + "loss": 0.0033, + "step": 9984 + }, + { + "epoch": 6.917215102182196, + "grad_norm": 0.27291813492774963, + "learning_rate": 3.0832177531206657e-06, + "loss": 0.0038, + "step": 9985 + }, + { + "epoch": 6.917907862833391, + "grad_norm": 0.2718471586704254, + "learning_rate": 3.08252427184466e-06, + "loss": 0.0034, + "step": 9986 + }, + { + "epoch": 6.918600623484586, + "grad_norm": 0.2039763629436493, + "learning_rate": 3.081830790568655e-06, + "loss": 0.0031, + "step": 9987 + }, + { + "epoch": 6.919293384135781, + "grad_norm": 0.12809832394123077, + "learning_rate": 3.0811373092926496e-06, + "loss": 0.0023, + "step": 9988 + }, + { + "epoch": 6.919986144786976, + "grad_norm": 0.16423216462135315, + "learning_rate": 3.0804438280166437e-06, + "loss": 0.0029, + "step": 9989 + }, + { + "epoch": 6.920678905438171, + "grad_norm": 0.31327763199806213, + "learning_rate": 3.0797503467406382e-06, + "loss": 0.0037, + "step": 9990 + }, + { + "epoch": 6.9213716660893665, + "grad_norm": 0.23169377446174622, + "learning_rate": 3.0790568654646323e-06, + "loss": 0.0033, + "step": 9991 + }, + { + "epoch": 6.922064426740561, + "grad_norm": 0.37328752875328064, + "learning_rate": 3.0783633841886272e-06, + "loss": 0.0065, + "step": 9992 + }, + { + "epoch": 6.922757187391756, + "grad_norm": 0.2823079228401184, + "learning_rate": 3.0776699029126217e-06, + "loss": 0.0043, + "step": 9993 + }, + { + "epoch": 6.923449948042951, + "grad_norm": 0.4732924997806549, + "learning_rate": 3.076976421636616e-06, + "loss": 0.0045, + "step": 9994 + }, + { + "epoch": 6.924142708694146, + "grad_norm": 0.3883107602596283, + "learning_rate": 3.0762829403606103e-06, + "loss": 0.0053, + "step": 9995 + }, + { + "epoch": 6.924835469345341, + "grad_norm": 0.241115540266037, + "learning_rate": 3.075589459084605e-06, + "loss": 0.004, + "step": 9996 + }, + { + "epoch": 6.925528229996536, + "grad_norm": 0.31863003969192505, + "learning_rate": 3.0748959778085998e-06, + "loss": 0.0039, + "step": 9997 + }, + { + "epoch": 6.926220990647732, + "grad_norm": 0.16336983442306519, + "learning_rate": 3.074202496532594e-06, + "loss": 0.0026, + "step": 9998 + }, + { + "epoch": 6.926913751298926, + "grad_norm": 0.2944413423538208, + "learning_rate": 3.0735090152565884e-06, + "loss": 0.0033, + "step": 9999 + }, + { + "epoch": 6.927606511950121, + "grad_norm": 0.17245644330978394, + "learning_rate": 3.0728155339805825e-06, + "loss": 0.0027, + "step": 10000 + }, + { + "epoch": 6.928299272601317, + "grad_norm": 0.14909231662750244, + "learning_rate": 3.072122052704577e-06, + "loss": 0.0038, + "step": 10001 + }, + { + "epoch": 6.928992033252511, + "grad_norm": 0.19677869975566864, + "learning_rate": 3.071428571428572e-06, + "loss": 0.0034, + "step": 10002 + }, + { + "epoch": 6.929684793903706, + "grad_norm": 0.15342335402965546, + "learning_rate": 3.0707350901525664e-06, + "loss": 0.0027, + "step": 10003 + }, + { + "epoch": 6.930377554554902, + "grad_norm": 0.5093405842781067, + "learning_rate": 3.0700416088765605e-06, + "loss": 0.0037, + "step": 10004 + }, + { + "epoch": 6.931070315206096, + "grad_norm": 0.2522270679473877, + "learning_rate": 3.069348127600555e-06, + "loss": 0.0048, + "step": 10005 + }, + { + "epoch": 6.931763075857291, + "grad_norm": 0.33458763360977173, + "learning_rate": 3.068654646324549e-06, + "loss": 0.004, + "step": 10006 + }, + { + "epoch": 6.9324558365084865, + "grad_norm": 0.25092023611068726, + "learning_rate": 3.067961165048544e-06, + "loss": 0.0032, + "step": 10007 + }, + { + "epoch": 6.933148597159681, + "grad_norm": 0.26223668456077576, + "learning_rate": 3.0672676837725385e-06, + "loss": 0.0048, + "step": 10008 + }, + { + "epoch": 6.933841357810876, + "grad_norm": 0.23392276465892792, + "learning_rate": 3.0665742024965326e-06, + "loss": 0.0046, + "step": 10009 + }, + { + "epoch": 6.9345341184620715, + "grad_norm": 0.17337779700756073, + "learning_rate": 3.065880721220527e-06, + "loss": 0.0028, + "step": 10010 + }, + { + "epoch": 6.935226879113267, + "grad_norm": 0.18336065113544464, + "learning_rate": 3.0651872399445217e-06, + "loss": 0.003, + "step": 10011 + }, + { + "epoch": 6.935919639764461, + "grad_norm": 0.311021625995636, + "learning_rate": 3.0644937586685166e-06, + "loss": 0.0046, + "step": 10012 + }, + { + "epoch": 6.936612400415656, + "grad_norm": 0.35576215386390686, + "learning_rate": 3.0638002773925107e-06, + "loss": 0.0066, + "step": 10013 + }, + { + "epoch": 6.937305161066852, + "grad_norm": 0.1976678967475891, + "learning_rate": 3.063106796116505e-06, + "loss": 0.0031, + "step": 10014 + }, + { + "epoch": 6.937997921718046, + "grad_norm": 0.3793521523475647, + "learning_rate": 3.0624133148404993e-06, + "loss": 0.0042, + "step": 10015 + }, + { + "epoch": 6.938690682369241, + "grad_norm": 0.3814072012901306, + "learning_rate": 3.0617198335644938e-06, + "loss": 0.004, + "step": 10016 + }, + { + "epoch": 6.939383443020437, + "grad_norm": 0.20558927953243256, + "learning_rate": 3.0610263522884887e-06, + "loss": 0.0034, + "step": 10017 + }, + { + "epoch": 6.940076203671632, + "grad_norm": 0.2317202389240265, + "learning_rate": 3.060332871012483e-06, + "loss": 0.0031, + "step": 10018 + }, + { + "epoch": 6.940768964322826, + "grad_norm": 0.17047932744026184, + "learning_rate": 3.0596393897364773e-06, + "loss": 0.0031, + "step": 10019 + }, + { + "epoch": 6.941461724974022, + "grad_norm": 0.43688803911209106, + "learning_rate": 3.058945908460472e-06, + "loss": 0.0046, + "step": 10020 + }, + { + "epoch": 6.942154485625217, + "grad_norm": 0.1728978008031845, + "learning_rate": 3.058252427184466e-06, + "loss": 0.0031, + "step": 10021 + }, + { + "epoch": 6.942847246276411, + "grad_norm": 0.264182984828949, + "learning_rate": 3.057558945908461e-06, + "loss": 0.0033, + "step": 10022 + }, + { + "epoch": 6.9435400069276065, + "grad_norm": 0.17324721813201904, + "learning_rate": 3.0568654646324553e-06, + "loss": 0.0035, + "step": 10023 + }, + { + "epoch": 6.944232767578802, + "grad_norm": 0.22715601325035095, + "learning_rate": 3.0561719833564494e-06, + "loss": 0.0038, + "step": 10024 + }, + { + "epoch": 6.944925528229996, + "grad_norm": 0.2656223475933075, + "learning_rate": 3.055478502080444e-06, + "loss": 0.0043, + "step": 10025 + }, + { + "epoch": 6.9456182888811915, + "grad_norm": 0.2931860089302063, + "learning_rate": 3.0547850208044385e-06, + "loss": 0.006, + "step": 10026 + }, + { + "epoch": 6.946311049532387, + "grad_norm": 0.24384410679340363, + "learning_rate": 3.0540915395284334e-06, + "loss": 0.0046, + "step": 10027 + }, + { + "epoch": 6.947003810183581, + "grad_norm": 0.3938216269016266, + "learning_rate": 3.0533980582524275e-06, + "loss": 0.0046, + "step": 10028 + }, + { + "epoch": 6.947696570834776, + "grad_norm": 0.25126296281814575, + "learning_rate": 3.052704576976422e-06, + "loss": 0.0035, + "step": 10029 + }, + { + "epoch": 6.948389331485972, + "grad_norm": 0.2991364896297455, + "learning_rate": 3.052011095700416e-06, + "loss": 0.0049, + "step": 10030 + }, + { + "epoch": 6.949082092137167, + "grad_norm": 0.23555906116962433, + "learning_rate": 3.0513176144244106e-06, + "loss": 0.0028, + "step": 10031 + }, + { + "epoch": 6.949774852788361, + "grad_norm": 0.1772664189338684, + "learning_rate": 3.0506241331484055e-06, + "loss": 0.0028, + "step": 10032 + }, + { + "epoch": 6.950467613439557, + "grad_norm": 0.2503613829612732, + "learning_rate": 3.0499306518723996e-06, + "loss": 0.0029, + "step": 10033 + }, + { + "epoch": 6.951160374090752, + "grad_norm": 0.22569991648197174, + "learning_rate": 3.049237170596394e-06, + "loss": 0.0026, + "step": 10034 + }, + { + "epoch": 6.951853134741946, + "grad_norm": 0.2664303779602051, + "learning_rate": 3.0485436893203886e-06, + "loss": 0.0047, + "step": 10035 + }, + { + "epoch": 6.952545895393142, + "grad_norm": 0.6259468197822571, + "learning_rate": 3.0478502080443827e-06, + "loss": 0.0033, + "step": 10036 + }, + { + "epoch": 6.953238656044337, + "grad_norm": 0.22040881216526031, + "learning_rate": 3.0471567267683776e-06, + "loss": 0.004, + "step": 10037 + }, + { + "epoch": 6.953931416695532, + "grad_norm": 0.1329197734594345, + "learning_rate": 3.046463245492372e-06, + "loss": 0.0026, + "step": 10038 + }, + { + "epoch": 6.9546241773467266, + "grad_norm": 0.3064022362232208, + "learning_rate": 3.0457697642163662e-06, + "loss": 0.0032, + "step": 10039 + }, + { + "epoch": 6.955316937997922, + "grad_norm": 0.34832891821861267, + "learning_rate": 3.0450762829403607e-06, + "loss": 0.0054, + "step": 10040 + }, + { + "epoch": 6.956009698649117, + "grad_norm": 0.34846732020378113, + "learning_rate": 3.0443828016643553e-06, + "loss": 0.0051, + "step": 10041 + }, + { + "epoch": 6.9567024593003115, + "grad_norm": 0.22778034210205078, + "learning_rate": 3.0436893203883498e-06, + "loss": 0.003, + "step": 10042 + }, + { + "epoch": 6.957395219951507, + "grad_norm": 0.29621484875679016, + "learning_rate": 3.0429958391123443e-06, + "loss": 0.0039, + "step": 10043 + }, + { + "epoch": 6.958087980602702, + "grad_norm": 0.2415006160736084, + "learning_rate": 3.0423023578363388e-06, + "loss": 0.0029, + "step": 10044 + }, + { + "epoch": 6.9587807412538965, + "grad_norm": 0.3712511956691742, + "learning_rate": 3.041608876560333e-06, + "loss": 0.0023, + "step": 10045 + }, + { + "epoch": 6.959473501905092, + "grad_norm": 0.25687456130981445, + "learning_rate": 3.0409153952843274e-06, + "loss": 0.0049, + "step": 10046 + }, + { + "epoch": 6.960166262556287, + "grad_norm": 0.19633793830871582, + "learning_rate": 3.0402219140083223e-06, + "loss": 0.0037, + "step": 10047 + }, + { + "epoch": 6.960859023207481, + "grad_norm": 0.31647029519081116, + "learning_rate": 3.0395284327323164e-06, + "loss": 0.0047, + "step": 10048 + }, + { + "epoch": 6.961551783858677, + "grad_norm": 0.274926096200943, + "learning_rate": 3.038834951456311e-06, + "loss": 0.0051, + "step": 10049 + }, + { + "epoch": 6.962244544509872, + "grad_norm": 0.23959867656230927, + "learning_rate": 3.0381414701803054e-06, + "loss": 0.0044, + "step": 10050 + }, + { + "epoch": 6.962937305161067, + "grad_norm": 0.25957247614860535, + "learning_rate": 3.0374479889042995e-06, + "loss": 0.0039, + "step": 10051 + }, + { + "epoch": 6.963630065812262, + "grad_norm": 0.23399588465690613, + "learning_rate": 3.0367545076282944e-06, + "loss": 0.0034, + "step": 10052 + }, + { + "epoch": 6.964322826463457, + "grad_norm": 0.21195518970489502, + "learning_rate": 3.036061026352289e-06, + "loss": 0.0026, + "step": 10053 + }, + { + "epoch": 6.965015587114652, + "grad_norm": 0.2954472005367279, + "learning_rate": 3.035367545076283e-06, + "loss": 0.0054, + "step": 10054 + }, + { + "epoch": 6.965708347765847, + "grad_norm": 0.16996623575687408, + "learning_rate": 3.0346740638002775e-06, + "loss": 0.0028, + "step": 10055 + }, + { + "epoch": 6.966401108417042, + "grad_norm": 0.3674592971801758, + "learning_rate": 3.033980582524272e-06, + "loss": 0.0032, + "step": 10056 + }, + { + "epoch": 6.967093869068237, + "grad_norm": 0.16707442700862885, + "learning_rate": 3.0332871012482666e-06, + "loss": 0.0026, + "step": 10057 + }, + { + "epoch": 6.967786629719432, + "grad_norm": 0.2636074125766754, + "learning_rate": 3.032593619972261e-06, + "loss": 0.0033, + "step": 10058 + }, + { + "epoch": 6.968479390370627, + "grad_norm": 0.2006467580795288, + "learning_rate": 3.0319001386962556e-06, + "loss": 0.0031, + "step": 10059 + }, + { + "epoch": 6.969172151021822, + "grad_norm": 0.27483102679252625, + "learning_rate": 3.0312066574202497e-06, + "loss": 0.0037, + "step": 10060 + }, + { + "epoch": 6.969864911673017, + "grad_norm": 0.4790448546409607, + "learning_rate": 3.030513176144244e-06, + "loss": 0.0049, + "step": 10061 + }, + { + "epoch": 6.970557672324212, + "grad_norm": 0.3278796672821045, + "learning_rate": 3.029819694868239e-06, + "loss": 0.0045, + "step": 10062 + }, + { + "epoch": 6.971250432975407, + "grad_norm": 0.2741769254207611, + "learning_rate": 3.029126213592233e-06, + "loss": 0.0037, + "step": 10063 + }, + { + "epoch": 6.971943193626602, + "grad_norm": 0.23450294137001038, + "learning_rate": 3.0284327323162277e-06, + "loss": 0.0036, + "step": 10064 + }, + { + "epoch": 6.972635954277797, + "grad_norm": 0.5124578475952148, + "learning_rate": 3.0277392510402222e-06, + "loss": 0.0045, + "step": 10065 + }, + { + "epoch": 6.973328714928992, + "grad_norm": 0.2673770487308502, + "learning_rate": 3.0270457697642163e-06, + "loss": 0.005, + "step": 10066 + }, + { + "epoch": 6.974021475580187, + "grad_norm": 0.26082757115364075, + "learning_rate": 3.0263522884882112e-06, + "loss": 0.0046, + "step": 10067 + }, + { + "epoch": 6.974714236231382, + "grad_norm": 0.4629848301410675, + "learning_rate": 3.0256588072122058e-06, + "loss": 0.0046, + "step": 10068 + }, + { + "epoch": 6.975406996882577, + "grad_norm": 0.19269625842571259, + "learning_rate": 3.0249653259362e-06, + "loss": 0.0035, + "step": 10069 + }, + { + "epoch": 6.976099757533772, + "grad_norm": 0.31474941968917847, + "learning_rate": 3.0242718446601943e-06, + "loss": 0.0041, + "step": 10070 + }, + { + "epoch": 6.976792518184967, + "grad_norm": 0.29348450899124146, + "learning_rate": 3.0235783633841884e-06, + "loss": 0.0032, + "step": 10071 + }, + { + "epoch": 6.977485278836162, + "grad_norm": 0.6254693269729614, + "learning_rate": 3.0228848821081834e-06, + "loss": 0.0039, + "step": 10072 + }, + { + "epoch": 6.978178039487357, + "grad_norm": 0.1562761813402176, + "learning_rate": 3.022191400832178e-06, + "loss": 0.0024, + "step": 10073 + }, + { + "epoch": 6.978870800138552, + "grad_norm": 0.14508678019046783, + "learning_rate": 3.0214979195561724e-06, + "loss": 0.0022, + "step": 10074 + }, + { + "epoch": 6.979563560789747, + "grad_norm": 0.2147209197282791, + "learning_rate": 3.0208044382801665e-06, + "loss": 0.0025, + "step": 10075 + }, + { + "epoch": 6.980256321440942, + "grad_norm": 0.37642332911491394, + "learning_rate": 3.020110957004161e-06, + "loss": 0.0052, + "step": 10076 + }, + { + "epoch": 6.980949082092137, + "grad_norm": 0.13769502937793732, + "learning_rate": 3.019417475728156e-06, + "loss": 0.0028, + "step": 10077 + }, + { + "epoch": 6.981641842743333, + "grad_norm": 0.26323574781417847, + "learning_rate": 3.01872399445215e-06, + "loss": 0.0033, + "step": 10078 + }, + { + "epoch": 6.982334603394527, + "grad_norm": 0.19526053965091705, + "learning_rate": 3.0180305131761445e-06, + "loss": 0.0032, + "step": 10079 + }, + { + "epoch": 6.983027364045722, + "grad_norm": 0.20103825628757477, + "learning_rate": 3.017337031900139e-06, + "loss": 0.003, + "step": 10080 + }, + { + "epoch": 6.983720124696918, + "grad_norm": 0.18664149940013885, + "learning_rate": 3.016643550624133e-06, + "loss": 0.0033, + "step": 10081 + }, + { + "epoch": 6.984412885348112, + "grad_norm": 0.4142363965511322, + "learning_rate": 3.015950069348128e-06, + "loss": 0.0049, + "step": 10082 + }, + { + "epoch": 6.985105645999307, + "grad_norm": 0.27377110719680786, + "learning_rate": 3.0152565880721226e-06, + "loss": 0.0038, + "step": 10083 + }, + { + "epoch": 6.985798406650503, + "grad_norm": 0.22474831342697144, + "learning_rate": 3.0145631067961166e-06, + "loss": 0.0035, + "step": 10084 + }, + { + "epoch": 6.986491167301697, + "grad_norm": 0.19796189665794373, + "learning_rate": 3.013869625520111e-06, + "loss": 0.0034, + "step": 10085 + }, + { + "epoch": 6.987183927952892, + "grad_norm": 0.181865856051445, + "learning_rate": 3.0131761442441052e-06, + "loss": 0.0026, + "step": 10086 + }, + { + "epoch": 6.9878766886040875, + "grad_norm": 0.2278188169002533, + "learning_rate": 3.0124826629681e-06, + "loss": 0.0036, + "step": 10087 + }, + { + "epoch": 6.988569449255282, + "grad_norm": 0.19749589264392853, + "learning_rate": 3.0117891816920947e-06, + "loss": 0.0043, + "step": 10088 + }, + { + "epoch": 6.989262209906477, + "grad_norm": 0.1457054615020752, + "learning_rate": 3.011095700416089e-06, + "loss": 0.0033, + "step": 10089 + }, + { + "epoch": 6.9899549705576725, + "grad_norm": 0.18413835763931274, + "learning_rate": 3.0104022191400833e-06, + "loss": 0.0031, + "step": 10090 + }, + { + "epoch": 6.990647731208867, + "grad_norm": 0.3408403992652893, + "learning_rate": 3.0097087378640778e-06, + "loss": 0.0043, + "step": 10091 + }, + { + "epoch": 6.991340491860062, + "grad_norm": 0.17260783910751343, + "learning_rate": 3.0090152565880727e-06, + "loss": 0.003, + "step": 10092 + }, + { + "epoch": 6.992033252511257, + "grad_norm": 0.2064782828092575, + "learning_rate": 3.008321775312067e-06, + "loss": 0.0037, + "step": 10093 + }, + { + "epoch": 6.992726013162453, + "grad_norm": 0.19827456772327423, + "learning_rate": 3.0076282940360613e-06, + "loss": 0.0028, + "step": 10094 + }, + { + "epoch": 6.993418773813647, + "grad_norm": 0.22474165260791779, + "learning_rate": 3.0069348127600554e-06, + "loss": 0.0032, + "step": 10095 + }, + { + "epoch": 6.994111534464842, + "grad_norm": 0.20684751868247986, + "learning_rate": 3.00624133148405e-06, + "loss": 0.0037, + "step": 10096 + }, + { + "epoch": 6.994804295116038, + "grad_norm": 0.5600537061691284, + "learning_rate": 3.005547850208045e-06, + "loss": 0.0025, + "step": 10097 + }, + { + "epoch": 6.995497055767233, + "grad_norm": 0.20565995573997498, + "learning_rate": 3.0048543689320394e-06, + "loss": 0.0035, + "step": 10098 + }, + { + "epoch": 6.996189816418427, + "grad_norm": 0.2063204050064087, + "learning_rate": 3.0041608876560334e-06, + "loss": 0.0033, + "step": 10099 + }, + { + "epoch": 6.996882577069623, + "grad_norm": 0.21847106516361237, + "learning_rate": 3.003467406380028e-06, + "loss": 0.0042, + "step": 10100 + }, + { + "epoch": 6.997575337720818, + "grad_norm": 0.2257625311613083, + "learning_rate": 3.002773925104022e-06, + "loss": 0.0029, + "step": 10101 + }, + { + "epoch": 6.998268098372012, + "grad_norm": 0.154760479927063, + "learning_rate": 3.002080443828017e-06, + "loss": 0.0027, + "step": 10102 + }, + { + "epoch": 6.9989608590232075, + "grad_norm": 0.6939622163772583, + "learning_rate": 3.0013869625520115e-06, + "loss": 0.0047, + "step": 10103 + }, + { + "epoch": 6.999653619674403, + "grad_norm": 0.17868445813655853, + "learning_rate": 3.000693481276006e-06, + "loss": 0.0028, + "step": 10104 + }, + { + "epoch": 6.999653619674403, + "eval_loss": 0.3009836971759796, + "eval_runtime": 7671.4969, + "eval_samples_per_second": 1.043, + "eval_steps_per_second": 0.033, + "eval_wer": 12.362416576400294, + "step": 10104 + }, + { + "epoch": 7.000346380325597, + "grad_norm": 0.2716224491596222, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 10105 + }, + { + "epoch": 7.0010391409767925, + "grad_norm": 0.09159483015537262, + "learning_rate": 2.9993065187239946e-06, + "loss": 0.0019, + "step": 10106 + }, + { + "epoch": 7.001731901627988, + "grad_norm": 0.5564009547233582, + "learning_rate": 2.9986130374479895e-06, + "loss": 0.0024, + "step": 10107 + }, + { + "epoch": 7.002424662279182, + "grad_norm": 0.23817522823810577, + "learning_rate": 2.9979195561719836e-06, + "loss": 0.0026, + "step": 10108 + }, + { + "epoch": 7.003117422930377, + "grad_norm": 0.11498336493968964, + "learning_rate": 2.997226074895978e-06, + "loss": 0.0018, + "step": 10109 + }, + { + "epoch": 7.003810183581573, + "grad_norm": 0.28012868762016296, + "learning_rate": 2.996532593619972e-06, + "loss": 0.0046, + "step": 10110 + }, + { + "epoch": 7.004502944232768, + "grad_norm": 0.31910011172294617, + "learning_rate": 2.9958391123439667e-06, + "loss": 0.0021, + "step": 10111 + }, + { + "epoch": 7.005195704883962, + "grad_norm": 0.07397008687257767, + "learning_rate": 2.9951456310679616e-06, + "loss": 0.0018, + "step": 10112 + }, + { + "epoch": 7.005888465535158, + "grad_norm": 0.12284370511770248, + "learning_rate": 2.994452149791956e-06, + "loss": 0.0022, + "step": 10113 + }, + { + "epoch": 7.006581226186353, + "grad_norm": 0.11359403282403946, + "learning_rate": 2.9937586685159502e-06, + "loss": 0.0018, + "step": 10114 + }, + { + "epoch": 7.007273986837547, + "grad_norm": 0.21793392300605774, + "learning_rate": 2.9930651872399448e-06, + "loss": 0.0024, + "step": 10115 + }, + { + "epoch": 7.007966747488743, + "grad_norm": 0.1326545625925064, + "learning_rate": 2.992371705963939e-06, + "loss": 0.0027, + "step": 10116 + }, + { + "epoch": 7.008659508139938, + "grad_norm": 0.24529285728931427, + "learning_rate": 2.9916782246879338e-06, + "loss": 0.0022, + "step": 10117 + }, + { + "epoch": 7.009352268791132, + "grad_norm": 0.12091325968503952, + "learning_rate": 2.9909847434119283e-06, + "loss": 0.0019, + "step": 10118 + }, + { + "epoch": 7.0100450294423275, + "grad_norm": 0.1797342598438263, + "learning_rate": 2.9902912621359224e-06, + "loss": 0.0023, + "step": 10119 + }, + { + "epoch": 7.010737790093523, + "grad_norm": 0.18923820555210114, + "learning_rate": 2.989597780859917e-06, + "loss": 0.0022, + "step": 10120 + }, + { + "epoch": 7.011430550744718, + "grad_norm": 0.7115009427070618, + "learning_rate": 2.9889042995839114e-06, + "loss": 0.0023, + "step": 10121 + }, + { + "epoch": 7.0121233113959125, + "grad_norm": 0.10658278316259384, + "learning_rate": 2.9882108183079063e-06, + "loss": 0.002, + "step": 10122 + }, + { + "epoch": 7.012816072047108, + "grad_norm": 0.1364043653011322, + "learning_rate": 2.9875173370319004e-06, + "loss": 0.0022, + "step": 10123 + }, + { + "epoch": 7.013508832698303, + "grad_norm": 0.27905717492103577, + "learning_rate": 2.986823855755895e-06, + "loss": 0.003, + "step": 10124 + }, + { + "epoch": 7.014201593349497, + "grad_norm": 0.11260130256414413, + "learning_rate": 2.986130374479889e-06, + "loss": 0.0019, + "step": 10125 + }, + { + "epoch": 7.014894354000693, + "grad_norm": 0.09679900109767914, + "learning_rate": 2.9854368932038835e-06, + "loss": 0.0017, + "step": 10126 + }, + { + "epoch": 7.015587114651888, + "grad_norm": 0.11848677694797516, + "learning_rate": 2.9847434119278784e-06, + "loss": 0.0021, + "step": 10127 + }, + { + "epoch": 7.016279875303082, + "grad_norm": 0.2643411159515381, + "learning_rate": 2.984049930651873e-06, + "loss": 0.0024, + "step": 10128 + }, + { + "epoch": 7.016972635954278, + "grad_norm": 0.14489522576332092, + "learning_rate": 2.983356449375867e-06, + "loss": 0.0026, + "step": 10129 + }, + { + "epoch": 7.017665396605473, + "grad_norm": 0.149917334318161, + "learning_rate": 2.9826629680998616e-06, + "loss": 0.002, + "step": 10130 + }, + { + "epoch": 7.018358157256668, + "grad_norm": 0.12082762271165848, + "learning_rate": 2.9819694868238556e-06, + "loss": 0.0027, + "step": 10131 + }, + { + "epoch": 7.019050917907863, + "grad_norm": 0.15150338411331177, + "learning_rate": 2.9812760055478506e-06, + "loss": 0.0022, + "step": 10132 + }, + { + "epoch": 7.019743678559058, + "grad_norm": 0.16063298285007477, + "learning_rate": 2.980582524271845e-06, + "loss": 0.0025, + "step": 10133 + }, + { + "epoch": 7.020436439210253, + "grad_norm": 0.20068734884262085, + "learning_rate": 2.979889042995839e-06, + "loss": 0.0021, + "step": 10134 + }, + { + "epoch": 7.021129199861448, + "grad_norm": 0.12279853969812393, + "learning_rate": 2.9791955617198337e-06, + "loss": 0.0026, + "step": 10135 + }, + { + "epoch": 7.021821960512643, + "grad_norm": 0.12573403120040894, + "learning_rate": 2.978502080443828e-06, + "loss": 0.0022, + "step": 10136 + }, + { + "epoch": 7.022514721163838, + "grad_norm": 0.23808933794498444, + "learning_rate": 2.977808599167823e-06, + "loss": 0.0025, + "step": 10137 + }, + { + "epoch": 7.0232074818150325, + "grad_norm": 0.25830936431884766, + "learning_rate": 2.977115117891817e-06, + "loss": 0.0045, + "step": 10138 + }, + { + "epoch": 7.023900242466228, + "grad_norm": 0.1041800007224083, + "learning_rate": 2.9764216366158117e-06, + "loss": 0.0017, + "step": 10139 + }, + { + "epoch": 7.024593003117423, + "grad_norm": 0.1521730124950409, + "learning_rate": 2.975728155339806e-06, + "loss": 0.0027, + "step": 10140 + }, + { + "epoch": 7.025285763768618, + "grad_norm": 0.19751764833927155, + "learning_rate": 2.9750346740638003e-06, + "loss": 0.0021, + "step": 10141 + }, + { + "epoch": 7.025978524419813, + "grad_norm": 0.06638767570257187, + "learning_rate": 2.9743411927877952e-06, + "loss": 0.0017, + "step": 10142 + }, + { + "epoch": 7.026671285071008, + "grad_norm": 0.2514634430408478, + "learning_rate": 2.9736477115117898e-06, + "loss": 0.0035, + "step": 10143 + }, + { + "epoch": 7.027364045722203, + "grad_norm": 0.07468952983617783, + "learning_rate": 2.972954230235784e-06, + "loss": 0.0016, + "step": 10144 + }, + { + "epoch": 7.028056806373398, + "grad_norm": 0.16920965909957886, + "learning_rate": 2.9722607489597784e-06, + "loss": 0.0019, + "step": 10145 + }, + { + "epoch": 7.028749567024593, + "grad_norm": 0.11712231487035751, + "learning_rate": 2.9715672676837724e-06, + "loss": 0.0018, + "step": 10146 + }, + { + "epoch": 7.029442327675788, + "grad_norm": 0.10406385362148285, + "learning_rate": 2.9708737864077674e-06, + "loss": 0.0027, + "step": 10147 + }, + { + "epoch": 7.030135088326983, + "grad_norm": 0.2668299674987793, + "learning_rate": 2.970180305131762e-06, + "loss": 0.0025, + "step": 10148 + }, + { + "epoch": 7.030827848978178, + "grad_norm": 0.13492488861083984, + "learning_rate": 2.969486823855756e-06, + "loss": 0.0018, + "step": 10149 + }, + { + "epoch": 7.031520609629373, + "grad_norm": 0.10186266899108887, + "learning_rate": 2.9687933425797505e-06, + "loss": 0.0021, + "step": 10150 + }, + { + "epoch": 7.0322133702805685, + "grad_norm": 0.2647084891796112, + "learning_rate": 2.968099861303745e-06, + "loss": 0.003, + "step": 10151 + }, + { + "epoch": 7.032906130931763, + "grad_norm": 0.20142273604869843, + "learning_rate": 2.96740638002774e-06, + "loss": 0.0028, + "step": 10152 + }, + { + "epoch": 7.033598891582958, + "grad_norm": 0.11634217202663422, + "learning_rate": 2.966712898751734e-06, + "loss": 0.0018, + "step": 10153 + }, + { + "epoch": 7.034291652234153, + "grad_norm": 0.07488865405321121, + "learning_rate": 2.9660194174757285e-06, + "loss": 0.0015, + "step": 10154 + }, + { + "epoch": 7.034984412885348, + "grad_norm": 0.11648157238960266, + "learning_rate": 2.9653259361997226e-06, + "loss": 0.0028, + "step": 10155 + }, + { + "epoch": 7.035677173536543, + "grad_norm": 0.12555374205112457, + "learning_rate": 2.964632454923717e-06, + "loss": 0.0021, + "step": 10156 + }, + { + "epoch": 7.036369934187738, + "grad_norm": 0.21055260300636292, + "learning_rate": 2.963938973647712e-06, + "loss": 0.0028, + "step": 10157 + }, + { + "epoch": 7.037062694838933, + "grad_norm": 0.14861522614955902, + "learning_rate": 2.963245492371706e-06, + "loss": 0.0022, + "step": 10158 + }, + { + "epoch": 7.037755455490128, + "grad_norm": 0.10379479825496674, + "learning_rate": 2.9625520110957006e-06, + "loss": 0.0017, + "step": 10159 + }, + { + "epoch": 7.038448216141323, + "grad_norm": 0.10191231220960617, + "learning_rate": 2.961858529819695e-06, + "loss": 0.002, + "step": 10160 + }, + { + "epoch": 7.039140976792519, + "grad_norm": 1.4535095691680908, + "learning_rate": 2.9611650485436892e-06, + "loss": 0.0025, + "step": 10161 + }, + { + "epoch": 7.039833737443713, + "grad_norm": 0.10545312613248825, + "learning_rate": 2.960471567267684e-06, + "loss": 0.0019, + "step": 10162 + }, + { + "epoch": 7.040526498094908, + "grad_norm": 0.15498687326908112, + "learning_rate": 2.9597780859916787e-06, + "loss": 0.0024, + "step": 10163 + }, + { + "epoch": 7.0412192587461035, + "grad_norm": 0.13552707433700562, + "learning_rate": 2.9590846047156728e-06, + "loss": 0.0017, + "step": 10164 + }, + { + "epoch": 7.041912019397298, + "grad_norm": 0.07262732833623886, + "learning_rate": 2.9583911234396673e-06, + "loss": 0.0017, + "step": 10165 + }, + { + "epoch": 7.042604780048493, + "grad_norm": 0.1755642294883728, + "learning_rate": 2.957697642163662e-06, + "loss": 0.0018, + "step": 10166 + }, + { + "epoch": 7.0432975406996885, + "grad_norm": 0.15847671031951904, + "learning_rate": 2.9570041608876567e-06, + "loss": 0.0021, + "step": 10167 + }, + { + "epoch": 7.043990301350883, + "grad_norm": 0.17660565674304962, + "learning_rate": 2.956310679611651e-06, + "loss": 0.0027, + "step": 10168 + }, + { + "epoch": 7.044683062002078, + "grad_norm": 0.06941859424114227, + "learning_rate": 2.9556171983356453e-06, + "loss": 0.0015, + "step": 10169 + }, + { + "epoch": 7.0453758226532734, + "grad_norm": 0.15044160187244415, + "learning_rate": 2.9549237170596394e-06, + "loss": 0.0021, + "step": 10170 + }, + { + "epoch": 7.046068583304469, + "grad_norm": 0.2087961882352829, + "learning_rate": 2.954230235783634e-06, + "loss": 0.0023, + "step": 10171 + }, + { + "epoch": 7.046761343955663, + "grad_norm": 0.2598675787448883, + "learning_rate": 2.953536754507629e-06, + "loss": 0.0024, + "step": 10172 + }, + { + "epoch": 7.047454104606858, + "grad_norm": 0.15336377918720245, + "learning_rate": 2.952843273231623e-06, + "loss": 0.0024, + "step": 10173 + }, + { + "epoch": 7.048146865258054, + "grad_norm": 0.3325541615486145, + "learning_rate": 2.9521497919556174e-06, + "loss": 0.0038, + "step": 10174 + }, + { + "epoch": 7.048839625909248, + "grad_norm": 0.1810804307460785, + "learning_rate": 2.951456310679612e-06, + "loss": 0.0021, + "step": 10175 + }, + { + "epoch": 7.049532386560443, + "grad_norm": 0.2572703957557678, + "learning_rate": 2.950762829403606e-06, + "loss": 0.0034, + "step": 10176 + }, + { + "epoch": 7.050225147211639, + "grad_norm": 0.09396033734083176, + "learning_rate": 2.950069348127601e-06, + "loss": 0.0015, + "step": 10177 + }, + { + "epoch": 7.050917907862833, + "grad_norm": 0.3814314305782318, + "learning_rate": 2.9493758668515955e-06, + "loss": 0.002, + "step": 10178 + }, + { + "epoch": 7.051610668514028, + "grad_norm": 0.14968882501125336, + "learning_rate": 2.9486823855755896e-06, + "loss": 0.0023, + "step": 10179 + }, + { + "epoch": 7.052303429165224, + "grad_norm": 0.23168759047985077, + "learning_rate": 2.947988904299584e-06, + "loss": 0.0026, + "step": 10180 + }, + { + "epoch": 7.052996189816419, + "grad_norm": 0.12254835665225983, + "learning_rate": 2.9472954230235786e-06, + "loss": 0.0017, + "step": 10181 + }, + { + "epoch": 7.053688950467613, + "grad_norm": 0.11456603556871414, + "learning_rate": 2.946601941747573e-06, + "loss": 0.0016, + "step": 10182 + }, + { + "epoch": 7.0543817111188085, + "grad_norm": 0.14420464634895325, + "learning_rate": 2.9459084604715676e-06, + "loss": 0.0017, + "step": 10183 + }, + { + "epoch": 7.055074471770004, + "grad_norm": 0.10403777658939362, + "learning_rate": 2.945214979195562e-06, + "loss": 0.0017, + "step": 10184 + }, + { + "epoch": 7.055767232421198, + "grad_norm": 0.1886509209871292, + "learning_rate": 2.944521497919556e-06, + "loss": 0.0026, + "step": 10185 + }, + { + "epoch": 7.0564599930723935, + "grad_norm": 0.08650385588407516, + "learning_rate": 2.9438280166435507e-06, + "loss": 0.0017, + "step": 10186 + }, + { + "epoch": 7.057152753723589, + "grad_norm": 0.09474591165781021, + "learning_rate": 2.9431345353675457e-06, + "loss": 0.0021, + "step": 10187 + }, + { + "epoch": 7.057845514374783, + "grad_norm": 0.40305501222610474, + "learning_rate": 2.9424410540915397e-06, + "loss": 0.0029, + "step": 10188 + }, + { + "epoch": 7.058538275025978, + "grad_norm": 0.11685768514871597, + "learning_rate": 2.9417475728155342e-06, + "loss": 0.0017, + "step": 10189 + }, + { + "epoch": 7.059231035677174, + "grad_norm": 0.17035429179668427, + "learning_rate": 2.9410540915395288e-06, + "loss": 0.0019, + "step": 10190 + }, + { + "epoch": 7.059923796328369, + "grad_norm": 0.10403960943222046, + "learning_rate": 2.940360610263523e-06, + "loss": 0.0019, + "step": 10191 + }, + { + "epoch": 7.060616556979563, + "grad_norm": 0.08097000420093536, + "learning_rate": 2.9396671289875178e-06, + "loss": 0.0017, + "step": 10192 + }, + { + "epoch": 7.061309317630759, + "grad_norm": 0.0710969939827919, + "learning_rate": 2.9389736477115123e-06, + "loss": 0.0014, + "step": 10193 + }, + { + "epoch": 7.062002078281954, + "grad_norm": 0.13937436044216156, + "learning_rate": 2.9382801664355064e-06, + "loss": 0.002, + "step": 10194 + }, + { + "epoch": 7.062694838933148, + "grad_norm": 0.2913510501384735, + "learning_rate": 2.937586685159501e-06, + "loss": 0.0036, + "step": 10195 + }, + { + "epoch": 7.063387599584344, + "grad_norm": 0.11702926456928253, + "learning_rate": 2.9368932038834954e-06, + "loss": 0.0018, + "step": 10196 + }, + { + "epoch": 7.064080360235539, + "grad_norm": 0.17788025736808777, + "learning_rate": 2.93619972260749e-06, + "loss": 0.0037, + "step": 10197 + }, + { + "epoch": 7.064773120886733, + "grad_norm": 0.11324986815452576, + "learning_rate": 2.9355062413314844e-06, + "loss": 0.0017, + "step": 10198 + }, + { + "epoch": 7.0654658815379285, + "grad_norm": 0.22492696344852448, + "learning_rate": 2.934812760055479e-06, + "loss": 0.0023, + "step": 10199 + }, + { + "epoch": 7.066158642189124, + "grad_norm": 0.1243765652179718, + "learning_rate": 2.934119278779473e-06, + "loss": 0.0018, + "step": 10200 + }, + { + "epoch": 7.066851402840319, + "grad_norm": 0.09988114982843399, + "learning_rate": 2.9334257975034675e-06, + "loss": 0.0017, + "step": 10201 + }, + { + "epoch": 7.0675441634915135, + "grad_norm": 0.13032008707523346, + "learning_rate": 2.9327323162274625e-06, + "loss": 0.0019, + "step": 10202 + }, + { + "epoch": 7.068236924142709, + "grad_norm": 0.10445775091648102, + "learning_rate": 2.9320388349514565e-06, + "loss": 0.0017, + "step": 10203 + }, + { + "epoch": 7.068929684793904, + "grad_norm": 0.12967832386493683, + "learning_rate": 2.931345353675451e-06, + "loss": 0.0017, + "step": 10204 + }, + { + "epoch": 7.069622445445098, + "grad_norm": 0.4911452829837799, + "learning_rate": 2.9306518723994456e-06, + "loss": 0.002, + "step": 10205 + }, + { + "epoch": 7.070315206096294, + "grad_norm": 0.10737442970275879, + "learning_rate": 2.9299583911234396e-06, + "loss": 0.0016, + "step": 10206 + }, + { + "epoch": 7.071007966747489, + "grad_norm": 0.07060238718986511, + "learning_rate": 2.9292649098474346e-06, + "loss": 0.0016, + "step": 10207 + }, + { + "epoch": 7.071700727398683, + "grad_norm": 0.10306859761476517, + "learning_rate": 2.928571428571429e-06, + "loss": 0.0015, + "step": 10208 + }, + { + "epoch": 7.072393488049879, + "grad_norm": 0.12221773713827133, + "learning_rate": 2.927877947295423e-06, + "loss": 0.002, + "step": 10209 + }, + { + "epoch": 7.073086248701074, + "grad_norm": 0.12454909831285477, + "learning_rate": 2.9271844660194177e-06, + "loss": 0.0021, + "step": 10210 + }, + { + "epoch": 7.073779009352269, + "grad_norm": 0.24364261329174042, + "learning_rate": 2.9264909847434118e-06, + "loss": 0.0029, + "step": 10211 + }, + { + "epoch": 7.074471770003464, + "grad_norm": 0.1029602512717247, + "learning_rate": 2.9257975034674067e-06, + "loss": 0.0019, + "step": 10212 + }, + { + "epoch": 7.075164530654659, + "grad_norm": 0.1706295758485794, + "learning_rate": 2.9251040221914012e-06, + "loss": 0.002, + "step": 10213 + }, + { + "epoch": 7.075857291305854, + "grad_norm": 0.07938599586486816, + "learning_rate": 2.9244105409153957e-06, + "loss": 0.0016, + "step": 10214 + }, + { + "epoch": 7.076550051957049, + "grad_norm": 0.09258869290351868, + "learning_rate": 2.92371705963939e-06, + "loss": 0.0015, + "step": 10215 + }, + { + "epoch": 7.077242812608244, + "grad_norm": 0.14611390233039856, + "learning_rate": 2.9230235783633843e-06, + "loss": 0.0018, + "step": 10216 + }, + { + "epoch": 7.077935573259439, + "grad_norm": 0.1732112616300583, + "learning_rate": 2.9223300970873793e-06, + "loss": 0.0029, + "step": 10217 + }, + { + "epoch": 7.0786283339106335, + "grad_norm": 0.11785987764596939, + "learning_rate": 2.9216366158113733e-06, + "loss": 0.0022, + "step": 10218 + }, + { + "epoch": 7.079321094561829, + "grad_norm": 0.13899289071559906, + "learning_rate": 2.920943134535368e-06, + "loss": 0.0019, + "step": 10219 + }, + { + "epoch": 7.080013855213024, + "grad_norm": 0.10356828570365906, + "learning_rate": 2.9202496532593624e-06, + "loss": 0.0018, + "step": 10220 + }, + { + "epoch": 7.080706615864219, + "grad_norm": 0.1071179062128067, + "learning_rate": 2.9195561719833564e-06, + "loss": 0.0024, + "step": 10221 + }, + { + "epoch": 7.081399376515414, + "grad_norm": 0.1809478998184204, + "learning_rate": 2.9188626907073514e-06, + "loss": 0.002, + "step": 10222 + }, + { + "epoch": 7.082092137166609, + "grad_norm": 0.21219773590564728, + "learning_rate": 2.918169209431346e-06, + "loss": 0.0021, + "step": 10223 + }, + { + "epoch": 7.082784897817804, + "grad_norm": 0.1390245407819748, + "learning_rate": 2.91747572815534e-06, + "loss": 0.0019, + "step": 10224 + }, + { + "epoch": 7.083477658468999, + "grad_norm": 0.18222716450691223, + "learning_rate": 2.9167822468793345e-06, + "loss": 0.0018, + "step": 10225 + }, + { + "epoch": 7.084170419120194, + "grad_norm": 0.13278789818286896, + "learning_rate": 2.9160887656033286e-06, + "loss": 0.0021, + "step": 10226 + }, + { + "epoch": 7.084863179771389, + "grad_norm": 0.3092099726200104, + "learning_rate": 2.9153952843273235e-06, + "loss": 0.0025, + "step": 10227 + }, + { + "epoch": 7.085555940422584, + "grad_norm": 0.08629997819662094, + "learning_rate": 2.914701803051318e-06, + "loss": 0.0018, + "step": 10228 + }, + { + "epoch": 7.086248701073779, + "grad_norm": 0.10827749222517014, + "learning_rate": 2.9140083217753125e-06, + "loss": 0.0019, + "step": 10229 + }, + { + "epoch": 7.086941461724974, + "grad_norm": 0.15044258534908295, + "learning_rate": 2.9133148404993066e-06, + "loss": 0.0018, + "step": 10230 + }, + { + "epoch": 7.0876342223761695, + "grad_norm": 0.10118412226438522, + "learning_rate": 2.912621359223301e-06, + "loss": 0.002, + "step": 10231 + }, + { + "epoch": 7.088326983027364, + "grad_norm": 0.18661916255950928, + "learning_rate": 2.911927877947296e-06, + "loss": 0.0033, + "step": 10232 + }, + { + "epoch": 7.089019743678559, + "grad_norm": 0.1472817361354828, + "learning_rate": 2.91123439667129e-06, + "loss": 0.002, + "step": 10233 + }, + { + "epoch": 7.089712504329754, + "grad_norm": 0.31590205430984497, + "learning_rate": 2.9105409153952847e-06, + "loss": 0.0023, + "step": 10234 + }, + { + "epoch": 7.090405264980949, + "grad_norm": 0.3203689754009247, + "learning_rate": 2.9098474341192787e-06, + "loss": 0.0039, + "step": 10235 + }, + { + "epoch": 7.091098025632144, + "grad_norm": 0.141969233751297, + "learning_rate": 2.9091539528432732e-06, + "loss": 0.0017, + "step": 10236 + }, + { + "epoch": 7.091790786283339, + "grad_norm": 0.10086339712142944, + "learning_rate": 2.908460471567268e-06, + "loss": 0.0015, + "step": 10237 + }, + { + "epoch": 7.092483546934534, + "grad_norm": 0.08305101096630096, + "learning_rate": 2.9077669902912627e-06, + "loss": 0.0016, + "step": 10238 + }, + { + "epoch": 7.093176307585729, + "grad_norm": 0.1501968502998352, + "learning_rate": 2.9070735090152568e-06, + "loss": 0.0017, + "step": 10239 + }, + { + "epoch": 7.093869068236924, + "grad_norm": 0.12165763229131699, + "learning_rate": 2.9063800277392513e-06, + "loss": 0.0019, + "step": 10240 + }, + { + "epoch": 7.09456182888812, + "grad_norm": 0.2288498729467392, + "learning_rate": 2.9056865464632454e-06, + "loss": 0.0021, + "step": 10241 + }, + { + "epoch": 7.095254589539314, + "grad_norm": 0.1075778380036354, + "learning_rate": 2.9049930651872403e-06, + "loss": 0.0017, + "step": 10242 + }, + { + "epoch": 7.095947350190509, + "grad_norm": 0.17690244317054749, + "learning_rate": 2.904299583911235e-06, + "loss": 0.0021, + "step": 10243 + }, + { + "epoch": 7.0966401108417045, + "grad_norm": 0.15283559262752533, + "learning_rate": 2.9036061026352293e-06, + "loss": 0.0023, + "step": 10244 + }, + { + "epoch": 7.097332871492899, + "grad_norm": 0.07930505275726318, + "learning_rate": 2.9029126213592234e-06, + "loss": 0.0015, + "step": 10245 + }, + { + "epoch": 7.098025632144094, + "grad_norm": 0.1359519362449646, + "learning_rate": 2.902219140083218e-06, + "loss": 0.0018, + "step": 10246 + }, + { + "epoch": 7.0987183927952895, + "grad_norm": 0.1672157496213913, + "learning_rate": 2.901525658807213e-06, + "loss": 0.0024, + "step": 10247 + }, + { + "epoch": 7.099411153446484, + "grad_norm": 0.1201040968298912, + "learning_rate": 2.900832177531207e-06, + "loss": 0.0015, + "step": 10248 + }, + { + "epoch": 7.100103914097679, + "grad_norm": 0.11735248565673828, + "learning_rate": 2.9001386962552015e-06, + "loss": 0.0017, + "step": 10249 + }, + { + "epoch": 7.100796674748874, + "grad_norm": 0.08093228191137314, + "learning_rate": 2.8994452149791955e-06, + "loss": 0.0016, + "step": 10250 + }, + { + "epoch": 7.101489435400069, + "grad_norm": 0.13805758953094482, + "learning_rate": 2.89875173370319e-06, + "loss": 0.0028, + "step": 10251 + }, + { + "epoch": 7.102182196051264, + "grad_norm": 0.10857182741165161, + "learning_rate": 2.898058252427185e-06, + "loss": 0.0019, + "step": 10252 + }, + { + "epoch": 7.102874956702459, + "grad_norm": 0.13954514265060425, + "learning_rate": 2.8973647711511795e-06, + "loss": 0.0018, + "step": 10253 + }, + { + "epoch": 7.103567717353655, + "grad_norm": 0.10731736570596695, + "learning_rate": 2.8966712898751736e-06, + "loss": 0.0015, + "step": 10254 + }, + { + "epoch": 7.104260478004849, + "grad_norm": 0.12175274640321732, + "learning_rate": 2.895977808599168e-06, + "loss": 0.0028, + "step": 10255 + }, + { + "epoch": 7.104953238656044, + "grad_norm": 0.10490991920232773, + "learning_rate": 2.895284327323162e-06, + "loss": 0.0014, + "step": 10256 + }, + { + "epoch": 7.10564599930724, + "grad_norm": 0.4596697688102722, + "learning_rate": 2.894590846047157e-06, + "loss": 0.0026, + "step": 10257 + }, + { + "epoch": 7.106338759958434, + "grad_norm": 0.07938335090875626, + "learning_rate": 2.8938973647711516e-06, + "loss": 0.0016, + "step": 10258 + }, + { + "epoch": 7.107031520609629, + "grad_norm": 0.08749832957983017, + "learning_rate": 2.8932038834951457e-06, + "loss": 0.0016, + "step": 10259 + }, + { + "epoch": 7.107724281260825, + "grad_norm": 0.09177283942699432, + "learning_rate": 2.8925104022191402e-06, + "loss": 0.0015, + "step": 10260 + }, + { + "epoch": 7.10841704191202, + "grad_norm": 0.06884568929672241, + "learning_rate": 2.8918169209431347e-06, + "loss": 0.0014, + "step": 10261 + }, + { + "epoch": 7.109109802563214, + "grad_norm": 0.16840292513370514, + "learning_rate": 2.8911234396671297e-06, + "loss": 0.0023, + "step": 10262 + }, + { + "epoch": 7.1098025632144095, + "grad_norm": 0.20110274851322174, + "learning_rate": 2.8904299583911237e-06, + "loss": 0.0016, + "step": 10263 + }, + { + "epoch": 7.110495323865605, + "grad_norm": 0.1194235309958458, + "learning_rate": 2.8897364771151183e-06, + "loss": 0.0018, + "step": 10264 + }, + { + "epoch": 7.111188084516799, + "grad_norm": 0.1636734902858734, + "learning_rate": 2.8890429958391123e-06, + "loss": 0.0019, + "step": 10265 + }, + { + "epoch": 7.1118808451679945, + "grad_norm": 0.7341674566268921, + "learning_rate": 2.888349514563107e-06, + "loss": 0.0018, + "step": 10266 + }, + { + "epoch": 7.11257360581919, + "grad_norm": 0.12724405527114868, + "learning_rate": 2.8876560332871018e-06, + "loss": 0.0017, + "step": 10267 + }, + { + "epoch": 7.113266366470384, + "grad_norm": 0.5825866460800171, + "learning_rate": 2.8869625520110963e-06, + "loss": 0.0026, + "step": 10268 + }, + { + "epoch": 7.113959127121579, + "grad_norm": 0.1154637336730957, + "learning_rate": 2.8862690707350904e-06, + "loss": 0.0016, + "step": 10269 + }, + { + "epoch": 7.114651887772775, + "grad_norm": 0.05893559381365776, + "learning_rate": 2.885575589459085e-06, + "loss": 0.0014, + "step": 10270 + }, + { + "epoch": 7.115344648423969, + "grad_norm": 0.1777360886335373, + "learning_rate": 2.884882108183079e-06, + "loss": 0.0019, + "step": 10271 + }, + { + "epoch": 7.116037409075164, + "grad_norm": 0.4561309516429901, + "learning_rate": 2.884188626907074e-06, + "loss": 0.0022, + "step": 10272 + }, + { + "epoch": 7.11673016972636, + "grad_norm": 0.10777769982814789, + "learning_rate": 2.8834951456310684e-06, + "loss": 0.0016, + "step": 10273 + }, + { + "epoch": 7.117422930377555, + "grad_norm": 0.06441865861415863, + "learning_rate": 2.8828016643550625e-06, + "loss": 0.0013, + "step": 10274 + }, + { + "epoch": 7.118115691028749, + "grad_norm": 0.23009923100471497, + "learning_rate": 2.882108183079057e-06, + "loss": 0.0024, + "step": 10275 + }, + { + "epoch": 7.118808451679945, + "grad_norm": 0.17869342863559723, + "learning_rate": 2.8814147018030515e-06, + "loss": 0.0024, + "step": 10276 + }, + { + "epoch": 7.11950121233114, + "grad_norm": 0.1781490296125412, + "learning_rate": 2.8807212205270465e-06, + "loss": 0.0016, + "step": 10277 + }, + { + "epoch": 7.120193972982334, + "grad_norm": 0.12733760476112366, + "learning_rate": 2.8800277392510405e-06, + "loss": 0.0018, + "step": 10278 + }, + { + "epoch": 7.1208867336335295, + "grad_norm": 0.38746392726898193, + "learning_rate": 2.879334257975035e-06, + "loss": 0.0037, + "step": 10279 + }, + { + "epoch": 7.121579494284725, + "grad_norm": 0.12233682721853256, + "learning_rate": 2.878640776699029e-06, + "loss": 0.0017, + "step": 10280 + }, + { + "epoch": 7.12227225493592, + "grad_norm": 0.1713201105594635, + "learning_rate": 2.8779472954230237e-06, + "loss": 0.0034, + "step": 10281 + }, + { + "epoch": 7.1229650155871145, + "grad_norm": 0.13770155608654022, + "learning_rate": 2.8772538141470186e-06, + "loss": 0.0025, + "step": 10282 + }, + { + "epoch": 7.12365777623831, + "grad_norm": 0.08295518159866333, + "learning_rate": 2.8765603328710127e-06, + "loss": 0.0017, + "step": 10283 + }, + { + "epoch": 7.124350536889505, + "grad_norm": 0.3251071274280548, + "learning_rate": 2.875866851595007e-06, + "loss": 0.002, + "step": 10284 + }, + { + "epoch": 7.125043297540699, + "grad_norm": 0.17440323531627655, + "learning_rate": 2.8751733703190017e-06, + "loss": 0.0031, + "step": 10285 + }, + { + "epoch": 7.125736058191895, + "grad_norm": 0.395295649766922, + "learning_rate": 2.8744798890429958e-06, + "loss": 0.003, + "step": 10286 + }, + { + "epoch": 7.12642881884309, + "grad_norm": 0.09597323834896088, + "learning_rate": 2.8737864077669903e-06, + "loss": 0.0016, + "step": 10287 + }, + { + "epoch": 7.127121579494284, + "grad_norm": 0.19693192839622498, + "learning_rate": 2.8730929264909852e-06, + "loss": 0.002, + "step": 10288 + }, + { + "epoch": 7.12781434014548, + "grad_norm": 0.09158217161893845, + "learning_rate": 2.8723994452149793e-06, + "loss": 0.0019, + "step": 10289 + }, + { + "epoch": 7.128507100796675, + "grad_norm": 0.10716332495212555, + "learning_rate": 2.871705963938974e-06, + "loss": 0.002, + "step": 10290 + }, + { + "epoch": 7.129199861447869, + "grad_norm": 0.15890870988368988, + "learning_rate": 2.8710124826629683e-06, + "loss": 0.0025, + "step": 10291 + }, + { + "epoch": 7.129892622099065, + "grad_norm": 0.11464457213878632, + "learning_rate": 2.8703190013869624e-06, + "loss": 0.0018, + "step": 10292 + }, + { + "epoch": 7.13058538275026, + "grad_norm": 0.41781777143478394, + "learning_rate": 2.8696255201109573e-06, + "loss": 0.0027, + "step": 10293 + }, + { + "epoch": 7.131278143401455, + "grad_norm": 0.07067586481571198, + "learning_rate": 2.868932038834952e-06, + "loss": 0.0015, + "step": 10294 + }, + { + "epoch": 7.1319709040526496, + "grad_norm": 0.1502179205417633, + "learning_rate": 2.868238557558946e-06, + "loss": 0.0018, + "step": 10295 + }, + { + "epoch": 7.132663664703845, + "grad_norm": 0.11330417543649673, + "learning_rate": 2.8675450762829405e-06, + "loss": 0.0023, + "step": 10296 + }, + { + "epoch": 7.13335642535504, + "grad_norm": 0.09089425951242447, + "learning_rate": 2.866851595006935e-06, + "loss": 0.0017, + "step": 10297 + }, + { + "epoch": 7.1340491860062345, + "grad_norm": 0.13021895289421082, + "learning_rate": 2.8661581137309295e-06, + "loss": 0.002, + "step": 10298 + }, + { + "epoch": 7.13474194665743, + "grad_norm": 0.2148451954126358, + "learning_rate": 2.865464632454924e-06, + "loss": 0.0021, + "step": 10299 + }, + { + "epoch": 7.135434707308625, + "grad_norm": 0.2965351343154907, + "learning_rate": 2.8647711511789185e-06, + "loss": 0.0021, + "step": 10300 + }, + { + "epoch": 7.13612746795982, + "grad_norm": 0.13132959604263306, + "learning_rate": 2.8640776699029126e-06, + "loss": 0.0021, + "step": 10301 + }, + { + "epoch": 7.136820228611015, + "grad_norm": 0.1508658528327942, + "learning_rate": 2.863384188626907e-06, + "loss": 0.0016, + "step": 10302 + }, + { + "epoch": 7.13751298926221, + "grad_norm": 0.19183248281478882, + "learning_rate": 2.862690707350902e-06, + "loss": 0.0016, + "step": 10303 + }, + { + "epoch": 7.138205749913405, + "grad_norm": 0.07367881387472153, + "learning_rate": 2.861997226074896e-06, + "loss": 0.0016, + "step": 10304 + }, + { + "epoch": 7.1388985105646, + "grad_norm": 0.33188414573669434, + "learning_rate": 2.8613037447988906e-06, + "loss": 0.0019, + "step": 10305 + }, + { + "epoch": 7.139591271215795, + "grad_norm": 0.23595505952835083, + "learning_rate": 2.860610263522885e-06, + "loss": 0.0023, + "step": 10306 + }, + { + "epoch": 7.14028403186699, + "grad_norm": 0.1040181890130043, + "learning_rate": 2.8599167822468792e-06, + "loss": 0.0017, + "step": 10307 + }, + { + "epoch": 7.140976792518185, + "grad_norm": 0.13215972483158112, + "learning_rate": 2.859223300970874e-06, + "loss": 0.0024, + "step": 10308 + }, + { + "epoch": 7.14166955316938, + "grad_norm": 0.11046463996171951, + "learning_rate": 2.8585298196948687e-06, + "loss": 0.0018, + "step": 10309 + }, + { + "epoch": 7.142362313820575, + "grad_norm": 0.2333577275276184, + "learning_rate": 2.8578363384188627e-06, + "loss": 0.0025, + "step": 10310 + }, + { + "epoch": 7.14305507447177, + "grad_norm": 0.18429487943649292, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0028, + "step": 10311 + }, + { + "epoch": 7.143747835122965, + "grad_norm": 0.12696926295757294, + "learning_rate": 2.8564493758668518e-06, + "loss": 0.0017, + "step": 10312 + }, + { + "epoch": 7.14444059577416, + "grad_norm": 0.1349647343158722, + "learning_rate": 2.8557558945908463e-06, + "loss": 0.0021, + "step": 10313 + }, + { + "epoch": 7.145133356425355, + "grad_norm": 0.1701875627040863, + "learning_rate": 2.8550624133148408e-06, + "loss": 0.002, + "step": 10314 + }, + { + "epoch": 7.14582611707655, + "grad_norm": 0.1400025486946106, + "learning_rate": 2.8543689320388353e-06, + "loss": 0.0024, + "step": 10315 + }, + { + "epoch": 7.146518877727745, + "grad_norm": 0.19550296664237976, + "learning_rate": 2.8536754507628294e-06, + "loss": 0.002, + "step": 10316 + }, + { + "epoch": 7.14721163837894, + "grad_norm": 0.2161799818277359, + "learning_rate": 2.852981969486824e-06, + "loss": 0.0024, + "step": 10317 + }, + { + "epoch": 7.147904399030135, + "grad_norm": 0.06708332896232605, + "learning_rate": 2.852288488210819e-06, + "loss": 0.0016, + "step": 10318 + }, + { + "epoch": 7.14859715968133, + "grad_norm": 0.11973574757575989, + "learning_rate": 2.851595006934813e-06, + "loss": 0.0025, + "step": 10319 + }, + { + "epoch": 7.149289920332525, + "grad_norm": 0.08611651510000229, + "learning_rate": 2.8509015256588074e-06, + "loss": 0.0016, + "step": 10320 + }, + { + "epoch": 7.14998268098372, + "grad_norm": 0.13845081627368927, + "learning_rate": 2.850208044382802e-06, + "loss": 0.0019, + "step": 10321 + }, + { + "epoch": 7.150675441634915, + "grad_norm": 0.06957918405532837, + "learning_rate": 2.849514563106796e-06, + "loss": 0.0017, + "step": 10322 + }, + { + "epoch": 7.15136820228611, + "grad_norm": 0.10618674755096436, + "learning_rate": 2.848821081830791e-06, + "loss": 0.0021, + "step": 10323 + }, + { + "epoch": 7.1520609629373055, + "grad_norm": 0.4333973824977875, + "learning_rate": 2.8481276005547855e-06, + "loss": 0.004, + "step": 10324 + }, + { + "epoch": 7.1527537235885, + "grad_norm": 0.08355173468589783, + "learning_rate": 2.8474341192787795e-06, + "loss": 0.0016, + "step": 10325 + }, + { + "epoch": 7.153446484239695, + "grad_norm": 0.07392586767673492, + "learning_rate": 2.846740638002774e-06, + "loss": 0.0015, + "step": 10326 + }, + { + "epoch": 7.1541392448908905, + "grad_norm": 0.13416960835456848, + "learning_rate": 2.846047156726768e-06, + "loss": 0.0016, + "step": 10327 + }, + { + "epoch": 7.154832005542085, + "grad_norm": 0.1664823740720749, + "learning_rate": 2.845353675450763e-06, + "loss": 0.0019, + "step": 10328 + }, + { + "epoch": 7.15552476619328, + "grad_norm": 0.07663372159004211, + "learning_rate": 2.8446601941747576e-06, + "loss": 0.0015, + "step": 10329 + }, + { + "epoch": 7.156217526844475, + "grad_norm": 0.27967923879623413, + "learning_rate": 2.843966712898752e-06, + "loss": 0.002, + "step": 10330 + }, + { + "epoch": 7.15691028749567, + "grad_norm": 0.15132340788841248, + "learning_rate": 2.843273231622746e-06, + "loss": 0.0018, + "step": 10331 + }, + { + "epoch": 7.157603048146865, + "grad_norm": 0.114525206387043, + "learning_rate": 2.8425797503467407e-06, + "loss": 0.0016, + "step": 10332 + }, + { + "epoch": 7.15829580879806, + "grad_norm": 0.07676137238740921, + "learning_rate": 2.8418862690707356e-06, + "loss": 0.0016, + "step": 10333 + }, + { + "epoch": 7.158988569449256, + "grad_norm": 0.1388207972049713, + "learning_rate": 2.8411927877947297e-06, + "loss": 0.002, + "step": 10334 + }, + { + "epoch": 7.15968133010045, + "grad_norm": 0.33709225058555603, + "learning_rate": 2.8404993065187242e-06, + "loss": 0.0031, + "step": 10335 + }, + { + "epoch": 7.160374090751645, + "grad_norm": 0.1640513390302658, + "learning_rate": 2.8398058252427187e-06, + "loss": 0.0024, + "step": 10336 + }, + { + "epoch": 7.161066851402841, + "grad_norm": 0.13665960729122162, + "learning_rate": 2.839112343966713e-06, + "loss": 0.0018, + "step": 10337 + }, + { + "epoch": 7.161759612054035, + "grad_norm": 0.09487101435661316, + "learning_rate": 2.8384188626907078e-06, + "loss": 0.0017, + "step": 10338 + }, + { + "epoch": 7.16245237270523, + "grad_norm": 0.12235639989376068, + "learning_rate": 2.8377253814147023e-06, + "loss": 0.0016, + "step": 10339 + }, + { + "epoch": 7.1631451333564256, + "grad_norm": 0.14719174802303314, + "learning_rate": 2.8370319001386963e-06, + "loss": 0.0019, + "step": 10340 + }, + { + "epoch": 7.16383789400762, + "grad_norm": 0.15159761905670166, + "learning_rate": 2.836338418862691e-06, + "loss": 0.0018, + "step": 10341 + }, + { + "epoch": 7.164530654658815, + "grad_norm": 0.08222801238298416, + "learning_rate": 2.835644937586685e-06, + "loss": 0.0016, + "step": 10342 + }, + { + "epoch": 7.1652234153100105, + "grad_norm": 0.08563582599163055, + "learning_rate": 2.83495145631068e-06, + "loss": 0.0014, + "step": 10343 + }, + { + "epoch": 7.165916175961206, + "grad_norm": 0.24724319577217102, + "learning_rate": 2.8342579750346744e-06, + "loss": 0.0059, + "step": 10344 + }, + { + "epoch": 7.1666089366124, + "grad_norm": 0.2744807004928589, + "learning_rate": 2.833564493758669e-06, + "loss": 0.0035, + "step": 10345 + }, + { + "epoch": 7.1673016972635955, + "grad_norm": 0.17966726422309875, + "learning_rate": 2.832871012482663e-06, + "loss": 0.0034, + "step": 10346 + }, + { + "epoch": 7.167994457914791, + "grad_norm": 0.08111991733312607, + "learning_rate": 2.8321775312066575e-06, + "loss": 0.0013, + "step": 10347 + }, + { + "epoch": 7.168687218565985, + "grad_norm": 0.16162936389446259, + "learning_rate": 2.8314840499306524e-06, + "loss": 0.0021, + "step": 10348 + }, + { + "epoch": 7.16937997921718, + "grad_norm": 0.10621341317892075, + "learning_rate": 2.8307905686546465e-06, + "loss": 0.0014, + "step": 10349 + }, + { + "epoch": 7.170072739868376, + "grad_norm": 0.08825115859508514, + "learning_rate": 2.830097087378641e-06, + "loss": 0.0015, + "step": 10350 + }, + { + "epoch": 7.17076550051957, + "grad_norm": 0.06684909760951996, + "learning_rate": 2.829403606102635e-06, + "loss": 0.0015, + "step": 10351 + }, + { + "epoch": 7.171458261170765, + "grad_norm": 0.19890569150447845, + "learning_rate": 2.8287101248266296e-06, + "loss": 0.0023, + "step": 10352 + }, + { + "epoch": 7.172151021821961, + "grad_norm": 0.17000828683376312, + "learning_rate": 2.8280166435506246e-06, + "loss": 0.002, + "step": 10353 + }, + { + "epoch": 7.172843782473156, + "grad_norm": 0.11930099129676819, + "learning_rate": 2.827323162274619e-06, + "loss": 0.0021, + "step": 10354 + }, + { + "epoch": 7.17353654312435, + "grad_norm": 0.05308936536312103, + "learning_rate": 2.826629680998613e-06, + "loss": 0.0013, + "step": 10355 + }, + { + "epoch": 7.174229303775546, + "grad_norm": 0.06604034453630447, + "learning_rate": 2.8259361997226077e-06, + "loss": 0.0015, + "step": 10356 + }, + { + "epoch": 7.174922064426741, + "grad_norm": 0.13203255832195282, + "learning_rate": 2.8252427184466017e-06, + "loss": 0.0016, + "step": 10357 + }, + { + "epoch": 7.175614825077935, + "grad_norm": 0.1402018964290619, + "learning_rate": 2.8245492371705967e-06, + "loss": 0.0026, + "step": 10358 + }, + { + "epoch": 7.1763075857291305, + "grad_norm": 0.15407191216945648, + "learning_rate": 2.823855755894591e-06, + "loss": 0.0014, + "step": 10359 + }, + { + "epoch": 7.177000346380326, + "grad_norm": 0.12346000224351883, + "learning_rate": 2.8231622746185857e-06, + "loss": 0.0024, + "step": 10360 + }, + { + "epoch": 7.17769310703152, + "grad_norm": 0.0611688531935215, + "learning_rate": 2.8224687933425798e-06, + "loss": 0.0015, + "step": 10361 + }, + { + "epoch": 7.1783858676827155, + "grad_norm": 0.05798180028796196, + "learning_rate": 2.8217753120665743e-06, + "loss": 0.0015, + "step": 10362 + }, + { + "epoch": 7.179078628333911, + "grad_norm": 0.09550177305936813, + "learning_rate": 2.8210818307905692e-06, + "loss": 0.0016, + "step": 10363 + }, + { + "epoch": 7.179771388985106, + "grad_norm": 0.05496978014707565, + "learning_rate": 2.8203883495145633e-06, + "loss": 0.0013, + "step": 10364 + }, + { + "epoch": 7.1804641496363, + "grad_norm": 0.07199720293283463, + "learning_rate": 2.819694868238558e-06, + "loss": 0.0014, + "step": 10365 + }, + { + "epoch": 7.181156910287496, + "grad_norm": 0.2144985795021057, + "learning_rate": 2.819001386962552e-06, + "loss": 0.003, + "step": 10366 + }, + { + "epoch": 7.181849670938691, + "grad_norm": 0.1355004459619522, + "learning_rate": 2.8183079056865464e-06, + "loss": 0.0017, + "step": 10367 + }, + { + "epoch": 7.182542431589885, + "grad_norm": 0.08780473470687866, + "learning_rate": 2.8176144244105414e-06, + "loss": 0.0022, + "step": 10368 + }, + { + "epoch": 7.183235192241081, + "grad_norm": 0.1346195638179779, + "learning_rate": 2.816920943134536e-06, + "loss": 0.0019, + "step": 10369 + }, + { + "epoch": 7.183927952892276, + "grad_norm": 0.1946592628955841, + "learning_rate": 2.81622746185853e-06, + "loss": 0.0022, + "step": 10370 + }, + { + "epoch": 7.18462071354347, + "grad_norm": 0.1239401251077652, + "learning_rate": 2.8155339805825245e-06, + "loss": 0.0016, + "step": 10371 + }, + { + "epoch": 7.185313474194666, + "grad_norm": 0.13771218061447144, + "learning_rate": 2.8148404993065185e-06, + "loss": 0.0018, + "step": 10372 + }, + { + "epoch": 7.186006234845861, + "grad_norm": 0.15504243969917297, + "learning_rate": 2.8141470180305135e-06, + "loss": 0.0021, + "step": 10373 + }, + { + "epoch": 7.186698995497056, + "grad_norm": 0.09864521771669388, + "learning_rate": 2.813453536754508e-06, + "loss": 0.0015, + "step": 10374 + }, + { + "epoch": 7.1873917561482505, + "grad_norm": 0.07116943597793579, + "learning_rate": 2.812760055478502e-06, + "loss": 0.0016, + "step": 10375 + }, + { + "epoch": 7.188084516799446, + "grad_norm": 0.08147519081830978, + "learning_rate": 2.8120665742024966e-06, + "loss": 0.0019, + "step": 10376 + }, + { + "epoch": 7.188777277450641, + "grad_norm": 0.17966683208942413, + "learning_rate": 2.811373092926491e-06, + "loss": 0.0025, + "step": 10377 + }, + { + "epoch": 7.1894700381018355, + "grad_norm": 0.06896746158599854, + "learning_rate": 2.810679611650486e-06, + "loss": 0.0014, + "step": 10378 + }, + { + "epoch": 7.190162798753031, + "grad_norm": 0.06997902691364288, + "learning_rate": 2.80998613037448e-06, + "loss": 0.0017, + "step": 10379 + }, + { + "epoch": 7.190855559404226, + "grad_norm": 0.055900491774082184, + "learning_rate": 2.8092926490984746e-06, + "loss": 0.0014, + "step": 10380 + }, + { + "epoch": 7.19154832005542, + "grad_norm": 0.1522582769393921, + "learning_rate": 2.8085991678224687e-06, + "loss": 0.002, + "step": 10381 + }, + { + "epoch": 7.192241080706616, + "grad_norm": 0.2514250874519348, + "learning_rate": 2.8079056865464632e-06, + "loss": 0.0019, + "step": 10382 + }, + { + "epoch": 7.192933841357811, + "grad_norm": 0.06587401777505875, + "learning_rate": 2.807212205270458e-06, + "loss": 0.0014, + "step": 10383 + }, + { + "epoch": 7.193626602009006, + "grad_norm": 0.11855365335941315, + "learning_rate": 2.8065187239944527e-06, + "loss": 0.0017, + "step": 10384 + }, + { + "epoch": 7.194319362660201, + "grad_norm": 0.1970524936914444, + "learning_rate": 2.8058252427184468e-06, + "loss": 0.0019, + "step": 10385 + }, + { + "epoch": 7.195012123311396, + "grad_norm": 0.4181351959705353, + "learning_rate": 2.8051317614424413e-06, + "loss": 0.002, + "step": 10386 + }, + { + "epoch": 7.195704883962591, + "grad_norm": 0.10302127152681351, + "learning_rate": 2.8044382801664353e-06, + "loss": 0.0017, + "step": 10387 + }, + { + "epoch": 7.196397644613786, + "grad_norm": 0.13763552904129028, + "learning_rate": 2.8037447988904303e-06, + "loss": 0.0016, + "step": 10388 + }, + { + "epoch": 7.197090405264981, + "grad_norm": 0.3948575258255005, + "learning_rate": 2.803051317614425e-06, + "loss": 0.0024, + "step": 10389 + }, + { + "epoch": 7.197783165916176, + "grad_norm": 0.5052704811096191, + "learning_rate": 2.802357836338419e-06, + "loss": 0.0035, + "step": 10390 + }, + { + "epoch": 7.198475926567371, + "grad_norm": 0.13870610296726227, + "learning_rate": 2.8016643550624134e-06, + "loss": 0.0023, + "step": 10391 + }, + { + "epoch": 7.199168687218566, + "grad_norm": 0.08997498452663422, + "learning_rate": 2.800970873786408e-06, + "loss": 0.0014, + "step": 10392 + }, + { + "epoch": 7.199861447869761, + "grad_norm": 0.10391823947429657, + "learning_rate": 2.800277392510403e-06, + "loss": 0.0017, + "step": 10393 + }, + { + "epoch": 7.200554208520956, + "grad_norm": 0.060874272137880325, + "learning_rate": 2.799583911234397e-06, + "loss": 0.0015, + "step": 10394 + }, + { + "epoch": 7.201246969172151, + "grad_norm": 0.2799158990383148, + "learning_rate": 2.7988904299583914e-06, + "loss": 0.0026, + "step": 10395 + }, + { + "epoch": 7.201939729823346, + "grad_norm": 0.285510390996933, + "learning_rate": 2.7981969486823855e-06, + "loss": 0.0029, + "step": 10396 + }, + { + "epoch": 7.202632490474541, + "grad_norm": 0.15282388031482697, + "learning_rate": 2.79750346740638e-06, + "loss": 0.0019, + "step": 10397 + }, + { + "epoch": 7.203325251125736, + "grad_norm": 0.144027441740036, + "learning_rate": 2.796809986130375e-06, + "loss": 0.0018, + "step": 10398 + }, + { + "epoch": 7.204018011776931, + "grad_norm": 0.0793803483247757, + "learning_rate": 2.796116504854369e-06, + "loss": 0.0016, + "step": 10399 + }, + { + "epoch": 7.204710772428126, + "grad_norm": 0.10301689803600311, + "learning_rate": 2.7954230235783636e-06, + "loss": 0.0016, + "step": 10400 + }, + { + "epoch": 7.205403533079321, + "grad_norm": 0.28356364369392395, + "learning_rate": 2.794729542302358e-06, + "loss": 0.0026, + "step": 10401 + }, + { + "epoch": 7.206096293730516, + "grad_norm": 0.21687909960746765, + "learning_rate": 2.794036061026352e-06, + "loss": 0.0032, + "step": 10402 + }, + { + "epoch": 7.206789054381711, + "grad_norm": 0.08599995076656342, + "learning_rate": 2.793342579750347e-06, + "loss": 0.0016, + "step": 10403 + }, + { + "epoch": 7.2074818150329065, + "grad_norm": 0.17210939526557922, + "learning_rate": 2.7926490984743416e-06, + "loss": 0.0025, + "step": 10404 + }, + { + "epoch": 7.208174575684101, + "grad_norm": 0.07880869507789612, + "learning_rate": 2.7919556171983357e-06, + "loss": 0.0016, + "step": 10405 + }, + { + "epoch": 7.208867336335296, + "grad_norm": 0.12287131696939468, + "learning_rate": 2.79126213592233e-06, + "loss": 0.002, + "step": 10406 + }, + { + "epoch": 7.2095600969864915, + "grad_norm": 0.26594921946525574, + "learning_rate": 2.7905686546463247e-06, + "loss": 0.0017, + "step": 10407 + }, + { + "epoch": 7.210252857637686, + "grad_norm": 0.07592356950044632, + "learning_rate": 2.7898751733703196e-06, + "loss": 0.0017, + "step": 10408 + }, + { + "epoch": 7.210945618288881, + "grad_norm": 0.11640237271785736, + "learning_rate": 2.7891816920943137e-06, + "loss": 0.0018, + "step": 10409 + }, + { + "epoch": 7.211638378940076, + "grad_norm": 0.20506124198436737, + "learning_rate": 2.7884882108183082e-06, + "loss": 0.0024, + "step": 10410 + }, + { + "epoch": 7.212331139591271, + "grad_norm": 0.15286748111248016, + "learning_rate": 2.7877947295423023e-06, + "loss": 0.0018, + "step": 10411 + }, + { + "epoch": 7.213023900242466, + "grad_norm": 0.3817189335823059, + "learning_rate": 2.787101248266297e-06, + "loss": 0.0034, + "step": 10412 + }, + { + "epoch": 7.213716660893661, + "grad_norm": 0.11340080946683884, + "learning_rate": 2.7864077669902918e-06, + "loss": 0.0019, + "step": 10413 + }, + { + "epoch": 7.214409421544857, + "grad_norm": 0.11808422207832336, + "learning_rate": 2.785714285714286e-06, + "loss": 0.0016, + "step": 10414 + }, + { + "epoch": 7.215102182196051, + "grad_norm": 0.099315345287323, + "learning_rate": 2.7850208044382804e-06, + "loss": 0.0016, + "step": 10415 + }, + { + "epoch": 7.215794942847246, + "grad_norm": 0.12037818878889084, + "learning_rate": 2.784327323162275e-06, + "loss": 0.0019, + "step": 10416 + }, + { + "epoch": 7.216487703498442, + "grad_norm": 0.0628642588853836, + "learning_rate": 2.783633841886269e-06, + "loss": 0.0014, + "step": 10417 + }, + { + "epoch": 7.217180464149636, + "grad_norm": 0.12666834890842438, + "learning_rate": 2.782940360610264e-06, + "loss": 0.0022, + "step": 10418 + }, + { + "epoch": 7.217873224800831, + "grad_norm": 0.09318286925554276, + "learning_rate": 2.7822468793342584e-06, + "loss": 0.0015, + "step": 10419 + }, + { + "epoch": 7.2185659854520265, + "grad_norm": 0.1301775872707367, + "learning_rate": 2.7815533980582525e-06, + "loss": 0.0028, + "step": 10420 + }, + { + "epoch": 7.219258746103221, + "grad_norm": 0.11705788224935532, + "learning_rate": 2.780859916782247e-06, + "loss": 0.0014, + "step": 10421 + }, + { + "epoch": 7.219951506754416, + "grad_norm": 0.44201797246932983, + "learning_rate": 2.7801664355062415e-06, + "loss": 0.0016, + "step": 10422 + }, + { + "epoch": 7.2206442674056115, + "grad_norm": 0.054916124790906906, + "learning_rate": 2.779472954230236e-06, + "loss": 0.0014, + "step": 10423 + }, + { + "epoch": 7.221337028056807, + "grad_norm": 0.060670580714941025, + "learning_rate": 2.7787794729542305e-06, + "loss": 0.0012, + "step": 10424 + }, + { + "epoch": 7.222029788708001, + "grad_norm": 0.14784590899944305, + "learning_rate": 2.778085991678225e-06, + "loss": 0.0017, + "step": 10425 + }, + { + "epoch": 7.222722549359196, + "grad_norm": 0.3116993010044098, + "learning_rate": 2.777392510402219e-06, + "loss": 0.0026, + "step": 10426 + }, + { + "epoch": 7.223415310010392, + "grad_norm": 0.10167987644672394, + "learning_rate": 2.7766990291262136e-06, + "loss": 0.0016, + "step": 10427 + }, + { + "epoch": 7.224108070661586, + "grad_norm": 0.30445626378059387, + "learning_rate": 2.7760055478502086e-06, + "loss": 0.0021, + "step": 10428 + }, + { + "epoch": 7.224800831312781, + "grad_norm": 0.28299471735954285, + "learning_rate": 2.7753120665742026e-06, + "loss": 0.0025, + "step": 10429 + }, + { + "epoch": 7.225493591963977, + "grad_norm": 0.1520148664712906, + "learning_rate": 2.774618585298197e-06, + "loss": 0.002, + "step": 10430 + }, + { + "epoch": 7.226186352615171, + "grad_norm": 0.21248866617679596, + "learning_rate": 2.7739251040221917e-06, + "loss": 0.0015, + "step": 10431 + }, + { + "epoch": 7.226879113266366, + "grad_norm": 0.14709307253360748, + "learning_rate": 2.7732316227461858e-06, + "loss": 0.0022, + "step": 10432 + }, + { + "epoch": 7.227571873917562, + "grad_norm": 0.07308857142925262, + "learning_rate": 2.7725381414701807e-06, + "loss": 0.0013, + "step": 10433 + }, + { + "epoch": 7.228264634568757, + "grad_norm": 0.14815136790275574, + "learning_rate": 2.771844660194175e-06, + "loss": 0.0019, + "step": 10434 + }, + { + "epoch": 7.228957395219951, + "grad_norm": 0.09827826917171478, + "learning_rate": 2.7711511789181693e-06, + "loss": 0.0016, + "step": 10435 + }, + { + "epoch": 7.229650155871147, + "grad_norm": 0.12288874387741089, + "learning_rate": 2.770457697642164e-06, + "loss": 0.0018, + "step": 10436 + }, + { + "epoch": 7.230342916522342, + "grad_norm": 0.14140482246875763, + "learning_rate": 2.7697642163661583e-06, + "loss": 0.0019, + "step": 10437 + }, + { + "epoch": 7.231035677173536, + "grad_norm": 0.06752082705497742, + "learning_rate": 2.769070735090153e-06, + "loss": 0.0013, + "step": 10438 + }, + { + "epoch": 7.2317284378247315, + "grad_norm": 0.14494122564792633, + "learning_rate": 2.7683772538141473e-06, + "loss": 0.0017, + "step": 10439 + }, + { + "epoch": 7.232421198475927, + "grad_norm": 0.13115465641021729, + "learning_rate": 2.767683772538142e-06, + "loss": 0.0016, + "step": 10440 + }, + { + "epoch": 7.233113959127121, + "grad_norm": 0.17863360047340393, + "learning_rate": 2.766990291262136e-06, + "loss": 0.002, + "step": 10441 + }, + { + "epoch": 7.2338067197783165, + "grad_norm": 0.09369208663702011, + "learning_rate": 2.7662968099861304e-06, + "loss": 0.0018, + "step": 10442 + }, + { + "epoch": 7.234499480429512, + "grad_norm": 0.10856392979621887, + "learning_rate": 2.7656033287101254e-06, + "loss": 0.0015, + "step": 10443 + }, + { + "epoch": 7.235192241080707, + "grad_norm": 0.10256483405828476, + "learning_rate": 2.7649098474341194e-06, + "loss": 0.0016, + "step": 10444 + }, + { + "epoch": 7.235885001731901, + "grad_norm": 0.08446470648050308, + "learning_rate": 2.764216366158114e-06, + "loss": 0.0015, + "step": 10445 + }, + { + "epoch": 7.236577762383097, + "grad_norm": 0.06964675337076187, + "learning_rate": 2.7635228848821085e-06, + "loss": 0.0014, + "step": 10446 + }, + { + "epoch": 7.237270523034292, + "grad_norm": 0.09242125600576401, + "learning_rate": 2.7628294036061026e-06, + "loss": 0.0019, + "step": 10447 + }, + { + "epoch": 7.237963283685486, + "grad_norm": 0.18718264997005463, + "learning_rate": 2.7621359223300975e-06, + "loss": 0.0025, + "step": 10448 + }, + { + "epoch": 7.238656044336682, + "grad_norm": 0.09490001946687698, + "learning_rate": 2.761442441054092e-06, + "loss": 0.0016, + "step": 10449 + }, + { + "epoch": 7.239348804987877, + "grad_norm": 0.12201209366321564, + "learning_rate": 2.760748959778086e-06, + "loss": 0.0015, + "step": 10450 + }, + { + "epoch": 7.240041565639071, + "grad_norm": 0.15196803212165833, + "learning_rate": 2.7600554785020806e-06, + "loss": 0.0019, + "step": 10451 + }, + { + "epoch": 7.240734326290267, + "grad_norm": 0.07299428433179855, + "learning_rate": 2.759361997226075e-06, + "loss": 0.0015, + "step": 10452 + }, + { + "epoch": 7.241427086941462, + "grad_norm": 0.10037188977003098, + "learning_rate": 2.7586685159500696e-06, + "loss": 0.0016, + "step": 10453 + }, + { + "epoch": 7.242119847592657, + "grad_norm": 0.11262709647417068, + "learning_rate": 2.757975034674064e-06, + "loss": 0.0018, + "step": 10454 + }, + { + "epoch": 7.2428126082438515, + "grad_norm": 0.1267578899860382, + "learning_rate": 2.7572815533980586e-06, + "loss": 0.0017, + "step": 10455 + }, + { + "epoch": 7.243505368895047, + "grad_norm": 0.1580943912267685, + "learning_rate": 2.7565880721220527e-06, + "loss": 0.002, + "step": 10456 + }, + { + "epoch": 7.244198129546242, + "grad_norm": 0.13367560505867004, + "learning_rate": 2.7558945908460472e-06, + "loss": 0.003, + "step": 10457 + }, + { + "epoch": 7.2448908901974365, + "grad_norm": 0.06954680383205414, + "learning_rate": 2.755201109570042e-06, + "loss": 0.0014, + "step": 10458 + }, + { + "epoch": 7.245583650848632, + "grad_norm": 0.0899452492594719, + "learning_rate": 2.7545076282940362e-06, + "loss": 0.0017, + "step": 10459 + }, + { + "epoch": 7.246276411499827, + "grad_norm": 0.3852347731590271, + "learning_rate": 2.7538141470180308e-06, + "loss": 0.0024, + "step": 10460 + }, + { + "epoch": 7.246969172151021, + "grad_norm": 0.31897953152656555, + "learning_rate": 2.7531206657420253e-06, + "loss": 0.0022, + "step": 10461 + }, + { + "epoch": 7.247661932802217, + "grad_norm": 0.16231410205364227, + "learning_rate": 2.7524271844660194e-06, + "loss": 0.0017, + "step": 10462 + }, + { + "epoch": 7.248354693453412, + "grad_norm": 0.1474291831254959, + "learning_rate": 2.7517337031900143e-06, + "loss": 0.0019, + "step": 10463 + }, + { + "epoch": 7.249047454104607, + "grad_norm": 0.13489697873592377, + "learning_rate": 2.751040221914009e-06, + "loss": 0.002, + "step": 10464 + }, + { + "epoch": 7.249740214755802, + "grad_norm": 0.15055708587169647, + "learning_rate": 2.750346740638003e-06, + "loss": 0.0017, + "step": 10465 + }, + { + "epoch": 7.250432975406997, + "grad_norm": 0.09993751347064972, + "learning_rate": 2.7496532593619974e-06, + "loss": 0.0016, + "step": 10466 + }, + { + "epoch": 7.251125736058192, + "grad_norm": 0.09920458495616913, + "learning_rate": 2.7489597780859915e-06, + "loss": 0.0017, + "step": 10467 + }, + { + "epoch": 7.251818496709387, + "grad_norm": 0.08457056432962418, + "learning_rate": 2.7482662968099864e-06, + "loss": 0.0016, + "step": 10468 + }, + { + "epoch": 7.252511257360582, + "grad_norm": 0.15224045515060425, + "learning_rate": 2.747572815533981e-06, + "loss": 0.002, + "step": 10469 + }, + { + "epoch": 7.253204018011777, + "grad_norm": 0.12078787386417389, + "learning_rate": 2.7468793342579754e-06, + "loss": 0.0015, + "step": 10470 + }, + { + "epoch": 7.253896778662972, + "grad_norm": 0.1865270733833313, + "learning_rate": 2.7461858529819695e-06, + "loss": 0.0031, + "step": 10471 + }, + { + "epoch": 7.254589539314167, + "grad_norm": 0.1568903774023056, + "learning_rate": 2.745492371705964e-06, + "loss": 0.0035, + "step": 10472 + }, + { + "epoch": 7.255282299965362, + "grad_norm": 0.14186471700668335, + "learning_rate": 2.744798890429959e-06, + "loss": 0.0021, + "step": 10473 + }, + { + "epoch": 7.255975060616557, + "grad_norm": 0.09129304438829422, + "learning_rate": 2.744105409153953e-06, + "loss": 0.0016, + "step": 10474 + }, + { + "epoch": 7.256667821267752, + "grad_norm": 0.48369312286376953, + "learning_rate": 2.7434119278779476e-06, + "loss": 0.0029, + "step": 10475 + }, + { + "epoch": 7.257360581918947, + "grad_norm": 0.07339184731245041, + "learning_rate": 2.742718446601942e-06, + "loss": 0.0015, + "step": 10476 + }, + { + "epoch": 7.258053342570142, + "grad_norm": 0.07599583268165588, + "learning_rate": 2.742024965325936e-06, + "loss": 0.0014, + "step": 10477 + }, + { + "epoch": 7.258746103221337, + "grad_norm": 0.1359255611896515, + "learning_rate": 2.741331484049931e-06, + "loss": 0.0019, + "step": 10478 + }, + { + "epoch": 7.259438863872532, + "grad_norm": 0.09700708836317062, + "learning_rate": 2.7406380027739256e-06, + "loss": 0.0017, + "step": 10479 + }, + { + "epoch": 7.260131624523727, + "grad_norm": 0.2100406438112259, + "learning_rate": 2.7399445214979197e-06, + "loss": 0.0031, + "step": 10480 + }, + { + "epoch": 7.260824385174922, + "grad_norm": 0.12372469156980515, + "learning_rate": 2.739251040221914e-06, + "loss": 0.0016, + "step": 10481 + }, + { + "epoch": 7.261517145826117, + "grad_norm": 0.0920005515217781, + "learning_rate": 2.7385575589459083e-06, + "loss": 0.0016, + "step": 10482 + }, + { + "epoch": 7.262209906477312, + "grad_norm": 0.11117003113031387, + "learning_rate": 2.7378640776699032e-06, + "loss": 0.0018, + "step": 10483 + }, + { + "epoch": 7.2629026671285075, + "grad_norm": 0.0973900705575943, + "learning_rate": 2.7371705963938977e-06, + "loss": 0.0018, + "step": 10484 + }, + { + "epoch": 7.263595427779702, + "grad_norm": 0.1261003464460373, + "learning_rate": 2.7364771151178922e-06, + "loss": 0.0017, + "step": 10485 + }, + { + "epoch": 7.264288188430897, + "grad_norm": 0.06539832800626755, + "learning_rate": 2.7357836338418863e-06, + "loss": 0.0015, + "step": 10486 + }, + { + "epoch": 7.2649809490820925, + "grad_norm": 0.09684585779905319, + "learning_rate": 2.735090152565881e-06, + "loss": 0.0015, + "step": 10487 + }, + { + "epoch": 7.265673709733287, + "grad_norm": 0.09772973507642746, + "learning_rate": 2.7343966712898758e-06, + "loss": 0.0016, + "step": 10488 + }, + { + "epoch": 7.266366470384482, + "grad_norm": 0.12444275617599487, + "learning_rate": 2.73370319001387e-06, + "loss": 0.0016, + "step": 10489 + }, + { + "epoch": 7.267059231035677, + "grad_norm": 0.09388227760791779, + "learning_rate": 2.7330097087378644e-06, + "loss": 0.0016, + "step": 10490 + }, + { + "epoch": 7.267751991686872, + "grad_norm": 0.10781733691692352, + "learning_rate": 2.7323162274618584e-06, + "loss": 0.002, + "step": 10491 + }, + { + "epoch": 7.268444752338067, + "grad_norm": 0.26906630396842957, + "learning_rate": 2.731622746185853e-06, + "loss": 0.0023, + "step": 10492 + }, + { + "epoch": 7.269137512989262, + "grad_norm": 0.10488678514957428, + "learning_rate": 2.730929264909848e-06, + "loss": 0.0016, + "step": 10493 + }, + { + "epoch": 7.269830273640457, + "grad_norm": 0.18185140192508698, + "learning_rate": 2.7302357836338424e-06, + "loss": 0.0029, + "step": 10494 + }, + { + "epoch": 7.270523034291652, + "grad_norm": 0.09366568177938461, + "learning_rate": 2.7295423023578365e-06, + "loss": 0.0014, + "step": 10495 + }, + { + "epoch": 7.271215794942847, + "grad_norm": 0.7352327704429626, + "learning_rate": 2.728848821081831e-06, + "loss": 0.0024, + "step": 10496 + }, + { + "epoch": 7.271908555594043, + "grad_norm": 0.20652872323989868, + "learning_rate": 2.728155339805825e-06, + "loss": 0.0019, + "step": 10497 + }, + { + "epoch": 7.272601316245237, + "grad_norm": 0.0690193846821785, + "learning_rate": 2.72746185852982e-06, + "loss": 0.0013, + "step": 10498 + }, + { + "epoch": 7.273294076896432, + "grad_norm": 0.1919272243976593, + "learning_rate": 2.7267683772538145e-06, + "loss": 0.0018, + "step": 10499 + }, + { + "epoch": 7.2739868375476275, + "grad_norm": 0.09269005060195923, + "learning_rate": 2.726074895977809e-06, + "loss": 0.0014, + "step": 10500 + }, + { + "epoch": 7.274679598198822, + "grad_norm": 0.15102043747901917, + "learning_rate": 2.725381414701803e-06, + "loss": 0.002, + "step": 10501 + }, + { + "epoch": 7.275372358850017, + "grad_norm": 0.1863505244255066, + "learning_rate": 2.7246879334257976e-06, + "loss": 0.0032, + "step": 10502 + }, + { + "epoch": 7.2760651195012125, + "grad_norm": 0.2828274965286255, + "learning_rate": 2.7239944521497926e-06, + "loss": 0.0019, + "step": 10503 + }, + { + "epoch": 7.276757880152408, + "grad_norm": 0.1548154056072235, + "learning_rate": 2.7233009708737867e-06, + "loss": 0.0018, + "step": 10504 + }, + { + "epoch": 7.277450640803602, + "grad_norm": 0.11284724622964859, + "learning_rate": 2.722607489597781e-06, + "loss": 0.0016, + "step": 10505 + }, + { + "epoch": 7.278143401454797, + "grad_norm": 0.09580027312040329, + "learning_rate": 2.7219140083217752e-06, + "loss": 0.0018, + "step": 10506 + }, + { + "epoch": 7.278836162105993, + "grad_norm": 0.2197761833667755, + "learning_rate": 2.7212205270457698e-06, + "loss": 0.0021, + "step": 10507 + }, + { + "epoch": 7.279528922757187, + "grad_norm": 0.08582144975662231, + "learning_rate": 2.7205270457697647e-06, + "loss": 0.0016, + "step": 10508 + }, + { + "epoch": 7.280221683408382, + "grad_norm": 0.33913543820381165, + "learning_rate": 2.719833564493759e-06, + "loss": 0.0018, + "step": 10509 + }, + { + "epoch": 7.280914444059578, + "grad_norm": 0.6199621558189392, + "learning_rate": 2.7191400832177533e-06, + "loss": 0.0022, + "step": 10510 + }, + { + "epoch": 7.281607204710772, + "grad_norm": 0.05950072035193443, + "learning_rate": 2.718446601941748e-06, + "loss": 0.0014, + "step": 10511 + }, + { + "epoch": 7.282299965361967, + "grad_norm": 0.09101162105798721, + "learning_rate": 2.717753120665742e-06, + "loss": 0.0019, + "step": 10512 + }, + { + "epoch": 7.282992726013163, + "grad_norm": 0.0685565322637558, + "learning_rate": 2.717059639389737e-06, + "loss": 0.0016, + "step": 10513 + }, + { + "epoch": 7.283685486664357, + "grad_norm": 0.3845399022102356, + "learning_rate": 2.7163661581137313e-06, + "loss": 0.0029, + "step": 10514 + }, + { + "epoch": 7.284378247315552, + "grad_norm": 0.1877196878194809, + "learning_rate": 2.7156726768377254e-06, + "loss": 0.0026, + "step": 10515 + }, + { + "epoch": 7.285071007966748, + "grad_norm": 0.29697221517562866, + "learning_rate": 2.71497919556172e-06, + "loss": 0.0033, + "step": 10516 + }, + { + "epoch": 7.285763768617943, + "grad_norm": 0.2284240871667862, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.0022, + "step": 10517 + }, + { + "epoch": 7.286456529269137, + "grad_norm": 0.2014724463224411, + "learning_rate": 2.7135922330097094e-06, + "loss": 0.0024, + "step": 10518 + }, + { + "epoch": 7.2871492899203325, + "grad_norm": 0.12144462764263153, + "learning_rate": 2.7128987517337035e-06, + "loss": 0.002, + "step": 10519 + }, + { + "epoch": 7.287842050571528, + "grad_norm": 0.15588727593421936, + "learning_rate": 2.712205270457698e-06, + "loss": 0.0021, + "step": 10520 + }, + { + "epoch": 7.288534811222722, + "grad_norm": 0.07342936098575592, + "learning_rate": 2.711511789181692e-06, + "loss": 0.0016, + "step": 10521 + }, + { + "epoch": 7.2892275718739175, + "grad_norm": 0.09196361899375916, + "learning_rate": 2.7108183079056866e-06, + "loss": 0.0017, + "step": 10522 + }, + { + "epoch": 7.289920332525113, + "grad_norm": 0.4919677674770355, + "learning_rate": 2.7101248266296815e-06, + "loss": 0.0026, + "step": 10523 + }, + { + "epoch": 7.290613093176308, + "grad_norm": 0.13308435678482056, + "learning_rate": 2.709431345353676e-06, + "loss": 0.0017, + "step": 10524 + }, + { + "epoch": 7.291305853827502, + "grad_norm": 0.09319281578063965, + "learning_rate": 2.70873786407767e-06, + "loss": 0.0016, + "step": 10525 + }, + { + "epoch": 7.291998614478698, + "grad_norm": 0.07991555333137512, + "learning_rate": 2.7080443828016646e-06, + "loss": 0.0015, + "step": 10526 + }, + { + "epoch": 7.292691375129893, + "grad_norm": 0.6627943515777588, + "learning_rate": 2.7073509015256587e-06, + "loss": 0.002, + "step": 10527 + }, + { + "epoch": 7.293384135781087, + "grad_norm": 0.1489666849374771, + "learning_rate": 2.7066574202496536e-06, + "loss": 0.0028, + "step": 10528 + }, + { + "epoch": 7.294076896432283, + "grad_norm": 0.1293557733297348, + "learning_rate": 2.705963938973648e-06, + "loss": 0.0019, + "step": 10529 + }, + { + "epoch": 7.294769657083478, + "grad_norm": 0.10962218791246414, + "learning_rate": 2.7052704576976422e-06, + "loss": 0.0017, + "step": 10530 + }, + { + "epoch": 7.295462417734672, + "grad_norm": 0.11898452043533325, + "learning_rate": 2.7045769764216367e-06, + "loss": 0.0017, + "step": 10531 + }, + { + "epoch": 7.296155178385868, + "grad_norm": 0.3060879111289978, + "learning_rate": 2.7038834951456312e-06, + "loss": 0.0035, + "step": 10532 + }, + { + "epoch": 7.296847939037063, + "grad_norm": 0.17742903530597687, + "learning_rate": 2.703190013869626e-06, + "loss": 0.0017, + "step": 10533 + }, + { + "epoch": 7.297540699688257, + "grad_norm": 0.1303330510854721, + "learning_rate": 2.7024965325936203e-06, + "loss": 0.0023, + "step": 10534 + }, + { + "epoch": 7.2982334603394525, + "grad_norm": 0.24945330619812012, + "learning_rate": 2.7018030513176148e-06, + "loss": 0.0025, + "step": 10535 + }, + { + "epoch": 7.298926220990648, + "grad_norm": 0.13665013015270233, + "learning_rate": 2.701109570041609e-06, + "loss": 0.0024, + "step": 10536 + }, + { + "epoch": 7.299618981641843, + "grad_norm": 0.11649353802204132, + "learning_rate": 2.7004160887656034e-06, + "loss": 0.0017, + "step": 10537 + }, + { + "epoch": 7.3003117422930375, + "grad_norm": 0.10994106531143188, + "learning_rate": 2.6997226074895983e-06, + "loss": 0.0016, + "step": 10538 + }, + { + "epoch": 7.301004502944233, + "grad_norm": 0.11216150224208832, + "learning_rate": 2.6990291262135924e-06, + "loss": 0.0016, + "step": 10539 + }, + { + "epoch": 7.301697263595428, + "grad_norm": 0.1115545853972435, + "learning_rate": 2.698335644937587e-06, + "loss": 0.0018, + "step": 10540 + }, + { + "epoch": 7.302390024246622, + "grad_norm": 0.09486105293035507, + "learning_rate": 2.6976421636615814e-06, + "loss": 0.0017, + "step": 10541 + }, + { + "epoch": 7.303082784897818, + "grad_norm": 0.10198003798723221, + "learning_rate": 2.6969486823855755e-06, + "loss": 0.0016, + "step": 10542 + }, + { + "epoch": 7.303775545549013, + "grad_norm": 0.08961523324251175, + "learning_rate": 2.6962552011095704e-06, + "loss": 0.0017, + "step": 10543 + }, + { + "epoch": 7.304468306200208, + "grad_norm": 0.2805187702178955, + "learning_rate": 2.695561719833565e-06, + "loss": 0.0019, + "step": 10544 + }, + { + "epoch": 7.305161066851403, + "grad_norm": 0.1289357841014862, + "learning_rate": 2.694868238557559e-06, + "loss": 0.0021, + "step": 10545 + }, + { + "epoch": 7.305853827502598, + "grad_norm": 0.32471606135368347, + "learning_rate": 2.6941747572815535e-06, + "loss": 0.0024, + "step": 10546 + }, + { + "epoch": 7.306546588153793, + "grad_norm": 0.09396038949489594, + "learning_rate": 2.693481276005548e-06, + "loss": 0.0015, + "step": 10547 + }, + { + "epoch": 7.307239348804988, + "grad_norm": 0.1599978804588318, + "learning_rate": 2.692787794729543e-06, + "loss": 0.0022, + "step": 10548 + }, + { + "epoch": 7.307932109456183, + "grad_norm": 0.1154475212097168, + "learning_rate": 2.692094313453537e-06, + "loss": 0.0019, + "step": 10549 + }, + { + "epoch": 7.308624870107378, + "grad_norm": 0.14504484832286835, + "learning_rate": 2.6914008321775316e-06, + "loss": 0.0017, + "step": 10550 + }, + { + "epoch": 7.3093176307585725, + "grad_norm": 0.10207211971282959, + "learning_rate": 2.6907073509015257e-06, + "loss": 0.0016, + "step": 10551 + }, + { + "epoch": 7.310010391409768, + "grad_norm": 0.08849812299013138, + "learning_rate": 2.69001386962552e-06, + "loss": 0.0016, + "step": 10552 + }, + { + "epoch": 7.310703152060963, + "grad_norm": 0.14090637862682343, + "learning_rate": 2.689320388349515e-06, + "loss": 0.0016, + "step": 10553 + }, + { + "epoch": 7.3113959127121575, + "grad_norm": 0.14641524851322174, + "learning_rate": 2.688626907073509e-06, + "loss": 0.0016, + "step": 10554 + }, + { + "epoch": 7.312088673363353, + "grad_norm": 0.18492135405540466, + "learning_rate": 2.6879334257975037e-06, + "loss": 0.0033, + "step": 10555 + }, + { + "epoch": 7.312781434014548, + "grad_norm": 0.07164610177278519, + "learning_rate": 2.687239944521498e-06, + "loss": 0.0014, + "step": 10556 + }, + { + "epoch": 7.313474194665743, + "grad_norm": 0.09530164301395416, + "learning_rate": 2.6865464632454923e-06, + "loss": 0.0016, + "step": 10557 + }, + { + "epoch": 7.314166955316938, + "grad_norm": 0.09830351918935776, + "learning_rate": 2.6858529819694872e-06, + "loss": 0.0014, + "step": 10558 + }, + { + "epoch": 7.314859715968133, + "grad_norm": 0.13945689797401428, + "learning_rate": 2.6851595006934817e-06, + "loss": 0.0025, + "step": 10559 + }, + { + "epoch": 7.315552476619328, + "grad_norm": 0.157485231757164, + "learning_rate": 2.684466019417476e-06, + "loss": 0.0022, + "step": 10560 + }, + { + "epoch": 7.316245237270523, + "grad_norm": 0.16562585532665253, + "learning_rate": 2.6837725381414703e-06, + "loss": 0.0018, + "step": 10561 + }, + { + "epoch": 7.316937997921718, + "grad_norm": 0.20526468753814697, + "learning_rate": 2.683079056865465e-06, + "loss": 0.0013, + "step": 10562 + }, + { + "epoch": 7.317630758572913, + "grad_norm": 0.0601305291056633, + "learning_rate": 2.6823855755894593e-06, + "loss": 0.0013, + "step": 10563 + }, + { + "epoch": 7.3183235192241085, + "grad_norm": 0.12286640703678131, + "learning_rate": 2.681692094313454e-06, + "loss": 0.0017, + "step": 10564 + }, + { + "epoch": 7.319016279875303, + "grad_norm": 0.08743162453174591, + "learning_rate": 2.6809986130374484e-06, + "loss": 0.0016, + "step": 10565 + }, + { + "epoch": 7.319709040526498, + "grad_norm": 0.2952720820903778, + "learning_rate": 2.6803051317614425e-06, + "loss": 0.0022, + "step": 10566 + }, + { + "epoch": 7.3204018011776935, + "grad_norm": 0.1119878888130188, + "learning_rate": 2.679611650485437e-06, + "loss": 0.0023, + "step": 10567 + }, + { + "epoch": 7.321094561828888, + "grad_norm": 0.10538679361343384, + "learning_rate": 2.678918169209432e-06, + "loss": 0.0014, + "step": 10568 + }, + { + "epoch": 7.321787322480083, + "grad_norm": 0.4837396442890167, + "learning_rate": 2.678224687933426e-06, + "loss": 0.0018, + "step": 10569 + }, + { + "epoch": 7.322480083131278, + "grad_norm": 0.10609745234251022, + "learning_rate": 2.6775312066574205e-06, + "loss": 0.0015, + "step": 10570 + }, + { + "epoch": 7.323172843782473, + "grad_norm": 0.08468212187290192, + "learning_rate": 2.676837725381415e-06, + "loss": 0.0015, + "step": 10571 + }, + { + "epoch": 7.323865604433668, + "grad_norm": 0.17318294942378998, + "learning_rate": 2.676144244105409e-06, + "loss": 0.0027, + "step": 10572 + }, + { + "epoch": 7.324558365084863, + "grad_norm": 0.15581314265727997, + "learning_rate": 2.675450762829404e-06, + "loss": 0.0019, + "step": 10573 + }, + { + "epoch": 7.325251125736058, + "grad_norm": 0.17034558951854706, + "learning_rate": 2.6747572815533985e-06, + "loss": 0.0019, + "step": 10574 + }, + { + "epoch": 7.325943886387253, + "grad_norm": 0.17120777070522308, + "learning_rate": 2.6740638002773926e-06, + "loss": 0.0022, + "step": 10575 + }, + { + "epoch": 7.326636647038448, + "grad_norm": 0.17404325306415558, + "learning_rate": 2.673370319001387e-06, + "loss": 0.0016, + "step": 10576 + }, + { + "epoch": 7.327329407689644, + "grad_norm": 0.20107604563236237, + "learning_rate": 2.6726768377253816e-06, + "loss": 0.0022, + "step": 10577 + }, + { + "epoch": 7.328022168340838, + "grad_norm": 0.11374718695878983, + "learning_rate": 2.671983356449376e-06, + "loss": 0.0014, + "step": 10578 + }, + { + "epoch": 7.328714928992033, + "grad_norm": 0.06300130486488342, + "learning_rate": 2.6712898751733707e-06, + "loss": 0.0014, + "step": 10579 + }, + { + "epoch": 7.3294076896432285, + "grad_norm": 0.13866983354091644, + "learning_rate": 2.670596393897365e-06, + "loss": 0.0022, + "step": 10580 + }, + { + "epoch": 7.330100450294423, + "grad_norm": 0.09184422343969345, + "learning_rate": 2.6699029126213593e-06, + "loss": 0.0016, + "step": 10581 + }, + { + "epoch": 7.330793210945618, + "grad_norm": 0.2565865218639374, + "learning_rate": 2.6692094313453538e-06, + "loss": 0.002, + "step": 10582 + }, + { + "epoch": 7.3314859715968135, + "grad_norm": 0.12624496221542358, + "learning_rate": 2.6685159500693487e-06, + "loss": 0.0018, + "step": 10583 + }, + { + "epoch": 7.332178732248009, + "grad_norm": 0.3806810677051544, + "learning_rate": 2.6678224687933428e-06, + "loss": 0.0024, + "step": 10584 + }, + { + "epoch": 7.332871492899203, + "grad_norm": 0.5169533491134644, + "learning_rate": 2.6671289875173373e-06, + "loss": 0.0029, + "step": 10585 + }, + { + "epoch": 7.333564253550398, + "grad_norm": 0.1569748967885971, + "learning_rate": 2.666435506241332e-06, + "loss": 0.0017, + "step": 10586 + }, + { + "epoch": 7.334257014201594, + "grad_norm": 0.10726001113653183, + "learning_rate": 2.665742024965326e-06, + "loss": 0.0013, + "step": 10587 + }, + { + "epoch": 7.334949774852788, + "grad_norm": 0.09763527661561966, + "learning_rate": 2.665048543689321e-06, + "loss": 0.0016, + "step": 10588 + }, + { + "epoch": 7.335642535503983, + "grad_norm": 0.08869840949773788, + "learning_rate": 2.6643550624133153e-06, + "loss": 0.0018, + "step": 10589 + }, + { + "epoch": 7.336335296155179, + "grad_norm": 0.15589062869548798, + "learning_rate": 2.6636615811373094e-06, + "loss": 0.0025, + "step": 10590 + }, + { + "epoch": 7.337028056806373, + "grad_norm": 0.06364841014146805, + "learning_rate": 2.662968099861304e-06, + "loss": 0.0013, + "step": 10591 + }, + { + "epoch": 7.337720817457568, + "grad_norm": 0.1893458515405655, + "learning_rate": 2.6622746185852984e-06, + "loss": 0.002, + "step": 10592 + }, + { + "epoch": 7.338413578108764, + "grad_norm": 0.11292804777622223, + "learning_rate": 2.661581137309293e-06, + "loss": 0.0015, + "step": 10593 + }, + { + "epoch": 7.339106338759958, + "grad_norm": 0.10484236478805542, + "learning_rate": 2.6608876560332875e-06, + "loss": 0.002, + "step": 10594 + }, + { + "epoch": 7.339799099411153, + "grad_norm": 0.06305152177810669, + "learning_rate": 2.660194174757282e-06, + "loss": 0.0013, + "step": 10595 + }, + { + "epoch": 7.3404918600623486, + "grad_norm": 0.10523684322834015, + "learning_rate": 2.659500693481276e-06, + "loss": 0.0017, + "step": 10596 + }, + { + "epoch": 7.341184620713544, + "grad_norm": 0.10294146835803986, + "learning_rate": 2.6588072122052706e-06, + "loss": 0.0016, + "step": 10597 + }, + { + "epoch": 7.341877381364738, + "grad_norm": 0.11167468130588531, + "learning_rate": 2.6581137309292655e-06, + "loss": 0.0014, + "step": 10598 + }, + { + "epoch": 7.3425701420159335, + "grad_norm": 0.21104352176189423, + "learning_rate": 2.6574202496532596e-06, + "loss": 0.0021, + "step": 10599 + }, + { + "epoch": 7.343262902667129, + "grad_norm": 0.09167628735303879, + "learning_rate": 2.656726768377254e-06, + "loss": 0.0014, + "step": 10600 + }, + { + "epoch": 7.343955663318323, + "grad_norm": 0.1054004430770874, + "learning_rate": 2.6560332871012486e-06, + "loss": 0.0014, + "step": 10601 + }, + { + "epoch": 7.3446484239695184, + "grad_norm": 0.16618290543556213, + "learning_rate": 2.6553398058252427e-06, + "loss": 0.0025, + "step": 10602 + }, + { + "epoch": 7.345341184620714, + "grad_norm": 0.14775697886943817, + "learning_rate": 2.6546463245492376e-06, + "loss": 0.0014, + "step": 10603 + }, + { + "epoch": 7.346033945271909, + "grad_norm": 0.11632692068815231, + "learning_rate": 2.653952843273232e-06, + "loss": 0.0019, + "step": 10604 + }, + { + "epoch": 7.346726705923103, + "grad_norm": 0.08726738393306732, + "learning_rate": 2.6532593619972262e-06, + "loss": 0.0015, + "step": 10605 + }, + { + "epoch": 7.347419466574299, + "grad_norm": 0.1897185891866684, + "learning_rate": 2.6525658807212207e-06, + "loss": 0.0019, + "step": 10606 + }, + { + "epoch": 7.348112227225494, + "grad_norm": 0.13701261579990387, + "learning_rate": 2.651872399445215e-06, + "loss": 0.0024, + "step": 10607 + }, + { + "epoch": 7.348804987876688, + "grad_norm": 0.09947992116212845, + "learning_rate": 2.6511789181692098e-06, + "loss": 0.0018, + "step": 10608 + }, + { + "epoch": 7.349497748527884, + "grad_norm": 0.09648267179727554, + "learning_rate": 2.6504854368932043e-06, + "loss": 0.0017, + "step": 10609 + }, + { + "epoch": 7.350190509179079, + "grad_norm": 0.33638474345207214, + "learning_rate": 2.6497919556171988e-06, + "loss": 0.0022, + "step": 10610 + }, + { + "epoch": 7.350883269830273, + "grad_norm": 0.12255071103572845, + "learning_rate": 2.649098474341193e-06, + "loss": 0.0027, + "step": 10611 + }, + { + "epoch": 7.351576030481469, + "grad_norm": 0.16847288608551025, + "learning_rate": 2.6484049930651874e-06, + "loss": 0.002, + "step": 10612 + }, + { + "epoch": 7.352268791132664, + "grad_norm": 0.14912553131580353, + "learning_rate": 2.6477115117891823e-06, + "loss": 0.0025, + "step": 10613 + }, + { + "epoch": 7.352961551783858, + "grad_norm": 0.09489867091178894, + "learning_rate": 2.6470180305131764e-06, + "loss": 0.0016, + "step": 10614 + }, + { + "epoch": 7.3536543124350535, + "grad_norm": 0.21695023775100708, + "learning_rate": 2.646324549237171e-06, + "loss": 0.0022, + "step": 10615 + }, + { + "epoch": 7.354347073086249, + "grad_norm": 0.13073423504829407, + "learning_rate": 2.6456310679611654e-06, + "loss": 0.0019, + "step": 10616 + }, + { + "epoch": 7.355039833737444, + "grad_norm": 0.12255122512578964, + "learning_rate": 2.6449375866851595e-06, + "loss": 0.0015, + "step": 10617 + }, + { + "epoch": 7.3557325943886385, + "grad_norm": 0.11707830429077148, + "learning_rate": 2.6442441054091544e-06, + "loss": 0.0016, + "step": 10618 + }, + { + "epoch": 7.356425355039834, + "grad_norm": 0.08144792169332504, + "learning_rate": 2.643550624133149e-06, + "loss": 0.0016, + "step": 10619 + }, + { + "epoch": 7.357118115691029, + "grad_norm": 0.09570600837469101, + "learning_rate": 2.642857142857143e-06, + "loss": 0.0017, + "step": 10620 + }, + { + "epoch": 7.357810876342223, + "grad_norm": 0.08857254683971405, + "learning_rate": 2.6421636615811375e-06, + "loss": 0.0016, + "step": 10621 + }, + { + "epoch": 7.358503636993419, + "grad_norm": 0.07691531628370285, + "learning_rate": 2.6414701803051316e-06, + "loss": 0.0015, + "step": 10622 + }, + { + "epoch": 7.359196397644614, + "grad_norm": 0.07823880016803741, + "learning_rate": 2.6407766990291266e-06, + "loss": 0.0014, + "step": 10623 + }, + { + "epoch": 7.359889158295809, + "grad_norm": 0.08908458799123764, + "learning_rate": 2.640083217753121e-06, + "loss": 0.0015, + "step": 10624 + }, + { + "epoch": 7.360581918947004, + "grad_norm": 0.176316499710083, + "learning_rate": 2.6393897364771156e-06, + "loss": 0.0032, + "step": 10625 + }, + { + "epoch": 7.361274679598199, + "grad_norm": 0.15909786522388458, + "learning_rate": 2.6386962552011097e-06, + "loss": 0.0017, + "step": 10626 + }, + { + "epoch": 7.361967440249394, + "grad_norm": 0.21318186819553375, + "learning_rate": 2.638002773925104e-06, + "loss": 0.0022, + "step": 10627 + }, + { + "epoch": 7.362660200900589, + "grad_norm": 0.09125927835702896, + "learning_rate": 2.637309292649099e-06, + "loss": 0.0012, + "step": 10628 + }, + { + "epoch": 7.363352961551784, + "grad_norm": 0.2801209092140198, + "learning_rate": 2.636615811373093e-06, + "loss": 0.0023, + "step": 10629 + }, + { + "epoch": 7.364045722202979, + "grad_norm": 0.10344817489385605, + "learning_rate": 2.6359223300970877e-06, + "loss": 0.0015, + "step": 10630 + }, + { + "epoch": 7.3647384828541735, + "grad_norm": 0.12958082556724548, + "learning_rate": 2.6352288488210818e-06, + "loss": 0.0024, + "step": 10631 + }, + { + "epoch": 7.365431243505369, + "grad_norm": 0.06534534692764282, + "learning_rate": 2.6345353675450763e-06, + "loss": 0.0015, + "step": 10632 + }, + { + "epoch": 7.366124004156564, + "grad_norm": 0.32471805810928345, + "learning_rate": 2.6338418862690712e-06, + "loss": 0.0039, + "step": 10633 + }, + { + "epoch": 7.3668167648077585, + "grad_norm": 0.14212051033973694, + "learning_rate": 2.6331484049930657e-06, + "loss": 0.0016, + "step": 10634 + }, + { + "epoch": 7.367509525458954, + "grad_norm": 0.10433334857225418, + "learning_rate": 2.63245492371706e-06, + "loss": 0.0014, + "step": 10635 + }, + { + "epoch": 7.368202286110149, + "grad_norm": 0.10529961436986923, + "learning_rate": 2.6317614424410543e-06, + "loss": 0.0017, + "step": 10636 + }, + { + "epoch": 7.368895046761344, + "grad_norm": 0.0966438427567482, + "learning_rate": 2.6310679611650484e-06, + "loss": 0.0019, + "step": 10637 + }, + { + "epoch": 7.369587807412539, + "grad_norm": 0.05615049600601196, + "learning_rate": 2.6303744798890434e-06, + "loss": 0.0014, + "step": 10638 + }, + { + "epoch": 7.370280568063734, + "grad_norm": 0.11485607177019119, + "learning_rate": 2.629680998613038e-06, + "loss": 0.0021, + "step": 10639 + }, + { + "epoch": 7.370973328714929, + "grad_norm": 0.2299732118844986, + "learning_rate": 2.6289875173370324e-06, + "loss": 0.0022, + "step": 10640 + }, + { + "epoch": 7.371666089366124, + "grad_norm": 0.1310017853975296, + "learning_rate": 2.6282940360610265e-06, + "loss": 0.0037, + "step": 10641 + }, + { + "epoch": 7.372358850017319, + "grad_norm": 0.11050750315189362, + "learning_rate": 2.627600554785021e-06, + "loss": 0.0017, + "step": 10642 + }, + { + "epoch": 7.373051610668514, + "grad_norm": 0.08821465075016022, + "learning_rate": 2.626907073509016e-06, + "loss": 0.0017, + "step": 10643 + }, + { + "epoch": 7.3737443713197095, + "grad_norm": 0.09382601827383041, + "learning_rate": 2.62621359223301e-06, + "loss": 0.0015, + "step": 10644 + }, + { + "epoch": 7.374437131970904, + "grad_norm": 0.16105882823467255, + "learning_rate": 2.6255201109570045e-06, + "loss": 0.002, + "step": 10645 + }, + { + "epoch": 7.375129892622099, + "grad_norm": 0.18233506381511688, + "learning_rate": 2.6248266296809986e-06, + "loss": 0.0021, + "step": 10646 + }, + { + "epoch": 7.3758226532732944, + "grad_norm": 0.07966306805610657, + "learning_rate": 2.624133148404993e-06, + "loss": 0.0014, + "step": 10647 + }, + { + "epoch": 7.376515413924489, + "grad_norm": 0.2737707793712616, + "learning_rate": 2.6234396671289876e-06, + "loss": 0.0018, + "step": 10648 + }, + { + "epoch": 7.377208174575684, + "grad_norm": 0.12508131563663483, + "learning_rate": 2.6227461858529825e-06, + "loss": 0.0015, + "step": 10649 + }, + { + "epoch": 7.377900935226879, + "grad_norm": 0.17339982092380524, + "learning_rate": 2.6220527045769766e-06, + "loss": 0.0021, + "step": 10650 + }, + { + "epoch": 7.378593695878074, + "grad_norm": 0.21655355393886566, + "learning_rate": 2.621359223300971e-06, + "loss": 0.0016, + "step": 10651 + }, + { + "epoch": 7.379286456529269, + "grad_norm": 0.2767361104488373, + "learning_rate": 2.6206657420249652e-06, + "loss": 0.0019, + "step": 10652 + }, + { + "epoch": 7.379979217180464, + "grad_norm": 0.24723610281944275, + "learning_rate": 2.6199722607489597e-06, + "loss": 0.0019, + "step": 10653 + }, + { + "epoch": 7.380671977831659, + "grad_norm": 0.1405622512102127, + "learning_rate": 2.6192787794729547e-06, + "loss": 0.0014, + "step": 10654 + }, + { + "epoch": 7.381364738482854, + "grad_norm": 0.11951754242181778, + "learning_rate": 2.6185852981969488e-06, + "loss": 0.0018, + "step": 10655 + }, + { + "epoch": 7.382057499134049, + "grad_norm": 0.13081449270248413, + "learning_rate": 2.6178918169209433e-06, + "loss": 0.0017, + "step": 10656 + }, + { + "epoch": 7.382750259785245, + "grad_norm": 0.10164713114500046, + "learning_rate": 2.6171983356449378e-06, + "loss": 0.0016, + "step": 10657 + }, + { + "epoch": 7.383443020436439, + "grad_norm": 0.1691260188817978, + "learning_rate": 2.616504854368932e-06, + "loss": 0.0022, + "step": 10658 + }, + { + "epoch": 7.384135781087634, + "grad_norm": 0.08594096451997757, + "learning_rate": 2.615811373092927e-06, + "loss": 0.0016, + "step": 10659 + }, + { + "epoch": 7.3848285417388295, + "grad_norm": 0.14206601679325104, + "learning_rate": 2.6151178918169213e-06, + "loss": 0.0018, + "step": 10660 + }, + { + "epoch": 7.385521302390024, + "grad_norm": 0.07885897159576416, + "learning_rate": 2.6144244105409154e-06, + "loss": 0.0015, + "step": 10661 + }, + { + "epoch": 7.386214063041219, + "grad_norm": 0.14833572506904602, + "learning_rate": 2.61373092926491e-06, + "loss": 0.0018, + "step": 10662 + }, + { + "epoch": 7.3869068236924145, + "grad_norm": 0.2038232535123825, + "learning_rate": 2.6130374479889044e-06, + "loss": 0.0038, + "step": 10663 + }, + { + "epoch": 7.38759958434361, + "grad_norm": 0.09282398223876953, + "learning_rate": 2.6123439667128993e-06, + "loss": 0.0018, + "step": 10664 + }, + { + "epoch": 7.388292344994804, + "grad_norm": 0.17177100479602814, + "learning_rate": 2.6116504854368934e-06, + "loss": 0.0024, + "step": 10665 + }, + { + "epoch": 7.388985105645999, + "grad_norm": 0.1776081919670105, + "learning_rate": 2.610957004160888e-06, + "loss": 0.0032, + "step": 10666 + }, + { + "epoch": 7.389677866297195, + "grad_norm": 0.08193469792604446, + "learning_rate": 2.610263522884882e-06, + "loss": 0.0016, + "step": 10667 + }, + { + "epoch": 7.390370626948389, + "grad_norm": 0.1997789740562439, + "learning_rate": 2.6095700416088765e-06, + "loss": 0.0028, + "step": 10668 + }, + { + "epoch": 7.391063387599584, + "grad_norm": 0.3581722378730774, + "learning_rate": 2.6088765603328715e-06, + "loss": 0.0019, + "step": 10669 + }, + { + "epoch": 7.39175614825078, + "grad_norm": 0.09195563197135925, + "learning_rate": 2.6081830790568656e-06, + "loss": 0.0017, + "step": 10670 + }, + { + "epoch": 7.392448908901974, + "grad_norm": 0.27698156237602234, + "learning_rate": 2.60748959778086e-06, + "loss": 0.0017, + "step": 10671 + }, + { + "epoch": 7.393141669553169, + "grad_norm": 0.1314757615327835, + "learning_rate": 2.6067961165048546e-06, + "loss": 0.0021, + "step": 10672 + }, + { + "epoch": 7.393834430204365, + "grad_norm": 0.055855996906757355, + "learning_rate": 2.6061026352288487e-06, + "loss": 0.0013, + "step": 10673 + }, + { + "epoch": 7.394527190855559, + "grad_norm": 0.18448542058467865, + "learning_rate": 2.6054091539528436e-06, + "loss": 0.0019, + "step": 10674 + }, + { + "epoch": 7.395219951506754, + "grad_norm": 0.1572575569152832, + "learning_rate": 2.604715672676838e-06, + "loss": 0.0019, + "step": 10675 + }, + { + "epoch": 7.3959127121579495, + "grad_norm": 0.22911396622657776, + "learning_rate": 2.604022191400832e-06, + "loss": 0.002, + "step": 10676 + }, + { + "epoch": 7.396605472809145, + "grad_norm": 0.0508640892803669, + "learning_rate": 2.6033287101248267e-06, + "loss": 0.0013, + "step": 10677 + }, + { + "epoch": 7.397298233460339, + "grad_norm": 0.1964741051197052, + "learning_rate": 2.602635228848821e-06, + "loss": 0.0023, + "step": 10678 + }, + { + "epoch": 7.3979909941115345, + "grad_norm": 0.16124382615089417, + "learning_rate": 2.6019417475728157e-06, + "loss": 0.0016, + "step": 10679 + }, + { + "epoch": 7.39868375476273, + "grad_norm": 0.22576114535331726, + "learning_rate": 2.6012482662968102e-06, + "loss": 0.0031, + "step": 10680 + }, + { + "epoch": 7.399376515413924, + "grad_norm": 0.1622762680053711, + "learning_rate": 2.6005547850208047e-06, + "loss": 0.0018, + "step": 10681 + }, + { + "epoch": 7.400069276065119, + "grad_norm": 0.18522429466247559, + "learning_rate": 2.599861303744799e-06, + "loss": 0.0019, + "step": 10682 + }, + { + "epoch": 7.400762036716315, + "grad_norm": 0.07557988911867142, + "learning_rate": 2.5991678224687933e-06, + "loss": 0.0014, + "step": 10683 + }, + { + "epoch": 7.40145479736751, + "grad_norm": 0.22076094150543213, + "learning_rate": 2.5984743411927883e-06, + "loss": 0.0021, + "step": 10684 + }, + { + "epoch": 7.402147558018704, + "grad_norm": 0.2016923874616623, + "learning_rate": 2.5977808599167824e-06, + "loss": 0.0027, + "step": 10685 + }, + { + "epoch": 7.4028403186699, + "grad_norm": 0.2840091288089752, + "learning_rate": 2.597087378640777e-06, + "loss": 0.0028, + "step": 10686 + }, + { + "epoch": 7.403533079321095, + "grad_norm": 0.24551159143447876, + "learning_rate": 2.5963938973647714e-06, + "loss": 0.0034, + "step": 10687 + }, + { + "epoch": 7.404225839972289, + "grad_norm": 0.28587213158607483, + "learning_rate": 2.5957004160887655e-06, + "loss": 0.002, + "step": 10688 + }, + { + "epoch": 7.404918600623485, + "grad_norm": 0.17210809886455536, + "learning_rate": 2.5950069348127604e-06, + "loss": 0.0019, + "step": 10689 + }, + { + "epoch": 7.40561136127468, + "grad_norm": 0.1567090004682541, + "learning_rate": 2.594313453536755e-06, + "loss": 0.0017, + "step": 10690 + }, + { + "epoch": 7.406304121925874, + "grad_norm": 0.15413030982017517, + "learning_rate": 2.593619972260749e-06, + "loss": 0.0018, + "step": 10691 + }, + { + "epoch": 7.40699688257707, + "grad_norm": 0.10482992231845856, + "learning_rate": 2.5929264909847435e-06, + "loss": 0.0015, + "step": 10692 + }, + { + "epoch": 7.407689643228265, + "grad_norm": 0.13086524605751038, + "learning_rate": 2.592233009708738e-06, + "loss": 0.0019, + "step": 10693 + }, + { + "epoch": 7.408382403879459, + "grad_norm": 0.07652492076158524, + "learning_rate": 2.5915395284327325e-06, + "loss": 0.0016, + "step": 10694 + }, + { + "epoch": 7.4090751645306545, + "grad_norm": 0.07175049930810928, + "learning_rate": 2.590846047156727e-06, + "loss": 0.0014, + "step": 10695 + }, + { + "epoch": 7.40976792518185, + "grad_norm": 0.12819047272205353, + "learning_rate": 2.5901525658807215e-06, + "loss": 0.0022, + "step": 10696 + }, + { + "epoch": 7.410460685833045, + "grad_norm": 0.08351161330938339, + "learning_rate": 2.5894590846047156e-06, + "loss": 0.0013, + "step": 10697 + }, + { + "epoch": 7.4111534464842395, + "grad_norm": 0.07833921164274216, + "learning_rate": 2.58876560332871e-06, + "loss": 0.0013, + "step": 10698 + }, + { + "epoch": 7.411846207135435, + "grad_norm": 0.16410838067531586, + "learning_rate": 2.588072122052705e-06, + "loss": 0.0019, + "step": 10699 + }, + { + "epoch": 7.41253896778663, + "grad_norm": 0.1312284618616104, + "learning_rate": 2.587378640776699e-06, + "loss": 0.0014, + "step": 10700 + }, + { + "epoch": 7.413231728437824, + "grad_norm": 0.04965529218316078, + "learning_rate": 2.5866851595006937e-06, + "loss": 0.0011, + "step": 10701 + }, + { + "epoch": 7.41392448908902, + "grad_norm": 0.06282943487167358, + "learning_rate": 2.585991678224688e-06, + "loss": 0.0013, + "step": 10702 + }, + { + "epoch": 7.414617249740215, + "grad_norm": 0.08431761711835861, + "learning_rate": 2.5852981969486823e-06, + "loss": 0.0013, + "step": 10703 + }, + { + "epoch": 7.415310010391409, + "grad_norm": 0.09277688711881638, + "learning_rate": 2.584604715672677e-06, + "loss": 0.0019, + "step": 10704 + }, + { + "epoch": 7.416002771042605, + "grad_norm": 0.09888166934251785, + "learning_rate": 2.5839112343966717e-06, + "loss": 0.0017, + "step": 10705 + }, + { + "epoch": 7.4166955316938, + "grad_norm": 0.06800324469804764, + "learning_rate": 2.583217753120666e-06, + "loss": 0.0015, + "step": 10706 + }, + { + "epoch": 7.417388292344995, + "grad_norm": 0.1317177712917328, + "learning_rate": 2.5825242718446603e-06, + "loss": 0.0015, + "step": 10707 + }, + { + "epoch": 7.41808105299619, + "grad_norm": 0.42098212242126465, + "learning_rate": 2.5818307905686544e-06, + "loss": 0.0023, + "step": 10708 + }, + { + "epoch": 7.418773813647385, + "grad_norm": 0.09738583117723465, + "learning_rate": 2.5811373092926493e-06, + "loss": 0.0017, + "step": 10709 + }, + { + "epoch": 7.41946657429858, + "grad_norm": 0.12297433614730835, + "learning_rate": 2.580443828016644e-06, + "loss": 0.0019, + "step": 10710 + }, + { + "epoch": 7.4201593349497745, + "grad_norm": 0.15178140997886658, + "learning_rate": 2.5797503467406383e-06, + "loss": 0.0017, + "step": 10711 + }, + { + "epoch": 7.42085209560097, + "grad_norm": 0.05053913965821266, + "learning_rate": 2.5790568654646324e-06, + "loss": 0.0011, + "step": 10712 + }, + { + "epoch": 7.421544856252165, + "grad_norm": 0.07242614775896072, + "learning_rate": 2.578363384188627e-06, + "loss": 0.0013, + "step": 10713 + }, + { + "epoch": 7.4222376169033595, + "grad_norm": 0.4279836118221283, + "learning_rate": 2.577669902912622e-06, + "loss": 0.002, + "step": 10714 + }, + { + "epoch": 7.422930377554555, + "grad_norm": 0.13643436133861542, + "learning_rate": 2.576976421636616e-06, + "loss": 0.0018, + "step": 10715 + }, + { + "epoch": 7.42362313820575, + "grad_norm": 0.08897361159324646, + "learning_rate": 2.5762829403606105e-06, + "loss": 0.0018, + "step": 10716 + }, + { + "epoch": 7.424315898856945, + "grad_norm": 0.1289350539445877, + "learning_rate": 2.575589459084605e-06, + "loss": 0.0015, + "step": 10717 + }, + { + "epoch": 7.42500865950814, + "grad_norm": 0.04847308248281479, + "learning_rate": 2.574895977808599e-06, + "loss": 0.0012, + "step": 10718 + }, + { + "epoch": 7.425701420159335, + "grad_norm": 0.14715151488780975, + "learning_rate": 2.574202496532594e-06, + "loss": 0.002, + "step": 10719 + }, + { + "epoch": 7.42639418081053, + "grad_norm": 0.2963087260723114, + "learning_rate": 2.5735090152565885e-06, + "loss": 0.0028, + "step": 10720 + }, + { + "epoch": 7.427086941461725, + "grad_norm": 0.12237878143787384, + "learning_rate": 2.5728155339805826e-06, + "loss": 0.0021, + "step": 10721 + }, + { + "epoch": 7.42777970211292, + "grad_norm": 0.08332429826259613, + "learning_rate": 2.572122052704577e-06, + "loss": 0.0015, + "step": 10722 + }, + { + "epoch": 7.428472462764115, + "grad_norm": 0.07035665214061737, + "learning_rate": 2.571428571428571e-06, + "loss": 0.0014, + "step": 10723 + }, + { + "epoch": 7.42916522341531, + "grad_norm": 0.06702131032943726, + "learning_rate": 2.570735090152566e-06, + "loss": 0.0015, + "step": 10724 + }, + { + "epoch": 7.429857984066505, + "grad_norm": 0.18980415165424347, + "learning_rate": 2.5700416088765606e-06, + "loss": 0.0019, + "step": 10725 + }, + { + "epoch": 7.4305507447177, + "grad_norm": 0.12252097576856613, + "learning_rate": 2.569348127600555e-06, + "loss": 0.0017, + "step": 10726 + }, + { + "epoch": 7.431243505368895, + "grad_norm": 0.11813729256391525, + "learning_rate": 2.5686546463245492e-06, + "loss": 0.0016, + "step": 10727 + }, + { + "epoch": 7.43193626602009, + "grad_norm": 0.056314866989851, + "learning_rate": 2.5679611650485437e-06, + "loss": 0.0012, + "step": 10728 + }, + { + "epoch": 7.432629026671285, + "grad_norm": 0.10481024533510208, + "learning_rate": 2.5672676837725387e-06, + "loss": 0.0015, + "step": 10729 + }, + { + "epoch": 7.43332178732248, + "grad_norm": 0.14865724742412567, + "learning_rate": 2.5665742024965328e-06, + "loss": 0.0025, + "step": 10730 + }, + { + "epoch": 7.434014547973675, + "grad_norm": 0.08985798060894012, + "learning_rate": 2.5658807212205273e-06, + "loss": 0.0014, + "step": 10731 + }, + { + "epoch": 7.43470730862487, + "grad_norm": 0.07113194465637207, + "learning_rate": 2.5651872399445214e-06, + "loss": 0.0014, + "step": 10732 + }, + { + "epoch": 7.435400069276065, + "grad_norm": 0.11472031474113464, + "learning_rate": 2.564493758668516e-06, + "loss": 0.0021, + "step": 10733 + }, + { + "epoch": 7.43609282992726, + "grad_norm": 0.12708251178264618, + "learning_rate": 2.563800277392511e-06, + "loss": 0.0017, + "step": 10734 + }, + { + "epoch": 7.436785590578455, + "grad_norm": 0.196594700217247, + "learning_rate": 2.5631067961165053e-06, + "loss": 0.0034, + "step": 10735 + }, + { + "epoch": 7.43747835122965, + "grad_norm": 0.10423848778009415, + "learning_rate": 2.5624133148404994e-06, + "loss": 0.0018, + "step": 10736 + }, + { + "epoch": 7.438171111880846, + "grad_norm": 0.1535194218158722, + "learning_rate": 2.561719833564494e-06, + "loss": 0.0031, + "step": 10737 + }, + { + "epoch": 7.43886387253204, + "grad_norm": 0.2090843766927719, + "learning_rate": 2.561026352288488e-06, + "loss": 0.0023, + "step": 10738 + }, + { + "epoch": 7.439556633183235, + "grad_norm": 0.10467515885829926, + "learning_rate": 2.560332871012483e-06, + "loss": 0.0013, + "step": 10739 + }, + { + "epoch": 7.4402493938344305, + "grad_norm": 0.35857662558555603, + "learning_rate": 2.5596393897364774e-06, + "loss": 0.0035, + "step": 10740 + }, + { + "epoch": 7.440942154485625, + "grad_norm": 0.09160470962524414, + "learning_rate": 2.558945908460472e-06, + "loss": 0.0013, + "step": 10741 + }, + { + "epoch": 7.44163491513682, + "grad_norm": 0.06767398118972778, + "learning_rate": 2.558252427184466e-06, + "loss": 0.0013, + "step": 10742 + }, + { + "epoch": 7.4423276757880155, + "grad_norm": 0.08082219958305359, + "learning_rate": 2.5575589459084605e-06, + "loss": 0.0017, + "step": 10743 + }, + { + "epoch": 7.44302043643921, + "grad_norm": 0.27682968974113464, + "learning_rate": 2.5568654646324555e-06, + "loss": 0.0024, + "step": 10744 + }, + { + "epoch": 7.443713197090405, + "grad_norm": 0.11621539294719696, + "learning_rate": 2.5561719833564496e-06, + "loss": 0.0015, + "step": 10745 + }, + { + "epoch": 7.4444059577416, + "grad_norm": 0.18691407144069672, + "learning_rate": 2.555478502080444e-06, + "loss": 0.0017, + "step": 10746 + }, + { + "epoch": 7.445098718392796, + "grad_norm": 0.11532662063837051, + "learning_rate": 2.554785020804438e-06, + "loss": 0.0013, + "step": 10747 + }, + { + "epoch": 7.44579147904399, + "grad_norm": 0.10876644402742386, + "learning_rate": 2.5540915395284327e-06, + "loss": 0.0015, + "step": 10748 + }, + { + "epoch": 7.446484239695185, + "grad_norm": 0.1477528214454651, + "learning_rate": 2.5533980582524276e-06, + "loss": 0.0017, + "step": 10749 + }, + { + "epoch": 7.447177000346381, + "grad_norm": 0.5255882143974304, + "learning_rate": 2.552704576976422e-06, + "loss": 0.0018, + "step": 10750 + }, + { + "epoch": 7.447869760997575, + "grad_norm": 0.2798275351524353, + "learning_rate": 2.552011095700416e-06, + "loss": 0.0023, + "step": 10751 + }, + { + "epoch": 7.44856252164877, + "grad_norm": 0.23782400786876678, + "learning_rate": 2.5513176144244107e-06, + "loss": 0.002, + "step": 10752 + }, + { + "epoch": 7.449255282299966, + "grad_norm": 0.17771799862384796, + "learning_rate": 2.550624133148405e-06, + "loss": 0.0023, + "step": 10753 + }, + { + "epoch": 7.44994804295116, + "grad_norm": 0.13188570737838745, + "learning_rate": 2.5499306518723997e-06, + "loss": 0.0018, + "step": 10754 + }, + { + "epoch": 7.450640803602355, + "grad_norm": 0.1313145011663437, + "learning_rate": 2.5492371705963942e-06, + "loss": 0.0017, + "step": 10755 + }, + { + "epoch": 7.4513335642535505, + "grad_norm": 0.11395426839590073, + "learning_rate": 2.5485436893203887e-06, + "loss": 0.0016, + "step": 10756 + }, + { + "epoch": 7.452026324904746, + "grad_norm": 0.11718688905239105, + "learning_rate": 2.547850208044383e-06, + "loss": 0.0016, + "step": 10757 + }, + { + "epoch": 7.45271908555594, + "grad_norm": 0.1720968782901764, + "learning_rate": 2.5471567267683773e-06, + "loss": 0.0023, + "step": 10758 + }, + { + "epoch": 7.4534118462071355, + "grad_norm": 0.11114460974931717, + "learning_rate": 2.5464632454923723e-06, + "loss": 0.0019, + "step": 10759 + }, + { + "epoch": 7.454104606858331, + "grad_norm": 0.10008231550455093, + "learning_rate": 2.5457697642163664e-06, + "loss": 0.0016, + "step": 10760 + }, + { + "epoch": 7.454797367509525, + "grad_norm": 0.08406735211610794, + "learning_rate": 2.545076282940361e-06, + "loss": 0.0015, + "step": 10761 + }, + { + "epoch": 7.45549012816072, + "grad_norm": 0.09855545312166214, + "learning_rate": 2.544382801664355e-06, + "loss": 0.0018, + "step": 10762 + }, + { + "epoch": 7.456182888811916, + "grad_norm": 0.17305642366409302, + "learning_rate": 2.5436893203883495e-06, + "loss": 0.0018, + "step": 10763 + }, + { + "epoch": 7.45687564946311, + "grad_norm": 0.09363764524459839, + "learning_rate": 2.5429958391123444e-06, + "loss": 0.0016, + "step": 10764 + }, + { + "epoch": 7.457568410114305, + "grad_norm": 0.15001751482486725, + "learning_rate": 2.542302357836339e-06, + "loss": 0.0016, + "step": 10765 + }, + { + "epoch": 7.458261170765501, + "grad_norm": 0.1612853854894638, + "learning_rate": 2.541608876560333e-06, + "loss": 0.002, + "step": 10766 + }, + { + "epoch": 7.458953931416696, + "grad_norm": 0.14551077783107758, + "learning_rate": 2.5409153952843275e-06, + "loss": 0.0025, + "step": 10767 + }, + { + "epoch": 7.45964669206789, + "grad_norm": 0.09144605696201324, + "learning_rate": 2.5402219140083216e-06, + "loss": 0.0016, + "step": 10768 + }, + { + "epoch": 7.460339452719086, + "grad_norm": 0.1093723326921463, + "learning_rate": 2.5395284327323165e-06, + "loss": 0.0015, + "step": 10769 + }, + { + "epoch": 7.461032213370281, + "grad_norm": 0.10207368433475494, + "learning_rate": 2.538834951456311e-06, + "loss": 0.0015, + "step": 10770 + }, + { + "epoch": 7.461724974021475, + "grad_norm": 0.2664475739002228, + "learning_rate": 2.538141470180305e-06, + "loss": 0.0023, + "step": 10771 + }, + { + "epoch": 7.4624177346726706, + "grad_norm": 0.45225033164024353, + "learning_rate": 2.5374479889042996e-06, + "loss": 0.0035, + "step": 10772 + }, + { + "epoch": 7.463110495323866, + "grad_norm": 0.08390573412179947, + "learning_rate": 2.536754507628294e-06, + "loss": 0.0015, + "step": 10773 + }, + { + "epoch": 7.46380325597506, + "grad_norm": 0.07127819955348969, + "learning_rate": 2.536061026352289e-06, + "loss": 0.0015, + "step": 10774 + }, + { + "epoch": 7.4644960166262555, + "grad_norm": 0.13551568984985352, + "learning_rate": 2.535367545076283e-06, + "loss": 0.0016, + "step": 10775 + }, + { + "epoch": 7.465188777277451, + "grad_norm": 0.10462245345115662, + "learning_rate": 2.5346740638002777e-06, + "loss": 0.0018, + "step": 10776 + }, + { + "epoch": 7.465881537928646, + "grad_norm": 0.08148667216300964, + "learning_rate": 2.5339805825242718e-06, + "loss": 0.0013, + "step": 10777 + }, + { + "epoch": 7.4665742985798405, + "grad_norm": 0.18152490258216858, + "learning_rate": 2.5332871012482663e-06, + "loss": 0.0019, + "step": 10778 + }, + { + "epoch": 7.467267059231036, + "grad_norm": 0.10702257603406906, + "learning_rate": 2.532593619972261e-06, + "loss": 0.0016, + "step": 10779 + }, + { + "epoch": 7.467959819882231, + "grad_norm": 0.13292056322097778, + "learning_rate": 2.5319001386962557e-06, + "loss": 0.002, + "step": 10780 + }, + { + "epoch": 7.468652580533425, + "grad_norm": 0.07058043032884598, + "learning_rate": 2.53120665742025e-06, + "loss": 0.0014, + "step": 10781 + }, + { + "epoch": 7.469345341184621, + "grad_norm": 0.08694568276405334, + "learning_rate": 2.5305131761442443e-06, + "loss": 0.0015, + "step": 10782 + }, + { + "epoch": 7.470038101835816, + "grad_norm": 0.15851189196109772, + "learning_rate": 2.5298196948682384e-06, + "loss": 0.003, + "step": 10783 + }, + { + "epoch": 7.47073086248701, + "grad_norm": 0.1388746052980423, + "learning_rate": 2.5291262135922333e-06, + "loss": 0.0017, + "step": 10784 + }, + { + "epoch": 7.471423623138206, + "grad_norm": 0.1215035617351532, + "learning_rate": 2.528432732316228e-06, + "loss": 0.0016, + "step": 10785 + }, + { + "epoch": 7.472116383789401, + "grad_norm": 0.14704327285289764, + "learning_rate": 2.527739251040222e-06, + "loss": 0.0019, + "step": 10786 + }, + { + "epoch": 7.472809144440596, + "grad_norm": 0.1499715894460678, + "learning_rate": 2.5270457697642164e-06, + "loss": 0.0019, + "step": 10787 + }, + { + "epoch": 7.473501905091791, + "grad_norm": 0.14678151905536652, + "learning_rate": 2.526352288488211e-06, + "loss": 0.002, + "step": 10788 + }, + { + "epoch": 7.474194665742986, + "grad_norm": 0.35057759284973145, + "learning_rate": 2.525658807212206e-06, + "loss": 0.0022, + "step": 10789 + }, + { + "epoch": 7.474887426394181, + "grad_norm": 0.14562055468559265, + "learning_rate": 2.5249653259362e-06, + "loss": 0.0033, + "step": 10790 + }, + { + "epoch": 7.4755801870453755, + "grad_norm": 0.13831159472465515, + "learning_rate": 2.5242718446601945e-06, + "loss": 0.0017, + "step": 10791 + }, + { + "epoch": 7.476272947696571, + "grad_norm": 0.10678946226835251, + "learning_rate": 2.5235783633841886e-06, + "loss": 0.0016, + "step": 10792 + }, + { + "epoch": 7.476965708347766, + "grad_norm": 0.2228410840034485, + "learning_rate": 2.522884882108183e-06, + "loss": 0.0022, + "step": 10793 + }, + { + "epoch": 7.4776584689989605, + "grad_norm": 0.24775110185146332, + "learning_rate": 2.522191400832178e-06, + "loss": 0.0022, + "step": 10794 + }, + { + "epoch": 7.478351229650156, + "grad_norm": 0.18266801536083221, + "learning_rate": 2.521497919556172e-06, + "loss": 0.0021, + "step": 10795 + }, + { + "epoch": 7.479043990301351, + "grad_norm": 0.06549399346113205, + "learning_rate": 2.5208044382801666e-06, + "loss": 0.0014, + "step": 10796 + }, + { + "epoch": 7.479736750952546, + "grad_norm": 0.27793261408805847, + "learning_rate": 2.520110957004161e-06, + "loss": 0.0025, + "step": 10797 + }, + { + "epoch": 7.480429511603741, + "grad_norm": 0.08995774388313293, + "learning_rate": 2.519417475728155e-06, + "loss": 0.0017, + "step": 10798 + }, + { + "epoch": 7.481122272254936, + "grad_norm": 0.2409883588552475, + "learning_rate": 2.51872399445215e-06, + "loss": 0.0043, + "step": 10799 + }, + { + "epoch": 7.481815032906131, + "grad_norm": 0.11554820835590363, + "learning_rate": 2.5180305131761446e-06, + "loss": 0.0024, + "step": 10800 + }, + { + "epoch": 7.482507793557326, + "grad_norm": 0.17683421075344086, + "learning_rate": 2.5173370319001387e-06, + "loss": 0.0019, + "step": 10801 + }, + { + "epoch": 7.483200554208521, + "grad_norm": 0.12466317415237427, + "learning_rate": 2.5166435506241332e-06, + "loss": 0.002, + "step": 10802 + }, + { + "epoch": 7.483893314859716, + "grad_norm": 0.7196019291877747, + "learning_rate": 2.5159500693481277e-06, + "loss": 0.0017, + "step": 10803 + }, + { + "epoch": 7.484586075510911, + "grad_norm": 0.690550684928894, + "learning_rate": 2.5152565880721227e-06, + "loss": 0.0018, + "step": 10804 + }, + { + "epoch": 7.485278836162106, + "grad_norm": 0.12522132694721222, + "learning_rate": 2.5145631067961168e-06, + "loss": 0.002, + "step": 10805 + }, + { + "epoch": 7.485971596813301, + "grad_norm": 0.33406364917755127, + "learning_rate": 2.5138696255201113e-06, + "loss": 0.0019, + "step": 10806 + }, + { + "epoch": 7.486664357464496, + "grad_norm": 0.18372103571891785, + "learning_rate": 2.5131761442441054e-06, + "loss": 0.0022, + "step": 10807 + }, + { + "epoch": 7.487357118115691, + "grad_norm": 0.20089584589004517, + "learning_rate": 2.5124826629681e-06, + "loss": 0.0016, + "step": 10808 + }, + { + "epoch": 7.488049878766886, + "grad_norm": 0.4255248010158539, + "learning_rate": 2.511789181692095e-06, + "loss": 0.002, + "step": 10809 + }, + { + "epoch": 7.488742639418081, + "grad_norm": 0.16915470361709595, + "learning_rate": 2.511095700416089e-06, + "loss": 0.0021, + "step": 10810 + }, + { + "epoch": 7.489435400069276, + "grad_norm": 0.2463148981332779, + "learning_rate": 2.5104022191400834e-06, + "loss": 0.0016, + "step": 10811 + }, + { + "epoch": 7.490128160720471, + "grad_norm": 0.20457732677459717, + "learning_rate": 2.509708737864078e-06, + "loss": 0.0019, + "step": 10812 + }, + { + "epoch": 7.490820921371666, + "grad_norm": 0.14776448905467987, + "learning_rate": 2.509015256588072e-06, + "loss": 0.0018, + "step": 10813 + }, + { + "epoch": 7.491513682022861, + "grad_norm": 0.1816481500864029, + "learning_rate": 2.508321775312067e-06, + "loss": 0.002, + "step": 10814 + }, + { + "epoch": 7.492206442674056, + "grad_norm": 0.0913500189781189, + "learning_rate": 2.5076282940360614e-06, + "loss": 0.0016, + "step": 10815 + }, + { + "epoch": 7.492899203325251, + "grad_norm": 0.14668625593185425, + "learning_rate": 2.5069348127600555e-06, + "loss": 0.002, + "step": 10816 + }, + { + "epoch": 7.493591963976446, + "grad_norm": 0.2666305899620056, + "learning_rate": 2.50624133148405e-06, + "loss": 0.0022, + "step": 10817 + }, + { + "epoch": 7.494284724627641, + "grad_norm": 0.14975956082344055, + "learning_rate": 2.5055478502080445e-06, + "loss": 0.0022, + "step": 10818 + }, + { + "epoch": 7.494977485278836, + "grad_norm": 0.07446333765983582, + "learning_rate": 2.504854368932039e-06, + "loss": 0.0014, + "step": 10819 + }, + { + "epoch": 7.4956702459300315, + "grad_norm": 0.10726433247327805, + "learning_rate": 2.5041608876560336e-06, + "loss": 0.0016, + "step": 10820 + }, + { + "epoch": 7.496363006581226, + "grad_norm": 0.10890372842550278, + "learning_rate": 2.503467406380028e-06, + "loss": 0.0015, + "step": 10821 + }, + { + "epoch": 7.497055767232421, + "grad_norm": 0.10032851994037628, + "learning_rate": 2.502773925104022e-06, + "loss": 0.0019, + "step": 10822 + }, + { + "epoch": 7.4977485278836165, + "grad_norm": 0.12057121098041534, + "learning_rate": 2.5020804438280167e-06, + "loss": 0.0018, + "step": 10823 + }, + { + "epoch": 7.498441288534811, + "grad_norm": 0.1493215560913086, + "learning_rate": 2.5013869625520116e-06, + "loss": 0.0017, + "step": 10824 + }, + { + "epoch": 7.499134049186006, + "grad_norm": 0.07878365367650986, + "learning_rate": 2.5006934812760057e-06, + "loss": 0.0014, + "step": 10825 + }, + { + "epoch": 7.499826809837201, + "grad_norm": 0.20616844296455383, + "learning_rate": 2.5e-06, + "loss": 0.002, + "step": 10826 + }, + { + "epoch": 7.500519570488397, + "grad_norm": 0.30661019682884216, + "learning_rate": 2.4993065187239947e-06, + "loss": 0.0044, + "step": 10827 + }, + { + "epoch": 7.501212331139591, + "grad_norm": 0.09549485146999359, + "learning_rate": 2.4986130374479892e-06, + "loss": 0.0016, + "step": 10828 + }, + { + "epoch": 7.501905091790786, + "grad_norm": 0.1462438851594925, + "learning_rate": 2.4979195561719833e-06, + "loss": 0.0017, + "step": 10829 + }, + { + "epoch": 7.502597852441982, + "grad_norm": 0.20211417973041534, + "learning_rate": 2.4972260748959782e-06, + "loss": 0.0018, + "step": 10830 + }, + { + "epoch": 7.503290613093176, + "grad_norm": 0.5373084545135498, + "learning_rate": 2.4965325936199723e-06, + "loss": 0.003, + "step": 10831 + }, + { + "epoch": 7.503983373744371, + "grad_norm": 0.09695852547883987, + "learning_rate": 2.495839112343967e-06, + "loss": 0.0019, + "step": 10832 + }, + { + "epoch": 7.504676134395567, + "grad_norm": 0.12500806152820587, + "learning_rate": 2.4951456310679614e-06, + "loss": 0.0025, + "step": 10833 + }, + { + "epoch": 7.505368895046761, + "grad_norm": 0.09857065975666046, + "learning_rate": 2.494452149791956e-06, + "loss": 0.0018, + "step": 10834 + }, + { + "epoch": 7.506061655697956, + "grad_norm": 0.08579905331134796, + "learning_rate": 2.4937586685159504e-06, + "loss": 0.0016, + "step": 10835 + }, + { + "epoch": 7.5067544163491515, + "grad_norm": 0.11580824106931686, + "learning_rate": 2.493065187239945e-06, + "loss": 0.0017, + "step": 10836 + }, + { + "epoch": 7.507447177000346, + "grad_norm": 0.119056336581707, + "learning_rate": 2.492371705963939e-06, + "loss": 0.0025, + "step": 10837 + }, + { + "epoch": 7.508139937651541, + "grad_norm": 0.18927937746047974, + "learning_rate": 2.491678224687934e-06, + "loss": 0.0017, + "step": 10838 + }, + { + "epoch": 7.5088326983027365, + "grad_norm": 0.21251261234283447, + "learning_rate": 2.490984743411928e-06, + "loss": 0.0025, + "step": 10839 + }, + { + "epoch": 7.509525458953932, + "grad_norm": 0.05846820026636124, + "learning_rate": 2.4902912621359225e-06, + "loss": 0.0013, + "step": 10840 + }, + { + "epoch": 7.510218219605126, + "grad_norm": 0.06526561826467514, + "learning_rate": 2.489597780859917e-06, + "loss": 0.0013, + "step": 10841 + }, + { + "epoch": 7.510910980256321, + "grad_norm": 0.14574207365512848, + "learning_rate": 2.4889042995839115e-06, + "loss": 0.0014, + "step": 10842 + }, + { + "epoch": 7.511603740907517, + "grad_norm": 0.08202770352363586, + "learning_rate": 2.488210818307906e-06, + "loss": 0.0015, + "step": 10843 + }, + { + "epoch": 7.512296501558711, + "grad_norm": 0.257617324590683, + "learning_rate": 2.4875173370319e-06, + "loss": 0.0026, + "step": 10844 + }, + { + "epoch": 7.512989262209906, + "grad_norm": 0.16567140817642212, + "learning_rate": 2.486823855755895e-06, + "loss": 0.002, + "step": 10845 + }, + { + "epoch": 7.513682022861102, + "grad_norm": 0.41710230708122253, + "learning_rate": 2.486130374479889e-06, + "loss": 0.0024, + "step": 10846 + }, + { + "epoch": 7.514374783512297, + "grad_norm": 0.12412635236978531, + "learning_rate": 2.4854368932038836e-06, + "loss": 0.0021, + "step": 10847 + }, + { + "epoch": 7.515067544163491, + "grad_norm": 0.09788189828395844, + "learning_rate": 2.484743411927878e-06, + "loss": 0.0019, + "step": 10848 + }, + { + "epoch": 7.515760304814687, + "grad_norm": 0.07323741912841797, + "learning_rate": 2.4840499306518727e-06, + "loss": 0.0012, + "step": 10849 + }, + { + "epoch": 7.516453065465882, + "grad_norm": 0.13336296379566193, + "learning_rate": 2.483356449375867e-06, + "loss": 0.0016, + "step": 10850 + }, + { + "epoch": 7.517145826117076, + "grad_norm": 0.08538836240768433, + "learning_rate": 2.4826629680998617e-06, + "loss": 0.0014, + "step": 10851 + }, + { + "epoch": 7.5178385867682715, + "grad_norm": 0.10788532346487045, + "learning_rate": 2.4819694868238558e-06, + "loss": 0.0016, + "step": 10852 + }, + { + "epoch": 7.518531347419467, + "grad_norm": 0.11885540187358856, + "learning_rate": 2.4812760055478503e-06, + "loss": 0.0016, + "step": 10853 + }, + { + "epoch": 7.519224108070661, + "grad_norm": 0.2749646008014679, + "learning_rate": 2.4805825242718448e-06, + "loss": 0.002, + "step": 10854 + }, + { + "epoch": 7.5199168687218565, + "grad_norm": 0.11428909003734589, + "learning_rate": 2.4798890429958393e-06, + "loss": 0.0019, + "step": 10855 + }, + { + "epoch": 7.520609629373052, + "grad_norm": 0.14908888936042786, + "learning_rate": 2.479195561719834e-06, + "loss": 0.0024, + "step": 10856 + }, + { + "epoch": 7.521302390024246, + "grad_norm": 0.207083061337471, + "learning_rate": 2.4785020804438283e-06, + "loss": 0.0018, + "step": 10857 + }, + { + "epoch": 7.521995150675441, + "grad_norm": 0.11959749460220337, + "learning_rate": 2.477808599167823e-06, + "loss": 0.0018, + "step": 10858 + }, + { + "epoch": 7.522687911326637, + "grad_norm": 0.10812567174434662, + "learning_rate": 2.477115117891817e-06, + "loss": 0.0019, + "step": 10859 + }, + { + "epoch": 7.523380671977832, + "grad_norm": 0.138017937541008, + "learning_rate": 2.476421636615812e-06, + "loss": 0.0018, + "step": 10860 + }, + { + "epoch": 7.524073432629026, + "grad_norm": 0.38856616616249084, + "learning_rate": 2.475728155339806e-06, + "loss": 0.0024, + "step": 10861 + }, + { + "epoch": 7.524766193280222, + "grad_norm": 0.140238419175148, + "learning_rate": 2.4750346740638004e-06, + "loss": 0.0019, + "step": 10862 + }, + { + "epoch": 7.525458953931417, + "grad_norm": 0.09630647301673889, + "learning_rate": 2.474341192787795e-06, + "loss": 0.002, + "step": 10863 + }, + { + "epoch": 7.526151714582611, + "grad_norm": 0.07080446183681488, + "learning_rate": 2.4736477115117895e-06, + "loss": 0.0016, + "step": 10864 + }, + { + "epoch": 7.526844475233807, + "grad_norm": 0.14970698952674866, + "learning_rate": 2.472954230235784e-06, + "loss": 0.0024, + "step": 10865 + }, + { + "epoch": 7.527537235885002, + "grad_norm": 0.0889345034956932, + "learning_rate": 2.4722607489597785e-06, + "loss": 0.0014, + "step": 10866 + }, + { + "epoch": 7.528229996536197, + "grad_norm": 0.2585148513317108, + "learning_rate": 2.4715672676837726e-06, + "loss": 0.0028, + "step": 10867 + }, + { + "epoch": 7.528922757187392, + "grad_norm": 0.11531859636306763, + "learning_rate": 2.470873786407767e-06, + "loss": 0.0018, + "step": 10868 + }, + { + "epoch": 7.529615517838587, + "grad_norm": 0.1980704963207245, + "learning_rate": 2.4701803051317616e-06, + "loss": 0.0017, + "step": 10869 + }, + { + "epoch": 7.530308278489782, + "grad_norm": 0.24060817062854767, + "learning_rate": 2.469486823855756e-06, + "loss": 0.0019, + "step": 10870 + }, + { + "epoch": 7.5310010391409765, + "grad_norm": 0.12572532892227173, + "learning_rate": 2.4687933425797506e-06, + "loss": 0.0016, + "step": 10871 + }, + { + "epoch": 7.531693799792172, + "grad_norm": 0.11596440523862839, + "learning_rate": 2.4680998613037447e-06, + "loss": 0.0018, + "step": 10872 + }, + { + "epoch": 7.532386560443367, + "grad_norm": 0.3188724219799042, + "learning_rate": 2.4674063800277396e-06, + "loss": 0.0031, + "step": 10873 + }, + { + "epoch": 7.5330793210945615, + "grad_norm": 0.05614947900176048, + "learning_rate": 2.4667128987517337e-06, + "loss": 0.0014, + "step": 10874 + }, + { + "epoch": 7.533772081745757, + "grad_norm": 0.08537238091230392, + "learning_rate": 2.4660194174757286e-06, + "loss": 0.0018, + "step": 10875 + }, + { + "epoch": 7.534464842396952, + "grad_norm": 0.30908671021461487, + "learning_rate": 2.4653259361997227e-06, + "loss": 0.0025, + "step": 10876 + }, + { + "epoch": 7.535157603048146, + "grad_norm": 0.10444451123476028, + "learning_rate": 2.4646324549237172e-06, + "loss": 0.0017, + "step": 10877 + }, + { + "epoch": 7.535850363699342, + "grad_norm": 0.3646766245365143, + "learning_rate": 2.4639389736477118e-06, + "loss": 0.0021, + "step": 10878 + }, + { + "epoch": 7.536543124350537, + "grad_norm": 0.09241663664579391, + "learning_rate": 2.4632454923717063e-06, + "loss": 0.0015, + "step": 10879 + }, + { + "epoch": 7.537235885001732, + "grad_norm": 0.061191245913505554, + "learning_rate": 2.4625520110957008e-06, + "loss": 0.0012, + "step": 10880 + }, + { + "epoch": 7.537928645652927, + "grad_norm": 0.09165678918361664, + "learning_rate": 2.4618585298196953e-06, + "loss": 0.002, + "step": 10881 + }, + { + "epoch": 7.538621406304122, + "grad_norm": 0.05164121091365814, + "learning_rate": 2.4611650485436894e-06, + "loss": 0.0013, + "step": 10882 + }, + { + "epoch": 7.539314166955317, + "grad_norm": 0.10606106370687485, + "learning_rate": 2.460471567267684e-06, + "loss": 0.0018, + "step": 10883 + }, + { + "epoch": 7.540006927606512, + "grad_norm": 0.20772795379161835, + "learning_rate": 2.4597780859916784e-06, + "loss": 0.0024, + "step": 10884 + }, + { + "epoch": 7.540699688257707, + "grad_norm": 0.09767861664295197, + "learning_rate": 2.459084604715673e-06, + "loss": 0.0015, + "step": 10885 + }, + { + "epoch": 7.541392448908902, + "grad_norm": 0.06639274209737778, + "learning_rate": 2.4583911234396674e-06, + "loss": 0.0013, + "step": 10886 + }, + { + "epoch": 7.542085209560097, + "grad_norm": 0.07911796867847443, + "learning_rate": 2.4576976421636615e-06, + "loss": 0.0015, + "step": 10887 + }, + { + "epoch": 7.542777970211292, + "grad_norm": 0.16360414028167725, + "learning_rate": 2.4570041608876564e-06, + "loss": 0.0021, + "step": 10888 + }, + { + "epoch": 7.543470730862487, + "grad_norm": 0.1348550170660019, + "learning_rate": 2.4563106796116505e-06, + "loss": 0.0022, + "step": 10889 + }, + { + "epoch": 7.544163491513682, + "grad_norm": 0.1356428563594818, + "learning_rate": 2.4556171983356454e-06, + "loss": 0.0017, + "step": 10890 + }, + { + "epoch": 7.544856252164877, + "grad_norm": 0.08549313992261887, + "learning_rate": 2.4549237170596395e-06, + "loss": 0.0015, + "step": 10891 + }, + { + "epoch": 7.545549012816072, + "grad_norm": 0.17838850617408752, + "learning_rate": 2.454230235783634e-06, + "loss": 0.0016, + "step": 10892 + }, + { + "epoch": 7.546241773467267, + "grad_norm": 0.1340678632259369, + "learning_rate": 2.4535367545076286e-06, + "loss": 0.0019, + "step": 10893 + }, + { + "epoch": 7.546934534118462, + "grad_norm": 0.06063477322459221, + "learning_rate": 2.452843273231623e-06, + "loss": 0.0012, + "step": 10894 + }, + { + "epoch": 7.547627294769657, + "grad_norm": 0.3364317715167999, + "learning_rate": 2.4521497919556176e-06, + "loss": 0.0028, + "step": 10895 + }, + { + "epoch": 7.548320055420852, + "grad_norm": 0.23646342754364014, + "learning_rate": 2.4514563106796117e-06, + "loss": 0.0017, + "step": 10896 + }, + { + "epoch": 7.549012816072047, + "grad_norm": 0.25620752573013306, + "learning_rate": 2.450762829403606e-06, + "loss": 0.0029, + "step": 10897 + }, + { + "epoch": 7.549705576723242, + "grad_norm": 0.5714073181152344, + "learning_rate": 2.4500693481276007e-06, + "loss": 0.003, + "step": 10898 + }, + { + "epoch": 7.550398337374437, + "grad_norm": 0.14835543930530548, + "learning_rate": 2.449375866851595e-06, + "loss": 0.0017, + "step": 10899 + }, + { + "epoch": 7.5510910980256325, + "grad_norm": 0.26132914423942566, + "learning_rate": 2.4486823855755897e-06, + "loss": 0.0025, + "step": 10900 + }, + { + "epoch": 7.551783858676827, + "grad_norm": 0.4453752934932709, + "learning_rate": 2.447988904299584e-06, + "loss": 0.0034, + "step": 10901 + }, + { + "epoch": 7.552476619328022, + "grad_norm": 0.15118376910686493, + "learning_rate": 2.4472954230235783e-06, + "loss": 0.0018, + "step": 10902 + }, + { + "epoch": 7.5531693799792174, + "grad_norm": 0.11513940989971161, + "learning_rate": 2.4466019417475732e-06, + "loss": 0.0017, + "step": 10903 + }, + { + "epoch": 7.553862140630412, + "grad_norm": 0.4208213984966278, + "learning_rate": 2.4459084604715673e-06, + "loss": 0.0028, + "step": 10904 + }, + { + "epoch": 7.554554901281607, + "grad_norm": 0.06580349802970886, + "learning_rate": 2.4452149791955623e-06, + "loss": 0.0014, + "step": 10905 + }, + { + "epoch": 7.555247661932802, + "grad_norm": 0.14686036109924316, + "learning_rate": 2.4445214979195563e-06, + "loss": 0.0015, + "step": 10906 + }, + { + "epoch": 7.555940422583998, + "grad_norm": 0.22417961061000824, + "learning_rate": 2.443828016643551e-06, + "loss": 0.003, + "step": 10907 + }, + { + "epoch": 7.556633183235192, + "grad_norm": 0.12621311843395233, + "learning_rate": 2.4431345353675454e-06, + "loss": 0.0023, + "step": 10908 + }, + { + "epoch": 7.557325943886387, + "grad_norm": 0.1350223869085312, + "learning_rate": 2.44244105409154e-06, + "loss": 0.0018, + "step": 10909 + }, + { + "epoch": 7.558018704537583, + "grad_norm": 0.2834029793739319, + "learning_rate": 2.4417475728155344e-06, + "loss": 0.0027, + "step": 10910 + }, + { + "epoch": 7.558711465188777, + "grad_norm": 0.21382243931293488, + "learning_rate": 2.4410540915395285e-06, + "loss": 0.003, + "step": 10911 + }, + { + "epoch": 7.559404225839972, + "grad_norm": 0.1368580311536789, + "learning_rate": 2.440360610263523e-06, + "loss": 0.0022, + "step": 10912 + }, + { + "epoch": 7.560096986491168, + "grad_norm": 0.10384763777256012, + "learning_rate": 2.4396671289875175e-06, + "loss": 0.0016, + "step": 10913 + }, + { + "epoch": 7.560789747142362, + "grad_norm": 0.28811296820640564, + "learning_rate": 2.438973647711512e-06, + "loss": 0.0025, + "step": 10914 + }, + { + "epoch": 7.561482507793557, + "grad_norm": 0.160031259059906, + "learning_rate": 2.4382801664355065e-06, + "loss": 0.0018, + "step": 10915 + }, + { + "epoch": 7.5621752684447525, + "grad_norm": 0.27531707286834717, + "learning_rate": 2.437586685159501e-06, + "loss": 0.0022, + "step": 10916 + }, + { + "epoch": 7.562868029095947, + "grad_norm": 0.3410033881664276, + "learning_rate": 2.436893203883495e-06, + "loss": 0.0032, + "step": 10917 + }, + { + "epoch": 7.563560789747142, + "grad_norm": 0.12519578635692596, + "learning_rate": 2.43619972260749e-06, + "loss": 0.0018, + "step": 10918 + }, + { + "epoch": 7.5642535503983375, + "grad_norm": 0.1801953762769699, + "learning_rate": 2.435506241331484e-06, + "loss": 0.0017, + "step": 10919 + }, + { + "epoch": 7.564946311049533, + "grad_norm": 0.07746082544326782, + "learning_rate": 2.4348127600554786e-06, + "loss": 0.0013, + "step": 10920 + }, + { + "epoch": 7.565639071700727, + "grad_norm": 0.08023972064256668, + "learning_rate": 2.434119278779473e-06, + "loss": 0.0015, + "step": 10921 + }, + { + "epoch": 7.566331832351922, + "grad_norm": 0.218221977353096, + "learning_rate": 2.4334257975034676e-06, + "loss": 0.0018, + "step": 10922 + }, + { + "epoch": 7.567024593003118, + "grad_norm": 0.4734841585159302, + "learning_rate": 2.432732316227462e-06, + "loss": 0.0029, + "step": 10923 + }, + { + "epoch": 7.567717353654312, + "grad_norm": 0.27004826068878174, + "learning_rate": 2.4320388349514567e-06, + "loss": 0.0024, + "step": 10924 + }, + { + "epoch": 7.568410114305507, + "grad_norm": 0.17549893260002136, + "learning_rate": 2.4313453536754508e-06, + "loss": 0.0019, + "step": 10925 + }, + { + "epoch": 7.569102874956703, + "grad_norm": 0.1410718560218811, + "learning_rate": 2.4306518723994453e-06, + "loss": 0.0016, + "step": 10926 + }, + { + "epoch": 7.569795635607898, + "grad_norm": 0.0840911790728569, + "learning_rate": 2.4299583911234398e-06, + "loss": 0.0015, + "step": 10927 + }, + { + "epoch": 7.570488396259092, + "grad_norm": 0.17833329737186432, + "learning_rate": 2.4292649098474343e-06, + "loss": 0.0025, + "step": 10928 + }, + { + "epoch": 7.571181156910288, + "grad_norm": 0.1955181509256363, + "learning_rate": 2.428571428571429e-06, + "loss": 0.0024, + "step": 10929 + }, + { + "epoch": 7.571873917561483, + "grad_norm": 0.2516569495201111, + "learning_rate": 2.427877947295423e-06, + "loss": 0.0027, + "step": 10930 + }, + { + "epoch": 7.572566678212677, + "grad_norm": 0.1913844347000122, + "learning_rate": 2.427184466019418e-06, + "loss": 0.0026, + "step": 10931 + }, + { + "epoch": 7.5732594388638725, + "grad_norm": 0.11958225071430206, + "learning_rate": 2.426490984743412e-06, + "loss": 0.0018, + "step": 10932 + }, + { + "epoch": 7.573952199515068, + "grad_norm": 0.11717060208320618, + "learning_rate": 2.425797503467407e-06, + "loss": 0.0016, + "step": 10933 + }, + { + "epoch": 7.574644960166262, + "grad_norm": 0.3873917758464813, + "learning_rate": 2.425104022191401e-06, + "loss": 0.002, + "step": 10934 + }, + { + "epoch": 7.5753377208174575, + "grad_norm": 0.17179837822914124, + "learning_rate": 2.4244105409153954e-06, + "loss": 0.002, + "step": 10935 + }, + { + "epoch": 7.576030481468653, + "grad_norm": 0.1312446892261505, + "learning_rate": 2.42371705963939e-06, + "loss": 0.0019, + "step": 10936 + }, + { + "epoch": 7.576723242119847, + "grad_norm": 0.08735194057226181, + "learning_rate": 2.4230235783633844e-06, + "loss": 0.0014, + "step": 10937 + }, + { + "epoch": 7.577416002771042, + "grad_norm": 0.3911072611808777, + "learning_rate": 2.422330097087379e-06, + "loss": 0.0021, + "step": 10938 + }, + { + "epoch": 7.578108763422238, + "grad_norm": 0.14175733923912048, + "learning_rate": 2.4216366158113735e-06, + "loss": 0.003, + "step": 10939 + }, + { + "epoch": 7.578801524073433, + "grad_norm": 0.2668286859989166, + "learning_rate": 2.4209431345353676e-06, + "loss": 0.0022, + "step": 10940 + }, + { + "epoch": 7.579494284724627, + "grad_norm": 0.12190745025873184, + "learning_rate": 2.420249653259362e-06, + "loss": 0.0015, + "step": 10941 + }, + { + "epoch": 7.580187045375823, + "grad_norm": 0.1163494661450386, + "learning_rate": 2.4195561719833566e-06, + "loss": 0.0015, + "step": 10942 + }, + { + "epoch": 7.580879806027018, + "grad_norm": 0.10534106194972992, + "learning_rate": 2.418862690707351e-06, + "loss": 0.0019, + "step": 10943 + }, + { + "epoch": 7.581572566678212, + "grad_norm": 0.13300900161266327, + "learning_rate": 2.4181692094313456e-06, + "loss": 0.0019, + "step": 10944 + }, + { + "epoch": 7.582265327329408, + "grad_norm": 0.10963454842567444, + "learning_rate": 2.4174757281553397e-06, + "loss": 0.0016, + "step": 10945 + }, + { + "epoch": 7.582958087980603, + "grad_norm": 1.2136335372924805, + "learning_rate": 2.4167822468793346e-06, + "loss": 0.0039, + "step": 10946 + }, + { + "epoch": 7.583650848631798, + "grad_norm": 0.12151437252759933, + "learning_rate": 2.4160887656033287e-06, + "loss": 0.0019, + "step": 10947 + }, + { + "epoch": 7.584343609282993, + "grad_norm": 0.09788265824317932, + "learning_rate": 2.4153952843273236e-06, + "loss": 0.0016, + "step": 10948 + }, + { + "epoch": 7.585036369934188, + "grad_norm": 0.09877084195613861, + "learning_rate": 2.4147018030513177e-06, + "loss": 0.0016, + "step": 10949 + }, + { + "epoch": 7.585729130585383, + "grad_norm": 0.06931530684232712, + "learning_rate": 2.4140083217753122e-06, + "loss": 0.0015, + "step": 10950 + }, + { + "epoch": 7.5864218912365775, + "grad_norm": 0.16568750143051147, + "learning_rate": 2.4133148404993067e-06, + "loss": 0.0027, + "step": 10951 + }, + { + "epoch": 7.587114651887773, + "grad_norm": 0.21498282253742218, + "learning_rate": 2.4126213592233013e-06, + "loss": 0.0019, + "step": 10952 + }, + { + "epoch": 7.587807412538968, + "grad_norm": 0.236845925450325, + "learning_rate": 2.4119278779472958e-06, + "loss": 0.0015, + "step": 10953 + }, + { + "epoch": 7.5885001731901625, + "grad_norm": 0.12705692648887634, + "learning_rate": 2.41123439667129e-06, + "loss": 0.0013, + "step": 10954 + }, + { + "epoch": 7.589192933841358, + "grad_norm": 0.21156752109527588, + "learning_rate": 2.4105409153952844e-06, + "loss": 0.0019, + "step": 10955 + }, + { + "epoch": 7.589885694492553, + "grad_norm": 0.15537095069885254, + "learning_rate": 2.409847434119279e-06, + "loss": 0.0018, + "step": 10956 + }, + { + "epoch": 7.590578455143747, + "grad_norm": 0.22730201482772827, + "learning_rate": 2.4091539528432734e-06, + "loss": 0.0031, + "step": 10957 + }, + { + "epoch": 7.591271215794943, + "grad_norm": 0.19640260934829712, + "learning_rate": 2.408460471567268e-06, + "loss": 0.0021, + "step": 10958 + }, + { + "epoch": 7.591963976446138, + "grad_norm": 0.10736030340194702, + "learning_rate": 2.4077669902912624e-06, + "loss": 0.0019, + "step": 10959 + }, + { + "epoch": 7.592656737097333, + "grad_norm": 0.07244516909122467, + "learning_rate": 2.4070735090152565e-06, + "loss": 0.0013, + "step": 10960 + }, + { + "epoch": 7.593349497748528, + "grad_norm": 0.4546343982219696, + "learning_rate": 2.4063800277392514e-06, + "loss": 0.0028, + "step": 10961 + }, + { + "epoch": 7.594042258399723, + "grad_norm": 0.13772660493850708, + "learning_rate": 2.4056865464632455e-06, + "loss": 0.0022, + "step": 10962 + }, + { + "epoch": 7.594735019050918, + "grad_norm": 0.1508208066225052, + "learning_rate": 2.4049930651872404e-06, + "loss": 0.0018, + "step": 10963 + }, + { + "epoch": 7.595427779702113, + "grad_norm": 0.21987207233905792, + "learning_rate": 2.4042995839112345e-06, + "loss": 0.002, + "step": 10964 + }, + { + "epoch": 7.596120540353308, + "grad_norm": 0.12725728750228882, + "learning_rate": 2.403606102635229e-06, + "loss": 0.0017, + "step": 10965 + }, + { + "epoch": 7.596813301004503, + "grad_norm": 0.11235237866640091, + "learning_rate": 2.4029126213592235e-06, + "loss": 0.0018, + "step": 10966 + }, + { + "epoch": 7.597506061655698, + "grad_norm": 0.1461298167705536, + "learning_rate": 2.402219140083218e-06, + "loss": 0.0022, + "step": 10967 + }, + { + "epoch": 7.598198822306893, + "grad_norm": 0.10823129862546921, + "learning_rate": 2.4015256588072126e-06, + "loss": 0.0017, + "step": 10968 + }, + { + "epoch": 7.598891582958088, + "grad_norm": 0.2289755642414093, + "learning_rate": 2.4008321775312066e-06, + "loss": 0.0021, + "step": 10969 + }, + { + "epoch": 7.599584343609283, + "grad_norm": 0.40313395857810974, + "learning_rate": 2.400138696255201e-06, + "loss": 0.0029, + "step": 10970 + }, + { + "epoch": 7.600277104260478, + "grad_norm": 0.07987464964389801, + "learning_rate": 2.3994452149791957e-06, + "loss": 0.0016, + "step": 10971 + }, + { + "epoch": 7.600969864911673, + "grad_norm": 0.09876357764005661, + "learning_rate": 2.39875173370319e-06, + "loss": 0.0017, + "step": 10972 + }, + { + "epoch": 7.601662625562868, + "grad_norm": 0.08993715047836304, + "learning_rate": 2.3980582524271847e-06, + "loss": 0.0017, + "step": 10973 + }, + { + "epoch": 7.602355386214063, + "grad_norm": 0.1425146609544754, + "learning_rate": 2.397364771151179e-06, + "loss": 0.0019, + "step": 10974 + }, + { + "epoch": 7.603048146865258, + "grad_norm": 0.5991483926773071, + "learning_rate": 2.3966712898751733e-06, + "loss": 0.0032, + "step": 10975 + }, + { + "epoch": 7.603740907516453, + "grad_norm": 0.11611495912075043, + "learning_rate": 2.3959778085991682e-06, + "loss": 0.0014, + "step": 10976 + }, + { + "epoch": 7.604433668167648, + "grad_norm": 0.11563742905855179, + "learning_rate": 2.3952843273231623e-06, + "loss": 0.0021, + "step": 10977 + }, + { + "epoch": 7.605126428818843, + "grad_norm": 0.15784800052642822, + "learning_rate": 2.394590846047157e-06, + "loss": 0.0017, + "step": 10978 + }, + { + "epoch": 7.605819189470038, + "grad_norm": 0.12296920269727707, + "learning_rate": 2.3938973647711513e-06, + "loss": 0.0014, + "step": 10979 + }, + { + "epoch": 7.6065119501212335, + "grad_norm": 0.2830219566822052, + "learning_rate": 2.393203883495146e-06, + "loss": 0.0018, + "step": 10980 + }, + { + "epoch": 7.607204710772428, + "grad_norm": 0.11406715214252472, + "learning_rate": 2.3925104022191403e-06, + "loss": 0.0021, + "step": 10981 + }, + { + "epoch": 7.607897471423623, + "grad_norm": 0.1440214216709137, + "learning_rate": 2.391816920943135e-06, + "loss": 0.0016, + "step": 10982 + }, + { + "epoch": 7.608590232074818, + "grad_norm": 0.17547141015529633, + "learning_rate": 2.3911234396671294e-06, + "loss": 0.002, + "step": 10983 + }, + { + "epoch": 7.609282992726013, + "grad_norm": 0.18593452870845795, + "learning_rate": 2.3904299583911235e-06, + "loss": 0.0019, + "step": 10984 + }, + { + "epoch": 7.609975753377208, + "grad_norm": 0.07089821994304657, + "learning_rate": 2.389736477115118e-06, + "loss": 0.0014, + "step": 10985 + }, + { + "epoch": 7.610668514028403, + "grad_norm": 0.10841097682714462, + "learning_rate": 2.3890429958391125e-06, + "loss": 0.0015, + "step": 10986 + }, + { + "epoch": 7.611361274679599, + "grad_norm": 0.14363816380500793, + "learning_rate": 2.388349514563107e-06, + "loss": 0.0019, + "step": 10987 + }, + { + "epoch": 7.612054035330793, + "grad_norm": 0.16981837153434753, + "learning_rate": 2.3876560332871015e-06, + "loss": 0.0019, + "step": 10988 + }, + { + "epoch": 7.612746795981988, + "grad_norm": 0.22910691797733307, + "learning_rate": 2.386962552011096e-06, + "loss": 0.0026, + "step": 10989 + }, + { + "epoch": 7.613439556633184, + "grad_norm": 0.2676429748535156, + "learning_rate": 2.38626907073509e-06, + "loss": 0.0018, + "step": 10990 + }, + { + "epoch": 7.614132317284378, + "grad_norm": 0.09594810754060745, + "learning_rate": 2.385575589459085e-06, + "loss": 0.0017, + "step": 10991 + }, + { + "epoch": 7.614825077935573, + "grad_norm": 0.16593818366527557, + "learning_rate": 2.384882108183079e-06, + "loss": 0.0029, + "step": 10992 + }, + { + "epoch": 7.615517838586769, + "grad_norm": 0.07593372464179993, + "learning_rate": 2.3841886269070736e-06, + "loss": 0.0015, + "step": 10993 + }, + { + "epoch": 7.616210599237963, + "grad_norm": 0.12001953274011612, + "learning_rate": 2.383495145631068e-06, + "loss": 0.0014, + "step": 10994 + }, + { + "epoch": 7.616903359889158, + "grad_norm": 0.09538743644952774, + "learning_rate": 2.3828016643550626e-06, + "loss": 0.0018, + "step": 10995 + }, + { + "epoch": 7.6175961205403535, + "grad_norm": 0.06194775551557541, + "learning_rate": 2.382108183079057e-06, + "loss": 0.0012, + "step": 10996 + }, + { + "epoch": 7.618288881191548, + "grad_norm": 0.12936419248580933, + "learning_rate": 2.3814147018030517e-06, + "loss": 0.0014, + "step": 10997 + }, + { + "epoch": 7.618981641842743, + "grad_norm": 0.08678889274597168, + "learning_rate": 2.380721220527046e-06, + "loss": 0.0016, + "step": 10998 + }, + { + "epoch": 7.6196744024939385, + "grad_norm": 0.09448511898517609, + "learning_rate": 2.3800277392510403e-06, + "loss": 0.0015, + "step": 10999 + }, + { + "epoch": 7.620367163145134, + "grad_norm": 0.2024964988231659, + "learning_rate": 2.3793342579750348e-06, + "loss": 0.0021, + "step": 11000 + }, + { + "epoch": 7.621059923796328, + "grad_norm": 0.12435255944728851, + "learning_rate": 2.3786407766990293e-06, + "loss": 0.0019, + "step": 11001 + }, + { + "epoch": 7.621752684447523, + "grad_norm": 0.20043282210826874, + "learning_rate": 2.3779472954230238e-06, + "loss": 0.0029, + "step": 11002 + }, + { + "epoch": 7.622445445098719, + "grad_norm": 0.15249516069889069, + "learning_rate": 2.3772538141470183e-06, + "loss": 0.0024, + "step": 11003 + }, + { + "epoch": 7.623138205749913, + "grad_norm": 0.19048525393009186, + "learning_rate": 2.376560332871013e-06, + "loss": 0.0018, + "step": 11004 + }, + { + "epoch": 7.623830966401108, + "grad_norm": 0.1399148851633072, + "learning_rate": 2.375866851595007e-06, + "loss": 0.0017, + "step": 11005 + }, + { + "epoch": 7.624523727052304, + "grad_norm": 0.3415045142173767, + "learning_rate": 2.375173370319002e-06, + "loss": 0.0044, + "step": 11006 + }, + { + "epoch": 7.625216487703499, + "grad_norm": 0.05213839188218117, + "learning_rate": 2.374479889042996e-06, + "loss": 0.0011, + "step": 11007 + }, + { + "epoch": 7.625909248354693, + "grad_norm": 0.1701638102531433, + "learning_rate": 2.3737864077669904e-06, + "loss": 0.0025, + "step": 11008 + }, + { + "epoch": 7.626602009005889, + "grad_norm": 0.16347801685333252, + "learning_rate": 2.373092926490985e-06, + "loss": 0.0026, + "step": 11009 + }, + { + "epoch": 7.627294769657084, + "grad_norm": 0.07998019456863403, + "learning_rate": 2.3723994452149794e-06, + "loss": 0.0015, + "step": 11010 + }, + { + "epoch": 7.627987530308278, + "grad_norm": 0.11584939807653427, + "learning_rate": 2.371705963938974e-06, + "loss": 0.0017, + "step": 11011 + }, + { + "epoch": 7.6286802909594735, + "grad_norm": 0.2052362859249115, + "learning_rate": 2.371012482662968e-06, + "loss": 0.0018, + "step": 11012 + }, + { + "epoch": 7.629373051610669, + "grad_norm": 0.6242120265960693, + "learning_rate": 2.370319001386963e-06, + "loss": 0.0022, + "step": 11013 + }, + { + "epoch": 7.630065812261863, + "grad_norm": 0.18045473098754883, + "learning_rate": 2.369625520110957e-06, + "loss": 0.0018, + "step": 11014 + }, + { + "epoch": 7.6307585729130585, + "grad_norm": 0.3712784945964813, + "learning_rate": 2.3689320388349516e-06, + "loss": 0.0028, + "step": 11015 + }, + { + "epoch": 7.631451333564254, + "grad_norm": 0.12799879908561707, + "learning_rate": 2.368238557558946e-06, + "loss": 0.0017, + "step": 11016 + }, + { + "epoch": 7.632144094215448, + "grad_norm": 0.10460387915372849, + "learning_rate": 2.3675450762829406e-06, + "loss": 0.0017, + "step": 11017 + }, + { + "epoch": 7.632836854866643, + "grad_norm": 0.13728325068950653, + "learning_rate": 2.366851595006935e-06, + "loss": 0.0021, + "step": 11018 + }, + { + "epoch": 7.633529615517839, + "grad_norm": 0.23726387321949005, + "learning_rate": 2.3661581137309296e-06, + "loss": 0.0018, + "step": 11019 + }, + { + "epoch": 7.634222376169034, + "grad_norm": 0.3002427816390991, + "learning_rate": 2.3654646324549237e-06, + "loss": 0.0022, + "step": 11020 + }, + { + "epoch": 7.634915136820228, + "grad_norm": 0.18956208229064941, + "learning_rate": 2.3647711511789186e-06, + "loss": 0.0018, + "step": 11021 + }, + { + "epoch": 7.635607897471424, + "grad_norm": 0.15530270338058472, + "learning_rate": 2.3640776699029127e-06, + "loss": 0.0015, + "step": 11022 + }, + { + "epoch": 7.636300658122619, + "grad_norm": 0.14310528337955475, + "learning_rate": 2.3633841886269072e-06, + "loss": 0.0026, + "step": 11023 + }, + { + "epoch": 7.636993418773813, + "grad_norm": 0.11291219294071198, + "learning_rate": 2.3626907073509017e-06, + "loss": 0.0016, + "step": 11024 + }, + { + "epoch": 7.637686179425009, + "grad_norm": 0.12167590856552124, + "learning_rate": 2.3619972260748962e-06, + "loss": 0.0017, + "step": 11025 + }, + { + "epoch": 7.638378940076204, + "grad_norm": 0.13032810389995575, + "learning_rate": 2.3613037447988907e-06, + "loss": 0.0014, + "step": 11026 + }, + { + "epoch": 7.639071700727399, + "grad_norm": 0.15263308584690094, + "learning_rate": 2.360610263522885e-06, + "loss": 0.0018, + "step": 11027 + }, + { + "epoch": 7.6397644613785936, + "grad_norm": 0.14499661326408386, + "learning_rate": 2.3599167822468798e-06, + "loss": 0.0017, + "step": 11028 + }, + { + "epoch": 7.640457222029789, + "grad_norm": 0.1629236340522766, + "learning_rate": 2.359223300970874e-06, + "loss": 0.0024, + "step": 11029 + }, + { + "epoch": 7.641149982680984, + "grad_norm": 0.23310644924640656, + "learning_rate": 2.3585298196948684e-06, + "loss": 0.0024, + "step": 11030 + }, + { + "epoch": 7.6418427433321785, + "grad_norm": 0.26413801312446594, + "learning_rate": 2.357836338418863e-06, + "loss": 0.0019, + "step": 11031 + }, + { + "epoch": 7.642535503983374, + "grad_norm": 0.1381988525390625, + "learning_rate": 2.3571428571428574e-06, + "loss": 0.0019, + "step": 11032 + }, + { + "epoch": 7.643228264634569, + "grad_norm": 0.20399077236652374, + "learning_rate": 2.356449375866852e-06, + "loss": 0.0021, + "step": 11033 + }, + { + "epoch": 7.6439210252857634, + "grad_norm": 0.134286567568779, + "learning_rate": 2.3557558945908464e-06, + "loss": 0.0019, + "step": 11034 + }, + { + "epoch": 7.644613785936959, + "grad_norm": 0.15523633360862732, + "learning_rate": 2.3550624133148405e-06, + "loss": 0.0017, + "step": 11035 + }, + { + "epoch": 7.645306546588154, + "grad_norm": 0.1477729082107544, + "learning_rate": 2.354368932038835e-06, + "loss": 0.0018, + "step": 11036 + }, + { + "epoch": 7.645999307239348, + "grad_norm": 0.15880073606967926, + "learning_rate": 2.3536754507628295e-06, + "loss": 0.002, + "step": 11037 + }, + { + "epoch": 7.646692067890544, + "grad_norm": 0.22143501043319702, + "learning_rate": 2.352981969486824e-06, + "loss": 0.0023, + "step": 11038 + }, + { + "epoch": 7.647384828541739, + "grad_norm": 0.06278867274522781, + "learning_rate": 2.3522884882108185e-06, + "loss": 0.0013, + "step": 11039 + }, + { + "epoch": 7.648077589192933, + "grad_norm": 0.0775415450334549, + "learning_rate": 2.351595006934813e-06, + "loss": 0.0014, + "step": 11040 + }, + { + "epoch": 7.648770349844129, + "grad_norm": 0.31963226199150085, + "learning_rate": 2.3509015256588075e-06, + "loss": 0.0029, + "step": 11041 + }, + { + "epoch": 7.649463110495324, + "grad_norm": 0.07944449782371521, + "learning_rate": 2.3502080443828016e-06, + "loss": 0.0015, + "step": 11042 + }, + { + "epoch": 7.650155871146519, + "grad_norm": 0.06306667625904083, + "learning_rate": 2.3495145631067966e-06, + "loss": 0.0014, + "step": 11043 + }, + { + "epoch": 7.650848631797714, + "grad_norm": 0.13178983330726624, + "learning_rate": 2.3488210818307907e-06, + "loss": 0.0015, + "step": 11044 + }, + { + "epoch": 7.651541392448909, + "grad_norm": 0.09785177558660507, + "learning_rate": 2.348127600554785e-06, + "loss": 0.0017, + "step": 11045 + }, + { + "epoch": 7.652234153100104, + "grad_norm": 0.235648974776268, + "learning_rate": 2.3474341192787797e-06, + "loss": 0.0038, + "step": 11046 + }, + { + "epoch": 7.652926913751299, + "grad_norm": 0.0811808779835701, + "learning_rate": 2.346740638002774e-06, + "loss": 0.0015, + "step": 11047 + }, + { + "epoch": 7.653619674402494, + "grad_norm": 0.2671070992946625, + "learning_rate": 2.3460471567267687e-06, + "loss": 0.0032, + "step": 11048 + }, + { + "epoch": 7.654312435053689, + "grad_norm": 0.06746388226747513, + "learning_rate": 2.345353675450763e-06, + "loss": 0.0013, + "step": 11049 + }, + { + "epoch": 7.655005195704884, + "grad_norm": 0.1766207218170166, + "learning_rate": 2.3446601941747573e-06, + "loss": 0.0026, + "step": 11050 + }, + { + "epoch": 7.655697956356079, + "grad_norm": 0.095545694231987, + "learning_rate": 2.343966712898752e-06, + "loss": 0.0013, + "step": 11051 + }, + { + "epoch": 7.656390717007274, + "grad_norm": 0.14303840696811676, + "learning_rate": 2.3432732316227463e-06, + "loss": 0.0025, + "step": 11052 + }, + { + "epoch": 7.657083477658469, + "grad_norm": 0.09510090947151184, + "learning_rate": 2.342579750346741e-06, + "loss": 0.0016, + "step": 11053 + }, + { + "epoch": 7.657776238309664, + "grad_norm": 0.10317273437976837, + "learning_rate": 2.3418862690707353e-06, + "loss": 0.0016, + "step": 11054 + }, + { + "epoch": 7.658468998960859, + "grad_norm": 0.14392735064029694, + "learning_rate": 2.34119278779473e-06, + "loss": 0.0018, + "step": 11055 + }, + { + "epoch": 7.659161759612054, + "grad_norm": 0.13661417365074158, + "learning_rate": 2.3404993065187244e-06, + "loss": 0.002, + "step": 11056 + }, + { + "epoch": 7.659854520263249, + "grad_norm": 0.5133945345878601, + "learning_rate": 2.3398058252427184e-06, + "loss": 0.0021, + "step": 11057 + }, + { + "epoch": 7.660547280914444, + "grad_norm": 0.16885030269622803, + "learning_rate": 2.3391123439667134e-06, + "loss": 0.0034, + "step": 11058 + }, + { + "epoch": 7.661240041565639, + "grad_norm": 0.17294436693191528, + "learning_rate": 2.3384188626907075e-06, + "loss": 0.0025, + "step": 11059 + }, + { + "epoch": 7.661932802216834, + "grad_norm": 0.1565319448709488, + "learning_rate": 2.337725381414702e-06, + "loss": 0.002, + "step": 11060 + }, + { + "epoch": 7.662625562868029, + "grad_norm": 0.19181755185127258, + "learning_rate": 2.3370319001386965e-06, + "loss": 0.0022, + "step": 11061 + }, + { + "epoch": 7.663318323519224, + "grad_norm": 0.13065962493419647, + "learning_rate": 2.336338418862691e-06, + "loss": 0.0034, + "step": 11062 + }, + { + "epoch": 7.664011084170419, + "grad_norm": 0.14209194481372833, + "learning_rate": 2.3356449375866855e-06, + "loss": 0.0021, + "step": 11063 + }, + { + "epoch": 7.664703844821614, + "grad_norm": 0.06735049933195114, + "learning_rate": 2.33495145631068e-06, + "loss": 0.0013, + "step": 11064 + }, + { + "epoch": 7.665396605472809, + "grad_norm": 0.17228339612483978, + "learning_rate": 2.334257975034674e-06, + "loss": 0.002, + "step": 11065 + }, + { + "epoch": 7.666089366124004, + "grad_norm": 0.25487828254699707, + "learning_rate": 2.3335644937586686e-06, + "loss": 0.0023, + "step": 11066 + }, + { + "epoch": 7.6667821267752, + "grad_norm": 0.11353092640638351, + "learning_rate": 2.332871012482663e-06, + "loss": 0.0021, + "step": 11067 + }, + { + "epoch": 7.667474887426394, + "grad_norm": 0.11350497603416443, + "learning_rate": 2.3321775312066576e-06, + "loss": 0.0015, + "step": 11068 + }, + { + "epoch": 7.668167648077589, + "grad_norm": 0.13392123579978943, + "learning_rate": 2.331484049930652e-06, + "loss": 0.0014, + "step": 11069 + }, + { + "epoch": 7.668860408728785, + "grad_norm": 0.1440533995628357, + "learning_rate": 2.3307905686546462e-06, + "loss": 0.0023, + "step": 11070 + }, + { + "epoch": 7.669553169379979, + "grad_norm": 0.6629919409751892, + "learning_rate": 2.330097087378641e-06, + "loss": 0.0022, + "step": 11071 + }, + { + "epoch": 7.670245930031174, + "grad_norm": 0.06951063126325607, + "learning_rate": 2.3294036061026352e-06, + "loss": 0.0015, + "step": 11072 + }, + { + "epoch": 7.6709386906823696, + "grad_norm": 0.14044804871082306, + "learning_rate": 2.32871012482663e-06, + "loss": 0.0022, + "step": 11073 + }, + { + "epoch": 7.671631451333564, + "grad_norm": 0.1531040072441101, + "learning_rate": 2.3280166435506243e-06, + "loss": 0.0016, + "step": 11074 + }, + { + "epoch": 7.672324211984759, + "grad_norm": 0.180302694439888, + "learning_rate": 2.3273231622746188e-06, + "loss": 0.0018, + "step": 11075 + }, + { + "epoch": 7.6730169726359545, + "grad_norm": 0.17087845504283905, + "learning_rate": 2.3266296809986133e-06, + "loss": 0.0023, + "step": 11076 + }, + { + "epoch": 7.673709733287149, + "grad_norm": 0.44742295145988464, + "learning_rate": 2.3259361997226078e-06, + "loss": 0.0031, + "step": 11077 + }, + { + "epoch": 7.674402493938344, + "grad_norm": 0.18731926381587982, + "learning_rate": 2.3252427184466023e-06, + "loss": 0.0033, + "step": 11078 + }, + { + "epoch": 7.6750952545895395, + "grad_norm": 0.24815206229686737, + "learning_rate": 2.324549237170597e-06, + "loss": 0.0026, + "step": 11079 + }, + { + "epoch": 7.675788015240734, + "grad_norm": 0.08992894738912582, + "learning_rate": 2.323855755894591e-06, + "loss": 0.0014, + "step": 11080 + }, + { + "epoch": 7.676480775891929, + "grad_norm": 0.11693129688501358, + "learning_rate": 2.3231622746185854e-06, + "loss": 0.0021, + "step": 11081 + }, + { + "epoch": 7.677173536543124, + "grad_norm": 0.10862977802753448, + "learning_rate": 2.32246879334258e-06, + "loss": 0.0017, + "step": 11082 + }, + { + "epoch": 7.67786629719432, + "grad_norm": 0.07881202548742294, + "learning_rate": 2.3217753120665744e-06, + "loss": 0.0015, + "step": 11083 + }, + { + "epoch": 7.678559057845514, + "grad_norm": 0.09780140966176987, + "learning_rate": 2.321081830790569e-06, + "loss": 0.0019, + "step": 11084 + }, + { + "epoch": 7.679251818496709, + "grad_norm": 0.09575121849775314, + "learning_rate": 2.320388349514563e-06, + "loss": 0.0018, + "step": 11085 + }, + { + "epoch": 7.679944579147905, + "grad_norm": 0.33804455399513245, + "learning_rate": 2.319694868238558e-06, + "loss": 0.0018, + "step": 11086 + }, + { + "epoch": 7.6806373397991, + "grad_norm": 0.4046388268470764, + "learning_rate": 2.319001386962552e-06, + "loss": 0.0022, + "step": 11087 + }, + { + "epoch": 7.681330100450294, + "grad_norm": 0.11567312479019165, + "learning_rate": 2.318307905686547e-06, + "loss": 0.0019, + "step": 11088 + }, + { + "epoch": 7.68202286110149, + "grad_norm": 0.10589355230331421, + "learning_rate": 2.317614424410541e-06, + "loss": 0.0016, + "step": 11089 + }, + { + "epoch": 7.682715621752685, + "grad_norm": 0.34544435143470764, + "learning_rate": 2.3169209431345356e-06, + "loss": 0.0029, + "step": 11090 + }, + { + "epoch": 7.683408382403879, + "grad_norm": 0.14481580257415771, + "learning_rate": 2.31622746185853e-06, + "loss": 0.0019, + "step": 11091 + }, + { + "epoch": 7.6841011430550745, + "grad_norm": 0.12427419424057007, + "learning_rate": 2.3155339805825246e-06, + "loss": 0.0021, + "step": 11092 + }, + { + "epoch": 7.68479390370627, + "grad_norm": 0.11880215257406235, + "learning_rate": 2.314840499306519e-06, + "loss": 0.0013, + "step": 11093 + }, + { + "epoch": 7.685486664357464, + "grad_norm": 0.1104602962732315, + "learning_rate": 2.314147018030513e-06, + "loss": 0.0017, + "step": 11094 + }, + { + "epoch": 7.6861794250086595, + "grad_norm": 0.11313404887914658, + "learning_rate": 2.3134535367545077e-06, + "loss": 0.0015, + "step": 11095 + }, + { + "epoch": 7.686872185659855, + "grad_norm": 0.1580561250448227, + "learning_rate": 2.312760055478502e-06, + "loss": 0.0018, + "step": 11096 + }, + { + "epoch": 7.687564946311049, + "grad_norm": 0.0821509137749672, + "learning_rate": 2.3120665742024967e-06, + "loss": 0.0019, + "step": 11097 + }, + { + "epoch": 7.688257706962244, + "grad_norm": 0.32258695363998413, + "learning_rate": 2.3113730929264912e-06, + "loss": 0.0021, + "step": 11098 + }, + { + "epoch": 7.68895046761344, + "grad_norm": 0.1450721025466919, + "learning_rate": 2.3106796116504857e-06, + "loss": 0.0016, + "step": 11099 + }, + { + "epoch": 7.689643228264634, + "grad_norm": 0.37808188796043396, + "learning_rate": 2.30998613037448e-06, + "loss": 0.0057, + "step": 11100 + }, + { + "epoch": 7.690335988915829, + "grad_norm": 0.09213859587907791, + "learning_rate": 2.3092926490984748e-06, + "loss": 0.0019, + "step": 11101 + }, + { + "epoch": 7.691028749567025, + "grad_norm": 0.08755435794591904, + "learning_rate": 2.308599167822469e-06, + "loss": 0.0017, + "step": 11102 + }, + { + "epoch": 7.69172151021822, + "grad_norm": 0.28027504682540894, + "learning_rate": 2.3079056865464634e-06, + "loss": 0.0017, + "step": 11103 + }, + { + "epoch": 7.692414270869414, + "grad_norm": 0.41618475317955017, + "learning_rate": 2.307212205270458e-06, + "loss": 0.0018, + "step": 11104 + }, + { + "epoch": 7.69310703152061, + "grad_norm": 0.33685627579689026, + "learning_rate": 2.3065187239944524e-06, + "loss": 0.002, + "step": 11105 + }, + { + "epoch": 7.693799792171805, + "grad_norm": 0.18762889504432678, + "learning_rate": 2.305825242718447e-06, + "loss": 0.0016, + "step": 11106 + }, + { + "epoch": 7.694492552822999, + "grad_norm": 0.10574682801961899, + "learning_rate": 2.3051317614424414e-06, + "loss": 0.0018, + "step": 11107 + }, + { + "epoch": 7.6951853134741945, + "grad_norm": 0.12032246589660645, + "learning_rate": 2.3044382801664355e-06, + "loss": 0.0015, + "step": 11108 + }, + { + "epoch": 7.69587807412539, + "grad_norm": 0.1229749470949173, + "learning_rate": 2.30374479889043e-06, + "loss": 0.0019, + "step": 11109 + }, + { + "epoch": 7.696570834776585, + "grad_norm": 0.17300957441329956, + "learning_rate": 2.3030513176144245e-06, + "loss": 0.0019, + "step": 11110 + }, + { + "epoch": 7.6972635954277795, + "grad_norm": 0.09111493080854416, + "learning_rate": 2.302357836338419e-06, + "loss": 0.0017, + "step": 11111 + }, + { + "epoch": 7.697956356078975, + "grad_norm": 0.3462311029434204, + "learning_rate": 2.3016643550624135e-06, + "loss": 0.0024, + "step": 11112 + }, + { + "epoch": 7.69864911673017, + "grad_norm": 0.14243848621845245, + "learning_rate": 2.300970873786408e-06, + "loss": 0.0017, + "step": 11113 + }, + { + "epoch": 7.699341877381364, + "grad_norm": 0.07458194345235825, + "learning_rate": 2.3002773925104025e-06, + "loss": 0.0017, + "step": 11114 + }, + { + "epoch": 7.70003463803256, + "grad_norm": 0.12619180977344513, + "learning_rate": 2.2995839112343966e-06, + "loss": 0.0017, + "step": 11115 + }, + { + "epoch": 7.700727398683755, + "grad_norm": 0.14020708203315735, + "learning_rate": 2.2988904299583916e-06, + "loss": 0.0034, + "step": 11116 + }, + { + "epoch": 7.701420159334949, + "grad_norm": 0.10561706870794296, + "learning_rate": 2.2981969486823856e-06, + "loss": 0.0019, + "step": 11117 + }, + { + "epoch": 7.702112919986145, + "grad_norm": 0.1736873984336853, + "learning_rate": 2.29750346740638e-06, + "loss": 0.0019, + "step": 11118 + }, + { + "epoch": 7.70280568063734, + "grad_norm": 0.12339480966329575, + "learning_rate": 2.2968099861303747e-06, + "loss": 0.0019, + "step": 11119 + }, + { + "epoch": 7.703498441288534, + "grad_norm": 0.1351640671491623, + "learning_rate": 2.296116504854369e-06, + "loss": 0.0017, + "step": 11120 + }, + { + "epoch": 7.70419120193973, + "grad_norm": 0.15358518064022064, + "learning_rate": 2.2954230235783637e-06, + "loss": 0.002, + "step": 11121 + }, + { + "epoch": 7.704883962590925, + "grad_norm": 0.24224039912223816, + "learning_rate": 2.294729542302358e-06, + "loss": 0.003, + "step": 11122 + }, + { + "epoch": 7.70557672324212, + "grad_norm": 0.1176552027463913, + "learning_rate": 2.2940360610263523e-06, + "loss": 0.0017, + "step": 11123 + }, + { + "epoch": 7.706269483893315, + "grad_norm": 0.2175241857767105, + "learning_rate": 2.293342579750347e-06, + "loss": 0.0027, + "step": 11124 + }, + { + "epoch": 7.70696224454451, + "grad_norm": 0.07370300590991974, + "learning_rate": 2.2926490984743413e-06, + "loss": 0.0014, + "step": 11125 + }, + { + "epoch": 7.707655005195705, + "grad_norm": 0.0793105959892273, + "learning_rate": 2.291955617198336e-06, + "loss": 0.0013, + "step": 11126 + }, + { + "epoch": 7.7083477658468995, + "grad_norm": 0.12147712707519531, + "learning_rate": 2.2912621359223303e-06, + "loss": 0.0021, + "step": 11127 + }, + { + "epoch": 7.709040526498095, + "grad_norm": 0.33362194895744324, + "learning_rate": 2.2905686546463244e-06, + "loss": 0.002, + "step": 11128 + }, + { + "epoch": 7.70973328714929, + "grad_norm": 0.05313370004296303, + "learning_rate": 2.2898751733703193e-06, + "loss": 0.0014, + "step": 11129 + }, + { + "epoch": 7.710426047800485, + "grad_norm": 0.08593767136335373, + "learning_rate": 2.2891816920943134e-06, + "loss": 0.0015, + "step": 11130 + }, + { + "epoch": 7.71111880845168, + "grad_norm": 0.12806203961372375, + "learning_rate": 2.2884882108183084e-06, + "loss": 0.0019, + "step": 11131 + }, + { + "epoch": 7.711811569102875, + "grad_norm": 0.13096466660499573, + "learning_rate": 2.2877947295423024e-06, + "loss": 0.0016, + "step": 11132 + }, + { + "epoch": 7.71250432975407, + "grad_norm": 0.18911923468112946, + "learning_rate": 2.287101248266297e-06, + "loss": 0.0024, + "step": 11133 + }, + { + "epoch": 7.713197090405265, + "grad_norm": 0.08166119456291199, + "learning_rate": 2.2864077669902915e-06, + "loss": 0.0015, + "step": 11134 + }, + { + "epoch": 7.71388985105646, + "grad_norm": 0.07480023056268692, + "learning_rate": 2.285714285714286e-06, + "loss": 0.0015, + "step": 11135 + }, + { + "epoch": 7.714582611707655, + "grad_norm": 0.11152496933937073, + "learning_rate": 2.2850208044382805e-06, + "loss": 0.0017, + "step": 11136 + }, + { + "epoch": 7.71527537235885, + "grad_norm": 0.16495485603809357, + "learning_rate": 2.284327323162275e-06, + "loss": 0.0024, + "step": 11137 + }, + { + "epoch": 7.715968133010045, + "grad_norm": 0.4652247130870819, + "learning_rate": 2.283633841886269e-06, + "loss": 0.0034, + "step": 11138 + }, + { + "epoch": 7.71666089366124, + "grad_norm": 0.19577790796756744, + "learning_rate": 2.2829403606102636e-06, + "loss": 0.0016, + "step": 11139 + }, + { + "epoch": 7.717353654312435, + "grad_norm": 0.25848260521888733, + "learning_rate": 2.282246879334258e-06, + "loss": 0.0017, + "step": 11140 + }, + { + "epoch": 7.71804641496363, + "grad_norm": 0.13725721836090088, + "learning_rate": 2.2815533980582526e-06, + "loss": 0.0018, + "step": 11141 + }, + { + "epoch": 7.718739175614825, + "grad_norm": 0.1336439996957779, + "learning_rate": 2.280859916782247e-06, + "loss": 0.0015, + "step": 11142 + }, + { + "epoch": 7.71943193626602, + "grad_norm": 0.15603455901145935, + "learning_rate": 2.280166435506241e-06, + "loss": 0.002, + "step": 11143 + }, + { + "epoch": 7.720124696917215, + "grad_norm": 0.10399144142866135, + "learning_rate": 2.279472954230236e-06, + "loss": 0.0015, + "step": 11144 + }, + { + "epoch": 7.72081745756841, + "grad_norm": 0.10489049553871155, + "learning_rate": 2.2787794729542302e-06, + "loss": 0.0022, + "step": 11145 + }, + { + "epoch": 7.721510218219605, + "grad_norm": 0.19018907845020294, + "learning_rate": 2.278085991678225e-06, + "loss": 0.0027, + "step": 11146 + }, + { + "epoch": 7.7222029788708, + "grad_norm": 0.0948394238948822, + "learning_rate": 2.2773925104022192e-06, + "loss": 0.0019, + "step": 11147 + }, + { + "epoch": 7.722895739521995, + "grad_norm": 0.18266138434410095, + "learning_rate": 2.2766990291262138e-06, + "loss": 0.0019, + "step": 11148 + }, + { + "epoch": 7.72358850017319, + "grad_norm": 0.06652650237083435, + "learning_rate": 2.2760055478502083e-06, + "loss": 0.0014, + "step": 11149 + }, + { + "epoch": 7.724281260824386, + "grad_norm": 0.14933249354362488, + "learning_rate": 2.2753120665742028e-06, + "loss": 0.0017, + "step": 11150 + }, + { + "epoch": 7.72497402147558, + "grad_norm": 0.062310583889484406, + "learning_rate": 2.2746185852981973e-06, + "loss": 0.0012, + "step": 11151 + }, + { + "epoch": 7.725666782126775, + "grad_norm": 0.09098254144191742, + "learning_rate": 2.2739251040221914e-06, + "loss": 0.0017, + "step": 11152 + }, + { + "epoch": 7.7263595427779705, + "grad_norm": 0.11707434058189392, + "learning_rate": 2.273231622746186e-06, + "loss": 0.0018, + "step": 11153 + }, + { + "epoch": 7.727052303429165, + "grad_norm": 0.06647341698408127, + "learning_rate": 2.2725381414701804e-06, + "loss": 0.0014, + "step": 11154 + }, + { + "epoch": 7.72774506408036, + "grad_norm": 0.27745726704597473, + "learning_rate": 2.271844660194175e-06, + "loss": 0.0022, + "step": 11155 + }, + { + "epoch": 7.7284378247315555, + "grad_norm": 0.19227567315101624, + "learning_rate": 2.2711511789181694e-06, + "loss": 0.0017, + "step": 11156 + }, + { + "epoch": 7.72913058538275, + "grad_norm": 0.42923104763031006, + "learning_rate": 2.270457697642164e-06, + "loss": 0.0024, + "step": 11157 + }, + { + "epoch": 7.729823346033945, + "grad_norm": 0.1012384220957756, + "learning_rate": 2.269764216366158e-06, + "loss": 0.0018, + "step": 11158 + }, + { + "epoch": 7.73051610668514, + "grad_norm": 0.22178469598293304, + "learning_rate": 2.269070735090153e-06, + "loss": 0.0022, + "step": 11159 + }, + { + "epoch": 7.731208867336335, + "grad_norm": 0.11777793616056442, + "learning_rate": 2.268377253814147e-06, + "loss": 0.0017, + "step": 11160 + }, + { + "epoch": 7.73190162798753, + "grad_norm": 0.42206209897994995, + "learning_rate": 2.267683772538142e-06, + "loss": 0.0016, + "step": 11161 + }, + { + "epoch": 7.732594388638725, + "grad_norm": 0.0868501365184784, + "learning_rate": 2.266990291262136e-06, + "loss": 0.0015, + "step": 11162 + }, + { + "epoch": 7.733287149289921, + "grad_norm": 0.1242530569434166, + "learning_rate": 2.2662968099861306e-06, + "loss": 0.0016, + "step": 11163 + }, + { + "epoch": 7.733979909941115, + "grad_norm": 0.22645629942417145, + "learning_rate": 2.265603328710125e-06, + "loss": 0.0022, + "step": 11164 + }, + { + "epoch": 7.73467267059231, + "grad_norm": 0.09080668538808823, + "learning_rate": 2.2649098474341196e-06, + "loss": 0.0016, + "step": 11165 + }, + { + "epoch": 7.735365431243506, + "grad_norm": 0.14278453588485718, + "learning_rate": 2.264216366158114e-06, + "loss": 0.0022, + "step": 11166 + }, + { + "epoch": 7.7360581918947, + "grad_norm": 0.21942207217216492, + "learning_rate": 2.263522884882108e-06, + "loss": 0.0024, + "step": 11167 + }, + { + "epoch": 7.736750952545895, + "grad_norm": 0.07790365070104599, + "learning_rate": 2.2628294036061027e-06, + "loss": 0.0014, + "step": 11168 + }, + { + "epoch": 7.737443713197091, + "grad_norm": 0.14562489092350006, + "learning_rate": 2.262135922330097e-06, + "loss": 0.0023, + "step": 11169 + }, + { + "epoch": 7.738136473848286, + "grad_norm": 0.12146992236375809, + "learning_rate": 2.2614424410540917e-06, + "loss": 0.0018, + "step": 11170 + }, + { + "epoch": 7.73882923449948, + "grad_norm": 0.10213583707809448, + "learning_rate": 2.2607489597780862e-06, + "loss": 0.0015, + "step": 11171 + }, + { + "epoch": 7.7395219951506755, + "grad_norm": 0.2260013222694397, + "learning_rate": 2.2600554785020807e-06, + "loss": 0.0033, + "step": 11172 + }, + { + "epoch": 7.740214755801871, + "grad_norm": 0.18813388049602509, + "learning_rate": 2.259361997226075e-06, + "loss": 0.0023, + "step": 11173 + }, + { + "epoch": 7.740907516453065, + "grad_norm": 0.3306540846824646, + "learning_rate": 2.2586685159500697e-06, + "loss": 0.0027, + "step": 11174 + }, + { + "epoch": 7.7416002771042605, + "grad_norm": 0.22028273344039917, + "learning_rate": 2.257975034674064e-06, + "loss": 0.0024, + "step": 11175 + }, + { + "epoch": 7.742293037755456, + "grad_norm": 0.12116731703281403, + "learning_rate": 2.2572815533980583e-06, + "loss": 0.0017, + "step": 11176 + }, + { + "epoch": 7.74298579840665, + "grad_norm": 0.09300985932350159, + "learning_rate": 2.256588072122053e-06, + "loss": 0.0017, + "step": 11177 + }, + { + "epoch": 7.743678559057845, + "grad_norm": 0.06648532301187515, + "learning_rate": 2.2558945908460474e-06, + "loss": 0.0013, + "step": 11178 + }, + { + "epoch": 7.744371319709041, + "grad_norm": 0.10641910880804062, + "learning_rate": 2.255201109570042e-06, + "loss": 0.0017, + "step": 11179 + }, + { + "epoch": 7.745064080360235, + "grad_norm": 0.2373981475830078, + "learning_rate": 2.2545076282940364e-06, + "loss": 0.0022, + "step": 11180 + }, + { + "epoch": 7.74575684101143, + "grad_norm": 0.12387380748987198, + "learning_rate": 2.253814147018031e-06, + "loss": 0.0018, + "step": 11181 + }, + { + "epoch": 7.746449601662626, + "grad_norm": 0.06476127356290817, + "learning_rate": 2.253120665742025e-06, + "loss": 0.0011, + "step": 11182 + }, + { + "epoch": 7.747142362313821, + "grad_norm": 0.1915445178747177, + "learning_rate": 2.2524271844660195e-06, + "loss": 0.0025, + "step": 11183 + }, + { + "epoch": 7.747835122965015, + "grad_norm": 0.09372086822986603, + "learning_rate": 2.251733703190014e-06, + "loss": 0.0017, + "step": 11184 + }, + { + "epoch": 7.748527883616211, + "grad_norm": 0.1344568133354187, + "learning_rate": 2.2510402219140085e-06, + "loss": 0.0023, + "step": 11185 + }, + { + "epoch": 7.749220644267406, + "grad_norm": 0.24896401166915894, + "learning_rate": 2.250346740638003e-06, + "loss": 0.0023, + "step": 11186 + }, + { + "epoch": 7.7499134049186, + "grad_norm": 0.1297737956047058, + "learning_rate": 2.2496532593619975e-06, + "loss": 0.0019, + "step": 11187 + }, + { + "epoch": 7.7506061655697955, + "grad_norm": 0.1254224330186844, + "learning_rate": 2.2489597780859916e-06, + "loss": 0.0017, + "step": 11188 + }, + { + "epoch": 7.751298926220991, + "grad_norm": 0.07593856751918793, + "learning_rate": 2.2482662968099865e-06, + "loss": 0.0013, + "step": 11189 + }, + { + "epoch": 7.751991686872186, + "grad_norm": 0.07970157265663147, + "learning_rate": 2.2475728155339806e-06, + "loss": 0.0015, + "step": 11190 + }, + { + "epoch": 7.7526844475233805, + "grad_norm": 0.1514560431241989, + "learning_rate": 2.246879334257975e-06, + "loss": 0.0024, + "step": 11191 + }, + { + "epoch": 7.753377208174576, + "grad_norm": 0.10897752642631531, + "learning_rate": 2.2461858529819696e-06, + "loss": 0.0014, + "step": 11192 + }, + { + "epoch": 7.754069968825771, + "grad_norm": 0.11987505853176117, + "learning_rate": 2.245492371705964e-06, + "loss": 0.0018, + "step": 11193 + }, + { + "epoch": 7.754762729476965, + "grad_norm": 0.07282281666994095, + "learning_rate": 2.2447988904299587e-06, + "loss": 0.0014, + "step": 11194 + }, + { + "epoch": 7.755455490128161, + "grad_norm": 0.08273394405841827, + "learning_rate": 2.244105409153953e-06, + "loss": 0.0014, + "step": 11195 + }, + { + "epoch": 7.756148250779356, + "grad_norm": 0.3216395080089569, + "learning_rate": 2.2434119278779477e-06, + "loss": 0.0023, + "step": 11196 + }, + { + "epoch": 7.75684101143055, + "grad_norm": 0.16991889476776123, + "learning_rate": 2.2427184466019418e-06, + "loss": 0.0017, + "step": 11197 + }, + { + "epoch": 7.757533772081746, + "grad_norm": 0.14216946065425873, + "learning_rate": 2.2420249653259363e-06, + "loss": 0.0018, + "step": 11198 + }, + { + "epoch": 7.758226532732941, + "grad_norm": 0.22020933032035828, + "learning_rate": 2.241331484049931e-06, + "loss": 0.0019, + "step": 11199 + }, + { + "epoch": 7.758919293384135, + "grad_norm": 0.13811734318733215, + "learning_rate": 2.2406380027739253e-06, + "loss": 0.0018, + "step": 11200 + }, + { + "epoch": 7.759612054035331, + "grad_norm": 0.3963909447193146, + "learning_rate": 2.23994452149792e-06, + "loss": 0.0038, + "step": 11201 + }, + { + "epoch": 7.760304814686526, + "grad_norm": 0.09692416340112686, + "learning_rate": 2.2392510402219143e-06, + "loss": 0.0018, + "step": 11202 + }, + { + "epoch": 7.760997575337721, + "grad_norm": 0.065221406519413, + "learning_rate": 2.2385575589459084e-06, + "loss": 0.0015, + "step": 11203 + }, + { + "epoch": 7.761690335988916, + "grad_norm": 0.06640864908695221, + "learning_rate": 2.2378640776699033e-06, + "loss": 0.0013, + "step": 11204 + }, + { + "epoch": 7.762383096640111, + "grad_norm": 0.06931870430707932, + "learning_rate": 2.2371705963938974e-06, + "loss": 0.0014, + "step": 11205 + }, + { + "epoch": 7.763075857291306, + "grad_norm": 0.11891216039657593, + "learning_rate": 2.236477115117892e-06, + "loss": 0.0016, + "step": 11206 + }, + { + "epoch": 7.7637686179425005, + "grad_norm": 0.11357012391090393, + "learning_rate": 2.2357836338418865e-06, + "loss": 0.0016, + "step": 11207 + }, + { + "epoch": 7.764461378593696, + "grad_norm": 0.1463513821363449, + "learning_rate": 2.235090152565881e-06, + "loss": 0.0021, + "step": 11208 + }, + { + "epoch": 7.765154139244891, + "grad_norm": 0.09959112852811813, + "learning_rate": 2.2343966712898755e-06, + "loss": 0.0015, + "step": 11209 + }, + { + "epoch": 7.765846899896086, + "grad_norm": 0.07624318450689316, + "learning_rate": 2.2337031900138696e-06, + "loss": 0.0015, + "step": 11210 + }, + { + "epoch": 7.766539660547281, + "grad_norm": 0.08544086664915085, + "learning_rate": 2.2330097087378645e-06, + "loss": 0.0014, + "step": 11211 + }, + { + "epoch": 7.767232421198476, + "grad_norm": 0.2006032019853592, + "learning_rate": 2.2323162274618586e-06, + "loss": 0.0039, + "step": 11212 + }, + { + "epoch": 7.767925181849671, + "grad_norm": 0.28245675563812256, + "learning_rate": 2.231622746185853e-06, + "loss": 0.0021, + "step": 11213 + }, + { + "epoch": 7.768617942500866, + "grad_norm": 0.10247663408517838, + "learning_rate": 2.2309292649098476e-06, + "loss": 0.0016, + "step": 11214 + }, + { + "epoch": 7.769310703152061, + "grad_norm": 0.06042948365211487, + "learning_rate": 2.230235783633842e-06, + "loss": 0.0015, + "step": 11215 + }, + { + "epoch": 7.770003463803256, + "grad_norm": 0.803938090801239, + "learning_rate": 2.2295423023578366e-06, + "loss": 0.0037, + "step": 11216 + }, + { + "epoch": 7.770696224454451, + "grad_norm": 0.13024838268756866, + "learning_rate": 2.228848821081831e-06, + "loss": 0.0016, + "step": 11217 + }, + { + "epoch": 7.771388985105646, + "grad_norm": 0.10446521639823914, + "learning_rate": 2.2281553398058252e-06, + "loss": 0.0017, + "step": 11218 + }, + { + "epoch": 7.772081745756841, + "grad_norm": 0.09668648988008499, + "learning_rate": 2.22746185852982e-06, + "loss": 0.0016, + "step": 11219 + }, + { + "epoch": 7.772774506408036, + "grad_norm": 0.22087211906909943, + "learning_rate": 2.2267683772538142e-06, + "loss": 0.0015, + "step": 11220 + }, + { + "epoch": 7.773467267059231, + "grad_norm": 0.36785584688186646, + "learning_rate": 2.2260748959778087e-06, + "loss": 0.0034, + "step": 11221 + }, + { + "epoch": 7.774160027710426, + "grad_norm": 0.10302146524190903, + "learning_rate": 2.2253814147018033e-06, + "loss": 0.0019, + "step": 11222 + }, + { + "epoch": 7.774852788361621, + "grad_norm": 0.0880153700709343, + "learning_rate": 2.2246879334257978e-06, + "loss": 0.0016, + "step": 11223 + }, + { + "epoch": 7.775545549012816, + "grad_norm": 0.10583964735269547, + "learning_rate": 2.2239944521497923e-06, + "loss": 0.0015, + "step": 11224 + }, + { + "epoch": 7.776238309664011, + "grad_norm": 0.23762349784374237, + "learning_rate": 2.2233009708737864e-06, + "loss": 0.0019, + "step": 11225 + }, + { + "epoch": 7.776931070315206, + "grad_norm": 0.12018120288848877, + "learning_rate": 2.2226074895977813e-06, + "loss": 0.0014, + "step": 11226 + }, + { + "epoch": 7.777623830966401, + "grad_norm": 0.15623846650123596, + "learning_rate": 2.2219140083217754e-06, + "loss": 0.0019, + "step": 11227 + }, + { + "epoch": 7.778316591617596, + "grad_norm": 0.1327710598707199, + "learning_rate": 2.22122052704577e-06, + "loss": 0.0018, + "step": 11228 + }, + { + "epoch": 7.779009352268791, + "grad_norm": 0.08099834620952606, + "learning_rate": 2.2205270457697644e-06, + "loss": 0.0013, + "step": 11229 + }, + { + "epoch": 7.779702112919987, + "grad_norm": 0.1376233547925949, + "learning_rate": 2.219833564493759e-06, + "loss": 0.0019, + "step": 11230 + }, + { + "epoch": 7.780394873571181, + "grad_norm": 0.1590290665626526, + "learning_rate": 2.2191400832177534e-06, + "loss": 0.0017, + "step": 11231 + }, + { + "epoch": 7.781087634222376, + "grad_norm": 0.14185279607772827, + "learning_rate": 2.218446601941748e-06, + "loss": 0.0016, + "step": 11232 + }, + { + "epoch": 7.7817803948735715, + "grad_norm": 0.08854740113019943, + "learning_rate": 2.217753120665742e-06, + "loss": 0.0015, + "step": 11233 + }, + { + "epoch": 7.782473155524766, + "grad_norm": 0.21793046593666077, + "learning_rate": 2.2170596393897365e-06, + "loss": 0.002, + "step": 11234 + }, + { + "epoch": 7.783165916175961, + "grad_norm": 0.14304323494434357, + "learning_rate": 2.216366158113731e-06, + "loss": 0.0019, + "step": 11235 + }, + { + "epoch": 7.7838586768271565, + "grad_norm": 0.198887899518013, + "learning_rate": 2.2156726768377255e-06, + "loss": 0.0029, + "step": 11236 + }, + { + "epoch": 7.784551437478351, + "grad_norm": 0.1203419491648674, + "learning_rate": 2.21497919556172e-06, + "loss": 0.0019, + "step": 11237 + }, + { + "epoch": 7.785244198129546, + "grad_norm": 0.08119252324104309, + "learning_rate": 2.2142857142857146e-06, + "loss": 0.0014, + "step": 11238 + }, + { + "epoch": 7.785936958780741, + "grad_norm": 0.12953047454357147, + "learning_rate": 2.213592233009709e-06, + "loss": 0.0017, + "step": 11239 + }, + { + "epoch": 7.786629719431936, + "grad_norm": 0.14757809042930603, + "learning_rate": 2.212898751733703e-06, + "loss": 0.0025, + "step": 11240 + }, + { + "epoch": 7.787322480083131, + "grad_norm": 0.18309348821640015, + "learning_rate": 2.212205270457698e-06, + "loss": 0.0017, + "step": 11241 + }, + { + "epoch": 7.788015240734326, + "grad_norm": 0.09014245867729187, + "learning_rate": 2.211511789181692e-06, + "loss": 0.0017, + "step": 11242 + }, + { + "epoch": 7.788708001385522, + "grad_norm": 0.06307058036327362, + "learning_rate": 2.2108183079056867e-06, + "loss": 0.0014, + "step": 11243 + }, + { + "epoch": 7.789400762036716, + "grad_norm": 0.07024972885847092, + "learning_rate": 2.210124826629681e-06, + "loss": 0.0015, + "step": 11244 + }, + { + "epoch": 7.790093522687911, + "grad_norm": 0.1116972491145134, + "learning_rate": 2.2094313453536757e-06, + "loss": 0.0019, + "step": 11245 + }, + { + "epoch": 7.790786283339107, + "grad_norm": 0.1071903258562088, + "learning_rate": 2.2087378640776702e-06, + "loss": 0.002, + "step": 11246 + }, + { + "epoch": 7.791479043990301, + "grad_norm": 0.12507693469524384, + "learning_rate": 2.2080443828016647e-06, + "loss": 0.0017, + "step": 11247 + }, + { + "epoch": 7.792171804641496, + "grad_norm": 0.10897404700517654, + "learning_rate": 2.207350901525659e-06, + "loss": 0.002, + "step": 11248 + }, + { + "epoch": 7.792864565292692, + "grad_norm": 0.1244838535785675, + "learning_rate": 2.2066574202496533e-06, + "loss": 0.0016, + "step": 11249 + }, + { + "epoch": 7.793557325943887, + "grad_norm": 0.11542940884828568, + "learning_rate": 2.205963938973648e-06, + "loss": 0.0023, + "step": 11250 + }, + { + "epoch": 7.794250086595081, + "grad_norm": 0.12995007634162903, + "learning_rate": 2.2052704576976423e-06, + "loss": 0.0018, + "step": 11251 + }, + { + "epoch": 7.7949428472462765, + "grad_norm": 0.34024178981781006, + "learning_rate": 2.204576976421637e-06, + "loss": 0.002, + "step": 11252 + }, + { + "epoch": 7.795635607897472, + "grad_norm": 0.184537872672081, + "learning_rate": 2.2038834951456314e-06, + "loss": 0.0025, + "step": 11253 + }, + { + "epoch": 7.796328368548666, + "grad_norm": 0.3570444583892822, + "learning_rate": 2.203190013869626e-06, + "loss": 0.0032, + "step": 11254 + }, + { + "epoch": 7.7970211291998615, + "grad_norm": 0.2577095925807953, + "learning_rate": 2.20249653259362e-06, + "loss": 0.0046, + "step": 11255 + }, + { + "epoch": 7.797713889851057, + "grad_norm": 0.22864298522472382, + "learning_rate": 2.201803051317615e-06, + "loss": 0.0021, + "step": 11256 + }, + { + "epoch": 7.798406650502251, + "grad_norm": 0.17480036616325378, + "learning_rate": 2.201109570041609e-06, + "loss": 0.0022, + "step": 11257 + }, + { + "epoch": 7.799099411153446, + "grad_norm": 0.08790505677461624, + "learning_rate": 2.2004160887656035e-06, + "loss": 0.0016, + "step": 11258 + }, + { + "epoch": 7.799792171804642, + "grad_norm": 0.34003666043281555, + "learning_rate": 2.199722607489598e-06, + "loss": 0.004, + "step": 11259 + }, + { + "epoch": 7.800484932455836, + "grad_norm": 0.05693638697266579, + "learning_rate": 2.1990291262135925e-06, + "loss": 0.0014, + "step": 11260 + }, + { + "epoch": 7.801177693107031, + "grad_norm": 0.19102239608764648, + "learning_rate": 2.198335644937587e-06, + "loss": 0.0021, + "step": 11261 + }, + { + "epoch": 7.801870453758227, + "grad_norm": 0.3277992010116577, + "learning_rate": 2.1976421636615815e-06, + "loss": 0.0022, + "step": 11262 + }, + { + "epoch": 7.802563214409422, + "grad_norm": 0.10052553564310074, + "learning_rate": 2.1969486823855756e-06, + "loss": 0.0017, + "step": 11263 + }, + { + "epoch": 7.803255975060616, + "grad_norm": 0.09415896981954575, + "learning_rate": 2.19625520110957e-06, + "loss": 0.0017, + "step": 11264 + }, + { + "epoch": 7.803948735711812, + "grad_norm": 0.28450697660446167, + "learning_rate": 2.1955617198335646e-06, + "loss": 0.0029, + "step": 11265 + }, + { + "epoch": 7.804641496363007, + "grad_norm": 0.10606548190116882, + "learning_rate": 2.194868238557559e-06, + "loss": 0.0015, + "step": 11266 + }, + { + "epoch": 7.805334257014201, + "grad_norm": 0.10414156317710876, + "learning_rate": 2.1941747572815537e-06, + "loss": 0.0027, + "step": 11267 + }, + { + "epoch": 7.8060270176653965, + "grad_norm": 0.17339912056922913, + "learning_rate": 2.1934812760055477e-06, + "loss": 0.0026, + "step": 11268 + }, + { + "epoch": 7.806719778316592, + "grad_norm": 0.07309307903051376, + "learning_rate": 2.1927877947295427e-06, + "loss": 0.0013, + "step": 11269 + }, + { + "epoch": 7.807412538967787, + "grad_norm": 0.17403848469257355, + "learning_rate": 2.1920943134535368e-06, + "loss": 0.0024, + "step": 11270 + }, + { + "epoch": 7.8081052996189815, + "grad_norm": 0.10918648540973663, + "learning_rate": 2.1914008321775317e-06, + "loss": 0.0019, + "step": 11271 + }, + { + "epoch": 7.808798060270177, + "grad_norm": 0.10794667154550552, + "learning_rate": 2.1907073509015258e-06, + "loss": 0.0016, + "step": 11272 + }, + { + "epoch": 7.809490820921372, + "grad_norm": 0.11508607864379883, + "learning_rate": 2.1900138696255203e-06, + "loss": 0.0018, + "step": 11273 + }, + { + "epoch": 7.810183581572566, + "grad_norm": 0.16168354451656342, + "learning_rate": 2.189320388349515e-06, + "loss": 0.0018, + "step": 11274 + }, + { + "epoch": 7.810876342223762, + "grad_norm": 0.2915906310081482, + "learning_rate": 2.1886269070735093e-06, + "loss": 0.0022, + "step": 11275 + }, + { + "epoch": 7.811569102874957, + "grad_norm": 0.0944768488407135, + "learning_rate": 2.187933425797504e-06, + "loss": 0.0014, + "step": 11276 + }, + { + "epoch": 7.812261863526151, + "grad_norm": 0.06748131662607193, + "learning_rate": 2.1872399445214983e-06, + "loss": 0.0013, + "step": 11277 + }, + { + "epoch": 7.812954624177347, + "grad_norm": 0.12040342390537262, + "learning_rate": 2.1865464632454924e-06, + "loss": 0.0014, + "step": 11278 + }, + { + "epoch": 7.813647384828542, + "grad_norm": 0.181565523147583, + "learning_rate": 2.185852981969487e-06, + "loss": 0.0029, + "step": 11279 + }, + { + "epoch": 7.814340145479736, + "grad_norm": 0.05621764063835144, + "learning_rate": 2.1851595006934814e-06, + "loss": 0.0012, + "step": 11280 + }, + { + "epoch": 7.815032906130932, + "grad_norm": 0.15113765001296997, + "learning_rate": 2.184466019417476e-06, + "loss": 0.0018, + "step": 11281 + }, + { + "epoch": 7.815725666782127, + "grad_norm": 0.06723956763744354, + "learning_rate": 2.1837725381414705e-06, + "loss": 0.0013, + "step": 11282 + }, + { + "epoch": 7.816418427433322, + "grad_norm": 0.3941515386104584, + "learning_rate": 2.1830790568654645e-06, + "loss": 0.0024, + "step": 11283 + }, + { + "epoch": 7.8171111880845165, + "grad_norm": 0.0861259177327156, + "learning_rate": 2.1823855755894595e-06, + "loss": 0.0017, + "step": 11284 + }, + { + "epoch": 7.817803948735712, + "grad_norm": 0.11990301311016083, + "learning_rate": 2.1816920943134536e-06, + "loss": 0.0016, + "step": 11285 + }, + { + "epoch": 7.818496709386907, + "grad_norm": 0.10869354009628296, + "learning_rate": 2.180998613037448e-06, + "loss": 0.0015, + "step": 11286 + }, + { + "epoch": 7.8191894700381015, + "grad_norm": 0.13147354125976562, + "learning_rate": 2.1803051317614426e-06, + "loss": 0.0023, + "step": 11287 + }, + { + "epoch": 7.819882230689297, + "grad_norm": 0.10592517256736755, + "learning_rate": 2.179611650485437e-06, + "loss": 0.0016, + "step": 11288 + }, + { + "epoch": 7.820574991340492, + "grad_norm": 0.695120632648468, + "learning_rate": 2.1789181692094316e-06, + "loss": 0.002, + "step": 11289 + }, + { + "epoch": 7.821267751991687, + "grad_norm": 0.17284594476222992, + "learning_rate": 2.178224687933426e-06, + "loss": 0.0019, + "step": 11290 + }, + { + "epoch": 7.821960512642882, + "grad_norm": 0.19730062782764435, + "learning_rate": 2.17753120665742e-06, + "loss": 0.003, + "step": 11291 + }, + { + "epoch": 7.822653273294077, + "grad_norm": 0.12815114855766296, + "learning_rate": 2.1768377253814147e-06, + "loss": 0.002, + "step": 11292 + }, + { + "epoch": 7.823346033945272, + "grad_norm": 0.07243195921182632, + "learning_rate": 2.1761442441054092e-06, + "loss": 0.0015, + "step": 11293 + }, + { + "epoch": 7.824038794596467, + "grad_norm": 0.12785346806049347, + "learning_rate": 2.1754507628294037e-06, + "loss": 0.0017, + "step": 11294 + }, + { + "epoch": 7.824731555247662, + "grad_norm": 0.08393670618534088, + "learning_rate": 2.1747572815533982e-06, + "loss": 0.0017, + "step": 11295 + }, + { + "epoch": 7.825424315898857, + "grad_norm": 0.3895988464355469, + "learning_rate": 2.1740638002773927e-06, + "loss": 0.0024, + "step": 11296 + }, + { + "epoch": 7.826117076550052, + "grad_norm": 0.20433297753334045, + "learning_rate": 2.1733703190013873e-06, + "loss": 0.0014, + "step": 11297 + }, + { + "epoch": 7.826809837201247, + "grad_norm": 0.09145063906908035, + "learning_rate": 2.1726768377253813e-06, + "loss": 0.0017, + "step": 11298 + }, + { + "epoch": 7.827502597852442, + "grad_norm": 0.10940881818532944, + "learning_rate": 2.1719833564493763e-06, + "loss": 0.0015, + "step": 11299 + }, + { + "epoch": 7.828195358503637, + "grad_norm": 0.17061269283294678, + "learning_rate": 2.1712898751733704e-06, + "loss": 0.0019, + "step": 11300 + }, + { + "epoch": 7.828888119154832, + "grad_norm": 0.1235680803656578, + "learning_rate": 2.170596393897365e-06, + "loss": 0.0016, + "step": 11301 + }, + { + "epoch": 7.829580879806027, + "grad_norm": 0.06370677053928375, + "learning_rate": 2.1699029126213594e-06, + "loss": 0.0012, + "step": 11302 + }, + { + "epoch": 7.830273640457222, + "grad_norm": 0.22167515754699707, + "learning_rate": 2.169209431345354e-06, + "loss": 0.0023, + "step": 11303 + }, + { + "epoch": 7.830966401108417, + "grad_norm": 0.10389106720685959, + "learning_rate": 2.1685159500693484e-06, + "loss": 0.0017, + "step": 11304 + }, + { + "epoch": 7.831659161759612, + "grad_norm": 0.1505199521780014, + "learning_rate": 2.167822468793343e-06, + "loss": 0.0031, + "step": 11305 + }, + { + "epoch": 7.832351922410807, + "grad_norm": 0.1148136705160141, + "learning_rate": 2.167128987517337e-06, + "loss": 0.0018, + "step": 11306 + }, + { + "epoch": 7.833044683062002, + "grad_norm": 0.057822197675704956, + "learning_rate": 2.1664355062413315e-06, + "loss": 0.0011, + "step": 11307 + }, + { + "epoch": 7.833737443713197, + "grad_norm": 0.09731818735599518, + "learning_rate": 2.165742024965326e-06, + "loss": 0.0015, + "step": 11308 + }, + { + "epoch": 7.834430204364392, + "grad_norm": 0.19469495117664337, + "learning_rate": 2.1650485436893205e-06, + "loss": 0.0025, + "step": 11309 + }, + { + "epoch": 7.835122965015588, + "grad_norm": 0.12179212272167206, + "learning_rate": 2.164355062413315e-06, + "loss": 0.0014, + "step": 11310 + }, + { + "epoch": 7.835815725666782, + "grad_norm": 0.28897035121917725, + "learning_rate": 2.1636615811373096e-06, + "loss": 0.0027, + "step": 11311 + }, + { + "epoch": 7.836508486317977, + "grad_norm": 0.24937516450881958, + "learning_rate": 2.162968099861304e-06, + "loss": 0.0019, + "step": 11312 + }, + { + "epoch": 7.8372012469691725, + "grad_norm": 0.08942914009094238, + "learning_rate": 2.162274618585298e-06, + "loss": 0.0015, + "step": 11313 + }, + { + "epoch": 7.837894007620367, + "grad_norm": 0.06706539541482925, + "learning_rate": 2.161581137309293e-06, + "loss": 0.0014, + "step": 11314 + }, + { + "epoch": 7.838586768271562, + "grad_norm": 0.19965887069702148, + "learning_rate": 2.160887656033287e-06, + "loss": 0.002, + "step": 11315 + }, + { + "epoch": 7.8392795289227575, + "grad_norm": 0.1354026347398758, + "learning_rate": 2.1601941747572817e-06, + "loss": 0.0024, + "step": 11316 + }, + { + "epoch": 7.839972289573952, + "grad_norm": 0.15121757984161377, + "learning_rate": 2.159500693481276e-06, + "loss": 0.0019, + "step": 11317 + }, + { + "epoch": 7.840665050225147, + "grad_norm": 0.14452719688415527, + "learning_rate": 2.1588072122052707e-06, + "loss": 0.0021, + "step": 11318 + }, + { + "epoch": 7.841357810876342, + "grad_norm": 0.10823360830545425, + "learning_rate": 2.158113730929265e-06, + "loss": 0.0016, + "step": 11319 + }, + { + "epoch": 7.842050571527537, + "grad_norm": 0.06858740746974945, + "learning_rate": 2.1574202496532597e-06, + "loss": 0.0015, + "step": 11320 + }, + { + "epoch": 7.842743332178732, + "grad_norm": 0.0829811617732048, + "learning_rate": 2.156726768377254e-06, + "loss": 0.0015, + "step": 11321 + }, + { + "epoch": 7.843436092829927, + "grad_norm": 0.09882291406393051, + "learning_rate": 2.1560332871012483e-06, + "loss": 0.0013, + "step": 11322 + }, + { + "epoch": 7.844128853481123, + "grad_norm": 0.34364140033721924, + "learning_rate": 2.155339805825243e-06, + "loss": 0.0026, + "step": 11323 + }, + { + "epoch": 7.844821614132317, + "grad_norm": 0.10191264748573303, + "learning_rate": 2.1546463245492373e-06, + "loss": 0.0017, + "step": 11324 + }, + { + "epoch": 7.845514374783512, + "grad_norm": 0.08529110252857208, + "learning_rate": 2.153952843273232e-06, + "loss": 0.0013, + "step": 11325 + }, + { + "epoch": 7.846207135434708, + "grad_norm": 0.12132897973060608, + "learning_rate": 2.153259361997226e-06, + "loss": 0.0014, + "step": 11326 + }, + { + "epoch": 7.846899896085902, + "grad_norm": 0.13429586589336395, + "learning_rate": 2.152565880721221e-06, + "loss": 0.0015, + "step": 11327 + }, + { + "epoch": 7.847592656737097, + "grad_norm": 0.4025964140892029, + "learning_rate": 2.151872399445215e-06, + "loss": 0.0028, + "step": 11328 + }, + { + "epoch": 7.8482854173882926, + "grad_norm": 0.14849776029586792, + "learning_rate": 2.15117891816921e-06, + "loss": 0.0017, + "step": 11329 + }, + { + "epoch": 7.848978178039488, + "grad_norm": 0.09574708342552185, + "learning_rate": 2.150485436893204e-06, + "loss": 0.0016, + "step": 11330 + }, + { + "epoch": 7.849670938690682, + "grad_norm": 0.1349084973335266, + "learning_rate": 2.1497919556171985e-06, + "loss": 0.0015, + "step": 11331 + }, + { + "epoch": 7.8503636993418775, + "grad_norm": 0.0910324901342392, + "learning_rate": 2.149098474341193e-06, + "loss": 0.0014, + "step": 11332 + }, + { + "epoch": 7.851056459993073, + "grad_norm": 0.08863872289657593, + "learning_rate": 2.1484049930651875e-06, + "loss": 0.0015, + "step": 11333 + }, + { + "epoch": 7.851749220644267, + "grad_norm": 0.05655772611498833, + "learning_rate": 2.147711511789182e-06, + "loss": 0.0013, + "step": 11334 + }, + { + "epoch": 7.8524419812954624, + "grad_norm": 0.14138595759868622, + "learning_rate": 2.1470180305131765e-06, + "loss": 0.0016, + "step": 11335 + }, + { + "epoch": 7.853134741946658, + "grad_norm": 0.062396854162216187, + "learning_rate": 2.1463245492371706e-06, + "loss": 0.0013, + "step": 11336 + }, + { + "epoch": 7.853827502597852, + "grad_norm": 0.08552103489637375, + "learning_rate": 2.145631067961165e-06, + "loss": 0.0017, + "step": 11337 + }, + { + "epoch": 7.854520263249047, + "grad_norm": 0.2230331301689148, + "learning_rate": 2.1449375866851596e-06, + "loss": 0.0024, + "step": 11338 + }, + { + "epoch": 7.855213023900243, + "grad_norm": 0.23418837785720825, + "learning_rate": 2.144244105409154e-06, + "loss": 0.0018, + "step": 11339 + }, + { + "epoch": 7.855905784551437, + "grad_norm": 0.4438765048980713, + "learning_rate": 2.1435506241331486e-06, + "loss": 0.0045, + "step": 11340 + }, + { + "epoch": 7.856598545202632, + "grad_norm": 0.549567461013794, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.0041, + "step": 11341 + }, + { + "epoch": 7.857291305853828, + "grad_norm": 0.09813400357961655, + "learning_rate": 2.1421636615811377e-06, + "loss": 0.0014, + "step": 11342 + }, + { + "epoch": 7.857984066505023, + "grad_norm": 0.0821555033326149, + "learning_rate": 2.1414701803051317e-06, + "loss": 0.0016, + "step": 11343 + }, + { + "epoch": 7.858676827156217, + "grad_norm": 0.1884317547082901, + "learning_rate": 2.1407766990291267e-06, + "loss": 0.0017, + "step": 11344 + }, + { + "epoch": 7.859369587807413, + "grad_norm": 0.17601996660232544, + "learning_rate": 2.1400832177531208e-06, + "loss": 0.0016, + "step": 11345 + }, + { + "epoch": 7.860062348458608, + "grad_norm": 0.14312589168548584, + "learning_rate": 2.1393897364771153e-06, + "loss": 0.0024, + "step": 11346 + }, + { + "epoch": 7.860755109109802, + "grad_norm": 0.1217978298664093, + "learning_rate": 2.13869625520111e-06, + "loss": 0.0018, + "step": 11347 + }, + { + "epoch": 7.8614478697609975, + "grad_norm": 0.4607955515384674, + "learning_rate": 2.1380027739251043e-06, + "loss": 0.0019, + "step": 11348 + }, + { + "epoch": 7.862140630412193, + "grad_norm": 0.24453143775463104, + "learning_rate": 2.137309292649099e-06, + "loss": 0.002, + "step": 11349 + }, + { + "epoch": 7.862833391063388, + "grad_norm": 0.14901407063007355, + "learning_rate": 2.136615811373093e-06, + "loss": 0.002, + "step": 11350 + }, + { + "epoch": 7.8635261517145825, + "grad_norm": 0.26923948526382446, + "learning_rate": 2.1359223300970874e-06, + "loss": 0.0021, + "step": 11351 + }, + { + "epoch": 7.864218912365778, + "grad_norm": 0.10520078986883163, + "learning_rate": 2.135228848821082e-06, + "loss": 0.0017, + "step": 11352 + }, + { + "epoch": 7.864911673016973, + "grad_norm": 0.10828854888677597, + "learning_rate": 2.1345353675450764e-06, + "loss": 0.0016, + "step": 11353 + }, + { + "epoch": 7.865604433668167, + "grad_norm": 0.1414390653371811, + "learning_rate": 2.133841886269071e-06, + "loss": 0.0027, + "step": 11354 + }, + { + "epoch": 7.866297194319363, + "grad_norm": 0.2203105390071869, + "learning_rate": 2.1331484049930654e-06, + "loss": 0.0022, + "step": 11355 + }, + { + "epoch": 7.866989954970558, + "grad_norm": 0.36115825176239014, + "learning_rate": 2.1324549237170595e-06, + "loss": 0.0035, + "step": 11356 + }, + { + "epoch": 7.867682715621752, + "grad_norm": 0.6102029085159302, + "learning_rate": 2.1317614424410545e-06, + "loss": 0.0026, + "step": 11357 + }, + { + "epoch": 7.868375476272948, + "grad_norm": 0.14479592442512512, + "learning_rate": 2.1310679611650486e-06, + "loss": 0.0017, + "step": 11358 + }, + { + "epoch": 7.869068236924143, + "grad_norm": 0.4564879834651947, + "learning_rate": 2.1303744798890435e-06, + "loss": 0.0022, + "step": 11359 + }, + { + "epoch": 7.869760997575337, + "grad_norm": 0.10407575964927673, + "learning_rate": 2.1296809986130376e-06, + "loss": 0.0016, + "step": 11360 + }, + { + "epoch": 7.870453758226533, + "grad_norm": 0.2893345057964325, + "learning_rate": 2.128987517337032e-06, + "loss": 0.0029, + "step": 11361 + }, + { + "epoch": 7.871146518877728, + "grad_norm": 0.10947423428297043, + "learning_rate": 2.1282940360610266e-06, + "loss": 0.0023, + "step": 11362 + }, + { + "epoch": 7.871839279528922, + "grad_norm": 0.06859258562326431, + "learning_rate": 2.127600554785021e-06, + "loss": 0.0012, + "step": 11363 + }, + { + "epoch": 7.8725320401801175, + "grad_norm": 0.1107030063867569, + "learning_rate": 2.1269070735090156e-06, + "loss": 0.0016, + "step": 11364 + }, + { + "epoch": 7.873224800831313, + "grad_norm": 0.16727851331233978, + "learning_rate": 2.1262135922330097e-06, + "loss": 0.0016, + "step": 11365 + }, + { + "epoch": 7.873917561482508, + "grad_norm": 0.112326979637146, + "learning_rate": 2.125520110957004e-06, + "loss": 0.0021, + "step": 11366 + }, + { + "epoch": 7.8746103221337025, + "grad_norm": 0.13387130200862885, + "learning_rate": 2.1248266296809987e-06, + "loss": 0.002, + "step": 11367 + }, + { + "epoch": 7.875303082784898, + "grad_norm": 0.2403022199869156, + "learning_rate": 2.1241331484049932e-06, + "loss": 0.0028, + "step": 11368 + }, + { + "epoch": 7.875995843436093, + "grad_norm": 0.10315826535224915, + "learning_rate": 2.1234396671289877e-06, + "loss": 0.002, + "step": 11369 + }, + { + "epoch": 7.876688604087288, + "grad_norm": 0.2243594378232956, + "learning_rate": 2.1227461858529822e-06, + "loss": 0.002, + "step": 11370 + }, + { + "epoch": 7.877381364738483, + "grad_norm": 0.08643249422311783, + "learning_rate": 2.1220527045769763e-06, + "loss": 0.0015, + "step": 11371 + }, + { + "epoch": 7.878074125389678, + "grad_norm": 0.42081764340400696, + "learning_rate": 2.1213592233009713e-06, + "loss": 0.0049, + "step": 11372 + }, + { + "epoch": 7.878766886040873, + "grad_norm": 0.14030934870243073, + "learning_rate": 2.1206657420249654e-06, + "loss": 0.0021, + "step": 11373 + }, + { + "epoch": 7.879459646692068, + "grad_norm": 0.08104424178600311, + "learning_rate": 2.11997226074896e-06, + "loss": 0.0015, + "step": 11374 + }, + { + "epoch": 7.880152407343263, + "grad_norm": 0.09887759387493134, + "learning_rate": 2.1192787794729544e-06, + "loss": 0.0015, + "step": 11375 + }, + { + "epoch": 7.880845167994458, + "grad_norm": 0.14208678901195526, + "learning_rate": 2.118585298196949e-06, + "loss": 0.0023, + "step": 11376 + }, + { + "epoch": 7.881537928645653, + "grad_norm": 0.07758071273565292, + "learning_rate": 2.1178918169209434e-06, + "loss": 0.0014, + "step": 11377 + }, + { + "epoch": 7.882230689296848, + "grad_norm": 0.21352963149547577, + "learning_rate": 2.117198335644938e-06, + "loss": 0.0021, + "step": 11378 + }, + { + "epoch": 7.882923449948043, + "grad_norm": 0.2536875605583191, + "learning_rate": 2.1165048543689324e-06, + "loss": 0.0021, + "step": 11379 + }, + { + "epoch": 7.883616210599238, + "grad_norm": 0.1069861650466919, + "learning_rate": 2.1158113730929265e-06, + "loss": 0.0015, + "step": 11380 + }, + { + "epoch": 7.884308971250433, + "grad_norm": 0.2671584486961365, + "learning_rate": 2.115117891816921e-06, + "loss": 0.0025, + "step": 11381 + }, + { + "epoch": 7.885001731901628, + "grad_norm": 0.10854753106832504, + "learning_rate": 2.1144244105409155e-06, + "loss": 0.0017, + "step": 11382 + }, + { + "epoch": 7.8856944925528225, + "grad_norm": 0.13896214962005615, + "learning_rate": 2.11373092926491e-06, + "loss": 0.0021, + "step": 11383 + }, + { + "epoch": 7.886387253204018, + "grad_norm": 0.10801216959953308, + "learning_rate": 2.1130374479889045e-06, + "loss": 0.0018, + "step": 11384 + }, + { + "epoch": 7.887080013855213, + "grad_norm": 0.1705852597951889, + "learning_rate": 2.112343966712899e-06, + "loss": 0.002, + "step": 11385 + }, + { + "epoch": 7.887772774506408, + "grad_norm": 0.18517433106899261, + "learning_rate": 2.111650485436893e-06, + "loss": 0.0024, + "step": 11386 + }, + { + "epoch": 7.888465535157603, + "grad_norm": 0.08442562073469162, + "learning_rate": 2.110957004160888e-06, + "loss": 0.0018, + "step": 11387 + }, + { + "epoch": 7.889158295808798, + "grad_norm": 0.10756903886795044, + "learning_rate": 2.110263522884882e-06, + "loss": 0.0019, + "step": 11388 + }, + { + "epoch": 7.889851056459993, + "grad_norm": 0.13703681528568268, + "learning_rate": 2.1095700416088767e-06, + "loss": 0.0016, + "step": 11389 + }, + { + "epoch": 7.890543817111189, + "grad_norm": 0.13048480451107025, + "learning_rate": 2.108876560332871e-06, + "loss": 0.0014, + "step": 11390 + }, + { + "epoch": 7.891236577762383, + "grad_norm": 0.20304864645004272, + "learning_rate": 2.1081830790568657e-06, + "loss": 0.0018, + "step": 11391 + }, + { + "epoch": 7.891929338413578, + "grad_norm": 0.056315258145332336, + "learning_rate": 2.10748959778086e-06, + "loss": 0.0012, + "step": 11392 + }, + { + "epoch": 7.8926220990647735, + "grad_norm": 0.13983847200870514, + "learning_rate": 2.1067961165048547e-06, + "loss": 0.0018, + "step": 11393 + }, + { + "epoch": 7.893314859715968, + "grad_norm": 0.10653576254844666, + "learning_rate": 2.1061026352288492e-06, + "loss": 0.0016, + "step": 11394 + }, + { + "epoch": 7.894007620367163, + "grad_norm": 0.15684229135513306, + "learning_rate": 2.1054091539528433e-06, + "loss": 0.0021, + "step": 11395 + }, + { + "epoch": 7.8947003810183585, + "grad_norm": 0.1900988519191742, + "learning_rate": 2.104715672676838e-06, + "loss": 0.0019, + "step": 11396 + }, + { + "epoch": 7.895393141669553, + "grad_norm": 0.23246794939041138, + "learning_rate": 2.1040221914008323e-06, + "loss": 0.0021, + "step": 11397 + }, + { + "epoch": 7.896085902320748, + "grad_norm": 0.13619092106819153, + "learning_rate": 2.103328710124827e-06, + "loss": 0.0016, + "step": 11398 + }, + { + "epoch": 7.896778662971943, + "grad_norm": 0.4182014465332031, + "learning_rate": 2.1026352288488213e-06, + "loss": 0.0023, + "step": 11399 + }, + { + "epoch": 7.897471423623138, + "grad_norm": 0.1065654307603836, + "learning_rate": 2.101941747572816e-06, + "loss": 0.0015, + "step": 11400 + }, + { + "epoch": 7.898164184274333, + "grad_norm": 0.3538290858268738, + "learning_rate": 2.10124826629681e-06, + "loss": 0.0028, + "step": 11401 + }, + { + "epoch": 7.898856944925528, + "grad_norm": 0.1610267460346222, + "learning_rate": 2.100554785020805e-06, + "loss": 0.0017, + "step": 11402 + }, + { + "epoch": 7.899549705576723, + "grad_norm": 0.10012906044721603, + "learning_rate": 2.099861303744799e-06, + "loss": 0.0016, + "step": 11403 + }, + { + "epoch": 7.900242466227918, + "grad_norm": 0.13199912011623383, + "learning_rate": 2.0991678224687935e-06, + "loss": 0.0017, + "step": 11404 + }, + { + "epoch": 7.900935226879113, + "grad_norm": 0.11438145488500595, + "learning_rate": 2.098474341192788e-06, + "loss": 0.0015, + "step": 11405 + }, + { + "epoch": 7.901627987530309, + "grad_norm": 0.12943808734416962, + "learning_rate": 2.0977808599167825e-06, + "loss": 0.002, + "step": 11406 + }, + { + "epoch": 7.902320748181503, + "grad_norm": 0.24114668369293213, + "learning_rate": 2.097087378640777e-06, + "loss": 0.002, + "step": 11407 + }, + { + "epoch": 7.903013508832698, + "grad_norm": 0.0662049949169159, + "learning_rate": 2.096393897364771e-06, + "loss": 0.0013, + "step": 11408 + }, + { + "epoch": 7.9037062694838935, + "grad_norm": 0.10101411491632462, + "learning_rate": 2.095700416088766e-06, + "loss": 0.0015, + "step": 11409 + }, + { + "epoch": 7.904399030135089, + "grad_norm": 0.1440206617116928, + "learning_rate": 2.09500693481276e-06, + "loss": 0.0018, + "step": 11410 + }, + { + "epoch": 7.905091790786283, + "grad_norm": 0.193631112575531, + "learning_rate": 2.0943134535367546e-06, + "loss": 0.0023, + "step": 11411 + }, + { + "epoch": 7.9057845514374785, + "grad_norm": 0.13785867393016815, + "learning_rate": 2.093619972260749e-06, + "loss": 0.0016, + "step": 11412 + }, + { + "epoch": 7.906477312088674, + "grad_norm": 0.18846537172794342, + "learning_rate": 2.0929264909847436e-06, + "loss": 0.0015, + "step": 11413 + }, + { + "epoch": 7.907170072739868, + "grad_norm": 0.38149115443229675, + "learning_rate": 2.092233009708738e-06, + "loss": 0.0027, + "step": 11414 + }, + { + "epoch": 7.907862833391063, + "grad_norm": 0.2141820341348648, + "learning_rate": 2.0915395284327327e-06, + "loss": 0.0025, + "step": 11415 + }, + { + "epoch": 7.908555594042259, + "grad_norm": 0.10129716992378235, + "learning_rate": 2.0908460471567267e-06, + "loss": 0.0016, + "step": 11416 + }, + { + "epoch": 7.909248354693453, + "grad_norm": 0.08857739716768265, + "learning_rate": 2.0901525658807217e-06, + "loss": 0.0014, + "step": 11417 + }, + { + "epoch": 7.909941115344648, + "grad_norm": 0.14479796588420868, + "learning_rate": 2.0894590846047158e-06, + "loss": 0.0022, + "step": 11418 + }, + { + "epoch": 7.910633875995844, + "grad_norm": 0.19538909196853638, + "learning_rate": 2.0887656033287103e-06, + "loss": 0.0025, + "step": 11419 + }, + { + "epoch": 7.911326636647038, + "grad_norm": 0.07969465106725693, + "learning_rate": 2.0880721220527048e-06, + "loss": 0.0015, + "step": 11420 + }, + { + "epoch": 7.912019397298233, + "grad_norm": 0.09164320677518845, + "learning_rate": 2.0873786407766993e-06, + "loss": 0.0014, + "step": 11421 + }, + { + "epoch": 7.912712157949429, + "grad_norm": 0.0549982450902462, + "learning_rate": 2.086685159500694e-06, + "loss": 0.0013, + "step": 11422 + }, + { + "epoch": 7.913404918600623, + "grad_norm": 0.1624305099248886, + "learning_rate": 2.085991678224688e-06, + "loss": 0.0019, + "step": 11423 + }, + { + "epoch": 7.914097679251818, + "grad_norm": 0.12367258220911026, + "learning_rate": 2.085298196948683e-06, + "loss": 0.0015, + "step": 11424 + }, + { + "epoch": 7.914790439903014, + "grad_norm": 0.3276459872722626, + "learning_rate": 2.084604715672677e-06, + "loss": 0.0025, + "step": 11425 + }, + { + "epoch": 7.915483200554209, + "grad_norm": 0.13010117411613464, + "learning_rate": 2.0839112343966714e-06, + "loss": 0.0021, + "step": 11426 + }, + { + "epoch": 7.916175961205403, + "grad_norm": 0.08906543254852295, + "learning_rate": 2.083217753120666e-06, + "loss": 0.0017, + "step": 11427 + }, + { + "epoch": 7.9168687218565985, + "grad_norm": 0.07558631896972656, + "learning_rate": 2.0825242718446604e-06, + "loss": 0.0014, + "step": 11428 + }, + { + "epoch": 7.917561482507794, + "grad_norm": 0.12094772607088089, + "learning_rate": 2.081830790568655e-06, + "loss": 0.0021, + "step": 11429 + }, + { + "epoch": 7.918254243158988, + "grad_norm": 0.22087305784225464, + "learning_rate": 2.0811373092926495e-06, + "loss": 0.0035, + "step": 11430 + }, + { + "epoch": 7.9189470038101835, + "grad_norm": 0.1467932164669037, + "learning_rate": 2.0804438280166435e-06, + "loss": 0.0016, + "step": 11431 + }, + { + "epoch": 7.919639764461379, + "grad_norm": 0.12079876661300659, + "learning_rate": 2.079750346740638e-06, + "loss": 0.0019, + "step": 11432 + }, + { + "epoch": 7.920332525112574, + "grad_norm": 0.3745475709438324, + "learning_rate": 2.0790568654646326e-06, + "loss": 0.0023, + "step": 11433 + }, + { + "epoch": 7.921025285763768, + "grad_norm": 0.11212224513292313, + "learning_rate": 2.078363384188627e-06, + "loss": 0.0019, + "step": 11434 + }, + { + "epoch": 7.921718046414964, + "grad_norm": 0.13426125049591064, + "learning_rate": 2.0776699029126216e-06, + "loss": 0.0015, + "step": 11435 + }, + { + "epoch": 7.922410807066159, + "grad_norm": 0.04780206456780434, + "learning_rate": 2.076976421636616e-06, + "loss": 0.0013, + "step": 11436 + }, + { + "epoch": 7.923103567717353, + "grad_norm": 0.22956909239292145, + "learning_rate": 2.0762829403606106e-06, + "loss": 0.0027, + "step": 11437 + }, + { + "epoch": 7.923796328368549, + "grad_norm": 0.08223120868206024, + "learning_rate": 2.0755894590846047e-06, + "loss": 0.0013, + "step": 11438 + }, + { + "epoch": 7.924489089019744, + "grad_norm": 0.1168670505285263, + "learning_rate": 2.0748959778085996e-06, + "loss": 0.0014, + "step": 11439 + }, + { + "epoch": 7.925181849670938, + "grad_norm": 0.11173796653747559, + "learning_rate": 2.0742024965325937e-06, + "loss": 0.0019, + "step": 11440 + }, + { + "epoch": 7.925874610322134, + "grad_norm": 0.12356695532798767, + "learning_rate": 2.0735090152565882e-06, + "loss": 0.002, + "step": 11441 + }, + { + "epoch": 7.926567370973329, + "grad_norm": 0.07252473384141922, + "learning_rate": 2.0728155339805827e-06, + "loss": 0.0015, + "step": 11442 + }, + { + "epoch": 7.927260131624523, + "grad_norm": 0.22668412327766418, + "learning_rate": 2.0721220527045772e-06, + "loss": 0.0019, + "step": 11443 + }, + { + "epoch": 7.9279528922757185, + "grad_norm": 0.09695761650800705, + "learning_rate": 2.0714285714285717e-06, + "loss": 0.0016, + "step": 11444 + }, + { + "epoch": 7.928645652926914, + "grad_norm": 0.09948756545782089, + "learning_rate": 2.0707350901525663e-06, + "loss": 0.0016, + "step": 11445 + }, + { + "epoch": 7.929338413578109, + "grad_norm": 0.22758884727954865, + "learning_rate": 2.0700416088765603e-06, + "loss": 0.0021, + "step": 11446 + }, + { + "epoch": 7.9300311742293035, + "grad_norm": 0.0764935314655304, + "learning_rate": 2.069348127600555e-06, + "loss": 0.0015, + "step": 11447 + }, + { + "epoch": 7.930723934880499, + "grad_norm": 0.20141349732875824, + "learning_rate": 2.0686546463245494e-06, + "loss": 0.0026, + "step": 11448 + }, + { + "epoch": 7.931416695531694, + "grad_norm": 0.3428936302661896, + "learning_rate": 2.067961165048544e-06, + "loss": 0.0029, + "step": 11449 + }, + { + "epoch": 7.932109456182888, + "grad_norm": 0.06243060529232025, + "learning_rate": 2.0672676837725384e-06, + "loss": 0.0012, + "step": 11450 + }, + { + "epoch": 7.932802216834084, + "grad_norm": 0.1965719759464264, + "learning_rate": 2.066574202496533e-06, + "loss": 0.0018, + "step": 11451 + }, + { + "epoch": 7.933494977485279, + "grad_norm": 0.2134178876876831, + "learning_rate": 2.0658807212205274e-06, + "loss": 0.0021, + "step": 11452 + }, + { + "epoch": 7.934187738136474, + "grad_norm": 0.2270844727754593, + "learning_rate": 2.0651872399445215e-06, + "loss": 0.0017, + "step": 11453 + }, + { + "epoch": 7.934880498787669, + "grad_norm": 0.07822228223085403, + "learning_rate": 2.0644937586685164e-06, + "loss": 0.0013, + "step": 11454 + }, + { + "epoch": 7.935573259438864, + "grad_norm": 0.10275891423225403, + "learning_rate": 2.0638002773925105e-06, + "loss": 0.0016, + "step": 11455 + }, + { + "epoch": 7.936266020090059, + "grad_norm": 0.16150107979774475, + "learning_rate": 2.063106796116505e-06, + "loss": 0.0018, + "step": 11456 + }, + { + "epoch": 7.936958780741254, + "grad_norm": 0.16756756603717804, + "learning_rate": 2.0624133148404995e-06, + "loss": 0.0025, + "step": 11457 + }, + { + "epoch": 7.937651541392449, + "grad_norm": 0.2795891761779785, + "learning_rate": 2.061719833564494e-06, + "loss": 0.002, + "step": 11458 + }, + { + "epoch": 7.938344302043644, + "grad_norm": 0.16245059669017792, + "learning_rate": 2.061026352288488e-06, + "loss": 0.0018, + "step": 11459 + }, + { + "epoch": 7.9390370626948386, + "grad_norm": 0.25377747416496277, + "learning_rate": 2.060332871012483e-06, + "loss": 0.0021, + "step": 11460 + }, + { + "epoch": 7.939729823346034, + "grad_norm": 0.11814465373754501, + "learning_rate": 2.059639389736477e-06, + "loss": 0.0014, + "step": 11461 + }, + { + "epoch": 7.940422583997229, + "grad_norm": 0.14700928330421448, + "learning_rate": 2.0589459084604717e-06, + "loss": 0.0026, + "step": 11462 + }, + { + "epoch": 7.9411153446484235, + "grad_norm": 0.11324573308229446, + "learning_rate": 2.058252427184466e-06, + "loss": 0.0015, + "step": 11463 + }, + { + "epoch": 7.941808105299619, + "grad_norm": 0.23448750376701355, + "learning_rate": 2.0575589459084607e-06, + "loss": 0.0031, + "step": 11464 + }, + { + "epoch": 7.942500865950814, + "grad_norm": 0.09061235189437866, + "learning_rate": 2.056865464632455e-06, + "loss": 0.0015, + "step": 11465 + }, + { + "epoch": 7.943193626602009, + "grad_norm": 0.33940520882606506, + "learning_rate": 2.0561719833564493e-06, + "loss": 0.0033, + "step": 11466 + }, + { + "epoch": 7.943886387253204, + "grad_norm": 0.26225733757019043, + "learning_rate": 2.055478502080444e-06, + "loss": 0.0023, + "step": 11467 + }, + { + "epoch": 7.944579147904399, + "grad_norm": 0.07459697127342224, + "learning_rate": 2.0547850208044383e-06, + "loss": 0.0015, + "step": 11468 + }, + { + "epoch": 7.945271908555594, + "grad_norm": 0.06712120026350021, + "learning_rate": 2.054091539528433e-06, + "loss": 0.0012, + "step": 11469 + }, + { + "epoch": 7.945964669206789, + "grad_norm": 0.20380155742168427, + "learning_rate": 2.0533980582524273e-06, + "loss": 0.0017, + "step": 11470 + }, + { + "epoch": 7.946657429857984, + "grad_norm": 0.13156256079673767, + "learning_rate": 2.052704576976422e-06, + "loss": 0.0018, + "step": 11471 + }, + { + "epoch": 7.947350190509179, + "grad_norm": 0.3585037589073181, + "learning_rate": 2.0520110957004163e-06, + "loss": 0.0045, + "step": 11472 + }, + { + "epoch": 7.9480429511603745, + "grad_norm": 0.06246247887611389, + "learning_rate": 2.051317614424411e-06, + "loss": 0.0014, + "step": 11473 + }, + { + "epoch": 7.948735711811569, + "grad_norm": 0.11869733035564423, + "learning_rate": 2.050624133148405e-06, + "loss": 0.0017, + "step": 11474 + }, + { + "epoch": 7.949428472462764, + "grad_norm": 0.12826837599277496, + "learning_rate": 2.0499306518724e-06, + "loss": 0.0013, + "step": 11475 + }, + { + "epoch": 7.9501212331139595, + "grad_norm": 0.07099510729312897, + "learning_rate": 2.049237170596394e-06, + "loss": 0.0015, + "step": 11476 + }, + { + "epoch": 7.950813993765154, + "grad_norm": 0.0927015021443367, + "learning_rate": 2.0485436893203885e-06, + "loss": 0.0016, + "step": 11477 + }, + { + "epoch": 7.951506754416349, + "grad_norm": 0.14832226932048798, + "learning_rate": 2.047850208044383e-06, + "loss": 0.002, + "step": 11478 + }, + { + "epoch": 7.952199515067544, + "grad_norm": 0.1016480028629303, + "learning_rate": 2.0471567267683775e-06, + "loss": 0.0014, + "step": 11479 + }, + { + "epoch": 7.952892275718739, + "grad_norm": 0.08733993023633957, + "learning_rate": 2.046463245492372e-06, + "loss": 0.0013, + "step": 11480 + }, + { + "epoch": 7.953585036369934, + "grad_norm": 0.18449991941452026, + "learning_rate": 2.045769764216366e-06, + "loss": 0.002, + "step": 11481 + }, + { + "epoch": 7.954277797021129, + "grad_norm": 0.278288334608078, + "learning_rate": 2.045076282940361e-06, + "loss": 0.0019, + "step": 11482 + }, + { + "epoch": 7.954970557672324, + "grad_norm": 0.0932365208864212, + "learning_rate": 2.044382801664355e-06, + "loss": 0.0015, + "step": 11483 + }, + { + "epoch": 7.955663318323519, + "grad_norm": 0.2478647381067276, + "learning_rate": 2.0436893203883496e-06, + "loss": 0.0021, + "step": 11484 + }, + { + "epoch": 7.956356078974714, + "grad_norm": 0.08164886385202408, + "learning_rate": 2.042995839112344e-06, + "loss": 0.0013, + "step": 11485 + }, + { + "epoch": 7.95704883962591, + "grad_norm": 0.5557247996330261, + "learning_rate": 2.0423023578363386e-06, + "loss": 0.0064, + "step": 11486 + }, + { + "epoch": 7.957741600277104, + "grad_norm": 0.08273177593946457, + "learning_rate": 2.041608876560333e-06, + "loss": 0.0014, + "step": 11487 + }, + { + "epoch": 7.958434360928299, + "grad_norm": 0.109470434486866, + "learning_rate": 2.0409153952843276e-06, + "loss": 0.0018, + "step": 11488 + }, + { + "epoch": 7.9591271215794945, + "grad_norm": 0.08045624941587448, + "learning_rate": 2.0402219140083217e-06, + "loss": 0.0014, + "step": 11489 + }, + { + "epoch": 7.959819882230689, + "grad_norm": 0.11285190284252167, + "learning_rate": 2.0395284327323162e-06, + "loss": 0.0014, + "step": 11490 + }, + { + "epoch": 7.960512642881884, + "grad_norm": 0.08281496912240982, + "learning_rate": 2.0388349514563107e-06, + "loss": 0.0014, + "step": 11491 + }, + { + "epoch": 7.9612054035330795, + "grad_norm": 0.25705453753471375, + "learning_rate": 2.0381414701803053e-06, + "loss": 0.0026, + "step": 11492 + }, + { + "epoch": 7.961898164184275, + "grad_norm": 0.07038560509681702, + "learning_rate": 2.0374479889042998e-06, + "loss": 0.0012, + "step": 11493 + }, + { + "epoch": 7.962590924835469, + "grad_norm": 0.08487432450056076, + "learning_rate": 2.0367545076282943e-06, + "loss": 0.0013, + "step": 11494 + }, + { + "epoch": 7.963283685486664, + "grad_norm": 0.07545677572488785, + "learning_rate": 2.0360610263522888e-06, + "loss": 0.0015, + "step": 11495 + }, + { + "epoch": 7.96397644613786, + "grad_norm": 0.09319596737623215, + "learning_rate": 2.035367545076283e-06, + "loss": 0.0016, + "step": 11496 + }, + { + "epoch": 7.964669206789054, + "grad_norm": 0.17455600202083588, + "learning_rate": 2.034674063800278e-06, + "loss": 0.0027, + "step": 11497 + }, + { + "epoch": 7.965361967440249, + "grad_norm": 0.09785149991512299, + "learning_rate": 2.033980582524272e-06, + "loss": 0.0016, + "step": 11498 + }, + { + "epoch": 7.966054728091445, + "grad_norm": 0.21693672239780426, + "learning_rate": 2.0332871012482664e-06, + "loss": 0.0023, + "step": 11499 + }, + { + "epoch": 7.966747488742639, + "grad_norm": 0.15302878618240356, + "learning_rate": 2.032593619972261e-06, + "loss": 0.002, + "step": 11500 + }, + { + "epoch": 7.967440249393834, + "grad_norm": 0.15403737127780914, + "learning_rate": 2.0319001386962554e-06, + "loss": 0.0018, + "step": 11501 + }, + { + "epoch": 7.96813301004503, + "grad_norm": 0.2522105276584625, + "learning_rate": 2.03120665742025e-06, + "loss": 0.0025, + "step": 11502 + }, + { + "epoch": 7.968825770696224, + "grad_norm": 0.10669081658124924, + "learning_rate": 2.0305131761442444e-06, + "loss": 0.0017, + "step": 11503 + }, + { + "epoch": 7.969518531347419, + "grad_norm": 0.06838635355234146, + "learning_rate": 2.0298196948682385e-06, + "loss": 0.0011, + "step": 11504 + }, + { + "epoch": 7.9702112919986146, + "grad_norm": 0.32742977142333984, + "learning_rate": 2.029126213592233e-06, + "loss": 0.002, + "step": 11505 + }, + { + "epoch": 7.97090405264981, + "grad_norm": 0.40055006742477417, + "learning_rate": 2.0284327323162275e-06, + "loss": 0.0019, + "step": 11506 + }, + { + "epoch": 7.971596813301004, + "grad_norm": 0.12538351118564606, + "learning_rate": 2.027739251040222e-06, + "loss": 0.0018, + "step": 11507 + }, + { + "epoch": 7.9722895739521995, + "grad_norm": 0.07275067269802094, + "learning_rate": 2.0270457697642166e-06, + "loss": 0.0014, + "step": 11508 + }, + { + "epoch": 7.972982334603395, + "grad_norm": 0.19842331111431122, + "learning_rate": 2.0263522884882107e-06, + "loss": 0.0024, + "step": 11509 + }, + { + "epoch": 7.973675095254589, + "grad_norm": 0.07606042176485062, + "learning_rate": 2.0256588072122056e-06, + "loss": 0.0014, + "step": 11510 + }, + { + "epoch": 7.9743678559057845, + "grad_norm": 0.09152119606733322, + "learning_rate": 2.0249653259361997e-06, + "loss": 0.0012, + "step": 11511 + }, + { + "epoch": 7.97506061655698, + "grad_norm": 0.12996184825897217, + "learning_rate": 2.0242718446601946e-06, + "loss": 0.0016, + "step": 11512 + }, + { + "epoch": 7.975753377208175, + "grad_norm": 0.08830322325229645, + "learning_rate": 2.0235783633841887e-06, + "loss": 0.0016, + "step": 11513 + }, + { + "epoch": 7.976446137859369, + "grad_norm": 0.08426600694656372, + "learning_rate": 2.022884882108183e-06, + "loss": 0.0014, + "step": 11514 + }, + { + "epoch": 7.977138898510565, + "grad_norm": 0.1549033224582672, + "learning_rate": 2.0221914008321777e-06, + "loss": 0.0018, + "step": 11515 + }, + { + "epoch": 7.97783165916176, + "grad_norm": 0.22533456981182098, + "learning_rate": 2.0214979195561722e-06, + "loss": 0.0021, + "step": 11516 + }, + { + "epoch": 7.978524419812954, + "grad_norm": 0.08146145939826965, + "learning_rate": 2.0208044382801667e-06, + "loss": 0.0015, + "step": 11517 + }, + { + "epoch": 7.97921718046415, + "grad_norm": 0.13984590768814087, + "learning_rate": 2.0201109570041612e-06, + "loss": 0.0017, + "step": 11518 + }, + { + "epoch": 7.979909941115345, + "grad_norm": 0.18048150837421417, + "learning_rate": 2.0194174757281553e-06, + "loss": 0.0017, + "step": 11519 + }, + { + "epoch": 7.980602701766539, + "grad_norm": 0.14031150937080383, + "learning_rate": 2.01872399445215e-06, + "loss": 0.0014, + "step": 11520 + }, + { + "epoch": 7.981295462417735, + "grad_norm": 0.1632668375968933, + "learning_rate": 2.0180305131761443e-06, + "loss": 0.0017, + "step": 11521 + }, + { + "epoch": 7.98198822306893, + "grad_norm": 0.11185518652200699, + "learning_rate": 2.017337031900139e-06, + "loss": 0.0016, + "step": 11522 + }, + { + "epoch": 7.982680983720124, + "grad_norm": 0.11018916964530945, + "learning_rate": 2.0166435506241334e-06, + "loss": 0.0017, + "step": 11523 + }, + { + "epoch": 7.9833737443713195, + "grad_norm": 0.819999098777771, + "learning_rate": 2.0159500693481275e-06, + "loss": 0.0022, + "step": 11524 + }, + { + "epoch": 7.984066505022515, + "grad_norm": 0.15081410109996796, + "learning_rate": 2.0152565880721224e-06, + "loss": 0.0033, + "step": 11525 + }, + { + "epoch": 7.98475926567371, + "grad_norm": 0.48248326778411865, + "learning_rate": 2.0145631067961165e-06, + "loss": 0.0025, + "step": 11526 + }, + { + "epoch": 7.9854520263249045, + "grad_norm": 0.13126465678215027, + "learning_rate": 2.0138696255201114e-06, + "loss": 0.0016, + "step": 11527 + }, + { + "epoch": 7.9861447869761, + "grad_norm": 0.08971597254276276, + "learning_rate": 2.0131761442441055e-06, + "loss": 0.0015, + "step": 11528 + }, + { + "epoch": 7.986837547627295, + "grad_norm": 0.1712157279253006, + "learning_rate": 2.0124826629681e-06, + "loss": 0.0016, + "step": 11529 + }, + { + "epoch": 7.987530308278489, + "grad_norm": 0.1463373899459839, + "learning_rate": 2.0117891816920945e-06, + "loss": 0.0019, + "step": 11530 + }, + { + "epoch": 7.988223068929685, + "grad_norm": 0.1179339662194252, + "learning_rate": 2.011095700416089e-06, + "loss": 0.0013, + "step": 11531 + }, + { + "epoch": 7.98891582958088, + "grad_norm": 0.29913946986198425, + "learning_rate": 2.0104022191400835e-06, + "loss": 0.0016, + "step": 11532 + }, + { + "epoch": 7.989608590232075, + "grad_norm": 0.0902574211359024, + "learning_rate": 2.009708737864078e-06, + "loss": 0.0014, + "step": 11533 + }, + { + "epoch": 7.99030135088327, + "grad_norm": 0.08017268776893616, + "learning_rate": 2.009015256588072e-06, + "loss": 0.0014, + "step": 11534 + }, + { + "epoch": 7.990994111534465, + "grad_norm": 0.15926645696163177, + "learning_rate": 2.0083217753120666e-06, + "loss": 0.0018, + "step": 11535 + }, + { + "epoch": 7.99168687218566, + "grad_norm": 0.14761313796043396, + "learning_rate": 2.007628294036061e-06, + "loss": 0.0019, + "step": 11536 + }, + { + "epoch": 7.992379632836855, + "grad_norm": 0.21028392016887665, + "learning_rate": 2.0069348127600557e-06, + "loss": 0.0018, + "step": 11537 + }, + { + "epoch": 7.99307239348805, + "grad_norm": 0.17841485142707825, + "learning_rate": 2.00624133148405e-06, + "loss": 0.0039, + "step": 11538 + }, + { + "epoch": 7.993765154139245, + "grad_norm": 0.08202453702688217, + "learning_rate": 2.0055478502080443e-06, + "loss": 0.0013, + "step": 11539 + }, + { + "epoch": 7.9944579147904395, + "grad_norm": 0.1495659202337265, + "learning_rate": 2.004854368932039e-06, + "loss": 0.0016, + "step": 11540 + }, + { + "epoch": 7.995150675441635, + "grad_norm": 0.4311334788799286, + "learning_rate": 2.0041608876560333e-06, + "loss": 0.0028, + "step": 11541 + }, + { + "epoch": 7.99584343609283, + "grad_norm": 0.10675432533025742, + "learning_rate": 2.003467406380028e-06, + "loss": 0.0017, + "step": 11542 + }, + { + "epoch": 7.9965361967440245, + "grad_norm": 0.12781338393688202, + "learning_rate": 2.0027739251040223e-06, + "loss": 0.002, + "step": 11543 + }, + { + "epoch": 7.99722895739522, + "grad_norm": 0.1689104437828064, + "learning_rate": 2.002080443828017e-06, + "loss": 0.0022, + "step": 11544 + }, + { + "epoch": 7.997921718046415, + "grad_norm": 0.1593383103609085, + "learning_rate": 2.0013869625520113e-06, + "loss": 0.0021, + "step": 11545 + }, + { + "epoch": 7.99861447869761, + "grad_norm": 0.2896552085876465, + "learning_rate": 2.000693481276006e-06, + "loss": 0.0018, + "step": 11546 + }, + { + "epoch": 7.999307239348805, + "grad_norm": 0.08897218853235245, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0015, + "step": 11547 + }, + { + "epoch": 8.0, + "grad_norm": 0.16805683076381683, + "learning_rate": 1.9993065187239944e-06, + "loss": 0.0025, + "step": 11548 + }, + { + "epoch": 8.0, + "eval_loss": 0.31035521626472473, + "eval_runtime": 7617.722, + "eval_samples_per_second": 1.05, + "eval_steps_per_second": 0.033, + "eval_wer": 12.103847094587513, + "step": 11548 + } + ], + "logging_steps": 1, + "max_steps": 14430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.277245051186708e+21, + "train_batch_size": 80, + "trial_name": null, + "trial_params": null +}