{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15244991073199762, "eval_steps": 385, "global_step": 1537, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.918666931164451e-05, "grad_norm": 1.5675016641616821, "learning_rate": 2e-05, "loss": 44.4878, "step": 1 }, { "epoch": 9.918666931164451e-05, "eval_loss": 11.11313247680664, "eval_runtime": 12.6586, "eval_samples_per_second": 335.345, "eval_steps_per_second": 167.712, "step": 1 }, { "epoch": 0.00019837333862328903, "grad_norm": 1.4330368041992188, "learning_rate": 4e-05, "loss": 44.5058, "step": 2 }, { "epoch": 0.00029756000793493357, "grad_norm": 1.5805532932281494, "learning_rate": 6e-05, "loss": 44.4799, "step": 3 }, { "epoch": 0.00039674667724657806, "grad_norm": 1.3924373388290405, "learning_rate": 8e-05, "loss": 44.501, "step": 4 }, { "epoch": 0.0004959333465582226, "grad_norm": 1.5956809520721436, "learning_rate": 0.0001, "loss": 44.3656, "step": 5 }, { "epoch": 0.0005951200158698671, "grad_norm": 1.894731879234314, "learning_rate": 0.00012, "loss": 44.5017, "step": 6 }, { "epoch": 0.0006943066851815116, "grad_norm": 1.578338623046875, "learning_rate": 0.00014, "loss": 44.5081, "step": 7 }, { "epoch": 0.0007934933544931561, "grad_norm": 1.4924142360687256, "learning_rate": 0.00016, "loss": 44.55, "step": 8 }, { "epoch": 0.0008926800238048007, "grad_norm": 1.7426700592041016, "learning_rate": 0.00018, "loss": 44.3046, "step": 9 }, { "epoch": 0.0009918666931164452, "grad_norm": 1.4936484098434448, "learning_rate": 0.0002, "loss": 44.4573, "step": 10 }, { "epoch": 0.0010910533624280897, "grad_norm": 1.6182663440704346, "learning_rate": 0.0001999997883630362, "loss": 44.3885, "step": 11 }, { "epoch": 0.0011902400317397343, "grad_norm": 1.6213288307189941, "learning_rate": 0.00019999915345304058, "loss": 44.5466, "step": 12 }, { "epoch": 0.0012894267010513786, "grad_norm": 1.5364705324172974, "learning_rate": 0.00019999809527270051, "loss": 44.5294, "step": 13 }, { "epoch": 0.0013886133703630231, "grad_norm": 1.6264804601669312, "learning_rate": 0.00019999661382649509, "loss": 44.5433, "step": 14 }, { "epoch": 0.0014878000396746677, "grad_norm": 1.601863980293274, "learning_rate": 0.0001999947091206948, "loss": 44.3718, "step": 15 }, { "epoch": 0.0015869867089863122, "grad_norm": 1.6560978889465332, "learning_rate": 0.0001999923811633618, "loss": 44.5575, "step": 16 }, { "epoch": 0.0016861733782979568, "grad_norm": 1.6490329504013062, "learning_rate": 0.00019998962996434975, "loss": 44.347, "step": 17 }, { "epoch": 0.0017853600476096013, "grad_norm": 1.4694714546203613, "learning_rate": 0.00019998645553530374, "loss": 44.4052, "step": 18 }, { "epoch": 0.0018845467169212459, "grad_norm": 1.6723171472549438, "learning_rate": 0.00019998285788966027, "loss": 44.3845, "step": 19 }, { "epoch": 0.0019837333862328904, "grad_norm": 1.7051684856414795, "learning_rate": 0.0001999788370426473, "loss": 44.4125, "step": 20 }, { "epoch": 0.002082920055544535, "grad_norm": 1.6583561897277832, "learning_rate": 0.00019997439301128392, "loss": 44.3765, "step": 21 }, { "epoch": 0.0021821067248561795, "grad_norm": 1.8576438426971436, "learning_rate": 0.00019996952581438068, "loss": 44.4891, "step": 22 }, { "epoch": 0.002281293394167824, "grad_norm": 1.848366618156433, "learning_rate": 0.00019996423547253908, "loss": 44.5022, "step": 23 }, { "epoch": 0.0023804800634794686, "grad_norm": 1.5471642017364502, "learning_rate": 0.0001999585220081518, "loss": 44.3303, "step": 24 }, { "epoch": 0.0024796667327911127, "grad_norm": 1.4158411026000977, "learning_rate": 0.00019995238544540241, "loss": 44.2844, "step": 25 }, { "epoch": 0.002578853402102757, "grad_norm": 1.7801810503005981, "learning_rate": 0.00019994582581026537, "loss": 44.2582, "step": 26 }, { "epoch": 0.0026780400714144018, "grad_norm": 1.5551291704177856, "learning_rate": 0.00019993884313050596, "loss": 44.4768, "step": 27 }, { "epoch": 0.0027772267407260463, "grad_norm": 1.8527891635894775, "learning_rate": 0.00019993143743568, "loss": 44.4173, "step": 28 }, { "epoch": 0.002876413410037691, "grad_norm": 2.0317776203155518, "learning_rate": 0.00019992360875713388, "loss": 44.2719, "step": 29 }, { "epoch": 0.0029756000793493354, "grad_norm": 1.7244848012924194, "learning_rate": 0.00019991535712800436, "loss": 44.4475, "step": 30 }, { "epoch": 0.00307478674866098, "grad_norm": 1.6184711456298828, "learning_rate": 0.0001999066825832184, "loss": 44.2963, "step": 31 }, { "epoch": 0.0031739734179726245, "grad_norm": 1.8100088834762573, "learning_rate": 0.00019989758515949314, "loss": 44.4886, "step": 32 }, { "epoch": 0.003273160087284269, "grad_norm": 1.7541842460632324, "learning_rate": 0.0001998880648953356, "loss": 44.3164, "step": 33 }, { "epoch": 0.0033723467565959135, "grad_norm": 2.108027935028076, "learning_rate": 0.00019987812183104247, "loss": 44.2234, "step": 34 }, { "epoch": 0.003471533425907558, "grad_norm": 2.0712199211120605, "learning_rate": 0.00019986775600870028, "loss": 44.4076, "step": 35 }, { "epoch": 0.0035707200952192026, "grad_norm": 1.7738473415374756, "learning_rate": 0.0001998569674721848, "loss": 44.3042, "step": 36 }, { "epoch": 0.003669906764530847, "grad_norm": 2.1896848678588867, "learning_rate": 0.0001998457562671611, "loss": 44.4304, "step": 37 }, { "epoch": 0.0037690934338424917, "grad_norm": 2.1115825176239014, "learning_rate": 0.0001998341224410833, "loss": 44.2476, "step": 38 }, { "epoch": 0.0038682801031541363, "grad_norm": 1.9150196313858032, "learning_rate": 0.00019982206604319432, "loss": 44.2516, "step": 39 }, { "epoch": 0.003967466772465781, "grad_norm": 2.235543966293335, "learning_rate": 0.00019980958712452577, "loss": 44.2808, "step": 40 }, { "epoch": 0.004066653441777425, "grad_norm": 2.3197598457336426, "learning_rate": 0.00019979668573789764, "loss": 44.0965, "step": 41 }, { "epoch": 0.00416584011108907, "grad_norm": 2.2134547233581543, "learning_rate": 0.00019978336193791818, "loss": 44.0797, "step": 42 }, { "epoch": 0.004265026780400714, "grad_norm": 2.184122085571289, "learning_rate": 0.0001997696157809835, "loss": 44.2404, "step": 43 }, { "epoch": 0.004364213449712359, "grad_norm": 2.197744131088257, "learning_rate": 0.00019975544732527754, "loss": 44.1234, "step": 44 }, { "epoch": 0.0044634001190240035, "grad_norm": 2.085909843444824, "learning_rate": 0.0001997408566307717, "loss": 44.0743, "step": 45 }, { "epoch": 0.004562586788335648, "grad_norm": 2.50148344039917, "learning_rate": 0.0001997258437592245, "loss": 44.0883, "step": 46 }, { "epoch": 0.004661773457647293, "grad_norm": 2.4316093921661377, "learning_rate": 0.0001997104087741816, "loss": 44.2416, "step": 47 }, { "epoch": 0.004760960126958937, "grad_norm": 2.3223953247070312, "learning_rate": 0.00019969455174097522, "loss": 44.2523, "step": 48 }, { "epoch": 0.004860146796270581, "grad_norm": 2.0773441791534424, "learning_rate": 0.00019967827272672408, "loss": 44.2364, "step": 49 }, { "epoch": 0.004959333465582225, "grad_norm": 2.2387514114379883, "learning_rate": 0.00019966157180033295, "loss": 44.1623, "step": 50 }, { "epoch": 0.00505852013489387, "grad_norm": 2.4744904041290283, "learning_rate": 0.00019964444903249257, "loss": 43.9827, "step": 51 }, { "epoch": 0.005157706804205514, "grad_norm": 2.2337639331817627, "learning_rate": 0.00019962690449567912, "loss": 43.9373, "step": 52 }, { "epoch": 0.005256893473517159, "grad_norm": 2.2573468685150146, "learning_rate": 0.000199608938264154, "loss": 44.0735, "step": 53 }, { "epoch": 0.0053560801428288035, "grad_norm": 2.193406581878662, "learning_rate": 0.00019959055041396365, "loss": 44.0876, "step": 54 }, { "epoch": 0.005455266812140448, "grad_norm": 2.396475315093994, "learning_rate": 0.000199571741022939, "loss": 43.9384, "step": 55 }, { "epoch": 0.005554453481452093, "grad_norm": 2.4828500747680664, "learning_rate": 0.00019955251017069535, "loss": 43.8114, "step": 56 }, { "epoch": 0.005653640150763737, "grad_norm": 2.346181869506836, "learning_rate": 0.00019953285793863183, "loss": 43.8613, "step": 57 }, { "epoch": 0.005752826820075382, "grad_norm": 2.233077049255371, "learning_rate": 0.0001995127844099313, "loss": 43.9494, "step": 58 }, { "epoch": 0.005852013489387026, "grad_norm": 2.2398388385772705, "learning_rate": 0.00019949228966955964, "loss": 43.9634, "step": 59 }, { "epoch": 0.005951200158698671, "grad_norm": 2.659518241882324, "learning_rate": 0.00019947137380426584, "loss": 43.7641, "step": 60 }, { "epoch": 0.006050386828010315, "grad_norm": 2.532860279083252, "learning_rate": 0.00019945003690258125, "loss": 43.7495, "step": 61 }, { "epoch": 0.00614957349732196, "grad_norm": 2.4075887203216553, "learning_rate": 0.00019942827905481948, "loss": 43.778, "step": 62 }, { "epoch": 0.006248760166633604, "grad_norm": 2.2907755374908447, "learning_rate": 0.0001994061003530758, "loss": 43.7928, "step": 63 }, { "epoch": 0.006347946835945249, "grad_norm": 2.027778148651123, "learning_rate": 0.00019938350089122682, "loss": 43.7225, "step": 64 }, { "epoch": 0.0064471335052568935, "grad_norm": 2.1447701454162598, "learning_rate": 0.00019936048076493024, "loss": 43.87, "step": 65 }, { "epoch": 0.006546320174568538, "grad_norm": 2.119171142578125, "learning_rate": 0.00019933704007162422, "loss": 43.7435, "step": 66 }, { "epoch": 0.0066455068438801826, "grad_norm": 2.7517640590667725, "learning_rate": 0.00019931317891052708, "loss": 43.6046, "step": 67 }, { "epoch": 0.006744693513191827, "grad_norm": 2.407707691192627, "learning_rate": 0.00019928889738263692, "loss": 43.6347, "step": 68 }, { "epoch": 0.006843880182503472, "grad_norm": 2.4712741374969482, "learning_rate": 0.00019926419559073116, "loss": 43.6957, "step": 69 }, { "epoch": 0.006943066851815116, "grad_norm": 2.052049160003662, "learning_rate": 0.00019923907363936593, "loss": 43.7108, "step": 70 }, { "epoch": 0.007042253521126761, "grad_norm": 2.056997060775757, "learning_rate": 0.00019921353163487605, "loss": 43.5159, "step": 71 }, { "epoch": 0.007141440190438405, "grad_norm": 2.082824945449829, "learning_rate": 0.00019918756968537404, "loss": 43.535, "step": 72 }, { "epoch": 0.00724062685975005, "grad_norm": 2.326700210571289, "learning_rate": 0.00019916118790075008, "loss": 43.5982, "step": 73 }, { "epoch": 0.007339813529061694, "grad_norm": 2.0763967037200928, "learning_rate": 0.00019913438639267147, "loss": 43.7443, "step": 74 }, { "epoch": 0.007439000198373339, "grad_norm": 2.132988691329956, "learning_rate": 0.00019910716527458196, "loss": 43.6283, "step": 75 }, { "epoch": 0.007538186867684983, "grad_norm": 2.0437686443328857, "learning_rate": 0.00019907952466170138, "loss": 43.5852, "step": 76 }, { "epoch": 0.007637373536996628, "grad_norm": 2.0281074047088623, "learning_rate": 0.00019905146467102535, "loss": 43.6325, "step": 77 }, { "epoch": 0.0077365602063082725, "grad_norm": 1.7176659107208252, "learning_rate": 0.0001990229854213244, "loss": 43.627, "step": 78 }, { "epoch": 0.007835746875619916, "grad_norm": 2.1028647422790527, "learning_rate": 0.00019899408703314385, "loss": 43.4667, "step": 79 }, { "epoch": 0.007934933544931562, "grad_norm": 2.0309996604919434, "learning_rate": 0.00019896476962880298, "loss": 43.4972, "step": 80 }, { "epoch": 0.008034120214243205, "grad_norm": 2.239147901535034, "learning_rate": 0.00019893503333239472, "loss": 43.5698, "step": 81 }, { "epoch": 0.00813330688355485, "grad_norm": 2.0050814151763916, "learning_rate": 0.0001989048782697851, "loss": 43.6927, "step": 82 }, { "epoch": 0.008232493552866494, "grad_norm": 1.7180596590042114, "learning_rate": 0.00019887430456861263, "loss": 43.6059, "step": 83 }, { "epoch": 0.00833168022217814, "grad_norm": 1.9801454544067383, "learning_rate": 0.0001988433123582878, "loss": 43.4086, "step": 84 }, { "epoch": 0.008430866891489783, "grad_norm": 1.8289369344711304, "learning_rate": 0.00019881190176999255, "loss": 43.6122, "step": 85 }, { "epoch": 0.008530053560801429, "grad_norm": 1.9409068822860718, "learning_rate": 0.0001987800729366797, "loss": 43.5798, "step": 86 }, { "epoch": 0.008629240230113073, "grad_norm": 1.632542371749878, "learning_rate": 0.00019874782599307246, "loss": 43.5092, "step": 87 }, { "epoch": 0.008728426899424718, "grad_norm": 1.9863494634628296, "learning_rate": 0.00019871516107566366, "loss": 43.5534, "step": 88 }, { "epoch": 0.008827613568736362, "grad_norm": 2.040839910507202, "learning_rate": 0.00019868207832271544, "loss": 43.5086, "step": 89 }, { "epoch": 0.008926800238048007, "grad_norm": 2.190769672393799, "learning_rate": 0.00019864857787425843, "loss": 43.4819, "step": 90 }, { "epoch": 0.00902598690735965, "grad_norm": 1.9851669073104858, "learning_rate": 0.0001986146598720913, "loss": 43.5616, "step": 91 }, { "epoch": 0.009125173576671296, "grad_norm": 1.8120468854904175, "learning_rate": 0.00019858032445978016, "loss": 43.5843, "step": 92 }, { "epoch": 0.00922436024598294, "grad_norm": 1.8418827056884766, "learning_rate": 0.0001985455717826578, "loss": 43.5347, "step": 93 }, { "epoch": 0.009323546915294585, "grad_norm": 1.6635512113571167, "learning_rate": 0.00019851040198782326, "loss": 43.5441, "step": 94 }, { "epoch": 0.009422733584606229, "grad_norm": 1.7090002298355103, "learning_rate": 0.0001984748152241411, "loss": 43.6308, "step": 95 }, { "epoch": 0.009521920253917874, "grad_norm": 1.870936632156372, "learning_rate": 0.00019843881164224083, "loss": 43.4429, "step": 96 }, { "epoch": 0.009621106923229518, "grad_norm": 1.9159389734268188, "learning_rate": 0.0001984023913945162, "loss": 43.5475, "step": 97 }, { "epoch": 0.009720293592541162, "grad_norm": 1.7508138418197632, "learning_rate": 0.00019836555463512468, "loss": 43.5315, "step": 98 }, { "epoch": 0.009819480261852807, "grad_norm": 2.1173791885375977, "learning_rate": 0.00019832830151998658, "loss": 43.2984, "step": 99 }, { "epoch": 0.00991866693116445, "grad_norm": 2.0050880908966064, "learning_rate": 0.0001982906322067847, "loss": 43.3122, "step": 100 }, { "epoch": 0.010017853600476096, "grad_norm": 1.960893988609314, "learning_rate": 0.00019825254685496338, "loss": 43.3747, "step": 101 }, { "epoch": 0.01011704026978774, "grad_norm": 1.678585410118103, "learning_rate": 0.00019821404562572802, "loss": 43.4109, "step": 102 }, { "epoch": 0.010216226939099385, "grad_norm": 1.6985716819763184, "learning_rate": 0.00019817512868204425, "loss": 43.5895, "step": 103 }, { "epoch": 0.010315413608411029, "grad_norm": 1.864445447921753, "learning_rate": 0.00019813579618863734, "loss": 43.5837, "step": 104 }, { "epoch": 0.010414600277722674, "grad_norm": 1.9400899410247803, "learning_rate": 0.00019809604831199154, "loss": 43.5438, "step": 105 }, { "epoch": 0.010513786947034318, "grad_norm": 1.9073424339294434, "learning_rate": 0.00019805588522034916, "loss": 43.6905, "step": 106 }, { "epoch": 0.010612973616345963, "grad_norm": 1.7238534688949585, "learning_rate": 0.00019801530708371017, "loss": 43.6856, "step": 107 }, { "epoch": 0.010712160285657607, "grad_norm": 1.609853982925415, "learning_rate": 0.00019797431407383122, "loss": 43.4932, "step": 108 }, { "epoch": 0.010811346954969252, "grad_norm": 1.9697946310043335, "learning_rate": 0.00019793290636422505, "loss": 43.3459, "step": 109 }, { "epoch": 0.010910533624280896, "grad_norm": 1.9920051097869873, "learning_rate": 0.00019789108413015963, "loss": 43.6104, "step": 110 }, { "epoch": 0.011009720293592542, "grad_norm": 2.0194878578186035, "learning_rate": 0.00019784884754865765, "loss": 43.4293, "step": 111 }, { "epoch": 0.011108906962904185, "grad_norm": 2.0362696647644043, "learning_rate": 0.00019780619679849552, "loss": 43.3875, "step": 112 }, { "epoch": 0.01120809363221583, "grad_norm": 1.4652763605117798, "learning_rate": 0.00019776313206020275, "loss": 43.5407, "step": 113 }, { "epoch": 0.011307280301527474, "grad_norm": 1.9906383752822876, "learning_rate": 0.00019771965351606113, "loss": 43.4089, "step": 114 }, { "epoch": 0.01140646697083912, "grad_norm": 2.1399691104888916, "learning_rate": 0.000197675761350104, "loss": 43.483, "step": 115 }, { "epoch": 0.011505653640150763, "grad_norm": 1.813093900680542, "learning_rate": 0.00019763145574811547, "loss": 43.415, "step": 116 }, { "epoch": 0.011604840309462409, "grad_norm": 1.8109016418457031, "learning_rate": 0.00019758673689762962, "loss": 43.3056, "step": 117 }, { "epoch": 0.011704026978774052, "grad_norm": 1.8245785236358643, "learning_rate": 0.00019754160498792965, "loss": 43.5708, "step": 118 }, { "epoch": 0.011803213648085698, "grad_norm": 2.116065263748169, "learning_rate": 0.00019749606021004717, "loss": 43.1676, "step": 119 }, { "epoch": 0.011902400317397342, "grad_norm": 1.7063171863555908, "learning_rate": 0.00019745010275676138, "loss": 43.6211, "step": 120 }, { "epoch": 0.012001586986708987, "grad_norm": 1.6588932275772095, "learning_rate": 0.0001974037328225982, "loss": 43.5795, "step": 121 }, { "epoch": 0.01210077365602063, "grad_norm": 1.9446206092834473, "learning_rate": 0.00019735695060382944, "loss": 43.5033, "step": 122 }, { "epoch": 0.012199960325332276, "grad_norm": 2.098048686981201, "learning_rate": 0.00019730975629847201, "loss": 43.609, "step": 123 }, { "epoch": 0.01229914699464392, "grad_norm": 1.849349856376648, "learning_rate": 0.00019726215010628718, "loss": 43.3481, "step": 124 }, { "epoch": 0.012398333663955565, "grad_norm": 1.530873417854309, "learning_rate": 0.0001972141322287795, "loss": 43.4044, "step": 125 }, { "epoch": 0.012497520333267209, "grad_norm": 2.065800189971924, "learning_rate": 0.0001971657028691961, "loss": 43.0564, "step": 126 }, { "epoch": 0.012596707002578854, "grad_norm": 1.8316601514816284, "learning_rate": 0.0001971168622325259, "loss": 43.3583, "step": 127 }, { "epoch": 0.012695893671890498, "grad_norm": 1.7628271579742432, "learning_rate": 0.00019706761052549853, "loss": 43.4499, "step": 128 }, { "epoch": 0.012795080341202143, "grad_norm": 1.9169988632202148, "learning_rate": 0.00019701794795658363, "loss": 43.3065, "step": 129 }, { "epoch": 0.012894267010513787, "grad_norm": 1.784313440322876, "learning_rate": 0.00019696787473598993, "loss": 43.308, "step": 130 }, { "epoch": 0.01299345367982543, "grad_norm": 1.80251944065094, "learning_rate": 0.0001969173910756643, "loss": 43.3693, "step": 131 }, { "epoch": 0.013092640349137076, "grad_norm": 1.9307643175125122, "learning_rate": 0.0001968664971892909, "loss": 43.2518, "step": 132 }, { "epoch": 0.01319182701844872, "grad_norm": 1.8621703386306763, "learning_rate": 0.00019681519329229033, "loss": 43.5505, "step": 133 }, { "epoch": 0.013291013687760365, "grad_norm": 2.358393430709839, "learning_rate": 0.00019676347960181855, "loss": 43.4462, "step": 134 }, { "epoch": 0.013390200357072009, "grad_norm": 1.9287742376327515, "learning_rate": 0.00019671135633676616, "loss": 43.3768, "step": 135 }, { "epoch": 0.013489387026383654, "grad_norm": 1.6573784351348877, "learning_rate": 0.00019665882371775733, "loss": 43.5067, "step": 136 }, { "epoch": 0.013588573695695298, "grad_norm": 1.8315342664718628, "learning_rate": 0.00019660588196714903, "loss": 43.4329, "step": 137 }, { "epoch": 0.013687760365006943, "grad_norm": 1.889752984046936, "learning_rate": 0.00019655253130902974, "loss": 43.4747, "step": 138 }, { "epoch": 0.013786947034318587, "grad_norm": 1.7668240070343018, "learning_rate": 0.00019649877196921896, "loss": 43.4241, "step": 139 }, { "epoch": 0.013886133703630232, "grad_norm": 1.9625318050384521, "learning_rate": 0.000196444604175266, "loss": 43.6246, "step": 140 }, { "epoch": 0.013985320372941876, "grad_norm": 1.970367193222046, "learning_rate": 0.00019639002815644894, "loss": 43.1665, "step": 141 }, { "epoch": 0.014084507042253521, "grad_norm": 1.95570707321167, "learning_rate": 0.00019633504414377388, "loss": 43.4941, "step": 142 }, { "epoch": 0.014183693711565165, "grad_norm": 1.896040678024292, "learning_rate": 0.0001962796523699738, "loss": 43.4991, "step": 143 }, { "epoch": 0.01428288038087681, "grad_norm": 1.9114121198654175, "learning_rate": 0.00019622385306950762, "loss": 43.5409, "step": 144 }, { "epoch": 0.014382067050188454, "grad_norm": 1.6941092014312744, "learning_rate": 0.00019616764647855926, "loss": 43.7337, "step": 145 }, { "epoch": 0.0144812537195001, "grad_norm": 1.8652082681655884, "learning_rate": 0.00019611103283503654, "loss": 43.5431, "step": 146 }, { "epoch": 0.014580440388811743, "grad_norm": 1.7759754657745361, "learning_rate": 0.00019605401237857023, "loss": 43.5254, "step": 147 }, { "epoch": 0.014679627058123389, "grad_norm": 1.5800156593322754, "learning_rate": 0.00019599658535051314, "loss": 43.4581, "step": 148 }, { "epoch": 0.014778813727435032, "grad_norm": 2.1402370929718018, "learning_rate": 0.00019593875199393882, "loss": 43.4564, "step": 149 }, { "epoch": 0.014878000396746678, "grad_norm": 1.8057047128677368, "learning_rate": 0.00019588051255364083, "loss": 43.4389, "step": 150 }, { "epoch": 0.014977187066058321, "grad_norm": 1.5020058155059814, "learning_rate": 0.00019582186727613152, "loss": 43.3091, "step": 151 }, { "epoch": 0.015076373735369967, "grad_norm": 1.6418455839157104, "learning_rate": 0.0001957628164096411, "loss": 43.211, "step": 152 }, { "epoch": 0.01517556040468161, "grad_norm": 1.5354704856872559, "learning_rate": 0.00019570336020411643, "loss": 43.3525, "step": 153 }, { "epoch": 0.015274747073993256, "grad_norm": 1.8784282207489014, "learning_rate": 0.00019564349891122018, "loss": 43.6796, "step": 154 }, { "epoch": 0.0153739337433049, "grad_norm": 1.6533524990081787, "learning_rate": 0.00019558323278432956, "loss": 43.2943, "step": 155 }, { "epoch": 0.015473120412616545, "grad_norm": 1.8979015350341797, "learning_rate": 0.00019552256207853538, "loss": 43.4212, "step": 156 }, { "epoch": 0.015572307081928189, "grad_norm": 1.5906367301940918, "learning_rate": 0.00019546148705064097, "loss": 43.5057, "step": 157 }, { "epoch": 0.015671493751239832, "grad_norm": 1.5876492261886597, "learning_rate": 0.0001954000079591609, "loss": 43.1743, "step": 158 }, { "epoch": 0.015770680420551478, "grad_norm": 1.9419764280319214, "learning_rate": 0.0001953381250643202, "loss": 43.5099, "step": 159 }, { "epoch": 0.015869867089863123, "grad_norm": 1.7575715780258179, "learning_rate": 0.00019527583862805303, "loss": 43.5125, "step": 160 }, { "epoch": 0.01596905375917477, "grad_norm": 1.4455037117004395, "learning_rate": 0.00019521314891400162, "loss": 43.4008, "step": 161 }, { "epoch": 0.01606824042848641, "grad_norm": 1.5164740085601807, "learning_rate": 0.0001951500561875152, "loss": 43.5441, "step": 162 }, { "epoch": 0.016167427097798056, "grad_norm": 1.5752917528152466, "learning_rate": 0.00019508656071564882, "loss": 43.5938, "step": 163 }, { "epoch": 0.0162666137671097, "grad_norm": 1.9830480813980103, "learning_rate": 0.00019502266276716224, "loss": 43.3155, "step": 164 }, { "epoch": 0.016365800436421343, "grad_norm": 1.8618719577789307, "learning_rate": 0.00019495836261251883, "loss": 43.4229, "step": 165 }, { "epoch": 0.01646498710573299, "grad_norm": 1.8885892629623413, "learning_rate": 0.00019489366052388441, "loss": 43.3779, "step": 166 }, { "epoch": 0.016564173775044634, "grad_norm": 1.7652533054351807, "learning_rate": 0.000194828556775126, "loss": 43.2559, "step": 167 }, { "epoch": 0.01666336044435628, "grad_norm": 2.0413341522216797, "learning_rate": 0.00019476305164181087, "loss": 43.5517, "step": 168 }, { "epoch": 0.01676254711366792, "grad_norm": 1.603642463684082, "learning_rate": 0.00019469714540120507, "loss": 43.4541, "step": 169 }, { "epoch": 0.016861733782979567, "grad_norm": 1.7321805953979492, "learning_rate": 0.0001946308383322726, "loss": 43.4268, "step": 170 }, { "epoch": 0.016960920452291212, "grad_norm": 1.5815627574920654, "learning_rate": 0.00019456413071567398, "loss": 43.5278, "step": 171 }, { "epoch": 0.017060107121602858, "grad_norm": 1.7616136074066162, "learning_rate": 0.00019449702283376517, "loss": 43.3663, "step": 172 }, { "epoch": 0.0171592937909145, "grad_norm": 1.7920783758163452, "learning_rate": 0.0001944295149705963, "loss": 43.3133, "step": 173 }, { "epoch": 0.017258480460226145, "grad_norm": 1.9470977783203125, "learning_rate": 0.0001943616074119106, "loss": 43.4104, "step": 174 }, { "epoch": 0.01735766712953779, "grad_norm": 1.9142847061157227, "learning_rate": 0.00019429330044514305, "loss": 43.4963, "step": 175 }, { "epoch": 0.017456853798849436, "grad_norm": 1.6739574670791626, "learning_rate": 0.0001942245943594192, "loss": 43.276, "step": 176 }, { "epoch": 0.017556040468161078, "grad_norm": 1.5089815855026245, "learning_rate": 0.00019415548944555405, "loss": 43.4358, "step": 177 }, { "epoch": 0.017655227137472723, "grad_norm": 1.8579277992248535, "learning_rate": 0.0001940859859960506, "loss": 43.5816, "step": 178 }, { "epoch": 0.01775441380678437, "grad_norm": 2.0343070030212402, "learning_rate": 0.00019401608430509893, "loss": 43.394, "step": 179 }, { "epoch": 0.017853600476096014, "grad_norm": 1.8242613077163696, "learning_rate": 0.0001939457846685746, "loss": 43.5701, "step": 180 }, { "epoch": 0.017952787145407656, "grad_norm": 1.7436537742614746, "learning_rate": 0.00019387508738403768, "loss": 43.4366, "step": 181 }, { "epoch": 0.0180519738147193, "grad_norm": 1.698238492012024, "learning_rate": 0.0001938039927507313, "loss": 43.7248, "step": 182 }, { "epoch": 0.018151160484030947, "grad_norm": 1.576426386833191, "learning_rate": 0.00019373250106958056, "loss": 43.4787, "step": 183 }, { "epoch": 0.018250347153342592, "grad_norm": 1.729797601699829, "learning_rate": 0.0001936606126431911, "loss": 43.2909, "step": 184 }, { "epoch": 0.018349533822654234, "grad_norm": 2.1329703330993652, "learning_rate": 0.00019358832777584785, "loss": 43.5946, "step": 185 }, { "epoch": 0.01844872049196588, "grad_norm": 1.9629263877868652, "learning_rate": 0.00019351564677351385, "loss": 43.2455, "step": 186 }, { "epoch": 0.018547907161277525, "grad_norm": 1.9318324327468872, "learning_rate": 0.00019344256994382878, "loss": 43.665, "step": 187 }, { "epoch": 0.01864709383058917, "grad_norm": 2.088292121887207, "learning_rate": 0.00019336909759610786, "loss": 43.4711, "step": 188 }, { "epoch": 0.018746280499900812, "grad_norm": 1.6517703533172607, "learning_rate": 0.00019329523004134037, "loss": 43.4119, "step": 189 }, { "epoch": 0.018845467169212458, "grad_norm": 1.9081069231033325, "learning_rate": 0.00019322096759218836, "loss": 43.5138, "step": 190 }, { "epoch": 0.018944653838524103, "grad_norm": 1.9650448560714722, "learning_rate": 0.0001931463105629855, "loss": 43.5728, "step": 191 }, { "epoch": 0.01904384050783575, "grad_norm": 1.7469886541366577, "learning_rate": 0.00019307125926973546, "loss": 43.5318, "step": 192 }, { "epoch": 0.01914302717714739, "grad_norm": 1.57616126537323, "learning_rate": 0.00019299581403011082, "loss": 43.4801, "step": 193 }, { "epoch": 0.019242213846459036, "grad_norm": 1.945536494255066, "learning_rate": 0.00019291997516345162, "loss": 43.5905, "step": 194 }, { "epoch": 0.01934140051577068, "grad_norm": 1.567405343055725, "learning_rate": 0.000192843742990764, "loss": 43.5673, "step": 195 }, { "epoch": 0.019440587185082323, "grad_norm": 1.8113877773284912, "learning_rate": 0.0001927671178347189, "loss": 43.5219, "step": 196 }, { "epoch": 0.01953977385439397, "grad_norm": 1.5996432304382324, "learning_rate": 0.00019269010001965055, "loss": 43.593, "step": 197 }, { "epoch": 0.019638960523705614, "grad_norm": 1.77357816696167, "learning_rate": 0.00019261268987155535, "loss": 43.4401, "step": 198 }, { "epoch": 0.01973814719301726, "grad_norm": 1.4410920143127441, "learning_rate": 0.00019253488771809024, "loss": 43.5063, "step": 199 }, { "epoch": 0.0198373338623289, "grad_norm": 1.6010916233062744, "learning_rate": 0.0001924566938885715, "loss": 43.3343, "step": 200 }, { "epoch": 0.019936520531640547, "grad_norm": 1.791623592376709, "learning_rate": 0.00019237810871397315, "loss": 43.4581, "step": 201 }, { "epoch": 0.020035707200952192, "grad_norm": 1.7843762636184692, "learning_rate": 0.0001922991325269258, "loss": 43.5192, "step": 202 }, { "epoch": 0.020134893870263838, "grad_norm": 1.483970046043396, "learning_rate": 0.00019221976566171503, "loss": 43.4632, "step": 203 }, { "epoch": 0.02023408053957548, "grad_norm": 1.9275554418563843, "learning_rate": 0.0001921400084542801, "loss": 43.4883, "step": 204 }, { "epoch": 0.020333267208887125, "grad_norm": 1.6076281070709229, "learning_rate": 0.00019205986124221251, "loss": 43.2708, "step": 205 }, { "epoch": 0.02043245387819877, "grad_norm": 1.8743144273757935, "learning_rate": 0.00019197932436475446, "loss": 43.4004, "step": 206 }, { "epoch": 0.020531640547510416, "grad_norm": 1.8211561441421509, "learning_rate": 0.00019189839816279754, "loss": 43.4402, "step": 207 }, { "epoch": 0.020630827216822058, "grad_norm": 1.6338099241256714, "learning_rate": 0.00019181708297888133, "loss": 43.4574, "step": 208 }, { "epoch": 0.020730013886133703, "grad_norm": 1.7611514329910278, "learning_rate": 0.0001917353791571918, "loss": 43.25, "step": 209 }, { "epoch": 0.02082920055544535, "grad_norm": 1.874194622039795, "learning_rate": 0.0001916532870435598, "loss": 43.4563, "step": 210 }, { "epoch": 0.020928387224756994, "grad_norm": 1.9529900550842285, "learning_rate": 0.00019157080698546, "loss": 43.3493, "step": 211 }, { "epoch": 0.021027573894068636, "grad_norm": 1.6599798202514648, "learning_rate": 0.0001914879393320089, "loss": 43.4581, "step": 212 }, { "epoch": 0.02112676056338028, "grad_norm": 1.828358769416809, "learning_rate": 0.0001914046844339637, "loss": 43.3117, "step": 213 }, { "epoch": 0.021225947232691927, "grad_norm": 1.8962675333023071, "learning_rate": 0.00019132104264372063, "loss": 43.3318, "step": 214 }, { "epoch": 0.021325133902003572, "grad_norm": 1.7085741758346558, "learning_rate": 0.00019123701431531366, "loss": 43.4381, "step": 215 }, { "epoch": 0.021424320571315214, "grad_norm": 1.8335672616958618, "learning_rate": 0.00019115259980441272, "loss": 43.6232, "step": 216 }, { "epoch": 0.02152350724062686, "grad_norm": 1.8107479810714722, "learning_rate": 0.0001910677994683225, "loss": 43.3826, "step": 217 }, { "epoch": 0.021622693909938505, "grad_norm": 1.871582269668579, "learning_rate": 0.00019098261366598066, "loss": 43.4663, "step": 218 }, { "epoch": 0.02172188057925015, "grad_norm": 1.9268748760223389, "learning_rate": 0.00019089704275795648, "loss": 43.2329, "step": 219 }, { "epoch": 0.021821067248561792, "grad_norm": 2.409046173095703, "learning_rate": 0.00019081108710644932, "loss": 43.5765, "step": 220 }, { "epoch": 0.021920253917873438, "grad_norm": 1.7865118980407715, "learning_rate": 0.00019072474707528708, "loss": 43.5461, "step": 221 }, { "epoch": 0.022019440587185083, "grad_norm": 2.009024143218994, "learning_rate": 0.00019063802302992454, "loss": 43.5502, "step": 222 }, { "epoch": 0.02211862725649673, "grad_norm": 1.73333740234375, "learning_rate": 0.00019055091533744202, "loss": 43.4989, "step": 223 }, { "epoch": 0.02221781392580837, "grad_norm": 1.75791335105896, "learning_rate": 0.00019046342436654365, "loss": 43.4315, "step": 224 }, { "epoch": 0.022317000595120016, "grad_norm": 1.5838587284088135, "learning_rate": 0.0001903755504875559, "loss": 43.633, "step": 225 }, { "epoch": 0.02241618726443166, "grad_norm": 1.833753228187561, "learning_rate": 0.00019028729407242597, "loss": 43.3394, "step": 226 }, { "epoch": 0.022515373933743307, "grad_norm": 2.0033679008483887, "learning_rate": 0.0001901986554947203, "loss": 43.3916, "step": 227 }, { "epoch": 0.02261456060305495, "grad_norm": 1.6959342956542969, "learning_rate": 0.00019010963512962288, "loss": 43.4505, "step": 228 }, { "epoch": 0.022713747272366594, "grad_norm": 2.0134894847869873, "learning_rate": 0.00019002023335393364, "loss": 43.5815, "step": 229 }, { "epoch": 0.02281293394167824, "grad_norm": 2.1908352375030518, "learning_rate": 0.00018993045054606705, "loss": 43.6322, "step": 230 }, { "epoch": 0.02291212061098988, "grad_norm": 1.8331201076507568, "learning_rate": 0.0001898402870860503, "loss": 43.3902, "step": 231 }, { "epoch": 0.023011307280301527, "grad_norm": 1.6046873331069946, "learning_rate": 0.0001897497433555218, "loss": 43.5463, "step": 232 }, { "epoch": 0.023110493949613172, "grad_norm": 2.0050089359283447, "learning_rate": 0.0001896588197377296, "loss": 43.6324, "step": 233 }, { "epoch": 0.023209680618924818, "grad_norm": 1.8393956422805786, "learning_rate": 0.0001895675166175296, "loss": 43.5556, "step": 234 }, { "epoch": 0.02330886728823646, "grad_norm": 1.8612674474716187, "learning_rate": 0.0001894758343813842, "loss": 43.4871, "step": 235 }, { "epoch": 0.023408053957548105, "grad_norm": 1.6958578824996948, "learning_rate": 0.0001893837734173603, "loss": 43.4409, "step": 236 }, { "epoch": 0.02350724062685975, "grad_norm": 1.769364595413208, "learning_rate": 0.000189291334115128, "loss": 43.3601, "step": 237 }, { "epoch": 0.023606427296171396, "grad_norm": 1.8303394317626953, "learning_rate": 0.00018919851686595874, "loss": 43.3631, "step": 238 }, { "epoch": 0.023705613965483038, "grad_norm": 1.7114533185958862, "learning_rate": 0.00018910532206272378, "loss": 43.3691, "step": 239 }, { "epoch": 0.023804800634794683, "grad_norm": 1.8776710033416748, "learning_rate": 0.0001890117500998924, "loss": 43.3386, "step": 240 }, { "epoch": 0.02390398730410633, "grad_norm": 1.8478602170944214, "learning_rate": 0.00018891780137353034, "loss": 43.4358, "step": 241 }, { "epoch": 0.024003173973417974, "grad_norm": 2.248779296875, "learning_rate": 0.00018882347628129806, "loss": 43.3187, "step": 242 }, { "epoch": 0.024102360642729616, "grad_norm": 1.8203048706054688, "learning_rate": 0.00018872877522244905, "loss": 43.321, "step": 243 }, { "epoch": 0.02420154731204126, "grad_norm": 1.9116209745407104, "learning_rate": 0.00018863369859782825, "loss": 43.2978, "step": 244 }, { "epoch": 0.024300733981352907, "grad_norm": 2.242579936981201, "learning_rate": 0.00018853824680987018, "loss": 43.51, "step": 245 }, { "epoch": 0.024399920650664552, "grad_norm": 1.6903893947601318, "learning_rate": 0.0001884424202625974, "loss": 43.2704, "step": 246 }, { "epoch": 0.024499107319976194, "grad_norm": 1.7841495275497437, "learning_rate": 0.0001883462193616187, "loss": 43.2848, "step": 247 }, { "epoch": 0.02459829398928784, "grad_norm": 1.6767023801803589, "learning_rate": 0.00018824964451412738, "loss": 43.6522, "step": 248 }, { "epoch": 0.024697480658599485, "grad_norm": 1.715501308441162, "learning_rate": 0.0001881526961288996, "loss": 43.4763, "step": 249 }, { "epoch": 0.02479666732791113, "grad_norm": 1.7172234058380127, "learning_rate": 0.00018805537461629265, "loss": 43.5719, "step": 250 }, { "epoch": 0.024895853997222772, "grad_norm": 1.8724406957626343, "learning_rate": 0.00018795768038824305, "loss": 43.3464, "step": 251 }, { "epoch": 0.024995040666534418, "grad_norm": 1.9240301847457886, "learning_rate": 0.00018785961385826502, "loss": 43.5301, "step": 252 }, { "epoch": 0.025094227335846063, "grad_norm": 1.727128028869629, "learning_rate": 0.00018776117544144863, "loss": 43.2979, "step": 253 }, { "epoch": 0.02519341400515771, "grad_norm": 1.5075703859329224, "learning_rate": 0.00018766236555445805, "loss": 43.5771, "step": 254 }, { "epoch": 0.02529260067446935, "grad_norm": 1.6511516571044922, "learning_rate": 0.00018756318461552973, "loss": 43.5003, "step": 255 }, { "epoch": 0.025391787343780996, "grad_norm": 2.1450982093811035, "learning_rate": 0.00018746363304447073, "loss": 43.2654, "step": 256 }, { "epoch": 0.02549097401309264, "grad_norm": 1.5322790145874023, "learning_rate": 0.0001873637112626569, "loss": 43.648, "step": 257 }, { "epoch": 0.025590160682404287, "grad_norm": 1.7161816358566284, "learning_rate": 0.00018726341969303107, "loss": 43.564, "step": 258 }, { "epoch": 0.02568934735171593, "grad_norm": 1.6408555507659912, "learning_rate": 0.00018716275876010135, "loss": 43.4033, "step": 259 }, { "epoch": 0.025788534021027574, "grad_norm": 2.001344680786133, "learning_rate": 0.0001870617288899392, "loss": 43.1792, "step": 260 }, { "epoch": 0.02588772069033922, "grad_norm": 1.552886724472046, "learning_rate": 0.00018696033051017772, "loss": 43.5722, "step": 261 }, { "epoch": 0.02598690735965086, "grad_norm": 1.9805954694747925, "learning_rate": 0.00018685856405000983, "loss": 43.4879, "step": 262 }, { "epoch": 0.026086094028962507, "grad_norm": 1.8676209449768066, "learning_rate": 0.0001867564299401864, "loss": 43.2659, "step": 263 }, { "epoch": 0.026185280698274152, "grad_norm": 1.423711895942688, "learning_rate": 0.0001866539286130145, "loss": 43.4853, "step": 264 }, { "epoch": 0.026284467367585797, "grad_norm": 1.8016430139541626, "learning_rate": 0.00018655106050235548, "loss": 43.4172, "step": 265 }, { "epoch": 0.02638365403689744, "grad_norm": 2.0731699466705322, "learning_rate": 0.00018644782604362333, "loss": 43.5431, "step": 266 }, { "epoch": 0.026482840706209085, "grad_norm": 2.1776654720306396, "learning_rate": 0.00018634422567378255, "loss": 43.3922, "step": 267 }, { "epoch": 0.02658202737552073, "grad_norm": 1.788361668586731, "learning_rate": 0.00018624025983134644, "loss": 43.3689, "step": 268 }, { "epoch": 0.026681214044832376, "grad_norm": 1.6715213060379028, "learning_rate": 0.00018613592895637537, "loss": 43.2955, "step": 269 }, { "epoch": 0.026780400714144018, "grad_norm": 1.6671379804611206, "learning_rate": 0.0001860312334904747, "loss": 43.4089, "step": 270 }, { "epoch": 0.026879587383455663, "grad_norm": 1.5834424495697021, "learning_rate": 0.00018592617387679306, "loss": 43.5199, "step": 271 }, { "epoch": 0.02697877405276731, "grad_norm": 1.8269447088241577, "learning_rate": 0.00018582075056002042, "loss": 43.6922, "step": 272 }, { "epoch": 0.027077960722078954, "grad_norm": 1.8284502029418945, "learning_rate": 0.00018571496398638612, "loss": 43.4936, "step": 273 }, { "epoch": 0.027177147391390596, "grad_norm": 1.616306185722351, "learning_rate": 0.00018560881460365724, "loss": 43.4763, "step": 274 }, { "epoch": 0.02727633406070224, "grad_norm": 1.6758257150650024, "learning_rate": 0.00018550230286113636, "loss": 43.4248, "step": 275 }, { "epoch": 0.027375520730013887, "grad_norm": 2.0079963207244873, "learning_rate": 0.00018539542920965998, "loss": 43.3084, "step": 276 }, { "epoch": 0.027474707399325532, "grad_norm": 1.9072431325912476, "learning_rate": 0.0001852881941015964, "loss": 43.4282, "step": 277 }, { "epoch": 0.027573894068637174, "grad_norm": 1.9698050022125244, "learning_rate": 0.00018518059799084379, "loss": 43.4791, "step": 278 }, { "epoch": 0.02767308073794882, "grad_norm": 2.197556972503662, "learning_rate": 0.00018507264133282852, "loss": 43.3443, "step": 279 }, { "epoch": 0.027772267407260465, "grad_norm": 1.764831781387329, "learning_rate": 0.00018496432458450294, "loss": 43.4362, "step": 280 }, { "epoch": 0.02787145407657211, "grad_norm": 1.5838619470596313, "learning_rate": 0.00018485564820434363, "loss": 43.4305, "step": 281 }, { "epoch": 0.027970640745883752, "grad_norm": 1.8322888612747192, "learning_rate": 0.00018474661265234935, "loss": 43.4129, "step": 282 }, { "epoch": 0.028069827415195397, "grad_norm": 1.7399368286132812, "learning_rate": 0.00018463721839003915, "loss": 43.5489, "step": 283 }, { "epoch": 0.028169014084507043, "grad_norm": 1.741345763206482, "learning_rate": 0.00018452746588045046, "loss": 43.314, "step": 284 }, { "epoch": 0.02826820075381869, "grad_norm": 1.7756701707839966, "learning_rate": 0.00018441735558813704, "loss": 43.5505, "step": 285 }, { "epoch": 0.02836738742313033, "grad_norm": 1.6505262851715088, "learning_rate": 0.000184306887979167, "loss": 43.508, "step": 286 }, { "epoch": 0.028466574092441976, "grad_norm": 1.7458088397979736, "learning_rate": 0.00018419606352112095, "loss": 43.5299, "step": 287 }, { "epoch": 0.02856576076175362, "grad_norm": 1.8814047574996948, "learning_rate": 0.00018408488268308997, "loss": 43.2876, "step": 288 }, { "epoch": 0.028664947431065266, "grad_norm": 2.1112706661224365, "learning_rate": 0.00018397334593567348, "loss": 43.2426, "step": 289 }, { "epoch": 0.02876413410037691, "grad_norm": 1.7809046506881714, "learning_rate": 0.0001838614537509775, "loss": 43.5884, "step": 290 }, { "epoch": 0.028863320769688554, "grad_norm": 1.6529407501220703, "learning_rate": 0.00018374920660261246, "loss": 43.5984, "step": 291 }, { "epoch": 0.0289625074390002, "grad_norm": 1.7374414205551147, "learning_rate": 0.00018363660496569127, "loss": 43.6149, "step": 292 }, { "epoch": 0.02906169410831184, "grad_norm": 1.9328715801239014, "learning_rate": 0.00018352364931682735, "loss": 43.4508, "step": 293 }, { "epoch": 0.029160880777623487, "grad_norm": 1.8607913255691528, "learning_rate": 0.00018341034013413248, "loss": 43.2958, "step": 294 }, { "epoch": 0.029260067446935132, "grad_norm": 1.653661847114563, "learning_rate": 0.00018329667789721485, "loss": 43.3118, "step": 295 }, { "epoch": 0.029359254116246777, "grad_norm": 1.7331739664077759, "learning_rate": 0.00018318266308717712, "loss": 43.187, "step": 296 }, { "epoch": 0.02945844078555842, "grad_norm": 1.9389817714691162, "learning_rate": 0.00018306829618661423, "loss": 43.3045, "step": 297 }, { "epoch": 0.029557627454870065, "grad_norm": 1.7669439315795898, "learning_rate": 0.00018295357767961144, "loss": 43.6229, "step": 298 }, { "epoch": 0.02965681412418171, "grad_norm": 2.2777292728424072, "learning_rate": 0.00018283850805174233, "loss": 43.3467, "step": 299 }, { "epoch": 0.029756000793493356, "grad_norm": 1.8629502058029175, "learning_rate": 0.00018272308779006664, "loss": 43.4681, "step": 300 }, { "epoch": 0.029855187462804997, "grad_norm": 1.8004388809204102, "learning_rate": 0.00018260731738312818, "loss": 43.5564, "step": 301 }, { "epoch": 0.029954374132116643, "grad_norm": 1.7747710943222046, "learning_rate": 0.0001824911973209529, "loss": 43.48, "step": 302 }, { "epoch": 0.03005356080142829, "grad_norm": 1.8199105262756348, "learning_rate": 0.00018237472809504683, "loss": 43.483, "step": 303 }, { "epoch": 0.030152747470739934, "grad_norm": 1.813331961631775, "learning_rate": 0.00018225791019839375, "loss": 43.3525, "step": 304 }, { "epoch": 0.030251934140051576, "grad_norm": 1.9097211360931396, "learning_rate": 0.0001821407441254534, "loss": 43.174, "step": 305 }, { "epoch": 0.03035112080936322, "grad_norm": 1.5276966094970703, "learning_rate": 0.00018202323037215922, "loss": 43.4794, "step": 306 }, { "epoch": 0.030450307478674866, "grad_norm": 2.1980795860290527, "learning_rate": 0.00018190536943591624, "loss": 43.4615, "step": 307 }, { "epoch": 0.030549494147986512, "grad_norm": 1.925775170326233, "learning_rate": 0.00018178716181559918, "loss": 43.4785, "step": 308 }, { "epoch": 0.030648680817298154, "grad_norm": 1.6581778526306152, "learning_rate": 0.00018166860801155, "loss": 43.5451, "step": 309 }, { "epoch": 0.0307478674866098, "grad_norm": 1.8703173398971558, "learning_rate": 0.00018154970852557603, "loss": 43.261, "step": 310 }, { "epoch": 0.030847054155921445, "grad_norm": 1.6486387252807617, "learning_rate": 0.0001814304638609478, "loss": 43.3558, "step": 311 }, { "epoch": 0.03094624082523309, "grad_norm": 1.5768872499465942, "learning_rate": 0.00018131087452239695, "loss": 43.3209, "step": 312 }, { "epoch": 0.031045427494544732, "grad_norm": 1.8140771389007568, "learning_rate": 0.0001811909410161139, "loss": 43.339, "step": 313 }, { "epoch": 0.031144614163856377, "grad_norm": 1.6479640007019043, "learning_rate": 0.00018107066384974595, "loss": 43.2227, "step": 314 }, { "epoch": 0.031243800833168023, "grad_norm": 1.773018717765808, "learning_rate": 0.00018095004353239498, "loss": 43.2334, "step": 315 }, { "epoch": 0.031342987502479665, "grad_norm": 1.9624122381210327, "learning_rate": 0.0001808290805746153, "loss": 43.313, "step": 316 }, { "epoch": 0.031442174171791314, "grad_norm": 2.0385379791259766, "learning_rate": 0.00018070777548841166, "loss": 43.5272, "step": 317 }, { "epoch": 0.031541360841102956, "grad_norm": 2.0700814723968506, "learning_rate": 0.00018058612878723678, "loss": 43.5878, "step": 318 }, { "epoch": 0.0316405475104146, "grad_norm": 2.1185619831085205, "learning_rate": 0.00018046414098598948, "loss": 43.4353, "step": 319 }, { "epoch": 0.031739734179726246, "grad_norm": 1.8049787282943726, "learning_rate": 0.00018034181260101232, "loss": 43.4017, "step": 320 }, { "epoch": 0.03183892084903789, "grad_norm": 1.8069190979003906, "learning_rate": 0.0001802191441500894, "loss": 43.4134, "step": 321 }, { "epoch": 0.03193810751834954, "grad_norm": 2.0122478008270264, "learning_rate": 0.00018009613615244436, "loss": 43.6144, "step": 322 }, { "epoch": 0.03203729418766118, "grad_norm": 1.7980411052703857, "learning_rate": 0.00017997278912873794, "loss": 43.1723, "step": 323 }, { "epoch": 0.03213648085697282, "grad_norm": 1.7612178325653076, "learning_rate": 0.00017984910360106598, "loss": 43.2345, "step": 324 }, { "epoch": 0.03223566752628447, "grad_norm": 1.687091588973999, "learning_rate": 0.000179725080092957, "loss": 43.402, "step": 325 }, { "epoch": 0.03233485419559611, "grad_norm": 1.8118903636932373, "learning_rate": 0.0001796007191293702, "loss": 43.273, "step": 326 }, { "epoch": 0.032434040864907754, "grad_norm": 1.7755016088485718, "learning_rate": 0.00017947602123669315, "loss": 43.5131, "step": 327 }, { "epoch": 0.0325332275342194, "grad_norm": 1.5700405836105347, "learning_rate": 0.0001793509869427395, "loss": 43.4376, "step": 328 }, { "epoch": 0.032632414203531045, "grad_norm": 1.8542143106460571, "learning_rate": 0.0001792256167767468, "loss": 43.5953, "step": 329 }, { "epoch": 0.03273160087284269, "grad_norm": 1.9271124601364136, "learning_rate": 0.00017909991126937428, "loss": 43.145, "step": 330 }, { "epoch": 0.032830787542154335, "grad_norm": 1.8772542476654053, "learning_rate": 0.00017897387095270058, "loss": 43.5586, "step": 331 }, { "epoch": 0.03292997421146598, "grad_norm": 1.8798960447311401, "learning_rate": 0.0001788474963602215, "loss": 43.1815, "step": 332 }, { "epoch": 0.033029160880777626, "grad_norm": 1.892846941947937, "learning_rate": 0.00017872078802684777, "loss": 43.5654, "step": 333 }, { "epoch": 0.03312834755008927, "grad_norm": 1.6253376007080078, "learning_rate": 0.0001785937464889027, "loss": 43.4119, "step": 334 }, { "epoch": 0.03322753421940091, "grad_norm": 2.1778671741485596, "learning_rate": 0.00017846637228411996, "loss": 43.5123, "step": 335 }, { "epoch": 0.03332672088871256, "grad_norm": 1.8277655839920044, "learning_rate": 0.00017833866595164146, "loss": 43.2783, "step": 336 }, { "epoch": 0.0334259075580242, "grad_norm": 1.8617613315582275, "learning_rate": 0.0001782106280320147, "loss": 43.5128, "step": 337 }, { "epoch": 0.03352509422733584, "grad_norm": 1.5705748796463013, "learning_rate": 0.00017808225906719086, "loss": 43.3098, "step": 338 }, { "epoch": 0.03362428089664749, "grad_norm": 2.355485200881958, "learning_rate": 0.00017795355960052226, "loss": 43.0172, "step": 339 }, { "epoch": 0.033723467565959134, "grad_norm": 1.827103853225708, "learning_rate": 0.00017782453017676025, "loss": 43.3646, "step": 340 }, { "epoch": 0.03382265423527078, "grad_norm": 1.751451015472412, "learning_rate": 0.0001776951713420527, "loss": 43.2285, "step": 341 }, { "epoch": 0.033921840904582425, "grad_norm": 1.675229549407959, "learning_rate": 0.00017756548364394184, "loss": 43.2931, "step": 342 }, { "epoch": 0.034021027573894067, "grad_norm": 1.8893396854400635, "learning_rate": 0.00017743546763136187, "loss": 43.5569, "step": 343 }, { "epoch": 0.034120214243205715, "grad_norm": 2.1921472549438477, "learning_rate": 0.0001773051238546367, "loss": 42.8586, "step": 344 }, { "epoch": 0.03421940091251736, "grad_norm": 1.8088001012802124, "learning_rate": 0.0001771744528654775, "loss": 43.4254, "step": 345 }, { "epoch": 0.034318587581829, "grad_norm": 1.800023078918457, "learning_rate": 0.00017704345521698058, "loss": 43.4496, "step": 346 }, { "epoch": 0.03441777425114065, "grad_norm": 2.0886213779449463, "learning_rate": 0.0001769121314636248, "loss": 43.5807, "step": 347 }, { "epoch": 0.03451696092045229, "grad_norm": 1.9248942136764526, "learning_rate": 0.00017678048216126932, "loss": 43.5729, "step": 348 }, { "epoch": 0.03461614758976394, "grad_norm": 1.967187523841858, "learning_rate": 0.00017664850786715136, "loss": 43.4213, "step": 349 }, { "epoch": 0.03471533425907558, "grad_norm": 1.8709222078323364, "learning_rate": 0.00017651620913988368, "loss": 43.5444, "step": 350 }, { "epoch": 0.03481452092838722, "grad_norm": 1.9087177515029907, "learning_rate": 0.0001763835865394523, "loss": 43.629, "step": 351 }, { "epoch": 0.03491370759769887, "grad_norm": 1.8189947605133057, "learning_rate": 0.00017625064062721415, "loss": 43.2782, "step": 352 }, { "epoch": 0.035012894267010514, "grad_norm": 1.8703895807266235, "learning_rate": 0.00017611737196589455, "loss": 43.4687, "step": 353 }, { "epoch": 0.035112080936322156, "grad_norm": 1.653400182723999, "learning_rate": 0.00017598378111958503, "loss": 43.5273, "step": 354 }, { "epoch": 0.035211267605633804, "grad_norm": 1.8059368133544922, "learning_rate": 0.00017584986865374082, "loss": 43.346, "step": 355 }, { "epoch": 0.035310454274945446, "grad_norm": 2.0954408645629883, "learning_rate": 0.00017571563513517845, "loss": 43.4398, "step": 356 }, { "epoch": 0.035409640944257095, "grad_norm": 1.9335533380508423, "learning_rate": 0.00017558108113207343, "loss": 43.4087, "step": 357 }, { "epoch": 0.03550882761356874, "grad_norm": 1.8835747241973877, "learning_rate": 0.00017544620721395777, "loss": 43.3362, "step": 358 }, { "epoch": 0.03560801428288038, "grad_norm": 1.5072838068008423, "learning_rate": 0.0001753110139517176, "loss": 43.444, "step": 359 }, { "epoch": 0.03570720095219203, "grad_norm": 2.321190357208252, "learning_rate": 0.0001751755019175907, "loss": 43.4326, "step": 360 }, { "epoch": 0.03580638762150367, "grad_norm": 2.016446590423584, "learning_rate": 0.00017503967168516426, "loss": 43.5701, "step": 361 }, { "epoch": 0.03590557429081531, "grad_norm": 2.0664498805999756, "learning_rate": 0.0001749035238293722, "loss": 43.1563, "step": 362 }, { "epoch": 0.03600476096012696, "grad_norm": 1.9161981344223022, "learning_rate": 0.0001747670589264929, "loss": 43.5632, "step": 363 }, { "epoch": 0.0361039476294386, "grad_norm": 1.9115240573883057, "learning_rate": 0.0001746302775541467, "loss": 43.487, "step": 364 }, { "epoch": 0.036203134298750245, "grad_norm": 1.7145079374313354, "learning_rate": 0.00017449318029129352, "loss": 43.454, "step": 365 }, { "epoch": 0.036302320968061894, "grad_norm": 1.7576640844345093, "learning_rate": 0.0001743557677182303, "loss": 43.5317, "step": 366 }, { "epoch": 0.036401507637373536, "grad_norm": 2.1920742988586426, "learning_rate": 0.00017421804041658863, "loss": 43.0603, "step": 367 }, { "epoch": 0.036500694306685184, "grad_norm": 1.7613914012908936, "learning_rate": 0.00017407999896933233, "loss": 43.3305, "step": 368 }, { "epoch": 0.036599880975996826, "grad_norm": 1.883078932762146, "learning_rate": 0.00017394164396075478, "loss": 43.5902, "step": 369 }, { "epoch": 0.03669906764530847, "grad_norm": 1.9879181385040283, "learning_rate": 0.00017380297597647667, "loss": 42.8947, "step": 370 }, { "epoch": 0.03679825431462012, "grad_norm": 1.855925440788269, "learning_rate": 0.00017366399560344347, "loss": 43.3136, "step": 371 }, { "epoch": 0.03689744098393176, "grad_norm": 1.8231003284454346, "learning_rate": 0.00017352470342992286, "loss": 43.5144, "step": 372 }, { "epoch": 0.0369966276532434, "grad_norm": 1.7878133058547974, "learning_rate": 0.00017338510004550223, "loss": 43.3447, "step": 373 }, { "epoch": 0.03709581432255505, "grad_norm": 2.1518242359161377, "learning_rate": 0.00017324518604108638, "loss": 43.3633, "step": 374 }, { "epoch": 0.03719500099186669, "grad_norm": 1.8321210145950317, "learning_rate": 0.00017310496200889473, "loss": 43.5334, "step": 375 }, { "epoch": 0.03729418766117834, "grad_norm": 1.7566676139831543, "learning_rate": 0.00017296442854245915, "loss": 43.3849, "step": 376 }, { "epoch": 0.03739337433048998, "grad_norm": 1.915132761001587, "learning_rate": 0.00017282358623662108, "loss": 43.2322, "step": 377 }, { "epoch": 0.037492560999801625, "grad_norm": 1.7967201471328735, "learning_rate": 0.0001726824356875293, "loss": 43.2558, "step": 378 }, { "epoch": 0.037591747669113273, "grad_norm": 1.9357024431228638, "learning_rate": 0.00017254097749263734, "loss": 43.4631, "step": 379 }, { "epoch": 0.037690934338424915, "grad_norm": 1.8330596685409546, "learning_rate": 0.00017239921225070077, "loss": 43.348, "step": 380 }, { "epoch": 0.03779012100773656, "grad_norm": 1.65016508102417, "learning_rate": 0.00017225714056177496, "loss": 43.4593, "step": 381 }, { "epoch": 0.037889307677048206, "grad_norm": 2.196983814239502, "learning_rate": 0.0001721147630272123, "loss": 43.5154, "step": 382 }, { "epoch": 0.03798849434635985, "grad_norm": 1.9112484455108643, "learning_rate": 0.00017197208024965978, "loss": 43.1733, "step": 383 }, { "epoch": 0.0380876810156715, "grad_norm": 2.278130292892456, "learning_rate": 0.0001718290928330564, "loss": 43.5212, "step": 384 }, { "epoch": 0.03818686768498314, "grad_norm": 1.8458198308944702, "learning_rate": 0.00017168580138263062, "loss": 43.248, "step": 385 }, { "epoch": 0.03818686768498314, "eval_loss": 10.842378616333008, "eval_runtime": 11.2646, "eval_samples_per_second": 376.845, "eval_steps_per_second": 188.467, "step": 385 }, { "epoch": 0.03828605435429478, "grad_norm": 1.8958542346954346, "learning_rate": 0.00017154220650489777, "loss": 43.3558, "step": 386 }, { "epoch": 0.03838524102360643, "grad_norm": 2.030120611190796, "learning_rate": 0.00017139830880765752, "loss": 43.5224, "step": 387 }, { "epoch": 0.03848442769291807, "grad_norm": 1.7606405019760132, "learning_rate": 0.00017125410889999134, "loss": 43.4718, "step": 388 }, { "epoch": 0.038583614362229714, "grad_norm": 2.0355708599090576, "learning_rate": 0.00017110960739225982, "loss": 43.3295, "step": 389 }, { "epoch": 0.03868280103154136, "grad_norm": 2.0534448623657227, "learning_rate": 0.0001709648048961002, "loss": 43.3462, "step": 390 }, { "epoch": 0.038781987700853005, "grad_norm": 2.1878533363342285, "learning_rate": 0.00017081970202442362, "loss": 43.2808, "step": 391 }, { "epoch": 0.038881174370164646, "grad_norm": 1.7620257139205933, "learning_rate": 0.0001706742993914128, "loss": 43.4059, "step": 392 }, { "epoch": 0.038980361039476295, "grad_norm": 1.7489093542099, "learning_rate": 0.00017052859761251913, "loss": 43.3036, "step": 393 }, { "epoch": 0.03907954770878794, "grad_norm": 1.9009250402450562, "learning_rate": 0.0001703825973044602, "loss": 43.2956, "step": 394 }, { "epoch": 0.039178734378099586, "grad_norm": 2.1137168407440186, "learning_rate": 0.00017023629908521735, "loss": 43.6373, "step": 395 }, { "epoch": 0.03927792104741123, "grad_norm": 1.9356465339660645, "learning_rate": 0.0001700897035740327, "loss": 43.3979, "step": 396 }, { "epoch": 0.03937710771672287, "grad_norm": 1.8591188192367554, "learning_rate": 0.00016994281139140688, "loss": 43.3016, "step": 397 }, { "epoch": 0.03947629438603452, "grad_norm": 2.07243013381958, "learning_rate": 0.0001697956231590962, "loss": 43.2778, "step": 398 }, { "epoch": 0.03957548105534616, "grad_norm": 1.8727376461029053, "learning_rate": 0.00016964813950011008, "loss": 43.4077, "step": 399 }, { "epoch": 0.0396746677246578, "grad_norm": 2.3649816513061523, "learning_rate": 0.0001695003610387084, "loss": 43.0328, "step": 400 }, { "epoch": 0.03977385439396945, "grad_norm": 1.8893752098083496, "learning_rate": 0.0001693522884003988, "loss": 43.3928, "step": 401 }, { "epoch": 0.039873041063281094, "grad_norm": 1.9837206602096558, "learning_rate": 0.00016920392221193422, "loss": 43.412, "step": 402 }, { "epoch": 0.03997222773259274, "grad_norm": 1.9072617292404175, "learning_rate": 0.00016905526310130999, "loss": 43.3748, "step": 403 }, { "epoch": 0.040071414401904384, "grad_norm": 2.164289951324463, "learning_rate": 0.00016890631169776143, "loss": 43.1653, "step": 404 }, { "epoch": 0.040170601071216026, "grad_norm": 2.2234630584716797, "learning_rate": 0.00016875706863176098, "loss": 43.2566, "step": 405 }, { "epoch": 0.040269787740527675, "grad_norm": 2.283557176589966, "learning_rate": 0.0001686075345350156, "loss": 43.3958, "step": 406 }, { "epoch": 0.04036897440983932, "grad_norm": 2.7405457496643066, "learning_rate": 0.00016845771004046415, "loss": 43.3162, "step": 407 }, { "epoch": 0.04046816107915096, "grad_norm": 1.8976521492004395, "learning_rate": 0.00016830759578227462, "loss": 43.251, "step": 408 }, { "epoch": 0.04056734774846261, "grad_norm": 1.896527647972107, "learning_rate": 0.0001681571923958416, "loss": 43.422, "step": 409 }, { "epoch": 0.04066653441777425, "grad_norm": 2.491332530975342, "learning_rate": 0.00016800650051778334, "loss": 43.1891, "step": 410 }, { "epoch": 0.0407657210870859, "grad_norm": 1.6540522575378418, "learning_rate": 0.00016785552078593931, "loss": 43.494, "step": 411 }, { "epoch": 0.04086490775639754, "grad_norm": 2.042348861694336, "learning_rate": 0.00016770425383936735, "loss": 43.3853, "step": 412 }, { "epoch": 0.04096409442570918, "grad_norm": 1.9173510074615479, "learning_rate": 0.00016755270031834098, "loss": 43.2695, "step": 413 }, { "epoch": 0.04106328109502083, "grad_norm": 1.91815185546875, "learning_rate": 0.0001674008608643468, "loss": 43.6696, "step": 414 }, { "epoch": 0.041162467764332474, "grad_norm": 1.9971388578414917, "learning_rate": 0.00016724873612008155, "loss": 43.3496, "step": 415 }, { "epoch": 0.041261654433644115, "grad_norm": 1.8367887735366821, "learning_rate": 0.00016709632672944966, "loss": 43.4582, "step": 416 }, { "epoch": 0.041360841102955764, "grad_norm": 2.22133207321167, "learning_rate": 0.00016694363333756034, "loss": 43.4528, "step": 417 }, { "epoch": 0.041460027772267406, "grad_norm": 1.8544198274612427, "learning_rate": 0.00016679065659072487, "loss": 43.4174, "step": 418 }, { "epoch": 0.041559214441579055, "grad_norm": 1.8612130880355835, "learning_rate": 0.00016663739713645398, "loss": 43.4778, "step": 419 }, { "epoch": 0.0416584011108907, "grad_norm": 1.8601411581039429, "learning_rate": 0.00016648385562345497, "loss": 43.3575, "step": 420 }, { "epoch": 0.04175758778020234, "grad_norm": 1.86284339427948, "learning_rate": 0.00016633003270162902, "loss": 43.2158, "step": 421 }, { "epoch": 0.04185677444951399, "grad_norm": 1.8202400207519531, "learning_rate": 0.0001661759290220685, "loss": 43.4971, "step": 422 }, { "epoch": 0.04195596111882563, "grad_norm": 2.007105827331543, "learning_rate": 0.00016602154523705402, "loss": 43.4312, "step": 423 }, { "epoch": 0.04205514778813727, "grad_norm": 2.0013821125030518, "learning_rate": 0.00016586688200005193, "loss": 43.476, "step": 424 }, { "epoch": 0.04215433445744892, "grad_norm": 1.9608008861541748, "learning_rate": 0.00016571193996571144, "loss": 43.3971, "step": 425 }, { "epoch": 0.04225352112676056, "grad_norm": 2.2429168224334717, "learning_rate": 0.00016555671978986172, "loss": 43.2615, "step": 426 }, { "epoch": 0.042352707796072205, "grad_norm": 1.900251865386963, "learning_rate": 0.00016540122212950934, "loss": 43.3754, "step": 427 }, { "epoch": 0.04245189446538385, "grad_norm": 1.846684455871582, "learning_rate": 0.0001652454476428353, "loss": 43.388, "step": 428 }, { "epoch": 0.042551081134695495, "grad_norm": 1.7164525985717773, "learning_rate": 0.00016508939698919246, "loss": 43.626, "step": 429 }, { "epoch": 0.042650267804007144, "grad_norm": 1.8913099765777588, "learning_rate": 0.00016493307082910249, "loss": 43.2921, "step": 430 }, { "epoch": 0.042749454473318786, "grad_norm": 2.015695333480835, "learning_rate": 0.0001647764698242533, "loss": 43.2211, "step": 431 }, { "epoch": 0.04284864114263043, "grad_norm": 1.9346154928207397, "learning_rate": 0.00016461959463749616, "loss": 43.5128, "step": 432 }, { "epoch": 0.04294782781194208, "grad_norm": 1.866926908493042, "learning_rate": 0.00016446244593284277, "loss": 43.3834, "step": 433 }, { "epoch": 0.04304701448125372, "grad_norm": 2.2372262477874756, "learning_rate": 0.0001643050243754626, "loss": 43.7052, "step": 434 }, { "epoch": 0.04314620115056536, "grad_norm": 1.9076186418533325, "learning_rate": 0.00016414733063168014, "loss": 43.4141, "step": 435 }, { "epoch": 0.04324538781987701, "grad_norm": 1.7319496870040894, "learning_rate": 0.00016398936536897183, "loss": 43.52, "step": 436 }, { "epoch": 0.04334457448918865, "grad_norm": 2.4877874851226807, "learning_rate": 0.00016383112925596343, "loss": 43.2557, "step": 437 }, { "epoch": 0.0434437611585003, "grad_norm": 2.4825005531311035, "learning_rate": 0.00016367262296242724, "loss": 43.1532, "step": 438 }, { "epoch": 0.04354294782781194, "grad_norm": 1.7283111810684204, "learning_rate": 0.00016351384715927898, "loss": 43.5862, "step": 439 }, { "epoch": 0.043642134497123584, "grad_norm": 2.457141637802124, "learning_rate": 0.00016335480251857527, "loss": 43.2074, "step": 440 }, { "epoch": 0.04374132116643523, "grad_norm": 1.844632625579834, "learning_rate": 0.0001631954897135106, "loss": 43.1014, "step": 441 }, { "epoch": 0.043840507835746875, "grad_norm": 1.9801989793777466, "learning_rate": 0.00016303590941841458, "loss": 43.4654, "step": 442 }, { "epoch": 0.04393969450505852, "grad_norm": 2.2188639640808105, "learning_rate": 0.0001628760623087489, "loss": 42.9202, "step": 443 }, { "epoch": 0.044038881174370166, "grad_norm": 2.018659830093384, "learning_rate": 0.00016271594906110476, "loss": 43.1481, "step": 444 }, { "epoch": 0.04413806784368181, "grad_norm": 2.0580058097839355, "learning_rate": 0.0001625555703531998, "loss": 43.4648, "step": 445 }, { "epoch": 0.04423725451299346, "grad_norm": 2.1084511280059814, "learning_rate": 0.00016239492686387529, "loss": 43.4597, "step": 446 }, { "epoch": 0.0443364411823051, "grad_norm": 1.8640772104263306, "learning_rate": 0.00016223401927309316, "loss": 43.3068, "step": 447 }, { "epoch": 0.04443562785161674, "grad_norm": 1.7706291675567627, "learning_rate": 0.00016207284826193335, "loss": 43.6387, "step": 448 }, { "epoch": 0.04453481452092839, "grad_norm": 1.9456933736801147, "learning_rate": 0.00016191141451259072, "loss": 43.3623, "step": 449 }, { "epoch": 0.04463400119024003, "grad_norm": 2.3101038932800293, "learning_rate": 0.0001617497187083722, "loss": 43.4343, "step": 450 }, { "epoch": 0.044733187859551674, "grad_norm": 1.798179268836975, "learning_rate": 0.00016158776153369402, "loss": 43.4195, "step": 451 }, { "epoch": 0.04483237452886332, "grad_norm": 2.23087739944458, "learning_rate": 0.00016142554367407872, "loss": 43.5208, "step": 452 }, { "epoch": 0.044931561198174964, "grad_norm": 2.0814554691314697, "learning_rate": 0.0001612630658161521, "loss": 43.356, "step": 453 }, { "epoch": 0.04503074786748661, "grad_norm": 2.3725922107696533, "learning_rate": 0.0001611003286476406, "loss": 43.4122, "step": 454 }, { "epoch": 0.045129934536798255, "grad_norm": 2.25369930267334, "learning_rate": 0.00016093733285736826, "loss": 43.477, "step": 455 }, { "epoch": 0.0452291212061099, "grad_norm": 2.368757963180542, "learning_rate": 0.0001607740791352538, "loss": 43.1755, "step": 456 }, { "epoch": 0.045328307875421546, "grad_norm": 2.162149429321289, "learning_rate": 0.00016061056817230754, "loss": 43.3276, "step": 457 }, { "epoch": 0.04542749454473319, "grad_norm": 2.0595765113830566, "learning_rate": 0.00016044680066062885, "loss": 43.3539, "step": 458 }, { "epoch": 0.04552668121404483, "grad_norm": 2.3723464012145996, "learning_rate": 0.00016028277729340288, "loss": 43.1594, "step": 459 }, { "epoch": 0.04562586788335648, "grad_norm": 1.9992680549621582, "learning_rate": 0.00016011849876489776, "loss": 43.4977, "step": 460 }, { "epoch": 0.04572505455266812, "grad_norm": 2.250610113143921, "learning_rate": 0.0001599539657704617, "loss": 43.4759, "step": 461 }, { "epoch": 0.04582424122197976, "grad_norm": 2.0322961807250977, "learning_rate": 0.00015978917900652, "loss": 42.9417, "step": 462 }, { "epoch": 0.04592342789129141, "grad_norm": 1.8161320686340332, "learning_rate": 0.000159624139170572, "loss": 43.4887, "step": 463 }, { "epoch": 0.04602261456060305, "grad_norm": 2.228445529937744, "learning_rate": 0.00015945884696118832, "loss": 43.0499, "step": 464 }, { "epoch": 0.0461218012299147, "grad_norm": 1.801005482673645, "learning_rate": 0.00015929330307800775, "loss": 43.6632, "step": 465 }, { "epoch": 0.046220987899226344, "grad_norm": 1.9599015712738037, "learning_rate": 0.00015912750822173445, "loss": 43.6235, "step": 466 }, { "epoch": 0.046320174568537986, "grad_norm": 1.8597904443740845, "learning_rate": 0.00015896146309413481, "loss": 43.3612, "step": 467 }, { "epoch": 0.046419361237849635, "grad_norm": 1.9823458194732666, "learning_rate": 0.00015879516839803457, "loss": 43.416, "step": 468 }, { "epoch": 0.04651854790716128, "grad_norm": 2.408478260040283, "learning_rate": 0.00015862862483731574, "loss": 42.8046, "step": 469 }, { "epoch": 0.04661773457647292, "grad_norm": 1.8994836807250977, "learning_rate": 0.0001584618331169139, "loss": 43.161, "step": 470 }, { "epoch": 0.04671692124578457, "grad_norm": 2.195263624191284, "learning_rate": 0.00015829479394281485, "loss": 43.4513, "step": 471 }, { "epoch": 0.04681610791509621, "grad_norm": 1.9655941724777222, "learning_rate": 0.00015812750802205187, "loss": 43.2207, "step": 472 }, { "epoch": 0.04691529458440786, "grad_norm": 2.0135717391967773, "learning_rate": 0.00015795997606270266, "loss": 43.4908, "step": 473 }, { "epoch": 0.0470144812537195, "grad_norm": 2.319117784500122, "learning_rate": 0.00015779219877388634, "loss": 43.0259, "step": 474 }, { "epoch": 0.04711366792303114, "grad_norm": 2.104710817337036, "learning_rate": 0.00015762417686576038, "loss": 43.3605, "step": 475 }, { "epoch": 0.04721285459234279, "grad_norm": 1.9044405221939087, "learning_rate": 0.00015745591104951778, "loss": 43.5428, "step": 476 }, { "epoch": 0.04731204126165443, "grad_norm": 2.1451425552368164, "learning_rate": 0.00015728740203738379, "loss": 43.2096, "step": 477 }, { "epoch": 0.047411227930966075, "grad_norm": 1.968700647354126, "learning_rate": 0.0001571186505426132, "loss": 43.3241, "step": 478 }, { "epoch": 0.047510414600277724, "grad_norm": 1.9085205793380737, "learning_rate": 0.000156949657279487, "loss": 43.2277, "step": 479 }, { "epoch": 0.047609601269589366, "grad_norm": 2.0941076278686523, "learning_rate": 0.0001567804229633097, "loss": 43.2023, "step": 480 }, { "epoch": 0.047708787938901015, "grad_norm": 1.997017502784729, "learning_rate": 0.00015661094831040598, "loss": 43.3526, "step": 481 }, { "epoch": 0.04780797460821266, "grad_norm": 2.093484878540039, "learning_rate": 0.00015644123403811793, "loss": 43.6757, "step": 482 }, { "epoch": 0.0479071612775243, "grad_norm": 2.5758907794952393, "learning_rate": 0.00015627128086480175, "loss": 43.4697, "step": 483 }, { "epoch": 0.04800634794683595, "grad_norm": 1.76503586769104, "learning_rate": 0.00015610108950982494, "loss": 43.4233, "step": 484 }, { "epoch": 0.04810553461614759, "grad_norm": 1.970333456993103, "learning_rate": 0.0001559306606935631, "loss": 43.2243, "step": 485 }, { "epoch": 0.04820472128545923, "grad_norm": 2.3156676292419434, "learning_rate": 0.000155759995137397, "loss": 43.59, "step": 486 }, { "epoch": 0.04830390795477088, "grad_norm": 1.784239411354065, "learning_rate": 0.00015558909356370944, "loss": 43.227, "step": 487 }, { "epoch": 0.04840309462408252, "grad_norm": 1.5904680490493774, "learning_rate": 0.00015541795669588222, "loss": 43.3363, "step": 488 }, { "epoch": 0.048502281293394164, "grad_norm": 2.0892555713653564, "learning_rate": 0.00015524658525829308, "loss": 43.3758, "step": 489 }, { "epoch": 0.04860146796270581, "grad_norm": 2.2841103076934814, "learning_rate": 0.00015507497997631266, "loss": 43.393, "step": 490 }, { "epoch": 0.048700654632017455, "grad_norm": 2.4132015705108643, "learning_rate": 0.00015490314157630137, "loss": 43.4775, "step": 491 }, { "epoch": 0.048799841301329104, "grad_norm": 2.2090461254119873, "learning_rate": 0.00015473107078560632, "loss": 43.127, "step": 492 }, { "epoch": 0.048899027970640746, "grad_norm": 1.940687656402588, "learning_rate": 0.0001545587683325583, "loss": 43.4704, "step": 493 }, { "epoch": 0.04899821463995239, "grad_norm": 1.9189916849136353, "learning_rate": 0.00015438623494646873, "loss": 43.2119, "step": 494 }, { "epoch": 0.04909740130926404, "grad_norm": 2.2180159091949463, "learning_rate": 0.00015421347135762643, "loss": 43.0975, "step": 495 }, { "epoch": 0.04919658797857568, "grad_norm": 2.2772672176361084, "learning_rate": 0.00015404047829729457, "loss": 43.3432, "step": 496 }, { "epoch": 0.04929577464788732, "grad_norm": 2.2242465019226074, "learning_rate": 0.00015386725649770778, "loss": 43.3511, "step": 497 }, { "epoch": 0.04939496131719897, "grad_norm": 2.005730390548706, "learning_rate": 0.00015369380669206866, "loss": 43.1514, "step": 498 }, { "epoch": 0.04949414798651061, "grad_norm": 2.255737066268921, "learning_rate": 0.00015352012961454507, "loss": 43.6627, "step": 499 }, { "epoch": 0.04959333465582226, "grad_norm": 2.042703866958618, "learning_rate": 0.0001533462260002668, "loss": 43.4096, "step": 500 }, { "epoch": 0.0496925213251339, "grad_norm": 2.3833088874816895, "learning_rate": 0.0001531720965853225, "loss": 43.3581, "step": 501 }, { "epoch": 0.049791707994445544, "grad_norm": 2.1803300380706787, "learning_rate": 0.00015299774210675657, "loss": 43.4317, "step": 502 }, { "epoch": 0.04989089466375719, "grad_norm": 2.095017194747925, "learning_rate": 0.0001528231633025661, "loss": 43.1832, "step": 503 }, { "epoch": 0.049990081333068835, "grad_norm": 1.6770892143249512, "learning_rate": 0.0001526483609116976, "loss": 43.5793, "step": 504 }, { "epoch": 0.05008926800238048, "grad_norm": 2.198793888092041, "learning_rate": 0.00015247333567404406, "loss": 43.0588, "step": 505 }, { "epoch": 0.050188454671692126, "grad_norm": 2.13785457611084, "learning_rate": 0.00015229808833044163, "loss": 43.1178, "step": 506 }, { "epoch": 0.05028764134100377, "grad_norm": 1.7616395950317383, "learning_rate": 0.00015212261962266668, "loss": 43.4594, "step": 507 }, { "epoch": 0.05038682801031542, "grad_norm": 2.443803548812866, "learning_rate": 0.00015194693029343248, "loss": 42.957, "step": 508 }, { "epoch": 0.05048601467962706, "grad_norm": 2.114178419113159, "learning_rate": 0.00015177102108638612, "loss": 43.1909, "step": 509 }, { "epoch": 0.0505852013489387, "grad_norm": 2.003145456314087, "learning_rate": 0.00015159489274610546, "loss": 43.4466, "step": 510 }, { "epoch": 0.05068438801825035, "grad_norm": 1.715397596359253, "learning_rate": 0.00015141854601809581, "loss": 43.58, "step": 511 }, { "epoch": 0.05078357468756199, "grad_norm": 2.002887487411499, "learning_rate": 0.0001512419816487869, "loss": 43.2627, "step": 512 }, { "epoch": 0.05088276135687363, "grad_norm": 2.1487252712249756, "learning_rate": 0.00015106520038552968, "loss": 43.2213, "step": 513 }, { "epoch": 0.05098194802618528, "grad_norm": 1.9472956657409668, "learning_rate": 0.00015088820297659314, "loss": 43.2843, "step": 514 }, { "epoch": 0.051081134695496924, "grad_norm": 2.400360107421875, "learning_rate": 0.00015071099017116118, "loss": 43.3905, "step": 515 }, { "epoch": 0.05118032136480857, "grad_norm": 2.016178846359253, "learning_rate": 0.00015053356271932936, "loss": 43.2827, "step": 516 }, { "epoch": 0.051279508034120215, "grad_norm": 2.2779204845428467, "learning_rate": 0.00015035592137210187, "loss": 43.1734, "step": 517 }, { "epoch": 0.05137869470343186, "grad_norm": 2.500525712966919, "learning_rate": 0.00015017806688138817, "loss": 43.3598, "step": 518 }, { "epoch": 0.051477881372743506, "grad_norm": 2.2447829246520996, "learning_rate": 0.00015000000000000001, "loss": 43.4888, "step": 519 }, { "epoch": 0.05157706804205515, "grad_norm": 1.8975471258163452, "learning_rate": 0.00014982172148164804, "loss": 43.3574, "step": 520 }, { "epoch": 0.05167625471136679, "grad_norm": 2.2666919231414795, "learning_rate": 0.0001496432320809387, "loss": 43.3658, "step": 521 }, { "epoch": 0.05177544138067844, "grad_norm": 2.0550897121429443, "learning_rate": 0.00014946453255337117, "loss": 43.0961, "step": 522 }, { "epoch": 0.05187462804999008, "grad_norm": 1.9846802949905396, "learning_rate": 0.00014928562365533392, "loss": 43.287, "step": 523 }, { "epoch": 0.05197381471930172, "grad_norm": 2.3645148277282715, "learning_rate": 0.00014910650614410165, "loss": 43.5701, "step": 524 }, { "epoch": 0.05207300138861337, "grad_norm": 1.7542608976364136, "learning_rate": 0.00014892718077783214, "loss": 43.3929, "step": 525 }, { "epoch": 0.05217218805792501, "grad_norm": 2.0839455127716064, "learning_rate": 0.00014874764831556285, "loss": 43.303, "step": 526 }, { "epoch": 0.05227137472723666, "grad_norm": 2.229410409927368, "learning_rate": 0.00014856790951720797, "loss": 43.1675, "step": 527 }, { "epoch": 0.052370561396548304, "grad_norm": 1.6923551559448242, "learning_rate": 0.00014838796514355483, "loss": 43.351, "step": 528 }, { "epoch": 0.052469748065859946, "grad_norm": 2.2549707889556885, "learning_rate": 0.00014820781595626116, "loss": 43.0883, "step": 529 }, { "epoch": 0.052568934735171595, "grad_norm": 2.37475848197937, "learning_rate": 0.00014802746271785149, "loss": 42.8019, "step": 530 }, { "epoch": 0.05266812140448324, "grad_norm": 2.2322258949279785, "learning_rate": 0.00014784690619171401, "loss": 43.466, "step": 531 }, { "epoch": 0.05276730807379488, "grad_norm": 2.2950708866119385, "learning_rate": 0.0001476661471420975, "loss": 43.3774, "step": 532 }, { "epoch": 0.05286649474310653, "grad_norm": 1.6149096488952637, "learning_rate": 0.00014748518633410775, "loss": 43.3975, "step": 533 }, { "epoch": 0.05296568141241817, "grad_norm": 1.895948052406311, "learning_rate": 0.00014730402453370483, "loss": 43.3509, "step": 534 }, { "epoch": 0.05306486808172982, "grad_norm": 2.144165515899658, "learning_rate": 0.0001471226625076993, "loss": 43.1804, "step": 535 }, { "epoch": 0.05316405475104146, "grad_norm": 1.9626020193099976, "learning_rate": 0.0001469411010237494, "loss": 43.0332, "step": 536 }, { "epoch": 0.0532632414203531, "grad_norm": 1.8684499263763428, "learning_rate": 0.0001467593408503575, "loss": 43.4276, "step": 537 }, { "epoch": 0.05336242808966475, "grad_norm": 1.958593726158142, "learning_rate": 0.0001465773827568671, "loss": 43.142, "step": 538 }, { "epoch": 0.05346161475897639, "grad_norm": 1.9472311735153198, "learning_rate": 0.00014639522751345928, "loss": 43.4936, "step": 539 }, { "epoch": 0.053560801428288035, "grad_norm": 2.015083074569702, "learning_rate": 0.0001462128758911498, "loss": 43.0938, "step": 540 }, { "epoch": 0.053659988097599684, "grad_norm": 2.009272336959839, "learning_rate": 0.00014603032866178538, "loss": 43.4618, "step": 541 }, { "epoch": 0.053759174766911326, "grad_norm": 2.0587611198425293, "learning_rate": 0.00014584758659804098, "loss": 43.437, "step": 542 }, { "epoch": 0.053858361436222975, "grad_norm": 2.556206226348877, "learning_rate": 0.0001456646504734161, "loss": 43.3641, "step": 543 }, { "epoch": 0.05395754810553462, "grad_norm": 2.36332368850708, "learning_rate": 0.00014548152106223157, "loss": 43.3912, "step": 544 }, { "epoch": 0.05405673477484626, "grad_norm": 2.1485421657562256, "learning_rate": 0.00014529819913962653, "loss": 43.4191, "step": 545 }, { "epoch": 0.05415592144415791, "grad_norm": 2.6901614665985107, "learning_rate": 0.00014511468548155483, "loss": 43.3381, "step": 546 }, { "epoch": 0.05425510811346955, "grad_norm": 1.9959933757781982, "learning_rate": 0.00014493098086478196, "loss": 43.5722, "step": 547 }, { "epoch": 0.05435429478278119, "grad_norm": 2.293649911880493, "learning_rate": 0.00014474708606688165, "loss": 43.1551, "step": 548 }, { "epoch": 0.05445348145209284, "grad_norm": 2.286330461502075, "learning_rate": 0.00014456300186623268, "loss": 43.071, "step": 549 }, { "epoch": 0.05455266812140448, "grad_norm": 1.834014892578125, "learning_rate": 0.00014437872904201542, "loss": 43.4938, "step": 550 }, { "epoch": 0.05465185479071613, "grad_norm": 2.5310239791870117, "learning_rate": 0.00014419426837420873, "loss": 43.2099, "step": 551 }, { "epoch": 0.05475104146002777, "grad_norm": 1.9483962059020996, "learning_rate": 0.00014400962064358653, "loss": 43.4101, "step": 552 }, { "epoch": 0.054850228129339415, "grad_norm": 2.441307544708252, "learning_rate": 0.0001438247866317145, "loss": 43.0891, "step": 553 }, { "epoch": 0.054949414798651064, "grad_norm": 2.156123161315918, "learning_rate": 0.00014363976712094677, "loss": 43.1826, "step": 554 }, { "epoch": 0.055048601467962706, "grad_norm": 2.034651517868042, "learning_rate": 0.00014345456289442275, "loss": 43.3789, "step": 555 }, { "epoch": 0.05514778813727435, "grad_norm": 2.029855728149414, "learning_rate": 0.00014326917473606366, "loss": 43.3386, "step": 556 }, { "epoch": 0.055246974806586, "grad_norm": 2.700876474380493, "learning_rate": 0.00014308360343056922, "loss": 43.0103, "step": 557 }, { "epoch": 0.05534616147589764, "grad_norm": 1.9576337337493896, "learning_rate": 0.00014289784976341438, "loss": 43.4213, "step": 558 }, { "epoch": 0.05544534814520928, "grad_norm": 2.172898530960083, "learning_rate": 0.00014271191452084597, "loss": 43.4874, "step": 559 }, { "epoch": 0.05554453481452093, "grad_norm": 2.1174685955047607, "learning_rate": 0.00014252579848987942, "loss": 43.4704, "step": 560 }, { "epoch": 0.05564372148383257, "grad_norm": 1.9230237007141113, "learning_rate": 0.00014233950245829533, "loss": 43.1421, "step": 561 }, { "epoch": 0.05574290815314422, "grad_norm": 1.8260465860366821, "learning_rate": 0.00014215302721463623, "loss": 43.4519, "step": 562 }, { "epoch": 0.05584209482245586, "grad_norm": 2.038991689682007, "learning_rate": 0.00014196637354820326, "loss": 43.562, "step": 563 }, { "epoch": 0.055941281491767504, "grad_norm": 2.3954482078552246, "learning_rate": 0.00014177954224905268, "loss": 43.3555, "step": 564 }, { "epoch": 0.05604046816107915, "grad_norm": 1.9379189014434814, "learning_rate": 0.0001415925341079927, "loss": 43.5975, "step": 565 }, { "epoch": 0.056139654830390795, "grad_norm": 2.488739013671875, "learning_rate": 0.00014140534991658, "loss": 43.1201, "step": 566 }, { "epoch": 0.05623884149970244, "grad_norm": 2.2356808185577393, "learning_rate": 0.00014121799046711643, "loss": 43.5215, "step": 567 }, { "epoch": 0.056338028169014086, "grad_norm": 1.7546778917312622, "learning_rate": 0.00014103045655264576, "loss": 43.2874, "step": 568 }, { "epoch": 0.05643721483832573, "grad_norm": 1.9305354356765747, "learning_rate": 0.00014084274896695005, "loss": 43.2634, "step": 569 }, { "epoch": 0.05653640150763738, "grad_norm": 1.9542958736419678, "learning_rate": 0.00014065486850454672, "loss": 43.6221, "step": 570 }, { "epoch": 0.05663558817694902, "grad_norm": 1.7128233909606934, "learning_rate": 0.00014046681596068466, "loss": 43.5441, "step": 571 }, { "epoch": 0.05673477484626066, "grad_norm": 2.316441535949707, "learning_rate": 0.00014027859213134133, "loss": 43.2042, "step": 572 }, { "epoch": 0.05683396151557231, "grad_norm": 1.82426118850708, "learning_rate": 0.0001400901978132191, "loss": 43.2997, "step": 573 }, { "epoch": 0.05693314818488395, "grad_norm": 2.148385763168335, "learning_rate": 0.00013990163380374194, "loss": 42.986, "step": 574 }, { "epoch": 0.05703233485419559, "grad_norm": 2.191030740737915, "learning_rate": 0.00013971290090105222, "loss": 43.2331, "step": 575 }, { "epoch": 0.05713152152350724, "grad_norm": 1.8650566339492798, "learning_rate": 0.0001395239999040071, "loss": 43.5216, "step": 576 }, { "epoch": 0.057230708192818884, "grad_norm": 2.0447537899017334, "learning_rate": 0.00013933493161217523, "loss": 43.2801, "step": 577 }, { "epoch": 0.05732989486213053, "grad_norm": 1.9151122570037842, "learning_rate": 0.0001391456968258334, "loss": 43.3431, "step": 578 }, { "epoch": 0.057429081531442175, "grad_norm": 2.1118319034576416, "learning_rate": 0.0001389562963459631, "loss": 43.4472, "step": 579 }, { "epoch": 0.05752826820075382, "grad_norm": 1.779803991317749, "learning_rate": 0.0001387667309742472, "loss": 43.4929, "step": 580 }, { "epoch": 0.057627454870065466, "grad_norm": 1.8886220455169678, "learning_rate": 0.00013857700151306653, "loss": 43.1377, "step": 581 }, { "epoch": 0.05772664153937711, "grad_norm": 1.8922873735427856, "learning_rate": 0.00013838710876549635, "loss": 43.2163, "step": 582 }, { "epoch": 0.05782582820868875, "grad_norm": 1.9414300918579102, "learning_rate": 0.0001381970535353032, "loss": 43.3846, "step": 583 }, { "epoch": 0.0579250148780004, "grad_norm": 2.14066219329834, "learning_rate": 0.00013800683662694134, "loss": 43.3012, "step": 584 }, { "epoch": 0.05802420154731204, "grad_norm": 1.9403343200683594, "learning_rate": 0.00013781645884554933, "loss": 43.4795, "step": 585 }, { "epoch": 0.05812338821662368, "grad_norm": 1.89680814743042, "learning_rate": 0.00013762592099694665, "loss": 43.3806, "step": 586 }, { "epoch": 0.05822257488593533, "grad_norm": 1.8024598360061646, "learning_rate": 0.00013743522388763037, "loss": 43.3833, "step": 587 }, { "epoch": 0.05832176155524697, "grad_norm": 2.3497109413146973, "learning_rate": 0.00013724436832477163, "loss": 43.1991, "step": 588 }, { "epoch": 0.05842094822455862, "grad_norm": 2.401840925216675, "learning_rate": 0.00013705335511621228, "loss": 43.2245, "step": 589 }, { "epoch": 0.058520134893870264, "grad_norm": 1.982479214668274, "learning_rate": 0.0001368621850704614, "loss": 43.3168, "step": 590 }, { "epoch": 0.058619321563181906, "grad_norm": 2.0087342262268066, "learning_rate": 0.00013667085899669198, "loss": 43.4179, "step": 591 }, { "epoch": 0.058718508232493555, "grad_norm": 1.6955896615982056, "learning_rate": 0.00013647937770473737, "loss": 43.0989, "step": 592 }, { "epoch": 0.0588176949018052, "grad_norm": 1.6221680641174316, "learning_rate": 0.00013628774200508798, "loss": 43.2575, "step": 593 }, { "epoch": 0.05891688157111684, "grad_norm": 1.8718053102493286, "learning_rate": 0.00013609595270888778, "loss": 43.6714, "step": 594 }, { "epoch": 0.05901606824042849, "grad_norm": 2.2167725563049316, "learning_rate": 0.00013590401062793083, "loss": 43.2747, "step": 595 }, { "epoch": 0.05911525490974013, "grad_norm": 2.1741771697998047, "learning_rate": 0.00013571191657465792, "loss": 43.102, "step": 596 }, { "epoch": 0.05921444157905178, "grad_norm": 2.149712085723877, "learning_rate": 0.00013551967136215306, "loss": 43.4909, "step": 597 }, { "epoch": 0.05931362824836342, "grad_norm": 1.884291648864746, "learning_rate": 0.0001353272758041402, "loss": 43.4386, "step": 598 }, { "epoch": 0.05941281491767506, "grad_norm": 1.8479968309402466, "learning_rate": 0.00013513473071497946, "loss": 43.519, "step": 599 }, { "epoch": 0.05951200158698671, "grad_norm": 1.8414955139160156, "learning_rate": 0.0001349420369096641, "loss": 43.242, "step": 600 }, { "epoch": 0.05961118825629835, "grad_norm": 2.4204518795013428, "learning_rate": 0.00013474919520381671, "loss": 43.6366, "step": 601 }, { "epoch": 0.059710374925609995, "grad_norm": 2.0407822132110596, "learning_rate": 0.000134556206413686, "loss": 43.3893, "step": 602 }, { "epoch": 0.059809561594921644, "grad_norm": 2.002960205078125, "learning_rate": 0.00013436307135614314, "loss": 43.3842, "step": 603 }, { "epoch": 0.059908748264233286, "grad_norm": 2.607391119003296, "learning_rate": 0.00013416979084867852, "loss": 43.3548, "step": 604 }, { "epoch": 0.060007934933544935, "grad_norm": 1.9031591415405273, "learning_rate": 0.0001339763657093981, "loss": 43.1324, "step": 605 }, { "epoch": 0.06010712160285658, "grad_norm": 2.341731548309326, "learning_rate": 0.00013378279675702005, "loss": 43.2245, "step": 606 }, { "epoch": 0.06020630827216822, "grad_norm": 2.246828556060791, "learning_rate": 0.00013358908481087134, "loss": 43.5009, "step": 607 }, { "epoch": 0.06030549494147987, "grad_norm": 1.6529605388641357, "learning_rate": 0.00013339523069088409, "loss": 43.3581, "step": 608 }, { "epoch": 0.06040468161079151, "grad_norm": 2.4459779262542725, "learning_rate": 0.00013320123521759226, "loss": 43.4186, "step": 609 }, { "epoch": 0.06050386828010315, "grad_norm": 1.9739339351654053, "learning_rate": 0.0001330070992121281, "loss": 43.0586, "step": 610 }, { "epoch": 0.0606030549494148, "grad_norm": 1.8968048095703125, "learning_rate": 0.00013281282349621867, "loss": 43.3721, "step": 611 }, { "epoch": 0.06070224161872644, "grad_norm": 1.8437169790267944, "learning_rate": 0.00013261840889218252, "loss": 43.38, "step": 612 }, { "epoch": 0.06080142828803809, "grad_norm": 2.460237503051758, "learning_rate": 0.00013242385622292592, "loss": 43.2404, "step": 613 }, { "epoch": 0.06090061495734973, "grad_norm": 2.0136892795562744, "learning_rate": 0.00013222916631193957, "loss": 43.1507, "step": 614 }, { "epoch": 0.060999801626661375, "grad_norm": 2.1666438579559326, "learning_rate": 0.00013203433998329518, "loss": 43.1412, "step": 615 }, { "epoch": 0.061098988295973024, "grad_norm": 1.6473026275634766, "learning_rate": 0.00013183937806164172, "loss": 43.3906, "step": 616 }, { "epoch": 0.061198174965284666, "grad_norm": 2.250753402709961, "learning_rate": 0.00013164428137220222, "loss": 42.9498, "step": 617 }, { "epoch": 0.06129736163459631, "grad_norm": 2.1864540576934814, "learning_rate": 0.00013144905074077007, "loss": 43.255, "step": 618 }, { "epoch": 0.06139654830390796, "grad_norm": 2.072943687438965, "learning_rate": 0.00013125368699370567, "loss": 43.7479, "step": 619 }, { "epoch": 0.0614957349732196, "grad_norm": 2.1312782764434814, "learning_rate": 0.0001310581909579328, "loss": 43.3928, "step": 620 }, { "epoch": 0.06159492164253124, "grad_norm": 2.123581886291504, "learning_rate": 0.0001308625634609352, "loss": 43.5654, "step": 621 }, { "epoch": 0.06169410831184289, "grad_norm": 2.5420310497283936, "learning_rate": 0.0001306668053307531, "loss": 43.1436, "step": 622 }, { "epoch": 0.06179329498115453, "grad_norm": 1.928264856338501, "learning_rate": 0.0001304709173959796, "loss": 43.3955, "step": 623 }, { "epoch": 0.06189248165046618, "grad_norm": 2.8915982246398926, "learning_rate": 0.00013027490048575722, "loss": 43.2937, "step": 624 }, { "epoch": 0.06199166831977782, "grad_norm": 2.04233455657959, "learning_rate": 0.00013007875542977448, "loss": 43.1464, "step": 625 }, { "epoch": 0.062090854989089464, "grad_norm": 1.9762567281723022, "learning_rate": 0.00012988248305826226, "loss": 43.4393, "step": 626 }, { "epoch": 0.06219004165840111, "grad_norm": 2.2382287979125977, "learning_rate": 0.00012968608420199033, "loss": 43.2089, "step": 627 }, { "epoch": 0.062289228327712755, "grad_norm": 2.701388359069824, "learning_rate": 0.00012948955969226383, "loss": 43.7195, "step": 628 }, { "epoch": 0.0623884149970244, "grad_norm": 1.8112866878509521, "learning_rate": 0.00012929291036091982, "loss": 43.338, "step": 629 }, { "epoch": 0.062487601666336046, "grad_norm": 1.7273180484771729, "learning_rate": 0.00012909613704032357, "loss": 43.4329, "step": 630 }, { "epoch": 0.0625867883356477, "grad_norm": 1.7907323837280273, "learning_rate": 0.00012889924056336532, "loss": 43.5965, "step": 631 }, { "epoch": 0.06268597500495933, "grad_norm": 2.156737804412842, "learning_rate": 0.00012870222176345646, "loss": 43.4857, "step": 632 }, { "epoch": 0.06278516167427098, "grad_norm": 1.9107551574707031, "learning_rate": 0.0001285050814745262, "loss": 43.2368, "step": 633 }, { "epoch": 0.06288434834358263, "grad_norm": 1.8818268775939941, "learning_rate": 0.00012830782053101805, "loss": 43.2146, "step": 634 }, { "epoch": 0.06298353501289426, "grad_norm": 1.9194517135620117, "learning_rate": 0.0001281104397678861, "loss": 43.508, "step": 635 }, { "epoch": 0.06308272168220591, "grad_norm": 1.8047842979431152, "learning_rate": 0.0001279129400205917, "loss": 43.1821, "step": 636 }, { "epoch": 0.06318190835151756, "grad_norm": 2.30544376373291, "learning_rate": 0.00012771532212509974, "loss": 43.1541, "step": 637 }, { "epoch": 0.0632810950208292, "grad_norm": 2.176767349243164, "learning_rate": 0.00012751758691787525, "loss": 43.4023, "step": 638 }, { "epoch": 0.06338028169014084, "grad_norm": 1.8000246286392212, "learning_rate": 0.00012731973523587983, "loss": 43.2803, "step": 639 }, { "epoch": 0.06347946835945249, "grad_norm": 1.8897356986999512, "learning_rate": 0.00012712176791656807, "loss": 42.9886, "step": 640 }, { "epoch": 0.06357865502876413, "grad_norm": 2.1935887336730957, "learning_rate": 0.00012692368579788403, "loss": 43.3729, "step": 641 }, { "epoch": 0.06367784169807578, "grad_norm": 1.8704290390014648, "learning_rate": 0.00012672548971825762, "loss": 43.4844, "step": 642 }, { "epoch": 0.06377702836738743, "grad_norm": 1.7146517038345337, "learning_rate": 0.0001265271805166012, "loss": 43.5231, "step": 643 }, { "epoch": 0.06387621503669907, "grad_norm": 2.1074321269989014, "learning_rate": 0.000126328759032306, "loss": 43.4481, "step": 644 }, { "epoch": 0.06397540170601071, "grad_norm": 1.8534600734710693, "learning_rate": 0.00012613022610523828, "loss": 43.1294, "step": 645 }, { "epoch": 0.06407458837532236, "grad_norm": 1.9485807418823242, "learning_rate": 0.0001259315825757362, "loss": 43.1661, "step": 646 }, { "epoch": 0.06417377504463401, "grad_norm": 1.8090543746948242, "learning_rate": 0.0001257328292846061, "loss": 43.3416, "step": 647 }, { "epoch": 0.06427296171394564, "grad_norm": 2.271904230117798, "learning_rate": 0.0001255339670731188, "loss": 43.398, "step": 648 }, { "epoch": 0.06437214838325729, "grad_norm": 2.0795629024505615, "learning_rate": 0.00012533499678300618, "loss": 43.2094, "step": 649 }, { "epoch": 0.06447133505256894, "grad_norm": 1.8797928094863892, "learning_rate": 0.00012513591925645762, "loss": 43.5146, "step": 650 }, { "epoch": 0.06457052172188057, "grad_norm": 1.82566237449646, "learning_rate": 0.00012493673533611638, "loss": 43.1696, "step": 651 }, { "epoch": 0.06466970839119222, "grad_norm": 2.1695291996002197, "learning_rate": 0.00012473744586507604, "loss": 43.1521, "step": 652 }, { "epoch": 0.06476889506050387, "grad_norm": 2.276831865310669, "learning_rate": 0.000124538051686877, "loss": 43.4115, "step": 653 }, { "epoch": 0.06486808172981551, "grad_norm": 1.9476512670516968, "learning_rate": 0.00012433855364550283, "loss": 43.3368, "step": 654 }, { "epoch": 0.06496726839912716, "grad_norm": 1.870421051979065, "learning_rate": 0.00012413895258537675, "loss": 43.4044, "step": 655 }, { "epoch": 0.0650664550684388, "grad_norm": 1.8717193603515625, "learning_rate": 0.00012393924935135794, "loss": 43.3015, "step": 656 }, { "epoch": 0.06516564173775044, "grad_norm": 2.0823211669921875, "learning_rate": 0.0001237394447887382, "loss": 43.428, "step": 657 }, { "epoch": 0.06526482840706209, "grad_norm": 2.260535955429077, "learning_rate": 0.00012353953974323807, "loss": 43.4058, "step": 658 }, { "epoch": 0.06536401507637374, "grad_norm": 2.070770263671875, "learning_rate": 0.0001233395350610035, "loss": 43.2602, "step": 659 }, { "epoch": 0.06546320174568537, "grad_norm": 2.0625433921813965, "learning_rate": 0.0001231394315886022, "loss": 43.2021, "step": 660 }, { "epoch": 0.06556238841499702, "grad_norm": 1.7805284261703491, "learning_rate": 0.00012293923017302002, "loss": 43.057, "step": 661 }, { "epoch": 0.06566157508430867, "grad_norm": 2.2269046306610107, "learning_rate": 0.0001227389316616573, "loss": 43.4289, "step": 662 }, { "epoch": 0.06576076175362032, "grad_norm": 2.0715208053588867, "learning_rate": 0.00012253853690232544, "loss": 43.1889, "step": 663 }, { "epoch": 0.06585994842293195, "grad_norm": 2.3068435192108154, "learning_rate": 0.0001223380467432432, "loss": 43.3712, "step": 664 }, { "epoch": 0.0659591350922436, "grad_norm": 2.1551735401153564, "learning_rate": 0.00012213746203303317, "loss": 43.2689, "step": 665 }, { "epoch": 0.06605832176155525, "grad_norm": 2.1703572273254395, "learning_rate": 0.0001219367836207181, "loss": 43.2267, "step": 666 }, { "epoch": 0.06615750843086689, "grad_norm": 1.8056743144989014, "learning_rate": 0.00012173601235571742, "loss": 43.3811, "step": 667 }, { "epoch": 0.06625669510017854, "grad_norm": 2.023479700088501, "learning_rate": 0.00012153514908784352, "loss": 43.6131, "step": 668 }, { "epoch": 0.06635588176949019, "grad_norm": 2.173466682434082, "learning_rate": 0.00012133419466729827, "loss": 43.4799, "step": 669 }, { "epoch": 0.06645506843880182, "grad_norm": 2.105743646621704, "learning_rate": 0.0001211331499446693, "loss": 43.1905, "step": 670 }, { "epoch": 0.06655425510811347, "grad_norm": 2.2968595027923584, "learning_rate": 0.00012093201577092657, "loss": 43.4302, "step": 671 }, { "epoch": 0.06665344177742512, "grad_norm": 2.0486233234405518, "learning_rate": 0.00012073079299741854, "loss": 43.6473, "step": 672 }, { "epoch": 0.06675262844673675, "grad_norm": 2.3592488765716553, "learning_rate": 0.00012052948247586873, "loss": 43.1191, "step": 673 }, { "epoch": 0.0668518151160484, "grad_norm": 2.2749383449554443, "learning_rate": 0.00012032808505837215, "loss": 43.1717, "step": 674 }, { "epoch": 0.06695100178536005, "grad_norm": 2.0912585258483887, "learning_rate": 0.00012012660159739153, "loss": 43.4302, "step": 675 }, { "epoch": 0.06705018845467169, "grad_norm": 2.0188992023468018, "learning_rate": 0.00011992503294575383, "loss": 43.5247, "step": 676 }, { "epoch": 0.06714937512398333, "grad_norm": 2.094830274581909, "learning_rate": 0.0001197233799566466, "loss": 43.2191, "step": 677 }, { "epoch": 0.06724856179329498, "grad_norm": 2.1457154750823975, "learning_rate": 0.00011952164348361438, "loss": 43.0548, "step": 678 }, { "epoch": 0.06734774846260663, "grad_norm": 2.069021463394165, "learning_rate": 0.00011931982438055505, "loss": 43.0663, "step": 679 }, { "epoch": 0.06744693513191827, "grad_norm": 2.2444372177124023, "learning_rate": 0.00011911792350171623, "loss": 43.4136, "step": 680 }, { "epoch": 0.06754612180122992, "grad_norm": 2.3468430042266846, "learning_rate": 0.00011891594170169173, "loss": 43.4566, "step": 681 }, { "epoch": 0.06764530847054157, "grad_norm": 1.7173528671264648, "learning_rate": 0.00011871387983541789, "loss": 43.3776, "step": 682 }, { "epoch": 0.0677444951398532, "grad_norm": 1.9458478689193726, "learning_rate": 0.00011851173875816981, "loss": 43.5081, "step": 683 }, { "epoch": 0.06784368180916485, "grad_norm": 2.420630931854248, "learning_rate": 0.00011830951932555803, "loss": 42.9701, "step": 684 }, { "epoch": 0.0679428684784765, "grad_norm": 2.5840578079223633, "learning_rate": 0.00011810722239352467, "loss": 43.4166, "step": 685 }, { "epoch": 0.06804205514778813, "grad_norm": 2.482377767562866, "learning_rate": 0.00011790484881833992, "loss": 43.1931, "step": 686 }, { "epoch": 0.06814124181709978, "grad_norm": 1.876880407333374, "learning_rate": 0.00011770239945659828, "loss": 43.4322, "step": 687 }, { "epoch": 0.06824042848641143, "grad_norm": 2.0752649307250977, "learning_rate": 0.00011749987516521523, "loss": 43.608, "step": 688 }, { "epoch": 0.06833961515572307, "grad_norm": 2.184887409210205, "learning_rate": 0.00011729727680142324, "loss": 43.3287, "step": 689 }, { "epoch": 0.06843880182503471, "grad_norm": 2.0941390991210938, "learning_rate": 0.00011709460522276836, "loss": 43.2035, "step": 690 }, { "epoch": 0.06853798849434636, "grad_norm": 2.109752893447876, "learning_rate": 0.00011689186128710654, "loss": 43.2281, "step": 691 }, { "epoch": 0.068637175163658, "grad_norm": 1.6978241205215454, "learning_rate": 0.00011668904585259999, "loss": 43.3744, "step": 692 }, { "epoch": 0.06873636183296965, "grad_norm": 2.1997804641723633, "learning_rate": 0.0001164861597777136, "loss": 43.2087, "step": 693 }, { "epoch": 0.0688355485022813, "grad_norm": 2.0489675998687744, "learning_rate": 0.00011628320392121117, "loss": 43.5293, "step": 694 }, { "epoch": 0.06893473517159293, "grad_norm": 1.9380823373794556, "learning_rate": 0.00011608017914215199, "loss": 43.4076, "step": 695 }, { "epoch": 0.06903392184090458, "grad_norm": 2.074038028717041, "learning_rate": 0.00011587708629988698, "loss": 43.4672, "step": 696 }, { "epoch": 0.06913310851021623, "grad_norm": 2.015775680541992, "learning_rate": 0.0001156739262540552, "loss": 43.2192, "step": 697 }, { "epoch": 0.06923229517952788, "grad_norm": 2.0507521629333496, "learning_rate": 0.00011547069986458014, "loss": 43.2227, "step": 698 }, { "epoch": 0.06933148184883951, "grad_norm": 2.078310966491699, "learning_rate": 0.00011526740799166613, "loss": 43.2058, "step": 699 }, { "epoch": 0.06943066851815116, "grad_norm": 1.9881670475006104, "learning_rate": 0.00011506405149579468, "loss": 43.3019, "step": 700 }, { "epoch": 0.06952985518746281, "grad_norm": 2.05478572845459, "learning_rate": 0.00011486063123772077, "loss": 43.1937, "step": 701 }, { "epoch": 0.06962904185677445, "grad_norm": 2.0337398052215576, "learning_rate": 0.00011465714807846939, "loss": 43.3345, "step": 702 }, { "epoch": 0.0697282285260861, "grad_norm": 1.8894999027252197, "learning_rate": 0.00011445360287933165, "loss": 43.4023, "step": 703 }, { "epoch": 0.06982741519539774, "grad_norm": 2.0361080169677734, "learning_rate": 0.00011424999650186132, "loss": 43.2203, "step": 704 }, { "epoch": 0.06992660186470938, "grad_norm": 1.8185749053955078, "learning_rate": 0.00011404632980787112, "loss": 43.3893, "step": 705 }, { "epoch": 0.07002578853402103, "grad_norm": 2.1560885906219482, "learning_rate": 0.00011384260365942904, "loss": 43.5766, "step": 706 }, { "epoch": 0.07012497520333268, "grad_norm": 2.2028756141662598, "learning_rate": 0.00011363881891885478, "loss": 43.0853, "step": 707 }, { "epoch": 0.07022416187264431, "grad_norm": 1.9392995834350586, "learning_rate": 0.00011343497644871599, "loss": 43.1831, "step": 708 }, { "epoch": 0.07032334854195596, "grad_norm": 2.51483416557312, "learning_rate": 0.00011323107711182473, "loss": 43.3001, "step": 709 }, { "epoch": 0.07042253521126761, "grad_norm": 1.958832859992981, "learning_rate": 0.0001130271217712337, "loss": 43.3144, "step": 710 }, { "epoch": 0.07052172188057924, "grad_norm": 2.116614580154419, "learning_rate": 0.00011282311129023266, "loss": 43.4639, "step": 711 }, { "epoch": 0.07062090854989089, "grad_norm": 2.3148276805877686, "learning_rate": 0.00011261904653234485, "loss": 43.2808, "step": 712 }, { "epoch": 0.07072009521920254, "grad_norm": 1.8139839172363281, "learning_rate": 0.00011241492836132317, "loss": 42.8443, "step": 713 }, { "epoch": 0.07081928188851419, "grad_norm": 2.3591361045837402, "learning_rate": 0.00011221075764114657, "loss": 43.1324, "step": 714 }, { "epoch": 0.07091846855782583, "grad_norm": 2.1484439373016357, "learning_rate": 0.00011200653523601652, "loss": 43.4277, "step": 715 }, { "epoch": 0.07101765522713747, "grad_norm": 1.7410988807678223, "learning_rate": 0.0001118022620103532, "loss": 43.5825, "step": 716 }, { "epoch": 0.07111684189644912, "grad_norm": 2.1806561946868896, "learning_rate": 0.00011159793882879192, "loss": 43.2687, "step": 717 }, { "epoch": 0.07121602856576076, "grad_norm": 1.9958840608596802, "learning_rate": 0.00011139356655617945, "loss": 43.7212, "step": 718 }, { "epoch": 0.07131521523507241, "grad_norm": 1.9662500619888306, "learning_rate": 0.00011118914605757033, "loss": 43.4024, "step": 719 }, { "epoch": 0.07141440190438406, "grad_norm": 2.0258219242095947, "learning_rate": 0.0001109846781982232, "loss": 43.4606, "step": 720 }, { "epoch": 0.07151358857369569, "grad_norm": 2.297673463821411, "learning_rate": 0.00011078016384359724, "loss": 43.0811, "step": 721 }, { "epoch": 0.07161277524300734, "grad_norm": 1.6901992559432983, "learning_rate": 0.00011057560385934838, "loss": 43.4504, "step": 722 }, { "epoch": 0.07171196191231899, "grad_norm": 2.023513078689575, "learning_rate": 0.00011037099911132568, "loss": 43.4859, "step": 723 }, { "epoch": 0.07181114858163062, "grad_norm": 2.355264902114868, "learning_rate": 0.00011016635046556772, "loss": 43.2088, "step": 724 }, { "epoch": 0.07191033525094227, "grad_norm": 2.1904029846191406, "learning_rate": 0.00010996165878829886, "loss": 43.3399, "step": 725 }, { "epoch": 0.07200952192025392, "grad_norm": 2.091703414916992, "learning_rate": 0.00010975692494592555, "loss": 43.2976, "step": 726 }, { "epoch": 0.07210870858956556, "grad_norm": 2.0189108848571777, "learning_rate": 0.00010955214980503284, "loss": 43.2727, "step": 727 }, { "epoch": 0.0722078952588772, "grad_norm": 2.10141658782959, "learning_rate": 0.00010934733423238048, "loss": 43.3245, "step": 728 }, { "epoch": 0.07230708192818885, "grad_norm": 2.0579469203948975, "learning_rate": 0.00010914247909489934, "loss": 43.544, "step": 729 }, { "epoch": 0.07240626859750049, "grad_norm": 1.9957363605499268, "learning_rate": 0.00010893758525968789, "loss": 43.4117, "step": 730 }, { "epoch": 0.07250545526681214, "grad_norm": 2.300391912460327, "learning_rate": 0.00010873265359400828, "loss": 43.3068, "step": 731 }, { "epoch": 0.07260464193612379, "grad_norm": 1.767351746559143, "learning_rate": 0.0001085276849652828, "loss": 43.4994, "step": 732 }, { "epoch": 0.07270382860543544, "grad_norm": 1.922280192375183, "learning_rate": 0.00010832268024109025, "loss": 43.4082, "step": 733 }, { "epoch": 0.07280301527474707, "grad_norm": 1.8020707368850708, "learning_rate": 0.00010811764028916215, "loss": 43.322, "step": 734 }, { "epoch": 0.07290220194405872, "grad_norm": 1.94365656375885, "learning_rate": 0.00010791256597737917, "loss": 43.3427, "step": 735 }, { "epoch": 0.07300138861337037, "grad_norm": 1.7357027530670166, "learning_rate": 0.00010770745817376742, "loss": 43.257, "step": 736 }, { "epoch": 0.073100575282682, "grad_norm": 2.018589973449707, "learning_rate": 0.00010750231774649471, "loss": 43.3609, "step": 737 }, { "epoch": 0.07319976195199365, "grad_norm": 2.106233835220337, "learning_rate": 0.00010729714556386703, "loss": 43.1318, "step": 738 }, { "epoch": 0.0732989486213053, "grad_norm": 2.0256407260894775, "learning_rate": 0.0001070919424943247, "loss": 43.1411, "step": 739 }, { "epoch": 0.07339813529061694, "grad_norm": 1.8484340906143188, "learning_rate": 0.00010688670940643886, "loss": 43.1536, "step": 740 }, { "epoch": 0.07349732195992859, "grad_norm": 1.8888932466506958, "learning_rate": 0.0001066814471689076, "loss": 43.2585, "step": 741 }, { "epoch": 0.07359650862924023, "grad_norm": 2.2361347675323486, "learning_rate": 0.0001064761566505525, "loss": 43.4558, "step": 742 }, { "epoch": 0.07369569529855187, "grad_norm": 1.6723800897598267, "learning_rate": 0.00010627083872031475, "loss": 43.4809, "step": 743 }, { "epoch": 0.07379488196786352, "grad_norm": 2.4091243743896484, "learning_rate": 0.0001060654942472517, "loss": 43.1592, "step": 744 }, { "epoch": 0.07389406863717517, "grad_norm": 2.046893358230591, "learning_rate": 0.00010586012410053292, "loss": 43.4459, "step": 745 }, { "epoch": 0.0739932553064868, "grad_norm": 1.9660325050354004, "learning_rate": 0.00010565472914943669, "loss": 43.2088, "step": 746 }, { "epoch": 0.07409244197579845, "grad_norm": 1.8325823545455933, "learning_rate": 0.00010544931026334628, "loss": 43.5796, "step": 747 }, { "epoch": 0.0741916286451101, "grad_norm": 1.9669077396392822, "learning_rate": 0.00010524386831174628, "loss": 43.1441, "step": 748 }, { "epoch": 0.07429081531442173, "grad_norm": 2.019927501678467, "learning_rate": 0.00010503840416421897, "loss": 43.3234, "step": 749 }, { "epoch": 0.07439000198373338, "grad_norm": 1.807498574256897, "learning_rate": 0.0001048329186904404, "loss": 43.3648, "step": 750 }, { "epoch": 0.07448918865304503, "grad_norm": 2.210771322250366, "learning_rate": 0.00010462741276017711, "loss": 43.4761, "step": 751 }, { "epoch": 0.07458837532235668, "grad_norm": 1.832092523574829, "learning_rate": 0.00010442188724328205, "loss": 43.3958, "step": 752 }, { "epoch": 0.07468756199166832, "grad_norm": 2.0560593605041504, "learning_rate": 0.00010421634300969123, "loss": 43.2366, "step": 753 }, { "epoch": 0.07478674866097997, "grad_norm": 1.571313500404358, "learning_rate": 0.00010401078092941971, "loss": 43.4409, "step": 754 }, { "epoch": 0.07488593533029161, "grad_norm": 2.0047566890716553, "learning_rate": 0.00010380520187255825, "loss": 43.4865, "step": 755 }, { "epoch": 0.07498512199960325, "grad_norm": 2.1494686603546143, "learning_rate": 0.00010359960670926934, "loss": 43.4681, "step": 756 }, { "epoch": 0.0750843086689149, "grad_norm": 1.9020618200302124, "learning_rate": 0.00010339399630978373, "loss": 43.253, "step": 757 }, { "epoch": 0.07518349533822655, "grad_norm": 2.162813186645508, "learning_rate": 0.00010318837154439665, "loss": 43.5694, "step": 758 }, { "epoch": 0.07528268200753818, "grad_norm": 2.1656718254089355, "learning_rate": 0.00010298273328346411, "loss": 43.6855, "step": 759 }, { "epoch": 0.07538186867684983, "grad_norm": 1.8090124130249023, "learning_rate": 0.00010277708239739924, "loss": 43.2986, "step": 760 }, { "epoch": 0.07548105534616148, "grad_norm": 1.8135164976119995, "learning_rate": 0.00010257141975666866, "loss": 43.3275, "step": 761 }, { "epoch": 0.07558024201547311, "grad_norm": 2.150045871734619, "learning_rate": 0.0001023657462317887, "loss": 43.1874, "step": 762 }, { "epoch": 0.07567942868478476, "grad_norm": 2.495265007019043, "learning_rate": 0.0001021600626933217, "loss": 43.0854, "step": 763 }, { "epoch": 0.07577861535409641, "grad_norm": 2.129762887954712, "learning_rate": 0.0001019543700118725, "loss": 43.2001, "step": 764 }, { "epoch": 0.07587780202340805, "grad_norm": 1.7054615020751953, "learning_rate": 0.0001017486690580846, "loss": 43.2899, "step": 765 }, { "epoch": 0.0759769886927197, "grad_norm": 1.7827651500701904, "learning_rate": 0.00010154296070263649, "loss": 43.2659, "step": 766 }, { "epoch": 0.07607617536203135, "grad_norm": 1.9203288555145264, "learning_rate": 0.00010133724581623802, "loss": 43.2211, "step": 767 }, { "epoch": 0.076175362031343, "grad_norm": 2.113375663757324, "learning_rate": 0.00010113152526962665, "loss": 43.0736, "step": 768 }, { "epoch": 0.07627454870065463, "grad_norm": 1.8991948366165161, "learning_rate": 0.00010092579993356386, "loss": 43.3945, "step": 769 }, { "epoch": 0.07637373536996628, "grad_norm": 2.1152963638305664, "learning_rate": 0.00010072007067883132, "loss": 43.3966, "step": 770 }, { "epoch": 0.07637373536996628, "eval_loss": 10.826260566711426, "eval_runtime": 11.198, "eval_samples_per_second": 379.085, "eval_steps_per_second": 189.587, "step": 770 }, { "epoch": 0.07647292203927793, "grad_norm": 1.8426955938339233, "learning_rate": 0.0001005143383762273, "loss": 43.6264, "step": 771 }, { "epoch": 0.07657210870858956, "grad_norm": 1.909189224243164, "learning_rate": 0.00010030860389656305, "loss": 43.3986, "step": 772 }, { "epoch": 0.07667129537790121, "grad_norm": 1.6439357995986938, "learning_rate": 0.00010010286811065899, "loss": 43.3349, "step": 773 }, { "epoch": 0.07677048204721286, "grad_norm": 1.8733209371566772, "learning_rate": 9.989713188934105e-05, "loss": 43.1739, "step": 774 }, { "epoch": 0.0768696687165245, "grad_norm": 1.9661895036697388, "learning_rate": 9.969139610343696e-05, "loss": 43.4383, "step": 775 }, { "epoch": 0.07696885538583614, "grad_norm": 1.8934898376464844, "learning_rate": 9.948566162377274e-05, "loss": 43.4416, "step": 776 }, { "epoch": 0.07706804205514779, "grad_norm": 2.4417266845703125, "learning_rate": 9.927992932116873e-05, "loss": 43.3214, "step": 777 }, { "epoch": 0.07716722872445943, "grad_norm": 2.0885379314422607, "learning_rate": 9.907420006643619e-05, "loss": 43.3923, "step": 778 }, { "epoch": 0.07726641539377108, "grad_norm": 2.045199394226074, "learning_rate": 9.886847473037337e-05, "loss": 43.4345, "step": 779 }, { "epoch": 0.07736560206308273, "grad_norm": 1.69662344455719, "learning_rate": 9.866275418376202e-05, "loss": 43.5109, "step": 780 }, { "epoch": 0.07746478873239436, "grad_norm": 1.8541556596755981, "learning_rate": 9.845703929736351e-05, "loss": 43.1333, "step": 781 }, { "epoch": 0.07756397540170601, "grad_norm": 1.9225294589996338, "learning_rate": 9.825133094191541e-05, "loss": 43.5618, "step": 782 }, { "epoch": 0.07766316207101766, "grad_norm": 1.8808287382125854, "learning_rate": 9.80456299881275e-05, "loss": 43.2339, "step": 783 }, { "epoch": 0.07776234874032929, "grad_norm": 2.0820178985595703, "learning_rate": 9.783993730667831e-05, "loss": 43.2547, "step": 784 }, { "epoch": 0.07786153540964094, "grad_norm": 1.9718780517578125, "learning_rate": 9.763425376821134e-05, "loss": 43.2476, "step": 785 }, { "epoch": 0.07796072207895259, "grad_norm": 2.1505115032196045, "learning_rate": 9.742858024333136e-05, "loss": 43.3617, "step": 786 }, { "epoch": 0.07805990874826424, "grad_norm": 2.218475103378296, "learning_rate": 9.722291760260077e-05, "loss": 43.4309, "step": 787 }, { "epoch": 0.07815909541757587, "grad_norm": 1.9229711294174194, "learning_rate": 9.70172667165359e-05, "loss": 43.0936, "step": 788 }, { "epoch": 0.07825828208688752, "grad_norm": 2.0784668922424316, "learning_rate": 9.681162845560336e-05, "loss": 43.4272, "step": 789 }, { "epoch": 0.07835746875619917, "grad_norm": 2.339954376220703, "learning_rate": 9.66060036902163e-05, "loss": 43.3583, "step": 790 }, { "epoch": 0.07845665542551081, "grad_norm": 1.8724277019500732, "learning_rate": 9.64003932907307e-05, "loss": 43.3627, "step": 791 }, { "epoch": 0.07855584209482246, "grad_norm": 2.0573184490203857, "learning_rate": 9.619479812744182e-05, "loss": 43.4146, "step": 792 }, { "epoch": 0.0786550287641341, "grad_norm": 1.992231011390686, "learning_rate": 9.598921907058033e-05, "loss": 43.4727, "step": 793 }, { "epoch": 0.07875421543344574, "grad_norm": 2.3907063007354736, "learning_rate": 9.578365699030882e-05, "loss": 43.3525, "step": 794 }, { "epoch": 0.07885340210275739, "grad_norm": 1.9989038705825806, "learning_rate": 9.557811275671793e-05, "loss": 43.281, "step": 795 }, { "epoch": 0.07895258877206904, "grad_norm": 1.6934643983840942, "learning_rate": 9.53725872398229e-05, "loss": 42.9595, "step": 796 }, { "epoch": 0.07905177544138067, "grad_norm": 1.954426646232605, "learning_rate": 9.51670813095596e-05, "loss": 43.2288, "step": 797 }, { "epoch": 0.07915096211069232, "grad_norm": 2.4834814071655273, "learning_rate": 9.496159583578107e-05, "loss": 43.4051, "step": 798 }, { "epoch": 0.07925014878000397, "grad_norm": 1.9120532274246216, "learning_rate": 9.475613168825374e-05, "loss": 43.4583, "step": 799 }, { "epoch": 0.0793493354493156, "grad_norm": 2.355027437210083, "learning_rate": 9.455068973665376e-05, "loss": 43.3426, "step": 800 }, { "epoch": 0.07944852211862725, "grad_norm": 2.232403039932251, "learning_rate": 9.434527085056335e-05, "loss": 43.3287, "step": 801 }, { "epoch": 0.0795477087879389, "grad_norm": 1.9266585111618042, "learning_rate": 9.413987589946711e-05, "loss": 43.3629, "step": 802 }, { "epoch": 0.07964689545725055, "grad_norm": 2.3601436614990234, "learning_rate": 9.393450575274831e-05, "loss": 42.8946, "step": 803 }, { "epoch": 0.07974608212656219, "grad_norm": 2.0758936405181885, "learning_rate": 9.372916127968525e-05, "loss": 43.434, "step": 804 }, { "epoch": 0.07984526879587384, "grad_norm": 1.6952619552612305, "learning_rate": 9.352384334944753e-05, "loss": 43.2809, "step": 805 }, { "epoch": 0.07994445546518548, "grad_norm": 1.6877068281173706, "learning_rate": 9.331855283109244e-05, "loss": 43.3979, "step": 806 }, { "epoch": 0.08004364213449712, "grad_norm": 1.739186406135559, "learning_rate": 9.311329059356119e-05, "loss": 43.4973, "step": 807 }, { "epoch": 0.08014282880380877, "grad_norm": 2.102400779724121, "learning_rate": 9.290805750567532e-05, "loss": 43.5786, "step": 808 }, { "epoch": 0.08024201547312042, "grad_norm": 1.8936583995819092, "learning_rate": 9.270285443613297e-05, "loss": 43.2228, "step": 809 }, { "epoch": 0.08034120214243205, "grad_norm": 1.5889769792556763, "learning_rate": 9.249768225350528e-05, "loss": 43.1588, "step": 810 }, { "epoch": 0.0804403888117437, "grad_norm": 2.0594937801361084, "learning_rate": 9.22925418262326e-05, "loss": 43.5444, "step": 811 }, { "epoch": 0.08053957548105535, "grad_norm": 2.0004096031188965, "learning_rate": 9.208743402262085e-05, "loss": 43.533, "step": 812 }, { "epoch": 0.08063876215036699, "grad_norm": 1.799493670463562, "learning_rate": 9.188235971083786e-05, "loss": 43.4169, "step": 813 }, { "epoch": 0.08073794881967863, "grad_norm": 2.0812935829162598, "learning_rate": 9.167731975890976e-05, "loss": 43.4735, "step": 814 }, { "epoch": 0.08083713548899028, "grad_norm": 2.335212230682373, "learning_rate": 9.147231503471722e-05, "loss": 43.2451, "step": 815 }, { "epoch": 0.08093632215830192, "grad_norm": 1.8596166372299194, "learning_rate": 9.126734640599175e-05, "loss": 43.5395, "step": 816 }, { "epoch": 0.08103550882761357, "grad_norm": 1.847628116607666, "learning_rate": 9.106241474031212e-05, "loss": 43.522, "step": 817 }, { "epoch": 0.08113469549692522, "grad_norm": 1.8958014249801636, "learning_rate": 9.085752090510068e-05, "loss": 43.2172, "step": 818 }, { "epoch": 0.08123388216623685, "grad_norm": 2.072009801864624, "learning_rate": 9.065266576761957e-05, "loss": 43.3761, "step": 819 }, { "epoch": 0.0813330688355485, "grad_norm": 2.2282555103302, "learning_rate": 9.04478501949672e-05, "loss": 43.374, "step": 820 }, { "epoch": 0.08143225550486015, "grad_norm": 2.631197690963745, "learning_rate": 9.024307505407448e-05, "loss": 43.3543, "step": 821 }, { "epoch": 0.0815314421741718, "grad_norm": 1.9607280492782593, "learning_rate": 9.00383412117012e-05, "loss": 43.2129, "step": 822 }, { "epoch": 0.08163062884348343, "grad_norm": 2.081810474395752, "learning_rate": 8.983364953443227e-05, "loss": 43.1948, "step": 823 }, { "epoch": 0.08172981551279508, "grad_norm": 1.7728948593139648, "learning_rate": 8.962900088867433e-05, "loss": 43.5004, "step": 824 }, { "epoch": 0.08182900218210673, "grad_norm": 2.38051700592041, "learning_rate": 8.942439614065163e-05, "loss": 43.1374, "step": 825 }, { "epoch": 0.08192818885141837, "grad_norm": 2.2605717182159424, "learning_rate": 8.921983615640277e-05, "loss": 43.0003, "step": 826 }, { "epoch": 0.08202737552073001, "grad_norm": 2.0431177616119385, "learning_rate": 8.901532180177681e-05, "loss": 43.2422, "step": 827 }, { "epoch": 0.08212656219004166, "grad_norm": 1.8278536796569824, "learning_rate": 8.881085394242969e-05, "loss": 43.497, "step": 828 }, { "epoch": 0.0822257488593533, "grad_norm": 1.7372757196426392, "learning_rate": 8.860643344382056e-05, "loss": 43.6374, "step": 829 }, { "epoch": 0.08232493552866495, "grad_norm": 2.222142219543457, "learning_rate": 8.840206117120808e-05, "loss": 43.5056, "step": 830 }, { "epoch": 0.0824241221979766, "grad_norm": 1.8501993417739868, "learning_rate": 8.819773798964682e-05, "loss": 43.4804, "step": 831 }, { "epoch": 0.08252330886728823, "grad_norm": 2.0640487670898438, "learning_rate": 8.79934647639835e-05, "loss": 43.0978, "step": 832 }, { "epoch": 0.08262249553659988, "grad_norm": 1.6562526226043701, "learning_rate": 8.778924235885347e-05, "loss": 43.5381, "step": 833 }, { "epoch": 0.08272168220591153, "grad_norm": 2.252190113067627, "learning_rate": 8.758507163867688e-05, "loss": 43.4221, "step": 834 }, { "epoch": 0.08282086887522316, "grad_norm": 2.2501325607299805, "learning_rate": 8.738095346765518e-05, "loss": 43.3227, "step": 835 }, { "epoch": 0.08292005554453481, "grad_norm": 2.0487358570098877, "learning_rate": 8.717688870976735e-05, "loss": 43.293, "step": 836 }, { "epoch": 0.08301924221384646, "grad_norm": 2.0176048278808594, "learning_rate": 8.697287822876634e-05, "loss": 43.4876, "step": 837 }, { "epoch": 0.08311842888315811, "grad_norm": 1.7127087116241455, "learning_rate": 8.676892288817531e-05, "loss": 43.4954, "step": 838 }, { "epoch": 0.08321761555246975, "grad_norm": 2.146505832672119, "learning_rate": 8.656502355128403e-05, "loss": 43.288, "step": 839 }, { "epoch": 0.0833168022217814, "grad_norm": 2.016024112701416, "learning_rate": 8.636118108114525e-05, "loss": 43.3378, "step": 840 }, { "epoch": 0.08341598889109304, "grad_norm": 1.853689193725586, "learning_rate": 8.615739634057098e-05, "loss": 43.3059, "step": 841 }, { "epoch": 0.08351517556040468, "grad_norm": 2.3869194984436035, "learning_rate": 8.595367019212891e-05, "loss": 43.3853, "step": 842 }, { "epoch": 0.08361436222971633, "grad_norm": 2.057309865951538, "learning_rate": 8.575000349813869e-05, "loss": 43.6381, "step": 843 }, { "epoch": 0.08371354889902798, "grad_norm": 1.9723752737045288, "learning_rate": 8.554639712066836e-05, "loss": 43.4401, "step": 844 }, { "epoch": 0.08381273556833961, "grad_norm": 2.210163116455078, "learning_rate": 8.534285192153063e-05, "loss": 43.373, "step": 845 }, { "epoch": 0.08391192223765126, "grad_norm": 1.7642301321029663, "learning_rate": 8.513936876227924e-05, "loss": 43.6333, "step": 846 }, { "epoch": 0.08401110890696291, "grad_norm": 2.186332941055298, "learning_rate": 8.493594850420537e-05, "loss": 43.4153, "step": 847 }, { "epoch": 0.08411029557627454, "grad_norm": 2.000633716583252, "learning_rate": 8.473259200833392e-05, "loss": 43.5029, "step": 848 }, { "epoch": 0.08420948224558619, "grad_norm": 1.8704267740249634, "learning_rate": 8.452930013541991e-05, "loss": 43.1078, "step": 849 }, { "epoch": 0.08430866891489784, "grad_norm": 1.8920806646347046, "learning_rate": 8.432607374594484e-05, "loss": 43.4884, "step": 850 }, { "epoch": 0.08440785558420948, "grad_norm": 2.1088051795959473, "learning_rate": 8.412291370011305e-05, "loss": 43.5061, "step": 851 }, { "epoch": 0.08450704225352113, "grad_norm": 1.9346009492874146, "learning_rate": 8.391982085784804e-05, "loss": 43.4881, "step": 852 }, { "epoch": 0.08460622892283277, "grad_norm": 1.6725366115570068, "learning_rate": 8.371679607878884e-05, "loss": 43.369, "step": 853 }, { "epoch": 0.08470541559214441, "grad_norm": 2.194059133529663, "learning_rate": 8.351384022228644e-05, "loss": 43.3481, "step": 854 }, { "epoch": 0.08480460226145606, "grad_norm": 2.2864034175872803, "learning_rate": 8.331095414740002e-05, "loss": 43.0558, "step": 855 }, { "epoch": 0.0849037889307677, "grad_norm": 1.8839657306671143, "learning_rate": 8.310813871289348e-05, "loss": 43.2508, "step": 856 }, { "epoch": 0.08500297560007936, "grad_norm": 1.9816380739212036, "learning_rate": 8.290539477723166e-05, "loss": 43.3909, "step": 857 }, { "epoch": 0.08510216226939099, "grad_norm": 1.9969961643218994, "learning_rate": 8.270272319857677e-05, "loss": 42.9909, "step": 858 }, { "epoch": 0.08520134893870264, "grad_norm": 2.2538459300994873, "learning_rate": 8.250012483478478e-05, "loss": 43.2314, "step": 859 }, { "epoch": 0.08530053560801429, "grad_norm": 2.2255730628967285, "learning_rate": 8.229760054340173e-05, "loss": 43.3036, "step": 860 }, { "epoch": 0.08539972227732592, "grad_norm": 1.8784208297729492, "learning_rate": 8.209515118166013e-05, "loss": 43.1374, "step": 861 }, { "epoch": 0.08549890894663757, "grad_norm": 1.8871439695358276, "learning_rate": 8.189277760647537e-05, "loss": 43.5098, "step": 862 }, { "epoch": 0.08559809561594922, "grad_norm": 1.8206897974014282, "learning_rate": 8.169048067444202e-05, "loss": 43.5459, "step": 863 }, { "epoch": 0.08569728228526086, "grad_norm": 2.27030086517334, "learning_rate": 8.148826124183021e-05, "loss": 43.1809, "step": 864 }, { "epoch": 0.0857964689545725, "grad_norm": 1.6263984441757202, "learning_rate": 8.128612016458215e-05, "loss": 43.3087, "step": 865 }, { "epoch": 0.08589565562388415, "grad_norm": 2.527437210083008, "learning_rate": 8.108405829830828e-05, "loss": 43.2486, "step": 866 }, { "epoch": 0.08599484229319579, "grad_norm": 1.9116472005844116, "learning_rate": 8.088207649828378e-05, "loss": 43.2823, "step": 867 }, { "epoch": 0.08609402896250744, "grad_norm": 2.1750049591064453, "learning_rate": 8.068017561944499e-05, "loss": 43.3239, "step": 868 }, { "epoch": 0.08619321563181909, "grad_norm": 2.554185628890991, "learning_rate": 8.047835651638564e-05, "loss": 43.2689, "step": 869 }, { "epoch": 0.08629240230113072, "grad_norm": 2.2520182132720947, "learning_rate": 8.027662004335341e-05, "loss": 43.5203, "step": 870 }, { "epoch": 0.08639158897044237, "grad_norm": 2.4951577186584473, "learning_rate": 8.00749670542462e-05, "loss": 43.1764, "step": 871 }, { "epoch": 0.08649077563975402, "grad_norm": 2.1510233879089355, "learning_rate": 7.98733984026085e-05, "loss": 43.0574, "step": 872 }, { "epoch": 0.08658996230906567, "grad_norm": 2.0969350337982178, "learning_rate": 7.967191494162788e-05, "loss": 43.3438, "step": 873 }, { "epoch": 0.0866891489783773, "grad_norm": 2.208730936050415, "learning_rate": 7.94705175241313e-05, "loss": 43.2914, "step": 874 }, { "epoch": 0.08678833564768895, "grad_norm": 1.6258764266967773, "learning_rate": 7.926920700258151e-05, "loss": 43.3183, "step": 875 }, { "epoch": 0.0868875223170006, "grad_norm": 1.7710012197494507, "learning_rate": 7.906798422907348e-05, "loss": 43.4455, "step": 876 }, { "epoch": 0.08698670898631224, "grad_norm": 1.8769505023956299, "learning_rate": 7.886685005533072e-05, "loss": 43.1386, "step": 877 }, { "epoch": 0.08708589565562389, "grad_norm": 1.9469021558761597, "learning_rate": 7.866580533270175e-05, "loss": 43.3755, "step": 878 }, { "epoch": 0.08718508232493553, "grad_norm": 1.9156124591827393, "learning_rate": 7.846485091215649e-05, "loss": 43.3023, "step": 879 }, { "epoch": 0.08728426899424717, "grad_norm": 1.9027340412139893, "learning_rate": 7.82639876442826e-05, "loss": 43.1858, "step": 880 }, { "epoch": 0.08738345566355882, "grad_norm": 1.8347947597503662, "learning_rate": 7.806321637928191e-05, "loss": 43.2774, "step": 881 }, { "epoch": 0.08748264233287047, "grad_norm": 1.7137409448623657, "learning_rate": 7.786253796696685e-05, "loss": 43.3747, "step": 882 }, { "epoch": 0.0875818290021821, "grad_norm": 1.8516457080841064, "learning_rate": 7.76619532567568e-05, "loss": 43.3912, "step": 883 }, { "epoch": 0.08768101567149375, "grad_norm": 2.108398914337158, "learning_rate": 7.746146309767457e-05, "loss": 43.1975, "step": 884 }, { "epoch": 0.0877802023408054, "grad_norm": 1.9404754638671875, "learning_rate": 7.72610683383427e-05, "loss": 43.2615, "step": 885 }, { "epoch": 0.08787938901011703, "grad_norm": 1.8628884553909302, "learning_rate": 7.706076982697999e-05, "loss": 43.2653, "step": 886 }, { "epoch": 0.08797857567942868, "grad_norm": 2.1554386615753174, "learning_rate": 7.68605684113978e-05, "loss": 43.2609, "step": 887 }, { "epoch": 0.08807776234874033, "grad_norm": 1.7659459114074707, "learning_rate": 7.666046493899652e-05, "loss": 43.0785, "step": 888 }, { "epoch": 0.08817694901805197, "grad_norm": 2.274144411087036, "learning_rate": 7.646046025676198e-05, "loss": 43.1762, "step": 889 }, { "epoch": 0.08827613568736362, "grad_norm": 1.9197145700454712, "learning_rate": 7.626055521126186e-05, "loss": 43.5115, "step": 890 }, { "epoch": 0.08837532235667526, "grad_norm": 1.8726004362106323, "learning_rate": 7.606075064864208e-05, "loss": 43.2822, "step": 891 }, { "epoch": 0.08847450902598691, "grad_norm": 1.746451497077942, "learning_rate": 7.586104741462325e-05, "loss": 43.5798, "step": 892 }, { "epoch": 0.08857369569529855, "grad_norm": 2.0699939727783203, "learning_rate": 7.566144635449714e-05, "loss": 43.0018, "step": 893 }, { "epoch": 0.0886728823646102, "grad_norm": 2.1814639568328857, "learning_rate": 7.546194831312299e-05, "loss": 43.2637, "step": 894 }, { "epoch": 0.08877206903392185, "grad_norm": 1.8805299997329712, "learning_rate": 7.526255413492395e-05, "loss": 43.2859, "step": 895 }, { "epoch": 0.08887125570323348, "grad_norm": 1.977893590927124, "learning_rate": 7.506326466388365e-05, "loss": 42.9297, "step": 896 }, { "epoch": 0.08897044237254513, "grad_norm": 2.2348368167877197, "learning_rate": 7.486408074354239e-05, "loss": 43.3931, "step": 897 }, { "epoch": 0.08906962904185678, "grad_norm": 2.1115787029266357, "learning_rate": 7.466500321699383e-05, "loss": 43.3988, "step": 898 }, { "epoch": 0.08916881571116841, "grad_norm": 1.9082059860229492, "learning_rate": 7.446603292688122e-05, "loss": 43.1923, "step": 899 }, { "epoch": 0.08926800238048006, "grad_norm": 2.0960566997528076, "learning_rate": 7.426717071539391e-05, "loss": 43.1175, "step": 900 }, { "epoch": 0.08936718904979171, "grad_norm": 2.003662586212158, "learning_rate": 7.40684174242638e-05, "loss": 43.2361, "step": 901 }, { "epoch": 0.08946637571910335, "grad_norm": 2.0358119010925293, "learning_rate": 7.386977389476177e-05, "loss": 43.2814, "step": 902 }, { "epoch": 0.089565562388415, "grad_norm": 1.760054349899292, "learning_rate": 7.367124096769406e-05, "loss": 43.4104, "step": 903 }, { "epoch": 0.08966474905772664, "grad_norm": 2.033608913421631, "learning_rate": 7.347281948339879e-05, "loss": 43.3881, "step": 904 }, { "epoch": 0.08976393572703828, "grad_norm": 1.8651162385940552, "learning_rate": 7.327451028174239e-05, "loss": 43.3416, "step": 905 }, { "epoch": 0.08986312239634993, "grad_norm": 2.2551417350769043, "learning_rate": 7.307631420211597e-05, "loss": 43.2184, "step": 906 }, { "epoch": 0.08996230906566158, "grad_norm": 2.0666677951812744, "learning_rate": 7.287823208343192e-05, "loss": 43.5389, "step": 907 }, { "epoch": 0.09006149573497323, "grad_norm": 2.007537841796875, "learning_rate": 7.268026476412016e-05, "loss": 43.491, "step": 908 }, { "epoch": 0.09016068240428486, "grad_norm": 1.9002093076705933, "learning_rate": 7.248241308212476e-05, "loss": 43.2267, "step": 909 }, { "epoch": 0.09025986907359651, "grad_norm": 2.2901949882507324, "learning_rate": 7.228467787490028e-05, "loss": 43.3848, "step": 910 }, { "epoch": 0.09035905574290816, "grad_norm": 2.2208406925201416, "learning_rate": 7.208705997940832e-05, "loss": 43.4141, "step": 911 }, { "epoch": 0.0904582424122198, "grad_norm": 2.069545269012451, "learning_rate": 7.18895602321139e-05, "loss": 43.3589, "step": 912 }, { "epoch": 0.09055742908153144, "grad_norm": 1.8305208683013916, "learning_rate": 7.169217946898197e-05, "loss": 43.0796, "step": 913 }, { "epoch": 0.09065661575084309, "grad_norm": 2.1818175315856934, "learning_rate": 7.149491852547381e-05, "loss": 43.4106, "step": 914 }, { "epoch": 0.09075580242015473, "grad_norm": 2.072183847427368, "learning_rate": 7.129777823654357e-05, "loss": 43.4138, "step": 915 }, { "epoch": 0.09085498908946638, "grad_norm": 1.7879893779754639, "learning_rate": 7.110075943663472e-05, "loss": 43.403, "step": 916 }, { "epoch": 0.09095417575877802, "grad_norm": 1.8871010541915894, "learning_rate": 7.090386295967645e-05, "loss": 43.4451, "step": 917 }, { "epoch": 0.09105336242808966, "grad_norm": 2.0419015884399414, "learning_rate": 7.070708963908022e-05, "loss": 43.1008, "step": 918 }, { "epoch": 0.09115254909740131, "grad_norm": 1.8709052801132202, "learning_rate": 7.051044030773618e-05, "loss": 43.2672, "step": 919 }, { "epoch": 0.09125173576671296, "grad_norm": 1.7151799201965332, "learning_rate": 7.031391579800968e-05, "loss": 43.3119, "step": 920 }, { "epoch": 0.09135092243602459, "grad_norm": 1.9316984415054321, "learning_rate": 7.011751694173774e-05, "loss": 42.986, "step": 921 }, { "epoch": 0.09145010910533624, "grad_norm": 1.858638882637024, "learning_rate": 6.992124457022553e-05, "loss": 43.3671, "step": 922 }, { "epoch": 0.09154929577464789, "grad_norm": 1.8579833507537842, "learning_rate": 6.97250995142428e-05, "loss": 43.462, "step": 923 }, { "epoch": 0.09164848244395953, "grad_norm": 2.0259623527526855, "learning_rate": 6.952908260402044e-05, "loss": 43.3319, "step": 924 }, { "epoch": 0.09174766911327117, "grad_norm": 2.292117118835449, "learning_rate": 6.933319466924693e-05, "loss": 43.3251, "step": 925 }, { "epoch": 0.09184685578258282, "grad_norm": 1.8373504877090454, "learning_rate": 6.913743653906481e-05, "loss": 43.3862, "step": 926 }, { "epoch": 0.09194604245189447, "grad_norm": 1.8580958843231201, "learning_rate": 6.894180904206722e-05, "loss": 43.4301, "step": 927 }, { "epoch": 0.0920452291212061, "grad_norm": 1.6251627206802368, "learning_rate": 6.874631300629435e-05, "loss": 43.3982, "step": 928 }, { "epoch": 0.09214441579051776, "grad_norm": 1.8827046155929565, "learning_rate": 6.855094925922995e-05, "loss": 43.3695, "step": 929 }, { "epoch": 0.0922436024598294, "grad_norm": 1.693343997001648, "learning_rate": 6.835571862779782e-05, "loss": 43.4202, "step": 930 }, { "epoch": 0.09234278912914104, "grad_norm": 1.837537407875061, "learning_rate": 6.81606219383583e-05, "loss": 43.2809, "step": 931 }, { "epoch": 0.09244197579845269, "grad_norm": 2.221561908721924, "learning_rate": 6.796566001670484e-05, "loss": 43.4303, "step": 932 }, { "epoch": 0.09254116246776434, "grad_norm": 2.1096158027648926, "learning_rate": 6.777083368806045e-05, "loss": 43.414, "step": 933 }, { "epoch": 0.09264034913707597, "grad_norm": 1.9879943132400513, "learning_rate": 6.757614377707409e-05, "loss": 43.0699, "step": 934 }, { "epoch": 0.09273953580638762, "grad_norm": 2.0617077350616455, "learning_rate": 6.738159110781748e-05, "loss": 43.3147, "step": 935 }, { "epoch": 0.09283872247569927, "grad_norm": 1.7820866107940674, "learning_rate": 6.71871765037813e-05, "loss": 43.5547, "step": 936 }, { "epoch": 0.0929379091450109, "grad_norm": 1.9996870756149292, "learning_rate": 6.699290078787193e-05, "loss": 43.26, "step": 937 }, { "epoch": 0.09303709581432255, "grad_norm": 2.1735830307006836, "learning_rate": 6.679876478240777e-05, "loss": 43.3021, "step": 938 }, { "epoch": 0.0931362824836342, "grad_norm": 2.156907320022583, "learning_rate": 6.660476930911592e-05, "loss": 43.4819, "step": 939 }, { "epoch": 0.09323546915294584, "grad_norm": 1.9821912050247192, "learning_rate": 6.641091518912867e-05, "loss": 43.5658, "step": 940 }, { "epoch": 0.09333465582225749, "grad_norm": 2.2868778705596924, "learning_rate": 6.621720324297995e-05, "loss": 43.0609, "step": 941 }, { "epoch": 0.09343384249156914, "grad_norm": 2.146440029144287, "learning_rate": 6.602363429060195e-05, "loss": 43.3597, "step": 942 }, { "epoch": 0.09353302916088078, "grad_norm": 1.8547191619873047, "learning_rate": 6.583020915132152e-05, "loss": 43.2957, "step": 943 }, { "epoch": 0.09363221583019242, "grad_norm": 1.8656991720199585, "learning_rate": 6.563692864385687e-05, "loss": 43.4724, "step": 944 }, { "epoch": 0.09373140249950407, "grad_norm": 1.7782119512557983, "learning_rate": 6.544379358631402e-05, "loss": 43.3768, "step": 945 }, { "epoch": 0.09383058916881572, "grad_norm": 2.25162410736084, "learning_rate": 6.525080479618331e-05, "loss": 42.9683, "step": 946 }, { "epoch": 0.09392977583812735, "grad_norm": 2.063875198364258, "learning_rate": 6.505796309033594e-05, "loss": 43.382, "step": 947 }, { "epoch": 0.094028962507439, "grad_norm": 2.0970652103424072, "learning_rate": 6.486526928502052e-05, "loss": 43.3539, "step": 948 }, { "epoch": 0.09412814917675065, "grad_norm": 2.415295124053955, "learning_rate": 6.467272419585984e-05, "loss": 43.5025, "step": 949 }, { "epoch": 0.09422733584606229, "grad_norm": 1.9924023151397705, "learning_rate": 6.448032863784695e-05, "loss": 43.4337, "step": 950 }, { "epoch": 0.09432652251537393, "grad_norm": 1.8715537786483765, "learning_rate": 6.428808342534212e-05, "loss": 43.1356, "step": 951 }, { "epoch": 0.09442570918468558, "grad_norm": 2.0384087562561035, "learning_rate": 6.40959893720692e-05, "loss": 43.2594, "step": 952 }, { "epoch": 0.09452489585399722, "grad_norm": 2.6872024536132812, "learning_rate": 6.390404729111225e-05, "loss": 43.0744, "step": 953 }, { "epoch": 0.09462408252330887, "grad_norm": 2.256521463394165, "learning_rate": 6.371225799491203e-05, "loss": 42.9652, "step": 954 }, { "epoch": 0.09472326919262052, "grad_norm": 1.823710322380066, "learning_rate": 6.352062229526266e-05, "loss": 43.4237, "step": 955 }, { "epoch": 0.09482245586193215, "grad_norm": 1.9346442222595215, "learning_rate": 6.332914100330805e-05, "loss": 43.2199, "step": 956 }, { "epoch": 0.0949216425312438, "grad_norm": 1.7909510135650635, "learning_rate": 6.313781492953861e-05, "loss": 43.5083, "step": 957 }, { "epoch": 0.09502082920055545, "grad_norm": 1.8812695741653442, "learning_rate": 6.294664488378776e-05, "loss": 43.4594, "step": 958 }, { "epoch": 0.09512001586986708, "grad_norm": 2.4366650581359863, "learning_rate": 6.27556316752284e-05, "loss": 43.4484, "step": 959 }, { "epoch": 0.09521920253917873, "grad_norm": 2.2085118293762207, "learning_rate": 6.256477611236966e-05, "loss": 43.2067, "step": 960 }, { "epoch": 0.09531838920849038, "grad_norm": 2.11188006401062, "learning_rate": 6.237407900305335e-05, "loss": 43.0082, "step": 961 }, { "epoch": 0.09541757587780203, "grad_norm": 2.034411668777466, "learning_rate": 6.218354115445069e-05, "loss": 43.0793, "step": 962 }, { "epoch": 0.09551676254711366, "grad_norm": 2.1220459938049316, "learning_rate": 6.199316337305867e-05, "loss": 43.2767, "step": 963 }, { "epoch": 0.09561594921642531, "grad_norm": 2.0053889751434326, "learning_rate": 6.180294646469679e-05, "loss": 43.4069, "step": 964 }, { "epoch": 0.09571513588573696, "grad_norm": 2.008760452270508, "learning_rate": 6.161289123450367e-05, "loss": 43.3277, "step": 965 }, { "epoch": 0.0958143225550486, "grad_norm": 2.0443673133850098, "learning_rate": 6.142299848693351e-05, "loss": 43.2367, "step": 966 }, { "epoch": 0.09591350922436025, "grad_norm": 1.8105210065841675, "learning_rate": 6.123326902575282e-05, "loss": 43.2775, "step": 967 }, { "epoch": 0.0960126958936719, "grad_norm": 1.9195630550384521, "learning_rate": 6.104370365403694e-05, "loss": 43.1919, "step": 968 }, { "epoch": 0.09611188256298353, "grad_norm": 1.6864792108535767, "learning_rate": 6.0854303174166636e-05, "loss": 43.3297, "step": 969 }, { "epoch": 0.09621106923229518, "grad_norm": 2.0158731937408447, "learning_rate": 6.06650683878248e-05, "loss": 43.4519, "step": 970 }, { "epoch": 0.09631025590160683, "grad_norm": 1.9543628692626953, "learning_rate": 6.0476000095992945e-05, "loss": 42.9919, "step": 971 }, { "epoch": 0.09640944257091846, "grad_norm": 1.844759225845337, "learning_rate": 6.0287099098947805e-05, "loss": 43.2794, "step": 972 }, { "epoch": 0.09650862924023011, "grad_norm": 1.6986547708511353, "learning_rate": 6.009836619625809e-05, "loss": 43.4787, "step": 973 }, { "epoch": 0.09660781590954176, "grad_norm": 1.8585858345031738, "learning_rate": 5.990980218678097e-05, "loss": 43.2107, "step": 974 }, { "epoch": 0.0967070025788534, "grad_norm": 1.8360117673873901, "learning_rate": 5.9721407868658674e-05, "loss": 43.3234, "step": 975 }, { "epoch": 0.09680618924816504, "grad_norm": 2.125579595565796, "learning_rate": 5.953318403931532e-05, "loss": 42.8494, "step": 976 }, { "epoch": 0.0969053759174767, "grad_norm": 2.0391440391540527, "learning_rate": 5.9345131495453274e-05, "loss": 43.0214, "step": 977 }, { "epoch": 0.09700456258678833, "grad_norm": 2.4064784049987793, "learning_rate": 5.915725103304992e-05, "loss": 42.9429, "step": 978 }, { "epoch": 0.09710374925609998, "grad_norm": 1.6260275840759277, "learning_rate": 5.896954344735426e-05, "loss": 43.4142, "step": 979 }, { "epoch": 0.09720293592541163, "grad_norm": 1.8077256679534912, "learning_rate": 5.878200953288358e-05, "loss": 43.4126, "step": 980 }, { "epoch": 0.09730212259472328, "grad_norm": 1.915785312652588, "learning_rate": 5.859465008342002e-05, "loss": 43.1531, "step": 981 }, { "epoch": 0.09740130926403491, "grad_norm": 2.13857102394104, "learning_rate": 5.840746589200732e-05, "loss": 43.1655, "step": 982 }, { "epoch": 0.09750049593334656, "grad_norm": 2.0060760974884033, "learning_rate": 5.8220457750947344e-05, "loss": 43.2617, "step": 983 }, { "epoch": 0.09759968260265821, "grad_norm": 1.6532596349716187, "learning_rate": 5.803362645179675e-05, "loss": 43.5712, "step": 984 }, { "epoch": 0.09769886927196984, "grad_norm": 2.157123327255249, "learning_rate": 5.784697278536379e-05, "loss": 42.9544, "step": 985 }, { "epoch": 0.09779805594128149, "grad_norm": 2.0179286003112793, "learning_rate": 5.766049754170471e-05, "loss": 43.4726, "step": 986 }, { "epoch": 0.09789724261059314, "grad_norm": 1.8653314113616943, "learning_rate": 5.747420151012064e-05, "loss": 43.1918, "step": 987 }, { "epoch": 0.09799642927990478, "grad_norm": 2.115196704864502, "learning_rate": 5.728808547915405e-05, "loss": 43.4468, "step": 988 }, { "epoch": 0.09809561594921642, "grad_norm": 2.247753620147705, "learning_rate": 5.7102150236585626e-05, "loss": 43.2648, "step": 989 }, { "epoch": 0.09819480261852807, "grad_norm": 1.9527291059494019, "learning_rate": 5.69163965694308e-05, "loss": 43.2446, "step": 990 }, { "epoch": 0.09829398928783971, "grad_norm": 1.9459764957427979, "learning_rate": 5.673082526393634e-05, "loss": 43.4385, "step": 991 }, { "epoch": 0.09839317595715136, "grad_norm": 2.1540186405181885, "learning_rate": 5.654543710557726e-05, "loss": 42.9314, "step": 992 }, { "epoch": 0.098492362626463, "grad_norm": 2.3612327575683594, "learning_rate": 5.636023287905324e-05, "loss": 43.4194, "step": 993 }, { "epoch": 0.09859154929577464, "grad_norm": 1.8521513938903809, "learning_rate": 5.617521336828556e-05, "loss": 43.5853, "step": 994 }, { "epoch": 0.09869073596508629, "grad_norm": 2.453296661376953, "learning_rate": 5.5990379356413495e-05, "loss": 43.4619, "step": 995 }, { "epoch": 0.09878992263439794, "grad_norm": 2.257439374923706, "learning_rate": 5.580573162579128e-05, "loss": 43.1267, "step": 996 }, { "epoch": 0.09888910930370959, "grad_norm": 2.043151617050171, "learning_rate": 5.5621270957984573e-05, "loss": 43.4254, "step": 997 }, { "epoch": 0.09898829597302122, "grad_norm": 1.9218149185180664, "learning_rate": 5.5436998133767345e-05, "loss": 43.1244, "step": 998 }, { "epoch": 0.09908748264233287, "grad_norm": 2.1752357482910156, "learning_rate": 5.525291393311835e-05, "loss": 43.2948, "step": 999 }, { "epoch": 0.09918666931164452, "grad_norm": 2.7527828216552734, "learning_rate": 5.506901913521808e-05, "loss": 43.5285, "step": 1000 }, { "epoch": 0.09928585598095616, "grad_norm": 1.906812310218811, "learning_rate": 5.48853145184452e-05, "loss": 43.3144, "step": 1001 }, { "epoch": 0.0993850426502678, "grad_norm": 1.603485345840454, "learning_rate": 5.470180086037353e-05, "loss": 43.4677, "step": 1002 }, { "epoch": 0.09948422931957945, "grad_norm": 1.8495134115219116, "learning_rate": 5.451847893776845e-05, "loss": 43.3694, "step": 1003 }, { "epoch": 0.09958341598889109, "grad_norm": 1.8568984270095825, "learning_rate": 5.4335349526583926e-05, "loss": 43.5146, "step": 1004 }, { "epoch": 0.09968260265820274, "grad_norm": 1.9716614484786987, "learning_rate": 5.415241340195903e-05, "loss": 43.5199, "step": 1005 }, { "epoch": 0.09978178932751439, "grad_norm": 2.1100313663482666, "learning_rate": 5.396967133821461e-05, "loss": 43.2311, "step": 1006 }, { "epoch": 0.09988097599682602, "grad_norm": 1.80048668384552, "learning_rate": 5.378712410885025e-05, "loss": 43.4196, "step": 1007 }, { "epoch": 0.09998016266613767, "grad_norm": 1.7968202829360962, "learning_rate": 5.36047724865407e-05, "loss": 43.347, "step": 1008 }, { "epoch": 0.10007934933544932, "grad_norm": 2.149846315383911, "learning_rate": 5.342261724313292e-05, "loss": 43.264, "step": 1009 }, { "epoch": 0.10017853600476095, "grad_norm": 2.306180715560913, "learning_rate": 5.324065914964248e-05, "loss": 43.5817, "step": 1010 }, { "epoch": 0.1002777226740726, "grad_norm": 2.3895561695098877, "learning_rate": 5.3058898976250624e-05, "loss": 43.1668, "step": 1011 }, { "epoch": 0.10037690934338425, "grad_norm": 2.045203447341919, "learning_rate": 5.28773374923007e-05, "loss": 43.2144, "step": 1012 }, { "epoch": 0.10047609601269589, "grad_norm": 1.7149254083633423, "learning_rate": 5.269597546629521e-05, "loss": 43.3314, "step": 1013 }, { "epoch": 0.10057528268200754, "grad_norm": 2.1053225994110107, "learning_rate": 5.251481366589226e-05, "loss": 43.2455, "step": 1014 }, { "epoch": 0.10067446935131918, "grad_norm": 2.0924081802368164, "learning_rate": 5.2333852857902575e-05, "loss": 43.3812, "step": 1015 }, { "epoch": 0.10077365602063083, "grad_norm": 1.7114591598510742, "learning_rate": 5.2153093808286016e-05, "loss": 43.2465, "step": 1016 }, { "epoch": 0.10087284268994247, "grad_norm": 1.7574715614318848, "learning_rate": 5.197253728214852e-05, "loss": 43.497, "step": 1017 }, { "epoch": 0.10097202935925412, "grad_norm": 1.9413849115371704, "learning_rate": 5.1792184043738855e-05, "loss": 43.0519, "step": 1018 }, { "epoch": 0.10107121602856577, "grad_norm": 1.9501484632492065, "learning_rate": 5.161203485644517e-05, "loss": 42.8536, "step": 1019 }, { "epoch": 0.1011704026978774, "grad_norm": 1.6921439170837402, "learning_rate": 5.143209048279208e-05, "loss": 43.5067, "step": 1020 }, { "epoch": 0.10126958936718905, "grad_norm": 1.8455314636230469, "learning_rate": 5.1252351684437136e-05, "loss": 43.2189, "step": 1021 }, { "epoch": 0.1013687760365007, "grad_norm": 2.301485061645508, "learning_rate": 5.107281922216788e-05, "loss": 43.4071, "step": 1022 }, { "epoch": 0.10146796270581233, "grad_norm": 1.9626744985580444, "learning_rate": 5.089349385589833e-05, "loss": 43.5991, "step": 1023 }, { "epoch": 0.10156714937512398, "grad_norm": 2.3110949993133545, "learning_rate": 5.071437634466609e-05, "loss": 42.8043, "step": 1024 }, { "epoch": 0.10166633604443563, "grad_norm": 1.7664554119110107, "learning_rate": 5.0535467446628825e-05, "loss": 43.3835, "step": 1025 }, { "epoch": 0.10176552271374727, "grad_norm": 2.0509145259857178, "learning_rate": 5.035676791906132e-05, "loss": 43.3937, "step": 1026 }, { "epoch": 0.10186470938305892, "grad_norm": 2.435389518737793, "learning_rate": 5.0178278518351983e-05, "loss": 43.3827, "step": 1027 }, { "epoch": 0.10196389605237056, "grad_norm": 1.8111709356307983, "learning_rate": 5.000000000000002e-05, "loss": 43.407, "step": 1028 }, { "epoch": 0.1020630827216822, "grad_norm": 1.8976213932037354, "learning_rate": 4.982193311861183e-05, "loss": 43.4389, "step": 1029 }, { "epoch": 0.10216226939099385, "grad_norm": 1.647125482559204, "learning_rate": 4.964407862789817e-05, "loss": 43.46, "step": 1030 }, { "epoch": 0.1022614560603055, "grad_norm": 2.009765386581421, "learning_rate": 4.9466437280670655e-05, "loss": 43.3748, "step": 1031 }, { "epoch": 0.10236064272961715, "grad_norm": 1.9777253866195679, "learning_rate": 4.928900982883883e-05, "loss": 43.4294, "step": 1032 }, { "epoch": 0.10245982939892878, "grad_norm": 2.074974775314331, "learning_rate": 4.911179702340688e-05, "loss": 43.2607, "step": 1033 }, { "epoch": 0.10255901606824043, "grad_norm": 2.69441556930542, "learning_rate": 4.8934799614470316e-05, "loss": 43.2071, "step": 1034 }, { "epoch": 0.10265820273755208, "grad_norm": 1.746802806854248, "learning_rate": 4.875801835121312e-05, "loss": 43.2512, "step": 1035 }, { "epoch": 0.10275738940686371, "grad_norm": 1.6910595893859863, "learning_rate": 4.85814539819042e-05, "loss": 43.3361, "step": 1036 }, { "epoch": 0.10285657607617536, "grad_norm": 1.9047490358352661, "learning_rate": 4.840510725389457e-05, "loss": 43.3071, "step": 1037 }, { "epoch": 0.10295576274548701, "grad_norm": 1.809377908706665, "learning_rate": 4.8228978913613886e-05, "loss": 43.2802, "step": 1038 }, { "epoch": 0.10305494941479865, "grad_norm": 1.7433143854141235, "learning_rate": 4.8053069706567554e-05, "loss": 43.0608, "step": 1039 }, { "epoch": 0.1031541360841103, "grad_norm": 2.1607906818389893, "learning_rate": 4.7877380377333326e-05, "loss": 42.9814, "step": 1040 }, { "epoch": 0.10325332275342194, "grad_norm": 2.052744150161743, "learning_rate": 4.7701911669558384e-05, "loss": 43.4076, "step": 1041 }, { "epoch": 0.10335250942273358, "grad_norm": 2.064326763153076, "learning_rate": 4.752666432595596e-05, "loss": 43.1904, "step": 1042 }, { "epoch": 0.10345169609204523, "grad_norm": 1.7264080047607422, "learning_rate": 4.7351639088302435e-05, "loss": 43.2059, "step": 1043 }, { "epoch": 0.10355088276135688, "grad_norm": 1.9093258380889893, "learning_rate": 4.717683669743397e-05, "loss": 43.3171, "step": 1044 }, { "epoch": 0.10365006943066851, "grad_norm": 1.9440662860870361, "learning_rate": 4.700225789324343e-05, "loss": 43.2336, "step": 1045 }, { "epoch": 0.10374925609998016, "grad_norm": 2.2268941402435303, "learning_rate": 4.682790341467753e-05, "loss": 43.3435, "step": 1046 }, { "epoch": 0.10384844276929181, "grad_norm": 2.077326774597168, "learning_rate": 4.665377399973321e-05, "loss": 43.0513, "step": 1047 }, { "epoch": 0.10394762943860344, "grad_norm": 2.223978042602539, "learning_rate": 4.647987038545496e-05, "loss": 42.9335, "step": 1048 }, { "epoch": 0.1040468161079151, "grad_norm": 1.950343370437622, "learning_rate": 4.6306193307931355e-05, "loss": 43.3709, "step": 1049 }, { "epoch": 0.10414600277722674, "grad_norm": 1.9032632112503052, "learning_rate": 4.613274350229226e-05, "loss": 43.4144, "step": 1050 }, { "epoch": 0.10424518944653839, "grad_norm": 2.298159599304199, "learning_rate": 4.595952170270542e-05, "loss": 42.8251, "step": 1051 }, { "epoch": 0.10434437611585003, "grad_norm": 2.5844788551330566, "learning_rate": 4.578652864237361e-05, "loss": 43.3193, "step": 1052 }, { "epoch": 0.10444356278516168, "grad_norm": 1.964105486869812, "learning_rate": 4.561376505353127e-05, "loss": 43.7334, "step": 1053 }, { "epoch": 0.10454274945447332, "grad_norm": 1.8807120323181152, "learning_rate": 4.544123166744172e-05, "loss": 43.3226, "step": 1054 }, { "epoch": 0.10464193612378496, "grad_norm": 1.9958428144454956, "learning_rate": 4.5268929214393704e-05, "loss": 43.3045, "step": 1055 }, { "epoch": 0.10474112279309661, "grad_norm": 2.212786912918091, "learning_rate": 4.509685842369867e-05, "loss": 43.3741, "step": 1056 }, { "epoch": 0.10484030946240826, "grad_norm": 2.0918195247650146, "learning_rate": 4.492502002368738e-05, "loss": 43.1454, "step": 1057 }, { "epoch": 0.10493949613171989, "grad_norm": 1.8375208377838135, "learning_rate": 4.4753414741706944e-05, "loss": 43.272, "step": 1058 }, { "epoch": 0.10503868280103154, "grad_norm": 1.8446433544158936, "learning_rate": 4.458204330411779e-05, "loss": 43.0881, "step": 1059 }, { "epoch": 0.10513786947034319, "grad_norm": 1.8900576829910278, "learning_rate": 4.4410906436290566e-05, "loss": 43.2045, "step": 1060 }, { "epoch": 0.10523705613965482, "grad_norm": 2.3998546600341797, "learning_rate": 4.4240004862603026e-05, "loss": 43.3991, "step": 1061 }, { "epoch": 0.10533624280896647, "grad_norm": 2.495123863220215, "learning_rate": 4.406933930643692e-05, "loss": 43.161, "step": 1062 }, { "epoch": 0.10543542947827812, "grad_norm": 2.1448473930358887, "learning_rate": 4.38989104901751e-05, "loss": 43.5213, "step": 1063 }, { "epoch": 0.10553461614758976, "grad_norm": 1.683742880821228, "learning_rate": 4.372871913519826e-05, "loss": 43.3546, "step": 1064 }, { "epoch": 0.1056338028169014, "grad_norm": 2.0388755798339844, "learning_rate": 4.355876596188209e-05, "loss": 43.3346, "step": 1065 }, { "epoch": 0.10573298948621306, "grad_norm": 1.9111831188201904, "learning_rate": 4.3389051689594e-05, "loss": 43.4083, "step": 1066 }, { "epoch": 0.1058321761555247, "grad_norm": 1.788818120956421, "learning_rate": 4.3219577036690315e-05, "loss": 43.3641, "step": 1067 }, { "epoch": 0.10593136282483634, "grad_norm": 1.7852380275726318, "learning_rate": 4.305034272051299e-05, "loss": 43.361, "step": 1068 }, { "epoch": 0.10603054949414799, "grad_norm": 1.7794376611709595, "learning_rate": 4.288134945738684e-05, "loss": 43.2712, "step": 1069 }, { "epoch": 0.10612973616345964, "grad_norm": 1.828147530555725, "learning_rate": 4.271259796261625e-05, "loss": 43.4838, "step": 1070 }, { "epoch": 0.10622892283277127, "grad_norm": 2.0267014503479004, "learning_rate": 4.2544088950482266e-05, "loss": 43.3498, "step": 1071 }, { "epoch": 0.10632810950208292, "grad_norm": 2.292487621307373, "learning_rate": 4.237582313423962e-05, "loss": 43.3811, "step": 1072 }, { "epoch": 0.10642729617139457, "grad_norm": 2.3983542919158936, "learning_rate": 4.2207801226113665e-05, "loss": 43.1442, "step": 1073 }, { "epoch": 0.1065264828407062, "grad_norm": 2.1029767990112305, "learning_rate": 4.2040023937297346e-05, "loss": 43.6896, "step": 1074 }, { "epoch": 0.10662566951001785, "grad_norm": 1.9081923961639404, "learning_rate": 4.187249197794813e-05, "loss": 43.3981, "step": 1075 }, { "epoch": 0.1067248561793295, "grad_norm": 1.906647801399231, "learning_rate": 4.1705206057185164e-05, "loss": 43.3082, "step": 1076 }, { "epoch": 0.10682404284864114, "grad_norm": 2.1955394744873047, "learning_rate": 4.153816688308609e-05, "loss": 43.451, "step": 1077 }, { "epoch": 0.10692322951795279, "grad_norm": 2.041405200958252, "learning_rate": 4.137137516268426e-05, "loss": 43.0773, "step": 1078 }, { "epoch": 0.10702241618726444, "grad_norm": 1.8755104541778564, "learning_rate": 4.1204831601965445e-05, "loss": 43.4287, "step": 1079 }, { "epoch": 0.10712160285657607, "grad_norm": 1.9361135959625244, "learning_rate": 4.10385369058652e-05, "loss": 43.4697, "step": 1080 }, { "epoch": 0.10722078952588772, "grad_norm": 2.084237575531006, "learning_rate": 4.0872491778265535e-05, "loss": 43.3185, "step": 1081 }, { "epoch": 0.10731997619519937, "grad_norm": 2.22873854637146, "learning_rate": 4.070669692199226e-05, "loss": 43.411, "step": 1082 }, { "epoch": 0.107419162864511, "grad_norm": 1.9395787715911865, "learning_rate": 4.054115303881174e-05, "loss": 43.272, "step": 1083 }, { "epoch": 0.10751834953382265, "grad_norm": 2.202510118484497, "learning_rate": 4.037586082942805e-05, "loss": 43.4715, "step": 1084 }, { "epoch": 0.1076175362031343, "grad_norm": 1.9438726902008057, "learning_rate": 4.021082099348006e-05, "loss": 43.1722, "step": 1085 }, { "epoch": 0.10771672287244595, "grad_norm": 2.385270833969116, "learning_rate": 4.004603422953827e-05, "loss": 43.3762, "step": 1086 }, { "epoch": 0.10781590954175758, "grad_norm": 1.7869514226913452, "learning_rate": 3.988150123510224e-05, "loss": 43.6321, "step": 1087 }, { "epoch": 0.10791509621106923, "grad_norm": 2.123931646347046, "learning_rate": 3.971722270659712e-05, "loss": 43.3388, "step": 1088 }, { "epoch": 0.10801428288038088, "grad_norm": 2.030195713043213, "learning_rate": 3.955319933937116e-05, "loss": 42.8098, "step": 1089 }, { "epoch": 0.10811346954969252, "grad_norm": 2.3113832473754883, "learning_rate": 3.938943182769246e-05, "loss": 43.4552, "step": 1090 }, { "epoch": 0.10821265621900417, "grad_norm": 2.05252742767334, "learning_rate": 3.922592086474624e-05, "loss": 43.4441, "step": 1091 }, { "epoch": 0.10831184288831582, "grad_norm": 2.1791186332702637, "learning_rate": 3.906266714263171e-05, "loss": 43.2954, "step": 1092 }, { "epoch": 0.10841102955762745, "grad_norm": 2.1548943519592285, "learning_rate": 3.88996713523594e-05, "loss": 43.3552, "step": 1093 }, { "epoch": 0.1085102162269391, "grad_norm": 1.7970424890518188, "learning_rate": 3.873693418384795e-05, "loss": 43.2561, "step": 1094 }, { "epoch": 0.10860940289625075, "grad_norm": 2.10015869140625, "learning_rate": 3.857445632592132e-05, "loss": 43.2973, "step": 1095 }, { "epoch": 0.10870858956556238, "grad_norm": 1.962088704109192, "learning_rate": 3.841223846630599e-05, "loss": 43.254, "step": 1096 }, { "epoch": 0.10880777623487403, "grad_norm": 2.2271792888641357, "learning_rate": 3.825028129162781e-05, "loss": 43.448, "step": 1097 }, { "epoch": 0.10890696290418568, "grad_norm": 1.9572752714157104, "learning_rate": 3.808858548740935e-05, "loss": 43.1279, "step": 1098 }, { "epoch": 0.10900614957349732, "grad_norm": 1.7174750566482544, "learning_rate": 3.792715173806669e-05, "loss": 43.2553, "step": 1099 }, { "epoch": 0.10910533624280896, "grad_norm": 2.2040371894836426, "learning_rate": 3.776598072690686e-05, "loss": 43.2612, "step": 1100 }, { "epoch": 0.10920452291212061, "grad_norm": 1.800062894821167, "learning_rate": 3.760507313612472e-05, "loss": 43.561, "step": 1101 }, { "epoch": 0.10930370958143226, "grad_norm": 1.809275507926941, "learning_rate": 3.74444296468002e-05, "loss": 43.4344, "step": 1102 }, { "epoch": 0.1094028962507439, "grad_norm": 2.1517601013183594, "learning_rate": 3.728405093889522e-05, "loss": 43.4366, "step": 1103 }, { "epoch": 0.10950208292005555, "grad_norm": 2.1089038848876953, "learning_rate": 3.7123937691251106e-05, "loss": 43.5134, "step": 1104 }, { "epoch": 0.1096012695893672, "grad_norm": 1.9352220296859741, "learning_rate": 3.696409058158544e-05, "loss": 43.1101, "step": 1105 }, { "epoch": 0.10970045625867883, "grad_norm": 2.1643447875976562, "learning_rate": 3.68045102864894e-05, "loss": 43.5045, "step": 1106 }, { "epoch": 0.10979964292799048, "grad_norm": 2.143470287322998, "learning_rate": 3.6645197481424767e-05, "loss": 43.4478, "step": 1107 }, { "epoch": 0.10989882959730213, "grad_norm": 2.286166191101074, "learning_rate": 3.6486152840721046e-05, "loss": 43.152, "step": 1108 }, { "epoch": 0.10999801626661376, "grad_norm": 1.9622173309326172, "learning_rate": 3.632737703757282e-05, "loss": 43.3474, "step": 1109 }, { "epoch": 0.11009720293592541, "grad_norm": 1.9437733888626099, "learning_rate": 3.616887074403659e-05, "loss": 43.1433, "step": 1110 }, { "epoch": 0.11019638960523706, "grad_norm": 1.769162654876709, "learning_rate": 3.6010634631028226e-05, "loss": 43.5857, "step": 1111 }, { "epoch": 0.1102955762745487, "grad_norm": 1.9913947582244873, "learning_rate": 3.58526693683199e-05, "loss": 43.1546, "step": 1112 }, { "epoch": 0.11039476294386034, "grad_norm": 2.0115365982055664, "learning_rate": 3.5694975624537444e-05, "loss": 43.4062, "step": 1113 }, { "epoch": 0.110493949613172, "grad_norm": 1.8967533111572266, "learning_rate": 3.553755406715724e-05, "loss": 43.2579, "step": 1114 }, { "epoch": 0.11059313628248363, "grad_norm": 1.8920150995254517, "learning_rate": 3.5380405362503855e-05, "loss": 43.3123, "step": 1115 }, { "epoch": 0.11069232295179528, "grad_norm": 2.0870227813720703, "learning_rate": 3.5223530175746666e-05, "loss": 43.0646, "step": 1116 }, { "epoch": 0.11079150962110693, "grad_norm": 2.0629279613494873, "learning_rate": 3.506692917089751e-05, "loss": 43.2129, "step": 1117 }, { "epoch": 0.11089069629041856, "grad_norm": 1.740701675415039, "learning_rate": 3.491060301080754e-05, "loss": 43.1896, "step": 1118 }, { "epoch": 0.11098988295973021, "grad_norm": 2.2692818641662598, "learning_rate": 3.475455235716471e-05, "loss": 43.0347, "step": 1119 }, { "epoch": 0.11108906962904186, "grad_norm": 2.0202059745788574, "learning_rate": 3.459877787049072e-05, "loss": 43.1927, "step": 1120 }, { "epoch": 0.11118825629835351, "grad_norm": 1.9879977703094482, "learning_rate": 3.4443280210138305e-05, "loss": 43.2057, "step": 1121 }, { "epoch": 0.11128744296766514, "grad_norm": 1.8173496723175049, "learning_rate": 3.4288060034288604e-05, "loss": 43.229, "step": 1122 }, { "epoch": 0.11138662963697679, "grad_norm": 2.0935144424438477, "learning_rate": 3.413311799994808e-05, "loss": 43.5347, "step": 1123 }, { "epoch": 0.11148581630628844, "grad_norm": 2.2023987770080566, "learning_rate": 3.3978454762946036e-05, "loss": 43.0691, "step": 1124 }, { "epoch": 0.11158500297560008, "grad_norm": 1.9284822940826416, "learning_rate": 3.3824070977931554e-05, "loss": 42.8539, "step": 1125 }, { "epoch": 0.11168418964491172, "grad_norm": 2.1574866771698, "learning_rate": 3.366996729837102e-05, "loss": 43.2193, "step": 1126 }, { "epoch": 0.11178337631422337, "grad_norm": 2.4391090869903564, "learning_rate": 3.351614437654506e-05, "loss": 42.9505, "step": 1127 }, { "epoch": 0.11188256298353501, "grad_norm": 2.0434987545013428, "learning_rate": 3.336260286354602e-05, "loss": 43.52, "step": 1128 }, { "epoch": 0.11198174965284666, "grad_norm": 2.463559627532959, "learning_rate": 3.320934340927513e-05, "loss": 43.3699, "step": 1129 }, { "epoch": 0.1120809363221583, "grad_norm": 2.0348167419433594, "learning_rate": 3.3056366662439685e-05, "loss": 43.3942, "step": 1130 }, { "epoch": 0.11218012299146994, "grad_norm": 1.9807182550430298, "learning_rate": 3.290367327055034e-05, "loss": 43.4093, "step": 1131 }, { "epoch": 0.11227930966078159, "grad_norm": 1.9169766902923584, "learning_rate": 3.275126387991847e-05, "loss": 43.3948, "step": 1132 }, { "epoch": 0.11237849633009324, "grad_norm": 1.9982572793960571, "learning_rate": 3.2599139135653246e-05, "loss": 42.9981, "step": 1133 }, { "epoch": 0.11247768299940487, "grad_norm": 2.030264377593994, "learning_rate": 3.2447299681659015e-05, "loss": 43.0078, "step": 1134 }, { "epoch": 0.11257686966871652, "grad_norm": 1.9856185913085938, "learning_rate": 3.229574616063268e-05, "loss": 43.2602, "step": 1135 }, { "epoch": 0.11267605633802817, "grad_norm": 2.3267641067504883, "learning_rate": 3.2144479214060695e-05, "loss": 42.7966, "step": 1136 }, { "epoch": 0.11277524300733982, "grad_norm": 2.511244058609009, "learning_rate": 3.199349948221669e-05, "loss": 43.5626, "step": 1137 }, { "epoch": 0.11287442967665146, "grad_norm": 2.2020349502563477, "learning_rate": 3.184280760415843e-05, "loss": 43.3367, "step": 1138 }, { "epoch": 0.1129736163459631, "grad_norm": 1.8621561527252197, "learning_rate": 3.1692404217725414e-05, "loss": 43.4803, "step": 1139 }, { "epoch": 0.11307280301527475, "grad_norm": 1.9662930965423584, "learning_rate": 3.15422899595359e-05, "loss": 43.5218, "step": 1140 }, { "epoch": 0.11317198968458639, "grad_norm": 1.966377854347229, "learning_rate": 3.1392465464984455e-05, "loss": 43.2068, "step": 1141 }, { "epoch": 0.11327117635389804, "grad_norm": 1.8065776824951172, "learning_rate": 3.1242931368239026e-05, "loss": 43.3088, "step": 1142 }, { "epoch": 0.11337036302320969, "grad_norm": 2.2194085121154785, "learning_rate": 3.109368830223858e-05, "loss": 43.2588, "step": 1143 }, { "epoch": 0.11346954969252132, "grad_norm": 2.152547597885132, "learning_rate": 3.094473689869002e-05, "loss": 43.5761, "step": 1144 }, { "epoch": 0.11356873636183297, "grad_norm": 1.8473504781723022, "learning_rate": 3.0796077788065805e-05, "loss": 43.3933, "step": 1145 }, { "epoch": 0.11366792303114462, "grad_norm": 2.2697126865386963, "learning_rate": 3.0647711599601225e-05, "loss": 43.4602, "step": 1146 }, { "epoch": 0.11376710970045625, "grad_norm": 2.197999954223633, "learning_rate": 3.0499638961291623e-05, "loss": 43.4497, "step": 1147 }, { "epoch": 0.1138662963697679, "grad_norm": 1.796280026435852, "learning_rate": 3.035186049988994e-05, "loss": 43.4344, "step": 1148 }, { "epoch": 0.11396548303907955, "grad_norm": 1.6256227493286133, "learning_rate": 3.0204376840903792e-05, "loss": 43.4667, "step": 1149 }, { "epoch": 0.11406466970839119, "grad_norm": 2.233433961868286, "learning_rate": 3.0057188608593147e-05, "loss": 42.9149, "step": 1150 }, { "epoch": 0.11416385637770284, "grad_norm": 2.412339687347412, "learning_rate": 2.9910296425967322e-05, "loss": 43.5334, "step": 1151 }, { "epoch": 0.11426304304701448, "grad_norm": 1.5818597078323364, "learning_rate": 2.97637009147827e-05, "loss": 43.4461, "step": 1152 }, { "epoch": 0.11436222971632612, "grad_norm": 1.9922378063201904, "learning_rate": 2.9617402695539808e-05, "loss": 43.2253, "step": 1153 }, { "epoch": 0.11446141638563777, "grad_norm": 2.319293260574341, "learning_rate": 2.947140238748093e-05, "loss": 43.4716, "step": 1154 }, { "epoch": 0.11456060305494942, "grad_norm": 2.4096126556396484, "learning_rate": 2.9325700608587216e-05, "loss": 43.2436, "step": 1155 }, { "epoch": 0.11456060305494942, "eval_loss": 10.823346138000488, "eval_runtime": 11.1466, "eval_samples_per_second": 380.835, "eval_steps_per_second": 190.462, "step": 1155 }, { "epoch": 0.11465978972426107, "grad_norm": 1.9147896766662598, "learning_rate": 2.9180297975576364e-05, "loss": 43.2374, "step": 1156 }, { "epoch": 0.1147589763935727, "grad_norm": 1.8915377855300903, "learning_rate": 2.9035195103899825e-05, "loss": 43.0964, "step": 1157 }, { "epoch": 0.11485816306288435, "grad_norm": 2.041377305984497, "learning_rate": 2.889039260774018e-05, "loss": 43.2494, "step": 1158 }, { "epoch": 0.114957349732196, "grad_norm": 2.3205907344818115, "learning_rate": 2.8745891100008683e-05, "loss": 43.4031, "step": 1159 }, { "epoch": 0.11505653640150763, "grad_norm": 2.3421196937561035, "learning_rate": 2.8601691192342493e-05, "loss": 43.2697, "step": 1160 }, { "epoch": 0.11515572307081928, "grad_norm": 2.095224380493164, "learning_rate": 2.845779349510227e-05, "loss": 43.666, "step": 1161 }, { "epoch": 0.11525490974013093, "grad_norm": 2.259864568710327, "learning_rate": 2.83141986173694e-05, "loss": 42.9196, "step": 1162 }, { "epoch": 0.11535409640944257, "grad_norm": 2.165755033493042, "learning_rate": 2.817090716694363e-05, "loss": 43.5717, "step": 1163 }, { "epoch": 0.11545328307875422, "grad_norm": 2.1785712242126465, "learning_rate": 2.802791975034024e-05, "loss": 43.2673, "step": 1164 }, { "epoch": 0.11555246974806586, "grad_norm": 1.9809575080871582, "learning_rate": 2.788523697278773e-05, "loss": 43.2572, "step": 1165 }, { "epoch": 0.1156516564173775, "grad_norm": 1.791969656944275, "learning_rate": 2.7742859438225055e-05, "loss": 43.4034, "step": 1166 }, { "epoch": 0.11575084308668915, "grad_norm": 2.295616388320923, "learning_rate": 2.7600787749299263e-05, "loss": 43.4226, "step": 1167 }, { "epoch": 0.1158500297560008, "grad_norm": 1.8220045566558838, "learning_rate": 2.7459022507362686e-05, "loss": 43.349, "step": 1168 }, { "epoch": 0.11594921642531243, "grad_norm": 1.9346619844436646, "learning_rate": 2.731756431247072e-05, "loss": 43.2268, "step": 1169 }, { "epoch": 0.11604840309462408, "grad_norm": 1.8838238716125488, "learning_rate": 2.717641376337895e-05, "loss": 43.0441, "step": 1170 }, { "epoch": 0.11614758976393573, "grad_norm": 2.1250696182250977, "learning_rate": 2.7035571457540865e-05, "loss": 43.1962, "step": 1171 }, { "epoch": 0.11624677643324736, "grad_norm": 1.7829358577728271, "learning_rate": 2.6895037991105286e-05, "loss": 43.4272, "step": 1172 }, { "epoch": 0.11634596310255901, "grad_norm": 1.9922783374786377, "learning_rate": 2.675481395891365e-05, "loss": 43.112, "step": 1173 }, { "epoch": 0.11644514977187066, "grad_norm": 2.087114095687866, "learning_rate": 2.6614899954497795e-05, "loss": 43.0554, "step": 1174 }, { "epoch": 0.11654433644118231, "grad_norm": 2.154203414916992, "learning_rate": 2.6475296570077158e-05, "loss": 43.0469, "step": 1175 }, { "epoch": 0.11664352311049395, "grad_norm": 2.1721832752227783, "learning_rate": 2.6336004396556534e-05, "loss": 43.5545, "step": 1176 }, { "epoch": 0.1167427097798056, "grad_norm": 1.7544825077056885, "learning_rate": 2.619702402352332e-05, "loss": 43.5061, "step": 1177 }, { "epoch": 0.11684189644911724, "grad_norm": 2.276806592941284, "learning_rate": 2.6058356039245246e-05, "loss": 43.3631, "step": 1178 }, { "epoch": 0.11694108311842888, "grad_norm": 1.8149032592773438, "learning_rate": 2.5920001030667684e-05, "loss": 43.539, "step": 1179 }, { "epoch": 0.11704026978774053, "grad_norm": 2.0274498462677, "learning_rate": 2.5781959583411374e-05, "loss": 43.03, "step": 1180 }, { "epoch": 0.11713945645705218, "grad_norm": 2.6130144596099854, "learning_rate": 2.564423228176971e-05, "loss": 42.7279, "step": 1181 }, { "epoch": 0.11723864312636381, "grad_norm": 1.7635635137557983, "learning_rate": 2.5506819708706507e-05, "loss": 43.1984, "step": 1182 }, { "epoch": 0.11733782979567546, "grad_norm": 2.270341157913208, "learning_rate": 2.5369722445853304e-05, "loss": 43.2886, "step": 1183 }, { "epoch": 0.11743701646498711, "grad_norm": 2.2385268211364746, "learning_rate": 2.523294107350711e-05, "loss": 43.0718, "step": 1184 }, { "epoch": 0.11753620313429874, "grad_norm": 2.09205961227417, "learning_rate": 2.5096476170627825e-05, "loss": 43.4038, "step": 1185 }, { "epoch": 0.1176353898036104, "grad_norm": 1.9170998334884644, "learning_rate": 2.4960328314835745e-05, "loss": 43.4337, "step": 1186 }, { "epoch": 0.11773457647292204, "grad_norm": 2.0192651748657227, "learning_rate": 2.482449808240931e-05, "loss": 43.3694, "step": 1187 }, { "epoch": 0.11783376314223368, "grad_norm": 2.1703100204467773, "learning_rate": 2.4688986048282425e-05, "loss": 43.1931, "step": 1188 }, { "epoch": 0.11793294981154533, "grad_norm": 2.178591728210449, "learning_rate": 2.4553792786042262e-05, "loss": 43.0468, "step": 1189 }, { "epoch": 0.11803213648085698, "grad_norm": 1.8173599243164062, "learning_rate": 2.4418918867926577e-05, "loss": 43.1974, "step": 1190 }, { "epoch": 0.11813132315016862, "grad_norm": 2.2965517044067383, "learning_rate": 2.4284364864821563e-05, "loss": 43.2255, "step": 1191 }, { "epoch": 0.11823050981948026, "grad_norm": 2.666412115097046, "learning_rate": 2.4150131346259197e-05, "loss": 43.412, "step": 1192 }, { "epoch": 0.11832969648879191, "grad_norm": 2.0623011589050293, "learning_rate": 2.4016218880414998e-05, "loss": 42.8318, "step": 1193 }, { "epoch": 0.11842888315810356, "grad_norm": 2.0412418842315674, "learning_rate": 2.388262803410547e-05, "loss": 43.3501, "step": 1194 }, { "epoch": 0.11852806982741519, "grad_norm": 1.9982753992080688, "learning_rate": 2.3749359372785883e-05, "loss": 43.4671, "step": 1195 }, { "epoch": 0.11862725649672684, "grad_norm": 1.9406991004943848, "learning_rate": 2.3616413460547702e-05, "loss": 43.2969, "step": 1196 }, { "epoch": 0.11872644316603849, "grad_norm": 1.9013112783432007, "learning_rate": 2.3483790860116316e-05, "loss": 43.5889, "step": 1197 }, { "epoch": 0.11882562983535012, "grad_norm": 1.7011783123016357, "learning_rate": 2.3351492132848664e-05, "loss": 43.3593, "step": 1198 }, { "epoch": 0.11892481650466177, "grad_norm": 1.7616310119628906, "learning_rate": 2.3219517838730686e-05, "loss": 43.3636, "step": 1199 }, { "epoch": 0.11902400317397342, "grad_norm": 1.9877979755401611, "learning_rate": 2.3087868536375234e-05, "loss": 43.4894, "step": 1200 }, { "epoch": 0.11912318984328506, "grad_norm": 2.02473521232605, "learning_rate": 2.2956544783019418e-05, "loss": 43.1826, "step": 1201 }, { "epoch": 0.1192223765125967, "grad_norm": 1.9803928136825562, "learning_rate": 2.2825547134522495e-05, "loss": 43.4213, "step": 1202 }, { "epoch": 0.11932156318190835, "grad_norm": 2.13665771484375, "learning_rate": 2.2694876145363308e-05, "loss": 43.2816, "step": 1203 }, { "epoch": 0.11942074985121999, "grad_norm": 2.1694679260253906, "learning_rate": 2.2564532368638146e-05, "loss": 43.4692, "step": 1204 }, { "epoch": 0.11951993652053164, "grad_norm": 2.1723930835723877, "learning_rate": 2.2434516356058177e-05, "loss": 43.3492, "step": 1205 }, { "epoch": 0.11961912318984329, "grad_norm": 2.2894623279571533, "learning_rate": 2.230482865794733e-05, "loss": 43.2695, "step": 1206 }, { "epoch": 0.11971830985915492, "grad_norm": 1.7843515872955322, "learning_rate": 2.2175469823239768e-05, "loss": 43.1204, "step": 1207 }, { "epoch": 0.11981749652846657, "grad_norm": 1.960874080657959, "learning_rate": 2.2046440399477762e-05, "loss": 43.4489, "step": 1208 }, { "epoch": 0.11991668319777822, "grad_norm": 1.8730344772338867, "learning_rate": 2.1917740932809173e-05, "loss": 42.9379, "step": 1209 }, { "epoch": 0.12001586986708987, "grad_norm": 2.100027322769165, "learning_rate": 2.1789371967985338e-05, "loss": 43.5234, "step": 1210 }, { "epoch": 0.1201150565364015, "grad_norm": 2.344916820526123, "learning_rate": 2.1661334048358573e-05, "loss": 43.3028, "step": 1211 }, { "epoch": 0.12021424320571315, "grad_norm": 1.93478262424469, "learning_rate": 2.1533627715880023e-05, "loss": 42.8419, "step": 1212 }, { "epoch": 0.1203134298750248, "grad_norm": 2.0599870681762695, "learning_rate": 2.140625351109733e-05, "loss": 43.5079, "step": 1213 }, { "epoch": 0.12041261654433644, "grad_norm": 2.1910853385925293, "learning_rate": 2.1279211973152234e-05, "loss": 43.2715, "step": 1214 }, { "epoch": 0.12051180321364809, "grad_norm": 1.9811651706695557, "learning_rate": 2.115250363977851e-05, "loss": 43.4015, "step": 1215 }, { "epoch": 0.12061098988295973, "grad_norm": 1.7748483419418335, "learning_rate": 2.1026129047299436e-05, "loss": 43.3905, "step": 1216 }, { "epoch": 0.12071017655227137, "grad_norm": 1.954972267150879, "learning_rate": 2.0900088730625755e-05, "loss": 43.2451, "step": 1217 }, { "epoch": 0.12080936322158302, "grad_norm": 2.3269190788269043, "learning_rate": 2.0774383223253225e-05, "loss": 43.2175, "step": 1218 }, { "epoch": 0.12090854989089467, "grad_norm": 2.1407523155212402, "learning_rate": 2.0649013057260546e-05, "loss": 43.0747, "step": 1219 }, { "epoch": 0.1210077365602063, "grad_norm": 1.73319673538208, "learning_rate": 2.0523978763306873e-05, "loss": 43.4118, "step": 1220 }, { "epoch": 0.12110692322951795, "grad_norm": 1.7930755615234375, "learning_rate": 2.0399280870629823e-05, "loss": 43.1181, "step": 1221 }, { "epoch": 0.1212061098988296, "grad_norm": 2.0965116024017334, "learning_rate": 2.0274919907043033e-05, "loss": 43.3303, "step": 1222 }, { "epoch": 0.12130529656814124, "grad_norm": 1.9727377891540527, "learning_rate": 2.0150896398934073e-05, "loss": 43.2962, "step": 1223 }, { "epoch": 0.12140448323745288, "grad_norm": 2.1021900177001953, "learning_rate": 2.0027210871262068e-05, "loss": 43.2473, "step": 1224 }, { "epoch": 0.12150366990676453, "grad_norm": 2.155071258544922, "learning_rate": 1.9903863847555648e-05, "loss": 43.3419, "step": 1225 }, { "epoch": 0.12160285657607618, "grad_norm": 2.007256507873535, "learning_rate": 1.9780855849910605e-05, "loss": 43.5286, "step": 1226 }, { "epoch": 0.12170204324538782, "grad_norm": 2.027193546295166, "learning_rate": 1.9658187398987692e-05, "loss": 43.5892, "step": 1227 }, { "epoch": 0.12180122991469947, "grad_norm": 1.863158941268921, "learning_rate": 1.9535859014010526e-05, "loss": 43.0008, "step": 1228 }, { "epoch": 0.12190041658401111, "grad_norm": 1.9494130611419678, "learning_rate": 1.9413871212763212e-05, "loss": 43.3241, "step": 1229 }, { "epoch": 0.12199960325332275, "grad_norm": 1.9798649549484253, "learning_rate": 1.9292224511588365e-05, "loss": 43.1907, "step": 1230 }, { "epoch": 0.1220987899226344, "grad_norm": 2.2833685874938965, "learning_rate": 1.917091942538469e-05, "loss": 43.007, "step": 1231 }, { "epoch": 0.12219797659194605, "grad_norm": 2.0100409984588623, "learning_rate": 1.9049956467605055e-05, "loss": 43.1298, "step": 1232 }, { "epoch": 0.12229716326125768, "grad_norm": 2.5046565532684326, "learning_rate": 1.8929336150254064e-05, "loss": 43.3805, "step": 1233 }, { "epoch": 0.12239634993056933, "grad_norm": 1.741270661354065, "learning_rate": 1.880905898388612e-05, "loss": 43.0911, "step": 1234 }, { "epoch": 0.12249553659988098, "grad_norm": 2.1863491535186768, "learning_rate": 1.8689125477603063e-05, "loss": 43.0767, "step": 1235 }, { "epoch": 0.12259472326919262, "grad_norm": 2.0383381843566895, "learning_rate": 1.8569536139052213e-05, "loss": 43.4303, "step": 1236 }, { "epoch": 0.12269390993850426, "grad_norm": 2.046947956085205, "learning_rate": 1.8450291474423998e-05, "loss": 43.3978, "step": 1237 }, { "epoch": 0.12279309660781591, "grad_norm": 2.215596914291382, "learning_rate": 1.8331391988450052e-05, "loss": 43.1113, "step": 1238 }, { "epoch": 0.12289228327712755, "grad_norm": 1.731846570968628, "learning_rate": 1.8212838184400838e-05, "loss": 43.5589, "step": 1239 }, { "epoch": 0.1229914699464392, "grad_norm": 2.037714719772339, "learning_rate": 1.8094630564083736e-05, "loss": 43.4503, "step": 1240 }, { "epoch": 0.12309065661575085, "grad_norm": 2.0696065425872803, "learning_rate": 1.7976769627840806e-05, "loss": 42.9741, "step": 1241 }, { "epoch": 0.12318984328506248, "grad_norm": 2.007958173751831, "learning_rate": 1.785925587454661e-05, "loss": 43.2596, "step": 1242 }, { "epoch": 0.12328902995437413, "grad_norm": 2.0569231510162354, "learning_rate": 1.7742089801606276e-05, "loss": 43.4014, "step": 1243 }, { "epoch": 0.12338821662368578, "grad_norm": 2.107531785964966, "learning_rate": 1.762527190495319e-05, "loss": 43.0654, "step": 1244 }, { "epoch": 0.12348740329299743, "grad_norm": 1.771912693977356, "learning_rate": 1.750880267904711e-05, "loss": 43.5022, "step": 1245 }, { "epoch": 0.12358658996230906, "grad_norm": 2.4979348182678223, "learning_rate": 1.7392682616871837e-05, "loss": 43.4391, "step": 1246 }, { "epoch": 0.12368577663162071, "grad_norm": 2.4610869884490967, "learning_rate": 1.7276912209933392e-05, "loss": 43.3589, "step": 1247 }, { "epoch": 0.12378496330093236, "grad_norm": 1.9890514612197876, "learning_rate": 1.7161491948257657e-05, "loss": 43.2026, "step": 1248 }, { "epoch": 0.123884149970244, "grad_norm": 2.3719217777252197, "learning_rate": 1.7046422320388556e-05, "loss": 43.1041, "step": 1249 }, { "epoch": 0.12398333663955564, "grad_norm": 2.3359856605529785, "learning_rate": 1.693170381338578e-05, "loss": 43.5385, "step": 1250 }, { "epoch": 0.12408252330886729, "grad_norm": 1.738102912902832, "learning_rate": 1.6817336912822912e-05, "loss": 43.344, "step": 1251 }, { "epoch": 0.12418170997817893, "grad_norm": 2.1407241821289062, "learning_rate": 1.6703322102785168e-05, "loss": 43.3036, "step": 1252 }, { "epoch": 0.12428089664749058, "grad_norm": 1.9617172479629517, "learning_rate": 1.6589659865867534e-05, "loss": 43.4202, "step": 1253 }, { "epoch": 0.12438008331680223, "grad_norm": 2.001938819885254, "learning_rate": 1.647635068317265e-05, "loss": 43.2289, "step": 1254 }, { "epoch": 0.12447926998611386, "grad_norm": 2.245469331741333, "learning_rate": 1.6363395034308703e-05, "loss": 43.215, "step": 1255 }, { "epoch": 0.12457845665542551, "grad_norm": 2.2097630500793457, "learning_rate": 1.6250793397387555e-05, "loss": 43.4554, "step": 1256 }, { "epoch": 0.12467764332473716, "grad_norm": 2.113950729370117, "learning_rate": 1.6138546249022513e-05, "loss": 43.2097, "step": 1257 }, { "epoch": 0.1247768299940488, "grad_norm": 2.116868019104004, "learning_rate": 1.6026654064326553e-05, "loss": 43.3248, "step": 1258 }, { "epoch": 0.12487601666336044, "grad_norm": 2.185568332672119, "learning_rate": 1.5915117316910055e-05, "loss": 43.0621, "step": 1259 }, { "epoch": 0.12497520333267209, "grad_norm": 2.0478570461273193, "learning_rate": 1.5803936478879057e-05, "loss": 43.1099, "step": 1260 }, { "epoch": 0.12507439000198373, "grad_norm": 2.1610467433929443, "learning_rate": 1.5693112020833013e-05, "loss": 43.2475, "step": 1261 }, { "epoch": 0.1251735766712954, "grad_norm": 1.7375211715698242, "learning_rate": 1.558264441186299e-05, "loss": 43.4413, "step": 1262 }, { "epoch": 0.12527276334060702, "grad_norm": 1.689758539199829, "learning_rate": 1.547253411954953e-05, "loss": 43.1902, "step": 1263 }, { "epoch": 0.12537195000991866, "grad_norm": 2.010601043701172, "learning_rate": 1.5362781609960852e-05, "loss": 43.3252, "step": 1264 }, { "epoch": 0.12547113667923032, "grad_norm": 2.1295485496520996, "learning_rate": 1.5253387347650661e-05, "loss": 43.4037, "step": 1265 }, { "epoch": 0.12557032334854196, "grad_norm": 2.2790913581848145, "learning_rate": 1.5144351795656387e-05, "loss": 43.5434, "step": 1266 }, { "epoch": 0.1256695100178536, "grad_norm": 2.4958717823028564, "learning_rate": 1.5035675415497063e-05, "loss": 43.0798, "step": 1267 }, { "epoch": 0.12576869668716525, "grad_norm": 2.2541685104370117, "learning_rate": 1.4927358667171476e-05, "loss": 43.1865, "step": 1268 }, { "epoch": 0.1258678833564769, "grad_norm": 1.9234634637832642, "learning_rate": 1.4819402009156214e-05, "loss": 43.5065, "step": 1269 }, { "epoch": 0.12596707002578852, "grad_norm": 2.2710721492767334, "learning_rate": 1.471180589840363e-05, "loss": 43.2936, "step": 1270 }, { "epoch": 0.1260662566951002, "grad_norm": 2.3189542293548584, "learning_rate": 1.4604570790340023e-05, "loss": 43.1946, "step": 1271 }, { "epoch": 0.12616544336441182, "grad_norm": 2.0585691928863525, "learning_rate": 1.449769713886363e-05, "loss": 43.1392, "step": 1272 }, { "epoch": 0.12626463003372346, "grad_norm": 2.1090476512908936, "learning_rate": 1.4391185396342789e-05, "loss": 43.2654, "step": 1273 }, { "epoch": 0.12636381670303512, "grad_norm": 1.9602532386779785, "learning_rate": 1.4285036013613872e-05, "loss": 42.953, "step": 1274 }, { "epoch": 0.12646300337234676, "grad_norm": 2.2727339267730713, "learning_rate": 1.4179249439979613e-05, "loss": 43.0101, "step": 1275 }, { "epoch": 0.1265621900416584, "grad_norm": 2.0381500720977783, "learning_rate": 1.4073826123206946e-05, "loss": 43.1364, "step": 1276 }, { "epoch": 0.12666137671097005, "grad_norm": 2.0481743812561035, "learning_rate": 1.3968766509525322e-05, "loss": 43.5437, "step": 1277 }, { "epoch": 0.1267605633802817, "grad_norm": 2.1864757537841797, "learning_rate": 1.386407104362467e-05, "loss": 43.0872, "step": 1278 }, { "epoch": 0.12685975004959332, "grad_norm": 2.3352155685424805, "learning_rate": 1.375974016865359e-05, "loss": 43.2959, "step": 1279 }, { "epoch": 0.12695893671890499, "grad_norm": 2.0664021968841553, "learning_rate": 1.3655774326217507e-05, "loss": 43.2873, "step": 1280 }, { "epoch": 0.12705812338821662, "grad_norm": 2.0694098472595215, "learning_rate": 1.3552173956376668e-05, "loss": 43.305, "step": 1281 }, { "epoch": 0.12715731005752826, "grad_norm": 2.1970510482788086, "learning_rate": 1.3448939497644509e-05, "loss": 43.1985, "step": 1282 }, { "epoch": 0.12725649672683992, "grad_norm": 1.886986494064331, "learning_rate": 1.3346071386985515e-05, "loss": 43.5169, "step": 1283 }, { "epoch": 0.12735568339615155, "grad_norm": 1.7438634634017944, "learning_rate": 1.3243570059813626e-05, "loss": 43.2525, "step": 1284 }, { "epoch": 0.1274548700654632, "grad_norm": 2.0268194675445557, "learning_rate": 1.3141435949990188e-05, "loss": 43.4215, "step": 1285 }, { "epoch": 0.12755405673477485, "grad_norm": 2.101165533065796, "learning_rate": 1.3039669489822292e-05, "loss": 42.7054, "step": 1286 }, { "epoch": 0.12765324340408649, "grad_norm": 1.8700491189956665, "learning_rate": 1.2938271110060807e-05, "loss": 43.267, "step": 1287 }, { "epoch": 0.12775243007339815, "grad_norm": 1.8919130563735962, "learning_rate": 1.2837241239898667e-05, "loss": 43.4118, "step": 1288 }, { "epoch": 0.12785161674270978, "grad_norm": 2.1673426628112793, "learning_rate": 1.2736580306968938e-05, "loss": 43.2691, "step": 1289 }, { "epoch": 0.12795080341202142, "grad_norm": 1.9596093893051147, "learning_rate": 1.2636288737343138e-05, "loss": 43.1042, "step": 1290 }, { "epoch": 0.12804999008133308, "grad_norm": 1.9993942975997925, "learning_rate": 1.253636695552931e-05, "loss": 43.4206, "step": 1291 }, { "epoch": 0.12814917675064472, "grad_norm": 1.9622743129730225, "learning_rate": 1.2436815384470301e-05, "loss": 43.5773, "step": 1292 }, { "epoch": 0.12824836341995635, "grad_norm": 2.12934947013855, "learning_rate": 1.233763444554198e-05, "loss": 42.9882, "step": 1293 }, { "epoch": 0.12834755008926801, "grad_norm": 1.997297763824463, "learning_rate": 1.2238824558551365e-05, "loss": 43.2069, "step": 1294 }, { "epoch": 0.12844673675857965, "grad_norm": 1.7098406553268433, "learning_rate": 1.2140386141734972e-05, "loss": 43.3409, "step": 1295 }, { "epoch": 0.12854592342789128, "grad_norm": 1.924278736114502, "learning_rate": 1.2042319611756948e-05, "loss": 43.202, "step": 1296 }, { "epoch": 0.12864511009720295, "grad_norm": 2.1477646827697754, "learning_rate": 1.1944625383707374e-05, "loss": 43.1039, "step": 1297 }, { "epoch": 0.12874429676651458, "grad_norm": 1.9721094369888306, "learning_rate": 1.184730387110039e-05, "loss": 43.5037, "step": 1298 }, { "epoch": 0.12884348343582622, "grad_norm": 1.9413458108901978, "learning_rate": 1.1750355485872644e-05, "loss": 43.7498, "step": 1299 }, { "epoch": 0.12894267010513788, "grad_norm": 2.565575122833252, "learning_rate": 1.1653780638381328e-05, "loss": 43.1948, "step": 1300 }, { "epoch": 0.12904185677444951, "grad_norm": 2.2411508560180664, "learning_rate": 1.1557579737402623e-05, "loss": 43.4395, "step": 1301 }, { "epoch": 0.12914104344376115, "grad_norm": 1.8507673740386963, "learning_rate": 1.146175319012983e-05, "loss": 43.1794, "step": 1302 }, { "epoch": 0.1292402301130728, "grad_norm": 2.061469078063965, "learning_rate": 1.1366301402171775e-05, "loss": 43.0605, "step": 1303 }, { "epoch": 0.12933941678238445, "grad_norm": 2.121185302734375, "learning_rate": 1.127122477755097e-05, "loss": 43.431, "step": 1304 }, { "epoch": 0.12943860345169608, "grad_norm": 2.0242440700531006, "learning_rate": 1.117652371870197e-05, "loss": 43.4726, "step": 1305 }, { "epoch": 0.12953779012100775, "grad_norm": 2.2344465255737305, "learning_rate": 1.1082198626469686e-05, "loss": 42.8992, "step": 1306 }, { "epoch": 0.12963697679031938, "grad_norm": 1.7026211023330688, "learning_rate": 1.0988249900107605e-05, "loss": 43.401, "step": 1307 }, { "epoch": 0.12973616345963102, "grad_norm": 2.0010180473327637, "learning_rate": 1.0894677937276232e-05, "loss": 43.3382, "step": 1308 }, { "epoch": 0.12983535012894268, "grad_norm": 1.8324205875396729, "learning_rate": 1.0801483134041268e-05, "loss": 43.4343, "step": 1309 }, { "epoch": 0.1299345367982543, "grad_norm": 2.0147500038146973, "learning_rate": 1.0708665884872037e-05, "loss": 43.0836, "step": 1310 }, { "epoch": 0.13003372346756595, "grad_norm": 2.060643434524536, "learning_rate": 1.0616226582639732e-05, "loss": 43.3897, "step": 1311 }, { "epoch": 0.1301329101368776, "grad_norm": 1.9988598823547363, "learning_rate": 1.0524165618615845e-05, "loss": 43.5015, "step": 1312 }, { "epoch": 0.13023209680618925, "grad_norm": 1.6039632558822632, "learning_rate": 1.0432483382470393e-05, "loss": 43.352, "step": 1313 }, { "epoch": 0.13033128347550088, "grad_norm": 2.301841974258423, "learning_rate": 1.0341180262270422e-05, "loss": 43.1568, "step": 1314 }, { "epoch": 0.13043047014481254, "grad_norm": 2.549867868423462, "learning_rate": 1.0250256644478195e-05, "loss": 43.1025, "step": 1315 }, { "epoch": 0.13052965681412418, "grad_norm": 1.9916534423828125, "learning_rate": 1.0159712913949715e-05, "loss": 43.4582, "step": 1316 }, { "epoch": 0.1306288434834358, "grad_norm": 2.260643720626831, "learning_rate": 1.0069549453932969e-05, "loss": 43.3585, "step": 1317 }, { "epoch": 0.13072803015274748, "grad_norm": 2.251774549484253, "learning_rate": 9.979766646066368e-06, "loss": 43.3543, "step": 1318 }, { "epoch": 0.1308272168220591, "grad_norm": 2.2947351932525635, "learning_rate": 9.890364870377156e-06, "loss": 43.4266, "step": 1319 }, { "epoch": 0.13092640349137075, "grad_norm": 2.2526769638061523, "learning_rate": 9.801344505279698e-06, "loss": 43.4395, "step": 1320 }, { "epoch": 0.1310255901606824, "grad_norm": 2.026726007461548, "learning_rate": 9.71270592757404e-06, "loss": 43.4505, "step": 1321 }, { "epoch": 0.13112477682999404, "grad_norm": 2.089557647705078, "learning_rate": 9.6244495124441e-06, "loss": 43.5353, "step": 1322 }, { "epoch": 0.1312239634993057, "grad_norm": 1.9276314973831177, "learning_rate": 9.536575633456369e-06, "loss": 43.4943, "step": 1323 }, { "epoch": 0.13132315016861734, "grad_norm": 1.8411586284637451, "learning_rate": 9.449084662557982e-06, "loss": 43.3444, "step": 1324 }, { "epoch": 0.13142233683792898, "grad_norm": 2.3631200790405273, "learning_rate": 9.36197697007547e-06, "loss": 42.7248, "step": 1325 }, { "epoch": 0.13152152350724064, "grad_norm": 2.6807281970977783, "learning_rate": 9.27525292471294e-06, "loss": 43.3432, "step": 1326 }, { "epoch": 0.13162071017655227, "grad_norm": 1.9830889701843262, "learning_rate": 9.188912893550695e-06, "loss": 43.2004, "step": 1327 }, { "epoch": 0.1317198968458639, "grad_norm": 2.103409767150879, "learning_rate": 9.102957242043552e-06, "loss": 43.3022, "step": 1328 }, { "epoch": 0.13181908351517557, "grad_norm": 2.049328327178955, "learning_rate": 9.017386334019373e-06, "loss": 43.3698, "step": 1329 }, { "epoch": 0.1319182701844872, "grad_norm": 2.0538601875305176, "learning_rate": 8.932200531677537e-06, "loss": 43.3331, "step": 1330 }, { "epoch": 0.13201745685379884, "grad_norm": 1.9742823839187622, "learning_rate": 8.847400195587274e-06, "loss": 43.3083, "step": 1331 }, { "epoch": 0.1321166435231105, "grad_norm": 2.0585734844207764, "learning_rate": 8.762985684686365e-06, "loss": 43.3448, "step": 1332 }, { "epoch": 0.13221583019242214, "grad_norm": 2.1064953804016113, "learning_rate": 8.678957356279371e-06, "loss": 43.6047, "step": 1333 }, { "epoch": 0.13231501686173378, "grad_norm": 2.0612661838531494, "learning_rate": 8.595315566036333e-06, "loss": 43.451, "step": 1334 }, { "epoch": 0.13241420353104544, "grad_norm": 1.9173448085784912, "learning_rate": 8.512060667991117e-06, "loss": 43.3475, "step": 1335 }, { "epoch": 0.13251339020035707, "grad_norm": 1.8247857093811035, "learning_rate": 8.429193014540015e-06, "loss": 43.4886, "step": 1336 }, { "epoch": 0.1326125768696687, "grad_norm": 1.8412190675735474, "learning_rate": 8.346712956440195e-06, "loss": 43.2855, "step": 1337 }, { "epoch": 0.13271176353898037, "grad_norm": 1.8325763940811157, "learning_rate": 8.264620842808247e-06, "loss": 43.215, "step": 1338 }, { "epoch": 0.132810950208292, "grad_norm": 2.0761842727661133, "learning_rate": 8.182917021118663e-06, "loss": 43.1977, "step": 1339 }, { "epoch": 0.13291013687760364, "grad_norm": 2.371229410171509, "learning_rate": 8.101601837202456e-06, "loss": 43.4271, "step": 1340 }, { "epoch": 0.1330093235469153, "grad_norm": 2.003568410873413, "learning_rate": 8.020675635245567e-06, "loss": 43.716, "step": 1341 }, { "epoch": 0.13310851021622694, "grad_norm": 2.2959647178649902, "learning_rate": 7.940138757787507e-06, "loss": 42.9468, "step": 1342 }, { "epoch": 0.13320769688553857, "grad_norm": 2.332428455352783, "learning_rate": 7.859991545719902e-06, "loss": 43.3107, "step": 1343 }, { "epoch": 0.13330688355485024, "grad_norm": 2.177833080291748, "learning_rate": 7.780234338284974e-06, "loss": 43.4292, "step": 1344 }, { "epoch": 0.13340607022416187, "grad_norm": 2.025319814682007, "learning_rate": 7.700867473074224e-06, "loss": 43.2634, "step": 1345 }, { "epoch": 0.1335052568934735, "grad_norm": 1.993777871131897, "learning_rate": 7.621891286026872e-06, "loss": 43.3163, "step": 1346 }, { "epoch": 0.13360444356278517, "grad_norm": 1.8726677894592285, "learning_rate": 7.543306111428539e-06, "loss": 43.4174, "step": 1347 }, { "epoch": 0.1337036302320968, "grad_norm": 2.088778495788574, "learning_rate": 7.46511228190977e-06, "loss": 43.2981, "step": 1348 }, { "epoch": 0.13380281690140844, "grad_norm": 2.2077391147613525, "learning_rate": 7.38731012844468e-06, "loss": 43.5584, "step": 1349 }, { "epoch": 0.1339020035707201, "grad_norm": 2.162419319152832, "learning_rate": 7.309899980349455e-06, "loss": 43.2591, "step": 1350 }, { "epoch": 0.13400119024003174, "grad_norm": 2.255364418029785, "learning_rate": 7.232882165281141e-06, "loss": 42.9439, "step": 1351 }, { "epoch": 0.13410037690934337, "grad_norm": 1.9275546073913574, "learning_rate": 7.156257009236e-06, "loss": 43.1532, "step": 1352 }, { "epoch": 0.13419956357865503, "grad_norm": 1.801643967628479, "learning_rate": 7.0800248365483825e-06, "loss": 43.3247, "step": 1353 }, { "epoch": 0.13429875024796667, "grad_norm": 2.316209554672241, "learning_rate": 7.004185969889187e-06, "loss": 43.4831, "step": 1354 }, { "epoch": 0.1343979369172783, "grad_norm": 2.057708263397217, "learning_rate": 6.928740730264549e-06, "loss": 42.9086, "step": 1355 }, { "epoch": 0.13449712358658997, "grad_norm": 2.5561258792877197, "learning_rate": 6.853689437014521e-06, "loss": 43.3944, "step": 1356 }, { "epoch": 0.1345963102559016, "grad_norm": 1.672929048538208, "learning_rate": 6.7790324078116364e-06, "loss": 43.1944, "step": 1357 }, { "epoch": 0.13469549692521327, "grad_norm": 1.909250259399414, "learning_rate": 6.704769958659652e-06, "loss": 43.4062, "step": 1358 }, { "epoch": 0.1347946835945249, "grad_norm": 1.9488860368728638, "learning_rate": 6.6309024038921475e-06, "loss": 43.4522, "step": 1359 }, { "epoch": 0.13489387026383654, "grad_norm": 2.196932315826416, "learning_rate": 6.557430056171221e-06, "loss": 43.2967, "step": 1360 }, { "epoch": 0.1349930569331482, "grad_norm": 2.2892370223999023, "learning_rate": 6.484353226486162e-06, "loss": 43.4358, "step": 1361 }, { "epoch": 0.13509224360245983, "grad_norm": 2.0446407794952393, "learning_rate": 6.411672224152154e-06, "loss": 43.493, "step": 1362 }, { "epoch": 0.13519143027177147, "grad_norm": 1.935500979423523, "learning_rate": 6.339387356808912e-06, "loss": 43.3435, "step": 1363 }, { "epoch": 0.13529061694108313, "grad_norm": 1.5912455320358276, "learning_rate": 6.267498930419435e-06, "loss": 43.3201, "step": 1364 }, { "epoch": 0.13538980361039477, "grad_norm": 1.9594824314117432, "learning_rate": 6.196007249268687e-06, "loss": 43.4821, "step": 1365 }, { "epoch": 0.1354889902797064, "grad_norm": 2.056896924972534, "learning_rate": 6.124912615962341e-06, "loss": 43.3942, "step": 1366 }, { "epoch": 0.13558817694901806, "grad_norm": 1.7575795650482178, "learning_rate": 6.054215331425428e-06, "loss": 43.166, "step": 1367 }, { "epoch": 0.1356873636183297, "grad_norm": 2.0878756046295166, "learning_rate": 5.983915694901088e-06, "loss": 43.4634, "step": 1368 }, { "epoch": 0.13578655028764133, "grad_norm": 2.114713668823242, "learning_rate": 5.9140140039494084e-06, "loss": 43.2898, "step": 1369 }, { "epoch": 0.135885736956953, "grad_norm": 2.129774332046509, "learning_rate": 5.844510554445981e-06, "loss": 43.4494, "step": 1370 }, { "epoch": 0.13598492362626463, "grad_norm": 2.3913118839263916, "learning_rate": 5.775405640580822e-06, "loss": 43.3137, "step": 1371 }, { "epoch": 0.13608411029557627, "grad_norm": 1.7819161415100098, "learning_rate": 5.706699554856964e-06, "loss": 43.3058, "step": 1372 }, { "epoch": 0.13618329696488793, "grad_norm": 1.9535526037216187, "learning_rate": 5.6383925880894135e-06, "loss": 43.3178, "step": 1373 }, { "epoch": 0.13628248363419956, "grad_norm": 2.282865285873413, "learning_rate": 5.5704850294036934e-06, "loss": 43.0879, "step": 1374 }, { "epoch": 0.1363816703035112, "grad_norm": 1.8366732597351074, "learning_rate": 5.502977166234857e-06, "loss": 43.3269, "step": 1375 }, { "epoch": 0.13648085697282286, "grad_norm": 2.4548354148864746, "learning_rate": 5.435869284326023e-06, "loss": 43.1059, "step": 1376 }, { "epoch": 0.1365800436421345, "grad_norm": 2.022681474685669, "learning_rate": 5.3691616677274185e-06, "loss": 43.1864, "step": 1377 }, { "epoch": 0.13667923031144613, "grad_norm": 2.470262289047241, "learning_rate": 5.302854598794937e-06, "loss": 43.3405, "step": 1378 }, { "epoch": 0.1367784169807578, "grad_norm": 2.0381100177764893, "learning_rate": 5.2369483581891535e-06, "loss": 43.3638, "step": 1379 }, { "epoch": 0.13687760365006943, "grad_norm": 2.0829873085021973, "learning_rate": 5.171443224873995e-06, "loss": 43.4733, "step": 1380 }, { "epoch": 0.13697679031938106, "grad_norm": 2.2366280555725098, "learning_rate": 5.106339476115596e-06, "loss": 43.3411, "step": 1381 }, { "epoch": 0.13707597698869273, "grad_norm": 2.3511526584625244, "learning_rate": 5.0416373874811684e-06, "loss": 42.9051, "step": 1382 }, { "epoch": 0.13717516365800436, "grad_norm": 2.483473062515259, "learning_rate": 4.977337232837775e-06, "loss": 43.109, "step": 1383 }, { "epoch": 0.137274350327316, "grad_norm": 2.010577440261841, "learning_rate": 4.913439284351207e-06, "loss": 43.135, "step": 1384 }, { "epoch": 0.13737353699662766, "grad_norm": 2.0378925800323486, "learning_rate": 4.849943812484814e-06, "loss": 42.8639, "step": 1385 }, { "epoch": 0.1374727236659393, "grad_norm": 2.2532873153686523, "learning_rate": 4.7868510859983895e-06, "loss": 43.4281, "step": 1386 }, { "epoch": 0.13757191033525093, "grad_norm": 1.898999571800232, "learning_rate": 4.724161371946978e-06, "loss": 43.4366, "step": 1387 }, { "epoch": 0.1376710970045626, "grad_norm": 1.8824591636657715, "learning_rate": 4.661874935679822e-06, "loss": 43.4965, "step": 1388 }, { "epoch": 0.13777028367387423, "grad_norm": 2.134467124938965, "learning_rate": 4.599992040839118e-06, "loss": 43.329, "step": 1389 }, { "epoch": 0.13786947034318586, "grad_norm": 2.0056257247924805, "learning_rate": 4.538512949359075e-06, "loss": 43.0327, "step": 1390 }, { "epoch": 0.13796865701249753, "grad_norm": 2.1297647953033447, "learning_rate": 4.4774379214646225e-06, "loss": 43.2565, "step": 1391 }, { "epoch": 0.13806784368180916, "grad_norm": 1.80899977684021, "learning_rate": 4.416767215670448e-06, "loss": 43.0096, "step": 1392 }, { "epoch": 0.13816703035112082, "grad_norm": 2.1779046058654785, "learning_rate": 4.356501088779841e-06, "loss": 43.1146, "step": 1393 }, { "epoch": 0.13826621702043246, "grad_norm": 2.2874755859375, "learning_rate": 4.296639795883572e-06, "loss": 43.1322, "step": 1394 }, { "epoch": 0.1383654036897441, "grad_norm": 1.7321752309799194, "learning_rate": 4.237183590358928e-06, "loss": 43.2184, "step": 1395 }, { "epoch": 0.13846459035905576, "grad_norm": 2.2388689517974854, "learning_rate": 4.178132723868477e-06, "loss": 43.6094, "step": 1396 }, { "epoch": 0.1385637770283674, "grad_norm": 1.992551565170288, "learning_rate": 4.119487446359193e-06, "loss": 43.1707, "step": 1397 }, { "epoch": 0.13866296369767903, "grad_norm": 2.0371286869049072, "learning_rate": 4.061248006061191e-06, "loss": 43.2199, "step": 1398 }, { "epoch": 0.1387621503669907, "grad_norm": 2.181845188140869, "learning_rate": 4.003414649486892e-06, "loss": 43.2273, "step": 1399 }, { "epoch": 0.13886133703630232, "grad_norm": 1.8432098627090454, "learning_rate": 3.94598762142977e-06, "loss": 43.3462, "step": 1400 }, { "epoch": 0.13896052370561396, "grad_norm": 1.937048077583313, "learning_rate": 3.888967164963498e-06, "loss": 43.1906, "step": 1401 }, { "epoch": 0.13905971037492562, "grad_norm": 1.9433784484863281, "learning_rate": 3.832353521440768e-06, "loss": 43.3311, "step": 1402 }, { "epoch": 0.13915889704423726, "grad_norm": 2.5950987339019775, "learning_rate": 3.7761469304923967e-06, "loss": 43.1959, "step": 1403 }, { "epoch": 0.1392580837135489, "grad_norm": 2.0187675952911377, "learning_rate": 3.720347630026222e-06, "loss": 43.417, "step": 1404 }, { "epoch": 0.13935727038286055, "grad_norm": 2.4105563163757324, "learning_rate": 3.6649558562261375e-06, "loss": 43.4502, "step": 1405 }, { "epoch": 0.1394564570521722, "grad_norm": 2.395488739013672, "learning_rate": 3.6099718435510633e-06, "loss": 43.0515, "step": 1406 }, { "epoch": 0.13955564372148382, "grad_norm": 1.9171357154846191, "learning_rate": 3.555395824734009e-06, "loss": 43.3508, "step": 1407 }, { "epoch": 0.1396548303907955, "grad_norm": 1.94326913356781, "learning_rate": 3.501228030781034e-06, "loss": 43.2267, "step": 1408 }, { "epoch": 0.13975401706010712, "grad_norm": 2.010206699371338, "learning_rate": 3.447468690970268e-06, "loss": 43.3639, "step": 1409 }, { "epoch": 0.13985320372941876, "grad_norm": 2.325780153274536, "learning_rate": 3.3941180328510123e-06, "loss": 43.2116, "step": 1410 }, { "epoch": 0.13995239039873042, "grad_norm": 2.193591356277466, "learning_rate": 3.341176282242653e-06, "loss": 43.3946, "step": 1411 }, { "epoch": 0.14005157706804205, "grad_norm": 2.0868961811065674, "learning_rate": 3.288643663233848e-06, "loss": 43.1527, "step": 1412 }, { "epoch": 0.1401507637373537, "grad_norm": 1.874427080154419, "learning_rate": 3.236520398181464e-06, "loss": 43.3895, "step": 1413 }, { "epoch": 0.14024995040666535, "grad_norm": 1.8140789270401, "learning_rate": 3.184806707709698e-06, "loss": 43.1514, "step": 1414 }, { "epoch": 0.140349137075977, "grad_norm": 1.7428981065750122, "learning_rate": 3.133502810709099e-06, "loss": 43.4509, "step": 1415 }, { "epoch": 0.14044832374528862, "grad_norm": 2.1699752807617188, "learning_rate": 3.082608924335717e-06, "loss": 43.1771, "step": 1416 }, { "epoch": 0.14054751041460029, "grad_norm": 2.0626633167266846, "learning_rate": 3.0321252640100885e-06, "loss": 43.5371, "step": 1417 }, { "epoch": 0.14064669708391192, "grad_norm": 2.053748607635498, "learning_rate": 2.9820520434163836e-06, "loss": 43.4088, "step": 1418 }, { "epoch": 0.14074588375322356, "grad_norm": 2.031611442565918, "learning_rate": 2.932389474501496e-06, "loss": 43.6168, "step": 1419 }, { "epoch": 0.14084507042253522, "grad_norm": 2.2878761291503906, "learning_rate": 2.88313776747412e-06, "loss": 43.3679, "step": 1420 }, { "epoch": 0.14094425709184685, "grad_norm": 1.8133399486541748, "learning_rate": 2.834297130803909e-06, "loss": 43.2068, "step": 1421 }, { "epoch": 0.1410434437611585, "grad_norm": 2.0135884284973145, "learning_rate": 2.7858677712205296e-06, "loss": 43.1477, "step": 1422 }, { "epoch": 0.14114263043047015, "grad_norm": 1.9554595947265625, "learning_rate": 2.7378498937128404e-06, "loss": 43.3352, "step": 1423 }, { "epoch": 0.14124181709978179, "grad_norm": 1.9702895879745483, "learning_rate": 2.6902437015279837e-06, "loss": 43.4386, "step": 1424 }, { "epoch": 0.14134100376909342, "grad_norm": 2.0473744869232178, "learning_rate": 2.6430493961705937e-06, "loss": 43.3568, "step": 1425 }, { "epoch": 0.14144019043840508, "grad_norm": 2.1810948848724365, "learning_rate": 2.5962671774018234e-06, "loss": 42.9775, "step": 1426 }, { "epoch": 0.14153937710771672, "grad_norm": 1.9952651262283325, "learning_rate": 2.549897243238619e-06, "loss": 43.1992, "step": 1427 }, { "epoch": 0.14163856377702838, "grad_norm": 2.113675832748413, "learning_rate": 2.503939789952825e-06, "loss": 43.426, "step": 1428 }, { "epoch": 0.14173775044634002, "grad_norm": 1.946372628211975, "learning_rate": 2.458395012070369e-06, "loss": 43.1353, "step": 1429 }, { "epoch": 0.14183693711565165, "grad_norm": 2.023200750350952, "learning_rate": 2.4132631023704e-06, "loss": 43.2011, "step": 1430 }, { "epoch": 0.14193612378496331, "grad_norm": 2.0508205890655518, "learning_rate": 2.3685442518845522e-06, "loss": 43.2194, "step": 1431 }, { "epoch": 0.14203531045427495, "grad_norm": 2.032804489135742, "learning_rate": 2.3242386498960266e-06, "loss": 43.2455, "step": 1432 }, { "epoch": 0.14213449712358658, "grad_norm": 2.1437833309173584, "learning_rate": 2.2803464839388998e-06, "loss": 43.2066, "step": 1433 }, { "epoch": 0.14223368379289825, "grad_norm": 2.121809482574463, "learning_rate": 2.2368679397972824e-06, "loss": 43.0669, "step": 1434 }, { "epoch": 0.14233287046220988, "grad_norm": 1.9407131671905518, "learning_rate": 2.1938032015044964e-06, "loss": 43.597, "step": 1435 }, { "epoch": 0.14243205713152152, "grad_norm": 1.904272198677063, "learning_rate": 2.151152451342364e-06, "loss": 43.0514, "step": 1436 }, { "epoch": 0.14253124380083318, "grad_norm": 2.0880041122436523, "learning_rate": 2.1089158698403665e-06, "loss": 43.1989, "step": 1437 }, { "epoch": 0.14263043047014481, "grad_norm": 1.813359260559082, "learning_rate": 2.067093635774975e-06, "loss": 43.3236, "step": 1438 }, { "epoch": 0.14272961713945645, "grad_norm": 2.1034488677978516, "learning_rate": 2.0256859261687856e-06, "loss": 43.2322, "step": 1439 }, { "epoch": 0.1428288038087681, "grad_norm": 1.979280948638916, "learning_rate": 1.984692916289832e-06, "loss": 43.4587, "step": 1440 }, { "epoch": 0.14292799047807975, "grad_norm": 2.622344732284546, "learning_rate": 1.9441147796508407e-06, "loss": 43.1863, "step": 1441 }, { "epoch": 0.14302717714739138, "grad_norm": 2.316920518875122, "learning_rate": 1.9039516880084873e-06, "loss": 43.2889, "step": 1442 }, { "epoch": 0.14312636381670304, "grad_norm": 1.8629003763198853, "learning_rate": 1.8642038113626635e-06, "loss": 43.2849, "step": 1443 }, { "epoch": 0.14322555048601468, "grad_norm": 1.8128821849822998, "learning_rate": 1.8248713179557786e-06, "loss": 43.5935, "step": 1444 }, { "epoch": 0.14332473715532631, "grad_norm": 2.1895546913146973, "learning_rate": 1.785954374272003e-06, "loss": 43.1179, "step": 1445 }, { "epoch": 0.14342392382463798, "grad_norm": 1.8723267316818237, "learning_rate": 1.7474531450366371e-06, "loss": 43.4893, "step": 1446 }, { "epoch": 0.1435231104939496, "grad_norm": 2.146305799484253, "learning_rate": 1.7093677932153218e-06, "loss": 43.4065, "step": 1447 }, { "epoch": 0.14362229716326125, "grad_norm": 2.2784523963928223, "learning_rate": 1.6716984800134282e-06, "loss": 43.2687, "step": 1448 }, { "epoch": 0.1437214838325729, "grad_norm": 2.066173791885376, "learning_rate": 1.6344453648753588e-06, "loss": 43.2307, "step": 1449 }, { "epoch": 0.14382067050188455, "grad_norm": 2.0641512870788574, "learning_rate": 1.5976086054838025e-06, "loss": 43.3932, "step": 1450 }, { "epoch": 0.14391985717119618, "grad_norm": 1.8807092905044556, "learning_rate": 1.5611883577591912e-06, "loss": 43.2905, "step": 1451 }, { "epoch": 0.14401904384050784, "grad_norm": 2.4049503803253174, "learning_rate": 1.5251847758589121e-06, "loss": 43.0874, "step": 1452 }, { "epoch": 0.14411823050981948, "grad_norm": 2.178377389907837, "learning_rate": 1.4895980121767627e-06, "loss": 43.3295, "step": 1453 }, { "epoch": 0.1442174171791311, "grad_norm": 2.1683950424194336, "learning_rate": 1.4544282173422074e-06, "loss": 43.3708, "step": 1454 }, { "epoch": 0.14431660384844278, "grad_norm": 2.144456624984741, "learning_rate": 1.4196755402198448e-06, "loss": 43.3268, "step": 1455 }, { "epoch": 0.1444157905177544, "grad_norm": 2.1770358085632324, "learning_rate": 1.3853401279086854e-06, "loss": 43.3516, "step": 1456 }, { "epoch": 0.14451497718706605, "grad_norm": 2.203105926513672, "learning_rate": 1.3514221257415748e-06, "loss": 43.2459, "step": 1457 }, { "epoch": 0.1446141638563777, "grad_norm": 1.9053125381469727, "learning_rate": 1.3179216772845727e-06, "loss": 43.2546, "step": 1458 }, { "epoch": 0.14471335052568934, "grad_norm": 2.0264782905578613, "learning_rate": 1.2848389243363512e-06, "loss": 43.2631, "step": 1459 }, { "epoch": 0.14481253719500098, "grad_norm": 1.9565973281860352, "learning_rate": 1.2521740069275644e-06, "loss": 43.4172, "step": 1460 }, { "epoch": 0.14491172386431264, "grad_norm": 1.9662400484085083, "learning_rate": 1.219927063320303e-06, "loss": 43.2851, "step": 1461 }, { "epoch": 0.14501091053362428, "grad_norm": 1.6376630067825317, "learning_rate": 1.1880982300074838e-06, "loss": 43.4667, "step": 1462 }, { "epoch": 0.14511009720293594, "grad_norm": 2.073845863342285, "learning_rate": 1.1566876417122285e-06, "loss": 43.1126, "step": 1463 }, { "epoch": 0.14520928387224757, "grad_norm": 2.2647864818573, "learning_rate": 1.1256954313873858e-06, "loss": 43.2001, "step": 1464 }, { "epoch": 0.1453084705415592, "grad_norm": 2.2635481357574463, "learning_rate": 1.0951217302148986e-06, "loss": 43.2655, "step": 1465 }, { "epoch": 0.14540765721087087, "grad_norm": 2.0819239616394043, "learning_rate": 1.0649666676052828e-06, "loss": 43.3746, "step": 1466 }, { "epoch": 0.1455068438801825, "grad_norm": 1.84273362159729, "learning_rate": 1.0352303711970379e-06, "loss": 43.547, "step": 1467 }, { "epoch": 0.14560603054949414, "grad_norm": 1.9994412660598755, "learning_rate": 1.0059129668561707e-06, "loss": 43.311, "step": 1468 }, { "epoch": 0.1457052172188058, "grad_norm": 1.9508692026138306, "learning_rate": 9.770145786755946e-07, "loss": 43.2921, "step": 1469 }, { "epoch": 0.14580440388811744, "grad_norm": 2.2198736667633057, "learning_rate": 9.485353289746645e-07, "loss": 43.4834, "step": 1470 }, { "epoch": 0.14590359055742907, "grad_norm": 1.7677615880966187, "learning_rate": 9.204753382986097e-07, "loss": 43.3114, "step": 1471 }, { "epoch": 0.14600277722674074, "grad_norm": 2.2032389640808105, "learning_rate": 8.92834725418068e-07, "loss": 43.2516, "step": 1472 }, { "epoch": 0.14610196389605237, "grad_norm": 1.672240972518921, "learning_rate": 8.656136073285303e-07, "loss": 43.4628, "step": 1473 }, { "epoch": 0.146201150565364, "grad_norm": 2.144817352294922, "learning_rate": 8.388120992499083e-07, "loss": 43.451, "step": 1474 }, { "epoch": 0.14630033723467567, "grad_norm": 2.099653482437134, "learning_rate": 8.124303146259782e-07, "loss": 43.2155, "step": 1475 }, { "epoch": 0.1463995239039873, "grad_norm": 2.6512537002563477, "learning_rate": 7.864683651239824e-07, "loss": 42.983, "step": 1476 }, { "epoch": 0.14649871057329894, "grad_norm": 2.3182458877563477, "learning_rate": 7.609263606340622e-07, "loss": 43.3022, "step": 1477 }, { "epoch": 0.1465978972426106, "grad_norm": 2.0440011024475098, "learning_rate": 7.358044092688699e-07, "loss": 43.3016, "step": 1478 }, { "epoch": 0.14669708391192224, "grad_norm": 2.181492805480957, "learning_rate": 7.111026173630797e-07, "loss": 43.1403, "step": 1479 }, { "epoch": 0.14679627058123387, "grad_norm": 1.6637905836105347, "learning_rate": 6.868210894729332e-07, "loss": 43.4882, "step": 1480 }, { "epoch": 0.14689545725054554, "grad_norm": 2.085326671600342, "learning_rate": 6.629599283758059e-07, "loss": 43.2251, "step": 1481 }, { "epoch": 0.14699464391985717, "grad_norm": 2.2099697589874268, "learning_rate": 6.395192350697743e-07, "loss": 43.2126, "step": 1482 }, { "epoch": 0.1470938305891688, "grad_norm": 2.3335022926330566, "learning_rate": 6.164991087731831e-07, "loss": 43.3794, "step": 1483 }, { "epoch": 0.14719301725848047, "grad_norm": 1.7655500173568726, "learning_rate": 5.938996469242231e-07, "loss": 43.4807, "step": 1484 }, { "epoch": 0.1472922039277921, "grad_norm": 2.487640380859375, "learning_rate": 5.717209451805205e-07, "loss": 42.9848, "step": 1485 }, { "epoch": 0.14739139059710374, "grad_norm": 1.804437279701233, "learning_rate": 5.499630974187375e-07, "loss": 43.4311, "step": 1486 }, { "epoch": 0.1474905772664154, "grad_norm": 2.1954946517944336, "learning_rate": 5.286261957341831e-07, "loss": 42.9627, "step": 1487 }, { "epoch": 0.14758976393572704, "grad_norm": 2.0663084983825684, "learning_rate": 5.077103304403807e-07, "loss": 43.292, "step": 1488 }, { "epoch": 0.14768895060503867, "grad_norm": 2.280759811401367, "learning_rate": 4.872155900687347e-07, "loss": 43.044, "step": 1489 }, { "epoch": 0.14778813727435033, "grad_norm": 2.1535468101501465, "learning_rate": 4.671420613681643e-07, "loss": 43.2001, "step": 1490 }, { "epoch": 0.14788732394366197, "grad_norm": 2.0268213748931885, "learning_rate": 4.4748982930465923e-07, "loss": 43.3799, "step": 1491 }, { "epoch": 0.1479865106129736, "grad_norm": 1.9153803586959839, "learning_rate": 4.2825897706100235e-07, "loss": 43.1269, "step": 1492 }, { "epoch": 0.14808569728228527, "grad_norm": 2.1148533821105957, "learning_rate": 4.09449586036359e-07, "loss": 43.2859, "step": 1493 }, { "epoch": 0.1481848839515969, "grad_norm": 2.9361672401428223, "learning_rate": 3.9106173584601e-07, "loss": 43.5126, "step": 1494 }, { "epoch": 0.14828407062090854, "grad_norm": 2.002664804458618, "learning_rate": 3.7309550432090835e-07, "loss": 43.1279, "step": 1495 }, { "epoch": 0.1483832572902202, "grad_norm": 2.234067678451538, "learning_rate": 3.5555096750743424e-07, "loss": 43.5213, "step": 1496 }, { "epoch": 0.14848244395953183, "grad_norm": 2.1555521488189697, "learning_rate": 3.384281996670402e-07, "loss": 43.3817, "step": 1497 }, { "epoch": 0.14858163062884347, "grad_norm": 2.080760955810547, "learning_rate": 3.217272732759402e-07, "loss": 43.3713, "step": 1498 }, { "epoch": 0.14868081729815513, "grad_norm": 2.126169443130493, "learning_rate": 3.054482590247876e-07, "loss": 43.1336, "step": 1499 }, { "epoch": 0.14878000396746677, "grad_norm": 1.7946844100952148, "learning_rate": 2.8959122581840857e-07, "loss": 43.0998, "step": 1500 }, { "epoch": 0.14887919063677843, "grad_norm": 1.9658114910125732, "learning_rate": 2.741562407755138e-07, "loss": 43.256, "step": 1501 }, { "epoch": 0.14897837730609007, "grad_norm": 2.403459310531616, "learning_rate": 2.5914336922833183e-07, "loss": 43.117, "step": 1502 }, { "epoch": 0.1490775639754017, "grad_norm": 2.071451425552368, "learning_rate": 2.445526747224647e-07, "loss": 43.3778, "step": 1503 }, { "epoch": 0.14917675064471336, "grad_norm": 2.062016248703003, "learning_rate": 2.3038421901651064e-07, "loss": 43.6402, "step": 1504 }, { "epoch": 0.149275937314025, "grad_norm": 2.29769229888916, "learning_rate": 2.1663806208184201e-07, "loss": 43.3161, "step": 1505 }, { "epoch": 0.14937512398333663, "grad_norm": 2.2068557739257812, "learning_rate": 2.0331426210236093e-07, "loss": 43.3807, "step": 1506 }, { "epoch": 0.1494743106526483, "grad_norm": 2.1809418201446533, "learning_rate": 1.9041287547424403e-07, "loss": 43.3291, "step": 1507 }, { "epoch": 0.14957349732195993, "grad_norm": 2.355131149291992, "learning_rate": 1.7793395680568703e-07, "loss": 43.1938, "step": 1508 }, { "epoch": 0.14967268399127157, "grad_norm": 1.9455194473266602, "learning_rate": 1.6587755891671608e-07, "loss": 43.2039, "step": 1509 }, { "epoch": 0.14977187066058323, "grad_norm": 2.080108165740967, "learning_rate": 1.5424373283889904e-07, "loss": 43.2726, "step": 1510 }, { "epoch": 0.14987105732989486, "grad_norm": 2.2005887031555176, "learning_rate": 1.4303252781520114e-07, "loss": 43.4239, "step": 1511 }, { "epoch": 0.1499702439992065, "grad_norm": 2.1430304050445557, "learning_rate": 1.3224399129971866e-07, "loss": 43.2449, "step": 1512 }, { "epoch": 0.15006943066851816, "grad_norm": 1.9301306009292603, "learning_rate": 1.2187816895752324e-07, "loss": 43.3271, "step": 1513 }, { "epoch": 0.1501686173378298, "grad_norm": 2.6255922317504883, "learning_rate": 1.1193510466445123e-07, "loss": 43.4344, "step": 1514 }, { "epoch": 0.15026780400714143, "grad_norm": 2.2844913005828857, "learning_rate": 1.0241484050687034e-07, "loss": 43.3174, "step": 1515 }, { "epoch": 0.1503669906764531, "grad_norm": 2.1364171504974365, "learning_rate": 9.3317416781602e-08, "loss": 43.2297, "step": 1516 }, { "epoch": 0.15046617734576473, "grad_norm": 1.8956122398376465, "learning_rate": 8.46428719956549e-08, "loss": 43.6502, "step": 1517 }, { "epoch": 0.15056536401507636, "grad_norm": 2.3670010566711426, "learning_rate": 7.639124286612509e-08, "loss": 43.1647, "step": 1518 }, { "epoch": 0.15066455068438803, "grad_norm": 2.168613910675049, "learning_rate": 6.856256432000718e-08, "loss": 43.2205, "step": 1519 }, { "epoch": 0.15076373735369966, "grad_norm": 2.2199530601501465, "learning_rate": 6.115686949405008e-08, "loss": 43.4035, "step": 1520 }, { "epoch": 0.1508629240230113, "grad_norm": 1.9278777837753296, "learning_rate": 5.417418973462374e-08, "loss": 43.4381, "step": 1521 }, { "epoch": 0.15096211069232296, "grad_norm": 1.9266911745071411, "learning_rate": 4.7614554597608105e-08, "loss": 42.9214, "step": 1522 }, { "epoch": 0.1510612973616346, "grad_norm": 2.1948959827423096, "learning_rate": 4.147799184821555e-08, "loss": 43.3784, "step": 1523 }, { "epoch": 0.15116048403094623, "grad_norm": 2.2459158897399902, "learning_rate": 3.576452746092418e-08, "loss": 43.157, "step": 1524 }, { "epoch": 0.1512596707002579, "grad_norm": 1.832255244255066, "learning_rate": 3.047418561933357e-08, "loss": 43.1398, "step": 1525 }, { "epoch": 0.15135885736956953, "grad_norm": 2.164360761642456, "learning_rate": 2.5606988716075918e-08, "loss": 43.4648, "step": 1526 }, { "epoch": 0.15145804403888116, "grad_norm": 2.135101318359375, "learning_rate": 2.1162957352738323e-08, "loss": 43.3128, "step": 1527 }, { "epoch": 0.15155723070819282, "grad_norm": 1.6633306741714478, "learning_rate": 1.7142110339740668e-08, "loss": 43.4242, "step": 1528 }, { "epoch": 0.15165641737750446, "grad_norm": 2.3267548084259033, "learning_rate": 1.3544464696280124e-08, "loss": 43.4299, "step": 1529 }, { "epoch": 0.1517556040468161, "grad_norm": 1.9193159341812134, "learning_rate": 1.0370035650253407e-08, "loss": 43.2334, "step": 1530 }, { "epoch": 0.15185479071612776, "grad_norm": 1.8524115085601807, "learning_rate": 7.618836638190186e-09, "loss": 43.5525, "step": 1531 }, { "epoch": 0.1519539773854394, "grad_norm": 2.0548107624053955, "learning_rate": 5.29087930519756e-09, "loss": 43.15, "step": 1532 }, { "epoch": 0.15205316405475103, "grad_norm": 1.9881281852722168, "learning_rate": 3.3861735049267597e-09, "loss": 43.4443, "step": 1533 }, { "epoch": 0.1521523507240627, "grad_norm": 2.0406267642974854, "learning_rate": 1.904727299473219e-09, "loss": 43.1231, "step": 1534 }, { "epoch": 0.15225153739337433, "grad_norm": 2.137437582015991, "learning_rate": 8.465469594431952e-10, "loss": 43.3994, "step": 1535 }, { "epoch": 0.152350724062686, "grad_norm": 2.3377254009246826, "learning_rate": 2.116369638094362e-10, "loss": 43.3795, "step": 1536 }, { "epoch": 0.15244991073199762, "grad_norm": 1.9583935737609863, "learning_rate": 0.0, "loss": 43.5157, "step": 1537 } ], "logging_steps": 1, "max_steps": 1537, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 385, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 14932564770816.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }