{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1000107828337287, "eval_steps": 500, "global_step": 1855, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.391416864351952e-05, "grad_norm": 53.75010299682617, "learning_rate": 1.0000000000000001e-07, "loss": 2.5864, "step": 1 }, { "epoch": 0.00010782833728703904, "grad_norm": 45.00067138671875, "learning_rate": 2.0000000000000002e-07, "loss": 2.3757, "step": 2 }, { "epoch": 0.00016174250593055855, "grad_norm": 51.22366714477539, "learning_rate": 3.0000000000000004e-07, "loss": 2.4653, "step": 3 }, { "epoch": 0.00021565667457407807, "grad_norm": 62.225242614746094, "learning_rate": 4.0000000000000003e-07, "loss": 2.5819, "step": 4 }, { "epoch": 0.0002695708432175976, "grad_norm": 54.67008590698242, "learning_rate": 5.000000000000001e-07, "loss": 2.6368, "step": 5 }, { "epoch": 0.0003234850118611171, "grad_norm": 51.261009216308594, "learning_rate": 6.000000000000001e-07, "loss": 2.3245, "step": 6 }, { "epoch": 0.0003773991805046366, "grad_norm": 53.58714294433594, "learning_rate": 7.000000000000001e-07, "loss": 2.7622, "step": 7 }, { "epoch": 0.00043131334914815614, "grad_norm": 41.32997131347656, "learning_rate": 8.000000000000001e-07, "loss": 2.6444, "step": 8 }, { "epoch": 0.00048522751779167566, "grad_norm": 33.232242584228516, "learning_rate": 9.000000000000001e-07, "loss": 2.1475, "step": 9 }, { "epoch": 0.0005391416864351952, "grad_norm": 34.1890983581543, "learning_rate": 1.0000000000000002e-06, "loss": 2.7256, "step": 10 }, { "epoch": 0.0005930558550787146, "grad_norm": 19.263437271118164, "learning_rate": 1.1e-06, "loss": 2.4132, "step": 11 }, { "epoch": 0.0006469700237222342, "grad_norm": 15.612638473510742, "learning_rate": 1.2000000000000002e-06, "loss": 2.0422, "step": 12 }, { "epoch": 0.0007008841923657537, "grad_norm": 13.81751537322998, "learning_rate": 1.3e-06, "loss": 1.9663, "step": 13 }, { "epoch": 0.0007547983610092732, "grad_norm": 16.390897750854492, "learning_rate": 1.4000000000000001e-06, "loss": 2.1135, "step": 14 }, { "epoch": 0.0008087125296527927, "grad_norm": 21.830646514892578, "learning_rate": 1.5e-06, "loss": 2.217, "step": 15 }, { "epoch": 0.0008626266982963123, "grad_norm": 18.630046844482422, "learning_rate": 1.6000000000000001e-06, "loss": 2.1612, "step": 16 }, { "epoch": 0.0009165408669398317, "grad_norm": 12.403571128845215, "learning_rate": 1.7000000000000002e-06, "loss": 1.9358, "step": 17 }, { "epoch": 0.0009704550355833513, "grad_norm": 7.713366508483887, "learning_rate": 1.8000000000000001e-06, "loss": 1.8522, "step": 18 }, { "epoch": 0.001024369204226871, "grad_norm": 7.731616973876953, "learning_rate": 1.9000000000000002e-06, "loss": 1.7984, "step": 19 }, { "epoch": 0.0010782833728703904, "grad_norm": 7.5799174308776855, "learning_rate": 2.0000000000000003e-06, "loss": 1.701, "step": 20 }, { "epoch": 0.0011321975415139098, "grad_norm": 5.5428080558776855, "learning_rate": 2.1000000000000002e-06, "loss": 1.624, "step": 21 }, { "epoch": 0.0011861117101574293, "grad_norm": 5.851474285125732, "learning_rate": 2.2e-06, "loss": 1.8064, "step": 22 }, { "epoch": 0.001240025878800949, "grad_norm": 5.243111610412598, "learning_rate": 2.3000000000000004e-06, "loss": 1.7246, "step": 23 }, { "epoch": 0.0012939400474444684, "grad_norm": 4.835971832275391, "learning_rate": 2.4000000000000003e-06, "loss": 1.763, "step": 24 }, { "epoch": 0.0013478542160879879, "grad_norm": 4.127845287322998, "learning_rate": 2.5e-06, "loss": 1.5869, "step": 25 }, { "epoch": 0.0014017683847315074, "grad_norm": 3.7648322582244873, "learning_rate": 2.6e-06, "loss": 1.5599, "step": 26 }, { "epoch": 0.001455682553375027, "grad_norm": 3.5424962043762207, "learning_rate": 2.7000000000000004e-06, "loss": 1.4703, "step": 27 }, { "epoch": 0.0015095967220185465, "grad_norm": 3.3707985877990723, "learning_rate": 2.8000000000000003e-06, "loss": 1.354, "step": 28 }, { "epoch": 0.001563510890662066, "grad_norm": 4.71254825592041, "learning_rate": 2.9e-06, "loss": 1.8162, "step": 29 }, { "epoch": 0.0016174250593055854, "grad_norm": 3.7660300731658936, "learning_rate": 3e-06, "loss": 1.5951, "step": 30 }, { "epoch": 0.001671339227949105, "grad_norm": 3.4810571670532227, "learning_rate": 3.1000000000000004e-06, "loss": 1.5183, "step": 31 }, { "epoch": 0.0017252533965926246, "grad_norm": 3.672693967819214, "learning_rate": 3.2000000000000003e-06, "loss": 1.6374, "step": 32 }, { "epoch": 0.001779167565236144, "grad_norm": 3.3589682579040527, "learning_rate": 3.3000000000000006e-06, "loss": 1.4371, "step": 33 }, { "epoch": 0.0018330817338796635, "grad_norm": 3.6365807056427, "learning_rate": 3.4000000000000005e-06, "loss": 1.595, "step": 34 }, { "epoch": 0.0018869959025231832, "grad_norm": 3.6467039585113525, "learning_rate": 3.5e-06, "loss": 1.5714, "step": 35 }, { "epoch": 0.0019409100711667026, "grad_norm": 3.4684648513793945, "learning_rate": 3.6000000000000003e-06, "loss": 1.4897, "step": 36 }, { "epoch": 0.001994824239810222, "grad_norm": 3.70845627784729, "learning_rate": 3.7e-06, "loss": 1.5954, "step": 37 }, { "epoch": 0.002048738408453742, "grad_norm": 3.1803395748138428, "learning_rate": 3.8000000000000005e-06, "loss": 1.3976, "step": 38 }, { "epoch": 0.002102652577097261, "grad_norm": 2.851703405380249, "learning_rate": 3.900000000000001e-06, "loss": 1.1894, "step": 39 }, { "epoch": 0.0021565667457407807, "grad_norm": 2.832003593444824, "learning_rate": 4.000000000000001e-06, "loss": 1.353, "step": 40 }, { "epoch": 0.0022104809143843004, "grad_norm": 3.397498607635498, "learning_rate": 4.1e-06, "loss": 1.4541, "step": 41 }, { "epoch": 0.0022643950830278196, "grad_norm": 3.4537954330444336, "learning_rate": 4.2000000000000004e-06, "loss": 1.4475, "step": 42 }, { "epoch": 0.0023183092516713393, "grad_norm": 3.1131632328033447, "learning_rate": 4.3e-06, "loss": 1.2707, "step": 43 }, { "epoch": 0.0023722234203148586, "grad_norm": 3.0421881675720215, "learning_rate": 4.4e-06, "loss": 1.3418, "step": 44 }, { "epoch": 0.0024261375889583782, "grad_norm": 3.528514862060547, "learning_rate": 4.5e-06, "loss": 1.4432, "step": 45 }, { "epoch": 0.002480051757601898, "grad_norm": 3.6783225536346436, "learning_rate": 4.600000000000001e-06, "loss": 1.4863, "step": 46 }, { "epoch": 0.002533965926245417, "grad_norm": 2.9829189777374268, "learning_rate": 4.7e-06, "loss": 1.2856, "step": 47 }, { "epoch": 0.002587880094888937, "grad_norm": 3.4480350017547607, "learning_rate": 4.800000000000001e-06, "loss": 1.4129, "step": 48 }, { "epoch": 0.0026417942635324565, "grad_norm": 3.4247214794158936, "learning_rate": 4.9000000000000005e-06, "loss": 1.3467, "step": 49 }, { "epoch": 0.0026957084321759758, "grad_norm": 3.5268948078155518, "learning_rate": 5e-06, "loss": 1.4795, "step": 50 }, { "epoch": 0.0027496226008194955, "grad_norm": 3.3228304386138916, "learning_rate": 5.1e-06, "loss": 1.461, "step": 51 }, { "epoch": 0.0028035367694630147, "grad_norm": 3.365630865097046, "learning_rate": 5.2e-06, "loss": 1.2947, "step": 52 }, { "epoch": 0.0028574509381065344, "grad_norm": 3.4889328479766846, "learning_rate": 5.300000000000001e-06, "loss": 1.432, "step": 53 }, { "epoch": 0.002911365106750054, "grad_norm": 3.5767273902893066, "learning_rate": 5.400000000000001e-06, "loss": 1.3773, "step": 54 }, { "epoch": 0.0029652792753935733, "grad_norm": 3.499298095703125, "learning_rate": 5.500000000000001e-06, "loss": 1.4132, "step": 55 }, { "epoch": 0.003019193444037093, "grad_norm": 3.6990244388580322, "learning_rate": 5.600000000000001e-06, "loss": 1.4595, "step": 56 }, { "epoch": 0.0030731076126806127, "grad_norm": 3.0908327102661133, "learning_rate": 5.7e-06, "loss": 1.1873, "step": 57 }, { "epoch": 0.003127021781324132, "grad_norm": 3.149425745010376, "learning_rate": 5.8e-06, "loss": 1.3306, "step": 58 }, { "epoch": 0.0031809359499676516, "grad_norm": 3.193023204803467, "learning_rate": 5.9e-06, "loss": 1.3326, "step": 59 }, { "epoch": 0.003234850118611171, "grad_norm": 3.610344409942627, "learning_rate": 6e-06, "loss": 1.4527, "step": 60 }, { "epoch": 0.0032887642872546905, "grad_norm": 2.9877095222473145, "learning_rate": 6.1e-06, "loss": 1.2029, "step": 61 }, { "epoch": 0.00334267845589821, "grad_norm": 3.0241923332214355, "learning_rate": 6.200000000000001e-06, "loss": 1.3413, "step": 62 }, { "epoch": 0.0033965926245417295, "grad_norm": 3.212700366973877, "learning_rate": 6.300000000000001e-06, "loss": 1.3471, "step": 63 }, { "epoch": 0.003450506793185249, "grad_norm": 2.7138960361480713, "learning_rate": 6.4000000000000006e-06, "loss": 1.0885, "step": 64 }, { "epoch": 0.0035044209618287684, "grad_norm": 2.5690340995788574, "learning_rate": 6.5000000000000004e-06, "loss": 1.1168, "step": 65 }, { "epoch": 0.003558335130472288, "grad_norm": 3.0344784259796143, "learning_rate": 6.600000000000001e-06, "loss": 1.2828, "step": 66 }, { "epoch": 0.0036122492991158077, "grad_norm": 3.0589816570281982, "learning_rate": 6.700000000000001e-06, "loss": 1.2604, "step": 67 }, { "epoch": 0.003666163467759327, "grad_norm": 2.676417112350464, "learning_rate": 6.800000000000001e-06, "loss": 1.1679, "step": 68 }, { "epoch": 0.0037200776364028467, "grad_norm": 2.6590960025787354, "learning_rate": 6.9e-06, "loss": 1.2283, "step": 69 }, { "epoch": 0.0037739918050463664, "grad_norm": 2.6973354816436768, "learning_rate": 7e-06, "loss": 1.2028, "step": 70 }, { "epoch": 0.0038279059736898856, "grad_norm": 2.7046608924865723, "learning_rate": 7.100000000000001e-06, "loss": 1.2629, "step": 71 }, { "epoch": 0.0038818201423334053, "grad_norm": 2.2172696590423584, "learning_rate": 7.2000000000000005e-06, "loss": 1.1367, "step": 72 }, { "epoch": 0.0039357343109769245, "grad_norm": 2.6138789653778076, "learning_rate": 7.3e-06, "loss": 1.3167, "step": 73 }, { "epoch": 0.003989648479620444, "grad_norm": 2.2926838397979736, "learning_rate": 7.4e-06, "loss": 1.2909, "step": 74 }, { "epoch": 0.004043562648263964, "grad_norm": 2.0647220611572266, "learning_rate": 7.500000000000001e-06, "loss": 1.2054, "step": 75 }, { "epoch": 0.004097476816907484, "grad_norm": 2.1190452575683594, "learning_rate": 7.600000000000001e-06, "loss": 1.1497, "step": 76 }, { "epoch": 0.004151390985551002, "grad_norm": 1.9973243474960327, "learning_rate": 7.7e-06, "loss": 1.1997, "step": 77 }, { "epoch": 0.004205305154194522, "grad_norm": 2.11751651763916, "learning_rate": 7.800000000000002e-06, "loss": 1.2181, "step": 78 }, { "epoch": 0.004259219322838042, "grad_norm": 1.8975950479507446, "learning_rate": 7.9e-06, "loss": 1.1582, "step": 79 }, { "epoch": 0.004313133491481561, "grad_norm": 1.8368147611618042, "learning_rate": 8.000000000000001e-06, "loss": 1.1389, "step": 80 }, { "epoch": 0.004367047660125081, "grad_norm": 1.7472988367080688, "learning_rate": 8.1e-06, "loss": 1.0959, "step": 81 }, { "epoch": 0.004420961828768601, "grad_norm": 1.7325443029403687, "learning_rate": 8.2e-06, "loss": 1.1847, "step": 82 }, { "epoch": 0.00447487599741212, "grad_norm": 1.6171561479568481, "learning_rate": 8.3e-06, "loss": 0.9834, "step": 83 }, { "epoch": 0.004528790166055639, "grad_norm": 1.6583327054977417, "learning_rate": 8.400000000000001e-06, "loss": 1.0413, "step": 84 }, { "epoch": 0.004582704334699159, "grad_norm": 1.8914967775344849, "learning_rate": 8.5e-06, "loss": 1.2413, "step": 85 }, { "epoch": 0.004636618503342679, "grad_norm": 1.6018317937850952, "learning_rate": 8.6e-06, "loss": 1.0577, "step": 86 }, { "epoch": 0.004690532671986198, "grad_norm": 1.9170053005218506, "learning_rate": 8.700000000000001e-06, "loss": 1.2463, "step": 87 }, { "epoch": 0.004744446840629717, "grad_norm": 1.666536569595337, "learning_rate": 8.8e-06, "loss": 1.0532, "step": 88 }, { "epoch": 0.004798361009273237, "grad_norm": 1.660115361213684, "learning_rate": 8.900000000000001e-06, "loss": 1.0514, "step": 89 }, { "epoch": 0.0048522751779167565, "grad_norm": 1.8667477369308472, "learning_rate": 9e-06, "loss": 1.2039, "step": 90 }, { "epoch": 0.004906189346560276, "grad_norm": 1.9490039348602295, "learning_rate": 9.100000000000001e-06, "loss": 1.1804, "step": 91 }, { "epoch": 0.004960103515203796, "grad_norm": 1.8415377140045166, "learning_rate": 9.200000000000002e-06, "loss": 1.1435, "step": 92 }, { "epoch": 0.005014017683847315, "grad_norm": 1.8571438789367676, "learning_rate": 9.3e-06, "loss": 1.0974, "step": 93 }, { "epoch": 0.005067931852490834, "grad_norm": 1.8480113744735718, "learning_rate": 9.4e-06, "loss": 1.149, "step": 94 }, { "epoch": 0.005121846021134354, "grad_norm": 2.003490447998047, "learning_rate": 9.5e-06, "loss": 1.1954, "step": 95 }, { "epoch": 0.005175760189777874, "grad_norm": 1.8002668619155884, "learning_rate": 9.600000000000001e-06, "loss": 0.9953, "step": 96 }, { "epoch": 0.005229674358421393, "grad_norm": 1.9040817022323608, "learning_rate": 9.7e-06, "loss": 1.1195, "step": 97 }, { "epoch": 0.005283588527064913, "grad_norm": 1.8311433792114258, "learning_rate": 9.800000000000001e-06, "loss": 1.083, "step": 98 }, { "epoch": 0.005337502695708432, "grad_norm": 1.9509624242782593, "learning_rate": 9.9e-06, "loss": 1.176, "step": 99 }, { "epoch": 0.0053914168643519516, "grad_norm": 2.0624589920043945, "learning_rate": 1e-05, "loss": 1.119, "step": 100 }, { "epoch": 0.005445331032995471, "grad_norm": 1.9618796110153198, "learning_rate": 9.999999995505339e-06, "loss": 1.1371, "step": 101 }, { "epoch": 0.005499245201638991, "grad_norm": 1.946245551109314, "learning_rate": 9.999999982021349e-06, "loss": 0.9736, "step": 102 }, { "epoch": 0.005553159370282511, "grad_norm": 1.9871301651000977, "learning_rate": 9.999999959548035e-06, "loss": 1.1077, "step": 103 }, { "epoch": 0.005607073538926029, "grad_norm": 1.86216402053833, "learning_rate": 9.999999928085396e-06, "loss": 1.0882, "step": 104 }, { "epoch": 0.005660987707569549, "grad_norm": 1.8447723388671875, "learning_rate": 9.999999887633432e-06, "loss": 1.0344, "step": 105 }, { "epoch": 0.005714901876213069, "grad_norm": 1.8345638513565063, "learning_rate": 9.99999983819214e-06, "loss": 1.1077, "step": 106 }, { "epoch": 0.0057688160448565885, "grad_norm": 1.8410178422927856, "learning_rate": 9.999999779761524e-06, "loss": 1.0824, "step": 107 }, { "epoch": 0.005822730213500108, "grad_norm": 1.5881969928741455, "learning_rate": 9.999999712341583e-06, "loss": 0.9439, "step": 108 }, { "epoch": 0.005876644382143627, "grad_norm": 1.6704047918319702, "learning_rate": 9.999999635932316e-06, "loss": 1.033, "step": 109 }, { "epoch": 0.005930558550787147, "grad_norm": 1.792449712753296, "learning_rate": 9.999999550533726e-06, "loss": 1.0279, "step": 110 }, { "epoch": 0.005984472719430666, "grad_norm": 1.6515668630599976, "learning_rate": 9.999999456145809e-06, "loss": 1.0301, "step": 111 }, { "epoch": 0.006038386888074186, "grad_norm": 1.8541395664215088, "learning_rate": 9.999999352768568e-06, "loss": 1.1057, "step": 112 }, { "epoch": 0.006092301056717706, "grad_norm": 1.6490236520767212, "learning_rate": 9.999999240402002e-06, "loss": 1.0523, "step": 113 }, { "epoch": 0.006146215225361225, "grad_norm": 1.655333161354065, "learning_rate": 9.999999119046113e-06, "loss": 1.0448, "step": 114 }, { "epoch": 0.006200129394004744, "grad_norm": 1.5721609592437744, "learning_rate": 9.999998988700899e-06, "loss": 0.9883, "step": 115 }, { "epoch": 0.006254043562648264, "grad_norm": 1.6411349773406982, "learning_rate": 9.99999884936636e-06, "loss": 1.0255, "step": 116 }, { "epoch": 0.0063079577312917835, "grad_norm": 1.6399502754211426, "learning_rate": 9.999998701042501e-06, "loss": 1.0146, "step": 117 }, { "epoch": 0.006361871899935303, "grad_norm": 1.615026831626892, "learning_rate": 9.999998543729316e-06, "loss": 1.0022, "step": 118 }, { "epoch": 0.006415786068578823, "grad_norm": 1.4867664575576782, "learning_rate": 9.99999837742681e-06, "loss": 1.0164, "step": 119 }, { "epoch": 0.006469700237222342, "grad_norm": 1.540153980255127, "learning_rate": 9.999998202134979e-06, "loss": 0.989, "step": 120 }, { "epoch": 0.006523614405865861, "grad_norm": 1.5535691976547241, "learning_rate": 9.999998017853825e-06, "loss": 0.9942, "step": 121 }, { "epoch": 0.006577528574509381, "grad_norm": 1.4892929792404175, "learning_rate": 9.999997824583351e-06, "loss": 1.0537, "step": 122 }, { "epoch": 0.006631442743152901, "grad_norm": 1.4674094915390015, "learning_rate": 9.999997622323554e-06, "loss": 1.0239, "step": 123 }, { "epoch": 0.00668535691179642, "grad_norm": 1.394027590751648, "learning_rate": 9.999997411074436e-06, "loss": 0.9781, "step": 124 }, { "epoch": 0.006739271080439939, "grad_norm": 1.372728705406189, "learning_rate": 9.999997190835999e-06, "loss": 1.0433, "step": 125 }, { "epoch": 0.006793185249083459, "grad_norm": 1.2535908222198486, "learning_rate": 9.999996961608238e-06, "loss": 0.958, "step": 126 }, { "epoch": 0.006847099417726979, "grad_norm": 1.337633490562439, "learning_rate": 9.999996723391158e-06, "loss": 1.0213, "step": 127 }, { "epoch": 0.006901013586370498, "grad_norm": 1.3640319108963013, "learning_rate": 9.999996476184759e-06, "loss": 1.0432, "step": 128 }, { "epoch": 0.006954927755014018, "grad_norm": 1.2663391828536987, "learning_rate": 9.99999621998904e-06, "loss": 1.0154, "step": 129 }, { "epoch": 0.007008841923657537, "grad_norm": 1.450737476348877, "learning_rate": 9.999995954804004e-06, "loss": 1.0074, "step": 130 }, { "epoch": 0.0070627560923010565, "grad_norm": 1.2757987976074219, "learning_rate": 9.999995680629649e-06, "loss": 0.9996, "step": 131 }, { "epoch": 0.007116670260944576, "grad_norm": 1.3978132009506226, "learning_rate": 9.999995397465974e-06, "loss": 1.04, "step": 132 }, { "epoch": 0.007170584429588096, "grad_norm": 1.3167297840118408, "learning_rate": 9.999995105312982e-06, "loss": 1.0069, "step": 133 }, { "epoch": 0.0072244985982316155, "grad_norm": 1.1626744270324707, "learning_rate": 9.999994804170674e-06, "loss": 0.9722, "step": 134 }, { "epoch": 0.007278412766875135, "grad_norm": 1.354797601699829, "learning_rate": 9.99999449403905e-06, "loss": 0.9019, "step": 135 }, { "epoch": 0.007332326935518654, "grad_norm": 1.2605732679367065, "learning_rate": 9.99999417491811e-06, "loss": 1.0038, "step": 136 }, { "epoch": 0.007386241104162174, "grad_norm": 1.3804657459259033, "learning_rate": 9.999993846807855e-06, "loss": 1.0139, "step": 137 }, { "epoch": 0.007440155272805693, "grad_norm": 1.3001742362976074, "learning_rate": 9.999993509708286e-06, "loss": 1.1436, "step": 138 }, { "epoch": 0.007494069441449213, "grad_norm": 1.2776422500610352, "learning_rate": 9.999993163619401e-06, "loss": 0.9792, "step": 139 }, { "epoch": 0.007547983610092733, "grad_norm": 1.2149187326431274, "learning_rate": 9.999992808541204e-06, "loss": 0.963, "step": 140 }, { "epoch": 0.0076018977787362515, "grad_norm": 1.341806173324585, "learning_rate": 9.999992444473694e-06, "loss": 0.9639, "step": 141 }, { "epoch": 0.007655811947379771, "grad_norm": 1.2565757036209106, "learning_rate": 9.999992071416874e-06, "loss": 0.9193, "step": 142 }, { "epoch": 0.007709726116023291, "grad_norm": 1.3059918880462646, "learning_rate": 9.99999168937074e-06, "loss": 0.9632, "step": 143 }, { "epoch": 0.0077636402846668106, "grad_norm": 1.1719332933425903, "learning_rate": 9.999991298335295e-06, "loss": 0.9687, "step": 144 }, { "epoch": 0.00781755445331033, "grad_norm": 1.125950813293457, "learning_rate": 9.999990898310542e-06, "loss": 0.968, "step": 145 }, { "epoch": 0.007871468621953849, "grad_norm": 1.2400416135787964, "learning_rate": 9.999990489296478e-06, "loss": 0.972, "step": 146 }, { "epoch": 0.007925382790597369, "grad_norm": 1.172117829322815, "learning_rate": 9.999990071293106e-06, "loss": 0.9243, "step": 147 }, { "epoch": 0.007979296959240888, "grad_norm": 1.240317463874817, "learning_rate": 9.999989644300427e-06, "loss": 1.0655, "step": 148 }, { "epoch": 0.008033211127884408, "grad_norm": 1.1535708904266357, "learning_rate": 9.999989208318438e-06, "loss": 0.9871, "step": 149 }, { "epoch": 0.008087125296527928, "grad_norm": 1.2711198329925537, "learning_rate": 9.999988763347145e-06, "loss": 1.0307, "step": 150 }, { "epoch": 0.008141039465171447, "grad_norm": 1.2345954179763794, "learning_rate": 9.999988309386548e-06, "loss": 1.1343, "step": 151 }, { "epoch": 0.008194953633814967, "grad_norm": 1.2489601373672485, "learning_rate": 9.999987846436645e-06, "loss": 1.0303, "step": 152 }, { "epoch": 0.008248867802458487, "grad_norm": 1.264240026473999, "learning_rate": 9.999987374497439e-06, "loss": 0.9562, "step": 153 }, { "epoch": 0.008302781971102005, "grad_norm": 1.2613575458526611, "learning_rate": 9.99998689356893e-06, "loss": 0.954, "step": 154 }, { "epoch": 0.008356696139745524, "grad_norm": 1.2091072797775269, "learning_rate": 9.999986403651116e-06, "loss": 1.0734, "step": 155 }, { "epoch": 0.008410610308389044, "grad_norm": 1.18421471118927, "learning_rate": 9.999985904744002e-06, "loss": 0.9167, "step": 156 }, { "epoch": 0.008464524477032564, "grad_norm": 1.0399659872055054, "learning_rate": 9.99998539684759e-06, "loss": 0.9068, "step": 157 }, { "epoch": 0.008518438645676083, "grad_norm": 1.1292288303375244, "learning_rate": 9.999984879961877e-06, "loss": 1.0027, "step": 158 }, { "epoch": 0.008572352814319603, "grad_norm": 1.2592105865478516, "learning_rate": 9.999984354086867e-06, "loss": 1.0794, "step": 159 }, { "epoch": 0.008626266982963123, "grad_norm": 1.1646504402160645, "learning_rate": 9.999983819222558e-06, "loss": 1.0468, "step": 160 }, { "epoch": 0.008680181151606643, "grad_norm": 1.156711220741272, "learning_rate": 9.999983275368952e-06, "loss": 0.9053, "step": 161 }, { "epoch": 0.008734095320250162, "grad_norm": 1.1169341802597046, "learning_rate": 9.999982722526051e-06, "loss": 0.97, "step": 162 }, { "epoch": 0.008788009488893682, "grad_norm": 1.3474149703979492, "learning_rate": 9.999982160693856e-06, "loss": 1.0221, "step": 163 }, { "epoch": 0.008841923657537202, "grad_norm": 1.2021468877792358, "learning_rate": 9.999981589872368e-06, "loss": 0.9303, "step": 164 }, { "epoch": 0.00889583782618072, "grad_norm": 1.0625534057617188, "learning_rate": 9.999981010061586e-06, "loss": 0.8765, "step": 165 }, { "epoch": 0.00894975199482424, "grad_norm": 1.2688498497009277, "learning_rate": 9.999980421261512e-06, "loss": 1.0163, "step": 166 }, { "epoch": 0.009003666163467759, "grad_norm": 1.122948408126831, "learning_rate": 9.999979823472148e-06, "loss": 0.9953, "step": 167 }, { "epoch": 0.009057580332111279, "grad_norm": 1.1817872524261475, "learning_rate": 9.999979216693495e-06, "loss": 1.0774, "step": 168 }, { "epoch": 0.009111494500754798, "grad_norm": 1.1483280658721924, "learning_rate": 9.999978600925553e-06, "loss": 1.0105, "step": 169 }, { "epoch": 0.009165408669398318, "grad_norm": 1.4039335250854492, "learning_rate": 9.999977976168325e-06, "loss": 0.944, "step": 170 }, { "epoch": 0.009219322838041838, "grad_norm": 1.1459723711013794, "learning_rate": 9.999977342421812e-06, "loss": 0.9208, "step": 171 }, { "epoch": 0.009273237006685357, "grad_norm": 1.0897774696350098, "learning_rate": 9.999976699686011e-06, "loss": 0.8719, "step": 172 }, { "epoch": 0.009327151175328877, "grad_norm": 1.206467866897583, "learning_rate": 9.999976047960928e-06, "loss": 1.0645, "step": 173 }, { "epoch": 0.009381065343972397, "grad_norm": 1.004550814628601, "learning_rate": 9.999975387246563e-06, "loss": 0.9317, "step": 174 }, { "epoch": 0.009434979512615916, "grad_norm": 1.2359992265701294, "learning_rate": 9.999974717542916e-06, "loss": 1.1136, "step": 175 }, { "epoch": 0.009488893681259434, "grad_norm": 1.1922352313995361, "learning_rate": 9.999974038849989e-06, "loss": 1.0307, "step": 176 }, { "epoch": 0.009542807849902954, "grad_norm": 1.1597613096237183, "learning_rate": 9.999973351167782e-06, "loss": 1.0275, "step": 177 }, { "epoch": 0.009596722018546474, "grad_norm": 1.172133445739746, "learning_rate": 9.999972654496298e-06, "loss": 0.9269, "step": 178 }, { "epoch": 0.009650636187189993, "grad_norm": 1.1879733800888062, "learning_rate": 9.999971948835538e-06, "loss": 0.9547, "step": 179 }, { "epoch": 0.009704550355833513, "grad_norm": 1.0029833316802979, "learning_rate": 9.999971234185502e-06, "loss": 0.8994, "step": 180 }, { "epoch": 0.009758464524477033, "grad_norm": 1.0769891738891602, "learning_rate": 9.999970510546194e-06, "loss": 0.9107, "step": 181 }, { "epoch": 0.009812378693120552, "grad_norm": 1.3288064002990723, "learning_rate": 9.99996977791761e-06, "loss": 1.0116, "step": 182 }, { "epoch": 0.009866292861764072, "grad_norm": 1.142452597618103, "learning_rate": 9.999969036299757e-06, "loss": 0.9367, "step": 183 }, { "epoch": 0.009920207030407592, "grad_norm": 1.2458518743515015, "learning_rate": 9.999968285692632e-06, "loss": 1.1398, "step": 184 }, { "epoch": 0.009974121199051111, "grad_norm": 1.3373422622680664, "learning_rate": 9.99996752609624e-06, "loss": 0.959, "step": 185 }, { "epoch": 0.01002803536769463, "grad_norm": 1.2288920879364014, "learning_rate": 9.99996675751058e-06, "loss": 0.9908, "step": 186 }, { "epoch": 0.010081949536338149, "grad_norm": 1.1954001188278198, "learning_rate": 9.999965979935656e-06, "loss": 0.9332, "step": 187 }, { "epoch": 0.010135863704981669, "grad_norm": 1.171021819114685, "learning_rate": 9.999965193371466e-06, "loss": 0.9119, "step": 188 }, { "epoch": 0.010189777873625188, "grad_norm": 1.025169014930725, "learning_rate": 9.999964397818013e-06, "loss": 0.784, "step": 189 }, { "epoch": 0.010243692042268708, "grad_norm": 1.1340326070785522, "learning_rate": 9.999963593275298e-06, "loss": 1.0036, "step": 190 }, { "epoch": 0.010297606210912228, "grad_norm": 1.0302847623825073, "learning_rate": 9.999962779743324e-06, "loss": 0.8293, "step": 191 }, { "epoch": 0.010351520379555747, "grad_norm": 1.2410109043121338, "learning_rate": 9.99996195722209e-06, "loss": 0.9507, "step": 192 }, { "epoch": 0.010405434548199267, "grad_norm": 1.2054308652877808, "learning_rate": 9.9999611257116e-06, "loss": 0.9356, "step": 193 }, { "epoch": 0.010459348716842787, "grad_norm": 1.2046679258346558, "learning_rate": 9.999960285211853e-06, "loss": 1.0638, "step": 194 }, { "epoch": 0.010513262885486306, "grad_norm": 1.4594306945800781, "learning_rate": 9.999959435722852e-06, "loss": 0.9624, "step": 195 }, { "epoch": 0.010567177054129826, "grad_norm": 1.0909247398376465, "learning_rate": 9.999958577244598e-06, "loss": 0.9503, "step": 196 }, { "epoch": 0.010621091222773344, "grad_norm": 1.1524754762649536, "learning_rate": 9.999957709777094e-06, "loss": 0.8954, "step": 197 }, { "epoch": 0.010675005391416864, "grad_norm": 1.4128906726837158, "learning_rate": 9.99995683332034e-06, "loss": 0.8903, "step": 198 }, { "epoch": 0.010728919560060383, "grad_norm": 1.1304652690887451, "learning_rate": 9.999955947874338e-06, "loss": 0.9247, "step": 199 }, { "epoch": 0.010782833728703903, "grad_norm": 1.2978957891464233, "learning_rate": 9.99995505343909e-06, "loss": 0.9473, "step": 200 }, { "epoch": 0.010836747897347423, "grad_norm": 1.0742554664611816, "learning_rate": 9.999954150014595e-06, "loss": 0.9626, "step": 201 }, { "epoch": 0.010890662065990942, "grad_norm": 1.0707745552062988, "learning_rate": 9.999953237600859e-06, "loss": 0.8721, "step": 202 }, { "epoch": 0.010944576234634462, "grad_norm": 1.17974853515625, "learning_rate": 9.99995231619788e-06, "loss": 1.0059, "step": 203 }, { "epoch": 0.010998490403277982, "grad_norm": 1.0108370780944824, "learning_rate": 9.999951385805662e-06, "loss": 0.9527, "step": 204 }, { "epoch": 0.011052404571921502, "grad_norm": 0.9983445405960083, "learning_rate": 9.999950446424204e-06, "loss": 0.7626, "step": 205 }, { "epoch": 0.011106318740565021, "grad_norm": 1.0860002040863037, "learning_rate": 9.99994949805351e-06, "loss": 0.9591, "step": 206 }, { "epoch": 0.01116023290920854, "grad_norm": 1.0447322130203247, "learning_rate": 9.999948540693584e-06, "loss": 0.9861, "step": 207 }, { "epoch": 0.011214147077852059, "grad_norm": 1.2582998275756836, "learning_rate": 9.999947574344423e-06, "loss": 0.8949, "step": 208 }, { "epoch": 0.011268061246495579, "grad_norm": 1.1507002115249634, "learning_rate": 9.99994659900603e-06, "loss": 0.918, "step": 209 }, { "epoch": 0.011321975415139098, "grad_norm": 1.135169267654419, "learning_rate": 9.999945614678408e-06, "loss": 0.9891, "step": 210 }, { "epoch": 0.011375889583782618, "grad_norm": 1.1746275424957275, "learning_rate": 9.999944621361558e-06, "loss": 1.0186, "step": 211 }, { "epoch": 0.011429803752426138, "grad_norm": 1.1137248277664185, "learning_rate": 9.999943619055483e-06, "loss": 0.9584, "step": 212 }, { "epoch": 0.011483717921069657, "grad_norm": 1.336651086807251, "learning_rate": 9.999942607760182e-06, "loss": 1.091, "step": 213 }, { "epoch": 0.011537632089713177, "grad_norm": 1.1966856718063354, "learning_rate": 9.999941587475658e-06, "loss": 0.9761, "step": 214 }, { "epoch": 0.011591546258356697, "grad_norm": 1.0843144655227661, "learning_rate": 9.999940558201915e-06, "loss": 0.8917, "step": 215 }, { "epoch": 0.011645460427000216, "grad_norm": 1.2089293003082275, "learning_rate": 9.999939519938953e-06, "loss": 0.9704, "step": 216 }, { "epoch": 0.011699374595643736, "grad_norm": 1.2409982681274414, "learning_rate": 9.999938472686775e-06, "loss": 0.9949, "step": 217 }, { "epoch": 0.011753288764287254, "grad_norm": 1.1310094594955444, "learning_rate": 9.99993741644538e-06, "loss": 0.9666, "step": 218 }, { "epoch": 0.011807202932930774, "grad_norm": 1.120510220527649, "learning_rate": 9.999936351214772e-06, "loss": 0.8844, "step": 219 }, { "epoch": 0.011861117101574293, "grad_norm": 1.0931518077850342, "learning_rate": 9.999935276994954e-06, "loss": 0.9647, "step": 220 }, { "epoch": 0.011915031270217813, "grad_norm": 1.2821122407913208, "learning_rate": 9.999934193785926e-06, "loss": 1.0533, "step": 221 }, { "epoch": 0.011968945438861333, "grad_norm": 1.183580756187439, "learning_rate": 9.999933101587691e-06, "loss": 0.9196, "step": 222 }, { "epoch": 0.012022859607504852, "grad_norm": 1.045825719833374, "learning_rate": 9.99993200040025e-06, "loss": 0.8953, "step": 223 }, { "epoch": 0.012076773776148372, "grad_norm": 1.0963969230651855, "learning_rate": 9.999930890223605e-06, "loss": 0.9723, "step": 224 }, { "epoch": 0.012130687944791892, "grad_norm": 1.0356731414794922, "learning_rate": 9.999929771057761e-06, "loss": 1.0215, "step": 225 }, { "epoch": 0.012184602113435411, "grad_norm": 1.112277626991272, "learning_rate": 9.999928642902717e-06, "loss": 0.9886, "step": 226 }, { "epoch": 0.012238516282078931, "grad_norm": 0.9969072937965393, "learning_rate": 9.999927505758475e-06, "loss": 0.8601, "step": 227 }, { "epoch": 0.01229243045072245, "grad_norm": 1.123781442642212, "learning_rate": 9.999926359625036e-06, "loss": 0.9894, "step": 228 }, { "epoch": 0.012346344619365969, "grad_norm": 1.2122100591659546, "learning_rate": 9.999925204502406e-06, "loss": 1.0783, "step": 229 }, { "epoch": 0.012400258788009488, "grad_norm": 1.1256672143936157, "learning_rate": 9.999924040390584e-06, "loss": 0.9116, "step": 230 }, { "epoch": 0.012454172956653008, "grad_norm": 1.0646952390670776, "learning_rate": 9.999922867289573e-06, "loss": 0.8993, "step": 231 }, { "epoch": 0.012508087125296528, "grad_norm": 1.194676399230957, "learning_rate": 9.999921685199376e-06, "loss": 1.0377, "step": 232 }, { "epoch": 0.012562001293940047, "grad_norm": 1.0519152879714966, "learning_rate": 9.999920494119992e-06, "loss": 0.8283, "step": 233 }, { "epoch": 0.012615915462583567, "grad_norm": 1.243249773979187, "learning_rate": 9.999919294051427e-06, "loss": 0.9741, "step": 234 }, { "epoch": 0.012669829631227087, "grad_norm": 1.1071687936782837, "learning_rate": 9.999918084993681e-06, "loss": 1.0402, "step": 235 }, { "epoch": 0.012723743799870606, "grad_norm": 1.1224809885025024, "learning_rate": 9.999916866946757e-06, "loss": 0.8793, "step": 236 }, { "epoch": 0.012777657968514126, "grad_norm": 1.0458532571792603, "learning_rate": 9.999915639910656e-06, "loss": 0.9855, "step": 237 }, { "epoch": 0.012831572137157646, "grad_norm": 1.0610811710357666, "learning_rate": 9.999914403885383e-06, "loss": 0.8092, "step": 238 }, { "epoch": 0.012885486305801164, "grad_norm": 1.2818992137908936, "learning_rate": 9.999913158870936e-06, "loss": 1.0101, "step": 239 }, { "epoch": 0.012939400474444683, "grad_norm": 1.110400915145874, "learning_rate": 9.999911904867319e-06, "loss": 0.9782, "step": 240 }, { "epoch": 0.012993314643088203, "grad_norm": 1.3290835618972778, "learning_rate": 9.999910641874537e-06, "loss": 1.0683, "step": 241 }, { "epoch": 0.013047228811731723, "grad_norm": 1.1448980569839478, "learning_rate": 9.999909369892588e-06, "loss": 0.9223, "step": 242 }, { "epoch": 0.013101142980375242, "grad_norm": 1.1710877418518066, "learning_rate": 9.999908088921477e-06, "loss": 0.8022, "step": 243 }, { "epoch": 0.013155057149018762, "grad_norm": 1.1242793798446655, "learning_rate": 9.999906798961207e-06, "loss": 0.9238, "step": 244 }, { "epoch": 0.013208971317662282, "grad_norm": 1.0338802337646484, "learning_rate": 9.999905500011778e-06, "loss": 0.8386, "step": 245 }, { "epoch": 0.013262885486305801, "grad_norm": 1.0910224914550781, "learning_rate": 9.999904192073193e-06, "loss": 0.937, "step": 246 }, { "epoch": 0.013316799654949321, "grad_norm": 1.297788143157959, "learning_rate": 9.999902875145453e-06, "loss": 0.9054, "step": 247 }, { "epoch": 0.01337071382359284, "grad_norm": 1.1317543983459473, "learning_rate": 9.999901549228564e-06, "loss": 0.9418, "step": 248 }, { "epoch": 0.01342462799223636, "grad_norm": 1.0944132804870605, "learning_rate": 9.999900214322526e-06, "loss": 0.9445, "step": 249 }, { "epoch": 0.013478542160879878, "grad_norm": 1.4942843914031982, "learning_rate": 9.999898870427342e-06, "loss": 0.8956, "step": 250 }, { "epoch": 0.013532456329523398, "grad_norm": 1.0630019903182983, "learning_rate": 9.999897517543013e-06, "loss": 0.8381, "step": 251 }, { "epoch": 0.013586370498166918, "grad_norm": 1.65073561668396, "learning_rate": 9.999896155669544e-06, "loss": 1.0148, "step": 252 }, { "epoch": 0.013640284666810438, "grad_norm": 1.035731315612793, "learning_rate": 9.999894784806936e-06, "loss": 0.8092, "step": 253 }, { "epoch": 0.013694198835453957, "grad_norm": 1.308863639831543, "learning_rate": 9.99989340495519e-06, "loss": 0.9742, "step": 254 }, { "epoch": 0.013748113004097477, "grad_norm": 1.1512938737869263, "learning_rate": 9.999892016114313e-06, "loss": 0.8747, "step": 255 }, { "epoch": 0.013802027172740997, "grad_norm": 0.9977009296417236, "learning_rate": 9.9998906182843e-06, "loss": 0.8183, "step": 256 }, { "epoch": 0.013855941341384516, "grad_norm": 1.2228175401687622, "learning_rate": 9.99988921146516e-06, "loss": 0.9917, "step": 257 }, { "epoch": 0.013909855510028036, "grad_norm": 1.0753847360610962, "learning_rate": 9.999887795656896e-06, "loss": 1.0063, "step": 258 }, { "epoch": 0.013963769678671556, "grad_norm": 1.0010429620742798, "learning_rate": 9.999886370859506e-06, "loss": 0.9315, "step": 259 }, { "epoch": 0.014017683847315074, "grad_norm": 1.2038911581039429, "learning_rate": 9.999884937072995e-06, "loss": 0.8764, "step": 260 }, { "epoch": 0.014071598015958593, "grad_norm": 1.1268917322158813, "learning_rate": 9.999883494297365e-06, "loss": 1.0059, "step": 261 }, { "epoch": 0.014125512184602113, "grad_norm": 1.1053709983825684, "learning_rate": 9.999882042532619e-06, "loss": 0.8866, "step": 262 }, { "epoch": 0.014179426353245633, "grad_norm": 1.091145396232605, "learning_rate": 9.999880581778758e-06, "loss": 1.0415, "step": 263 }, { "epoch": 0.014233340521889152, "grad_norm": 1.0019958019256592, "learning_rate": 9.999879112035786e-06, "loss": 0.8177, "step": 264 }, { "epoch": 0.014287254690532672, "grad_norm": 1.1044156551361084, "learning_rate": 9.999877633303708e-06, "loss": 0.9508, "step": 265 }, { "epoch": 0.014341168859176192, "grad_norm": 0.9750218391418457, "learning_rate": 9.999876145582524e-06, "loss": 0.8501, "step": 266 }, { "epoch": 0.014395083027819711, "grad_norm": 1.4015804529190063, "learning_rate": 9.999874648872235e-06, "loss": 0.9491, "step": 267 }, { "epoch": 0.014448997196463231, "grad_norm": 1.066422939300537, "learning_rate": 9.999873143172848e-06, "loss": 1.0104, "step": 268 }, { "epoch": 0.01450291136510675, "grad_norm": 1.1133167743682861, "learning_rate": 9.99987162848436e-06, "loss": 1.0142, "step": 269 }, { "epoch": 0.01455682553375027, "grad_norm": 1.1259140968322754, "learning_rate": 9.999870104806782e-06, "loss": 0.9803, "step": 270 }, { "epoch": 0.014610739702393788, "grad_norm": 1.0813393592834473, "learning_rate": 9.999868572140108e-06, "loss": 0.8728, "step": 271 }, { "epoch": 0.014664653871037308, "grad_norm": 0.9939939379692078, "learning_rate": 9.999867030484347e-06, "loss": 0.8826, "step": 272 }, { "epoch": 0.014718568039680828, "grad_norm": 1.0081939697265625, "learning_rate": 9.999865479839499e-06, "loss": 0.8682, "step": 273 }, { "epoch": 0.014772482208324347, "grad_norm": 1.0190658569335938, "learning_rate": 9.999863920205567e-06, "loss": 0.9094, "step": 274 }, { "epoch": 0.014826396376967867, "grad_norm": 1.0702111721038818, "learning_rate": 9.999862351582553e-06, "loss": 0.9244, "step": 275 }, { "epoch": 0.014880310545611387, "grad_norm": 1.0891972780227661, "learning_rate": 9.999860773970461e-06, "loss": 1.0318, "step": 276 }, { "epoch": 0.014934224714254906, "grad_norm": 0.9788139462471008, "learning_rate": 9.999859187369294e-06, "loss": 0.8779, "step": 277 }, { "epoch": 0.014988138882898426, "grad_norm": 1.0678125619888306, "learning_rate": 9.999857591779055e-06, "loss": 0.8962, "step": 278 }, { "epoch": 0.015042053051541946, "grad_norm": 0.9882293343544006, "learning_rate": 9.999855987199747e-06, "loss": 0.9082, "step": 279 }, { "epoch": 0.015095967220185465, "grad_norm": 0.9987571835517883, "learning_rate": 9.999854373631371e-06, "loss": 0.9708, "step": 280 }, { "epoch": 0.015149881388828985, "grad_norm": 1.0238722562789917, "learning_rate": 9.99985275107393e-06, "loss": 0.9461, "step": 281 }, { "epoch": 0.015203795557472503, "grad_norm": 0.9628013372421265, "learning_rate": 9.999851119527431e-06, "loss": 0.9412, "step": 282 }, { "epoch": 0.015257709726116023, "grad_norm": 1.0021862983703613, "learning_rate": 9.999849478991873e-06, "loss": 0.8461, "step": 283 }, { "epoch": 0.015311623894759542, "grad_norm": 0.9776142239570618, "learning_rate": 9.99984782946726e-06, "loss": 0.962, "step": 284 }, { "epoch": 0.015365538063403062, "grad_norm": 1.0114799737930298, "learning_rate": 9.999846170953593e-06, "loss": 0.8732, "step": 285 }, { "epoch": 0.015419452232046582, "grad_norm": 0.9860401749610901, "learning_rate": 9.999844503450879e-06, "loss": 0.8204, "step": 286 }, { "epoch": 0.015473366400690101, "grad_norm": 1.0743263959884644, "learning_rate": 9.999842826959119e-06, "loss": 0.9445, "step": 287 }, { "epoch": 0.015527280569333621, "grad_norm": 1.0456606149673462, "learning_rate": 9.999841141478315e-06, "loss": 0.8869, "step": 288 }, { "epoch": 0.01558119473797714, "grad_norm": 1.0299748182296753, "learning_rate": 9.99983944700847e-06, "loss": 0.9543, "step": 289 }, { "epoch": 0.01563510890662066, "grad_norm": 1.0176036357879639, "learning_rate": 9.99983774354959e-06, "loss": 0.9672, "step": 290 }, { "epoch": 0.01568902307526418, "grad_norm": 1.0023303031921387, "learning_rate": 9.999836031101675e-06, "loss": 0.9417, "step": 291 }, { "epoch": 0.015742937243907698, "grad_norm": 0.9801005721092224, "learning_rate": 9.99983430966473e-06, "loss": 0.9376, "step": 292 }, { "epoch": 0.01579685141255122, "grad_norm": 1.002906322479248, "learning_rate": 9.999832579238756e-06, "loss": 0.8973, "step": 293 }, { "epoch": 0.015850765581194737, "grad_norm": 1.0014845132827759, "learning_rate": 9.999830839823759e-06, "loss": 0.9583, "step": 294 }, { "epoch": 0.01590467974983826, "grad_norm": 1.0173449516296387, "learning_rate": 9.999829091419739e-06, "loss": 0.9006, "step": 295 }, { "epoch": 0.015958593918481777, "grad_norm": 0.9779545664787292, "learning_rate": 9.999827334026702e-06, "loss": 0.9342, "step": 296 }, { "epoch": 0.016012508087125298, "grad_norm": 0.9800315499305725, "learning_rate": 9.999825567644648e-06, "loss": 0.7948, "step": 297 }, { "epoch": 0.016066422255768816, "grad_norm": 0.9628249406814575, "learning_rate": 9.999823792273583e-06, "loss": 0.8415, "step": 298 }, { "epoch": 0.016120336424412334, "grad_norm": 1.1227449178695679, "learning_rate": 9.99982200791351e-06, "loss": 0.9646, "step": 299 }, { "epoch": 0.016174250593055856, "grad_norm": 1.1018567085266113, "learning_rate": 9.99982021456443e-06, "loss": 0.8647, "step": 300 }, { "epoch": 0.016228164761699373, "grad_norm": 1.1017298698425293, "learning_rate": 9.999818412226347e-06, "loss": 0.8708, "step": 301 }, { "epoch": 0.016282078930342895, "grad_norm": 1.084594488143921, "learning_rate": 9.999816600899267e-06, "loss": 0.9765, "step": 302 }, { "epoch": 0.016335993098986413, "grad_norm": 1.3735941648483276, "learning_rate": 9.99981478058319e-06, "loss": 1.0253, "step": 303 }, { "epoch": 0.016389907267629934, "grad_norm": 1.1644489765167236, "learning_rate": 9.999812951278119e-06, "loss": 0.8519, "step": 304 }, { "epoch": 0.016443821436273452, "grad_norm": 1.0079474449157715, "learning_rate": 9.99981111298406e-06, "loss": 0.9422, "step": 305 }, { "epoch": 0.016497735604916974, "grad_norm": 1.0046736001968384, "learning_rate": 9.999809265701015e-06, "loss": 0.7766, "step": 306 }, { "epoch": 0.01655164977356049, "grad_norm": 1.0312374830245972, "learning_rate": 9.999807409428987e-06, "loss": 0.8844, "step": 307 }, { "epoch": 0.01660556394220401, "grad_norm": 1.0419421195983887, "learning_rate": 9.99980554416798e-06, "loss": 0.8902, "step": 308 }, { "epoch": 0.01665947811084753, "grad_norm": 1.2056832313537598, "learning_rate": 9.999803669917996e-06, "loss": 0.9842, "step": 309 }, { "epoch": 0.01671339227949105, "grad_norm": 0.9645346403121948, "learning_rate": 9.999801786679039e-06, "loss": 0.7837, "step": 310 }, { "epoch": 0.01676730644813457, "grad_norm": 1.0259841680526733, "learning_rate": 9.999799894451115e-06, "loss": 0.8927, "step": 311 }, { "epoch": 0.016821220616778088, "grad_norm": 0.9932212233543396, "learning_rate": 9.999797993234224e-06, "loss": 0.815, "step": 312 }, { "epoch": 0.01687513478542161, "grad_norm": 1.0666078329086304, "learning_rate": 9.99979608302837e-06, "loss": 0.8245, "step": 313 }, { "epoch": 0.016929048954065128, "grad_norm": 0.9566568732261658, "learning_rate": 9.999794163833557e-06, "loss": 0.851, "step": 314 }, { "epoch": 0.01698296312270865, "grad_norm": 1.0056332349777222, "learning_rate": 9.999792235649789e-06, "loss": 0.8704, "step": 315 }, { "epoch": 0.017036877291352167, "grad_norm": 1.036537528038025, "learning_rate": 9.999790298477068e-06, "loss": 0.9512, "step": 316 }, { "epoch": 0.01709079145999569, "grad_norm": 1.1026023626327515, "learning_rate": 9.9997883523154e-06, "loss": 1.0007, "step": 317 }, { "epoch": 0.017144705628639206, "grad_norm": 1.006659984588623, "learning_rate": 9.999786397164786e-06, "loss": 0.8992, "step": 318 }, { "epoch": 0.017198619797282724, "grad_norm": 1.0100573301315308, "learning_rate": 9.99978443302523e-06, "loss": 0.9545, "step": 319 }, { "epoch": 0.017252533965926246, "grad_norm": 1.000086784362793, "learning_rate": 9.999782459896735e-06, "loss": 0.8732, "step": 320 }, { "epoch": 0.017306448134569764, "grad_norm": 1.2039650678634644, "learning_rate": 9.999780477779306e-06, "loss": 0.9881, "step": 321 }, { "epoch": 0.017360362303213285, "grad_norm": 1.0316474437713623, "learning_rate": 9.999778486672948e-06, "loss": 0.8686, "step": 322 }, { "epoch": 0.017414276471856803, "grad_norm": 1.1697666645050049, "learning_rate": 9.999776486577661e-06, "loss": 0.9185, "step": 323 }, { "epoch": 0.017468190640500324, "grad_norm": 0.9523053169250488, "learning_rate": 9.999774477493451e-06, "loss": 0.858, "step": 324 }, { "epoch": 0.017522104809143842, "grad_norm": 0.9660015106201172, "learning_rate": 9.999772459420319e-06, "loss": 0.9964, "step": 325 }, { "epoch": 0.017576018977787364, "grad_norm": 0.971128523349762, "learning_rate": 9.999770432358271e-06, "loss": 0.8999, "step": 326 }, { "epoch": 0.01762993314643088, "grad_norm": 1.221969485282898, "learning_rate": 9.999768396307312e-06, "loss": 0.8628, "step": 327 }, { "epoch": 0.017683847315074403, "grad_norm": 1.0868507623672485, "learning_rate": 9.999766351267442e-06, "loss": 1.0732, "step": 328 }, { "epoch": 0.01773776148371792, "grad_norm": 0.9527992606163025, "learning_rate": 9.999764297238666e-06, "loss": 0.8221, "step": 329 }, { "epoch": 0.01779167565236144, "grad_norm": 0.9969122409820557, "learning_rate": 9.99976223422099e-06, "loss": 0.9234, "step": 330 }, { "epoch": 0.01784558982100496, "grad_norm": 0.9291784763336182, "learning_rate": 9.999760162214415e-06, "loss": 0.7839, "step": 331 }, { "epoch": 0.01789950398964848, "grad_norm": 0.9766960144042969, "learning_rate": 9.999758081218944e-06, "loss": 0.7929, "step": 332 }, { "epoch": 0.017953418158292, "grad_norm": 0.9536904692649841, "learning_rate": 9.999755991234585e-06, "loss": 0.9136, "step": 333 }, { "epoch": 0.018007332326935518, "grad_norm": 1.0325372219085693, "learning_rate": 9.999753892261337e-06, "loss": 0.8367, "step": 334 }, { "epoch": 0.01806124649557904, "grad_norm": 0.9486141800880432, "learning_rate": 9.999751784299207e-06, "loss": 0.8802, "step": 335 }, { "epoch": 0.018115160664222557, "grad_norm": 0.9880577921867371, "learning_rate": 9.999749667348198e-06, "loss": 0.8597, "step": 336 }, { "epoch": 0.01816907483286608, "grad_norm": 1.043199896812439, "learning_rate": 9.999747541408312e-06, "loss": 0.9142, "step": 337 }, { "epoch": 0.018222989001509596, "grad_norm": 1.0606465339660645, "learning_rate": 9.999745406479554e-06, "loss": 0.9876, "step": 338 }, { "epoch": 0.018276903170153118, "grad_norm": 1.139449954032898, "learning_rate": 9.999743262561929e-06, "loss": 0.7773, "step": 339 }, { "epoch": 0.018330817338796636, "grad_norm": 1.1416115760803223, "learning_rate": 9.99974110965544e-06, "loss": 0.9566, "step": 340 }, { "epoch": 0.018384731507440154, "grad_norm": 1.0145153999328613, "learning_rate": 9.99973894776009e-06, "loss": 0.9543, "step": 341 }, { "epoch": 0.018438645676083675, "grad_norm": 0.950528621673584, "learning_rate": 9.999736776875885e-06, "loss": 0.8007, "step": 342 }, { "epoch": 0.018492559844727193, "grad_norm": 0.9080097079277039, "learning_rate": 9.999734597002826e-06, "loss": 0.8273, "step": 343 }, { "epoch": 0.018546474013370715, "grad_norm": 1.0038888454437256, "learning_rate": 9.99973240814092e-06, "loss": 0.9394, "step": 344 }, { "epoch": 0.018600388182014232, "grad_norm": 1.05253267288208, "learning_rate": 9.999730210290168e-06, "loss": 0.9485, "step": 345 }, { "epoch": 0.018654302350657754, "grad_norm": 0.9396592974662781, "learning_rate": 9.999728003450577e-06, "loss": 0.8943, "step": 346 }, { "epoch": 0.018708216519301272, "grad_norm": 1.149387240409851, "learning_rate": 9.999725787622148e-06, "loss": 0.8566, "step": 347 }, { "epoch": 0.018762130687944793, "grad_norm": 1.1573290824890137, "learning_rate": 9.999723562804887e-06, "loss": 0.9641, "step": 348 }, { "epoch": 0.01881604485658831, "grad_norm": 1.0217385292053223, "learning_rate": 9.999721328998797e-06, "loss": 0.9555, "step": 349 }, { "epoch": 0.018869959025231833, "grad_norm": 1.034690499305725, "learning_rate": 9.999719086203884e-06, "loss": 0.9407, "step": 350 }, { "epoch": 0.01892387319387535, "grad_norm": 0.9819002151489258, "learning_rate": 9.999716834420148e-06, "loss": 0.9104, "step": 351 }, { "epoch": 0.01897778736251887, "grad_norm": 1.0459688901901245, "learning_rate": 9.999714573647597e-06, "loss": 0.9296, "step": 352 }, { "epoch": 0.01903170153116239, "grad_norm": 0.9575183391571045, "learning_rate": 9.999712303886232e-06, "loss": 0.8517, "step": 353 }, { "epoch": 0.019085615699805908, "grad_norm": 1.0018881559371948, "learning_rate": 9.99971002513606e-06, "loss": 0.9208, "step": 354 }, { "epoch": 0.01913952986844943, "grad_norm": 1.0291972160339355, "learning_rate": 9.999707737397085e-06, "loss": 0.8765, "step": 355 }, { "epoch": 0.019193444037092947, "grad_norm": 1.0081498622894287, "learning_rate": 9.999705440669306e-06, "loss": 0.9204, "step": 356 }, { "epoch": 0.01924735820573647, "grad_norm": 0.956950843334198, "learning_rate": 9.999703134952733e-06, "loss": 0.8058, "step": 357 }, { "epoch": 0.019301272374379987, "grad_norm": 1.1130229234695435, "learning_rate": 9.999700820247369e-06, "loss": 0.8202, "step": 358 }, { "epoch": 0.019355186543023508, "grad_norm": 1.047211766242981, "learning_rate": 9.999698496553216e-06, "loss": 0.9357, "step": 359 }, { "epoch": 0.019409100711667026, "grad_norm": 1.0225415229797363, "learning_rate": 9.99969616387028e-06, "loss": 0.8306, "step": 360 }, { "epoch": 0.019463014880310544, "grad_norm": 1.060727596282959, "learning_rate": 9.999693822198564e-06, "loss": 0.9178, "step": 361 }, { "epoch": 0.019516929048954065, "grad_norm": 1.0743412971496582, "learning_rate": 9.999691471538074e-06, "loss": 0.8761, "step": 362 }, { "epoch": 0.019570843217597583, "grad_norm": 1.2229491472244263, "learning_rate": 9.99968911188881e-06, "loss": 1.0738, "step": 363 }, { "epoch": 0.019624757386241105, "grad_norm": 0.9889073967933655, "learning_rate": 9.999686743250783e-06, "loss": 0.9458, "step": 364 }, { "epoch": 0.019678671554884623, "grad_norm": 1.0398520231246948, "learning_rate": 9.999684365623992e-06, "loss": 0.9096, "step": 365 }, { "epoch": 0.019732585723528144, "grad_norm": 1.0613081455230713, "learning_rate": 9.999681979008442e-06, "loss": 0.9312, "step": 366 }, { "epoch": 0.019786499892171662, "grad_norm": 0.946211040019989, "learning_rate": 9.99967958340414e-06, "loss": 0.9208, "step": 367 }, { "epoch": 0.019840414060815183, "grad_norm": 1.1298933029174805, "learning_rate": 9.999677178811087e-06, "loss": 0.9378, "step": 368 }, { "epoch": 0.0198943282294587, "grad_norm": 1.1042351722717285, "learning_rate": 9.999674765229288e-06, "loss": 0.9487, "step": 369 }, { "epoch": 0.019948242398102223, "grad_norm": 1.0717188119888306, "learning_rate": 9.999672342658751e-06, "loss": 0.939, "step": 370 }, { "epoch": 0.02000215656674574, "grad_norm": 1.0936871767044067, "learning_rate": 9.999669911099474e-06, "loss": 1.1361, "step": 371 }, { "epoch": 0.02005607073538926, "grad_norm": 1.0650005340576172, "learning_rate": 9.999667470551466e-06, "loss": 0.9709, "step": 372 }, { "epoch": 0.02010998490403278, "grad_norm": 1.0154083967208862, "learning_rate": 9.999665021014731e-06, "loss": 0.9422, "step": 373 }, { "epoch": 0.020163899072676298, "grad_norm": 1.1382607221603394, "learning_rate": 9.999662562489272e-06, "loss": 0.984, "step": 374 }, { "epoch": 0.02021781324131982, "grad_norm": 0.9372896552085876, "learning_rate": 9.999660094975095e-06, "loss": 0.9857, "step": 375 }, { "epoch": 0.020271727409963337, "grad_norm": 1.1777011156082153, "learning_rate": 9.999657618472203e-06, "loss": 0.9731, "step": 376 }, { "epoch": 0.02032564157860686, "grad_norm": 0.9054237604141235, "learning_rate": 9.9996551329806e-06, "loss": 0.9104, "step": 377 }, { "epoch": 0.020379555747250377, "grad_norm": 0.9255661964416504, "learning_rate": 9.999652638500292e-06, "loss": 0.8632, "step": 378 }, { "epoch": 0.020433469915893898, "grad_norm": 0.9440998435020447, "learning_rate": 9.999650135031282e-06, "loss": 0.8945, "step": 379 }, { "epoch": 0.020487384084537416, "grad_norm": 0.9822732210159302, "learning_rate": 9.999647622573577e-06, "loss": 0.8874, "step": 380 }, { "epoch": 0.020541298253180938, "grad_norm": 1.1294387578964233, "learning_rate": 9.999645101127179e-06, "loss": 0.9892, "step": 381 }, { "epoch": 0.020595212421824455, "grad_norm": 1.0458290576934814, "learning_rate": 9.999642570692094e-06, "loss": 0.9163, "step": 382 }, { "epoch": 0.020649126590467973, "grad_norm": 0.8124557733535767, "learning_rate": 9.999640031268326e-06, "loss": 0.6927, "step": 383 }, { "epoch": 0.020703040759111495, "grad_norm": 1.1053259372711182, "learning_rate": 9.999637482855878e-06, "loss": 0.8651, "step": 384 }, { "epoch": 0.020756954927755013, "grad_norm": 1.1280632019042969, "learning_rate": 9.999634925454757e-06, "loss": 0.9708, "step": 385 }, { "epoch": 0.020810869096398534, "grad_norm": 0.9916180372238159, "learning_rate": 9.999632359064965e-06, "loss": 0.9081, "step": 386 }, { "epoch": 0.020864783265042052, "grad_norm": 1.0430771112442017, "learning_rate": 9.99962978368651e-06, "loss": 0.9837, "step": 387 }, { "epoch": 0.020918697433685574, "grad_norm": 1.031343698501587, "learning_rate": 9.999627199319398e-06, "loss": 0.9156, "step": 388 }, { "epoch": 0.02097261160232909, "grad_norm": 1.0157191753387451, "learning_rate": 9.999624605963627e-06, "loss": 0.9379, "step": 389 }, { "epoch": 0.021026525770972613, "grad_norm": 0.9524544477462769, "learning_rate": 9.999622003619204e-06, "loss": 0.8448, "step": 390 }, { "epoch": 0.02108043993961613, "grad_norm": 1.091670036315918, "learning_rate": 9.999619392286137e-06, "loss": 0.9794, "step": 391 }, { "epoch": 0.021134354108259652, "grad_norm": 1.0502233505249023, "learning_rate": 9.999616771964429e-06, "loss": 1.0047, "step": 392 }, { "epoch": 0.02118826827690317, "grad_norm": 1.2087476253509521, "learning_rate": 9.999614142654084e-06, "loss": 0.8964, "step": 393 }, { "epoch": 0.021242182445546688, "grad_norm": 1.0264590978622437, "learning_rate": 9.999611504355106e-06, "loss": 0.8608, "step": 394 }, { "epoch": 0.02129609661419021, "grad_norm": 0.9883281588554382, "learning_rate": 9.999608857067503e-06, "loss": 0.9109, "step": 395 }, { "epoch": 0.021350010782833728, "grad_norm": 0.9913623332977295, "learning_rate": 9.999606200791276e-06, "loss": 0.8993, "step": 396 }, { "epoch": 0.02140392495147725, "grad_norm": 1.019178867340088, "learning_rate": 9.999603535526432e-06, "loss": 0.9115, "step": 397 }, { "epoch": 0.021457839120120767, "grad_norm": 0.9756026864051819, "learning_rate": 9.999600861272974e-06, "loss": 0.834, "step": 398 }, { "epoch": 0.02151175328876429, "grad_norm": 0.9956341981887817, "learning_rate": 9.999598178030909e-06, "loss": 0.8756, "step": 399 }, { "epoch": 0.021565667457407806, "grad_norm": 1.0267717838287354, "learning_rate": 9.999595485800239e-06, "loss": 0.9427, "step": 400 }, { "epoch": 0.021619581626051328, "grad_norm": 1.061139464378357, "learning_rate": 9.999592784580974e-06, "loss": 0.9835, "step": 401 }, { "epoch": 0.021673495794694846, "grad_norm": 0.9970353245735168, "learning_rate": 9.999590074373114e-06, "loss": 0.8946, "step": 402 }, { "epoch": 0.021727409963338367, "grad_norm": 1.056242823600769, "learning_rate": 9.999587355176664e-06, "loss": 0.9076, "step": 403 }, { "epoch": 0.021781324131981885, "grad_norm": 1.0285427570343018, "learning_rate": 9.999584626991632e-06, "loss": 0.8506, "step": 404 }, { "epoch": 0.021835238300625403, "grad_norm": 1.0026901960372925, "learning_rate": 9.99958188981802e-06, "loss": 0.8457, "step": 405 }, { "epoch": 0.021889152469268924, "grad_norm": 0.8921003341674805, "learning_rate": 9.999579143655833e-06, "loss": 0.8215, "step": 406 }, { "epoch": 0.021943066637912442, "grad_norm": 1.2816855907440186, "learning_rate": 9.99957638850508e-06, "loss": 0.8779, "step": 407 }, { "epoch": 0.021996980806555964, "grad_norm": 1.4713681936264038, "learning_rate": 9.99957362436576e-06, "loss": 0.8581, "step": 408 }, { "epoch": 0.02205089497519948, "grad_norm": 1.0117568969726562, "learning_rate": 9.999570851237883e-06, "loss": 0.8865, "step": 409 }, { "epoch": 0.022104809143843003, "grad_norm": 0.9530962705612183, "learning_rate": 9.99956806912145e-06, "loss": 0.8888, "step": 410 }, { "epoch": 0.02215872331248652, "grad_norm": 0.865692675113678, "learning_rate": 9.99956527801647e-06, "loss": 0.8075, "step": 411 }, { "epoch": 0.022212637481130042, "grad_norm": 0.9613220691680908, "learning_rate": 9.999562477922944e-06, "loss": 0.9289, "step": 412 }, { "epoch": 0.02226655164977356, "grad_norm": 0.9419745802879333, "learning_rate": 9.99955966884088e-06, "loss": 0.8758, "step": 413 }, { "epoch": 0.02232046581841708, "grad_norm": 1.0120573043823242, "learning_rate": 9.999556850770282e-06, "loss": 0.9014, "step": 414 }, { "epoch": 0.0223743799870606, "grad_norm": 0.9833963513374329, "learning_rate": 9.999554023711155e-06, "loss": 0.9354, "step": 415 }, { "epoch": 0.022428294155704118, "grad_norm": 0.9058681130409241, "learning_rate": 9.999551187663505e-06, "loss": 0.9201, "step": 416 }, { "epoch": 0.02248220832434764, "grad_norm": 1.0103633403778076, "learning_rate": 9.999548342627334e-06, "loss": 0.9023, "step": 417 }, { "epoch": 0.022536122492991157, "grad_norm": 0.8671039342880249, "learning_rate": 9.99954548860265e-06, "loss": 0.7263, "step": 418 }, { "epoch": 0.02259003666163468, "grad_norm": 1.0967090129852295, "learning_rate": 9.999542625589461e-06, "loss": 1.0616, "step": 419 }, { "epoch": 0.022643950830278196, "grad_norm": 0.9032139778137207, "learning_rate": 9.999539753587764e-06, "loss": 0.782, "step": 420 }, { "epoch": 0.022697864998921718, "grad_norm": 0.9532387256622314, "learning_rate": 9.99953687259757e-06, "loss": 0.9628, "step": 421 }, { "epoch": 0.022751779167565236, "grad_norm": 0.9732246994972229, "learning_rate": 9.999533982618885e-06, "loss": 0.8682, "step": 422 }, { "epoch": 0.022805693336208757, "grad_norm": 0.9160019159317017, "learning_rate": 9.99953108365171e-06, "loss": 0.9051, "step": 423 }, { "epoch": 0.022859607504852275, "grad_norm": 1.0100488662719727, "learning_rate": 9.999528175696054e-06, "loss": 0.9836, "step": 424 }, { "epoch": 0.022913521673495793, "grad_norm": 1.0130014419555664, "learning_rate": 9.99952525875192e-06, "loss": 0.8653, "step": 425 }, { "epoch": 0.022967435842139314, "grad_norm": 0.9726247787475586, "learning_rate": 9.999522332819313e-06, "loss": 0.8761, "step": 426 }, { "epoch": 0.023021350010782832, "grad_norm": 0.9457972049713135, "learning_rate": 9.99951939789824e-06, "loss": 0.8792, "step": 427 }, { "epoch": 0.023075264179426354, "grad_norm": 1.083130121231079, "learning_rate": 9.999516453988706e-06, "loss": 0.9035, "step": 428 }, { "epoch": 0.023129178348069872, "grad_norm": 0.9195771217346191, "learning_rate": 9.999513501090714e-06, "loss": 0.8586, "step": 429 }, { "epoch": 0.023183092516713393, "grad_norm": 0.983346700668335, "learning_rate": 9.999510539204273e-06, "loss": 0.8335, "step": 430 }, { "epoch": 0.02323700668535691, "grad_norm": 1.0524029731750488, "learning_rate": 9.999507568329386e-06, "loss": 0.838, "step": 431 }, { "epoch": 0.023290920854000433, "grad_norm": 1.0267860889434814, "learning_rate": 9.999504588466058e-06, "loss": 0.9345, "step": 432 }, { "epoch": 0.02334483502264395, "grad_norm": 1.025707483291626, "learning_rate": 9.999501599614294e-06, "loss": 0.9042, "step": 433 }, { "epoch": 0.023398749191287472, "grad_norm": 0.9739174842834473, "learning_rate": 9.999498601774101e-06, "loss": 0.7433, "step": 434 }, { "epoch": 0.02345266335993099, "grad_norm": 0.9468310475349426, "learning_rate": 9.999495594945486e-06, "loss": 0.8447, "step": 435 }, { "epoch": 0.023506577528574508, "grad_norm": 0.9820529818534851, "learning_rate": 9.99949257912845e-06, "loss": 0.8842, "step": 436 }, { "epoch": 0.02356049169721803, "grad_norm": 0.998515784740448, "learning_rate": 9.999489554323e-06, "loss": 0.9226, "step": 437 }, { "epoch": 0.023614405865861547, "grad_norm": 0.9819791316986084, "learning_rate": 9.999486520529144e-06, "loss": 0.8559, "step": 438 }, { "epoch": 0.02366832003450507, "grad_norm": 0.9468326568603516, "learning_rate": 9.999483477746884e-06, "loss": 0.8064, "step": 439 }, { "epoch": 0.023722234203148587, "grad_norm": 1.0087614059448242, "learning_rate": 9.999480425976229e-06, "loss": 0.9232, "step": 440 }, { "epoch": 0.023776148371792108, "grad_norm": 0.9446098208427429, "learning_rate": 9.99947736521718e-06, "loss": 0.8511, "step": 441 }, { "epoch": 0.023830062540435626, "grad_norm": 1.0966850519180298, "learning_rate": 9.999474295469746e-06, "loss": 0.9929, "step": 442 }, { "epoch": 0.023883976709079147, "grad_norm": 0.8858770728111267, "learning_rate": 9.99947121673393e-06, "loss": 0.8492, "step": 443 }, { "epoch": 0.023937890877722665, "grad_norm": 1.083717703819275, "learning_rate": 9.999468129009742e-06, "loss": 0.9948, "step": 444 }, { "epoch": 0.023991805046366187, "grad_norm": 1.0251178741455078, "learning_rate": 9.999465032297184e-06, "loss": 0.8769, "step": 445 }, { "epoch": 0.024045719215009705, "grad_norm": 0.9331875443458557, "learning_rate": 9.999461926596261e-06, "loss": 0.8663, "step": 446 }, { "epoch": 0.024099633383653223, "grad_norm": 0.8941493034362793, "learning_rate": 9.999458811906979e-06, "loss": 0.8172, "step": 447 }, { "epoch": 0.024153547552296744, "grad_norm": 0.9978699684143066, "learning_rate": 9.999455688229347e-06, "loss": 0.9303, "step": 448 }, { "epoch": 0.024207461720940262, "grad_norm": 0.8835211992263794, "learning_rate": 9.999452555563366e-06, "loss": 0.8921, "step": 449 }, { "epoch": 0.024261375889583783, "grad_norm": 0.9061810970306396, "learning_rate": 9.999449413909043e-06, "loss": 0.8201, "step": 450 }, { "epoch": 0.0243152900582273, "grad_norm": 1.0061571598052979, "learning_rate": 9.999446263266385e-06, "loss": 0.8506, "step": 451 }, { "epoch": 0.024369204226870823, "grad_norm": 0.9286402463912964, "learning_rate": 9.999443103635398e-06, "loss": 0.8532, "step": 452 }, { "epoch": 0.02442311839551434, "grad_norm": 1.0919772386550903, "learning_rate": 9.999439935016087e-06, "loss": 0.9466, "step": 453 }, { "epoch": 0.024477032564157862, "grad_norm": 1.0552513599395752, "learning_rate": 9.999436757408453e-06, "loss": 0.8406, "step": 454 }, { "epoch": 0.02453094673280138, "grad_norm": 0.9604331851005554, "learning_rate": 9.999433570812511e-06, "loss": 0.8928, "step": 455 }, { "epoch": 0.0245848609014449, "grad_norm": 1.0126323699951172, "learning_rate": 9.999430375228259e-06, "loss": 0.924, "step": 456 }, { "epoch": 0.02463877507008842, "grad_norm": 1.0540791749954224, "learning_rate": 9.999427170655707e-06, "loss": 0.9656, "step": 457 }, { "epoch": 0.024692689238731937, "grad_norm": 0.8622417449951172, "learning_rate": 9.999423957094857e-06, "loss": 0.7428, "step": 458 }, { "epoch": 0.02474660340737546, "grad_norm": 1.106581211090088, "learning_rate": 9.999420734545719e-06, "loss": 0.9258, "step": 459 }, { "epoch": 0.024800517576018977, "grad_norm": 0.990807294845581, "learning_rate": 9.999417503008296e-06, "loss": 0.9083, "step": 460 }, { "epoch": 0.024854431744662498, "grad_norm": 0.9302589893341064, "learning_rate": 9.999414262482594e-06, "loss": 0.8654, "step": 461 }, { "epoch": 0.024908345913306016, "grad_norm": 1.0218255519866943, "learning_rate": 9.999411012968621e-06, "loss": 0.8996, "step": 462 }, { "epoch": 0.024962260081949537, "grad_norm": 0.976108193397522, "learning_rate": 9.99940775446638e-06, "loss": 0.9423, "step": 463 }, { "epoch": 0.025016174250593055, "grad_norm": 1.1027617454528809, "learning_rate": 9.99940448697588e-06, "loss": 1.0407, "step": 464 }, { "epoch": 0.025070088419236577, "grad_norm": 1.0148764848709106, "learning_rate": 9.999401210497122e-06, "loss": 0.9418, "step": 465 }, { "epoch": 0.025124002587880095, "grad_norm": 1.0120681524276733, "learning_rate": 9.999397925030116e-06, "loss": 0.92, "step": 466 }, { "epoch": 0.025177916756523613, "grad_norm": 1.1855127811431885, "learning_rate": 9.999394630574868e-06, "loss": 0.9285, "step": 467 }, { "epoch": 0.025231830925167134, "grad_norm": 1.8014320135116577, "learning_rate": 9.999391327131383e-06, "loss": 0.979, "step": 468 }, { "epoch": 0.025285745093810652, "grad_norm": 1.1568403244018555, "learning_rate": 9.999388014699664e-06, "loss": 0.9574, "step": 469 }, { "epoch": 0.025339659262454173, "grad_norm": 1.2544865608215332, "learning_rate": 9.99938469327972e-06, "loss": 0.8356, "step": 470 }, { "epoch": 0.02539357343109769, "grad_norm": 1.8647997379302979, "learning_rate": 9.99938136287156e-06, "loss": 0.9181, "step": 471 }, { "epoch": 0.025447487599741213, "grad_norm": 0.9942222237586975, "learning_rate": 9.999378023475184e-06, "loss": 0.9297, "step": 472 }, { "epoch": 0.02550140176838473, "grad_norm": 0.9839766621589661, "learning_rate": 9.9993746750906e-06, "loss": 0.9181, "step": 473 }, { "epoch": 0.025555315937028252, "grad_norm": 0.9353258609771729, "learning_rate": 9.999371317717817e-06, "loss": 0.8789, "step": 474 }, { "epoch": 0.02560923010567177, "grad_norm": 0.9256170988082886, "learning_rate": 9.999367951356838e-06, "loss": 0.8725, "step": 475 }, { "epoch": 0.02566314427431529, "grad_norm": 1.1102124452590942, "learning_rate": 9.999364576007669e-06, "loss": 0.9818, "step": 476 }, { "epoch": 0.02571705844295881, "grad_norm": 1.04171884059906, "learning_rate": 9.999361191670316e-06, "loss": 0.9275, "step": 477 }, { "epoch": 0.025770972611602327, "grad_norm": 0.9670290350914001, "learning_rate": 9.999357798344787e-06, "loss": 0.8919, "step": 478 }, { "epoch": 0.02582488678024585, "grad_norm": 1.0543723106384277, "learning_rate": 9.999354396031085e-06, "loss": 0.9356, "step": 479 }, { "epoch": 0.025878800948889367, "grad_norm": 1.1368457078933716, "learning_rate": 9.99935098472922e-06, "loss": 0.9387, "step": 480 }, { "epoch": 0.025932715117532888, "grad_norm": 1.0627872943878174, "learning_rate": 9.999347564439196e-06, "loss": 1.0047, "step": 481 }, { "epoch": 0.025986629286176406, "grad_norm": 0.9553730487823486, "learning_rate": 9.999344135161018e-06, "loss": 0.8845, "step": 482 }, { "epoch": 0.026040543454819928, "grad_norm": 0.9605830907821655, "learning_rate": 9.999340696894694e-06, "loss": 0.8816, "step": 483 }, { "epoch": 0.026094457623463446, "grad_norm": 1.0464140176773071, "learning_rate": 9.999337249640232e-06, "loss": 0.9344, "step": 484 }, { "epoch": 0.026148371792106967, "grad_norm": 1.0667988061904907, "learning_rate": 9.999333793397635e-06, "loss": 0.8834, "step": 485 }, { "epoch": 0.026202285960750485, "grad_norm": 0.8996486663818359, "learning_rate": 9.999330328166908e-06, "loss": 0.8247, "step": 486 }, { "epoch": 0.026256200129394006, "grad_norm": 1.0483838319778442, "learning_rate": 9.99932685394806e-06, "loss": 0.9414, "step": 487 }, { "epoch": 0.026310114298037524, "grad_norm": 1.2089953422546387, "learning_rate": 9.999323370741097e-06, "loss": 1.0913, "step": 488 }, { "epoch": 0.026364028466681042, "grad_norm": 1.074291467666626, "learning_rate": 9.999319878546025e-06, "loss": 0.8882, "step": 489 }, { "epoch": 0.026417942635324564, "grad_norm": 1.0076494216918945, "learning_rate": 9.99931637736285e-06, "loss": 0.8393, "step": 490 }, { "epoch": 0.02647185680396808, "grad_norm": 1.2263407707214355, "learning_rate": 9.99931286719158e-06, "loss": 0.955, "step": 491 }, { "epoch": 0.026525770972611603, "grad_norm": 0.9093664884567261, "learning_rate": 9.999309348032218e-06, "loss": 0.8366, "step": 492 }, { "epoch": 0.02657968514125512, "grad_norm": 1.0704407691955566, "learning_rate": 9.999305819884772e-06, "loss": 0.981, "step": 493 }, { "epoch": 0.026633599309898642, "grad_norm": 1.2105270624160767, "learning_rate": 9.999302282749249e-06, "loss": 0.8896, "step": 494 }, { "epoch": 0.02668751347854216, "grad_norm": 1.0142449140548706, "learning_rate": 9.999298736625654e-06, "loss": 0.8627, "step": 495 }, { "epoch": 0.02674142764718568, "grad_norm": 1.0887057781219482, "learning_rate": 9.999295181513994e-06, "loss": 0.8884, "step": 496 }, { "epoch": 0.0267953418158292, "grad_norm": 0.9958952069282532, "learning_rate": 9.999291617414277e-06, "loss": 0.7768, "step": 497 }, { "epoch": 0.02684925598447272, "grad_norm": 0.8576722741127014, "learning_rate": 9.999288044326508e-06, "loss": 0.715, "step": 498 }, { "epoch": 0.02690317015311624, "grad_norm": 1.058148741722107, "learning_rate": 9.999284462250691e-06, "loss": 0.8693, "step": 499 }, { "epoch": 0.026957084321759757, "grad_norm": 0.9429569244384766, "learning_rate": 9.999280871186837e-06, "loss": 0.8883, "step": 500 }, { "epoch": 0.02701099849040328, "grad_norm": 0.9450993537902832, "learning_rate": 9.999277271134948e-06, "loss": 0.9376, "step": 501 }, { "epoch": 0.027064912659046796, "grad_norm": 1.0307891368865967, "learning_rate": 9.999273662095035e-06, "loss": 0.9098, "step": 502 }, { "epoch": 0.027118826827690318, "grad_norm": 0.9515891671180725, "learning_rate": 9.999270044067101e-06, "loss": 0.8854, "step": 503 }, { "epoch": 0.027172740996333836, "grad_norm": 1.1173255443572998, "learning_rate": 9.999266417051154e-06, "loss": 0.7977, "step": 504 }, { "epoch": 0.027226655164977357, "grad_norm": 1.028194785118103, "learning_rate": 9.9992627810472e-06, "loss": 0.9585, "step": 505 }, { "epoch": 0.027280569333620875, "grad_norm": 1.0855528116226196, "learning_rate": 9.999259136055245e-06, "loss": 0.9807, "step": 506 }, { "epoch": 0.027334483502264396, "grad_norm": 1.1148236989974976, "learning_rate": 9.999255482075298e-06, "loss": 0.9672, "step": 507 }, { "epoch": 0.027388397670907914, "grad_norm": 0.9697713255882263, "learning_rate": 9.999251819107364e-06, "loss": 0.9073, "step": 508 }, { "epoch": 0.027442311839551436, "grad_norm": 0.9802384972572327, "learning_rate": 9.999248147151448e-06, "loss": 0.8704, "step": 509 }, { "epoch": 0.027496226008194954, "grad_norm": 0.963330090045929, "learning_rate": 9.999244466207559e-06, "loss": 0.9312, "step": 510 }, { "epoch": 0.02755014017683847, "grad_norm": 0.8776309490203857, "learning_rate": 9.999240776275703e-06, "loss": 0.8068, "step": 511 }, { "epoch": 0.027604054345481993, "grad_norm": 1.1159353256225586, "learning_rate": 9.999237077355886e-06, "loss": 0.8164, "step": 512 }, { "epoch": 0.02765796851412551, "grad_norm": 1.004232406616211, "learning_rate": 9.999233369448115e-06, "loss": 0.8666, "step": 513 }, { "epoch": 0.027711882682769032, "grad_norm": 1.0300110578536987, "learning_rate": 9.999229652552395e-06, "loss": 0.8774, "step": 514 }, { "epoch": 0.02776579685141255, "grad_norm": 0.8823155164718628, "learning_rate": 9.999225926668736e-06, "loss": 0.7579, "step": 515 }, { "epoch": 0.027819711020056072, "grad_norm": 0.938956618309021, "learning_rate": 9.999222191797144e-06, "loss": 0.8749, "step": 516 }, { "epoch": 0.02787362518869959, "grad_norm": 0.9111800789833069, "learning_rate": 9.999218447937624e-06, "loss": 0.8915, "step": 517 }, { "epoch": 0.02792753935734311, "grad_norm": 0.971813440322876, "learning_rate": 9.999214695090182e-06, "loss": 0.9038, "step": 518 }, { "epoch": 0.02798145352598663, "grad_norm": 0.9159868359565735, "learning_rate": 9.999210933254828e-06, "loss": 0.8726, "step": 519 }, { "epoch": 0.028035367694630147, "grad_norm": 1.0223439931869507, "learning_rate": 9.999207162431566e-06, "loss": 0.8738, "step": 520 }, { "epoch": 0.02808928186327367, "grad_norm": 0.9844004511833191, "learning_rate": 9.999203382620404e-06, "loss": 0.8815, "step": 521 }, { "epoch": 0.028143196031917186, "grad_norm": 1.1636719703674316, "learning_rate": 9.99919959382135e-06, "loss": 0.8781, "step": 522 }, { "epoch": 0.028197110200560708, "grad_norm": 0.9637702703475952, "learning_rate": 9.999195796034407e-06, "loss": 0.8491, "step": 523 }, { "epoch": 0.028251024369204226, "grad_norm": 0.975931704044342, "learning_rate": 9.999191989259584e-06, "loss": 0.9983, "step": 524 }, { "epoch": 0.028304938537847747, "grad_norm": 0.9855527877807617, "learning_rate": 9.999188173496889e-06, "loss": 0.9587, "step": 525 }, { "epoch": 0.028358852706491265, "grad_norm": 0.9925652742385864, "learning_rate": 9.99918434874633e-06, "loss": 0.8408, "step": 526 }, { "epoch": 0.028412766875134787, "grad_norm": 0.9272180795669556, "learning_rate": 9.999180515007908e-06, "loss": 0.8267, "step": 527 }, { "epoch": 0.028466681043778305, "grad_norm": 1.161076307296753, "learning_rate": 9.999176672281636e-06, "loss": 0.9282, "step": 528 }, { "epoch": 0.028520595212421826, "grad_norm": 0.8953909277915955, "learning_rate": 9.99917282056752e-06, "loss": 0.8078, "step": 529 }, { "epoch": 0.028574509381065344, "grad_norm": 0.9194382429122925, "learning_rate": 9.999168959865562e-06, "loss": 0.8385, "step": 530 }, { "epoch": 0.028628423549708862, "grad_norm": 1.0351816415786743, "learning_rate": 9.999165090175775e-06, "loss": 0.8155, "step": 531 }, { "epoch": 0.028682337718352383, "grad_norm": 0.9233224391937256, "learning_rate": 9.999161211498163e-06, "loss": 0.8825, "step": 532 }, { "epoch": 0.0287362518869959, "grad_norm": 1.0415356159210205, "learning_rate": 9.999157323832732e-06, "loss": 0.7844, "step": 533 }, { "epoch": 0.028790166055639423, "grad_norm": 1.0329923629760742, "learning_rate": 9.999153427179492e-06, "loss": 0.893, "step": 534 }, { "epoch": 0.02884408022428294, "grad_norm": 1.237291932106018, "learning_rate": 9.999149521538448e-06, "loss": 0.9786, "step": 535 }, { "epoch": 0.028897994392926462, "grad_norm": 0.9952654242515564, "learning_rate": 9.999145606909607e-06, "loss": 0.9262, "step": 536 }, { "epoch": 0.02895190856156998, "grad_norm": 1.016533374786377, "learning_rate": 9.999141683292977e-06, "loss": 0.9854, "step": 537 }, { "epoch": 0.0290058227302135, "grad_norm": 1.0334454774856567, "learning_rate": 9.999137750688564e-06, "loss": 0.8928, "step": 538 }, { "epoch": 0.02905973689885702, "grad_norm": 0.941662609577179, "learning_rate": 9.999133809096374e-06, "loss": 0.8698, "step": 539 }, { "epoch": 0.02911365106750054, "grad_norm": 0.9454428553581238, "learning_rate": 9.999129858516418e-06, "loss": 0.9261, "step": 540 }, { "epoch": 0.02916756523614406, "grad_norm": 1.0921217203140259, "learning_rate": 9.9991258989487e-06, "loss": 0.9163, "step": 541 }, { "epoch": 0.029221479404787577, "grad_norm": 0.8999170064926147, "learning_rate": 9.999121930393227e-06, "loss": 0.883, "step": 542 }, { "epoch": 0.029275393573431098, "grad_norm": 0.9732702970504761, "learning_rate": 9.999117952850009e-06, "loss": 0.9168, "step": 543 }, { "epoch": 0.029329307742074616, "grad_norm": 1.00196373462677, "learning_rate": 9.99911396631905e-06, "loss": 0.826, "step": 544 }, { "epoch": 0.029383221910718137, "grad_norm": 0.9776156544685364, "learning_rate": 9.999109970800358e-06, "loss": 0.8176, "step": 545 }, { "epoch": 0.029437136079361655, "grad_norm": 1.0503387451171875, "learning_rate": 9.99910596629394e-06, "loss": 0.8617, "step": 546 }, { "epoch": 0.029491050248005177, "grad_norm": 0.9195687174797058, "learning_rate": 9.999101952799805e-06, "loss": 0.8224, "step": 547 }, { "epoch": 0.029544964416648695, "grad_norm": 0.8746809959411621, "learning_rate": 9.999097930317959e-06, "loss": 0.8407, "step": 548 }, { "epoch": 0.029598878585292216, "grad_norm": 0.9035898447036743, "learning_rate": 9.999093898848407e-06, "loss": 0.8344, "step": 549 }, { "epoch": 0.029652792753935734, "grad_norm": 0.8764795064926147, "learning_rate": 9.99908985839116e-06, "loss": 0.8323, "step": 550 }, { "epoch": 0.029706706922579255, "grad_norm": 0.9654614329338074, "learning_rate": 9.999085808946224e-06, "loss": 0.8696, "step": 551 }, { "epoch": 0.029760621091222773, "grad_norm": 1.1295796632766724, "learning_rate": 9.999081750513606e-06, "loss": 0.9608, "step": 552 }, { "epoch": 0.02981453525986629, "grad_norm": 0.9591107368469238, "learning_rate": 9.999077683093313e-06, "loss": 0.8762, "step": 553 }, { "epoch": 0.029868449428509813, "grad_norm": 0.8287899494171143, "learning_rate": 9.999073606685353e-06, "loss": 0.7265, "step": 554 }, { "epoch": 0.02992236359715333, "grad_norm": 0.9429282546043396, "learning_rate": 9.99906952128973e-06, "loss": 0.8835, "step": 555 }, { "epoch": 0.029976277765796852, "grad_norm": 0.9617370963096619, "learning_rate": 9.999065426906459e-06, "loss": 0.9138, "step": 556 }, { "epoch": 0.03003019193444037, "grad_norm": 1.2346372604370117, "learning_rate": 9.999061323535538e-06, "loss": 0.831, "step": 557 }, { "epoch": 0.03008410610308389, "grad_norm": 1.2413623332977295, "learning_rate": 9.999057211176982e-06, "loss": 1.0211, "step": 558 }, { "epoch": 0.03013802027172741, "grad_norm": 0.98906010389328, "learning_rate": 9.999053089830794e-06, "loss": 0.7821, "step": 559 }, { "epoch": 0.03019193444037093, "grad_norm": 0.96706622838974, "learning_rate": 9.999048959496983e-06, "loss": 0.8593, "step": 560 }, { "epoch": 0.03024584860901445, "grad_norm": 0.9400071501731873, "learning_rate": 9.999044820175556e-06, "loss": 0.8731, "step": 561 }, { "epoch": 0.03029976277765797, "grad_norm": 1.1276499032974243, "learning_rate": 9.999040671866522e-06, "loss": 0.86, "step": 562 }, { "epoch": 0.030353676946301488, "grad_norm": 0.8859087228775024, "learning_rate": 9.999036514569885e-06, "loss": 0.8274, "step": 563 }, { "epoch": 0.030407591114945006, "grad_norm": 1.1617575883865356, "learning_rate": 9.999032348285656e-06, "loss": 1.0519, "step": 564 }, { "epoch": 0.030461505283588527, "grad_norm": 0.9717594385147095, "learning_rate": 9.99902817301384e-06, "loss": 0.9276, "step": 565 }, { "epoch": 0.030515419452232045, "grad_norm": 1.000722050666809, "learning_rate": 9.999023988754446e-06, "loss": 0.8714, "step": 566 }, { "epoch": 0.030569333620875567, "grad_norm": 1.1744625568389893, "learning_rate": 9.999019795507481e-06, "loss": 1.0087, "step": 567 }, { "epoch": 0.030623247789519085, "grad_norm": 1.0199978351593018, "learning_rate": 9.999015593272953e-06, "loss": 0.8537, "step": 568 }, { "epoch": 0.030677161958162606, "grad_norm": 0.9232216477394104, "learning_rate": 9.999011382050869e-06, "loss": 0.8488, "step": 569 }, { "epoch": 0.030731076126806124, "grad_norm": 0.9905959367752075, "learning_rate": 9.99900716184124e-06, "loss": 0.9048, "step": 570 }, { "epoch": 0.030784990295449646, "grad_norm": 0.9921644330024719, "learning_rate": 9.999002932644066e-06, "loss": 0.9294, "step": 571 }, { "epoch": 0.030838904464093164, "grad_norm": 1.1583740711212158, "learning_rate": 9.99899869445936e-06, "loss": 0.727, "step": 572 }, { "epoch": 0.03089281863273668, "grad_norm": 0.906736433506012, "learning_rate": 9.998994447287127e-06, "loss": 0.7889, "step": 573 }, { "epoch": 0.030946732801380203, "grad_norm": 0.9060770869255066, "learning_rate": 9.998990191127379e-06, "loss": 0.8493, "step": 574 }, { "epoch": 0.03100064697002372, "grad_norm": 0.9094041585922241, "learning_rate": 9.99898592598012e-06, "loss": 0.8604, "step": 575 }, { "epoch": 0.031054561138667242, "grad_norm": 1.0964977741241455, "learning_rate": 9.998981651845358e-06, "loss": 0.8481, "step": 576 }, { "epoch": 0.03110847530731076, "grad_norm": 0.9509627223014832, "learning_rate": 9.998977368723102e-06, "loss": 0.8601, "step": 577 }, { "epoch": 0.03116238947595428, "grad_norm": 1.0108642578125, "learning_rate": 9.998973076613359e-06, "loss": 0.9076, "step": 578 }, { "epoch": 0.0312163036445978, "grad_norm": 1.0268129110336304, "learning_rate": 9.998968775516136e-06, "loss": 0.8273, "step": 579 }, { "epoch": 0.03127021781324132, "grad_norm": 0.968941867351532, "learning_rate": 9.99896446543144e-06, "loss": 0.8859, "step": 580 }, { "epoch": 0.03132413198188484, "grad_norm": 0.936779260635376, "learning_rate": 9.998960146359283e-06, "loss": 0.8589, "step": 581 }, { "epoch": 0.03137804615052836, "grad_norm": 0.9675167202949524, "learning_rate": 9.998955818299667e-06, "loss": 0.973, "step": 582 }, { "epoch": 0.03143196031917188, "grad_norm": 0.9475553035736084, "learning_rate": 9.998951481252604e-06, "loss": 0.8936, "step": 583 }, { "epoch": 0.031485874487815396, "grad_norm": 0.9130968451499939, "learning_rate": 9.9989471352181e-06, "loss": 0.7668, "step": 584 }, { "epoch": 0.031539788656458914, "grad_norm": 0.8890071511268616, "learning_rate": 9.998942780196164e-06, "loss": 0.8971, "step": 585 }, { "epoch": 0.03159370282510244, "grad_norm": 0.9298738837242126, "learning_rate": 9.998938416186803e-06, "loss": 0.9313, "step": 586 }, { "epoch": 0.03164761699374596, "grad_norm": 1.0683361291885376, "learning_rate": 9.998934043190025e-06, "loss": 0.9018, "step": 587 }, { "epoch": 0.031701531162389475, "grad_norm": 0.939253568649292, "learning_rate": 9.99892966120584e-06, "loss": 0.9119, "step": 588 }, { "epoch": 0.03175544533103299, "grad_norm": 0.9245349764823914, "learning_rate": 9.99892527023425e-06, "loss": 0.9258, "step": 589 }, { "epoch": 0.03180935949967652, "grad_norm": 0.9318797588348389, "learning_rate": 9.998920870275267e-06, "loss": 0.9557, "step": 590 }, { "epoch": 0.031863273668320036, "grad_norm": 0.8909592628479004, "learning_rate": 9.998916461328899e-06, "loss": 0.8122, "step": 591 }, { "epoch": 0.031917187836963554, "grad_norm": 1.0637080669403076, "learning_rate": 9.998912043395154e-06, "loss": 0.9517, "step": 592 }, { "epoch": 0.03197110200560707, "grad_norm": 0.881934642791748, "learning_rate": 9.99890761647404e-06, "loss": 0.8729, "step": 593 }, { "epoch": 0.032025016174250596, "grad_norm": 0.8882094025611877, "learning_rate": 9.998903180565562e-06, "loss": 0.7943, "step": 594 }, { "epoch": 0.032078930342894114, "grad_norm": 0.965085506439209, "learning_rate": 9.99889873566973e-06, "loss": 0.8894, "step": 595 }, { "epoch": 0.03213284451153763, "grad_norm": 0.9679432511329651, "learning_rate": 9.998894281786556e-06, "loss": 0.854, "step": 596 }, { "epoch": 0.03218675868018115, "grad_norm": 1.4454354047775269, "learning_rate": 9.998889818916043e-06, "loss": 0.9944, "step": 597 }, { "epoch": 0.03224067284882467, "grad_norm": 0.9369311928749084, "learning_rate": 9.998885347058198e-06, "loss": 0.8699, "step": 598 }, { "epoch": 0.03229458701746819, "grad_norm": 0.9014303088188171, "learning_rate": 9.998880866213033e-06, "loss": 0.8735, "step": 599 }, { "epoch": 0.03234850118611171, "grad_norm": 0.989251971244812, "learning_rate": 9.998876376380555e-06, "loss": 0.8872, "step": 600 }, { "epoch": 0.03240241535475523, "grad_norm": 1.0256885290145874, "learning_rate": 9.99887187756077e-06, "loss": 0.8787, "step": 601 }, { "epoch": 0.03245632952339875, "grad_norm": 0.9560148119926453, "learning_rate": 9.998867369753688e-06, "loss": 0.8301, "step": 602 }, { "epoch": 0.03251024369204227, "grad_norm": 1.044754147529602, "learning_rate": 9.998862852959316e-06, "loss": 0.9286, "step": 603 }, { "epoch": 0.03256415786068579, "grad_norm": 0.8769629597663879, "learning_rate": 9.998858327177665e-06, "loss": 0.7927, "step": 604 }, { "epoch": 0.03261807202932931, "grad_norm": 0.9217430949211121, "learning_rate": 9.99885379240874e-06, "loss": 0.8327, "step": 605 }, { "epoch": 0.032671986197972826, "grad_norm": 0.8202590942382812, "learning_rate": 9.99884924865255e-06, "loss": 0.7269, "step": 606 }, { "epoch": 0.032725900366616344, "grad_norm": 0.9598796367645264, "learning_rate": 9.998844695909102e-06, "loss": 0.9329, "step": 607 }, { "epoch": 0.03277981453525987, "grad_norm": 1.1016643047332764, "learning_rate": 9.998840134178407e-06, "loss": 0.9836, "step": 608 }, { "epoch": 0.032833728703903386, "grad_norm": 0.9639281630516052, "learning_rate": 9.998835563460471e-06, "loss": 0.8475, "step": 609 }, { "epoch": 0.032887642872546904, "grad_norm": 0.9266204833984375, "learning_rate": 9.998830983755304e-06, "loss": 0.7307, "step": 610 }, { "epoch": 0.03294155704119042, "grad_norm": 0.9282877445220947, "learning_rate": 9.99882639506291e-06, "loss": 0.8163, "step": 611 }, { "epoch": 0.03299547120983395, "grad_norm": 0.8939738869667053, "learning_rate": 9.998821797383302e-06, "loss": 0.6902, "step": 612 }, { "epoch": 0.033049385378477465, "grad_norm": 0.9041041731834412, "learning_rate": 9.998817190716488e-06, "loss": 0.8735, "step": 613 }, { "epoch": 0.03310329954712098, "grad_norm": 0.9973318576812744, "learning_rate": 9.998812575062473e-06, "loss": 0.9017, "step": 614 }, { "epoch": 0.0331572137157645, "grad_norm": 1.0416412353515625, "learning_rate": 9.998807950421268e-06, "loss": 0.9293, "step": 615 }, { "epoch": 0.03321112788440802, "grad_norm": 0.8686584234237671, "learning_rate": 9.998803316792882e-06, "loss": 0.8585, "step": 616 }, { "epoch": 0.033265042053051544, "grad_norm": 0.9907833337783813, "learning_rate": 9.998798674177319e-06, "loss": 0.9264, "step": 617 }, { "epoch": 0.03331895622169506, "grad_norm": 0.9927001595497131, "learning_rate": 9.998794022574592e-06, "loss": 0.895, "step": 618 }, { "epoch": 0.03337287039033858, "grad_norm": 0.9314623475074768, "learning_rate": 9.998789361984707e-06, "loss": 0.8353, "step": 619 }, { "epoch": 0.0334267845589821, "grad_norm": 0.9768248796463013, "learning_rate": 9.998784692407673e-06, "loss": 0.8917, "step": 620 }, { "epoch": 0.03348069872762562, "grad_norm": 0.9487942457199097, "learning_rate": 9.998780013843498e-06, "loss": 0.9022, "step": 621 }, { "epoch": 0.03353461289626914, "grad_norm": 1.0376895666122437, "learning_rate": 9.99877532629219e-06, "loss": 0.7692, "step": 622 }, { "epoch": 0.03358852706491266, "grad_norm": 1.021345853805542, "learning_rate": 9.99877062975376e-06, "loss": 1.0386, "step": 623 }, { "epoch": 0.033642441233556176, "grad_norm": 0.9979421496391296, "learning_rate": 9.998765924228214e-06, "loss": 0.9209, "step": 624 }, { "epoch": 0.0336963554021997, "grad_norm": 0.8552166819572449, "learning_rate": 9.998761209715559e-06, "loss": 0.8765, "step": 625 }, { "epoch": 0.03375026957084322, "grad_norm": 0.9737898707389832, "learning_rate": 9.998756486215809e-06, "loss": 0.7459, "step": 626 }, { "epoch": 0.03380418373948674, "grad_norm": 1.1067259311676025, "learning_rate": 9.998751753728967e-06, "loss": 0.8582, "step": 627 }, { "epoch": 0.033858097908130255, "grad_norm": 1.0689613819122314, "learning_rate": 9.998747012255044e-06, "loss": 0.8523, "step": 628 }, { "epoch": 0.03391201207677377, "grad_norm": 1.1880419254302979, "learning_rate": 9.998742261794048e-06, "loss": 0.9085, "step": 629 }, { "epoch": 0.0339659262454173, "grad_norm": 0.9569217562675476, "learning_rate": 9.998737502345987e-06, "loss": 0.9112, "step": 630 }, { "epoch": 0.034019840414060816, "grad_norm": 0.9955928921699524, "learning_rate": 9.99873273391087e-06, "loss": 0.9166, "step": 631 }, { "epoch": 0.034073754582704334, "grad_norm": 0.8906963467597961, "learning_rate": 9.998727956488708e-06, "loss": 0.882, "step": 632 }, { "epoch": 0.03412766875134785, "grad_norm": 0.9241589307785034, "learning_rate": 9.998723170079506e-06, "loss": 0.8488, "step": 633 }, { "epoch": 0.03418158291999138, "grad_norm": 0.9666005969047546, "learning_rate": 9.998718374683271e-06, "loss": 0.8432, "step": 634 }, { "epoch": 0.034235497088634895, "grad_norm": 0.9036918878555298, "learning_rate": 9.998713570300018e-06, "loss": 0.7979, "step": 635 }, { "epoch": 0.03428941125727841, "grad_norm": 0.8946508765220642, "learning_rate": 9.998708756929751e-06, "loss": 0.8854, "step": 636 }, { "epoch": 0.03434332542592193, "grad_norm": 1.0300164222717285, "learning_rate": 9.99870393457248e-06, "loss": 0.9116, "step": 637 }, { "epoch": 0.03439723959456545, "grad_norm": 1.0635035037994385, "learning_rate": 9.998699103228214e-06, "loss": 0.9138, "step": 638 }, { "epoch": 0.03445115376320897, "grad_norm": 1.0362621545791626, "learning_rate": 9.998694262896962e-06, "loss": 1.0177, "step": 639 }, { "epoch": 0.03450506793185249, "grad_norm": 0.9081454873085022, "learning_rate": 9.99868941357873e-06, "loss": 0.7802, "step": 640 }, { "epoch": 0.03455898210049601, "grad_norm": 0.9943915605545044, "learning_rate": 9.998684555273529e-06, "loss": 0.9356, "step": 641 }, { "epoch": 0.03461289626913953, "grad_norm": 0.9647786021232605, "learning_rate": 9.998679687981367e-06, "loss": 0.741, "step": 642 }, { "epoch": 0.03466681043778305, "grad_norm": 0.9655315279960632, "learning_rate": 9.998674811702255e-06, "loss": 0.8644, "step": 643 }, { "epoch": 0.03472072460642657, "grad_norm": 0.9162091612815857, "learning_rate": 9.998669926436197e-06, "loss": 0.8383, "step": 644 }, { "epoch": 0.03477463877507009, "grad_norm": 0.9509754776954651, "learning_rate": 9.998665032183207e-06, "loss": 0.8066, "step": 645 }, { "epoch": 0.034828552943713606, "grad_norm": 1.0545740127563477, "learning_rate": 9.998660128943292e-06, "loss": 0.8455, "step": 646 }, { "epoch": 0.03488246711235713, "grad_norm": 1.0928760766983032, "learning_rate": 9.998655216716458e-06, "loss": 0.8708, "step": 647 }, { "epoch": 0.03493638128100065, "grad_norm": 0.9743762016296387, "learning_rate": 9.998650295502717e-06, "loss": 0.878, "step": 648 }, { "epoch": 0.03499029544964417, "grad_norm": 1.016741156578064, "learning_rate": 9.998645365302077e-06, "loss": 0.867, "step": 649 }, { "epoch": 0.035044209618287685, "grad_norm": 1.125252366065979, "learning_rate": 9.998640426114548e-06, "loss": 0.9443, "step": 650 }, { "epoch": 0.0350981237869312, "grad_norm": 0.9555762410163879, "learning_rate": 9.998635477940135e-06, "loss": 0.8353, "step": 651 }, { "epoch": 0.03515203795557473, "grad_norm": 0.930173397064209, "learning_rate": 9.998630520778851e-06, "loss": 0.8383, "step": 652 }, { "epoch": 0.035205952124218245, "grad_norm": 1.1592127084732056, "learning_rate": 9.998625554630704e-06, "loss": 0.9708, "step": 653 }, { "epoch": 0.03525986629286176, "grad_norm": 0.9333894848823547, "learning_rate": 9.998620579495701e-06, "loss": 0.9055, "step": 654 }, { "epoch": 0.03531378046150528, "grad_norm": 0.9495646357536316, "learning_rate": 9.998615595373853e-06, "loss": 0.7993, "step": 655 }, { "epoch": 0.035367694630148806, "grad_norm": 1.0919233560562134, "learning_rate": 9.99861060226517e-06, "loss": 0.8852, "step": 656 }, { "epoch": 0.035421608798792324, "grad_norm": 0.907940685749054, "learning_rate": 9.998605600169657e-06, "loss": 0.8294, "step": 657 }, { "epoch": 0.03547552296743584, "grad_norm": 1.0423756837844849, "learning_rate": 9.998600589087328e-06, "loss": 0.8758, "step": 658 }, { "epoch": 0.03552943713607936, "grad_norm": 1.0387269258499146, "learning_rate": 9.998595569018186e-06, "loss": 0.9099, "step": 659 }, { "epoch": 0.03558335130472288, "grad_norm": 0.9186104536056519, "learning_rate": 9.998590539962245e-06, "loss": 0.9025, "step": 660 }, { "epoch": 0.0356372654733664, "grad_norm": 1.0173289775848389, "learning_rate": 9.998585501919514e-06, "loss": 0.8468, "step": 661 }, { "epoch": 0.03569117964200992, "grad_norm": 0.9579570889472961, "learning_rate": 9.998580454889996e-06, "loss": 0.8542, "step": 662 }, { "epoch": 0.03574509381065344, "grad_norm": 1.093515396118164, "learning_rate": 9.99857539887371e-06, "loss": 0.8932, "step": 663 }, { "epoch": 0.03579900797929696, "grad_norm": 1.0651243925094604, "learning_rate": 9.998570333870656e-06, "loss": 0.8822, "step": 664 }, { "epoch": 0.03585292214794048, "grad_norm": 0.973278284072876, "learning_rate": 9.998565259880845e-06, "loss": 0.8724, "step": 665 }, { "epoch": 0.035906836316584, "grad_norm": 0.961321234703064, "learning_rate": 9.998560176904291e-06, "loss": 0.947, "step": 666 }, { "epoch": 0.03596075048522752, "grad_norm": 1.0216654539108276, "learning_rate": 9.998555084940999e-06, "loss": 0.8528, "step": 667 }, { "epoch": 0.036014664653871035, "grad_norm": 0.9917817711830139, "learning_rate": 9.99854998399098e-06, "loss": 0.8608, "step": 668 }, { "epoch": 0.03606857882251455, "grad_norm": 1.0164326429367065, "learning_rate": 9.998544874054243e-06, "loss": 0.8752, "step": 669 }, { "epoch": 0.03612249299115808, "grad_norm": 0.9181317687034607, "learning_rate": 9.998539755130793e-06, "loss": 0.8032, "step": 670 }, { "epoch": 0.036176407159801596, "grad_norm": 1.0100011825561523, "learning_rate": 9.998534627220646e-06, "loss": 0.9205, "step": 671 }, { "epoch": 0.036230321328445114, "grad_norm": 0.9306463599205017, "learning_rate": 9.998529490323807e-06, "loss": 0.8209, "step": 672 }, { "epoch": 0.03628423549708863, "grad_norm": 1.8988754749298096, "learning_rate": 9.998524344440286e-06, "loss": 0.8455, "step": 673 }, { "epoch": 0.03633814966573216, "grad_norm": 0.9742317795753479, "learning_rate": 9.998519189570091e-06, "loss": 0.8733, "step": 674 }, { "epoch": 0.036392063834375675, "grad_norm": 0.9334224462509155, "learning_rate": 9.998514025713234e-06, "loss": 0.8761, "step": 675 }, { "epoch": 0.03644597800301919, "grad_norm": 0.9729838371276855, "learning_rate": 9.998508852869724e-06, "loss": 0.8916, "step": 676 }, { "epoch": 0.03649989217166271, "grad_norm": 0.9721505641937256, "learning_rate": 9.998503671039568e-06, "loss": 0.8735, "step": 677 }, { "epoch": 0.036553806340306236, "grad_norm": 0.9600850939750671, "learning_rate": 9.998498480222775e-06, "loss": 0.9157, "step": 678 }, { "epoch": 0.036607720508949754, "grad_norm": 0.9010732173919678, "learning_rate": 9.998493280419358e-06, "loss": 0.9215, "step": 679 }, { "epoch": 0.03666163467759327, "grad_norm": 0.8708087801933289, "learning_rate": 9.998488071629324e-06, "loss": 0.7218, "step": 680 }, { "epoch": 0.03671554884623679, "grad_norm": 0.9739180207252502, "learning_rate": 9.998482853852682e-06, "loss": 0.8845, "step": 681 }, { "epoch": 0.03676946301488031, "grad_norm": 0.9823595881462097, "learning_rate": 9.998477627089443e-06, "loss": 0.896, "step": 682 }, { "epoch": 0.03682337718352383, "grad_norm": 0.9629859328269958, "learning_rate": 9.998472391339612e-06, "loss": 0.8636, "step": 683 }, { "epoch": 0.03687729135216735, "grad_norm": 0.8644251823425293, "learning_rate": 9.998467146603206e-06, "loss": 0.9124, "step": 684 }, { "epoch": 0.03693120552081087, "grad_norm": 0.8987632989883423, "learning_rate": 9.99846189288023e-06, "loss": 0.801, "step": 685 }, { "epoch": 0.036985119689454386, "grad_norm": 0.9017630219459534, "learning_rate": 9.99845663017069e-06, "loss": 0.8675, "step": 686 }, { "epoch": 0.03703903385809791, "grad_norm": 0.8905850648880005, "learning_rate": 9.998451358474603e-06, "loss": 0.8512, "step": 687 }, { "epoch": 0.03709294802674143, "grad_norm": 0.9807800650596619, "learning_rate": 9.998446077791972e-06, "loss": 0.9258, "step": 688 }, { "epoch": 0.03714686219538495, "grad_norm": 0.8916336894035339, "learning_rate": 9.99844078812281e-06, "loss": 0.8236, "step": 689 }, { "epoch": 0.037200776364028465, "grad_norm": 0.9330187439918518, "learning_rate": 9.998435489467126e-06, "loss": 0.7812, "step": 690 }, { "epoch": 0.03725469053267198, "grad_norm": 0.9859142899513245, "learning_rate": 9.99843018182493e-06, "loss": 0.8699, "step": 691 }, { "epoch": 0.03730860470131551, "grad_norm": 0.9277002215385437, "learning_rate": 9.998424865196228e-06, "loss": 0.9276, "step": 692 }, { "epoch": 0.037362518869959026, "grad_norm": 0.9764281511306763, "learning_rate": 9.998419539581034e-06, "loss": 0.9482, "step": 693 }, { "epoch": 0.037416433038602544, "grad_norm": 1.0108616352081299, "learning_rate": 9.998414204979357e-06, "loss": 0.8582, "step": 694 }, { "epoch": 0.03747034720724606, "grad_norm": 1.2767362594604492, "learning_rate": 9.998408861391202e-06, "loss": 0.7833, "step": 695 }, { "epoch": 0.03752426137588959, "grad_norm": 0.8874560594558716, "learning_rate": 9.998403508816585e-06, "loss": 0.8935, "step": 696 }, { "epoch": 0.037578175544533104, "grad_norm": 0.8549458980560303, "learning_rate": 9.998398147255511e-06, "loss": 0.7747, "step": 697 }, { "epoch": 0.03763208971317662, "grad_norm": 0.9971988201141357, "learning_rate": 9.998392776707993e-06, "loss": 0.753, "step": 698 }, { "epoch": 0.03768600388182014, "grad_norm": 0.9822113513946533, "learning_rate": 9.998387397174037e-06, "loss": 0.9121, "step": 699 }, { "epoch": 0.037739918050463665, "grad_norm": 0.996151864528656, "learning_rate": 9.998382008653656e-06, "loss": 0.9356, "step": 700 }, { "epoch": 0.03779383221910718, "grad_norm": 1.7505156993865967, "learning_rate": 9.998376611146857e-06, "loss": 0.8351, "step": 701 }, { "epoch": 0.0378477463877507, "grad_norm": 1.070356011390686, "learning_rate": 9.998371204653651e-06, "loss": 0.9153, "step": 702 }, { "epoch": 0.03790166055639422, "grad_norm": 0.9383741617202759, "learning_rate": 9.998365789174048e-06, "loss": 0.8904, "step": 703 }, { "epoch": 0.03795557472503774, "grad_norm": 0.8444882035255432, "learning_rate": 9.998360364708058e-06, "loss": 0.8243, "step": 704 }, { "epoch": 0.03800948889368126, "grad_norm": 1.0012257099151611, "learning_rate": 9.99835493125569e-06, "loss": 0.9439, "step": 705 }, { "epoch": 0.03806340306232478, "grad_norm": 0.9745193719863892, "learning_rate": 9.998349488816954e-06, "loss": 0.8667, "step": 706 }, { "epoch": 0.0381173172309683, "grad_norm": 0.8363852500915527, "learning_rate": 9.998344037391859e-06, "loss": 0.8082, "step": 707 }, { "epoch": 0.038171231399611816, "grad_norm": 0.9389918446540833, "learning_rate": 9.998338576980417e-06, "loss": 0.8113, "step": 708 }, { "epoch": 0.03822514556825534, "grad_norm": 0.9216110110282898, "learning_rate": 9.998333107582635e-06, "loss": 0.8179, "step": 709 }, { "epoch": 0.03827905973689886, "grad_norm": 1.0292471647262573, "learning_rate": 9.998327629198526e-06, "loss": 0.8605, "step": 710 }, { "epoch": 0.03833297390554238, "grad_norm": 0.9812708497047424, "learning_rate": 9.998322141828097e-06, "loss": 0.9279, "step": 711 }, { "epoch": 0.038386888074185894, "grad_norm": 0.8186620473861694, "learning_rate": 9.998316645471358e-06, "loss": 0.7877, "step": 712 }, { "epoch": 0.03844080224282941, "grad_norm": 1.034134864807129, "learning_rate": 9.99831114012832e-06, "loss": 0.9867, "step": 713 }, { "epoch": 0.03849471641147294, "grad_norm": 1.1604938507080078, "learning_rate": 9.998305625798993e-06, "loss": 0.9134, "step": 714 }, { "epoch": 0.038548630580116455, "grad_norm": 0.8452483415603638, "learning_rate": 9.998300102483388e-06, "loss": 0.8732, "step": 715 }, { "epoch": 0.03860254474875997, "grad_norm": 0.8881269693374634, "learning_rate": 9.998294570181512e-06, "loss": 0.847, "step": 716 }, { "epoch": 0.03865645891740349, "grad_norm": 0.8822013735771179, "learning_rate": 9.998289028893375e-06, "loss": 0.8404, "step": 717 }, { "epoch": 0.038710373086047016, "grad_norm": 1.0011916160583496, "learning_rate": 9.998283478618991e-06, "loss": 0.8133, "step": 718 }, { "epoch": 0.038764287254690534, "grad_norm": 1.0004018545150757, "learning_rate": 9.998277919358367e-06, "loss": 0.9556, "step": 719 }, { "epoch": 0.03881820142333405, "grad_norm": 0.8176954984664917, "learning_rate": 9.998272351111513e-06, "loss": 0.7977, "step": 720 }, { "epoch": 0.03887211559197757, "grad_norm": 0.9160690307617188, "learning_rate": 9.99826677387844e-06, "loss": 0.9239, "step": 721 }, { "epoch": 0.03892602976062109, "grad_norm": 1.2158405780792236, "learning_rate": 9.998261187659157e-06, "loss": 0.9023, "step": 722 }, { "epoch": 0.03897994392926461, "grad_norm": 0.9564448595046997, "learning_rate": 9.998255592453674e-06, "loss": 0.8585, "step": 723 }, { "epoch": 0.03903385809790813, "grad_norm": 0.8902252316474915, "learning_rate": 9.998249988262002e-06, "loss": 0.8388, "step": 724 }, { "epoch": 0.03908777226655165, "grad_norm": 0.8738620281219482, "learning_rate": 9.998244375084152e-06, "loss": 0.9545, "step": 725 }, { "epoch": 0.03914168643519517, "grad_norm": 0.9670735001564026, "learning_rate": 9.99823875292013e-06, "loss": 0.8335, "step": 726 }, { "epoch": 0.03919560060383869, "grad_norm": 0.8719429969787598, "learning_rate": 9.998233121769952e-06, "loss": 0.8546, "step": 727 }, { "epoch": 0.03924951477248221, "grad_norm": 1.318429708480835, "learning_rate": 9.998227481633622e-06, "loss": 1.0658, "step": 728 }, { "epoch": 0.03930342894112573, "grad_norm": 0.962630569934845, "learning_rate": 9.998221832511155e-06, "loss": 0.9049, "step": 729 }, { "epoch": 0.039357343109769245, "grad_norm": 0.9639857411384583, "learning_rate": 9.998216174402558e-06, "loss": 0.9114, "step": 730 }, { "epoch": 0.03941125727841277, "grad_norm": 1.1621571779251099, "learning_rate": 9.998210507307843e-06, "loss": 0.8776, "step": 731 }, { "epoch": 0.03946517144705629, "grad_norm": 1.170089840888977, "learning_rate": 9.998204831227019e-06, "loss": 0.9928, "step": 732 }, { "epoch": 0.039519085615699806, "grad_norm": 0.8257297873497009, "learning_rate": 9.998199146160098e-06, "loss": 0.7885, "step": 733 }, { "epoch": 0.039572999784343324, "grad_norm": 0.8887513279914856, "learning_rate": 9.998193452107088e-06, "loss": 0.8389, "step": 734 }, { "epoch": 0.03962691395298684, "grad_norm": 0.9321185350418091, "learning_rate": 9.998187749068001e-06, "loss": 0.9083, "step": 735 }, { "epoch": 0.03968082812163037, "grad_norm": 0.9926772713661194, "learning_rate": 9.998182037042847e-06, "loss": 0.9102, "step": 736 }, { "epoch": 0.039734742290273885, "grad_norm": 1.0760009288787842, "learning_rate": 9.998176316031634e-06, "loss": 0.7781, "step": 737 }, { "epoch": 0.0397886564589174, "grad_norm": 1.0998133420944214, "learning_rate": 9.998170586034376e-06, "loss": 0.9725, "step": 738 }, { "epoch": 0.03984257062756092, "grad_norm": 0.9367475509643555, "learning_rate": 9.99816484705108e-06, "loss": 0.8277, "step": 739 }, { "epoch": 0.039896484796204446, "grad_norm": 0.942954957485199, "learning_rate": 9.998159099081758e-06, "loss": 0.8542, "step": 740 }, { "epoch": 0.039950398964847963, "grad_norm": 0.9841166138648987, "learning_rate": 9.998153342126421e-06, "loss": 0.9179, "step": 741 }, { "epoch": 0.04000431313349148, "grad_norm": 0.9215245246887207, "learning_rate": 9.998147576185077e-06, "loss": 0.8899, "step": 742 }, { "epoch": 0.040058227302135, "grad_norm": 1.0368192195892334, "learning_rate": 9.998141801257739e-06, "loss": 0.9828, "step": 743 }, { "epoch": 0.04011214147077852, "grad_norm": 0.9696660041809082, "learning_rate": 9.998136017344416e-06, "loss": 0.9431, "step": 744 }, { "epoch": 0.04016605563942204, "grad_norm": 1.111257791519165, "learning_rate": 9.998130224445117e-06, "loss": 0.9666, "step": 745 }, { "epoch": 0.04021996980806556, "grad_norm": 0.9260644316673279, "learning_rate": 9.998124422559856e-06, "loss": 0.8941, "step": 746 }, { "epoch": 0.04027388397670908, "grad_norm": 0.8622020483016968, "learning_rate": 9.99811861168864e-06, "loss": 0.8148, "step": 747 }, { "epoch": 0.040327798145352596, "grad_norm": 0.8767471313476562, "learning_rate": 9.998112791831483e-06, "loss": 0.7093, "step": 748 }, { "epoch": 0.04038171231399612, "grad_norm": 0.902917206287384, "learning_rate": 9.998106962988391e-06, "loss": 0.7677, "step": 749 }, { "epoch": 0.04043562648263964, "grad_norm": 1.351694941520691, "learning_rate": 9.998101125159377e-06, "loss": 1.0382, "step": 750 }, { "epoch": 0.04048954065128316, "grad_norm": 0.8547930121421814, "learning_rate": 9.998095278344452e-06, "loss": 0.7974, "step": 751 }, { "epoch": 0.040543454819926675, "grad_norm": 0.941149115562439, "learning_rate": 9.998089422543626e-06, "loss": 0.8518, "step": 752 }, { "epoch": 0.0405973689885702, "grad_norm": 0.8671521544456482, "learning_rate": 9.998083557756908e-06, "loss": 0.8049, "step": 753 }, { "epoch": 0.04065128315721372, "grad_norm": 0.9877942800521851, "learning_rate": 9.998077683984311e-06, "loss": 0.8874, "step": 754 }, { "epoch": 0.040705197325857236, "grad_norm": 1.2130393981933594, "learning_rate": 9.998071801225843e-06, "loss": 0.9794, "step": 755 }, { "epoch": 0.040759111494500753, "grad_norm": 0.9422823786735535, "learning_rate": 9.998065909481518e-06, "loss": 0.899, "step": 756 }, { "epoch": 0.04081302566314427, "grad_norm": 0.9770492911338806, "learning_rate": 9.998060008751343e-06, "loss": 0.8434, "step": 757 }, { "epoch": 0.040866939831787796, "grad_norm": 0.9227531552314758, "learning_rate": 9.998054099035332e-06, "loss": 0.8797, "step": 758 }, { "epoch": 0.040920854000431314, "grad_norm": 1.0452102422714233, "learning_rate": 9.998048180333492e-06, "loss": 0.8702, "step": 759 }, { "epoch": 0.04097476816907483, "grad_norm": 1.034125566482544, "learning_rate": 9.998042252645837e-06, "loss": 0.9041, "step": 760 }, { "epoch": 0.04102868233771835, "grad_norm": 0.886029064655304, "learning_rate": 9.998036315972375e-06, "loss": 0.7805, "step": 761 }, { "epoch": 0.041082596506361875, "grad_norm": 0.9845888614654541, "learning_rate": 9.998030370313116e-06, "loss": 0.9836, "step": 762 }, { "epoch": 0.04113651067500539, "grad_norm": 0.9223973155021667, "learning_rate": 9.998024415668075e-06, "loss": 0.768, "step": 763 }, { "epoch": 0.04119042484364891, "grad_norm": 1.0607362985610962, "learning_rate": 9.99801845203726e-06, "loss": 0.865, "step": 764 }, { "epoch": 0.04124433901229243, "grad_norm": 0.9620907306671143, "learning_rate": 9.998012479420683e-06, "loss": 0.7645, "step": 765 }, { "epoch": 0.04129825318093595, "grad_norm": 0.9490310549736023, "learning_rate": 9.99800649781835e-06, "loss": 0.9124, "step": 766 }, { "epoch": 0.04135216734957947, "grad_norm": 0.9684557914733887, "learning_rate": 9.99800050723028e-06, "loss": 0.876, "step": 767 }, { "epoch": 0.04140608151822299, "grad_norm": 0.9633080959320068, "learning_rate": 9.997994507656476e-06, "loss": 0.8976, "step": 768 }, { "epoch": 0.04145999568686651, "grad_norm": 0.9495208263397217, "learning_rate": 9.997988499096953e-06, "loss": 0.9049, "step": 769 }, { "epoch": 0.041513909855510026, "grad_norm": 1.0614326000213623, "learning_rate": 9.997982481551721e-06, "loss": 0.905, "step": 770 }, { "epoch": 0.04156782402415355, "grad_norm": 0.820672869682312, "learning_rate": 9.99797645502079e-06, "loss": 0.8306, "step": 771 }, { "epoch": 0.04162173819279707, "grad_norm": 0.9719771146774292, "learning_rate": 9.997970419504171e-06, "loss": 0.828, "step": 772 }, { "epoch": 0.041675652361440586, "grad_norm": 0.893326997756958, "learning_rate": 9.997964375001875e-06, "loss": 0.8416, "step": 773 }, { "epoch": 0.041729566530084104, "grad_norm": 0.858121395111084, "learning_rate": 9.997958321513915e-06, "loss": 0.8779, "step": 774 }, { "epoch": 0.04178348069872762, "grad_norm": 0.9703636765480042, "learning_rate": 9.997952259040297e-06, "loss": 0.8623, "step": 775 }, { "epoch": 0.04183739486737115, "grad_norm": 0.9626398086547852, "learning_rate": 9.997946187581039e-06, "loss": 0.8309, "step": 776 }, { "epoch": 0.041891309036014665, "grad_norm": 0.9132344722747803, "learning_rate": 9.997940107136143e-06, "loss": 0.8798, "step": 777 }, { "epoch": 0.04194522320465818, "grad_norm": 0.9608821272850037, "learning_rate": 9.997934017705629e-06, "loss": 0.8764, "step": 778 }, { "epoch": 0.0419991373733017, "grad_norm": 1.0852513313293457, "learning_rate": 9.997927919289501e-06, "loss": 0.8908, "step": 779 }, { "epoch": 0.042053051541945226, "grad_norm": 0.9690573215484619, "learning_rate": 9.997921811887774e-06, "loss": 0.8556, "step": 780 }, { "epoch": 0.042106965710588744, "grad_norm": 0.9107050895690918, "learning_rate": 9.997915695500458e-06, "loss": 0.9249, "step": 781 }, { "epoch": 0.04216087987923226, "grad_norm": 1.029974102973938, "learning_rate": 9.997909570127564e-06, "loss": 0.8369, "step": 782 }, { "epoch": 0.04221479404787578, "grad_norm": 0.8179258704185486, "learning_rate": 9.997903435769101e-06, "loss": 0.7729, "step": 783 }, { "epoch": 0.042268708216519305, "grad_norm": 1.0664961338043213, "learning_rate": 9.997897292425082e-06, "loss": 0.8815, "step": 784 }, { "epoch": 0.04232262238516282, "grad_norm": 0.9794465899467468, "learning_rate": 9.997891140095519e-06, "loss": 0.9244, "step": 785 }, { "epoch": 0.04237653655380634, "grad_norm": 0.875953197479248, "learning_rate": 9.99788497878042e-06, "loss": 0.9191, "step": 786 }, { "epoch": 0.04243045072244986, "grad_norm": 0.9880902767181396, "learning_rate": 9.9978788084798e-06, "loss": 0.8639, "step": 787 }, { "epoch": 0.042484364891093376, "grad_norm": 1.0391566753387451, "learning_rate": 9.997872629193666e-06, "loss": 0.9943, "step": 788 }, { "epoch": 0.0425382790597369, "grad_norm": 0.9321290850639343, "learning_rate": 9.997866440922033e-06, "loss": 0.7809, "step": 789 }, { "epoch": 0.04259219322838042, "grad_norm": 0.8898556232452393, "learning_rate": 9.99786024366491e-06, "loss": 0.9353, "step": 790 }, { "epoch": 0.04264610739702394, "grad_norm": 1.1177983283996582, "learning_rate": 9.997854037422306e-06, "loss": 0.8157, "step": 791 }, { "epoch": 0.042700021565667455, "grad_norm": 0.8821296691894531, "learning_rate": 9.997847822194236e-06, "loss": 0.8729, "step": 792 }, { "epoch": 0.04275393573431098, "grad_norm": 0.8545325398445129, "learning_rate": 9.997841597980709e-06, "loss": 0.8415, "step": 793 }, { "epoch": 0.0428078499029545, "grad_norm": 0.9313606023788452, "learning_rate": 9.997835364781739e-06, "loss": 0.8411, "step": 794 }, { "epoch": 0.042861764071598016, "grad_norm": 0.9587781429290771, "learning_rate": 9.997829122597332e-06, "loss": 0.8086, "step": 795 }, { "epoch": 0.042915678240241534, "grad_norm": 0.9708360433578491, "learning_rate": 9.997822871427504e-06, "loss": 0.8715, "step": 796 }, { "epoch": 0.04296959240888505, "grad_norm": 0.8868080973625183, "learning_rate": 9.997816611272265e-06, "loss": 0.8549, "step": 797 }, { "epoch": 0.04302350657752858, "grad_norm": 0.9147778153419495, "learning_rate": 9.997810342131624e-06, "loss": 0.7854, "step": 798 }, { "epoch": 0.043077420746172095, "grad_norm": 0.9853960275650024, "learning_rate": 9.997804064005596e-06, "loss": 0.8243, "step": 799 }, { "epoch": 0.04313133491481561, "grad_norm": 1.0076130628585815, "learning_rate": 9.997797776894189e-06, "loss": 0.9077, "step": 800 }, { "epoch": 0.04318524908345913, "grad_norm": 0.9694076776504517, "learning_rate": 9.997791480797417e-06, "loss": 0.8767, "step": 801 }, { "epoch": 0.043239163252102655, "grad_norm": 1.114001750946045, "learning_rate": 9.99778517571529e-06, "loss": 0.8211, "step": 802 }, { "epoch": 0.04329307742074617, "grad_norm": 0.9701128005981445, "learning_rate": 9.997778861647817e-06, "loss": 0.9084, "step": 803 }, { "epoch": 0.04334699158938969, "grad_norm": 0.868299126625061, "learning_rate": 9.997772538595015e-06, "loss": 0.7556, "step": 804 }, { "epoch": 0.04340090575803321, "grad_norm": 0.9160446524620056, "learning_rate": 9.997766206556888e-06, "loss": 0.821, "step": 805 }, { "epoch": 0.043454819926676734, "grad_norm": 0.934198260307312, "learning_rate": 9.997759865533454e-06, "loss": 0.9113, "step": 806 }, { "epoch": 0.04350873409532025, "grad_norm": 0.8949079513549805, "learning_rate": 9.997753515524722e-06, "loss": 0.7821, "step": 807 }, { "epoch": 0.04356264826396377, "grad_norm": 0.9035944938659668, "learning_rate": 9.997747156530702e-06, "loss": 0.8233, "step": 808 }, { "epoch": 0.04361656243260729, "grad_norm": 0.9681552052497864, "learning_rate": 9.99774078855141e-06, "loss": 0.9241, "step": 809 }, { "epoch": 0.043670476601250806, "grad_norm": 0.906092643737793, "learning_rate": 9.99773441158685e-06, "loss": 0.8948, "step": 810 }, { "epoch": 0.04372439076989433, "grad_norm": 0.9229143261909485, "learning_rate": 9.997728025637039e-06, "loss": 0.8897, "step": 811 }, { "epoch": 0.04377830493853785, "grad_norm": 0.9263061881065369, "learning_rate": 9.997721630701986e-06, "loss": 0.7923, "step": 812 }, { "epoch": 0.04383221910718137, "grad_norm": 0.8474372029304504, "learning_rate": 9.997715226781706e-06, "loss": 0.796, "step": 813 }, { "epoch": 0.043886133275824885, "grad_norm": 0.9960548877716064, "learning_rate": 9.997708813876206e-06, "loss": 0.9166, "step": 814 }, { "epoch": 0.04394004744446841, "grad_norm": 0.9843032956123352, "learning_rate": 9.997702391985499e-06, "loss": 0.9354, "step": 815 }, { "epoch": 0.04399396161311193, "grad_norm": 0.9313154220581055, "learning_rate": 9.997695961109599e-06, "loss": 0.8972, "step": 816 }, { "epoch": 0.044047875781755445, "grad_norm": 0.8846973180770874, "learning_rate": 9.997689521248515e-06, "loss": 0.8599, "step": 817 }, { "epoch": 0.04410178995039896, "grad_norm": 0.8113641738891602, "learning_rate": 9.99768307240226e-06, "loss": 0.8509, "step": 818 }, { "epoch": 0.04415570411904248, "grad_norm": 1.0659984350204468, "learning_rate": 9.997676614570844e-06, "loss": 0.938, "step": 819 }, { "epoch": 0.044209618287686006, "grad_norm": 0.9183745384216309, "learning_rate": 9.99767014775428e-06, "loss": 0.8761, "step": 820 }, { "epoch": 0.044263532456329524, "grad_norm": 0.87090003490448, "learning_rate": 9.997663671952578e-06, "loss": 0.8535, "step": 821 }, { "epoch": 0.04431744662497304, "grad_norm": 0.9857214093208313, "learning_rate": 9.997657187165753e-06, "loss": 0.9434, "step": 822 }, { "epoch": 0.04437136079361656, "grad_norm": 1.0443209409713745, "learning_rate": 9.997650693393812e-06, "loss": 0.8994, "step": 823 }, { "epoch": 0.044425274962260085, "grad_norm": 0.8348391652107239, "learning_rate": 9.99764419063677e-06, "loss": 0.8383, "step": 824 }, { "epoch": 0.0444791891309036, "grad_norm": 1.2708821296691895, "learning_rate": 9.997637678894639e-06, "loss": 0.8733, "step": 825 }, { "epoch": 0.04453310329954712, "grad_norm": 0.9863126277923584, "learning_rate": 9.997631158167428e-06, "loss": 0.9364, "step": 826 }, { "epoch": 0.04458701746819064, "grad_norm": 1.0223352909088135, "learning_rate": 9.99762462845515e-06, "loss": 0.9139, "step": 827 }, { "epoch": 0.04464093163683416, "grad_norm": 0.8559738397598267, "learning_rate": 9.997618089757818e-06, "loss": 0.7461, "step": 828 }, { "epoch": 0.04469484580547768, "grad_norm": 0.9347368478775024, "learning_rate": 9.997611542075442e-06, "loss": 0.9275, "step": 829 }, { "epoch": 0.0447487599741212, "grad_norm": 1.0208019018173218, "learning_rate": 9.997604985408036e-06, "loss": 0.8338, "step": 830 }, { "epoch": 0.04480267414276472, "grad_norm": 0.9792174100875854, "learning_rate": 9.997598419755607e-06, "loss": 0.9437, "step": 831 }, { "epoch": 0.044856588311408235, "grad_norm": 0.851665198802948, "learning_rate": 9.997591845118173e-06, "loss": 0.8008, "step": 832 }, { "epoch": 0.04491050248005176, "grad_norm": 0.9315025806427002, "learning_rate": 9.997585261495742e-06, "loss": 0.8389, "step": 833 }, { "epoch": 0.04496441664869528, "grad_norm": 0.9658921360969543, "learning_rate": 9.997578668888326e-06, "loss": 0.9252, "step": 834 }, { "epoch": 0.045018330817338796, "grad_norm": 0.8989397287368774, "learning_rate": 9.997572067295938e-06, "loss": 0.8648, "step": 835 }, { "epoch": 0.045072244985982314, "grad_norm": 0.8874988555908203, "learning_rate": 9.99756545671859e-06, "loss": 0.7801, "step": 836 }, { "epoch": 0.04512615915462584, "grad_norm": 0.9186223745346069, "learning_rate": 9.997558837156293e-06, "loss": 0.767, "step": 837 }, { "epoch": 0.04518007332326936, "grad_norm": 1.163044810295105, "learning_rate": 9.997552208609059e-06, "loss": 0.8938, "step": 838 }, { "epoch": 0.045233987491912875, "grad_norm": 0.8315468430519104, "learning_rate": 9.997545571076901e-06, "loss": 0.725, "step": 839 }, { "epoch": 0.04528790166055639, "grad_norm": 1.0088660717010498, "learning_rate": 9.99753892455983e-06, "loss": 0.8533, "step": 840 }, { "epoch": 0.04534181582919991, "grad_norm": 0.9268692135810852, "learning_rate": 9.997532269057857e-06, "loss": 0.8739, "step": 841 }, { "epoch": 0.045395729997843436, "grad_norm": 1.0793242454528809, "learning_rate": 9.997525604570995e-06, "loss": 0.9605, "step": 842 }, { "epoch": 0.045449644166486954, "grad_norm": 1.101798176765442, "learning_rate": 9.997518931099258e-06, "loss": 0.9525, "step": 843 }, { "epoch": 0.04550355833513047, "grad_norm": 0.9046466946601868, "learning_rate": 9.997512248642654e-06, "loss": 0.8853, "step": 844 }, { "epoch": 0.04555747250377399, "grad_norm": 0.9629097580909729, "learning_rate": 9.997505557201198e-06, "loss": 0.8882, "step": 845 }, { "epoch": 0.045611386672417514, "grad_norm": 1.1880977153778076, "learning_rate": 9.997498856774898e-06, "loss": 0.8812, "step": 846 }, { "epoch": 0.04566530084106103, "grad_norm": 0.8678451180458069, "learning_rate": 9.997492147363772e-06, "loss": 0.887, "step": 847 }, { "epoch": 0.04571921500970455, "grad_norm": 1.3359739780426025, "learning_rate": 9.99748542896783e-06, "loss": 0.8141, "step": 848 }, { "epoch": 0.04577312917834807, "grad_norm": 0.9263296127319336, "learning_rate": 9.99747870158708e-06, "loss": 0.9357, "step": 849 }, { "epoch": 0.045827043346991586, "grad_norm": 0.9199776649475098, "learning_rate": 9.997471965221541e-06, "loss": 0.8352, "step": 850 }, { "epoch": 0.04588095751563511, "grad_norm": 0.8880730867385864, "learning_rate": 9.997465219871218e-06, "loss": 0.7802, "step": 851 }, { "epoch": 0.04593487168427863, "grad_norm": 0.8561250567436218, "learning_rate": 9.99745846553613e-06, "loss": 0.7987, "step": 852 }, { "epoch": 0.04598878585292215, "grad_norm": 0.8975661396980286, "learning_rate": 9.997451702216283e-06, "loss": 0.8325, "step": 853 }, { "epoch": 0.046042700021565665, "grad_norm": 0.9350215196609497, "learning_rate": 9.997444929911693e-06, "loss": 0.7708, "step": 854 }, { "epoch": 0.04609661419020919, "grad_norm": 1.0229014158248901, "learning_rate": 9.99743814862237e-06, "loss": 0.9643, "step": 855 }, { "epoch": 0.04615052835885271, "grad_norm": 0.9249217510223389, "learning_rate": 9.997431358348329e-06, "loss": 0.8411, "step": 856 }, { "epoch": 0.046204442527496226, "grad_norm": 0.9823042154312134, "learning_rate": 9.99742455908958e-06, "loss": 0.9406, "step": 857 }, { "epoch": 0.046258356696139744, "grad_norm": 1.2525794506072998, "learning_rate": 9.997417750846134e-06, "loss": 0.8507, "step": 858 }, { "epoch": 0.04631227086478327, "grad_norm": 0.9583309888839722, "learning_rate": 9.997410933618006e-06, "loss": 0.8504, "step": 859 }, { "epoch": 0.046366185033426786, "grad_norm": 0.9264401793479919, "learning_rate": 9.997404107405207e-06, "loss": 0.8595, "step": 860 }, { "epoch": 0.046420099202070304, "grad_norm": 0.9833316206932068, "learning_rate": 9.99739727220775e-06, "loss": 0.9025, "step": 861 }, { "epoch": 0.04647401337071382, "grad_norm": 1.0220664739608765, "learning_rate": 9.997390428025645e-06, "loss": 0.8671, "step": 862 }, { "epoch": 0.04652792753935734, "grad_norm": 1.0774664878845215, "learning_rate": 9.997383574858908e-06, "loss": 0.8463, "step": 863 }, { "epoch": 0.046581841708000865, "grad_norm": 0.8821879029273987, "learning_rate": 9.997376712707547e-06, "loss": 0.7565, "step": 864 }, { "epoch": 0.04663575587664438, "grad_norm": 0.9233925938606262, "learning_rate": 9.997369841571577e-06, "loss": 0.9151, "step": 865 }, { "epoch": 0.0466896700452879, "grad_norm": 1.0006109476089478, "learning_rate": 9.997362961451015e-06, "loss": 0.8339, "step": 866 }, { "epoch": 0.04674358421393142, "grad_norm": 0.865035891532898, "learning_rate": 9.997356072345863e-06, "loss": 0.8997, "step": 867 }, { "epoch": 0.046797498382574944, "grad_norm": 1.0450654029846191, "learning_rate": 9.99734917425614e-06, "loss": 0.7966, "step": 868 }, { "epoch": 0.04685141255121846, "grad_norm": 0.8878824710845947, "learning_rate": 9.997342267181857e-06, "loss": 0.831, "step": 869 }, { "epoch": 0.04690532671986198, "grad_norm": 1.0056546926498413, "learning_rate": 9.997335351123028e-06, "loss": 0.8178, "step": 870 }, { "epoch": 0.0469592408885055, "grad_norm": 1.0531659126281738, "learning_rate": 9.997328426079661e-06, "loss": 0.7773, "step": 871 }, { "epoch": 0.047013155057149016, "grad_norm": 0.911021888256073, "learning_rate": 9.997321492051775e-06, "loss": 0.9001, "step": 872 }, { "epoch": 0.04706706922579254, "grad_norm": 0.920103132724762, "learning_rate": 9.997314549039379e-06, "loss": 0.7222, "step": 873 }, { "epoch": 0.04712098339443606, "grad_norm": 0.9449265599250793, "learning_rate": 9.997307597042483e-06, "loss": 0.9197, "step": 874 }, { "epoch": 0.047174897563079576, "grad_norm": 1.013066291809082, "learning_rate": 9.997300636061103e-06, "loss": 0.8854, "step": 875 }, { "epoch": 0.047228811731723094, "grad_norm": 0.8990256786346436, "learning_rate": 9.99729366609525e-06, "loss": 0.81, "step": 876 }, { "epoch": 0.04728272590036662, "grad_norm": 1.0211769342422485, "learning_rate": 9.997286687144938e-06, "loss": 0.8335, "step": 877 }, { "epoch": 0.04733664006901014, "grad_norm": 1.14606773853302, "learning_rate": 9.997279699210178e-06, "loss": 1.0956, "step": 878 }, { "epoch": 0.047390554237653655, "grad_norm": 0.982725977897644, "learning_rate": 9.997272702290981e-06, "loss": 0.8289, "step": 879 }, { "epoch": 0.04744446840629717, "grad_norm": 0.8667361736297607, "learning_rate": 9.997265696387364e-06, "loss": 0.8056, "step": 880 }, { "epoch": 0.04749838257494069, "grad_norm": 0.9029837250709534, "learning_rate": 9.997258681499338e-06, "loss": 0.8461, "step": 881 }, { "epoch": 0.047552296743584216, "grad_norm": 0.8767060041427612, "learning_rate": 9.997251657626915e-06, "loss": 0.8162, "step": 882 }, { "epoch": 0.047606210912227734, "grad_norm": 1.4750713109970093, "learning_rate": 9.997244624770104e-06, "loss": 0.8677, "step": 883 }, { "epoch": 0.04766012508087125, "grad_norm": 1.001286506652832, "learning_rate": 9.997237582928924e-06, "loss": 0.7673, "step": 884 }, { "epoch": 0.04771403924951477, "grad_norm": 0.9560269713401794, "learning_rate": 9.997230532103384e-06, "loss": 0.8597, "step": 885 }, { "epoch": 0.047767953418158295, "grad_norm": 0.834237277507782, "learning_rate": 9.997223472293499e-06, "loss": 0.7629, "step": 886 }, { "epoch": 0.04782186758680181, "grad_norm": 0.9642406702041626, "learning_rate": 9.997216403499278e-06, "loss": 0.83, "step": 887 }, { "epoch": 0.04787578175544533, "grad_norm": 1.2931480407714844, "learning_rate": 9.997209325720736e-06, "loss": 1.0333, "step": 888 }, { "epoch": 0.04792969592408885, "grad_norm": 0.8024531602859497, "learning_rate": 9.997202238957886e-06, "loss": 0.7166, "step": 889 }, { "epoch": 0.04798361009273237, "grad_norm": 0.9585899710655212, "learning_rate": 9.997195143210741e-06, "loss": 0.8099, "step": 890 }, { "epoch": 0.04803752426137589, "grad_norm": 0.9917063117027283, "learning_rate": 9.997188038479313e-06, "loss": 0.8486, "step": 891 }, { "epoch": 0.04809143843001941, "grad_norm": 1.6290080547332764, "learning_rate": 9.997180924763616e-06, "loss": 0.863, "step": 892 }, { "epoch": 0.04814535259866293, "grad_norm": 0.9488585591316223, "learning_rate": 9.99717380206366e-06, "loss": 0.8277, "step": 893 }, { "epoch": 0.048199266767306445, "grad_norm": 1.0710817575454712, "learning_rate": 9.997166670379459e-06, "loss": 0.8898, "step": 894 }, { "epoch": 0.04825318093594997, "grad_norm": 0.9916248917579651, "learning_rate": 9.997159529711026e-06, "loss": 0.9144, "step": 895 }, { "epoch": 0.04830709510459349, "grad_norm": 1.0074565410614014, "learning_rate": 9.997152380058378e-06, "loss": 0.8391, "step": 896 }, { "epoch": 0.048361009273237006, "grad_norm": 1.0258312225341797, "learning_rate": 9.99714522142152e-06, "loss": 0.973, "step": 897 }, { "epoch": 0.048414923441880524, "grad_norm": 0.9497826099395752, "learning_rate": 9.99713805380047e-06, "loss": 0.9221, "step": 898 }, { "epoch": 0.04846883761052405, "grad_norm": 0.9103115200996399, "learning_rate": 9.99713087719524e-06, "loss": 0.7942, "step": 899 }, { "epoch": 0.04852275177916757, "grad_norm": 0.9810470938682556, "learning_rate": 9.997123691605843e-06, "loss": 0.8673, "step": 900 }, { "epoch": 0.048576665947811085, "grad_norm": 1.0422937870025635, "learning_rate": 9.997116497032291e-06, "loss": 0.9263, "step": 901 }, { "epoch": 0.0486305801164546, "grad_norm": 0.8522017002105713, "learning_rate": 9.997109293474596e-06, "loss": 0.8296, "step": 902 }, { "epoch": 0.04868449428509812, "grad_norm": 0.818270742893219, "learning_rate": 9.997102080932775e-06, "loss": 0.7898, "step": 903 }, { "epoch": 0.048738408453741645, "grad_norm": 0.9286766648292542, "learning_rate": 9.997094859406838e-06, "loss": 0.8751, "step": 904 }, { "epoch": 0.04879232262238516, "grad_norm": 1.0779087543487549, "learning_rate": 9.997087628896797e-06, "loss": 0.8377, "step": 905 }, { "epoch": 0.04884623679102868, "grad_norm": 0.8711867928504944, "learning_rate": 9.997080389402667e-06, "loss": 0.8547, "step": 906 }, { "epoch": 0.0489001509596722, "grad_norm": 0.8919721245765686, "learning_rate": 9.99707314092446e-06, "loss": 0.8178, "step": 907 }, { "epoch": 0.048954065128315724, "grad_norm": 0.9084917306900024, "learning_rate": 9.997065883462192e-06, "loss": 0.8618, "step": 908 }, { "epoch": 0.04900797929695924, "grad_norm": 0.869216799736023, "learning_rate": 9.997058617015871e-06, "loss": 0.8636, "step": 909 }, { "epoch": 0.04906189346560276, "grad_norm": 0.9376553893089294, "learning_rate": 9.997051341585513e-06, "loss": 0.8986, "step": 910 }, { "epoch": 0.04911580763424628, "grad_norm": 0.9041107892990112, "learning_rate": 9.99704405717113e-06, "loss": 0.817, "step": 911 }, { "epoch": 0.0491697218028898, "grad_norm": 0.9530431628227234, "learning_rate": 9.997036763772737e-06, "loss": 0.9464, "step": 912 }, { "epoch": 0.04922363597153332, "grad_norm": 0.9601117968559265, "learning_rate": 9.997029461390344e-06, "loss": 0.9014, "step": 913 }, { "epoch": 0.04927755014017684, "grad_norm": 0.9162781834602356, "learning_rate": 9.997022150023968e-06, "loss": 0.8851, "step": 914 }, { "epoch": 0.04933146430882036, "grad_norm": 0.9514605402946472, "learning_rate": 9.99701482967362e-06, "loss": 0.8975, "step": 915 }, { "epoch": 0.049385378477463875, "grad_norm": 0.897203803062439, "learning_rate": 9.997007500339313e-06, "loss": 0.8371, "step": 916 }, { "epoch": 0.0494392926461074, "grad_norm": 0.9372673630714417, "learning_rate": 9.99700016202106e-06, "loss": 0.9432, "step": 917 }, { "epoch": 0.04949320681475092, "grad_norm": 0.8993443846702576, "learning_rate": 9.996992814718875e-06, "loss": 0.8528, "step": 918 }, { "epoch": 0.049547120983394435, "grad_norm": 0.9300720691680908, "learning_rate": 9.996985458432771e-06, "loss": 0.873, "step": 919 }, { "epoch": 0.04960103515203795, "grad_norm": 0.9311426281929016, "learning_rate": 9.996978093162761e-06, "loss": 0.9092, "step": 920 }, { "epoch": 0.04965494932068148, "grad_norm": 0.9244507551193237, "learning_rate": 9.996970718908859e-06, "loss": 0.764, "step": 921 }, { "epoch": 0.049708863489324996, "grad_norm": 0.915512204170227, "learning_rate": 9.996963335671074e-06, "loss": 0.8328, "step": 922 }, { "epoch": 0.049762777657968514, "grad_norm": 0.889994740486145, "learning_rate": 9.996955943449426e-06, "loss": 0.8491, "step": 923 }, { "epoch": 0.04981669182661203, "grad_norm": 0.8676478266716003, "learning_rate": 9.996948542243925e-06, "loss": 0.7677, "step": 924 }, { "epoch": 0.04987060599525555, "grad_norm": 0.9795013070106506, "learning_rate": 9.996941132054586e-06, "loss": 0.9279, "step": 925 }, { "epoch": 0.049924520163899075, "grad_norm": 0.940078854560852, "learning_rate": 9.996933712881419e-06, "loss": 0.8685, "step": 926 }, { "epoch": 0.04997843433254259, "grad_norm": 0.9440926313400269, "learning_rate": 9.996926284724437e-06, "loss": 0.9634, "step": 927 }, { "epoch": 0.05003234850118611, "grad_norm": 0.9120537638664246, "learning_rate": 9.99691884758366e-06, "loss": 0.7656, "step": 928 }, { "epoch": 0.05008626266982963, "grad_norm": 1.1514596939086914, "learning_rate": 9.996911401459093e-06, "loss": 0.864, "step": 929 }, { "epoch": 0.050140176838473154, "grad_norm": 0.8924434185028076, "learning_rate": 9.996903946350756e-06, "loss": 0.877, "step": 930 }, { "epoch": 0.05019409100711667, "grad_norm": 0.9884456992149353, "learning_rate": 9.996896482258657e-06, "loss": 0.94, "step": 931 }, { "epoch": 0.05024800517576019, "grad_norm": 0.9282665252685547, "learning_rate": 9.996889009182814e-06, "loss": 0.8443, "step": 932 }, { "epoch": 0.05030191934440371, "grad_norm": 1.1029064655303955, "learning_rate": 9.996881527123237e-06, "loss": 0.9168, "step": 933 }, { "epoch": 0.050355833513047225, "grad_norm": 0.839625358581543, "learning_rate": 9.996874036079942e-06, "loss": 0.8261, "step": 934 }, { "epoch": 0.05040974768169075, "grad_norm": 0.8612869381904602, "learning_rate": 9.996866536052942e-06, "loss": 0.8197, "step": 935 }, { "epoch": 0.05046366185033427, "grad_norm": 0.9483891129493713, "learning_rate": 9.996859027042249e-06, "loss": 0.8374, "step": 936 }, { "epoch": 0.050517576018977786, "grad_norm": 0.9374566674232483, "learning_rate": 9.996851509047877e-06, "loss": 0.8884, "step": 937 }, { "epoch": 0.050571490187621304, "grad_norm": 0.9164647459983826, "learning_rate": 9.99684398206984e-06, "loss": 0.8419, "step": 938 }, { "epoch": 0.05062540435626483, "grad_norm": 1.0109184980392456, "learning_rate": 9.996836446108153e-06, "loss": 0.8912, "step": 939 }, { "epoch": 0.05067931852490835, "grad_norm": 0.8549674153327942, "learning_rate": 9.996828901162825e-06, "loss": 0.8043, "step": 940 }, { "epoch": 0.050733232693551865, "grad_norm": 0.9618684649467468, "learning_rate": 9.996821347233875e-06, "loss": 0.8246, "step": 941 }, { "epoch": 0.05078714686219538, "grad_norm": 0.9777100682258606, "learning_rate": 9.996813784321314e-06, "loss": 0.887, "step": 942 }, { "epoch": 0.05084106103083891, "grad_norm": 0.8675182461738586, "learning_rate": 9.996806212425157e-06, "loss": 0.7584, "step": 943 }, { "epoch": 0.050894975199482426, "grad_norm": 0.9174523949623108, "learning_rate": 9.996798631545414e-06, "loss": 0.8911, "step": 944 }, { "epoch": 0.050948889368125944, "grad_norm": 0.9269078373908997, "learning_rate": 9.996791041682101e-06, "loss": 0.8049, "step": 945 }, { "epoch": 0.05100280353676946, "grad_norm": 0.8447721600532532, "learning_rate": 9.996783442835233e-06, "loss": 0.7781, "step": 946 }, { "epoch": 0.05105671770541298, "grad_norm": 0.9178231954574585, "learning_rate": 9.99677583500482e-06, "loss": 0.8107, "step": 947 }, { "epoch": 0.051110631874056504, "grad_norm": 0.8741039633750916, "learning_rate": 9.996768218190879e-06, "loss": 0.9278, "step": 948 }, { "epoch": 0.05116454604270002, "grad_norm": 0.7997228503227234, "learning_rate": 9.996760592393425e-06, "loss": 0.7706, "step": 949 }, { "epoch": 0.05121846021134354, "grad_norm": 1.003300428390503, "learning_rate": 9.996752957612468e-06, "loss": 0.8464, "step": 950 }, { "epoch": 0.05127237437998706, "grad_norm": 0.9237748980522156, "learning_rate": 9.996745313848021e-06, "loss": 0.9088, "step": 951 }, { "epoch": 0.05132628854863058, "grad_norm": 0.8565654754638672, "learning_rate": 9.996737661100103e-06, "loss": 0.8208, "step": 952 }, { "epoch": 0.0513802027172741, "grad_norm": 1.0590770244598389, "learning_rate": 9.996729999368722e-06, "loss": 0.9272, "step": 953 }, { "epoch": 0.05143411688591762, "grad_norm": 0.8888198733329773, "learning_rate": 9.996722328653897e-06, "loss": 0.8264, "step": 954 }, { "epoch": 0.05148803105456114, "grad_norm": 0.9211130142211914, "learning_rate": 9.996714648955636e-06, "loss": 0.8807, "step": 955 }, { "epoch": 0.051541945223204655, "grad_norm": 1.0241321325302124, "learning_rate": 9.996706960273958e-06, "loss": 0.7638, "step": 956 }, { "epoch": 0.05159585939184818, "grad_norm": 0.903762698173523, "learning_rate": 9.996699262608875e-06, "loss": 0.8583, "step": 957 }, { "epoch": 0.0516497735604917, "grad_norm": 0.9271189570426941, "learning_rate": 9.9966915559604e-06, "loss": 0.8341, "step": 958 }, { "epoch": 0.051703687729135216, "grad_norm": 0.865260899066925, "learning_rate": 9.996683840328546e-06, "loss": 0.9136, "step": 959 }, { "epoch": 0.051757601897778734, "grad_norm": 0.8903625011444092, "learning_rate": 9.996676115713332e-06, "loss": 0.8706, "step": 960 }, { "epoch": 0.05181151606642226, "grad_norm": 0.9228227138519287, "learning_rate": 9.996668382114765e-06, "loss": 0.8825, "step": 961 }, { "epoch": 0.051865430235065776, "grad_norm": 0.9146421551704407, "learning_rate": 9.996660639532863e-06, "loss": 0.8347, "step": 962 }, { "epoch": 0.051919344403709294, "grad_norm": 0.9010991454124451, "learning_rate": 9.99665288796764e-06, "loss": 0.8016, "step": 963 }, { "epoch": 0.05197325857235281, "grad_norm": 0.8763105869293213, "learning_rate": 9.996645127419107e-06, "loss": 0.8651, "step": 964 }, { "epoch": 0.05202717274099634, "grad_norm": 0.9506256580352783, "learning_rate": 9.996637357887281e-06, "loss": 0.9429, "step": 965 }, { "epoch": 0.052081086909639855, "grad_norm": 0.9484269022941589, "learning_rate": 9.996629579372175e-06, "loss": 0.855, "step": 966 }, { "epoch": 0.05213500107828337, "grad_norm": 0.8970646262168884, "learning_rate": 9.996621791873804e-06, "loss": 0.8611, "step": 967 }, { "epoch": 0.05218891524692689, "grad_norm": 0.8925203680992126, "learning_rate": 9.99661399539218e-06, "loss": 0.8206, "step": 968 }, { "epoch": 0.05224282941557041, "grad_norm": 1.069669246673584, "learning_rate": 9.996606189927318e-06, "loss": 0.876, "step": 969 }, { "epoch": 0.052296743584213934, "grad_norm": 0.8456307649612427, "learning_rate": 9.996598375479232e-06, "loss": 0.7514, "step": 970 }, { "epoch": 0.05235065775285745, "grad_norm": 0.9182801246643066, "learning_rate": 9.996590552047936e-06, "loss": 0.8915, "step": 971 }, { "epoch": 0.05240457192150097, "grad_norm": 0.7616676688194275, "learning_rate": 9.996582719633445e-06, "loss": 0.7106, "step": 972 }, { "epoch": 0.05245848609014449, "grad_norm": 0.8873127102851868, "learning_rate": 9.99657487823577e-06, "loss": 0.9171, "step": 973 }, { "epoch": 0.05251240025878801, "grad_norm": 0.9724618792533875, "learning_rate": 9.996567027854929e-06, "loss": 0.9765, "step": 974 }, { "epoch": 0.05256631442743153, "grad_norm": 0.9106513857841492, "learning_rate": 9.996559168490933e-06, "loss": 0.8332, "step": 975 }, { "epoch": 0.05262022859607505, "grad_norm": 0.8551159501075745, "learning_rate": 9.996551300143798e-06, "loss": 0.8128, "step": 976 }, { "epoch": 0.052674142764718566, "grad_norm": 0.9829822182655334, "learning_rate": 9.996543422813539e-06, "loss": 0.9088, "step": 977 }, { "epoch": 0.052728056933362084, "grad_norm": 0.8281888961791992, "learning_rate": 9.996535536500166e-06, "loss": 0.8338, "step": 978 }, { "epoch": 0.05278197110200561, "grad_norm": 0.951319694519043, "learning_rate": 9.9965276412037e-06, "loss": 0.9359, "step": 979 }, { "epoch": 0.05283588527064913, "grad_norm": 0.841390073299408, "learning_rate": 9.996519736924148e-06, "loss": 0.7952, "step": 980 }, { "epoch": 0.052889799439292645, "grad_norm": 0.8847686648368835, "learning_rate": 9.996511823661528e-06, "loss": 0.8435, "step": 981 }, { "epoch": 0.05294371360793616, "grad_norm": 0.9261316061019897, "learning_rate": 9.996503901415855e-06, "loss": 0.8646, "step": 982 }, { "epoch": 0.05299762777657969, "grad_norm": 0.9366586804389954, "learning_rate": 9.99649597018714e-06, "loss": 0.8586, "step": 983 }, { "epoch": 0.053051541945223206, "grad_norm": 0.8916764259338379, "learning_rate": 9.9964880299754e-06, "loss": 0.8215, "step": 984 }, { "epoch": 0.053105456113866724, "grad_norm": 0.9496534466743469, "learning_rate": 9.996480080780648e-06, "loss": 0.7984, "step": 985 }, { "epoch": 0.05315937028251024, "grad_norm": 0.9736526608467102, "learning_rate": 9.9964721226029e-06, "loss": 0.7881, "step": 986 }, { "epoch": 0.05321328445115376, "grad_norm": 0.9533856511116028, "learning_rate": 9.996464155442167e-06, "loss": 0.9855, "step": 987 }, { "epoch": 0.053267198619797285, "grad_norm": 0.9656437039375305, "learning_rate": 9.996456179298467e-06, "loss": 0.9571, "step": 988 }, { "epoch": 0.0533211127884408, "grad_norm": 0.8887313008308411, "learning_rate": 9.996448194171813e-06, "loss": 0.9381, "step": 989 }, { "epoch": 0.05337502695708432, "grad_norm": 1.0181535482406616, "learning_rate": 9.996440200062217e-06, "loss": 0.8834, "step": 990 }, { "epoch": 0.05342894112572784, "grad_norm": 0.9083503484725952, "learning_rate": 9.996432196969696e-06, "loss": 0.9733, "step": 991 }, { "epoch": 0.05348285529437136, "grad_norm": 0.9051093459129333, "learning_rate": 9.996424184894264e-06, "loss": 0.8531, "step": 992 }, { "epoch": 0.05353676946301488, "grad_norm": 1.0264357328414917, "learning_rate": 9.996416163835935e-06, "loss": 0.9212, "step": 993 }, { "epoch": 0.0535906836316584, "grad_norm": 1.0350812673568726, "learning_rate": 9.996408133794726e-06, "loss": 0.7843, "step": 994 }, { "epoch": 0.05364459780030192, "grad_norm": 0.9610341787338257, "learning_rate": 9.996400094770647e-06, "loss": 0.8561, "step": 995 }, { "epoch": 0.05369851196894544, "grad_norm": 0.8123961687088013, "learning_rate": 9.996392046763714e-06, "loss": 0.8296, "step": 996 }, { "epoch": 0.05375242613758896, "grad_norm": 0.9337920546531677, "learning_rate": 9.996383989773942e-06, "loss": 0.8525, "step": 997 }, { "epoch": 0.05380634030623248, "grad_norm": 1.1319444179534912, "learning_rate": 9.996375923801347e-06, "loss": 0.9127, "step": 998 }, { "epoch": 0.053860254474875996, "grad_norm": 0.8506798148155212, "learning_rate": 9.996367848845941e-06, "loss": 0.884, "step": 999 }, { "epoch": 0.053914168643519514, "grad_norm": 0.8248615860939026, "learning_rate": 9.996359764907739e-06, "loss": 0.7579, "step": 1000 }, { "epoch": 0.05396808281216304, "grad_norm": 0.9258946180343628, "learning_rate": 9.996351671986756e-06, "loss": 0.8632, "step": 1001 }, { "epoch": 0.05402199698080656, "grad_norm": 0.8891279101371765, "learning_rate": 9.996343570083006e-06, "loss": 0.8758, "step": 1002 }, { "epoch": 0.054075911149450075, "grad_norm": 0.9592086672782898, "learning_rate": 9.996335459196505e-06, "loss": 0.8962, "step": 1003 }, { "epoch": 0.05412982531809359, "grad_norm": 0.8937798738479614, "learning_rate": 9.996327339327267e-06, "loss": 0.8434, "step": 1004 }, { "epoch": 0.05418373948673712, "grad_norm": 0.9602083563804626, "learning_rate": 9.996319210475307e-06, "loss": 0.9692, "step": 1005 }, { "epoch": 0.054237653655380635, "grad_norm": 0.870637834072113, "learning_rate": 9.996311072640637e-06, "loss": 0.9146, "step": 1006 }, { "epoch": 0.05429156782402415, "grad_norm": 0.9330273866653442, "learning_rate": 9.996302925823276e-06, "loss": 0.8584, "step": 1007 }, { "epoch": 0.05434548199266767, "grad_norm": 0.8185963034629822, "learning_rate": 9.996294770023234e-06, "loss": 0.7854, "step": 1008 }, { "epoch": 0.05439939616131119, "grad_norm": 0.8727489113807678, "learning_rate": 9.996286605240528e-06, "loss": 0.7388, "step": 1009 }, { "epoch": 0.054453310329954714, "grad_norm": 1.0858477354049683, "learning_rate": 9.996278431475172e-06, "loss": 0.9201, "step": 1010 }, { "epoch": 0.05450722449859823, "grad_norm": 0.9749255776405334, "learning_rate": 9.996270248727184e-06, "loss": 0.9041, "step": 1011 }, { "epoch": 0.05456113866724175, "grad_norm": 0.9460576176643372, "learning_rate": 9.996262056996575e-06, "loss": 0.8553, "step": 1012 }, { "epoch": 0.05461505283588527, "grad_norm": 0.9379808306694031, "learning_rate": 9.99625385628336e-06, "loss": 0.9253, "step": 1013 }, { "epoch": 0.05466896700452879, "grad_norm": 0.8154170513153076, "learning_rate": 9.996245646587553e-06, "loss": 0.8703, "step": 1014 }, { "epoch": 0.05472288117317231, "grad_norm": 0.9122161269187927, "learning_rate": 9.996237427909172e-06, "loss": 0.7734, "step": 1015 }, { "epoch": 0.05477679534181583, "grad_norm": 0.9049486517906189, "learning_rate": 9.996229200248228e-06, "loss": 0.8991, "step": 1016 }, { "epoch": 0.05483070951045935, "grad_norm": 0.9244295358657837, "learning_rate": 9.996220963604741e-06, "loss": 0.8514, "step": 1017 }, { "epoch": 0.05488462367910287, "grad_norm": 0.9817934036254883, "learning_rate": 9.99621271797872e-06, "loss": 0.8641, "step": 1018 }, { "epoch": 0.05493853784774639, "grad_norm": 0.9253972768783569, "learning_rate": 9.996204463370182e-06, "loss": 0.9199, "step": 1019 }, { "epoch": 0.05499245201638991, "grad_norm": 0.9114319682121277, "learning_rate": 9.996196199779145e-06, "loss": 0.8063, "step": 1020 }, { "epoch": 0.055046366185033425, "grad_norm": 0.9643195867538452, "learning_rate": 9.996187927205619e-06, "loss": 0.9668, "step": 1021 }, { "epoch": 0.05510028035367694, "grad_norm": 0.8127598166465759, "learning_rate": 9.996179645649622e-06, "loss": 0.764, "step": 1022 }, { "epoch": 0.05515419452232047, "grad_norm": 0.8728108406066895, "learning_rate": 9.996171355111167e-06, "loss": 0.7703, "step": 1023 }, { "epoch": 0.055208108690963986, "grad_norm": 0.8554317355155945, "learning_rate": 9.996163055590269e-06, "loss": 0.8266, "step": 1024 }, { "epoch": 0.055262022859607504, "grad_norm": 0.7951076030731201, "learning_rate": 9.996154747086946e-06, "loss": 0.7601, "step": 1025 }, { "epoch": 0.05531593702825102, "grad_norm": 0.8916927576065063, "learning_rate": 9.996146429601208e-06, "loss": 0.8936, "step": 1026 }, { "epoch": 0.05536985119689455, "grad_norm": 1.0242576599121094, "learning_rate": 9.996138103133075e-06, "loss": 0.8868, "step": 1027 }, { "epoch": 0.055423765365538065, "grad_norm": 0.9273019433021545, "learning_rate": 9.996129767682557e-06, "loss": 0.8622, "step": 1028 }, { "epoch": 0.05547767953418158, "grad_norm": 0.9547039866447449, "learning_rate": 9.996121423249673e-06, "loss": 0.7814, "step": 1029 }, { "epoch": 0.0555315937028251, "grad_norm": 0.8750621676445007, "learning_rate": 9.996113069834437e-06, "loss": 0.7717, "step": 1030 }, { "epoch": 0.05558550787146862, "grad_norm": 0.9547988176345825, "learning_rate": 9.996104707436862e-06, "loss": 0.8877, "step": 1031 }, { "epoch": 0.055639422040112144, "grad_norm": 0.8856480717658997, "learning_rate": 9.996096336056966e-06, "loss": 0.7927, "step": 1032 }, { "epoch": 0.05569333620875566, "grad_norm": 0.8311342000961304, "learning_rate": 9.99608795569476e-06, "loss": 0.7847, "step": 1033 }, { "epoch": 0.05574725037739918, "grad_norm": 1.0720731019973755, "learning_rate": 9.996079566350266e-06, "loss": 0.9243, "step": 1034 }, { "epoch": 0.0558011645460427, "grad_norm": 0.9498684406280518, "learning_rate": 9.996071168023491e-06, "loss": 0.8605, "step": 1035 }, { "epoch": 0.05585507871468622, "grad_norm": 0.9043952822685242, "learning_rate": 9.996062760714456e-06, "loss": 0.8488, "step": 1036 }, { "epoch": 0.05590899288332974, "grad_norm": 0.8051116466522217, "learning_rate": 9.996054344423173e-06, "loss": 0.8275, "step": 1037 }, { "epoch": 0.05596290705197326, "grad_norm": 0.857120156288147, "learning_rate": 9.996045919149658e-06, "loss": 0.8837, "step": 1038 }, { "epoch": 0.056016821220616776, "grad_norm": 0.8810911774635315, "learning_rate": 9.996037484893926e-06, "loss": 0.8179, "step": 1039 }, { "epoch": 0.056070735389260294, "grad_norm": 0.8783093690872192, "learning_rate": 9.996029041655994e-06, "loss": 0.7734, "step": 1040 }, { "epoch": 0.05612464955790382, "grad_norm": 0.9281952977180481, "learning_rate": 9.996020589435874e-06, "loss": 0.8747, "step": 1041 }, { "epoch": 0.05617856372654734, "grad_norm": 0.8307299613952637, "learning_rate": 9.996012128233583e-06, "loss": 0.8055, "step": 1042 }, { "epoch": 0.056232477895190855, "grad_norm": 0.9520873427391052, "learning_rate": 9.996003658049136e-06, "loss": 0.8181, "step": 1043 }, { "epoch": 0.05628639206383437, "grad_norm": 0.8753806948661804, "learning_rate": 9.995995178882549e-06, "loss": 0.808, "step": 1044 }, { "epoch": 0.0563403062324779, "grad_norm": 1.067691683769226, "learning_rate": 9.995986690733836e-06, "loss": 0.8048, "step": 1045 }, { "epoch": 0.056394220401121416, "grad_norm": 0.8575261235237122, "learning_rate": 9.995978193603013e-06, "loss": 0.9231, "step": 1046 }, { "epoch": 0.056448134569764934, "grad_norm": 0.9857104420661926, "learning_rate": 9.995969687490096e-06, "loss": 0.8883, "step": 1047 }, { "epoch": 0.05650204873840845, "grad_norm": 0.9203484654426575, "learning_rate": 9.995961172395098e-06, "loss": 0.7634, "step": 1048 }, { "epoch": 0.056555962907051976, "grad_norm": 0.8741904497146606, "learning_rate": 9.995952648318036e-06, "loss": 0.8061, "step": 1049 }, { "epoch": 0.056609877075695494, "grad_norm": 0.9495588541030884, "learning_rate": 9.995944115258925e-06, "loss": 0.8922, "step": 1050 }, { "epoch": 0.05666379124433901, "grad_norm": 0.9306020140647888, "learning_rate": 9.99593557321778e-06, "loss": 0.8454, "step": 1051 }, { "epoch": 0.05671770541298253, "grad_norm": 0.9457784295082092, "learning_rate": 9.995927022194615e-06, "loss": 0.8701, "step": 1052 }, { "epoch": 0.05677161958162605, "grad_norm": 0.88719242811203, "learning_rate": 9.99591846218945e-06, "loss": 0.8416, "step": 1053 }, { "epoch": 0.05682553375026957, "grad_norm": 0.8740848302841187, "learning_rate": 9.995909893202296e-06, "loss": 0.7962, "step": 1054 }, { "epoch": 0.05687944791891309, "grad_norm": 1.0149377584457397, "learning_rate": 9.99590131523317e-06, "loss": 0.8352, "step": 1055 }, { "epoch": 0.05693336208755661, "grad_norm": 0.9014917016029358, "learning_rate": 9.995892728282088e-06, "loss": 0.9244, "step": 1056 }, { "epoch": 0.05698727625620013, "grad_norm": 0.9351898431777954, "learning_rate": 9.995884132349062e-06, "loss": 0.865, "step": 1057 }, { "epoch": 0.05704119042484365, "grad_norm": 0.8656749129295349, "learning_rate": 9.995875527434113e-06, "loss": 0.8836, "step": 1058 }, { "epoch": 0.05709510459348717, "grad_norm": 0.9120789170265198, "learning_rate": 9.995866913537254e-06, "loss": 0.8772, "step": 1059 }, { "epoch": 0.05714901876213069, "grad_norm": 1.0019149780273438, "learning_rate": 9.995858290658497e-06, "loss": 0.9338, "step": 1060 }, { "epoch": 0.057202932930774206, "grad_norm": 0.8492977023124695, "learning_rate": 9.995849658797863e-06, "loss": 0.742, "step": 1061 }, { "epoch": 0.057256847099417724, "grad_norm": 1.000607967376709, "learning_rate": 9.995841017955363e-06, "loss": 0.8498, "step": 1062 }, { "epoch": 0.05731076126806125, "grad_norm": 1.0268487930297852, "learning_rate": 9.995832368131016e-06, "loss": 0.8937, "step": 1063 }, { "epoch": 0.057364675436704766, "grad_norm": 0.9388830661773682, "learning_rate": 9.995823709324836e-06, "loss": 0.877, "step": 1064 }, { "epoch": 0.057418589605348284, "grad_norm": 0.9747199416160583, "learning_rate": 9.99581504153684e-06, "loss": 0.8436, "step": 1065 }, { "epoch": 0.0574725037739918, "grad_norm": 0.9125073552131653, "learning_rate": 9.99580636476704e-06, "loss": 0.8853, "step": 1066 }, { "epoch": 0.05752641794263533, "grad_norm": 0.8910282254219055, "learning_rate": 9.995797679015455e-06, "loss": 0.8566, "step": 1067 }, { "epoch": 0.057580332111278845, "grad_norm": 0.8546010255813599, "learning_rate": 9.995788984282101e-06, "loss": 0.8209, "step": 1068 }, { "epoch": 0.05763424627992236, "grad_norm": 0.9205883145332336, "learning_rate": 9.99578028056699e-06, "loss": 0.7814, "step": 1069 }, { "epoch": 0.05768816044856588, "grad_norm": 0.9627780914306641, "learning_rate": 9.995771567870142e-06, "loss": 0.8686, "step": 1070 }, { "epoch": 0.057742074617209406, "grad_norm": 0.9917465448379517, "learning_rate": 9.995762846191569e-06, "loss": 0.9672, "step": 1071 }, { "epoch": 0.057795988785852924, "grad_norm": 0.9396706223487854, "learning_rate": 9.995754115531288e-06, "loss": 0.8631, "step": 1072 }, { "epoch": 0.05784990295449644, "grad_norm": 0.8310922980308533, "learning_rate": 9.995745375889317e-06, "loss": 0.8637, "step": 1073 }, { "epoch": 0.05790381712313996, "grad_norm": 0.9085954427719116, "learning_rate": 9.995736627265667e-06, "loss": 0.8821, "step": 1074 }, { "epoch": 0.05795773129178348, "grad_norm": 0.8529816269874573, "learning_rate": 9.995727869660357e-06, "loss": 0.8426, "step": 1075 }, { "epoch": 0.058011645460427, "grad_norm": 0.8288499116897583, "learning_rate": 9.995719103073403e-06, "loss": 0.8415, "step": 1076 }, { "epoch": 0.05806555962907052, "grad_norm": 0.9105609059333801, "learning_rate": 9.995710327504819e-06, "loss": 0.7683, "step": 1077 }, { "epoch": 0.05811947379771404, "grad_norm": 0.9578274488449097, "learning_rate": 9.995701542954622e-06, "loss": 0.8796, "step": 1078 }, { "epoch": 0.058173387966357556, "grad_norm": 0.8542460799217224, "learning_rate": 9.995692749422827e-06, "loss": 0.8363, "step": 1079 }, { "epoch": 0.05822730213500108, "grad_norm": 0.8723183274269104, "learning_rate": 9.99568394690945e-06, "loss": 0.8434, "step": 1080 }, { "epoch": 0.0582812163036446, "grad_norm": 0.9157887697219849, "learning_rate": 9.995675135414507e-06, "loss": 0.6532, "step": 1081 }, { "epoch": 0.05833513047228812, "grad_norm": 0.9055691361427307, "learning_rate": 9.995666314938014e-06, "loss": 0.8762, "step": 1082 }, { "epoch": 0.058389044640931635, "grad_norm": 0.8224693536758423, "learning_rate": 9.995657485479987e-06, "loss": 0.7976, "step": 1083 }, { "epoch": 0.05844295880957515, "grad_norm": 0.925414502620697, "learning_rate": 9.995648647040441e-06, "loss": 0.8673, "step": 1084 }, { "epoch": 0.05849687297821868, "grad_norm": 0.9194141626358032, "learning_rate": 9.995639799619395e-06, "loss": 0.7916, "step": 1085 }, { "epoch": 0.058550787146862196, "grad_norm": 1.08795166015625, "learning_rate": 9.995630943216859e-06, "loss": 0.9135, "step": 1086 }, { "epoch": 0.058604701315505714, "grad_norm": 0.9648925065994263, "learning_rate": 9.995622077832854e-06, "loss": 0.8442, "step": 1087 }, { "epoch": 0.05865861548414923, "grad_norm": 1.0012339353561401, "learning_rate": 9.995613203467394e-06, "loss": 0.9543, "step": 1088 }, { "epoch": 0.05871252965279276, "grad_norm": 0.9333881735801697, "learning_rate": 9.995604320120496e-06, "loss": 0.9267, "step": 1089 }, { "epoch": 0.058766443821436275, "grad_norm": 0.8566498160362244, "learning_rate": 9.995595427792173e-06, "loss": 0.8539, "step": 1090 }, { "epoch": 0.05882035799007979, "grad_norm": 0.8766364455223083, "learning_rate": 9.995586526482446e-06, "loss": 0.9293, "step": 1091 }, { "epoch": 0.05887427215872331, "grad_norm": 0.9181047677993774, "learning_rate": 9.995577616191326e-06, "loss": 0.8333, "step": 1092 }, { "epoch": 0.05892818632736683, "grad_norm": 0.8831031918525696, "learning_rate": 9.995568696918833e-06, "loss": 0.8016, "step": 1093 }, { "epoch": 0.05898210049601035, "grad_norm": 0.8618754148483276, "learning_rate": 9.99555976866498e-06, "loss": 0.8988, "step": 1094 }, { "epoch": 0.05903601466465387, "grad_norm": 0.9083183407783508, "learning_rate": 9.995550831429785e-06, "loss": 0.8626, "step": 1095 }, { "epoch": 0.05908992883329739, "grad_norm": 0.8423884510993958, "learning_rate": 9.995541885213262e-06, "loss": 0.9121, "step": 1096 }, { "epoch": 0.05914384300194091, "grad_norm": 0.7747607827186584, "learning_rate": 9.99553293001543e-06, "loss": 0.8087, "step": 1097 }, { "epoch": 0.05919775717058443, "grad_norm": 0.8828368186950684, "learning_rate": 9.995523965836302e-06, "loss": 0.8284, "step": 1098 }, { "epoch": 0.05925167133922795, "grad_norm": 0.9448524713516235, "learning_rate": 9.995514992675896e-06, "loss": 0.9565, "step": 1099 }, { "epoch": 0.05930558550787147, "grad_norm": 0.8967006206512451, "learning_rate": 9.99550601053423e-06, "loss": 0.8412, "step": 1100 }, { "epoch": 0.059359499676514986, "grad_norm": 0.9394551515579224, "learning_rate": 9.995497019411315e-06, "loss": 0.929, "step": 1101 }, { "epoch": 0.05941341384515851, "grad_norm": 0.9002842903137207, "learning_rate": 9.995488019307172e-06, "loss": 0.734, "step": 1102 }, { "epoch": 0.05946732801380203, "grad_norm": 1.3590562343597412, "learning_rate": 9.995479010221816e-06, "loss": 0.8843, "step": 1103 }, { "epoch": 0.05952124218244555, "grad_norm": 1.041528582572937, "learning_rate": 9.99546999215526e-06, "loss": 0.9001, "step": 1104 }, { "epoch": 0.059575156351089065, "grad_norm": 0.9846720099449158, "learning_rate": 9.995460965107524e-06, "loss": 0.8174, "step": 1105 }, { "epoch": 0.05962907051973258, "grad_norm": 0.9171685576438904, "learning_rate": 9.995451929078624e-06, "loss": 0.8756, "step": 1106 }, { "epoch": 0.05968298468837611, "grad_norm": 0.9155516028404236, "learning_rate": 9.995442884068574e-06, "loss": 0.7327, "step": 1107 }, { "epoch": 0.059736898857019625, "grad_norm": 0.8734007477760315, "learning_rate": 9.99543383007739e-06, "loss": 0.8385, "step": 1108 }, { "epoch": 0.05979081302566314, "grad_norm": 0.8580977320671082, "learning_rate": 9.99542476710509e-06, "loss": 0.885, "step": 1109 }, { "epoch": 0.05984472719430666, "grad_norm": 0.8499299883842468, "learning_rate": 9.995415695151692e-06, "loss": 0.8323, "step": 1110 }, { "epoch": 0.059898641362950186, "grad_norm": 0.8348694443702698, "learning_rate": 9.99540661421721e-06, "loss": 0.7947, "step": 1111 }, { "epoch": 0.059952555531593704, "grad_norm": 0.8865199685096741, "learning_rate": 9.99539752430166e-06, "loss": 0.9363, "step": 1112 }, { "epoch": 0.06000646970023722, "grad_norm": 0.9492315649986267, "learning_rate": 9.995388425405059e-06, "loss": 0.913, "step": 1113 }, { "epoch": 0.06006038386888074, "grad_norm": 0.938252329826355, "learning_rate": 9.995379317527422e-06, "loss": 0.861, "step": 1114 }, { "epoch": 0.06011429803752426, "grad_norm": 1.2601032257080078, "learning_rate": 9.995370200668768e-06, "loss": 0.9435, "step": 1115 }, { "epoch": 0.06016821220616778, "grad_norm": 0.915830671787262, "learning_rate": 9.995361074829112e-06, "loss": 0.9372, "step": 1116 }, { "epoch": 0.0602221263748113, "grad_norm": 1.4548465013504028, "learning_rate": 9.995351940008473e-06, "loss": 0.9055, "step": 1117 }, { "epoch": 0.06027604054345482, "grad_norm": 0.9090906381607056, "learning_rate": 9.995342796206861e-06, "loss": 0.8849, "step": 1118 }, { "epoch": 0.06032995471209834, "grad_norm": 0.9860616326332092, "learning_rate": 9.995333643424298e-06, "loss": 0.8304, "step": 1119 }, { "epoch": 0.06038386888074186, "grad_norm": 0.8320879340171814, "learning_rate": 9.9953244816608e-06, "loss": 0.8432, "step": 1120 }, { "epoch": 0.06043778304938538, "grad_norm": 0.8633564114570618, "learning_rate": 9.995315310916381e-06, "loss": 0.7461, "step": 1121 }, { "epoch": 0.0604916972180289, "grad_norm": 0.881287693977356, "learning_rate": 9.995306131191059e-06, "loss": 0.8512, "step": 1122 }, { "epoch": 0.060545611386672415, "grad_norm": 0.8888201713562012, "learning_rate": 9.99529694248485e-06, "loss": 0.8416, "step": 1123 }, { "epoch": 0.06059952555531594, "grad_norm": 0.8073605895042419, "learning_rate": 9.99528774479777e-06, "loss": 0.8369, "step": 1124 }, { "epoch": 0.06065343972395946, "grad_norm": 0.9260549545288086, "learning_rate": 9.995278538129837e-06, "loss": 0.8548, "step": 1125 }, { "epoch": 0.060707353892602976, "grad_norm": 0.9169156551361084, "learning_rate": 9.99526932248107e-06, "loss": 0.9149, "step": 1126 }, { "epoch": 0.060761268061246494, "grad_norm": 0.8481706380844116, "learning_rate": 9.995260097851478e-06, "loss": 0.8591, "step": 1127 }, { "epoch": 0.06081518222989001, "grad_norm": 0.8934486508369446, "learning_rate": 9.995250864241085e-06, "loss": 0.9322, "step": 1128 }, { "epoch": 0.06086909639853354, "grad_norm": 0.947390615940094, "learning_rate": 9.995241621649902e-06, "loss": 1.0015, "step": 1129 }, { "epoch": 0.060923010567177055, "grad_norm": 0.9185096025466919, "learning_rate": 9.995232370077949e-06, "loss": 0.9293, "step": 1130 }, { "epoch": 0.06097692473582057, "grad_norm": 0.9517882466316223, "learning_rate": 9.995223109525245e-06, "loss": 0.8673, "step": 1131 }, { "epoch": 0.06103083890446409, "grad_norm": 1.065699815750122, "learning_rate": 9.9952138399918e-06, "loss": 0.9144, "step": 1132 }, { "epoch": 0.061084753073107616, "grad_norm": 0.9048404693603516, "learning_rate": 9.995204561477635e-06, "loss": 0.7773, "step": 1133 }, { "epoch": 0.061138667241751134, "grad_norm": 1.104457139968872, "learning_rate": 9.995195273982768e-06, "loss": 0.8847, "step": 1134 }, { "epoch": 0.06119258141039465, "grad_norm": 0.9009587168693542, "learning_rate": 9.995185977507212e-06, "loss": 0.8118, "step": 1135 }, { "epoch": 0.06124649557903817, "grad_norm": 1.0740209817886353, "learning_rate": 9.995176672050983e-06, "loss": 0.9173, "step": 1136 }, { "epoch": 0.06130040974768169, "grad_norm": 0.9820743203163147, "learning_rate": 9.995167357614104e-06, "loss": 0.8555, "step": 1137 }, { "epoch": 0.06135432391632521, "grad_norm": 0.9250825047492981, "learning_rate": 9.995158034196586e-06, "loss": 0.8771, "step": 1138 }, { "epoch": 0.06140823808496873, "grad_norm": 0.8952597379684448, "learning_rate": 9.995148701798447e-06, "loss": 0.8598, "step": 1139 }, { "epoch": 0.06146215225361225, "grad_norm": 0.8485212922096252, "learning_rate": 9.995139360419706e-06, "loss": 0.8557, "step": 1140 }, { "epoch": 0.061516066422255766, "grad_norm": 0.9676715731620789, "learning_rate": 9.995130010060377e-06, "loss": 0.7748, "step": 1141 }, { "epoch": 0.06156998059089929, "grad_norm": 0.7896347045898438, "learning_rate": 9.995120650720478e-06, "loss": 0.6183, "step": 1142 }, { "epoch": 0.06162389475954281, "grad_norm": 0.8746615052223206, "learning_rate": 9.995111282400024e-06, "loss": 0.8321, "step": 1143 }, { "epoch": 0.06167780892818633, "grad_norm": 0.9029875993728638, "learning_rate": 9.995101905099036e-06, "loss": 0.8686, "step": 1144 }, { "epoch": 0.061731723096829845, "grad_norm": 0.9529547095298767, "learning_rate": 9.995092518817528e-06, "loss": 0.8878, "step": 1145 }, { "epoch": 0.06178563726547336, "grad_norm": 0.8280455470085144, "learning_rate": 9.995083123555517e-06, "loss": 0.8232, "step": 1146 }, { "epoch": 0.06183955143411689, "grad_norm": 0.908881664276123, "learning_rate": 9.995073719313021e-06, "loss": 0.8387, "step": 1147 }, { "epoch": 0.061893465602760406, "grad_norm": 0.9137653708457947, "learning_rate": 9.995064306090055e-06, "loss": 0.8943, "step": 1148 }, { "epoch": 0.061947379771403924, "grad_norm": 0.863861620426178, "learning_rate": 9.995054883886639e-06, "loss": 0.7435, "step": 1149 }, { "epoch": 0.06200129394004744, "grad_norm": 0.8534915447235107, "learning_rate": 9.995045452702786e-06, "loss": 0.941, "step": 1150 }, { "epoch": 0.06205520810869097, "grad_norm": 0.9469791650772095, "learning_rate": 9.995036012538515e-06, "loss": 0.9137, "step": 1151 }, { "epoch": 0.062109122277334484, "grad_norm": 0.9044890999794006, "learning_rate": 9.995026563393844e-06, "loss": 0.9117, "step": 1152 }, { "epoch": 0.062163036445978, "grad_norm": 0.989772379398346, "learning_rate": 9.995017105268789e-06, "loss": 0.8306, "step": 1153 }, { "epoch": 0.06221695061462152, "grad_norm": 0.8586496114730835, "learning_rate": 9.995007638163365e-06, "loss": 0.8012, "step": 1154 }, { "epoch": 0.062270864783265045, "grad_norm": 0.9221116304397583, "learning_rate": 9.994998162077594e-06, "loss": 0.7935, "step": 1155 }, { "epoch": 0.06232477895190856, "grad_norm": 0.9453061819076538, "learning_rate": 9.994988677011489e-06, "loss": 0.8257, "step": 1156 }, { "epoch": 0.06237869312055208, "grad_norm": 0.8065335154533386, "learning_rate": 9.994979182965065e-06, "loss": 0.86, "step": 1157 }, { "epoch": 0.0624326072891956, "grad_norm": 0.9597793817520142, "learning_rate": 9.994969679938346e-06, "loss": 0.862, "step": 1158 }, { "epoch": 0.06248652145783912, "grad_norm": 0.9118353128433228, "learning_rate": 9.994960167931342e-06, "loss": 0.8925, "step": 1159 }, { "epoch": 0.06254043562648263, "grad_norm": 1.0216273069381714, "learning_rate": 9.994950646944077e-06, "loss": 0.7078, "step": 1160 }, { "epoch": 0.06259434979512615, "grad_norm": 0.960182785987854, "learning_rate": 9.994941116976562e-06, "loss": 0.8936, "step": 1161 }, { "epoch": 0.06264826396376968, "grad_norm": 0.9551856517791748, "learning_rate": 9.994931578028817e-06, "loss": 0.8053, "step": 1162 }, { "epoch": 0.0627021781324132, "grad_norm": 0.9419867992401123, "learning_rate": 9.994922030100857e-06, "loss": 0.8333, "step": 1163 }, { "epoch": 0.06275609230105672, "grad_norm": 0.9780306816101074, "learning_rate": 9.994912473192702e-06, "loss": 0.88, "step": 1164 }, { "epoch": 0.06281000646970024, "grad_norm": 0.9320577383041382, "learning_rate": 9.99490290730437e-06, "loss": 0.8859, "step": 1165 }, { "epoch": 0.06286392063834376, "grad_norm": 0.7692422270774841, "learning_rate": 9.994893332435874e-06, "loss": 0.8093, "step": 1166 }, { "epoch": 0.06291783480698727, "grad_norm": 1.0622048377990723, "learning_rate": 9.994883748587234e-06, "loss": 0.8959, "step": 1167 }, { "epoch": 0.06297174897563079, "grad_norm": 0.9598555564880371, "learning_rate": 9.994874155758467e-06, "loss": 0.8153, "step": 1168 }, { "epoch": 0.06302566314427431, "grad_norm": 0.9207014441490173, "learning_rate": 9.994864553949591e-06, "loss": 0.9383, "step": 1169 }, { "epoch": 0.06307957731291783, "grad_norm": 1.0074093341827393, "learning_rate": 9.99485494316062e-06, "loss": 0.9999, "step": 1170 }, { "epoch": 0.06313349148156136, "grad_norm": 0.8454248905181885, "learning_rate": 9.994845323391575e-06, "loss": 0.7946, "step": 1171 }, { "epoch": 0.06318740565020488, "grad_norm": 0.847578763961792, "learning_rate": 9.99483569464247e-06, "loss": 0.7144, "step": 1172 }, { "epoch": 0.0632413198188484, "grad_norm": 0.9083126187324524, "learning_rate": 9.994826056913325e-06, "loss": 0.774, "step": 1173 }, { "epoch": 0.06329523398749191, "grad_norm": 0.8995345830917358, "learning_rate": 9.994816410204158e-06, "loss": 0.8995, "step": 1174 }, { "epoch": 0.06334914815613543, "grad_norm": 1.0547746419906616, "learning_rate": 9.994806754514983e-06, "loss": 0.8142, "step": 1175 }, { "epoch": 0.06340306232477895, "grad_norm": 0.946854829788208, "learning_rate": 9.99479708984582e-06, "loss": 0.8639, "step": 1176 }, { "epoch": 0.06345697649342247, "grad_norm": 0.8746247291564941, "learning_rate": 9.994787416196683e-06, "loss": 0.8601, "step": 1177 }, { "epoch": 0.06351089066206599, "grad_norm": 0.9075024127960205, "learning_rate": 9.994777733567595e-06, "loss": 0.7969, "step": 1178 }, { "epoch": 0.0635648048307095, "grad_norm": 0.9435486197471619, "learning_rate": 9.994768041958569e-06, "loss": 0.8199, "step": 1179 }, { "epoch": 0.06361871899935304, "grad_norm": 0.8597564697265625, "learning_rate": 9.994758341369624e-06, "loss": 0.8791, "step": 1180 }, { "epoch": 0.06367263316799655, "grad_norm": 0.7960480451583862, "learning_rate": 9.994748631800777e-06, "loss": 0.8035, "step": 1181 }, { "epoch": 0.06372654733664007, "grad_norm": 1.1984984874725342, "learning_rate": 9.994738913252045e-06, "loss": 0.7372, "step": 1182 }, { "epoch": 0.06378046150528359, "grad_norm": 0.8532997369766235, "learning_rate": 9.994729185723446e-06, "loss": 0.9094, "step": 1183 }, { "epoch": 0.06383437567392711, "grad_norm": 0.8327267169952393, "learning_rate": 9.994719449214999e-06, "loss": 0.809, "step": 1184 }, { "epoch": 0.06388828984257063, "grad_norm": 0.9086306691169739, "learning_rate": 9.99470970372672e-06, "loss": 0.8278, "step": 1185 }, { "epoch": 0.06394220401121414, "grad_norm": 0.8422104716300964, "learning_rate": 9.994699949258626e-06, "loss": 0.7754, "step": 1186 }, { "epoch": 0.06399611817985766, "grad_norm": 1.0434929132461548, "learning_rate": 9.994690185810733e-06, "loss": 0.908, "step": 1187 }, { "epoch": 0.06405003234850119, "grad_norm": 1.1625720262527466, "learning_rate": 9.994680413383064e-06, "loss": 0.8814, "step": 1188 }, { "epoch": 0.06410394651714471, "grad_norm": 0.9940767288208008, "learning_rate": 9.994670631975631e-06, "loss": 0.7846, "step": 1189 }, { "epoch": 0.06415786068578823, "grad_norm": 0.8356907963752747, "learning_rate": 9.994660841588457e-06, "loss": 0.798, "step": 1190 }, { "epoch": 0.06421177485443175, "grad_norm": 0.830348014831543, "learning_rate": 9.994651042221552e-06, "loss": 0.7875, "step": 1191 }, { "epoch": 0.06426568902307526, "grad_norm": 1.1060880422592163, "learning_rate": 9.994641233874943e-06, "loss": 0.8893, "step": 1192 }, { "epoch": 0.06431960319171878, "grad_norm": 0.9319590926170349, "learning_rate": 9.994631416548637e-06, "loss": 0.791, "step": 1193 }, { "epoch": 0.0643735173603623, "grad_norm": 0.8345780968666077, "learning_rate": 9.994621590242661e-06, "loss": 0.8213, "step": 1194 }, { "epoch": 0.06442743152900582, "grad_norm": 0.9848359227180481, "learning_rate": 9.99461175495703e-06, "loss": 0.735, "step": 1195 }, { "epoch": 0.06448134569764934, "grad_norm": 0.9134055972099304, "learning_rate": 9.994601910691758e-06, "loss": 0.8415, "step": 1196 }, { "epoch": 0.06453525986629287, "grad_norm": 0.8084586262702942, "learning_rate": 9.994592057446866e-06, "loss": 0.8702, "step": 1197 }, { "epoch": 0.06458917403493639, "grad_norm": 0.9168767333030701, "learning_rate": 9.994582195222371e-06, "loss": 0.8921, "step": 1198 }, { "epoch": 0.0646430882035799, "grad_norm": 0.8380446434020996, "learning_rate": 9.994572324018292e-06, "loss": 0.7705, "step": 1199 }, { "epoch": 0.06469700237222342, "grad_norm": 0.8120049238204956, "learning_rate": 9.994562443834646e-06, "loss": 0.7576, "step": 1200 }, { "epoch": 0.06475091654086694, "grad_norm": 0.9559764266014099, "learning_rate": 9.994552554671448e-06, "loss": 0.8427, "step": 1201 }, { "epoch": 0.06480483070951046, "grad_norm": 0.9473673105239868, "learning_rate": 9.99454265652872e-06, "loss": 0.9988, "step": 1202 }, { "epoch": 0.06485874487815398, "grad_norm": 1.0704870223999023, "learning_rate": 9.994532749406477e-06, "loss": 0.9499, "step": 1203 }, { "epoch": 0.0649126590467975, "grad_norm": 0.9905646443367004, "learning_rate": 9.994522833304738e-06, "loss": 0.8801, "step": 1204 }, { "epoch": 0.06496657321544101, "grad_norm": 1.194190502166748, "learning_rate": 9.99451290822352e-06, "loss": 0.9051, "step": 1205 }, { "epoch": 0.06502048738408454, "grad_norm": 0.8571314811706543, "learning_rate": 9.994502974162843e-06, "loss": 0.8131, "step": 1206 }, { "epoch": 0.06507440155272806, "grad_norm": 0.9769417643547058, "learning_rate": 9.994493031122721e-06, "loss": 0.8524, "step": 1207 }, { "epoch": 0.06512831572137158, "grad_norm": 0.8106759786605835, "learning_rate": 9.994483079103176e-06, "loss": 0.8142, "step": 1208 }, { "epoch": 0.0651822298900151, "grad_norm": 0.8817846775054932, "learning_rate": 9.994473118104223e-06, "loss": 0.9076, "step": 1209 }, { "epoch": 0.06523614405865862, "grad_norm": 0.8271930813789368, "learning_rate": 9.994463148125882e-06, "loss": 0.7914, "step": 1210 }, { "epoch": 0.06529005822730213, "grad_norm": 0.9060614705085754, "learning_rate": 9.994453169168169e-06, "loss": 0.8375, "step": 1211 }, { "epoch": 0.06534397239594565, "grad_norm": 0.880614697933197, "learning_rate": 9.994443181231103e-06, "loss": 0.7751, "step": 1212 }, { "epoch": 0.06539788656458917, "grad_norm": 0.9420819282531738, "learning_rate": 9.994433184314702e-06, "loss": 0.8532, "step": 1213 }, { "epoch": 0.06545180073323269, "grad_norm": 0.8587054014205933, "learning_rate": 9.994423178418984e-06, "loss": 0.8804, "step": 1214 }, { "epoch": 0.06550571490187622, "grad_norm": 0.9624550938606262, "learning_rate": 9.994413163543965e-06, "loss": 0.9782, "step": 1215 }, { "epoch": 0.06555962907051974, "grad_norm": 0.9458224773406982, "learning_rate": 9.994403139689665e-06, "loss": 0.8274, "step": 1216 }, { "epoch": 0.06561354323916326, "grad_norm": 1.0417940616607666, "learning_rate": 9.994393106856104e-06, "loss": 0.9065, "step": 1217 }, { "epoch": 0.06566745740780677, "grad_norm": 1.0225417613983154, "learning_rate": 9.994383065043296e-06, "loss": 0.8642, "step": 1218 }, { "epoch": 0.06572137157645029, "grad_norm": 0.9015594720840454, "learning_rate": 9.994373014251261e-06, "loss": 0.8775, "step": 1219 }, { "epoch": 0.06577528574509381, "grad_norm": 0.8473883271217346, "learning_rate": 9.994362954480018e-06, "loss": 0.8566, "step": 1220 }, { "epoch": 0.06582919991373733, "grad_norm": 0.8571242690086365, "learning_rate": 9.994352885729584e-06, "loss": 0.8502, "step": 1221 }, { "epoch": 0.06588311408238084, "grad_norm": 0.8793268799781799, "learning_rate": 9.994342807999977e-06, "loss": 0.9062, "step": 1222 }, { "epoch": 0.06593702825102436, "grad_norm": 0.8866230249404907, "learning_rate": 9.994332721291214e-06, "loss": 0.9026, "step": 1223 }, { "epoch": 0.0659909424196679, "grad_norm": 0.9135996103286743, "learning_rate": 9.994322625603314e-06, "loss": 0.8558, "step": 1224 }, { "epoch": 0.06604485658831141, "grad_norm": 0.9904530048370361, "learning_rate": 9.994312520936297e-06, "loss": 0.8823, "step": 1225 }, { "epoch": 0.06609877075695493, "grad_norm": 0.8590260148048401, "learning_rate": 9.99430240729018e-06, "loss": 0.8344, "step": 1226 }, { "epoch": 0.06615268492559845, "grad_norm": 1.1669397354125977, "learning_rate": 9.99429228466498e-06, "loss": 0.9459, "step": 1227 }, { "epoch": 0.06620659909424197, "grad_norm": 0.9290857315063477, "learning_rate": 9.994282153060715e-06, "loss": 0.8723, "step": 1228 }, { "epoch": 0.06626051326288548, "grad_norm": 0.9619696140289307, "learning_rate": 9.994272012477405e-06, "loss": 0.8986, "step": 1229 }, { "epoch": 0.066314427431529, "grad_norm": 0.8312071561813354, "learning_rate": 9.994261862915068e-06, "loss": 0.7291, "step": 1230 }, { "epoch": 0.06636834160017252, "grad_norm": 1.0099300146102905, "learning_rate": 9.994251704373721e-06, "loss": 0.8725, "step": 1231 }, { "epoch": 0.06642225576881604, "grad_norm": 0.8522336483001709, "learning_rate": 9.994241536853384e-06, "loss": 0.8656, "step": 1232 }, { "epoch": 0.06647616993745957, "grad_norm": 0.919360339641571, "learning_rate": 9.994231360354074e-06, "loss": 0.8854, "step": 1233 }, { "epoch": 0.06653008410610309, "grad_norm": 0.8002495169639587, "learning_rate": 9.994221174875809e-06, "loss": 0.7879, "step": 1234 }, { "epoch": 0.0665839982747466, "grad_norm": 0.9539757370948792, "learning_rate": 9.994210980418607e-06, "loss": 0.9027, "step": 1235 }, { "epoch": 0.06663791244339012, "grad_norm": 0.9222649335861206, "learning_rate": 9.99420077698249e-06, "loss": 0.7611, "step": 1236 }, { "epoch": 0.06669182661203364, "grad_norm": 0.8629900813102722, "learning_rate": 9.994190564567472e-06, "loss": 0.8122, "step": 1237 }, { "epoch": 0.06674574078067716, "grad_norm": 0.8339203000068665, "learning_rate": 9.994180343173574e-06, "loss": 0.7873, "step": 1238 }, { "epoch": 0.06679965494932068, "grad_norm": 0.8844656348228455, "learning_rate": 9.994170112800812e-06, "loss": 0.8176, "step": 1239 }, { "epoch": 0.0668535691179642, "grad_norm": 1.0024579763412476, "learning_rate": 9.994159873449206e-06, "loss": 0.844, "step": 1240 }, { "epoch": 0.06690748328660773, "grad_norm": 0.8317261338233948, "learning_rate": 9.994149625118774e-06, "loss": 0.9103, "step": 1241 }, { "epoch": 0.06696139745525125, "grad_norm": 0.8915300965309143, "learning_rate": 9.994139367809534e-06, "loss": 0.9084, "step": 1242 }, { "epoch": 0.06701531162389476, "grad_norm": 0.9270803332328796, "learning_rate": 9.994129101521506e-06, "loss": 0.7634, "step": 1243 }, { "epoch": 0.06706922579253828, "grad_norm": 0.9891652464866638, "learning_rate": 9.994118826254708e-06, "loss": 0.9776, "step": 1244 }, { "epoch": 0.0671231399611818, "grad_norm": 0.7778229713439941, "learning_rate": 9.994108542009156e-06, "loss": 0.7481, "step": 1245 }, { "epoch": 0.06717705412982532, "grad_norm": 0.8451201319694519, "learning_rate": 9.994098248784872e-06, "loss": 0.8012, "step": 1246 }, { "epoch": 0.06723096829846884, "grad_norm": 0.8115825057029724, "learning_rate": 9.994087946581873e-06, "loss": 0.874, "step": 1247 }, { "epoch": 0.06728488246711235, "grad_norm": 0.815934419631958, "learning_rate": 9.994077635400175e-06, "loss": 0.8114, "step": 1248 }, { "epoch": 0.06733879663575587, "grad_norm": 1.1179388761520386, "learning_rate": 9.9940673152398e-06, "loss": 0.9078, "step": 1249 }, { "epoch": 0.0673927108043994, "grad_norm": 0.9235454201698303, "learning_rate": 9.994056986100767e-06, "loss": 0.7511, "step": 1250 }, { "epoch": 0.06744662497304292, "grad_norm": 0.8568270206451416, "learning_rate": 9.994046647983093e-06, "loss": 0.7805, "step": 1251 }, { "epoch": 0.06750053914168644, "grad_norm": 1.1337388753890991, "learning_rate": 9.994036300886796e-06, "loss": 0.8835, "step": 1252 }, { "epoch": 0.06755445331032996, "grad_norm": 0.9154239892959595, "learning_rate": 9.994025944811896e-06, "loss": 0.8804, "step": 1253 }, { "epoch": 0.06760836747897347, "grad_norm": 0.8301606774330139, "learning_rate": 9.99401557975841e-06, "loss": 0.7905, "step": 1254 }, { "epoch": 0.06766228164761699, "grad_norm": 0.9907017350196838, "learning_rate": 9.994005205726358e-06, "loss": 0.9091, "step": 1255 }, { "epoch": 0.06771619581626051, "grad_norm": 0.8883876204490662, "learning_rate": 9.993994822715758e-06, "loss": 0.8815, "step": 1256 }, { "epoch": 0.06777010998490403, "grad_norm": 0.9746614098548889, "learning_rate": 9.993984430726627e-06, "loss": 0.7897, "step": 1257 }, { "epoch": 0.06782402415354755, "grad_norm": 0.9773344993591309, "learning_rate": 9.993974029758988e-06, "loss": 0.8499, "step": 1258 }, { "epoch": 0.06787793832219108, "grad_norm": 0.9552164077758789, "learning_rate": 9.993963619812856e-06, "loss": 0.711, "step": 1259 }, { "epoch": 0.0679318524908346, "grad_norm": 0.9146968126296997, "learning_rate": 9.993953200888252e-06, "loss": 0.9016, "step": 1260 }, { "epoch": 0.06798576665947811, "grad_norm": 0.924244225025177, "learning_rate": 9.993942772985192e-06, "loss": 0.7534, "step": 1261 }, { "epoch": 0.06803968082812163, "grad_norm": 1.2963265180587769, "learning_rate": 9.993932336103699e-06, "loss": 0.9409, "step": 1262 }, { "epoch": 0.06809359499676515, "grad_norm": 0.7954462766647339, "learning_rate": 9.993921890243788e-06, "loss": 0.7669, "step": 1263 }, { "epoch": 0.06814750916540867, "grad_norm": 0.9115849137306213, "learning_rate": 9.993911435405478e-06, "loss": 0.7567, "step": 1264 }, { "epoch": 0.06820142333405219, "grad_norm": 1.0030237436294556, "learning_rate": 9.99390097158879e-06, "loss": 0.8952, "step": 1265 }, { "epoch": 0.0682553375026957, "grad_norm": 0.8897690773010254, "learning_rate": 9.993890498793742e-06, "loss": 0.7993, "step": 1266 }, { "epoch": 0.06830925167133922, "grad_norm": 0.9283807277679443, "learning_rate": 9.993880017020349e-06, "loss": 0.8808, "step": 1267 }, { "epoch": 0.06836316583998275, "grad_norm": 0.848922848701477, "learning_rate": 9.993869526268637e-06, "loss": 0.7979, "step": 1268 }, { "epoch": 0.06841708000862627, "grad_norm": 0.8896105289459229, "learning_rate": 9.993859026538618e-06, "loss": 0.8886, "step": 1269 }, { "epoch": 0.06847099417726979, "grad_norm": 0.8602685928344727, "learning_rate": 9.993848517830318e-06, "loss": 0.8209, "step": 1270 }, { "epoch": 0.06852490834591331, "grad_norm": 0.9300077557563782, "learning_rate": 9.99383800014375e-06, "loss": 0.9261, "step": 1271 }, { "epoch": 0.06857882251455683, "grad_norm": 0.8691270351409912, "learning_rate": 9.993827473478934e-06, "loss": 0.9217, "step": 1272 }, { "epoch": 0.06863273668320034, "grad_norm": 0.7943814992904663, "learning_rate": 9.99381693783589e-06, "loss": 0.8557, "step": 1273 }, { "epoch": 0.06868665085184386, "grad_norm": 0.9060125946998596, "learning_rate": 9.993806393214638e-06, "loss": 0.8314, "step": 1274 }, { "epoch": 0.06874056502048738, "grad_norm": 0.8014434576034546, "learning_rate": 9.993795839615194e-06, "loss": 0.8047, "step": 1275 }, { "epoch": 0.0687944791891309, "grad_norm": 1.0498815774917603, "learning_rate": 9.993785277037578e-06, "loss": 0.7125, "step": 1276 }, { "epoch": 0.06884839335777443, "grad_norm": 0.8868438005447388, "learning_rate": 9.993774705481812e-06, "loss": 0.8594, "step": 1277 }, { "epoch": 0.06890230752641795, "grad_norm": 0.8213896155357361, "learning_rate": 9.993764124947911e-06, "loss": 0.7995, "step": 1278 }, { "epoch": 0.06895622169506146, "grad_norm": 0.9007741212844849, "learning_rate": 9.993753535435895e-06, "loss": 0.8982, "step": 1279 }, { "epoch": 0.06901013586370498, "grad_norm": 0.8377478122711182, "learning_rate": 9.993742936945785e-06, "loss": 0.7387, "step": 1280 }, { "epoch": 0.0690640500323485, "grad_norm": 0.8009492754936218, "learning_rate": 9.993732329477598e-06, "loss": 0.8079, "step": 1281 }, { "epoch": 0.06911796420099202, "grad_norm": 0.8478789925575256, "learning_rate": 9.993721713031354e-06, "loss": 0.8682, "step": 1282 }, { "epoch": 0.06917187836963554, "grad_norm": 0.7498561143875122, "learning_rate": 9.993711087607072e-06, "loss": 0.8107, "step": 1283 }, { "epoch": 0.06922579253827905, "grad_norm": 0.8972634077072144, "learning_rate": 9.99370045320477e-06, "loss": 0.8494, "step": 1284 }, { "epoch": 0.06927970670692257, "grad_norm": 0.942449152469635, "learning_rate": 9.99368980982447e-06, "loss": 0.8487, "step": 1285 }, { "epoch": 0.0693336208755661, "grad_norm": 0.8752795457839966, "learning_rate": 9.993679157466188e-06, "loss": 0.8859, "step": 1286 }, { "epoch": 0.06938753504420962, "grad_norm": 0.8289507031440735, "learning_rate": 9.993668496129945e-06, "loss": 0.8726, "step": 1287 }, { "epoch": 0.06944144921285314, "grad_norm": 0.9452151656150818, "learning_rate": 9.993657825815759e-06, "loss": 0.9266, "step": 1288 }, { "epoch": 0.06949536338149666, "grad_norm": 0.8697348237037659, "learning_rate": 9.993647146523651e-06, "loss": 0.8946, "step": 1289 }, { "epoch": 0.06954927755014018, "grad_norm": 0.8712061643600464, "learning_rate": 9.993636458253637e-06, "loss": 0.8551, "step": 1290 }, { "epoch": 0.0696031917187837, "grad_norm": 0.9295617938041687, "learning_rate": 9.993625761005739e-06, "loss": 0.8963, "step": 1291 }, { "epoch": 0.06965710588742721, "grad_norm": 0.9441055059432983, "learning_rate": 9.993615054779975e-06, "loss": 0.9567, "step": 1292 }, { "epoch": 0.06971102005607073, "grad_norm": 0.8742032051086426, "learning_rate": 9.993604339576365e-06, "loss": 0.8341, "step": 1293 }, { "epoch": 0.06976493422471426, "grad_norm": 0.8596220016479492, "learning_rate": 9.993593615394928e-06, "loss": 0.8576, "step": 1294 }, { "epoch": 0.06981884839335778, "grad_norm": 0.8011770844459534, "learning_rate": 9.993582882235682e-06, "loss": 0.7317, "step": 1295 }, { "epoch": 0.0698727625620013, "grad_norm": 0.8578245043754578, "learning_rate": 9.993572140098648e-06, "loss": 0.8853, "step": 1296 }, { "epoch": 0.06992667673064482, "grad_norm": 1.1155178546905518, "learning_rate": 9.993561388983845e-06, "loss": 0.8199, "step": 1297 }, { "epoch": 0.06998059089928833, "grad_norm": 1.035699486732483, "learning_rate": 9.993550628891293e-06, "loss": 0.9498, "step": 1298 }, { "epoch": 0.07003450506793185, "grad_norm": 0.8635748028755188, "learning_rate": 9.99353985982101e-06, "loss": 0.8741, "step": 1299 }, { "epoch": 0.07008841923657537, "grad_norm": 0.8650850653648376, "learning_rate": 9.993529081773016e-06, "loss": 0.7337, "step": 1300 }, { "epoch": 0.07014233340521889, "grad_norm": 0.8334539532661438, "learning_rate": 9.99351829474733e-06, "loss": 0.8927, "step": 1301 }, { "epoch": 0.0701962475738624, "grad_norm": 0.9150926470756531, "learning_rate": 9.993507498743971e-06, "loss": 0.8464, "step": 1302 }, { "epoch": 0.07025016174250594, "grad_norm": 0.8916522860527039, "learning_rate": 9.993496693762958e-06, "loss": 0.7899, "step": 1303 }, { "epoch": 0.07030407591114946, "grad_norm": 1.0224976539611816, "learning_rate": 9.993485879804314e-06, "loss": 0.8256, "step": 1304 }, { "epoch": 0.07035799007979297, "grad_norm": 0.921816885471344, "learning_rate": 9.993475056868054e-06, "loss": 0.7944, "step": 1305 }, { "epoch": 0.07041190424843649, "grad_norm": 0.8775705099105835, "learning_rate": 9.9934642249542e-06, "loss": 0.9098, "step": 1306 }, { "epoch": 0.07046581841708001, "grad_norm": 0.9802567362785339, "learning_rate": 9.99345338406277e-06, "loss": 0.9756, "step": 1307 }, { "epoch": 0.07051973258572353, "grad_norm": 0.9785491228103638, "learning_rate": 9.993442534193786e-06, "loss": 1.0017, "step": 1308 }, { "epoch": 0.07057364675436704, "grad_norm": 0.8796840906143188, "learning_rate": 9.993431675347265e-06, "loss": 0.7202, "step": 1309 }, { "epoch": 0.07062756092301056, "grad_norm": 0.878099799156189, "learning_rate": 9.993420807523227e-06, "loss": 0.8655, "step": 1310 }, { "epoch": 0.07068147509165408, "grad_norm": 0.8361509442329407, "learning_rate": 9.99340993072169e-06, "loss": 0.8522, "step": 1311 }, { "epoch": 0.07073538926029761, "grad_norm": 0.8556873798370361, "learning_rate": 9.99339904494268e-06, "loss": 0.8603, "step": 1312 }, { "epoch": 0.07078930342894113, "grad_norm": 0.8434461355209351, "learning_rate": 9.993388150186208e-06, "loss": 0.8571, "step": 1313 }, { "epoch": 0.07084321759758465, "grad_norm": 0.8545907139778137, "learning_rate": 9.9933772464523e-06, "loss": 0.8145, "step": 1314 }, { "epoch": 0.07089713176622817, "grad_norm": 0.9502561092376709, "learning_rate": 9.993366333740971e-06, "loss": 0.8068, "step": 1315 }, { "epoch": 0.07095104593487168, "grad_norm": 0.848628580570221, "learning_rate": 9.993355412052244e-06, "loss": 0.8793, "step": 1316 }, { "epoch": 0.0710049601035152, "grad_norm": 0.9699797630310059, "learning_rate": 9.993344481386137e-06, "loss": 0.9904, "step": 1317 }, { "epoch": 0.07105887427215872, "grad_norm": 0.8888396620750427, "learning_rate": 9.993333541742671e-06, "loss": 0.8363, "step": 1318 }, { "epoch": 0.07111278844080224, "grad_norm": 0.8805423974990845, "learning_rate": 9.993322593121863e-06, "loss": 0.8905, "step": 1319 }, { "epoch": 0.07116670260944576, "grad_norm": 0.8875272274017334, "learning_rate": 9.993311635523736e-06, "loss": 0.7717, "step": 1320 }, { "epoch": 0.07122061677808929, "grad_norm": 0.8853299617767334, "learning_rate": 9.993300668948308e-06, "loss": 0.9077, "step": 1321 }, { "epoch": 0.0712745309467328, "grad_norm": 0.8847644329071045, "learning_rate": 9.993289693395599e-06, "loss": 0.8362, "step": 1322 }, { "epoch": 0.07132844511537632, "grad_norm": 0.9531683325767517, "learning_rate": 9.993278708865629e-06, "loss": 0.8848, "step": 1323 }, { "epoch": 0.07138235928401984, "grad_norm": 0.8573325276374817, "learning_rate": 9.993267715358414e-06, "loss": 0.8367, "step": 1324 }, { "epoch": 0.07143627345266336, "grad_norm": 0.8920298218727112, "learning_rate": 9.99325671287398e-06, "loss": 0.8838, "step": 1325 }, { "epoch": 0.07149018762130688, "grad_norm": 0.8472782969474792, "learning_rate": 9.993245701412343e-06, "loss": 0.8313, "step": 1326 }, { "epoch": 0.0715441017899504, "grad_norm": 1.047664761543274, "learning_rate": 9.993234680973525e-06, "loss": 0.8663, "step": 1327 }, { "epoch": 0.07159801595859391, "grad_norm": 0.9395570158958435, "learning_rate": 9.993223651557542e-06, "loss": 0.7703, "step": 1328 }, { "epoch": 0.07165193012723743, "grad_norm": 0.9125472903251648, "learning_rate": 9.993212613164419e-06, "loss": 0.9335, "step": 1329 }, { "epoch": 0.07170584429588096, "grad_norm": 0.9043323397636414, "learning_rate": 9.993201565794172e-06, "loss": 0.9185, "step": 1330 }, { "epoch": 0.07175975846452448, "grad_norm": 0.8764339089393616, "learning_rate": 9.993190509446821e-06, "loss": 0.8807, "step": 1331 }, { "epoch": 0.071813672633168, "grad_norm": 0.9123268723487854, "learning_rate": 9.99317944412239e-06, "loss": 0.8134, "step": 1332 }, { "epoch": 0.07186758680181152, "grad_norm": 0.9625567197799683, "learning_rate": 9.993168369820892e-06, "loss": 0.8132, "step": 1333 }, { "epoch": 0.07192150097045504, "grad_norm": 0.880536675453186, "learning_rate": 9.993157286542352e-06, "loss": 0.8107, "step": 1334 }, { "epoch": 0.07197541513909855, "grad_norm": 0.9165224432945251, "learning_rate": 9.99314619428679e-06, "loss": 0.8376, "step": 1335 }, { "epoch": 0.07202932930774207, "grad_norm": 0.8278066515922546, "learning_rate": 9.993135093054223e-06, "loss": 0.8075, "step": 1336 }, { "epoch": 0.07208324347638559, "grad_norm": 0.9237795472145081, "learning_rate": 9.993123982844674e-06, "loss": 0.7838, "step": 1337 }, { "epoch": 0.0721371576450291, "grad_norm": 0.8200939297676086, "learning_rate": 9.993112863658161e-06, "loss": 0.8475, "step": 1338 }, { "epoch": 0.07219107181367264, "grad_norm": 0.8505958318710327, "learning_rate": 9.993101735494704e-06, "loss": 0.7891, "step": 1339 }, { "epoch": 0.07224498598231616, "grad_norm": 0.8407264351844788, "learning_rate": 9.993090598354323e-06, "loss": 0.8128, "step": 1340 }, { "epoch": 0.07229890015095967, "grad_norm": 0.8039887547492981, "learning_rate": 9.993079452237038e-06, "loss": 0.8504, "step": 1341 }, { "epoch": 0.07235281431960319, "grad_norm": 0.7590643167495728, "learning_rate": 9.993068297142871e-06, "loss": 0.7402, "step": 1342 }, { "epoch": 0.07240672848824671, "grad_norm": 0.7866249680519104, "learning_rate": 9.993057133071842e-06, "loss": 0.7076, "step": 1343 }, { "epoch": 0.07246064265689023, "grad_norm": 0.9846029281616211, "learning_rate": 9.993045960023967e-06, "loss": 0.9179, "step": 1344 }, { "epoch": 0.07251455682553375, "grad_norm": 0.8918319940567017, "learning_rate": 9.99303477799927e-06, "loss": 0.8087, "step": 1345 }, { "epoch": 0.07256847099417726, "grad_norm": 0.8407700061798096, "learning_rate": 9.99302358699777e-06, "loss": 0.7272, "step": 1346 }, { "epoch": 0.0726223851628208, "grad_norm": 0.9637326598167419, "learning_rate": 9.993012387019486e-06, "loss": 0.8613, "step": 1347 }, { "epoch": 0.07267629933146431, "grad_norm": 0.8362317681312561, "learning_rate": 9.99300117806444e-06, "loss": 0.917, "step": 1348 }, { "epoch": 0.07273021350010783, "grad_norm": 0.8584982752799988, "learning_rate": 9.992989960132651e-06, "loss": 0.8857, "step": 1349 }, { "epoch": 0.07278412766875135, "grad_norm": 0.8341198563575745, "learning_rate": 9.992978733224139e-06, "loss": 0.802, "step": 1350 }, { "epoch": 0.07283804183739487, "grad_norm": 1.6860167980194092, "learning_rate": 9.992967497338926e-06, "loss": 0.8789, "step": 1351 }, { "epoch": 0.07289195600603839, "grad_norm": 0.8399189114570618, "learning_rate": 9.99295625247703e-06, "loss": 0.6338, "step": 1352 }, { "epoch": 0.0729458701746819, "grad_norm": 0.9616976976394653, "learning_rate": 9.992944998638473e-06, "loss": 0.9735, "step": 1353 }, { "epoch": 0.07299978434332542, "grad_norm": 0.8592861890792847, "learning_rate": 9.992933735823272e-06, "loss": 0.8159, "step": 1354 }, { "epoch": 0.07305369851196894, "grad_norm": 0.8448725342750549, "learning_rate": 9.992922464031451e-06, "loss": 0.7942, "step": 1355 }, { "epoch": 0.07310761268061247, "grad_norm": 0.8015927672386169, "learning_rate": 9.99291118326303e-06, "loss": 0.7429, "step": 1356 }, { "epoch": 0.07316152684925599, "grad_norm": 0.8255912065505981, "learning_rate": 9.992899893518025e-06, "loss": 0.8532, "step": 1357 }, { "epoch": 0.07321544101789951, "grad_norm": 0.8764085173606873, "learning_rate": 9.992888594796462e-06, "loss": 0.7989, "step": 1358 }, { "epoch": 0.07326935518654303, "grad_norm": 0.8405522704124451, "learning_rate": 9.992877287098357e-06, "loss": 0.8709, "step": 1359 }, { "epoch": 0.07332326935518654, "grad_norm": 0.8657836318016052, "learning_rate": 9.992865970423733e-06, "loss": 0.8236, "step": 1360 }, { "epoch": 0.07337718352383006, "grad_norm": 0.8817959427833557, "learning_rate": 9.992854644772609e-06, "loss": 0.902, "step": 1361 }, { "epoch": 0.07343109769247358, "grad_norm": 0.8290701508522034, "learning_rate": 9.992843310145006e-06, "loss": 0.8454, "step": 1362 }, { "epoch": 0.0734850118611171, "grad_norm": 0.9637642502784729, "learning_rate": 9.992831966540946e-06, "loss": 0.9414, "step": 1363 }, { "epoch": 0.07353892602976062, "grad_norm": 0.9220197200775146, "learning_rate": 9.992820613960446e-06, "loss": 0.9827, "step": 1364 }, { "epoch": 0.07359284019840415, "grad_norm": 0.9008362889289856, "learning_rate": 9.992809252403526e-06, "loss": 0.8388, "step": 1365 }, { "epoch": 0.07364675436704766, "grad_norm": 0.9517331123352051, "learning_rate": 9.992797881870212e-06, "loss": 0.8758, "step": 1366 }, { "epoch": 0.07370066853569118, "grad_norm": 0.7811571359634399, "learning_rate": 9.992786502360517e-06, "loss": 0.6984, "step": 1367 }, { "epoch": 0.0737545827043347, "grad_norm": 0.9887184500694275, "learning_rate": 9.992775113874466e-06, "loss": 0.7832, "step": 1368 }, { "epoch": 0.07380849687297822, "grad_norm": 1.025869607925415, "learning_rate": 9.99276371641208e-06, "loss": 0.8417, "step": 1369 }, { "epoch": 0.07386241104162174, "grad_norm": 0.8479165434837341, "learning_rate": 9.99275230997338e-06, "loss": 0.7862, "step": 1370 }, { "epoch": 0.07391632521026525, "grad_norm": 0.9213555455207825, "learning_rate": 9.992740894558381e-06, "loss": 0.915, "step": 1371 }, { "epoch": 0.07397023937890877, "grad_norm": 0.832306444644928, "learning_rate": 9.992729470167109e-06, "loss": 0.7566, "step": 1372 }, { "epoch": 0.07402415354755229, "grad_norm": 1.0360348224639893, "learning_rate": 9.992718036799583e-06, "loss": 0.9096, "step": 1373 }, { "epoch": 0.07407806771619582, "grad_norm": 0.8898483514785767, "learning_rate": 9.992706594455823e-06, "loss": 0.8738, "step": 1374 }, { "epoch": 0.07413198188483934, "grad_norm": 0.8813758492469788, "learning_rate": 9.992695143135849e-06, "loss": 0.8736, "step": 1375 }, { "epoch": 0.07418589605348286, "grad_norm": 1.1480571031570435, "learning_rate": 9.992683682839683e-06, "loss": 0.915, "step": 1376 }, { "epoch": 0.07423981022212638, "grad_norm": 0.8588376641273499, "learning_rate": 9.992672213567345e-06, "loss": 0.8295, "step": 1377 }, { "epoch": 0.0742937243907699, "grad_norm": 0.8729918599128723, "learning_rate": 9.992660735318858e-06, "loss": 0.9058, "step": 1378 }, { "epoch": 0.07434763855941341, "grad_norm": 0.7953224778175354, "learning_rate": 9.992649248094236e-06, "loss": 0.7857, "step": 1379 }, { "epoch": 0.07440155272805693, "grad_norm": 0.8485717177391052, "learning_rate": 9.992637751893508e-06, "loss": 0.7641, "step": 1380 }, { "epoch": 0.07445546689670045, "grad_norm": 0.8630878329277039, "learning_rate": 9.99262624671669e-06, "loss": 0.8624, "step": 1381 }, { "epoch": 0.07450938106534397, "grad_norm": 0.8655185103416443, "learning_rate": 9.992614732563802e-06, "loss": 0.8428, "step": 1382 }, { "epoch": 0.0745632952339875, "grad_norm": 0.7875732779502869, "learning_rate": 9.992603209434868e-06, "loss": 0.7272, "step": 1383 }, { "epoch": 0.07461720940263102, "grad_norm": 0.875879168510437, "learning_rate": 9.992591677329905e-06, "loss": 0.8539, "step": 1384 }, { "epoch": 0.07467112357127453, "grad_norm": 0.8618319034576416, "learning_rate": 9.992580136248934e-06, "loss": 0.879, "step": 1385 }, { "epoch": 0.07472503773991805, "grad_norm": 0.8695591688156128, "learning_rate": 9.992568586191981e-06, "loss": 0.8477, "step": 1386 }, { "epoch": 0.07477895190856157, "grad_norm": 0.8539825677871704, "learning_rate": 9.992557027159062e-06, "loss": 0.7347, "step": 1387 }, { "epoch": 0.07483286607720509, "grad_norm": 0.9625217914581299, "learning_rate": 9.992545459150197e-06, "loss": 0.8561, "step": 1388 }, { "epoch": 0.0748867802458486, "grad_norm": 0.9862298369407654, "learning_rate": 9.992533882165409e-06, "loss": 0.9583, "step": 1389 }, { "epoch": 0.07494069441449212, "grad_norm": 0.8217719793319702, "learning_rate": 9.99252229620472e-06, "loss": 0.7995, "step": 1390 }, { "epoch": 0.07499460858313564, "grad_norm": 0.8668621182441711, "learning_rate": 9.992510701268147e-06, "loss": 0.8484, "step": 1391 }, { "epoch": 0.07504852275177917, "grad_norm": 0.8549453616142273, "learning_rate": 9.992499097355716e-06, "loss": 0.8552, "step": 1392 }, { "epoch": 0.07510243692042269, "grad_norm": 0.8262618184089661, "learning_rate": 9.992487484467444e-06, "loss": 0.7054, "step": 1393 }, { "epoch": 0.07515635108906621, "grad_norm": 0.8524961471557617, "learning_rate": 9.992475862603352e-06, "loss": 0.8231, "step": 1394 }, { "epoch": 0.07521026525770973, "grad_norm": 0.7805570363998413, "learning_rate": 9.99246423176346e-06, "loss": 0.7778, "step": 1395 }, { "epoch": 0.07526417942635324, "grad_norm": 0.950484037399292, "learning_rate": 9.992452591947794e-06, "loss": 0.8662, "step": 1396 }, { "epoch": 0.07531809359499676, "grad_norm": 0.8746458888053894, "learning_rate": 9.99244094315637e-06, "loss": 0.7854, "step": 1397 }, { "epoch": 0.07537200776364028, "grad_norm": 0.9450538754463196, "learning_rate": 9.992429285389212e-06, "loss": 0.954, "step": 1398 }, { "epoch": 0.0754259219322838, "grad_norm": 0.9048300385475159, "learning_rate": 9.992417618646337e-06, "loss": 0.8915, "step": 1399 }, { "epoch": 0.07547983610092733, "grad_norm": 0.8735381364822388, "learning_rate": 9.99240594292777e-06, "loss": 0.8391, "step": 1400 }, { "epoch": 0.07553375026957085, "grad_norm": 1.0980675220489502, "learning_rate": 9.99239425823353e-06, "loss": 0.8892, "step": 1401 }, { "epoch": 0.07558766443821437, "grad_norm": 0.9016425013542175, "learning_rate": 9.992382564563638e-06, "loss": 0.8192, "step": 1402 }, { "epoch": 0.07564157860685788, "grad_norm": 0.801419198513031, "learning_rate": 9.992370861918117e-06, "loss": 0.7914, "step": 1403 }, { "epoch": 0.0756954927755014, "grad_norm": 0.9043407440185547, "learning_rate": 9.992359150296985e-06, "loss": 0.8767, "step": 1404 }, { "epoch": 0.07574940694414492, "grad_norm": 0.9703086018562317, "learning_rate": 9.992347429700266e-06, "loss": 0.9173, "step": 1405 }, { "epoch": 0.07580332111278844, "grad_norm": 0.8154104351997375, "learning_rate": 9.992335700127978e-06, "loss": 0.8453, "step": 1406 }, { "epoch": 0.07585723528143196, "grad_norm": 0.8551482558250427, "learning_rate": 9.992323961580146e-06, "loss": 0.9132, "step": 1407 }, { "epoch": 0.07591114945007547, "grad_norm": 0.9425063729286194, "learning_rate": 9.992312214056785e-06, "loss": 0.8171, "step": 1408 }, { "epoch": 0.075965063618719, "grad_norm": 0.8958794474601746, "learning_rate": 9.992300457557922e-06, "loss": 0.7983, "step": 1409 }, { "epoch": 0.07601897778736252, "grad_norm": 0.873874843120575, "learning_rate": 9.992288692083579e-06, "loss": 0.798, "step": 1410 }, { "epoch": 0.07607289195600604, "grad_norm": 0.7951189279556274, "learning_rate": 9.99227691763377e-06, "loss": 0.8671, "step": 1411 }, { "epoch": 0.07612680612464956, "grad_norm": 0.8073802590370178, "learning_rate": 9.992265134208522e-06, "loss": 0.8214, "step": 1412 }, { "epoch": 0.07618072029329308, "grad_norm": 0.918222188949585, "learning_rate": 9.992253341807854e-06, "loss": 0.807, "step": 1413 }, { "epoch": 0.0762346344619366, "grad_norm": 0.834381103515625, "learning_rate": 9.992241540431789e-06, "loss": 0.8737, "step": 1414 }, { "epoch": 0.07628854863058011, "grad_norm": 0.808437168598175, "learning_rate": 9.992229730080347e-06, "loss": 0.7982, "step": 1415 }, { "epoch": 0.07634246279922363, "grad_norm": 0.7868708968162537, "learning_rate": 9.992217910753547e-06, "loss": 0.7071, "step": 1416 }, { "epoch": 0.07639637696786715, "grad_norm": 0.8445919156074524, "learning_rate": 9.992206082451416e-06, "loss": 0.8353, "step": 1417 }, { "epoch": 0.07645029113651068, "grad_norm": 0.8283419609069824, "learning_rate": 9.992194245173969e-06, "loss": 0.867, "step": 1418 }, { "epoch": 0.0765042053051542, "grad_norm": 0.8390635251998901, "learning_rate": 9.99218239892123e-06, "loss": 0.822, "step": 1419 }, { "epoch": 0.07655811947379772, "grad_norm": 0.9037001132965088, "learning_rate": 9.992170543693222e-06, "loss": 0.8759, "step": 1420 }, { "epoch": 0.07661203364244124, "grad_norm": 0.9708169102668762, "learning_rate": 9.992158679489965e-06, "loss": 0.875, "step": 1421 }, { "epoch": 0.07666594781108475, "grad_norm": 0.8712205290794373, "learning_rate": 9.992146806311479e-06, "loss": 0.8711, "step": 1422 }, { "epoch": 0.07671986197972827, "grad_norm": 0.953936755657196, "learning_rate": 9.992134924157786e-06, "loss": 0.8117, "step": 1423 }, { "epoch": 0.07677377614837179, "grad_norm": 1.3178669214248657, "learning_rate": 9.992123033028908e-06, "loss": 0.8932, "step": 1424 }, { "epoch": 0.0768276903170153, "grad_norm": 0.8657799959182739, "learning_rate": 9.992111132924867e-06, "loss": 0.8429, "step": 1425 }, { "epoch": 0.07688160448565882, "grad_norm": 0.8979378938674927, "learning_rate": 9.992099223845681e-06, "loss": 0.9165, "step": 1426 }, { "epoch": 0.07693551865430236, "grad_norm": 0.797493040561676, "learning_rate": 9.992087305791376e-06, "loss": 0.8139, "step": 1427 }, { "epoch": 0.07698943282294587, "grad_norm": 0.9762497544288635, "learning_rate": 9.99207537876197e-06, "loss": 0.8006, "step": 1428 }, { "epoch": 0.07704334699158939, "grad_norm": 0.9322238564491272, "learning_rate": 9.992063442757487e-06, "loss": 0.8708, "step": 1429 }, { "epoch": 0.07709726116023291, "grad_norm": 0.9208402037620544, "learning_rate": 9.992051497777947e-06, "loss": 0.9137, "step": 1430 }, { "epoch": 0.07715117532887643, "grad_norm": 0.9262849688529968, "learning_rate": 9.99203954382337e-06, "loss": 0.8043, "step": 1431 }, { "epoch": 0.07720508949751995, "grad_norm": 1.0556507110595703, "learning_rate": 9.992027580893781e-06, "loss": 0.8321, "step": 1432 }, { "epoch": 0.07725900366616346, "grad_norm": 1.0503417253494263, "learning_rate": 9.9920156089892e-06, "loss": 0.8875, "step": 1433 }, { "epoch": 0.07731291783480698, "grad_norm": 0.8772387504577637, "learning_rate": 9.992003628109647e-06, "loss": 0.7407, "step": 1434 }, { "epoch": 0.0773668320034505, "grad_norm": 0.942286491394043, "learning_rate": 9.991991638255146e-06, "loss": 0.8493, "step": 1435 }, { "epoch": 0.07742074617209403, "grad_norm": 0.8584794998168945, "learning_rate": 9.991979639425717e-06, "loss": 0.8003, "step": 1436 }, { "epoch": 0.07747466034073755, "grad_norm": 0.8247780203819275, "learning_rate": 9.99196763162138e-06, "loss": 0.9156, "step": 1437 }, { "epoch": 0.07752857450938107, "grad_norm": 0.859018862247467, "learning_rate": 9.99195561484216e-06, "loss": 0.8255, "step": 1438 }, { "epoch": 0.07758248867802459, "grad_norm": 0.9073282480239868, "learning_rate": 9.991943589088078e-06, "loss": 0.903, "step": 1439 }, { "epoch": 0.0776364028466681, "grad_norm": 0.9324385523796082, "learning_rate": 9.991931554359154e-06, "loss": 0.8618, "step": 1440 }, { "epoch": 0.07769031701531162, "grad_norm": 0.8038938045501709, "learning_rate": 9.991919510655409e-06, "loss": 0.7545, "step": 1441 }, { "epoch": 0.07774423118395514, "grad_norm": 0.7999526858329773, "learning_rate": 9.991907457976866e-06, "loss": 0.6804, "step": 1442 }, { "epoch": 0.07779814535259866, "grad_norm": 1.0165048837661743, "learning_rate": 9.991895396323548e-06, "loss": 0.7664, "step": 1443 }, { "epoch": 0.07785205952124218, "grad_norm": 0.9513073563575745, "learning_rate": 9.991883325695475e-06, "loss": 0.8115, "step": 1444 }, { "epoch": 0.07790597368988571, "grad_norm": 1.0391769409179688, "learning_rate": 9.991871246092669e-06, "loss": 0.9197, "step": 1445 }, { "epoch": 0.07795988785852923, "grad_norm": 0.8990768194198608, "learning_rate": 9.991859157515151e-06, "loss": 0.9507, "step": 1446 }, { "epoch": 0.07801380202717274, "grad_norm": 0.9990912079811096, "learning_rate": 9.991847059962945e-06, "loss": 0.7951, "step": 1447 }, { "epoch": 0.07806771619581626, "grad_norm": 1.0030032396316528, "learning_rate": 9.99183495343607e-06, "loss": 0.7237, "step": 1448 }, { "epoch": 0.07812163036445978, "grad_norm": 0.889561116695404, "learning_rate": 9.991822837934551e-06, "loss": 0.9061, "step": 1449 }, { "epoch": 0.0781755445331033, "grad_norm": 0.8766982555389404, "learning_rate": 9.991810713458405e-06, "loss": 0.7952, "step": 1450 }, { "epoch": 0.07822945870174682, "grad_norm": 0.9144406914710999, "learning_rate": 9.991798580007658e-06, "loss": 0.9235, "step": 1451 }, { "epoch": 0.07828337287039033, "grad_norm": 0.895516037940979, "learning_rate": 9.99178643758233e-06, "loss": 0.9469, "step": 1452 }, { "epoch": 0.07833728703903386, "grad_norm": 0.8802943229675293, "learning_rate": 9.991774286182443e-06, "loss": 0.8548, "step": 1453 }, { "epoch": 0.07839120120767738, "grad_norm": 1.2773913145065308, "learning_rate": 9.99176212580802e-06, "loss": 0.794, "step": 1454 }, { "epoch": 0.0784451153763209, "grad_norm": 0.9501168131828308, "learning_rate": 9.99174995645908e-06, "loss": 0.8711, "step": 1455 }, { "epoch": 0.07849902954496442, "grad_norm": 0.9047390222549438, "learning_rate": 9.991737778135649e-06, "loss": 0.8419, "step": 1456 }, { "epoch": 0.07855294371360794, "grad_norm": 0.9492837190628052, "learning_rate": 9.991725590837747e-06, "loss": 0.9832, "step": 1457 }, { "epoch": 0.07860685788225145, "grad_norm": 0.9585106372833252, "learning_rate": 9.991713394565394e-06, "loss": 0.8393, "step": 1458 }, { "epoch": 0.07866077205089497, "grad_norm": 0.9568297266960144, "learning_rate": 9.991701189318615e-06, "loss": 0.8711, "step": 1459 }, { "epoch": 0.07871468621953849, "grad_norm": 0.9201347231864929, "learning_rate": 9.991688975097429e-06, "loss": 0.7947, "step": 1460 }, { "epoch": 0.07876860038818201, "grad_norm": 0.8375768661499023, "learning_rate": 9.99167675190186e-06, "loss": 0.8051, "step": 1461 }, { "epoch": 0.07882251455682554, "grad_norm": 0.8397765755653381, "learning_rate": 9.99166451973193e-06, "loss": 0.7727, "step": 1462 }, { "epoch": 0.07887642872546906, "grad_norm": 0.8697947859764099, "learning_rate": 9.99165227858766e-06, "loss": 0.8171, "step": 1463 }, { "epoch": 0.07893034289411258, "grad_norm": 0.8894750475883484, "learning_rate": 9.991640028469073e-06, "loss": 0.8773, "step": 1464 }, { "epoch": 0.0789842570627561, "grad_norm": 0.8817871809005737, "learning_rate": 9.991627769376189e-06, "loss": 0.8983, "step": 1465 }, { "epoch": 0.07903817123139961, "grad_norm": 0.9241123795509338, "learning_rate": 9.99161550130903e-06, "loss": 0.8967, "step": 1466 }, { "epoch": 0.07909208540004313, "grad_norm": 0.852982223033905, "learning_rate": 9.991603224267623e-06, "loss": 0.9054, "step": 1467 }, { "epoch": 0.07914599956868665, "grad_norm": 0.7719098925590515, "learning_rate": 9.991590938251986e-06, "loss": 0.7845, "step": 1468 }, { "epoch": 0.07919991373733017, "grad_norm": 0.8700329661369324, "learning_rate": 9.99157864326214e-06, "loss": 0.9664, "step": 1469 }, { "epoch": 0.07925382790597368, "grad_norm": 0.880553126335144, "learning_rate": 9.991566339298112e-06, "loss": 0.8803, "step": 1470 }, { "epoch": 0.07930774207461722, "grad_norm": 0.9425762295722961, "learning_rate": 9.991554026359918e-06, "loss": 0.8259, "step": 1471 }, { "epoch": 0.07936165624326073, "grad_norm": 0.8611294031143188, "learning_rate": 9.991541704447585e-06, "loss": 0.8693, "step": 1472 }, { "epoch": 0.07941557041190425, "grad_norm": 0.856023907661438, "learning_rate": 9.99152937356113e-06, "loss": 0.7073, "step": 1473 }, { "epoch": 0.07946948458054777, "grad_norm": 0.7763693332672119, "learning_rate": 9.991517033700582e-06, "loss": 0.6815, "step": 1474 }, { "epoch": 0.07952339874919129, "grad_norm": 0.8417321443557739, "learning_rate": 9.991504684865959e-06, "loss": 0.8239, "step": 1475 }, { "epoch": 0.0795773129178348, "grad_norm": 0.9151323437690735, "learning_rate": 9.991492327057282e-06, "loss": 0.8327, "step": 1476 }, { "epoch": 0.07963122708647832, "grad_norm": 0.8285405039787292, "learning_rate": 9.991479960274576e-06, "loss": 0.8623, "step": 1477 }, { "epoch": 0.07968514125512184, "grad_norm": 0.8204792141914368, "learning_rate": 9.991467584517863e-06, "loss": 0.8494, "step": 1478 }, { "epoch": 0.07973905542376536, "grad_norm": 0.8516230583190918, "learning_rate": 9.991455199787164e-06, "loss": 0.8219, "step": 1479 }, { "epoch": 0.07979296959240889, "grad_norm": 0.9418333172798157, "learning_rate": 9.991442806082501e-06, "loss": 0.9293, "step": 1480 }, { "epoch": 0.07984688376105241, "grad_norm": 0.8852763175964355, "learning_rate": 9.991430403403898e-06, "loss": 0.8124, "step": 1481 }, { "epoch": 0.07990079792969593, "grad_norm": 0.8435791730880737, "learning_rate": 9.991417991751376e-06, "loss": 0.8634, "step": 1482 }, { "epoch": 0.07995471209833944, "grad_norm": 0.7795083522796631, "learning_rate": 9.991405571124957e-06, "loss": 0.802, "step": 1483 }, { "epoch": 0.08000862626698296, "grad_norm": 0.8102303743362427, "learning_rate": 9.991393141524663e-06, "loss": 0.7492, "step": 1484 }, { "epoch": 0.08006254043562648, "grad_norm": 0.8433593511581421, "learning_rate": 9.99138070295052e-06, "loss": 0.7926, "step": 1485 }, { "epoch": 0.08011645460427, "grad_norm": 0.8992267847061157, "learning_rate": 9.991368255402546e-06, "loss": 0.7859, "step": 1486 }, { "epoch": 0.08017036877291352, "grad_norm": 0.8748059868812561, "learning_rate": 9.991355798880765e-06, "loss": 0.8245, "step": 1487 }, { "epoch": 0.08022428294155703, "grad_norm": 0.8456832766532898, "learning_rate": 9.9913433333852e-06, "loss": 0.9009, "step": 1488 }, { "epoch": 0.08027819711020057, "grad_norm": 0.8582474589347839, "learning_rate": 9.991330858915873e-06, "loss": 0.7607, "step": 1489 }, { "epoch": 0.08033211127884408, "grad_norm": 0.8157060146331787, "learning_rate": 9.991318375472807e-06, "loss": 0.8426, "step": 1490 }, { "epoch": 0.0803860254474876, "grad_norm": 0.7474784851074219, "learning_rate": 9.991305883056021e-06, "loss": 0.8014, "step": 1491 }, { "epoch": 0.08043993961613112, "grad_norm": 0.8432475924491882, "learning_rate": 9.991293381665543e-06, "loss": 0.8254, "step": 1492 }, { "epoch": 0.08049385378477464, "grad_norm": 0.8733057379722595, "learning_rate": 9.991280871301392e-06, "loss": 0.8694, "step": 1493 }, { "epoch": 0.08054776795341816, "grad_norm": 0.8694074153900146, "learning_rate": 9.991268351963592e-06, "loss": 0.7306, "step": 1494 }, { "epoch": 0.08060168212206167, "grad_norm": 0.8981258869171143, "learning_rate": 9.991255823652162e-06, "loss": 0.7821, "step": 1495 }, { "epoch": 0.08065559629070519, "grad_norm": 0.9740719795227051, "learning_rate": 9.99124328636713e-06, "loss": 0.7678, "step": 1496 }, { "epoch": 0.08070951045934871, "grad_norm": 0.8847763538360596, "learning_rate": 9.991230740108515e-06, "loss": 0.73, "step": 1497 }, { "epoch": 0.08076342462799224, "grad_norm": 0.8909339308738708, "learning_rate": 9.99121818487634e-06, "loss": 0.7713, "step": 1498 }, { "epoch": 0.08081733879663576, "grad_norm": 0.8183975219726562, "learning_rate": 9.991205620670626e-06, "loss": 0.8234, "step": 1499 }, { "epoch": 0.08087125296527928, "grad_norm": 1.241355299949646, "learning_rate": 9.991193047491399e-06, "loss": 0.8135, "step": 1500 }, { "epoch": 0.0809251671339228, "grad_norm": 0.9039500951766968, "learning_rate": 9.991180465338682e-06, "loss": 0.8642, "step": 1501 }, { "epoch": 0.08097908130256631, "grad_norm": 1.1762068271636963, "learning_rate": 9.991167874212493e-06, "loss": 0.7892, "step": 1502 }, { "epoch": 0.08103299547120983, "grad_norm": 0.8402833938598633, "learning_rate": 9.991155274112857e-06, "loss": 0.9054, "step": 1503 }, { "epoch": 0.08108690963985335, "grad_norm": 0.9271976351737976, "learning_rate": 9.991142665039799e-06, "loss": 0.8902, "step": 1504 }, { "epoch": 0.08114082380849687, "grad_norm": 0.9105845093727112, "learning_rate": 9.991130046993337e-06, "loss": 0.8522, "step": 1505 }, { "epoch": 0.0811947379771404, "grad_norm": 0.8248290419578552, "learning_rate": 9.991117419973499e-06, "loss": 0.882, "step": 1506 }, { "epoch": 0.08124865214578392, "grad_norm": 1.0726820230484009, "learning_rate": 9.991104783980305e-06, "loss": 0.8001, "step": 1507 }, { "epoch": 0.08130256631442744, "grad_norm": 1.296281337738037, "learning_rate": 9.991092139013776e-06, "loss": 1.0022, "step": 1508 }, { "epoch": 0.08135648048307095, "grad_norm": 1.7287628650665283, "learning_rate": 9.991079485073938e-06, "loss": 0.914, "step": 1509 }, { "epoch": 0.08141039465171447, "grad_norm": 0.8731694221496582, "learning_rate": 9.991066822160813e-06, "loss": 0.8672, "step": 1510 }, { "epoch": 0.08146430882035799, "grad_norm": 0.875747799873352, "learning_rate": 9.99105415027442e-06, "loss": 0.8044, "step": 1511 }, { "epoch": 0.08151822298900151, "grad_norm": 0.9055120348930359, "learning_rate": 9.991041469414787e-06, "loss": 0.8312, "step": 1512 }, { "epoch": 0.08157213715764502, "grad_norm": 0.8849499821662903, "learning_rate": 9.991028779581935e-06, "loss": 0.889, "step": 1513 }, { "epoch": 0.08162605132628854, "grad_norm": 0.9549855589866638, "learning_rate": 9.991016080775884e-06, "loss": 0.8929, "step": 1514 }, { "epoch": 0.08167996549493207, "grad_norm": 0.8395527005195618, "learning_rate": 9.991003372996662e-06, "loss": 0.6774, "step": 1515 }, { "epoch": 0.08173387966357559, "grad_norm": 0.7791672945022583, "learning_rate": 9.990990656244287e-06, "loss": 0.7178, "step": 1516 }, { "epoch": 0.08178779383221911, "grad_norm": 0.91841721534729, "learning_rate": 9.990977930518785e-06, "loss": 0.8372, "step": 1517 }, { "epoch": 0.08184170800086263, "grad_norm": 0.923937976360321, "learning_rate": 9.990965195820178e-06, "loss": 0.8467, "step": 1518 }, { "epoch": 0.08189562216950615, "grad_norm": 0.9804415106773376, "learning_rate": 9.990952452148488e-06, "loss": 0.9281, "step": 1519 }, { "epoch": 0.08194953633814966, "grad_norm": 0.9396255016326904, "learning_rate": 9.99093969950374e-06, "loss": 0.8606, "step": 1520 }, { "epoch": 0.08200345050679318, "grad_norm": 0.8492118120193481, "learning_rate": 9.990926937885953e-06, "loss": 0.8253, "step": 1521 }, { "epoch": 0.0820573646754367, "grad_norm": 0.8482204079627991, "learning_rate": 9.990914167295154e-06, "loss": 0.7361, "step": 1522 }, { "epoch": 0.08211127884408022, "grad_norm": 1.1302778720855713, "learning_rate": 9.990901387731365e-06, "loss": 0.7511, "step": 1523 }, { "epoch": 0.08216519301272375, "grad_norm": 0.9285756945610046, "learning_rate": 9.990888599194607e-06, "loss": 0.8329, "step": 1524 }, { "epoch": 0.08221910718136727, "grad_norm": 0.8932104110717773, "learning_rate": 9.990875801684905e-06, "loss": 0.8146, "step": 1525 }, { "epoch": 0.08227302135001079, "grad_norm": 0.8232647180557251, "learning_rate": 9.990862995202282e-06, "loss": 0.763, "step": 1526 }, { "epoch": 0.0823269355186543, "grad_norm": 0.8582163453102112, "learning_rate": 9.990850179746759e-06, "loss": 0.7675, "step": 1527 }, { "epoch": 0.08238084968729782, "grad_norm": 0.9890977144241333, "learning_rate": 9.990837355318362e-06, "loss": 0.8438, "step": 1528 }, { "epoch": 0.08243476385594134, "grad_norm": 0.9228235483169556, "learning_rate": 9.990824521917113e-06, "loss": 0.9324, "step": 1529 }, { "epoch": 0.08248867802458486, "grad_norm": 0.8286252617835999, "learning_rate": 9.990811679543033e-06, "loss": 0.872, "step": 1530 }, { "epoch": 0.08254259219322838, "grad_norm": 0.8546530604362488, "learning_rate": 9.990798828196146e-06, "loss": 0.7256, "step": 1531 }, { "epoch": 0.0825965063618719, "grad_norm": 0.8240640759468079, "learning_rate": 9.990785967876478e-06, "loss": 0.8083, "step": 1532 }, { "epoch": 0.08265042053051543, "grad_norm": 0.8650565147399902, "learning_rate": 9.99077309858405e-06, "loss": 0.8274, "step": 1533 }, { "epoch": 0.08270433469915894, "grad_norm": 0.7865849137306213, "learning_rate": 9.990760220318884e-06, "loss": 0.7978, "step": 1534 }, { "epoch": 0.08275824886780246, "grad_norm": 0.8567995429039001, "learning_rate": 9.990747333081005e-06, "loss": 0.8172, "step": 1535 }, { "epoch": 0.08281216303644598, "grad_norm": 0.8242521286010742, "learning_rate": 9.990734436870435e-06, "loss": 0.8045, "step": 1536 }, { "epoch": 0.0828660772050895, "grad_norm": 0.801266074180603, "learning_rate": 9.990721531687197e-06, "loss": 0.8312, "step": 1537 }, { "epoch": 0.08291999137373302, "grad_norm": 0.8027862906455994, "learning_rate": 9.990708617531314e-06, "loss": 0.7227, "step": 1538 }, { "epoch": 0.08297390554237653, "grad_norm": 1.0332401990890503, "learning_rate": 9.990695694402811e-06, "loss": 0.9091, "step": 1539 }, { "epoch": 0.08302781971102005, "grad_norm": 0.8537373542785645, "learning_rate": 9.99068276230171e-06, "loss": 0.7573, "step": 1540 }, { "epoch": 0.08308173387966357, "grad_norm": 0.8734087944030762, "learning_rate": 9.990669821228037e-06, "loss": 0.901, "step": 1541 }, { "epoch": 0.0831356480483071, "grad_norm": 0.8546577095985413, "learning_rate": 9.99065687118181e-06, "loss": 0.8294, "step": 1542 }, { "epoch": 0.08318956221695062, "grad_norm": 0.9555438756942749, "learning_rate": 9.990643912163055e-06, "loss": 0.83, "step": 1543 }, { "epoch": 0.08324347638559414, "grad_norm": 0.8778670430183411, "learning_rate": 9.990630944171798e-06, "loss": 0.8694, "step": 1544 }, { "epoch": 0.08329739055423765, "grad_norm": 0.973791241645813, "learning_rate": 9.990617967208058e-06, "loss": 0.8348, "step": 1545 }, { "epoch": 0.08335130472288117, "grad_norm": 0.7933714389801025, "learning_rate": 9.990604981271858e-06, "loss": 0.8208, "step": 1546 }, { "epoch": 0.08340521889152469, "grad_norm": 0.9328469634056091, "learning_rate": 9.990591986363226e-06, "loss": 0.8188, "step": 1547 }, { "epoch": 0.08345913306016821, "grad_norm": 0.8217103481292725, "learning_rate": 9.990578982482183e-06, "loss": 0.7948, "step": 1548 }, { "epoch": 0.08351304722881173, "grad_norm": 0.8556894659996033, "learning_rate": 9.990565969628749e-06, "loss": 0.8129, "step": 1549 }, { "epoch": 0.08356696139745524, "grad_norm": 0.901633083820343, "learning_rate": 9.990552947802954e-06, "loss": 0.9025, "step": 1550 }, { "epoch": 0.08362087556609878, "grad_norm": 0.9021494388580322, "learning_rate": 9.990539917004815e-06, "loss": 0.8882, "step": 1551 }, { "epoch": 0.0836747897347423, "grad_norm": 0.8187722563743591, "learning_rate": 9.990526877234359e-06, "loss": 0.7385, "step": 1552 }, { "epoch": 0.08372870390338581, "grad_norm": 0.9237630367279053, "learning_rate": 9.990513828491609e-06, "loss": 0.851, "step": 1553 }, { "epoch": 0.08378261807202933, "grad_norm": 1.1868582963943481, "learning_rate": 9.990500770776589e-06, "loss": 0.7701, "step": 1554 }, { "epoch": 0.08383653224067285, "grad_norm": 0.9831421971321106, "learning_rate": 9.990487704089322e-06, "loss": 0.836, "step": 1555 }, { "epoch": 0.08389044640931637, "grad_norm": 0.9255663752555847, "learning_rate": 9.99047462842983e-06, "loss": 0.7916, "step": 1556 }, { "epoch": 0.08394436057795988, "grad_norm": 1.0069084167480469, "learning_rate": 9.990461543798137e-06, "loss": 0.8652, "step": 1557 }, { "epoch": 0.0839982747466034, "grad_norm": 0.943044900894165, "learning_rate": 9.990448450194267e-06, "loss": 0.9511, "step": 1558 }, { "epoch": 0.08405218891524693, "grad_norm": 0.9996150135993958, "learning_rate": 9.990435347618246e-06, "loss": 0.8751, "step": 1559 }, { "epoch": 0.08410610308389045, "grad_norm": 0.9531681537628174, "learning_rate": 9.990422236070094e-06, "loss": 0.8988, "step": 1560 }, { "epoch": 0.08416001725253397, "grad_norm": 0.9504678249359131, "learning_rate": 9.990409115549837e-06, "loss": 0.808, "step": 1561 }, { "epoch": 0.08421393142117749, "grad_norm": 0.9796282052993774, "learning_rate": 9.990395986057496e-06, "loss": 0.778, "step": 1562 }, { "epoch": 0.084267845589821, "grad_norm": 0.8871618509292603, "learning_rate": 9.990382847593096e-06, "loss": 0.8945, "step": 1563 }, { "epoch": 0.08432175975846452, "grad_norm": 0.8253110647201538, "learning_rate": 9.990369700156662e-06, "loss": 0.8206, "step": 1564 }, { "epoch": 0.08437567392710804, "grad_norm": 0.8799824118614197, "learning_rate": 9.990356543748216e-06, "loss": 0.7665, "step": 1565 }, { "epoch": 0.08442958809575156, "grad_norm": 0.8275637626647949, "learning_rate": 9.990343378367782e-06, "loss": 0.8468, "step": 1566 }, { "epoch": 0.08448350226439508, "grad_norm": 1.0431691408157349, "learning_rate": 9.990330204015382e-06, "loss": 0.8539, "step": 1567 }, { "epoch": 0.08453741643303861, "grad_norm": 1.298999547958374, "learning_rate": 9.990317020691043e-06, "loss": 0.8989, "step": 1568 }, { "epoch": 0.08459133060168213, "grad_norm": 0.865868866443634, "learning_rate": 9.990303828394787e-06, "loss": 0.8296, "step": 1569 }, { "epoch": 0.08464524477032564, "grad_norm": 0.9162652492523193, "learning_rate": 9.990290627126637e-06, "loss": 0.8617, "step": 1570 }, { "epoch": 0.08469915893896916, "grad_norm": 0.9753283858299255, "learning_rate": 9.990277416886618e-06, "loss": 0.8082, "step": 1571 }, { "epoch": 0.08475307310761268, "grad_norm": 0.9561176300048828, "learning_rate": 9.990264197674754e-06, "loss": 0.8678, "step": 1572 }, { "epoch": 0.0848069872762562, "grad_norm": 0.833341658115387, "learning_rate": 9.990250969491067e-06, "loss": 0.8164, "step": 1573 }, { "epoch": 0.08486090144489972, "grad_norm": 0.9928603172302246, "learning_rate": 9.990237732335581e-06, "loss": 0.6889, "step": 1574 }, { "epoch": 0.08491481561354323, "grad_norm": 1.0163367986679077, "learning_rate": 9.990224486208322e-06, "loss": 0.8278, "step": 1575 }, { "epoch": 0.08496872978218675, "grad_norm": 0.9905970096588135, "learning_rate": 9.990211231109312e-06, "loss": 0.8094, "step": 1576 }, { "epoch": 0.08502264395083028, "grad_norm": 0.9112648963928223, "learning_rate": 9.990197967038574e-06, "loss": 0.8782, "step": 1577 }, { "epoch": 0.0850765581194738, "grad_norm": 1.1176974773406982, "learning_rate": 9.990184693996136e-06, "loss": 0.8826, "step": 1578 }, { "epoch": 0.08513047228811732, "grad_norm": 0.7696222066879272, "learning_rate": 9.990171411982016e-06, "loss": 0.8025, "step": 1579 }, { "epoch": 0.08518438645676084, "grad_norm": 0.9288634061813354, "learning_rate": 9.990158120996242e-06, "loss": 0.8777, "step": 1580 }, { "epoch": 0.08523830062540436, "grad_norm": 0.9235022068023682, "learning_rate": 9.990144821038839e-06, "loss": 0.9339, "step": 1581 }, { "epoch": 0.08529221479404787, "grad_norm": 0.9124205708503723, "learning_rate": 9.990131512109826e-06, "loss": 0.8368, "step": 1582 }, { "epoch": 0.08534612896269139, "grad_norm": 0.8409048914909363, "learning_rate": 9.990118194209229e-06, "loss": 0.7772, "step": 1583 }, { "epoch": 0.08540004313133491, "grad_norm": 0.8279136419296265, "learning_rate": 9.990104867337074e-06, "loss": 0.738, "step": 1584 }, { "epoch": 0.08545395729997843, "grad_norm": 0.8895745873451233, "learning_rate": 9.990091531493382e-06, "loss": 0.7669, "step": 1585 }, { "epoch": 0.08550787146862196, "grad_norm": 0.9280734062194824, "learning_rate": 9.99007818667818e-06, "loss": 0.9052, "step": 1586 }, { "epoch": 0.08556178563726548, "grad_norm": 0.7676610350608826, "learning_rate": 9.990064832891491e-06, "loss": 0.807, "step": 1587 }, { "epoch": 0.085615699805909, "grad_norm": 0.9035676121711731, "learning_rate": 9.990051470133337e-06, "loss": 0.8848, "step": 1588 }, { "epoch": 0.08566961397455251, "grad_norm": 1.0960334539413452, "learning_rate": 9.990038098403742e-06, "loss": 0.8279, "step": 1589 }, { "epoch": 0.08572352814319603, "grad_norm": 0.87922203540802, "learning_rate": 9.990024717702736e-06, "loss": 0.8325, "step": 1590 }, { "epoch": 0.08577744231183955, "grad_norm": 0.922815203666687, "learning_rate": 9.990011328030335e-06, "loss": 0.881, "step": 1591 }, { "epoch": 0.08583135648048307, "grad_norm": 0.9880780577659607, "learning_rate": 9.989997929386567e-06, "loss": 0.7506, "step": 1592 }, { "epoch": 0.08588527064912659, "grad_norm": 0.8827483057975769, "learning_rate": 9.989984521771456e-06, "loss": 0.8961, "step": 1593 }, { "epoch": 0.0859391848177701, "grad_norm": 0.8395072817802429, "learning_rate": 9.989971105185026e-06, "loss": 0.8564, "step": 1594 }, { "epoch": 0.08599309898641364, "grad_norm": 0.8731534481048584, "learning_rate": 9.989957679627302e-06, "loss": 0.8209, "step": 1595 }, { "epoch": 0.08604701315505715, "grad_norm": 0.7969424724578857, "learning_rate": 9.989944245098305e-06, "loss": 0.8031, "step": 1596 }, { "epoch": 0.08610092732370067, "grad_norm": 0.8420547246932983, "learning_rate": 9.989930801598062e-06, "loss": 0.8027, "step": 1597 }, { "epoch": 0.08615484149234419, "grad_norm": 0.7900253534317017, "learning_rate": 9.989917349126597e-06, "loss": 0.8246, "step": 1598 }, { "epoch": 0.08620875566098771, "grad_norm": 0.8860716819763184, "learning_rate": 9.989903887683934e-06, "loss": 0.7846, "step": 1599 }, { "epoch": 0.08626266982963122, "grad_norm": 0.907744288444519, "learning_rate": 9.989890417270097e-06, "loss": 0.7813, "step": 1600 }, { "epoch": 0.08631658399827474, "grad_norm": 0.764076828956604, "learning_rate": 9.989876937885108e-06, "loss": 0.7953, "step": 1601 }, { "epoch": 0.08637049816691826, "grad_norm": 1.0143790245056152, "learning_rate": 9.989863449528994e-06, "loss": 0.8854, "step": 1602 }, { "epoch": 0.08642441233556178, "grad_norm": 0.8605815172195435, "learning_rate": 9.989849952201779e-06, "loss": 0.9289, "step": 1603 }, { "epoch": 0.08647832650420531, "grad_norm": 0.8897641897201538, "learning_rate": 9.989836445903487e-06, "loss": 0.8659, "step": 1604 }, { "epoch": 0.08653224067284883, "grad_norm": 0.8893518447875977, "learning_rate": 9.989822930634141e-06, "loss": 0.8724, "step": 1605 }, { "epoch": 0.08658615484149235, "grad_norm": 0.8152129054069519, "learning_rate": 9.989809406393767e-06, "loss": 0.8321, "step": 1606 }, { "epoch": 0.08664006901013586, "grad_norm": 0.8394732475280762, "learning_rate": 9.98979587318239e-06, "loss": 0.8074, "step": 1607 }, { "epoch": 0.08669398317877938, "grad_norm": 0.8038346767425537, "learning_rate": 9.989782331000031e-06, "loss": 0.8132, "step": 1608 }, { "epoch": 0.0867478973474229, "grad_norm": 0.8574134111404419, "learning_rate": 9.989768779846717e-06, "loss": 0.8191, "step": 1609 }, { "epoch": 0.08680181151606642, "grad_norm": 1.0049889087677002, "learning_rate": 9.989755219722472e-06, "loss": 0.8771, "step": 1610 }, { "epoch": 0.08685572568470994, "grad_norm": 0.9765112996101379, "learning_rate": 9.989741650627319e-06, "loss": 0.839, "step": 1611 }, { "epoch": 0.08690963985335347, "grad_norm": 0.9430082440376282, "learning_rate": 9.989728072561284e-06, "loss": 1.0316, "step": 1612 }, { "epoch": 0.08696355402199699, "grad_norm": 0.841590404510498, "learning_rate": 9.989714485524391e-06, "loss": 0.8727, "step": 1613 }, { "epoch": 0.0870174681906405, "grad_norm": 0.9475975632667542, "learning_rate": 9.989700889516664e-06, "loss": 0.8131, "step": 1614 }, { "epoch": 0.08707138235928402, "grad_norm": 0.8059530258178711, "learning_rate": 9.98968728453813e-06, "loss": 0.8297, "step": 1615 }, { "epoch": 0.08712529652792754, "grad_norm": 0.8513601422309875, "learning_rate": 9.989673670588808e-06, "loss": 0.8016, "step": 1616 }, { "epoch": 0.08717921069657106, "grad_norm": 0.8434658646583557, "learning_rate": 9.989660047668728e-06, "loss": 0.866, "step": 1617 }, { "epoch": 0.08723312486521458, "grad_norm": 0.9081484079360962, "learning_rate": 9.989646415777912e-06, "loss": 0.816, "step": 1618 }, { "epoch": 0.0872870390338581, "grad_norm": 0.7941877841949463, "learning_rate": 9.989632774916385e-06, "loss": 0.7191, "step": 1619 }, { "epoch": 0.08734095320250161, "grad_norm": 0.8800172209739685, "learning_rate": 9.98961912508417e-06, "loss": 0.8135, "step": 1620 }, { "epoch": 0.08739486737114514, "grad_norm": 0.7940575480461121, "learning_rate": 9.989605466281292e-06, "loss": 0.8124, "step": 1621 }, { "epoch": 0.08744878153978866, "grad_norm": 0.9570618271827698, "learning_rate": 9.989591798507779e-06, "loss": 0.9043, "step": 1622 }, { "epoch": 0.08750269570843218, "grad_norm": 0.8635395169258118, "learning_rate": 9.98957812176365e-06, "loss": 0.835, "step": 1623 }, { "epoch": 0.0875566098770757, "grad_norm": 0.8289955258369446, "learning_rate": 9.989564436048932e-06, "loss": 0.8265, "step": 1624 }, { "epoch": 0.08761052404571922, "grad_norm": 0.9519028663635254, "learning_rate": 9.989550741363654e-06, "loss": 0.8127, "step": 1625 }, { "epoch": 0.08766443821436273, "grad_norm": 0.9611422419548035, "learning_rate": 9.989537037707834e-06, "loss": 0.8422, "step": 1626 }, { "epoch": 0.08771835238300625, "grad_norm": 0.8824746608734131, "learning_rate": 9.9895233250815e-06, "loss": 0.8669, "step": 1627 }, { "epoch": 0.08777226655164977, "grad_norm": 0.8402838706970215, "learning_rate": 9.989509603484676e-06, "loss": 0.8072, "step": 1628 }, { "epoch": 0.08782618072029329, "grad_norm": 0.7537099719047546, "learning_rate": 9.989495872917386e-06, "loss": 0.7127, "step": 1629 }, { "epoch": 0.08788009488893682, "grad_norm": 0.78285151720047, "learning_rate": 9.989482133379656e-06, "loss": 0.819, "step": 1630 }, { "epoch": 0.08793400905758034, "grad_norm": 0.9339445233345032, "learning_rate": 9.98946838487151e-06, "loss": 0.8694, "step": 1631 }, { "epoch": 0.08798792322622385, "grad_norm": 0.8022040128707886, "learning_rate": 9.989454627392973e-06, "loss": 0.7601, "step": 1632 }, { "epoch": 0.08804183739486737, "grad_norm": 0.8593827486038208, "learning_rate": 9.98944086094407e-06, "loss": 0.8536, "step": 1633 }, { "epoch": 0.08809575156351089, "grad_norm": 0.8415039777755737, "learning_rate": 9.989427085524824e-06, "loss": 0.9027, "step": 1634 }, { "epoch": 0.08814966573215441, "grad_norm": 0.9551103711128235, "learning_rate": 9.989413301135263e-06, "loss": 0.8063, "step": 1635 }, { "epoch": 0.08820357990079793, "grad_norm": 0.8554351925849915, "learning_rate": 9.989399507775407e-06, "loss": 0.7694, "step": 1636 }, { "epoch": 0.08825749406944144, "grad_norm": 0.8688547015190125, "learning_rate": 9.989385705445285e-06, "loss": 0.8862, "step": 1637 }, { "epoch": 0.08831140823808496, "grad_norm": 0.816558837890625, "learning_rate": 9.98937189414492e-06, "loss": 0.7302, "step": 1638 }, { "epoch": 0.0883653224067285, "grad_norm": 0.8164445757865906, "learning_rate": 9.989358073874337e-06, "loss": 0.8724, "step": 1639 }, { "epoch": 0.08841923657537201, "grad_norm": 0.8909460306167603, "learning_rate": 9.989344244633564e-06, "loss": 0.7618, "step": 1640 }, { "epoch": 0.08847315074401553, "grad_norm": 1.0117470026016235, "learning_rate": 9.98933040642262e-06, "loss": 0.8191, "step": 1641 }, { "epoch": 0.08852706491265905, "grad_norm": 0.8317937850952148, "learning_rate": 9.989316559241533e-06, "loss": 0.8339, "step": 1642 }, { "epoch": 0.08858097908130257, "grad_norm": 0.7955135107040405, "learning_rate": 9.98930270309033e-06, "loss": 0.7799, "step": 1643 }, { "epoch": 0.08863489324994608, "grad_norm": 0.996306300163269, "learning_rate": 9.98928883796903e-06, "loss": 0.8547, "step": 1644 }, { "epoch": 0.0886888074185896, "grad_norm": 0.9679511189460754, "learning_rate": 9.989274963877664e-06, "loss": 1.0831, "step": 1645 }, { "epoch": 0.08874272158723312, "grad_norm": 0.8471615314483643, "learning_rate": 9.989261080816253e-06, "loss": 0.7765, "step": 1646 }, { "epoch": 0.08879663575587664, "grad_norm": 0.8662555813789368, "learning_rate": 9.989247188784826e-06, "loss": 0.8894, "step": 1647 }, { "epoch": 0.08885054992452017, "grad_norm": 0.9549373388290405, "learning_rate": 9.989233287783402e-06, "loss": 0.8341, "step": 1648 }, { "epoch": 0.08890446409316369, "grad_norm": 0.8179014325141907, "learning_rate": 9.989219377812014e-06, "loss": 0.8653, "step": 1649 }, { "epoch": 0.0889583782618072, "grad_norm": 0.9237802624702454, "learning_rate": 9.989205458870678e-06, "loss": 0.8206, "step": 1650 }, { "epoch": 0.08901229243045072, "grad_norm": 0.940217137336731, "learning_rate": 9.989191530959426e-06, "loss": 0.8695, "step": 1651 }, { "epoch": 0.08906620659909424, "grad_norm": 0.9200409054756165, "learning_rate": 9.98917759407828e-06, "loss": 0.7984, "step": 1652 }, { "epoch": 0.08912012076773776, "grad_norm": 0.9270562529563904, "learning_rate": 9.989163648227265e-06, "loss": 0.8265, "step": 1653 }, { "epoch": 0.08917403493638128, "grad_norm": 0.9945223331451416, "learning_rate": 9.989149693406408e-06, "loss": 0.84, "step": 1654 }, { "epoch": 0.0892279491050248, "grad_norm": 0.826195478439331, "learning_rate": 9.98913572961573e-06, "loss": 0.7862, "step": 1655 }, { "epoch": 0.08928186327366831, "grad_norm": 0.9132022857666016, "learning_rate": 9.989121756855263e-06, "loss": 0.826, "step": 1656 }, { "epoch": 0.08933577744231185, "grad_norm": 0.8559401631355286, "learning_rate": 9.989107775125023e-06, "loss": 0.8007, "step": 1657 }, { "epoch": 0.08938969161095536, "grad_norm": 0.8000867366790771, "learning_rate": 9.989093784425044e-06, "loss": 0.7547, "step": 1658 }, { "epoch": 0.08944360577959888, "grad_norm": 0.7761433720588684, "learning_rate": 9.989079784755346e-06, "loss": 0.8083, "step": 1659 }, { "epoch": 0.0894975199482424, "grad_norm": 0.8072230815887451, "learning_rate": 9.989065776115956e-06, "loss": 0.892, "step": 1660 }, { "epoch": 0.08955143411688592, "grad_norm": 0.9021360874176025, "learning_rate": 9.989051758506898e-06, "loss": 0.8715, "step": 1661 }, { "epoch": 0.08960534828552943, "grad_norm": 0.7585147023200989, "learning_rate": 9.989037731928197e-06, "loss": 0.7115, "step": 1662 }, { "epoch": 0.08965926245417295, "grad_norm": 0.9388399124145508, "learning_rate": 9.98902369637988e-06, "loss": 0.8976, "step": 1663 }, { "epoch": 0.08971317662281647, "grad_norm": 0.8454418778419495, "learning_rate": 9.989009651861972e-06, "loss": 0.8063, "step": 1664 }, { "epoch": 0.08976709079146, "grad_norm": 0.82308030128479, "learning_rate": 9.988995598374496e-06, "loss": 0.8044, "step": 1665 }, { "epoch": 0.08982100496010352, "grad_norm": 1.006800651550293, "learning_rate": 9.98898153591748e-06, "loss": 0.8609, "step": 1666 }, { "epoch": 0.08987491912874704, "grad_norm": 0.8325724601745605, "learning_rate": 9.988967464490947e-06, "loss": 0.8295, "step": 1667 }, { "epoch": 0.08992883329739056, "grad_norm": 0.7575547695159912, "learning_rate": 9.988953384094923e-06, "loss": 0.8252, "step": 1668 }, { "epoch": 0.08998274746603407, "grad_norm": 0.869877278804779, "learning_rate": 9.988939294729436e-06, "loss": 0.8304, "step": 1669 }, { "epoch": 0.09003666163467759, "grad_norm": 0.7840037941932678, "learning_rate": 9.988925196394508e-06, "loss": 0.7742, "step": 1670 }, { "epoch": 0.09009057580332111, "grad_norm": 0.8044409155845642, "learning_rate": 9.988911089090163e-06, "loss": 0.8371, "step": 1671 }, { "epoch": 0.09014448997196463, "grad_norm": 0.8635613322257996, "learning_rate": 9.988896972816431e-06, "loss": 0.7693, "step": 1672 }, { "epoch": 0.09019840414060815, "grad_norm": 0.7780656814575195, "learning_rate": 9.988882847573335e-06, "loss": 0.841, "step": 1673 }, { "epoch": 0.09025231830925168, "grad_norm": 0.8938048481941223, "learning_rate": 9.9888687133609e-06, "loss": 0.8149, "step": 1674 }, { "epoch": 0.0903062324778952, "grad_norm": 0.8432002663612366, "learning_rate": 9.988854570179152e-06, "loss": 0.853, "step": 1675 }, { "epoch": 0.09036014664653871, "grad_norm": 0.8222450613975525, "learning_rate": 9.988840418028118e-06, "loss": 0.897, "step": 1676 }, { "epoch": 0.09041406081518223, "grad_norm": 0.8370371460914612, "learning_rate": 9.98882625690782e-06, "loss": 0.8288, "step": 1677 }, { "epoch": 0.09046797498382575, "grad_norm": 0.8510713577270508, "learning_rate": 9.988812086818285e-06, "loss": 0.7637, "step": 1678 }, { "epoch": 0.09052188915246927, "grad_norm": 0.8271141648292542, "learning_rate": 9.98879790775954e-06, "loss": 0.853, "step": 1679 }, { "epoch": 0.09057580332111279, "grad_norm": 1.0627025365829468, "learning_rate": 9.988783719731607e-06, "loss": 0.7569, "step": 1680 }, { "epoch": 0.0906297174897563, "grad_norm": 0.880283534526825, "learning_rate": 9.988769522734517e-06, "loss": 0.8362, "step": 1681 }, { "epoch": 0.09068363165839982, "grad_norm": 0.8721734881401062, "learning_rate": 9.988755316768288e-06, "loss": 0.8585, "step": 1682 }, { "epoch": 0.09073754582704335, "grad_norm": 0.8830682039260864, "learning_rate": 9.988741101832952e-06, "loss": 0.8853, "step": 1683 }, { "epoch": 0.09079145999568687, "grad_norm": 0.7676220536231995, "learning_rate": 9.988726877928534e-06, "loss": 0.7832, "step": 1684 }, { "epoch": 0.09084537416433039, "grad_norm": 0.866149365901947, "learning_rate": 9.988712645055055e-06, "loss": 0.8534, "step": 1685 }, { "epoch": 0.09089928833297391, "grad_norm": 0.8467028141021729, "learning_rate": 9.988698403212546e-06, "loss": 0.8637, "step": 1686 }, { "epoch": 0.09095320250161743, "grad_norm": 0.913436770439148, "learning_rate": 9.988684152401028e-06, "loss": 0.855, "step": 1687 }, { "epoch": 0.09100711667026094, "grad_norm": 0.8307977914810181, "learning_rate": 9.98866989262053e-06, "loss": 0.8538, "step": 1688 }, { "epoch": 0.09106103083890446, "grad_norm": 1.13442862033844, "learning_rate": 9.988655623871075e-06, "loss": 0.8129, "step": 1689 }, { "epoch": 0.09111494500754798, "grad_norm": 0.8950080871582031, "learning_rate": 9.988641346152692e-06, "loss": 0.8674, "step": 1690 }, { "epoch": 0.0911688591761915, "grad_norm": 0.9107043147087097, "learning_rate": 9.988627059465403e-06, "loss": 0.9507, "step": 1691 }, { "epoch": 0.09122277334483503, "grad_norm": 0.8210874795913696, "learning_rate": 9.988612763809237e-06, "loss": 0.8913, "step": 1692 }, { "epoch": 0.09127668751347855, "grad_norm": 1.0306476354599, "learning_rate": 9.988598459184217e-06, "loss": 0.8589, "step": 1693 }, { "epoch": 0.09133060168212206, "grad_norm": 0.7582615613937378, "learning_rate": 9.98858414559037e-06, "loss": 0.7482, "step": 1694 }, { "epoch": 0.09138451585076558, "grad_norm": 0.8572216629981995, "learning_rate": 9.98856982302772e-06, "loss": 0.822, "step": 1695 }, { "epoch": 0.0914384300194091, "grad_norm": 0.9358139038085938, "learning_rate": 9.988555491496297e-06, "loss": 0.8298, "step": 1696 }, { "epoch": 0.09149234418805262, "grad_norm": 0.8705672025680542, "learning_rate": 9.988541150996123e-06, "loss": 0.8818, "step": 1697 }, { "epoch": 0.09154625835669614, "grad_norm": 0.9081273674964905, "learning_rate": 9.988526801527224e-06, "loss": 0.8994, "step": 1698 }, { "epoch": 0.09160017252533965, "grad_norm": 0.7358905076980591, "learning_rate": 9.988512443089627e-06, "loss": 0.7752, "step": 1699 }, { "epoch": 0.09165408669398317, "grad_norm": 0.8570963740348816, "learning_rate": 9.988498075683357e-06, "loss": 0.908, "step": 1700 }, { "epoch": 0.0917080008626267, "grad_norm": 0.8998208045959473, "learning_rate": 9.988483699308442e-06, "loss": 0.8561, "step": 1701 }, { "epoch": 0.09176191503127022, "grad_norm": 0.7481779456138611, "learning_rate": 9.988469313964903e-06, "loss": 0.7184, "step": 1702 }, { "epoch": 0.09181582919991374, "grad_norm": 1.052809238433838, "learning_rate": 9.988454919652772e-06, "loss": 0.8579, "step": 1703 }, { "epoch": 0.09186974336855726, "grad_norm": 0.8492130637168884, "learning_rate": 9.988440516372071e-06, "loss": 0.8796, "step": 1704 }, { "epoch": 0.09192365753720078, "grad_norm": 0.884483277797699, "learning_rate": 9.988426104122826e-06, "loss": 0.8781, "step": 1705 }, { "epoch": 0.0919775717058443, "grad_norm": 0.8844857811927795, "learning_rate": 9.988411682905065e-06, "loss": 0.8981, "step": 1706 }, { "epoch": 0.09203148587448781, "grad_norm": 0.906216025352478, "learning_rate": 9.988397252718811e-06, "loss": 0.8741, "step": 1707 }, { "epoch": 0.09208540004313133, "grad_norm": 0.8565787076950073, "learning_rate": 9.988382813564092e-06, "loss": 0.7358, "step": 1708 }, { "epoch": 0.09213931421177485, "grad_norm": 0.8036391139030457, "learning_rate": 9.988368365440935e-06, "loss": 0.7966, "step": 1709 }, { "epoch": 0.09219322838041838, "grad_norm": 1.1708556413650513, "learning_rate": 9.988353908349361e-06, "loss": 0.8385, "step": 1710 }, { "epoch": 0.0922471425490619, "grad_norm": 0.8536746501922607, "learning_rate": 9.988339442289403e-06, "loss": 0.7387, "step": 1711 }, { "epoch": 0.09230105671770542, "grad_norm": 0.8376518487930298, "learning_rate": 9.988324967261083e-06, "loss": 0.8537, "step": 1712 }, { "epoch": 0.09235497088634893, "grad_norm": 0.8793227672576904, "learning_rate": 9.988310483264426e-06, "loss": 0.8028, "step": 1713 }, { "epoch": 0.09240888505499245, "grad_norm": 0.8186830282211304, "learning_rate": 9.98829599029946e-06, "loss": 0.8478, "step": 1714 }, { "epoch": 0.09246279922363597, "grad_norm": 0.8845428824424744, "learning_rate": 9.98828148836621e-06, "loss": 0.8524, "step": 1715 }, { "epoch": 0.09251671339227949, "grad_norm": 1.0494492053985596, "learning_rate": 9.988266977464704e-06, "loss": 0.8542, "step": 1716 }, { "epoch": 0.092570627560923, "grad_norm": 0.8876493573188782, "learning_rate": 9.988252457594966e-06, "loss": 0.8989, "step": 1717 }, { "epoch": 0.09262454172956654, "grad_norm": 0.8787088394165039, "learning_rate": 9.988237928757024e-06, "loss": 0.8214, "step": 1718 }, { "epoch": 0.09267845589821005, "grad_norm": 1.069684624671936, "learning_rate": 9.988223390950901e-06, "loss": 0.9714, "step": 1719 }, { "epoch": 0.09273237006685357, "grad_norm": 0.7957501411437988, "learning_rate": 9.988208844176626e-06, "loss": 0.7562, "step": 1720 }, { "epoch": 0.09278628423549709, "grad_norm": 0.8354908227920532, "learning_rate": 9.988194288434225e-06, "loss": 0.7494, "step": 1721 }, { "epoch": 0.09284019840414061, "grad_norm": 0.8205936551094055, "learning_rate": 9.988179723723722e-06, "loss": 0.7727, "step": 1722 }, { "epoch": 0.09289411257278413, "grad_norm": 0.8364951014518738, "learning_rate": 9.988165150045146e-06, "loss": 0.861, "step": 1723 }, { "epoch": 0.09294802674142764, "grad_norm": 0.8664119243621826, "learning_rate": 9.98815056739852e-06, "loss": 0.8512, "step": 1724 }, { "epoch": 0.09300194091007116, "grad_norm": 0.9565482139587402, "learning_rate": 9.988135975783874e-06, "loss": 0.8606, "step": 1725 }, { "epoch": 0.09305585507871468, "grad_norm": 0.8696085214614868, "learning_rate": 9.988121375201232e-06, "loss": 0.8614, "step": 1726 }, { "epoch": 0.09310976924735821, "grad_norm": 0.8623467683792114, "learning_rate": 9.98810676565062e-06, "loss": 0.8547, "step": 1727 }, { "epoch": 0.09316368341600173, "grad_norm": 0.8284831047058105, "learning_rate": 9.988092147132064e-06, "loss": 0.8376, "step": 1728 }, { "epoch": 0.09321759758464525, "grad_norm": 0.7768245339393616, "learning_rate": 9.988077519645591e-06, "loss": 0.7472, "step": 1729 }, { "epoch": 0.09327151175328877, "grad_norm": 1.221225619316101, "learning_rate": 9.988062883191228e-06, "loss": 0.9052, "step": 1730 }, { "epoch": 0.09332542592193228, "grad_norm": 1.0027954578399658, "learning_rate": 9.988048237769002e-06, "loss": 0.9411, "step": 1731 }, { "epoch": 0.0933793400905758, "grad_norm": 0.8029824495315552, "learning_rate": 9.988033583378937e-06, "loss": 0.8141, "step": 1732 }, { "epoch": 0.09343325425921932, "grad_norm": 0.8081389665603638, "learning_rate": 9.98801892002106e-06, "loss": 0.7977, "step": 1733 }, { "epoch": 0.09348716842786284, "grad_norm": 0.887438952922821, "learning_rate": 9.988004247695398e-06, "loss": 0.8574, "step": 1734 }, { "epoch": 0.09354108259650636, "grad_norm": 0.887238085269928, "learning_rate": 9.987989566401977e-06, "loss": 0.9041, "step": 1735 }, { "epoch": 0.09359499676514989, "grad_norm": 0.9135997891426086, "learning_rate": 9.987974876140822e-06, "loss": 0.738, "step": 1736 }, { "epoch": 0.0936489109337934, "grad_norm": 0.7749861478805542, "learning_rate": 9.987960176911964e-06, "loss": 0.773, "step": 1737 }, { "epoch": 0.09370282510243692, "grad_norm": 0.7850096225738525, "learning_rate": 9.987945468715425e-06, "loss": 0.7924, "step": 1738 }, { "epoch": 0.09375673927108044, "grad_norm": 0.8044145107269287, "learning_rate": 9.987930751551231e-06, "loss": 0.8196, "step": 1739 }, { "epoch": 0.09381065343972396, "grad_norm": 0.8781464695930481, "learning_rate": 9.987916025419413e-06, "loss": 0.9337, "step": 1740 }, { "epoch": 0.09386456760836748, "grad_norm": 1.0839952230453491, "learning_rate": 9.987901290319993e-06, "loss": 0.8092, "step": 1741 }, { "epoch": 0.093918481777011, "grad_norm": 0.7910736203193665, "learning_rate": 9.987886546253e-06, "loss": 0.8775, "step": 1742 }, { "epoch": 0.09397239594565451, "grad_norm": 0.887287974357605, "learning_rate": 9.98787179321846e-06, "loss": 0.8271, "step": 1743 }, { "epoch": 0.09402631011429803, "grad_norm": 1.1318427324295044, "learning_rate": 9.987857031216397e-06, "loss": 0.8328, "step": 1744 }, { "epoch": 0.09408022428294156, "grad_norm": 0.8660401105880737, "learning_rate": 9.987842260246842e-06, "loss": 0.8647, "step": 1745 }, { "epoch": 0.09413413845158508, "grad_norm": 0.9396790266036987, "learning_rate": 9.98782748030982e-06, "loss": 0.9373, "step": 1746 }, { "epoch": 0.0941880526202286, "grad_norm": 0.8715323209762573, "learning_rate": 9.987812691405353e-06, "loss": 0.8621, "step": 1747 }, { "epoch": 0.09424196678887212, "grad_norm": 0.7882347106933594, "learning_rate": 9.987797893533475e-06, "loss": 0.7283, "step": 1748 }, { "epoch": 0.09429588095751563, "grad_norm": 0.9641733765602112, "learning_rate": 9.987783086694208e-06, "loss": 0.8038, "step": 1749 }, { "epoch": 0.09434979512615915, "grad_norm": 0.8808518648147583, "learning_rate": 9.98776827088758e-06, "loss": 0.8072, "step": 1750 }, { "epoch": 0.09440370929480267, "grad_norm": 0.7720713019371033, "learning_rate": 9.987753446113618e-06, "loss": 0.7786, "step": 1751 }, { "epoch": 0.09445762346344619, "grad_norm": 1.0507936477661133, "learning_rate": 9.987738612372346e-06, "loss": 0.9302, "step": 1752 }, { "epoch": 0.0945115376320897, "grad_norm": 0.7705017328262329, "learning_rate": 9.987723769663795e-06, "loss": 0.7366, "step": 1753 }, { "epoch": 0.09456545180073324, "grad_norm": 0.82464200258255, "learning_rate": 9.987708917987989e-06, "loss": 0.8063, "step": 1754 }, { "epoch": 0.09461936596937676, "grad_norm": 0.9387272000312805, "learning_rate": 9.987694057344953e-06, "loss": 0.8108, "step": 1755 }, { "epoch": 0.09467328013802027, "grad_norm": 0.9161933064460754, "learning_rate": 9.987679187734717e-06, "loss": 0.8331, "step": 1756 }, { "epoch": 0.09472719430666379, "grad_norm": 0.9379769563674927, "learning_rate": 9.987664309157306e-06, "loss": 0.9064, "step": 1757 }, { "epoch": 0.09478110847530731, "grad_norm": 0.9597976803779602, "learning_rate": 9.987649421612748e-06, "loss": 0.7785, "step": 1758 }, { "epoch": 0.09483502264395083, "grad_norm": 0.8689720630645752, "learning_rate": 9.98763452510107e-06, "loss": 0.7828, "step": 1759 }, { "epoch": 0.09488893681259435, "grad_norm": 0.9207726716995239, "learning_rate": 9.987619619622296e-06, "loss": 0.7853, "step": 1760 }, { "epoch": 0.09494285098123786, "grad_norm": 0.8130320310592651, "learning_rate": 9.987604705176455e-06, "loss": 0.858, "step": 1761 }, { "epoch": 0.09499676514988138, "grad_norm": 0.9004638195037842, "learning_rate": 9.987589781763574e-06, "loss": 0.8148, "step": 1762 }, { "epoch": 0.09505067931852491, "grad_norm": 0.8554181456565857, "learning_rate": 9.987574849383678e-06, "loss": 0.8103, "step": 1763 }, { "epoch": 0.09510459348716843, "grad_norm": 0.9148527979850769, "learning_rate": 9.987559908036797e-06, "loss": 0.9467, "step": 1764 }, { "epoch": 0.09515850765581195, "grad_norm": 0.890083909034729, "learning_rate": 9.987544957722956e-06, "loss": 0.8338, "step": 1765 }, { "epoch": 0.09521242182445547, "grad_norm": 0.8118012547492981, "learning_rate": 9.98752999844218e-06, "loss": 0.8355, "step": 1766 }, { "epoch": 0.09526633599309899, "grad_norm": 0.8115151524543762, "learning_rate": 9.987515030194498e-06, "loss": 0.9172, "step": 1767 }, { "epoch": 0.0953202501617425, "grad_norm": 0.8750082850456238, "learning_rate": 9.987500052979938e-06, "loss": 0.8301, "step": 1768 }, { "epoch": 0.09537416433038602, "grad_norm": 0.9008756875991821, "learning_rate": 9.987485066798525e-06, "loss": 0.8642, "step": 1769 }, { "epoch": 0.09542807849902954, "grad_norm": 0.8335922956466675, "learning_rate": 9.987470071650287e-06, "loss": 0.8466, "step": 1770 }, { "epoch": 0.09548199266767307, "grad_norm": 0.8604272603988647, "learning_rate": 9.987455067535249e-06, "loss": 0.8801, "step": 1771 }, { "epoch": 0.09553590683631659, "grad_norm": 0.889854371547699, "learning_rate": 9.98744005445344e-06, "loss": 0.8804, "step": 1772 }, { "epoch": 0.09558982100496011, "grad_norm": 0.8756876587867737, "learning_rate": 9.987425032404887e-06, "loss": 0.8367, "step": 1773 }, { "epoch": 0.09564373517360363, "grad_norm": 0.9071298837661743, "learning_rate": 9.987410001389616e-06, "loss": 0.8875, "step": 1774 }, { "epoch": 0.09569764934224714, "grad_norm": 0.8214284777641296, "learning_rate": 9.987394961407654e-06, "loss": 0.7859, "step": 1775 }, { "epoch": 0.09575156351089066, "grad_norm": 0.940034806728363, "learning_rate": 9.98737991245903e-06, "loss": 0.8272, "step": 1776 }, { "epoch": 0.09580547767953418, "grad_norm": 0.8156501054763794, "learning_rate": 9.987364854543768e-06, "loss": 0.7831, "step": 1777 }, { "epoch": 0.0958593918481777, "grad_norm": 0.8450450301170349, "learning_rate": 9.987349787661898e-06, "loss": 0.7888, "step": 1778 }, { "epoch": 0.09591330601682121, "grad_norm": 0.8143148422241211, "learning_rate": 9.987334711813446e-06, "loss": 0.7593, "step": 1779 }, { "epoch": 0.09596722018546475, "grad_norm": 1.0489457845687866, "learning_rate": 9.987319626998437e-06, "loss": 0.8248, "step": 1780 }, { "epoch": 0.09602113435410826, "grad_norm": 0.9584689140319824, "learning_rate": 9.987304533216901e-06, "loss": 0.9025, "step": 1781 }, { "epoch": 0.09607504852275178, "grad_norm": 0.8366501331329346, "learning_rate": 9.987289430468862e-06, "loss": 0.7513, "step": 1782 }, { "epoch": 0.0961289626913953, "grad_norm": 0.9896461963653564, "learning_rate": 9.987274318754352e-06, "loss": 0.8598, "step": 1783 }, { "epoch": 0.09618287686003882, "grad_norm": 1.1904568672180176, "learning_rate": 9.987259198073396e-06, "loss": 0.9143, "step": 1784 }, { "epoch": 0.09623679102868234, "grad_norm": 0.8100086450576782, "learning_rate": 9.987244068426019e-06, "loss": 0.7733, "step": 1785 }, { "epoch": 0.09629070519732585, "grad_norm": 0.7814387083053589, "learning_rate": 9.987228929812249e-06, "loss": 0.7735, "step": 1786 }, { "epoch": 0.09634461936596937, "grad_norm": 0.8880924582481384, "learning_rate": 9.987213782232115e-06, "loss": 0.8377, "step": 1787 }, { "epoch": 0.09639853353461289, "grad_norm": 0.8739203810691833, "learning_rate": 9.987198625685643e-06, "loss": 0.8851, "step": 1788 }, { "epoch": 0.09645244770325642, "grad_norm": 0.8984062671661377, "learning_rate": 9.987183460172861e-06, "loss": 0.8773, "step": 1789 }, { "epoch": 0.09650636187189994, "grad_norm": 1.2485296726226807, "learning_rate": 9.987168285693795e-06, "loss": 0.787, "step": 1790 }, { "epoch": 0.09656027604054346, "grad_norm": 0.8414161205291748, "learning_rate": 9.987153102248474e-06, "loss": 0.7895, "step": 1791 }, { "epoch": 0.09661419020918698, "grad_norm": 0.7895180583000183, "learning_rate": 9.987137909836924e-06, "loss": 0.7592, "step": 1792 }, { "epoch": 0.0966681043778305, "grad_norm": 1.0752787590026855, "learning_rate": 9.987122708459173e-06, "loss": 0.8472, "step": 1793 }, { "epoch": 0.09672201854647401, "grad_norm": 0.9069424271583557, "learning_rate": 9.987107498115247e-06, "loss": 0.8746, "step": 1794 }, { "epoch": 0.09677593271511753, "grad_norm": 0.8566716909408569, "learning_rate": 9.987092278805175e-06, "loss": 0.7604, "step": 1795 }, { "epoch": 0.09682984688376105, "grad_norm": 0.833852231502533, "learning_rate": 9.987077050528983e-06, "loss": 0.8645, "step": 1796 }, { "epoch": 0.09688376105240457, "grad_norm": 0.8439596891403198, "learning_rate": 9.9870618132867e-06, "loss": 0.7673, "step": 1797 }, { "epoch": 0.0969376752210481, "grad_norm": 0.9743669629096985, "learning_rate": 9.987046567078352e-06, "loss": 0.7754, "step": 1798 }, { "epoch": 0.09699158938969162, "grad_norm": 0.9291634559631348, "learning_rate": 9.987031311903968e-06, "loss": 0.8431, "step": 1799 }, { "epoch": 0.09704550355833513, "grad_norm": 1.169450283050537, "learning_rate": 9.987016047763571e-06, "loss": 0.9321, "step": 1800 }, { "epoch": 0.09709941772697865, "grad_norm": 0.7758163809776306, "learning_rate": 9.987000774657195e-06, "loss": 0.7832, "step": 1801 }, { "epoch": 0.09715333189562217, "grad_norm": 0.9673672914505005, "learning_rate": 9.986985492584863e-06, "loss": 0.9822, "step": 1802 }, { "epoch": 0.09720724606426569, "grad_norm": 1.1516417264938354, "learning_rate": 9.986970201546605e-06, "loss": 0.9956, "step": 1803 }, { "epoch": 0.0972611602329092, "grad_norm": 0.9660587906837463, "learning_rate": 9.986954901542445e-06, "loss": 0.8248, "step": 1804 }, { "epoch": 0.09731507440155272, "grad_norm": 0.9452739953994751, "learning_rate": 9.986939592572413e-06, "loss": 0.8805, "step": 1805 }, { "epoch": 0.09736898857019624, "grad_norm": 0.9339364171028137, "learning_rate": 9.986924274636538e-06, "loss": 0.8819, "step": 1806 }, { "epoch": 0.09742290273883977, "grad_norm": 0.9344542026519775, "learning_rate": 9.986908947734844e-06, "loss": 0.8531, "step": 1807 }, { "epoch": 0.09747681690748329, "grad_norm": 0.8910528421401978, "learning_rate": 9.986893611867362e-06, "loss": 0.8949, "step": 1808 }, { "epoch": 0.09753073107612681, "grad_norm": 0.8484895825386047, "learning_rate": 9.986878267034115e-06, "loss": 0.8028, "step": 1809 }, { "epoch": 0.09758464524477033, "grad_norm": 1.0784810781478882, "learning_rate": 9.986862913235135e-06, "loss": 0.9564, "step": 1810 }, { "epoch": 0.09763855941341384, "grad_norm": 0.8350296020507812, "learning_rate": 9.98684755047045e-06, "loss": 0.8672, "step": 1811 }, { "epoch": 0.09769247358205736, "grad_norm": 0.8558050990104675, "learning_rate": 9.986832178740084e-06, "loss": 0.8538, "step": 1812 }, { "epoch": 0.09774638775070088, "grad_norm": 0.8633396029472351, "learning_rate": 9.986816798044066e-06, "loss": 0.8356, "step": 1813 }, { "epoch": 0.0978003019193444, "grad_norm": 0.8256344199180603, "learning_rate": 9.986801408382424e-06, "loss": 0.7552, "step": 1814 }, { "epoch": 0.09785421608798792, "grad_norm": 0.872844398021698, "learning_rate": 9.986786009755186e-06, "loss": 0.9153, "step": 1815 }, { "epoch": 0.09790813025663145, "grad_norm": 0.842241108417511, "learning_rate": 9.986770602162378e-06, "loss": 0.7965, "step": 1816 }, { "epoch": 0.09796204442527497, "grad_norm": 0.9673634171485901, "learning_rate": 9.98675518560403e-06, "loss": 0.8317, "step": 1817 }, { "epoch": 0.09801595859391848, "grad_norm": 0.8744896650314331, "learning_rate": 9.98673976008017e-06, "loss": 0.7342, "step": 1818 }, { "epoch": 0.098069872762562, "grad_norm": 0.7830422520637512, "learning_rate": 9.986724325590825e-06, "loss": 0.721, "step": 1819 }, { "epoch": 0.09812378693120552, "grad_norm": 1.0335441827774048, "learning_rate": 9.986708882136021e-06, "loss": 0.8088, "step": 1820 }, { "epoch": 0.09817770109984904, "grad_norm": 0.841342568397522, "learning_rate": 9.986693429715785e-06, "loss": 0.8847, "step": 1821 }, { "epoch": 0.09823161526849256, "grad_norm": 0.9405834674835205, "learning_rate": 9.98667796833015e-06, "loss": 0.8878, "step": 1822 }, { "epoch": 0.09828552943713607, "grad_norm": 0.8358225226402283, "learning_rate": 9.986662497979138e-06, "loss": 0.7377, "step": 1823 }, { "epoch": 0.0983394436057796, "grad_norm": 0.8844004273414612, "learning_rate": 9.98664701866278e-06, "loss": 0.7236, "step": 1824 }, { "epoch": 0.09839335777442312, "grad_norm": 0.8165417313575745, "learning_rate": 9.986631530381105e-06, "loss": 0.819, "step": 1825 }, { "epoch": 0.09844727194306664, "grad_norm": 0.9569553732872009, "learning_rate": 9.986616033134137e-06, "loss": 0.9337, "step": 1826 }, { "epoch": 0.09850118611171016, "grad_norm": 0.8311771750450134, "learning_rate": 9.986600526921907e-06, "loss": 0.8516, "step": 1827 }, { "epoch": 0.09855510028035368, "grad_norm": 0.9444357752799988, "learning_rate": 9.986585011744441e-06, "loss": 0.805, "step": 1828 }, { "epoch": 0.0986090144489972, "grad_norm": 1.0128875970840454, "learning_rate": 9.986569487601769e-06, "loss": 0.8514, "step": 1829 }, { "epoch": 0.09866292861764071, "grad_norm": 0.8973994255065918, "learning_rate": 9.986553954493917e-06, "loss": 0.7938, "step": 1830 }, { "epoch": 0.09871684278628423, "grad_norm": 0.8571779131889343, "learning_rate": 9.986538412420912e-06, "loss": 0.7506, "step": 1831 }, { "epoch": 0.09877075695492775, "grad_norm": 0.9053436517715454, "learning_rate": 9.986522861382785e-06, "loss": 0.8551, "step": 1832 }, { "epoch": 0.09882467112357128, "grad_norm": 0.9941746592521667, "learning_rate": 9.986507301379562e-06, "loss": 0.8828, "step": 1833 }, { "epoch": 0.0988785852922148, "grad_norm": 0.9620066285133362, "learning_rate": 9.986491732411272e-06, "loss": 0.8982, "step": 1834 }, { "epoch": 0.09893249946085832, "grad_norm": 0.9470074772834778, "learning_rate": 9.986476154477941e-06, "loss": 0.8295, "step": 1835 }, { "epoch": 0.09898641362950183, "grad_norm": 0.9962137937545776, "learning_rate": 9.986460567579599e-06, "loss": 0.8714, "step": 1836 }, { "epoch": 0.09904032779814535, "grad_norm": 0.8492829203605652, "learning_rate": 9.986444971716273e-06, "loss": 0.8234, "step": 1837 }, { "epoch": 0.09909424196678887, "grad_norm": 0.9463719725608826, "learning_rate": 9.986429366887994e-06, "loss": 0.7769, "step": 1838 }, { "epoch": 0.09914815613543239, "grad_norm": 0.8588153123855591, "learning_rate": 9.986413753094786e-06, "loss": 0.8883, "step": 1839 }, { "epoch": 0.0992020703040759, "grad_norm": 0.7692183256149292, "learning_rate": 9.986398130336677e-06, "loss": 0.7691, "step": 1840 }, { "epoch": 0.09925598447271942, "grad_norm": 0.8377199172973633, "learning_rate": 9.986382498613699e-06, "loss": 0.789, "step": 1841 }, { "epoch": 0.09930989864136296, "grad_norm": 0.9783869385719299, "learning_rate": 9.986366857925876e-06, "loss": 0.8517, "step": 1842 }, { "epoch": 0.09936381281000647, "grad_norm": 0.8233169913291931, "learning_rate": 9.986351208273239e-06, "loss": 0.8701, "step": 1843 }, { "epoch": 0.09941772697864999, "grad_norm": 0.9393780827522278, "learning_rate": 9.986335549655814e-06, "loss": 0.8837, "step": 1844 }, { "epoch": 0.09947164114729351, "grad_norm": 0.8517693877220154, "learning_rate": 9.986319882073631e-06, "loss": 0.9043, "step": 1845 }, { "epoch": 0.09952555531593703, "grad_norm": 0.8296724557876587, "learning_rate": 9.986304205526718e-06, "loss": 0.7406, "step": 1846 }, { "epoch": 0.09957946948458055, "grad_norm": 0.8372161388397217, "learning_rate": 9.986288520015102e-06, "loss": 0.7763, "step": 1847 }, { "epoch": 0.09963338365322406, "grad_norm": 0.8086470365524292, "learning_rate": 9.986272825538812e-06, "loss": 0.8786, "step": 1848 }, { "epoch": 0.09968729782186758, "grad_norm": 0.8562842011451721, "learning_rate": 9.986257122097875e-06, "loss": 0.8391, "step": 1849 }, { "epoch": 0.0997412119905111, "grad_norm": 0.9052720665931702, "learning_rate": 9.986241409692321e-06, "loss": 0.948, "step": 1850 }, { "epoch": 0.09979512615915463, "grad_norm": 0.8220609426498413, "learning_rate": 9.986225688322178e-06, "loss": 0.8039, "step": 1851 }, { "epoch": 0.09984904032779815, "grad_norm": 0.8018030524253845, "learning_rate": 9.98620995798747e-06, "loss": 0.7748, "step": 1852 }, { "epoch": 0.09990295449644167, "grad_norm": 0.8150879144668579, "learning_rate": 9.986194218688235e-06, "loss": 0.7304, "step": 1853 }, { "epoch": 0.09995686866508519, "grad_norm": 0.8677535653114319, "learning_rate": 9.98617847042449e-06, "loss": 0.8756, "step": 1854 }, { "epoch": 0.1000107828337287, "grad_norm": 0.8889294862747192, "learning_rate": 9.986162713196272e-06, "loss": 0.8926, "step": 1855 } ], "logging_steps": 1, "max_steps": 74192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1855, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.474209732975657e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }